module toml_foolery.decode.types.string;

import std.algorithm;
import std.array;
import std.conv : to;
import std.regex;
import std.string : strip;
import std.uni;

version(unittest) import dshould;


/// Decodes a complete TOML string token (delimiters included) into the text
/// it represents.
///
/// The flavour of string is chosen from the opening delimiter:
/// `"""` (multi-line basic), `'''` (multi-line literal), `"` (basic), and
/// anything else is treated as a single-quoted literal string.
///
/// Params:
///     value = the raw token, including its opening and closing quotes.
///
/// Returns: the decoded string contents.
///
/// Throws: `std.utf.UTFException` if a `\u` / `\U` escape in a basic string
///         names something that is not a Unicode scalar value.
package(toml_foolery.decode) string parseTomlString(string value)
{
    // A multi-line token is at least six characters long (two delimiters).
    // Checking the length keeps short tokens such as `""` and `''` from being
    // sliced out of bounds or mistaken for a multi-line string.
    if (value.length >= 6)
    {
        if (value.startsWith(`"""`))
        {
            return parseTomlBasicMultiLineString(value);
        }

        if (value.startsWith(`'''`))
        {
            return parseTomlLiteralMultiLineString(value);
        }
    }

    if (value.startsWith(`"`))
    {
        return parseTomlBasicString(value);
    }

    return parseTomlLiteralString(value);
}

/// Decodes a basic string: strips the surrounding `"` and resolves escapes.
/// Expects `value` to contain both delimiters (length of at least 2).
private string parseTomlBasicString(string value)
{
    return value[1 .. $-1].unescaped;
}

/// Decodes a multi-line basic string: strips the surrounding `"""`, drops the
/// newline directly after the opening delimiter, trims line-ending
/// backslashes, and finally resolves escapes.
/// Expects `value` to contain both delimiters (length of at least 6).
private string parseTomlBasicMultiLineString(string value)
{
    // Order matters: both trimming steps must see the text while it is still
    // escaped. Otherwise an escaped `\n` at the start would be mistaken for
    // the physical leading newline, and an escaped backslash followed by a
    // newline would be mistaken for a line-ending backslash.
    return value[3 .. $-3].removeLeadingNewline.removeEscapedLinebreaks.unescaped;
}

/// Decodes a literal string: strips the surrounding `'`; no escapes exist.
/// Expects `value` to contain both delimiters (length of at least 2).
private string parseTomlLiteralString(string value)
{
    return value[1 .. $-1];
}

/// Decodes a multi-line literal string: strips the surrounding `'''` and
/// drops the newline directly after the opening delimiter.
/// Expects `value` to contain both delimiters (length of at least 6).
private string parseTomlLiteralMultiLineString(string value)
{
    return value[3 .. $-3].removeLeadingNewline;
}

/// Parses `digits` as a hexadecimal number.
///
/// Returns: `true` and stores the number in `result` if every character is a
///          hex digit; `false` otherwise. Callers pass at most 8 digits, so
///          the value always fits in a `uint`.
private bool tryParseHex(const(char)[] digits, out uint result)
{
    uint accumulated = 0;

    foreach (char c; digits)
    {
        uint nibble;

        if (c >= '0' && c <= '9')
        {
            nibble = c - '0';
        }
        else if (c >= 'a' && c <= 'f')
        {
            nibble = c - 'a' + 10;
        }
        else if (c >= 'A' && c <= 'F')
        {
            nibble = c - 'A' + 10;
        }
        else
        {
            return false;
        }

        accumulated = (accumulated << 4) | nibble;
    }

    result = accumulated;
    return true;
}

/// Opposite of toml_foolery.encode.string.escaped
///
/// Resolves the escape sequences of a TOML basic string in a single
/// left-to-right pass, so the output of one escape is never re-read as the
/// start of another (`\\u0041` yields the text `\u0041`, not `A`).
///
/// Recognised escapes: `\"`, `\\`, `\b`, `\f`, `\n`, `\r`, `\t`, `\uXXXX` and
/// `\UXXXXXXXX`. A high-surrogate `\uXXXX` immediately followed by a
/// low-surrogate `\uXXXX` is combined into one code point. Unknown or
/// malformed escapes are copied through unchanged.
///
/// Params:
///     s = string contents without the surrounding quotes.
///
/// Returns: `s` with all recognised escapes replaced.
///
/// Throws: `std.utf.UTFException` if a well-formed `\u` / `\U` escape does not
///         denote a Unicode scalar value (a lone surrogate, or a value above
///         U+10FFFF).
private string unescaped(string s)
{
    import std.utf : encode;

    auto output = appender!string;

    // Appends `codePoint` encoded as UTF-8; `encode` rejects invalid values.
    void putCodePoint(uint codePoint)
    {
        char[4] buffer;
        immutable count = encode(buffer, cast(dchar) codePoint);
        output.put(buffer[0 .. count]);
    }

    size_t pos = 0;

    while (pos < s.length)
    {
        if (s[pos] != '\\')
        {
            // Copy the whole run of ordinary characters in one go.
            size_t end = pos + 1;
            while (end < s.length && s[end] != '\\')
            {
                end++;
            }

            output.put(s[pos .. end]);
            pos = end;
            continue;
        }

        // s[pos] is a backslash; `rest` is everything after it.
        immutable rest = s[pos + 1 .. $];

        // Number of input bytes used by a recognised escape (0 = none).
        size_t consumed = 0;

        if (rest.length > 0)
        {
            switch (rest[0])
            {
                case '"':
                    output.put('"');
                    consumed = 2;
                    break;

                case '\\':
                    output.put('\\');
                    consumed = 2;
                    break;

                case 'b':
                    output.put('\b');
                    consumed = 2;
                    break;

                case 'f':
                    output.put('\f');
                    consumed = 2;
                    break;

                case 'n':
                    output.put('\n');
                    consumed = 2;
                    break;

                case 'r':
                    output.put('\r');
                    consumed = 2;
                    break;

                case 't':
                    output.put('\t');
                    consumed = 2;
                    break;

                case 'u':
                {
                    // case \u####
                    uint codePoint;

                    if (rest.length >= 5 && tryParseHex(rest[1 .. 5], codePoint))
                    {
                        consumed = 6;

                        // A high surrogate directly followed by `\u` plus a
                        // low surrogate encodes one supplementary code point.
                        uint low;
                        if (codePoint >= 0xD800 && codePoint <= 0xDBFF
                            && rest.length >= 11
                            && rest[5] == '\\' && rest[6] == 'u'
                            && tryParseHex(rest[7 .. 11], low)
                            && low >= 0xDC00 && low <= 0xDFFF)
                        {
                            codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
                            consumed = 12;
                        }

                        putCodePoint(codePoint);
                    }
                }
                break;

                case 'U':
                {
                    // case \U########
                    uint codePoint;

                    if (rest.length >= 9 && tryParseHex(rest[1 .. 9], codePoint))
                    {
                        putCodePoint(codePoint);
                        consumed = 10;
                    }
                }
                break;

                default:
                    break;
            }
        }

        if (consumed == 0)
        {
            // Unknown or malformed escape: keep the backslash as-is and let
            // the following characters be copied as ordinary text.
            output.put('\\');
            consumed = 1;
        }

        pos += consumed;
    }

    return output.data;
}

/// Removes "line-ending backslashes" from the still-escaped contents of a
/// multi-line basic string: an unescaped backslash that is followed (after
/// optional spaces or tabs) by a newline is dropped together with that
/// newline and all whitespace and newlines that follow it.
///
/// The scan is escape-aware: every other backslash is copied together with
/// the character it escapes, so the second backslash of `\\` is never taken
/// for a line-ending backslash.
///
/// Params:
///     value = string contents that have not been unescaped yet.
///
/// Returns: `value` without line-ending backslashes and the whitespace they
///          swallow.
private string removeEscapedLinebreaks(string value)
{
    auto output = appender!string;
    size_t pos = 0;

    while (pos < value.length)
    {
        if (value[pos] != '\\')
        {
            // Copy the whole run of ordinary characters in one go.
            size_t end = pos + 1;
            while (end < value.length && value[end] != '\\')
            {
                end++;
            }

            output.put(value[pos .. end]);
            pos = end;
            continue;
        }

        // Look past spaces/tabs after the backslash for a line break.
        size_t scan = pos + 1;
        while (scan < value.length && (value[scan] == ' ' || value[scan] == '\t'))
        {
            scan++;
        }

        immutable bool endsLine =
            scan < value.length
            && (value[scan] == '\n'
                || (value[scan] == '\r'
                    && scan + 1 < value.length
                    && value[scan + 1] == '\n'));

        if (endsLine)
        {
            // Swallow the line break and everything blank after it.
            while (scan < value.length
                && (value[scan] == ' ' || value[scan] == '\t'
                    || value[scan] == '\r' || value[scan] == '\n'))
            {
                scan++;
            }

            pos = scan;
        }
        else
        {
            // Ordinary escape: keep the backslash and the escaped character
            // together so the latter is not re-examined on its own.
            immutable end = min(pos + 2, value.length);
            output.put(value[pos .. end]);
            pos = end;
        }
    }

    return output.data;
}

/// For multiline strings, remove the newline immediately following the opening quotes
/// if one exists. Accepts both LF and CRLF; any other input (including an
/// empty string) is returned unchanged.
private string removeLeadingNewline(string value)
{
    if (value.startsWith("\r\n"))
    {
        return value[2 .. $];
    }

    if (value.startsWith("\n"))
    {
        return value[1 .. $];
    }

    return value;
}

@("Basic — Simple")
unittest
{
    parseTomlBasicString(`"Hello World!"`).should.equal("Hello World!");
}

@("Basic — Tabs")
unittest
{
    parseTomlBasicString("\"Hello\tWorld!\"").should.equal("Hello\tWorld!");
}

@("Basic — Escaped chars")
unittest
{
    parseTomlBasicString(`"\"Hello\n\tWorld!\""`).should.equal("\"Hello\n\tWorld!\"");
}

@("Basic — Empty")
unittest
{
    parseTomlString(`""`).should.equal("");
}

@("Basic — Escaped backslash before 'u' is not a unicode escape")
unittest
{
    parseTomlBasicString(`"\\u0041"`).should.equal(`\u0041`);
}

@("ML Basic — Simple")
unittest
{
    parseTomlBasicMultiLineString("\"\"\"Hello\nWorld!\"\"\"").should.equal("Hello\nWorld!");
}

@("ML Basic — Leading Newline")
unittest
{
    parseTomlBasicMultiLineString("\"\"\"\nHello\nWorld!\n\"\"\"").should.equal("Hello\nWorld!\n");
}

@("ML Basic — Trailing backslash")
unittest
{
    parseTomlBasicMultiLineString("\"\"\"Hello \\\n World!\"\"\"").should.equal("Hello World!");
}

@("ML Basic — Trailing backslash followed by whitespace")
unittest
{
    parseTomlBasicMultiLineString("\"\"\"Hello \\  \n  World!\"\"\"").should.equal("Hello World!");
}

@("ML Basic — Escaped backslash before newline is kept")
unittest
{
    parseTomlBasicMultiLineString("\"\"\"a\\\\\nb\"\"\"").should.equal("a\\\nb");
}

@("ML Basic — Escaped leading newline is kept")
unittest
{
    parseTomlBasicMultiLineString(`"""\nHello"""`).should.equal("\nHello");
}

@("ML Basic — Empty and single character")
unittest
{
    parseTomlString(`""""""`).should.equal("");
    parseTomlString(`"""a"""`).should.equal("a");
}

@("ML Basic — CRLF")
unittest
{
    parseTomlBasicMultiLineString("\"\"\"\r\nHello\r\nWorld!\"\"\"").should.equal("Hello\r\nWorld!");
}

@("Literal — Simple")
unittest
{
    parseTomlLiteralString("'Hello World!'").should.equal("Hello World!");
}

@("Literal — Empty")
unittest
{
    parseTomlString(`''`).should.equal("");
}

@("Literal — Escaped chars")
unittest
{
    parseTomlLiteralString(`'Hello\nWorld!'`).should.equal("Hello\\nWorld!");
}

@("ML Literal — Simple")
unittest
{
    parseTomlLiteralMultiLineString("'''Hello\nWorld!'''").should.equal("Hello\nWorld!");
}

@("ML Literal — Leading Newline")
unittest
{
    parseTomlLiteralMultiLineString("'''\n Hello\nWorld!\n'''").should.equal(" Hello\nWorld!\n");
}

@("ML Literal — Trailing backslash")
unittest
{
    parseTomlLiteralMultiLineString("'''Hello \\\n World!'''").should.equal("Hello \\\n World!");
}

@("ML Literal — CRLF")
unittest
{
    parseTomlLiteralMultiLineString("'''Hello\r\nWorld!'''").should.equal("Hello\r\nWorld!");
}

@(`Basic — Decode \u####`)
unittest
{
    parseTomlBasicString(`"\uD834\uDD1E"`).should.equal("𝄞");
}

@(`Basic — Decode \U########`)
unittest
{
    parseTomlBasicString(`"\U000132f9"`).should.equal("𓋹");
}

@(`ML Basic — Decode \u####`)
unittest
{
    parseTomlBasicMultiLineString(`"""\uD834\uDD1E"""`).should.equal("𝄞");
}

@(`ML Basic — Decode \U########`)
unittest
{
    parseTomlBasicMultiLineString(`"""\U000132f9"""`).should.equal("𓋹");
}

@(`Literal — Don't Decode \u####`)
unittest
{
    parseTomlLiteralString(`'\uD834\uDD1E'`).should.equal(`\uD834\uDD1E`);
}

@(`Literal — Don't Decode \U########`)
unittest
{
    parseTomlLiteralString(`'\U000132f9'`).should.equal(`\U000132f9`);
}

@(`ML Literal — Don't Decode \u####`)
unittest
{
    parseTomlLiteralMultiLineString(`'''\uD834\uDD1E'''`).should.equal(`\uD834\uDD1E`);
}

@(`ML Literal — Don't Decode \U########`)
unittest
{
    parseTomlLiteralMultiLineString(`'''\U000132f9'''`).should.equal(`\U000132f9`);
}