module toml_foolery.decode.types.string;

import std.algorithm;
import std.array;
import std.conv : to;
import std.regex;
import std.string : strip;
import std.uni;
import std.utf : UTFException;
import toml_foolery.decode.exceptions;

version(unittest) import exceeds_expectations;


/**
 * Decodes a complete TOML string token (delimiters included) into the string
 * value it denotes.
 *
 * The kind of string is chosen from the opening delimiter:
 * `"""` (multi-line basic), `'''` (multi-line literal), `"` (basic);
 * anything else is treated as a single-quoted literal string.
 *
 * Params:
 *     value = the raw token, including its opening and closing delimiters.
 *
 * Returns: the decoded string contents.
 *
 * Throws: TomlDecodingException if a `\u`/`\U` escape cannot be transcoded
 *         to UTF-8 (a UTFException was raised while decoding).
 */
package(toml_foolery.decode) string parseTomlString(string value)
{
    // Prefix tests rather than fixed-width slices: the empty strings `""` and
    // `''` are only two characters long, so `value[0..3]` would be out of
    // bounds for them.
    if (value.startsWith(`"""`)) return parseTomlBasicMultiLineString(value);
    if (value.startsWith(`'''`)) return parseTomlLiteralMultiLineString(value);
    if (value.startsWith(`"`))   return parseTomlBasicString(value);
    return parseTomlLiteralString(value);
}

/// Decodes a basic string: drops the surrounding `"` and resolves escapes.
/// The token must include both delimiters.
private string parseTomlBasicString(string value)
{
    return value[1 .. $-1].unescaped;
}

/// Decodes a multi-line basic string: drops the surrounding `"""`, trims the
/// newline directly after the opening delimiter, then resolves escapes and
/// line-ending backslashes. The token must include both delimiters.
private string parseTomlBasicMultiLineString(string value)
{
    // The leading newline is trimmed from the *raw* text so that an escaped
    // `\n` at the very start is not mistaken for it after decoding.
    return value[3 .. $-3].removeLeadingNewline.unescapedMultiLine;
}

/// Decodes a literal string: drops the surrounding `'`; contents are verbatim.
/// The token must include both delimiters.
private string parseTomlLiteralString(string value)
{
    return value[1 .. $-1];
}

/// Decodes a multi-line literal string: drops the surrounding `'''` and trims
/// the newline directly after the opening delimiter; the rest is verbatim.
/// The token must include both delimiters.
private string parseTomlLiteralMultiLineString(string value)
{
    return value[3 .. $-3].removeLeadingNewline;
}

/// Regex source matching one escape token: a run of `\uXXXX` escapes, a single
/// `\UXXXXXXXX` escape, or one of the simple escapes `\" \\ \b \f \n \r \t`.
private enum string escapeSequencePattern =
    `(?:\\u[0-9a-fA-F]{4})+|\\U[0-9a-fA-F]{8}|\\["\\bfnrt]`;

/// Regex source matching a line-ending backslash: the backslash, the line
/// break (LF or CRLF), and all whitespace that follows it.
private enum string lineEndingBackslashPattern = `\\\r?\n\s*`;

/// Regex source for everything that must be rewritten inside a multi-line
/// basic string.
private enum string multiLineTokenPattern =
    lineEndingBackslashPattern ~ `|` ~ escapeSequencePattern;

/// Opposite of toml_foolery.encode.string.escaped
///
/// Resolves every escape sequence in `s` in a single left-to-right pass, so
/// the output of one escape is never re-read as the start of another
/// (e.g. `\\u0041` yields a backslash followed by `u0041`, not `A`).
///
/// Throws: TomlDecodingException if a unicode escape cannot be transcoded.
private string unescaped(string s)
{
    enum auto escapeSequence = ctRegex!(escapeSequencePattern, "g");

    return s.replaceAll!(
        (Captures!string captures) => decodeEscape(captures.hit)
    )(escapeSequence);
}

/// Like `unescaped`, but additionally removes line-ending backslashes
/// (backslash, line break, and the whitespace after it) in the same pass, so
/// an escaped backslash at the end of a line is not mistaken for one.
///
/// Throws: TomlDecodingException if a unicode escape cannot be transcoded.
private string unescapedMultiLine(string s)
{
    enum auto token = ctRegex!(multiLineTokenPattern, "g");

    return s.replaceAll!(
        (Captures!string captures) => decodeEscape(captures.hit)
    )(token);
}

/// Maps one regex hit (always starting with a backslash) to its replacement.
private string decodeEscape(string hit)
{
    assert(hit.length >= 2 && hit[0] == '\\', "Unexpected capture: " ~ hit);

    switch (hit[1])
    {
        case '"':  return "\"";
        case '\\': return "\\";
        case 'b':  return "\b";
        case 'f':  return "\f";
        case 'n':  return "\n";
        case 'r':  return "\r";
        case 't':  return "\t";

        // Line-ending backslash: the whole hit is dropped.
        case '\r', '\n':
            return "";

        case 'u': return decodeShortUnicodeEscapes(hit);
        case 'U': return decodeLongUnicodeEscape(hit);

        default:
            assert(false, "Unexpected capture: " ~ hit);
    }
}

/// Decodes a run of one or more `\uXXXX` escapes.
///
/// Code adapted from: https://forum.dlang.org/post/n0bai6$ag0$1@digitalmars.com
///
/// Some code units may be surrogates rather than standalone code points, so
/// the whole run is collected as UTF-16 code units and transcoded at once;
/// converting them one at a time would fail on each half of a surrogate pair.
private string decodeShortUnicodeEscapes(string hit)
{
    try
    {
        return hit
            .splitter(`\u`)
            .filter!((e) => e.length != 0)
            .map!((e) => e.to!int(16))
            .map!((e) => e.to!wchar)
            .array
            .to!string;
    }
    catch (UTFException e)
    {
        throw new TomlDecodingException("Caught UTFException while decoding a string.", e);
    }
}

/// Decodes a single `\UXXXXXXXX` escape.
private string decodeLongUnicodeEscape(string hit)
{
    try
    {
        return hit[2..$].to!uint(16).to!dchar.to!string;
    }
    catch (UTFException e)
    {
        throw new TomlDecodingException("Caught UTFException while decoding a string.", e);
    }
}

/// For multiline strings, remove the newline immediately following the opening quotes
/// if one exists. Empty and one-character inputs are returned unchanged.
private string removeLeadingNewline(string value)
{
    if (value.startsWith("\r\n"))
    {
        return value[2..$];
    }
    else if (value.startsWith("\n"))
    {
        return value[1..$];
    }
    else
    {
        return value;
    }
}

@("Basic — Simple")
unittest
{
    expect(parseTomlBasicString(`"Hello World!"`)).toEqual("Hello World!");
}

@("Basic — Tabs")
unittest
{
    expect(parseTomlBasicString("\"Hello\tWorld!\"")).toEqual("Hello\tWorld!");
}

@("Basic — Escaped chars")
unittest
{
    expect(parseTomlBasicString(`"\"Hello\n\tWorld!\""`)).toEqual("\"Hello\n\tWorld!\"");
}

@("ML Basic — Simple")
unittest
{
    expect(parseTomlBasicMultiLineString("\"\"\"Hello\nWorld!\"\"\"")).toEqual("Hello\nWorld!");
}

@("ML Basic — Leading Newline")
unittest
{
    expect(parseTomlBasicMultiLineString("\"\"\"\nHello\nWorld!\n\"\"\"")).toEqual("Hello\nWorld!\n");
}

@("ML Basic — Trailing backslash")
unittest
{
    expect(parseTomlBasicMultiLineString("\"\"\"Hello \\\n  World!\"\"\"")).toEqual("Hello World!");
}

@("ML Basic — CRLF")
unittest
{
    expect(parseTomlBasicMultiLineString("\"\"\"\r\nHello\r\nWorld!\"\"\"")).toEqual("Hello\r\nWorld!");
}

@("Literal — Simple")
unittest
{
    expect(parseTomlLiteralString("'Hello World!'")).toEqual("Hello World!");
}

@("Literal — Escaped chars")
unittest
{
    expect(parseTomlLiteralString(`'Hello\nWorld!'`)).toEqual("Hello\\nWorld!");
}

@("ML Literal — Simple")
unittest
{
    expect(parseTomlLiteralMultiLineString("'''Hello\nWorld!'''")).toEqual("Hello\nWorld!");
}

@("ML Literal — Leading Newline")
unittest
{
    expect(parseTomlLiteralMultiLineString("'''\n  Hello\nWorld!\n'''")).toEqual("  Hello\nWorld!\n");
}

@("ML Literal — Trailing backslash")
unittest
{
    expect(parseTomlLiteralMultiLineString("'''Hello \\\n  World!'''")).toEqual("Hello \\\n  World!");
}

@("ML Literal — CRLF")
unittest
{
    expect(parseTomlLiteralMultiLineString("'''Hello\r\nWorld!'''")).toEqual("Hello\r\nWorld!");
}

@(`Basic — Decode \u####`)
unittest
{
    expect(parseTomlBasicString(`"\uD834\uDD1E"`)).toEqual("𝄞");
}

@(`Basic — Decode \U########`)
unittest
{
    expect(parseTomlBasicString(`"\U000132f9"`)).toEqual("𓋹");
}

@(`ML Basic — Decode \u####`)
unittest
{
    expect(parseTomlBasicMultiLineString(`"""\uD834\uDD1E"""`)).toEqual("𝄞");
}

@(`ML Basic — Decode \U########`)
unittest
{
    expect(parseTomlBasicMultiLineString(`"""\U000132f9"""`)).toEqual("𓋹");
}

@(`Literal — Don't Decode \u####`)
unittest
{
    expect(parseTomlLiteralString(`'\uD834\uDD1E'`)).toEqual(`\uD834\uDD1E`);
}

@(`Literal — Don't Decode \U########`)
unittest
{
    expect(parseTomlLiteralString(`'\U000132f9'`)).toEqual(`\U000132f9`);
}

@(`ML Literal — Don't Decode \u####`)
unittest
{
    expect(parseTomlLiteralMultiLineString(`'''\uD834\uDD1E'''`)).toEqual(`\uD834\uDD1E`);
}

@(`ML Literal — Don't Decode \U########`)
unittest
{
    expect(parseTomlLiteralMultiLineString(`'''\U000132f9'''`)).toEqual(`\U000132f9`);
}

@("Dispatch — Empty strings")
unittest
{
    expect(parseTomlString(`""`)).toEqual("");
    expect(parseTomlString(`''`)).toEqual("");
    expect(parseTomlString(`""""""`)).toEqual("");
    expect(parseTomlString(`''''''`)).toEqual("");
}

@(`Basic — Escaped backslash before u####`)
unittest
{
    expect(parseTomlBasicString(`"\\u0041"`)).toEqual(`\u0041`);
}

@("ML Basic — Single character")
unittest
{
    expect(parseTomlBasicMultiLineString(`"""a"""`)).toEqual("a");
}

@("ML Basic — Escaped newline at start is kept")
unittest
{
    expect(parseTomlBasicMultiLineString(`"""\nHello"""`)).toEqual("\nHello");
}

@("ML Basic — Escaped backslash at end of line")
unittest
{
    expect(parseTomlBasicMultiLineString("\"\"\"Hello\\\\\nWorld!\"\"\"")).toEqual("Hello\\\nWorld!");
}