1 module toml_foolery.decode.types..string;
2 
3 import std.algorithm;
4 import std.array;
5 import std.conv : to;
6 import std.regex;
7 import std..string : strip;
8 import std.uni;
9 import std.utf : UTFException;
10 import toml_foolery.decode.exceptions;
11 
12 version(unittest) import exceeds_expectations;
13 
14 
/// Decodes a quoted TOML string token into the text it represents.
///
/// The opening delimiter selects the decoder:
/// `"""` → multi-line basic, `'''` → multi-line literal,
/// `"` → basic, anything else → literal.
///
/// Params:
///     value = the complete token, including its opening and closing quotes.
///
/// Returns: the decoded string contents.
package(toml_foolery.decode) string parseTomlString(string value)
{
    // `startsWith` is used instead of slicing (`value[0..3]`) because slicing
    // goes out of range for tokens shorter than three characters, such as the
    // empty strings `""` and `''`.
    if (value.startsWith(`"""`))
    {
        return parseTomlBasicMultiLineString(value);
    }

    if (value.startsWith(`'''`))
    {
        return parseTomlLiteralMultiLineString(value);
    }

    if (value.startsWith(`"`))
    {
        return parseTomlBasicString(value);
    }

    return parseTomlLiteralString(value);
}
23 
/// Decodes a single-line basic string: drops the surrounding `"` pair and
/// resolves the escape sequences found in between.
private string parseTomlBasicString(string value)
{
    string inner = value[1 .. $ - 1];
    return unescaped(inner);
}
28 
/// Decodes a multi-line basic string (delimited by `"""`).
///
/// Steps, in order:
/// 1. strip the three-quote delimiters;
/// 2. drop a real newline that immediately follows the opening delimiter;
/// 3. resolve escape sequences;
/// 4. remove line-ending backslashes together with the whitespace after them.
///
/// Params:
///     value = the complete token, including both `"""` delimiters.
///
/// Returns: the decoded string contents.
private string parseTomlBasicMultiLineString(string value)
{
    // The leading-newline trim has to see the raw text. If it ran after
    // unescaping, a string that begins with the escape sequence `\n` would
    // lose that (intentional) newline.
    return value[3 .. $-3].removeLeadingNewline.unescaped.removeEscapedLinebreaks;
}
33 
/// Decodes a single-line literal string: the text between the two `'`
/// delimiters is returned verbatim, with no escape processing.
private string parseTomlLiteralString(string value)
{
    immutable size_t closingQuote = value.length - 1;
    return value[1 .. closingQuote];
}
38 
/// Decodes a multi-line literal string (delimited by `'''`): strips the
/// delimiters and the newline directly after the opening one, leaving the
/// rest of the text untouched.
private string parseTomlLiteralMultiLineString(string value)
{
    string inner = value[3 .. $ - 3];
    return removeLeadingNewline(inner);
}
43 
/// Opposite of toml_foolery.encode.string.escaped
///
/// Resolves the escape sequences of a TOML basic string in a single
/// left-to-right pass: `\"`, `\\`, `\b`, `\f`, `\n`, `\r`, `\t`,
/// `\uXXXX` and `\UXXXXXXXX`. Any other text, including backslash sequences
/// not listed here, is copied through unchanged.
///
/// Decoding everything in one pass matters: if the simple escapes were
/// replaced first and the Unicode escapes afterwards, the input `\\u0041`
/// (an escaped backslash followed by the plain text `u0041`) would first
/// become `\u0041` and then be decoded a second time into `A`.
///
/// Params:
///     s = the string contents, without the surrounding quotes.
///
/// Returns: `s` with every recognised escape sequence replaced by the
///          character(s) it denotes.
///
/// Throws: TomlDecodingException if converting the decoded code units to
///         UTF-8 raises a UTFException.
private string unescaped(string s)
{
    // Alternatives, in order: a run of one or more `\uXXXX` escapes,
    // a single `\UXXXXXXXX` escape, or one of the simple one-letter escapes.
    enum auto escapeMatcher = ctRegex!(
        `(?:\\u[0-9a-fA-F]{4})+|\\U[0-9a-fA-F]{8}|\\["\\bfnrt]`,
        "g"
    );

    return s.replaceAll!((Captures!string captures)
        {
            // Unicode decoding code yoinked from:
            // https://forum.dlang.org/post/n0bai6$ag0$1@digitalmars.com
            // Except it needs to be converted to wchar if \u and dchar if \U

            string hit = captures.hit;
            immutable char kind = hit[1];

            if (kind == 'u')
            {
                // case \u####

                try
                {
                    // Since some of code units might not be standalone code points
                    // (surrogates), we match sequences of them and parse them all
                    // at once, so a surrogate pair is transcoded as one unit.
                    return hit
                        .splitter(`\u`)
                        .filter!((e) => e.length != 0)
                        .map!((e) => e.to!int(16))
                        .map!((e) => e.to!wchar)
                        .array
                        .to!string;
                }
                catch (UTFException e)
                {
                    throw new TomlDecodingException("Caught UTFException while decoding a string.", e);
                }
            }

            if (kind == 'U')
            {
                // case \U########

                try
                {
                    return hit[2..$].to!uint(16).to!dchar.to!string;
                }
                catch (UTFException e)
                {
                    throw new TomlDecodingException("Caught UTFException while decoding a string.", e);
                }
            }

            // Simple one-letter escapes.
            switch (kind)
            {
                case '"':  return "\"";
                case '\\': return "\\";
                case 'b':  return "\b";
                case 'f':  return "\f";
                case 'n':  return "\n";
                case 'r':  return "\r";
                case 't':  return "\t";
                default:
                    // The regex cannot match anything else.
                    assert(false, "Unexpected capture: " ~ hit);
            }
        }
    )(escapeMatcher);
}
104 
/// Removes "line-ending backslashes" from the contents of a multi-line basic
/// string: a backslash that is the last non-whitespace character on its line
/// is deleted together with the line break and all whitespace (including
/// further line breaks) that follows it.
///
/// Params:
///     value = the string contents to process.
///
/// Returns: `value` with every line-ending backslash and the whitespace
///          after it removed.
private string removeEscapedLinebreaks(string value)
{
    // `[ \t]*` allows spaces or tabs between the backslash and the newline,
    // which the TOML grammar permits; the newline itself may be LF or CRLF.
    enum auto re = ctRegex!(`\\[ \t]*\r?\n\s*`, "g");
    return value.replaceAll(re, "");
}
110 
/// For multiline strings, remove the newline immediately following the opening quotes
/// if one exists.
///
/// Params:
///     value = the string contents, with the delimiters already stripped.
///
/// Returns: `value` without its first line break (`\n` or `\r\n`) when it
///          starts with one; otherwise `value` unchanged. Empty and
///          one-character inputs are handled.
private string removeLeadingNewline(string value)
{
    // `startsWith` performs its own length checks, so inputs shorter than
    // the prefix are safe (indexing or slicing them would go out of range).
    if (value.startsWith("\r\n"))
    {
        return value[2..$];
    }

    if (value.startsWith("\n"))
    {
        return value[1..$];
    }

    return value;
}
128 
129 @("Basic — Simple")
130 unittest
131 {
132     expect(parseTomlBasicString(`"Hello World!"`)).toEqual("Hello World!");
133 }
134 
135 @("Basic — Tabs")
136 unittest
137 {
138     expect(parseTomlBasicString("\"Hello\tWorld!\"")).toEqual("Hello\tWorld!");
139 }
140 
141 @("Basic — Escaped chars")
142 unittest
143 {
144     expect(parseTomlBasicString(`"\"Hello\n\tWorld!\""`)).toEqual("\"Hello\n\tWorld!\"");
145 }
146 
147 @("ML Basic — Simple")
148 unittest
149 {
150     expect(parseTomlBasicMultiLineString("\"\"\"Hello\nWorld!\"\"\"")).toEqual("Hello\nWorld!");
151 }
152 
153 @("ML Basic — Leading Newline")
154 unittest
155 {
156     expect(parseTomlBasicMultiLineString("\"\"\"\nHello\nWorld!\n\"\"\"")).toEqual("Hello\nWorld!\n");
157 }
158 
159 @("ML Basic — Trailing backslash")
160 unittest
161 {
162     expect(parseTomlBasicMultiLineString("\"\"\"Hello \\\n    World!\"\"\"")).toEqual("Hello World!");
163 }
164 
165 @("ML Basic — CRLF")
166 unittest
167 {
168     expect(parseTomlBasicMultiLineString("\"\"\"\r\nHello\r\nWorld!\"\"\"")).toEqual("Hello\r\nWorld!");
169 }
170 
171 @("Literal — Simple")
172 unittest
173 {
174     expect(parseTomlLiteralString("'Hello World!'")).toEqual("Hello World!");
175 }
176 
177 @("Literal — Escaped chars")
178 unittest
179 {
180     expect(parseTomlLiteralString(`'Hello\nWorld!'`)).toEqual("Hello\\nWorld!");
181 }
182 
183 @("ML Literal — Simple")
184 unittest
185 {
186     expect(parseTomlLiteralMultiLineString("'''Hello\nWorld!'''")).toEqual("Hello\nWorld!");
187 }
188 
189 @("ML Literal — Leading Newline")
190 unittest
191 {
192     expect(parseTomlLiteralMultiLineString("'''\n Hello\nWorld!\n'''")).toEqual(" Hello\nWorld!\n");
193 }
194 
195 @("ML Literal — Trailing backslash")
196 unittest
197 {
198     expect(parseTomlLiteralMultiLineString("'''Hello \\\n    World!'''")).toEqual("Hello \\\n    World!");
199 }
200 
201 @("ML Literal — CRLF")
202 unittest
203 {
204     expect(parseTomlBasicMultiLineString("'''Hello\r\nWorld!'''")).toEqual("Hello\r\nWorld!");
205 }
206 
@(`Basic — Decode \u####`)
unittest
{
    // A surrogate pair written as two \u escapes decodes to one code point.
    string input = `"\uD834\uDD1E"`;
    string expected = "𝄞";
    expect(parseTomlBasicString(input)).toEqual(expected);
}

@(`Basic — Decode \U########`)
unittest
{
    // An eight-digit \U escape decodes directly to its code point.
    string input = `"\U000132f9"`;
    string expected = "𓋹";
    expect(parseTomlBasicString(input)).toEqual(expected);
}

@(`ML Basic — Decode \u####`)
unittest
{
    // Same surrogate-pair decoding inside a multi-line basic string.
    string input = `"""\uD834\uDD1E"""`;
    string expected = "𝄞";
    expect(parseTomlBasicMultiLineString(input)).toEqual(expected);
}

@(`ML Basic — Decode \U########`)
unittest
{
    // Same \U decoding inside a multi-line basic string.
    string input = `"""\U000132f9"""`;
    string expected = "𓋹";
    expect(parseTomlBasicMultiLineString(input)).toEqual(expected);
}
230 
@(`Literal — Don't Decode \u####`)
unittest
{
    // Literal strings keep \u sequences as plain text.
    string input = `'\uD834\uDD1E'`;
    string expected = `\uD834\uDD1E`;
    expect(parseTomlLiteralString(input)).toEqual(expected);
}

@(`Literal — Don't Decode \U########`)
unittest
{
    // Literal strings keep \U sequences as plain text.
    string input = `'\U000132f9'`;
    string expected = `\U000132f9`;
    expect(parseTomlLiteralString(input)).toEqual(expected);
}

@(`ML Literal — Don't Decode \u####`)
unittest
{
    // Multi-line literal strings keep \u sequences as plain text.
    string input = `'''\uD834\uDD1E'''`;
    string expected = `\uD834\uDD1E`;
    expect(parseTomlLiteralMultiLineString(input)).toEqual(expected);
}

@(`ML Literal — Don't Decode \U########`)
unittest
{
    // Multi-line literal strings keep \U sequences as plain text.
    string input = `'''\U000132f9'''`;
    string expected = `\U000132f9`;
    expect(parseTomlLiteralMultiLineString(input)).toEqual(expected);
}