/// Decoding of TOML string tokens (basic, literal and their multi-line forms).
module toml_foolery.decode.types.string;
2 
3 import std.algorithm;
4 import std.array;
5 import std.conv : to;
6 import std.regex;
7 import std.string : strip;
8 import std.uni;
9 
10 version(unittest) import dshould;
11 
12 
/// Decodes a TOML string token, delimiters included, into its value.
///
/// The opening delimiter selects the parser: `"""` multi-line basic,
/// `'''` multi-line literal, `"` basic; anything else is handed to the
/// literal-string parser.
///
/// Params:
///     value = the raw token, including its surrounding quotes.
/// Returns: the decoded contents of the token.
package(toml_foolery.decode) string parseTomlString(string value)
{
    // startsWith never reads past the end of the token, so two-character
    // tokens such as the empty strings `""` and `''` are dispatched correctly
    // instead of tripping a range violation on a fixed-width value[0..3] slice.
    return
        value.startsWith(`"""`) ? parseTomlBasicMultiLineString(value) :
        value.startsWith(`'''`) ? parseTomlLiteralMultiLineString(value) :
        value.startsWith(`"`)   ? parseTomlBasicString(value) :
                                  parseTomlLiteralString(value);
}
21 
/// Decodes a single-line basic string: drops the enclosing double quotes and
/// resolves the backslash escapes in what remains.
private string parseTomlBasicString(string value)
{
    string inner = value[1 .. $ - 1];
    return unescaped(inner);
}
26 
/// Decodes a multi-line basic string (`"""..."""`).
///
/// Steps, in order: strip the three-character delimiters, drop a literal
/// newline that immediately follows the opening delimiter, resolve backslash
/// escapes, then remove line-ending backslashes together with the whitespace
/// that follows them.
///
/// Params:
///     value = the raw token, including both `"""` delimiters.
/// Returns: the decoded contents.
private string parseTomlBasicMultiLineString(string value)
{
    // Only a *literal* newline right after the opening delimiter is trimmed.
    // Trimming has to happen on the raw text: doing it after unescaping would
    // also swallow a newline that was written as the escape sequence \n at
    // the very start of the string.
    string content = value[3 .. $ - 3].removeLeadingNewline;

    // NOTE(review): because escapes are resolved before line-ending
    // backslashes are removed, an escaped backslash (\\) directly followed by
    // a line break is still treated as a line continuation. Fixing that needs
    // a single-pass decoder.
    return content.unescaped.removeEscapedLinebreaks;
}
31 
/// Decodes a single-line literal string: the text between the enclosing
/// single quotes is returned verbatim, with no escape processing.
private string parseTomlLiteralString(string value)
{
    immutable size_t closingQuote = value.length - 1;
    return value[1 .. closingQuote];
}
36 
/// Decodes a multi-line literal string (`'''...'''`): strips the
/// three-character delimiters and a newline directly after the opening one.
/// No escape processing is performed.
private string parseTomlLiteralMultiLineString(string value)
{
    string inner = value[3 .. $ - 3];
    return removeLeadingNewline(inner);
}
41 
/// Opposite of toml_foolery.encode.string.escaped
///
/// Resolves the backslash escapes of a TOML basic string: `\"`, `\\`, `\b`,
/// `\f`, `\n`, `\r`, `\t`, `\uXXXX` and `\UXXXXXXXX`.
///
/// All escapes are decoded in a single left-to-right pass. Decoding the simple
/// escapes first and the unicode escapes in a second pass would wrongly turn
/// an escaped backslash followed by `u0041` (i.e. `\\u0041`) into `A`.
///
/// Params:
///     s = string contents without the surrounding quotes.
/// Returns: `s` with every recognised escape sequence replaced by the
///     character(s) it denotes; everything else is left untouched.
private string unescaped(string s)
{
    // One alternation for every escape kind. Consecutive \uXXXX escapes are
    // matched as one run so surrogate pairs can be transcoded together.
    enum auto escapeMatcher = ctRegex!(
        `\\(?:["\\bfnrt]|U[0-9a-fA-F]{8}|u[0-9a-fA-F]{4}(?:\\u[0-9a-fA-F]{4})*)`,
        "g"
    );

    return s.replaceAll!((Captures!string captures)
        {
            string hit = captures.hit;

            // hit[0] is always the backslash; hit[1] identifies the escape.
            switch (hit[1])
            {
                case '"':  return "\"";
                case '\\': return "\\";
                case 'b':  return "\b";
                case 'f':  return "\f";
                case 'n':  return "\n";
                case 'r':  return "\r";
                case 't':  return "\t";

                case 'u':
                    // case \u#### (possibly several in a row)
                    //
                    // Code adapted from:
                    // https://forum.dlang.org/post/n0bai6$ag0$1@digitalmars.com
                    //
                    // Some code units may be surrogates rather than standalone
                    // code points, so the whole run is collected as UTF-16
                    // (wchar) and converted to UTF-8 in one go.
                    return hit
                        .splitter(`\u`)
                        .filter!((e) => e.length != 0)
                        .map!((e) => e.to!int(16).to!wchar)
                        .array
                        .to!string;

                case 'U':
                    // case \U######## — a full code point, so dchar is used.
                    return hit[2 .. $].to!int(16).to!dchar.to!string;

                default:
                    assert(false, "Unexpected capture: " ~ hit);
            }
        }
    )(escapeMatcher);
}
89 
/// Removes every "line-ending backslash": a backslash directly followed by a
/// line break (LF or CRLF), together with all whitespace that follows it.
private string removeEscapedLinebreaks(string value)
{
    enum auto lineContinuation = ctRegex!(`\\\r?\n\s*`, "g");
    string joined = replaceAll(value, lineContinuation, "");
    return joined;
}
95 
/// For multiline strings, remove the newline immediately following the opening quotes
/// if one exists.
///
/// Params:
///     value = string contents with the delimiters already stripped; may be
///             empty or a single character.
/// Returns: `value` without its leading `\n` or `\r\n`, or `value` unchanged
///     when it does not start with a line break.
private string removeLeadingNewline(string value)
{
    // Length guards keep the index/slice in bounds for empty and
    // one-character contents (e.g. `""""""` or `"""a"""`).
    if (value.length >= 1 && value[0] == '\n')
    {
        return value[1 .. $];
    }

    if (value.length >= 2 && value[0 .. 2] == "\r\n")
    {
        return value[2 .. $];
    }

    return value;
}
113 
@("Basic — Simple")
unittest
{
    // Plain text: only the enclosing quotes are removed.
    string decoded = parseTomlBasicString(`"Hello World!"`);
    decoded.should.equal("Hello World!");
}

@("Basic — Tabs")
unittest
{
    // A raw tab character inside the string is passed through unchanged.
    string decoded = parseTomlBasicString("\"Hello\tWorld!\"");
    decoded.should.equal("Hello\tWorld!");
}

@("Basic — Escaped chars")
unittest
{
    // Escape sequences for quote, newline and tab are resolved.
    string decoded = parseTomlBasicString(`"\"Hello\n\tWorld!\""`);
    decoded.should.equal("\"Hello\n\tWorld!\"");
}
131 
@("ML Basic — Simple")
unittest
{
    // An embedded line break is kept as-is.
    string decoded = parseTomlBasicMultiLineString("\"\"\"Hello\nWorld!\"\"\"");
    decoded.should.equal("Hello\nWorld!");
}

@("ML Basic — Leading Newline")
unittest
{
    // The newline directly after the opening delimiter is dropped; the
    // trailing one is kept.
    string decoded = parseTomlBasicMultiLineString("\"\"\"\nHello\nWorld!\n\"\"\"");
    decoded.should.equal("Hello\nWorld!\n");
}

@("ML Basic — Trailing backslash")
unittest
{
    // A line-ending backslash swallows the line break and the indentation
    // of the following line.
    string decoded = parseTomlBasicMultiLineString("\"\"\"Hello \\\n    World!\"\"\"");
    decoded.should.equal("Hello World!");
}

@("ML Basic — CRLF")
unittest
{
    // A leading CRLF is dropped; CRLF inside the text is preserved.
    string decoded = parseTomlBasicMultiLineString("\"\"\"\r\nHello\r\nWorld!\"\"\"");
    decoded.should.equal("Hello\r\nWorld!");
}
155 
@("Literal — Simple")
unittest
{
    // Only the enclosing single quotes are removed.
    string decoded = parseTomlLiteralString("'Hello World!'");
    decoded.should.equal("Hello World!");
}

@("Literal — Escaped chars")
unittest
{
    // Backslash sequences are not interpreted in literal strings.
    string decoded = parseTomlLiteralString(`'Hello\nWorld!'`);
    decoded.should.equal("Hello\\nWorld!");
}
167 
@("ML Literal — Simple")
unittest
{
    // An embedded line break is kept as-is.
    string decoded = parseTomlLiteralMultiLineString("'''Hello\nWorld!'''");
    decoded.should.equal("Hello\nWorld!");
}

@("ML Literal — Leading Newline")
unittest
{
    // Only the newline directly after the opening delimiter is dropped;
    // the indentation that follows it stays.
    string decoded = parseTomlLiteralMultiLineString("'''\n Hello\nWorld!\n'''");
    decoded.should.equal(" Hello\nWorld!\n");
}

@("ML Literal — Trailing backslash")
unittest
{
    // A line-ending backslash has no special meaning in a literal string.
    string decoded = parseTomlLiteralMultiLineString("'''Hello \\\n    World!'''");
    decoded.should.equal("Hello \\\n    World!");
}
185 
@("ML Literal — CRLF")
unittest
{
    // CRLF inside a multi-line literal string is preserved. This exercises the
    // literal parser, matching the test name and the ''' delimiters of the
    // input (the basic multi-line parser was being called here by mistake).
    parseTomlLiteralMultiLineString("'''Hello\r\nWorld!'''").should.equal("Hello\r\nWorld!");
}
191 
@(`Basic — Decode \u####`)
unittest
{
    // Two \u escapes forming a surrogate pair decode to one code point.
    string decoded = parseTomlBasicString(`"\uD834\uDD1E"`);
    decoded.should.equal("𝄞");
}

@(`Basic — Decode \U########`)
unittest
{
    // An eight-digit \U escape decodes to the code point it names.
    string decoded = parseTomlBasicString(`"\U000132f9"`);
    decoded.should.equal("𓋹");
}

@(`ML Basic — Decode \u####`)
unittest
{
    // Same surrogate-pair decoding inside a multi-line basic string.
    string decoded = parseTomlBasicMultiLineString(`"""\uD834\uDD1E"""`);
    decoded.should.equal("𝄞");
}

@(`ML Basic — Decode \U########`)
unittest
{
    // Same \U decoding inside a multi-line basic string.
    string decoded = parseTomlBasicMultiLineString(`"""\U000132f9"""`);
    decoded.should.equal("𓋹");
}
215 
@(`Literal — Don't Decode \u####`)
unittest
{
    // Literal strings keep \u sequences verbatim.
    string decoded = parseTomlLiteralString(`'\uD834\uDD1E'`);
    decoded.should.equal(`\uD834\uDD1E`);
}

@(`Literal — Don't Decode \U########`)
unittest
{
    // Literal strings keep \U sequences verbatim.
    string decoded = parseTomlLiteralString(`'\U000132f9'`);
    decoded.should.equal(`\U000132f9`);
}

@(`ML Literal — Don't Decode \u####`)
unittest
{
    // Multi-line literal strings keep \u sequences verbatim.
    string decoded = parseTomlLiteralMultiLineString(`'''\uD834\uDD1E'''`);
    decoded.should.equal(`\uD834\uDD1E`);
}

@(`ML Literal — Don't Decode \U########`)
unittest
{
    // Multi-line literal strings keep \U sequences verbatim.
    string decoded = parseTomlLiteralMultiLineString(`'''\U000132f9'''`);
    decoded.should.equal(`\U000132f9`);
}