Skip to content

Commit 8c41c52

Browse files
sungam3r and Shane32 authored
Small changes for string parsing (#140)
* Small changes for string parsing * Update src/GraphQLParser/LexerContext.cs Co-authored-by: Shane Krueger <[email protected]> Co-authored-by: Shane Krueger <[email protected]>
1 parent cb70e0d commit 8c41c52

File tree

2 files changed

+52
-42
lines changed

2 files changed

+52
-42
lines changed

src/GraphQLParser.Tests/Validation/LexerValidationTests.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -370,10 +370,10 @@ public void Lex_UnescapedControlChar_Blockstring_ThrowsExceptionWithCorrectMessa
370370
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"contains unescaped \u0007 control char".Lex());
371371

372372
exception.Message.ShouldBe(
373-
"Syntax Error GraphQL (1:23) Invalid character within BlockString: \\u0007.\n" +
373+
"Syntax Error GraphQL (1:23) Invalid character within block string: \\u0007.\n" +
374374
"1: \"\"\"contains unescaped \\u0007 control char\n" +
375375
" ^\n");
376-
exception.Description.ShouldBe("Invalid character within BlockString: \\u0007.");
376+
exception.Description.ShouldBe("Invalid character within block string: \\u0007.");
377377
exception.Line.ShouldBe(1);
378378
exception.Column.ShouldBe(23);
379379
}
@@ -412,10 +412,10 @@ public void Lex_UnterminatedBlockString_ThrowsExceptionWithCorrectMessage()
412412
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"".Lex());
413413

414414
exception.Message.ShouldBe(
415-
"Syntax Error GraphQL (1:4) Unterminated string.\n" +
415+
"Syntax Error GraphQL (1:4) Unterminated block string.\n" +
416416
"1: \"\"\"\n" +
417417
" ^\n");
418-
exception.Description.ShouldBe("Unterminated string.");
418+
exception.Description.ShouldBe("Unterminated block string.");
419419
exception.Line.ShouldBe(1);
420420
exception.Column.ShouldBe(4);
421421
}
@@ -426,10 +426,10 @@ public void Lex_UnterminatedBlockStringWithText_ThrowsExceptionWithCorrectMessag
426426
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"no end triple-quote\"\"".Lex());
427427

428428
exception.Message.ShouldBe(
429-
"Syntax Error GraphQL (1:25) Unterminated string.\n" +
429+
"Syntax Error GraphQL (1:25) Unterminated block string.\n" +
430430
"1: \"\"\"no end triple-quote\"\"\n" +
431431
" ^\n");
432-
exception.Description.ShouldBe("Unterminated string.");
432+
exception.Description.ShouldBe("Unterminated block string.");
433433
exception.Line.ShouldBe(1);
434434
exception.Column.ShouldBe(25);
435435
}

src/GraphQLParser/LexerContext.cs

Lines changed: 46 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,9 @@ public Token GetToken()
4848

4949
if (code == '"')
5050
{
51-
if (_currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"')
52-
{
53-
return ReadBlockString();
54-
}
55-
else
56-
{
57-
return ReadString();
58-
}
51+
return _currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"'
52+
? ReadBlockString()
53+
: ReadString();
5954
}
6055

6156
return Throw_From_GetToken2(code);
@@ -133,7 +128,11 @@ private Token ReadComment()
133128
int start = _currentIndex;
134129
char code = NextCode();
135130

136-
Span<char> buffer = stackalloc char[4096];
131+
// The buffer on the stack makes it possible to avoid intermediate heap allocations if the string
132+
// 1) not too long
133+
// or
134+
// 2) does not contain escape sequences.
135+
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
137136
StringBuilder? sb = null;
138137

139138
int index = 0;
@@ -149,8 +148,7 @@ private Token ReadComment()
149148
}
150149
catch (IndexOutOfRangeException) // fallback to StringBuilder in case of buffer overflow
151150
{
152-
if (sb == null)
153-
sb = new StringBuilder(buffer.Length * 2);
151+
sb ??= new StringBuilder(buffer.Length * 2);
154152

155153
for (int i = 0; i < buffer.Length; ++i)
156154
sb.Append(buffer[i]);
@@ -181,16 +179,25 @@ private Token ReadComment()
181179
);
182180
}
183181

182+
// TODO: this method can still be optimized to not allocate at all if the block string:
183+
//
184+
// 1) not too long
185+
// 2) has no escape sequences
186+
// 3) has no '\r' characters
187+
// 4) has no initial whitespace on each line, ignoring the first line (or, has no '\n' characters)
188+
//
189+
// In this case, the ROM for the returned token represents an unmodified part of the source ROM,
190+
// so it can simply be sliced from '_source', as is done in the simpler ReadString method.
184191
private Token ReadBlockString()
185192
{
186-
int start = _currentIndex += 2;
193+
int start = _currentIndex += 2; // skip ""
187194
char code = NextCode();
188195

189-
Span<char> buffer = stackalloc char[4096];
196+
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
190197
StringBuilder? sb = null;
191198

192199
int index = 0;
193-
bool escape = false; //when the last character was \
200+
bool escape = false; // when the last character was \
194201
bool lastWasCr = false;
195202

196203
while (_currentIndex < _source.Length)
@@ -200,30 +207,30 @@ private Token ReadBlockString()
200207
Throw_From_ReadBlockString1(code);
201208
}
202209

203-
//check for """
210+
// check for """
204211
if (code == '"' && _currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"')
205212
{
206-
//if last character was \ then go ahead and write out the """, skipping the \
213+
// if last character was \ then go ahead and write out the """, skipping the \
207214
if (escape)
208215
{
209216
escape = false;
210217
}
211218
else
212219
{
213-
//end of blockstring
220+
// end of block string
214221
break;
215222
}
216223
}
217224
else if (escape)
218225
{
219-
//last character was \ so write the \ and then retry this character with escaped = false
226+
// last character was \ so write the \ and then retry this character with escaped = false
220227
code = '\\';
221228
_currentIndex--;
222229
escape = false;
223230
}
224231
else if (code == '\\')
225232
{
226-
//this character is a \ so don't write anything yet, but check the next character
233+
// this character is a \ so don't write anything yet, but check the next character
227234
escape = true;
228235
code = NextCode();
229236
lastWasCr = false;
@@ -237,15 +244,14 @@ private Token ReadBlockString()
237244

238245
if (!(lastWasCr && code == '\n'))
239246
{
240-
//write code
247+
// write code
241248
if (index < buffer.Length)
242249
{
243250
buffer[index++] = code == '\r' ? '\n' : code;
244251
}
245252
else // fallback to StringBuilder in case of buffer overflow
246253
{
247-
if (sb == null)
248-
sb = new StringBuilder(buffer.Length * 2);
254+
sb ??= new StringBuilder(buffer.Length * 2);
249255

250256
for (int i = 0; i < buffer.Length; ++i)
251257
sb.Append(buffer[i]);
@@ -262,18 +268,18 @@ private Token ReadBlockString()
262268

263269
if (_currentIndex >= _source.Length)
264270
{
265-
Throw_From_ReadString2();
271+
Throw_From_ReadBlockString2();
266272
}
267-
_currentIndex += 2;
273+
_currentIndex += 2; // skip ""
268274

269275
if (sb != null)
270276
{
271277
for (int i = 0; i < index; ++i)
272278
sb.Append(buffer[i]);
273279
}
274280

275-
//at this point, if sb != null, then sb has the whole string, otherwise buffer (of length index) has the whole string
276-
//also, all line termination combinations have been replaced with LF
281+
// at this point, if sb != null, then sb has the whole string, otherwise buffer (of length index) has the whole string
282+
// also, all line termination combinations have been replaced with LF
277283

278284
ROM value;
279285
if (sb != null)
@@ -297,11 +303,11 @@ private Token ReadBlockString()
297303

298304
static ROM ProcessBuffer(Span<char> buffer)
299305
{
300-
//scan string to determine maximum valid commonIndent value,
301-
//number of initial blank lines, and number of trailing blank lines
306+
// scan string to determine maximum valid commonIndent value,
307+
// number of initial blank lines, and number of trailing blank lines
302308
int commonIndent = int.MaxValue;
303309
int initialBlankLines = 1;
304-
int skipLinesAfter; //skip all text after line ###, as determined by the number of trailing blank lines
310+
int skipLinesAfter; // skip all text after line ###, as determined by the number of trailing blank lines
305311
{
306312
int trailingBlankLines = 0;
307313
int line = 0;
@@ -347,8 +353,8 @@ static ROM ProcessBuffer(Span<char> buffer)
347353
skipLinesAfter = lines - trailingBlankLines;
348354
}
349355

350-
//step through the input, skipping the initial blank lines and the trailing blank lines,
351-
//and skipping the initial blank characters from the start of each line
356+
// step through the input, skipping the initial blank lines and the trailing blank lines,
357+
// and skipping the initial blank characters from the start of each line
352358
Span<char> output = buffer.Length <= 4096 ? stackalloc char[buffer.Length] : new char[buffer.Length];
353359
int outputIndex = 0;
354360
{
@@ -373,7 +379,7 @@ static ROM ProcessBuffer(Span<char> buffer)
373379
}
374380
}
375381

376-
//return the string value from the output buffer
382+
// return the string value from the output buffer
377383
return output.Slice(0, outputIndex).ToString();
378384
}
379385
}
@@ -383,7 +389,7 @@ private Token ReadString()
383389
int start = _currentIndex;
384390
char code = NextCode();
385391

386-
Span<char> buffer = stackalloc char[4096];
392+
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
387393
StringBuilder? sb = null;
388394

389395
int index = 0;
@@ -404,8 +410,7 @@ private Token ReadString()
404410
}
405411
catch (IndexOutOfRangeException) // fallback to StringBuilder in case of buffer overflow
406412
{
407-
if (sb == null)
408-
sb = new StringBuilder(buffer.Length * 2);
413+
sb ??= new StringBuilder(buffer.Length * 2);
409414

410415
for (int i = 0; i < buffer.Length; ++i)
411416
sb.Append(buffer[i]);
@@ -453,7 +458,12 @@ private void Throw_From_ReadString2()
453458

454459
private void Throw_From_ReadBlockString1(char code)
455460
{
456-
throw new GraphQLSyntaxErrorException($"Invalid character within BlockString: \\u{(int)code:D4}.", _source, _currentIndex);
461+
throw new GraphQLSyntaxErrorException($"Invalid character within block string: \\u{(int)code:D4}.", _source, _currentIndex);
462+
}
463+
464+
private void Throw_From_ReadBlockString2()
465+
{
466+
throw new GraphQLSyntaxErrorException("Unterminated block string.", _source, _currentIndex);
457467
}
458468

459469
// sets escaped only to true

0 commit comments

Comments
 (0)