Skip to content

Small changes for string parsing #140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/GraphQLParser.Tests/Validation/LexerValidationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -370,10 +370,10 @@ public void Lex_UnescapedControlChar_Blockstring_ThrowsExceptionWithCorrectMessa
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"contains unescaped \u0007 control char".Lex());

exception.Message.ShouldBe(
"Syntax Error GraphQL (1:23) Invalid character within BlockString: \\u0007.\n" +
"Syntax Error GraphQL (1:23) Invalid character within block string: \\u0007.\n" +
"1: \"\"\"contains unescaped \\u0007 control char\n" +
" ^\n");
exception.Description.ShouldBe("Invalid character within BlockString: \\u0007.");
exception.Description.ShouldBe("Invalid character within block string: \\u0007.");
exception.Line.ShouldBe(1);
exception.Column.ShouldBe(23);
}
Expand Down Expand Up @@ -412,10 +412,10 @@ public void Lex_UnterminatedBlockString_ThrowsExceptionWithCorrectMessage()
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"".Lex());

exception.Message.ShouldBe(
"Syntax Error GraphQL (1:4) Unterminated string.\n" +
"Syntax Error GraphQL (1:4) Unterminated block string.\n" +
"1: \"\"\"\n" +
" ^\n");
exception.Description.ShouldBe("Unterminated string.");
exception.Description.ShouldBe("Unterminated block string.");
exception.Line.ShouldBe(1);
exception.Column.ShouldBe(4);
}
Expand All @@ -426,10 +426,10 @@ public void Lex_UnterminatedBlockStringWithText_ThrowsExceptionWithCorrectMessag
var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"no end triple-quote\"\"".Lex());

exception.Message.ShouldBe(
"Syntax Error GraphQL (1:25) Unterminated string.\n" +
"Syntax Error GraphQL (1:25) Unterminated block string.\n" +
"1: \"\"\"no end triple-quote\"\"\n" +
" ^\n");
exception.Description.ShouldBe("Unterminated string.");
exception.Description.ShouldBe("Unterminated block string.");
exception.Line.ShouldBe(1);
exception.Column.ShouldBe(25);
}
Expand Down
82 changes: 46 additions & 36 deletions src/GraphQLParser/LexerContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,9 @@ public Token GetToken()

if (code == '"')
{
if (_currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"')
{
return ReadBlockString();
}
else
{
return ReadString();
}
return _currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"'
? ReadBlockString()
: ReadString();
}

return Throw_From_GetToken2(code);
Expand Down Expand Up @@ -133,7 +128,11 @@ private Token ReadComment()
int start = _currentIndex;
char code = NextCode();

Span<char> buffer = stackalloc char[4096];
// The buffer on the stack allows to get rid of intermediate heap allocations if the string
// 1) not too long
// or
// 2) does not contain escape sequences.
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
StringBuilder? sb = null;

int index = 0;
Expand All @@ -149,8 +148,7 @@ private Token ReadComment()
}
catch (IndexOutOfRangeException) // fallback to StringBuilder in case of buffer overflow
{
if (sb == null)
sb = new StringBuilder(buffer.Length * 2);
sb ??= new StringBuilder(buffer.Length * 2);

for (int i = 0; i < buffer.Length; ++i)
sb.Append(buffer[i]);
Expand Down Expand Up @@ -181,16 +179,25 @@ private Token ReadComment()
);
}

// TODO: this method can still be optimized no not allocate at all if block string:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Shane32 I am 100% sure that this can be done, but I was not so brave to do it myself :).

//
// 1) not too long
// 2) has no escape sequences
// 3) has no '\r' characters
// 4) has no initial whitespace on each line, ignoring the first line (or, has no '\n' characters)
//
// In this case, ROM for the returned token represents unmodified part of the source ROM,
// so it can be just sliced from '_source' as you can see in more simple ReadString method.
private Token ReadBlockString()
{
int start = _currentIndex += 2;
int start = _currentIndex += 2; // skip ""
char code = NextCode();

Span<char> buffer = stackalloc char[4096];
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
StringBuilder? sb = null;

int index = 0;
bool escape = false; //when the last character was \
bool escape = false; // when the last character was \
bool lastWasCr = false;

while (_currentIndex < _source.Length)
Expand All @@ -200,30 +207,30 @@ private Token ReadBlockString()
Throw_From_ReadBlockString1(code);
}

//check for """
// check for """
if (code == '"' && _currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"')
{
//if last character was \ then go ahead and write out the """, skipping the \
// if last character was \ then go ahead and write out the """, skipping the \
if (escape)
{
escape = false;
}
else
{
//end of blockstring
// end of block string
break;
}
}
else if (escape)
{
//last character was \ so write the \ and then retry this character with escaped = false
// last character was \ so write the \ and then retry this character with escaped = false
code = '\\';
_currentIndex--;
escape = false;
}
else if (code == '\\')
{
//this character is a \ so don't write anything yet, but check the next character
// this character is a \ so don't write anything yet, but check the next character
escape = true;
code = NextCode();
lastWasCr = false;
Expand All @@ -237,15 +244,14 @@ private Token ReadBlockString()

if (!(lastWasCr && code == '\n'))
{
//write code
// write code
if (index < buffer.Length)
{
buffer[index++] = code == '\r' ? '\n' : code;
}
else // fallback to StringBuilder in case of buffer overflow
{
if (sb == null)
sb = new StringBuilder(buffer.Length * 2);
sb ??= new StringBuilder(buffer.Length * 2);

for (int i = 0; i < buffer.Length; ++i)
sb.Append(buffer[i]);
Expand All @@ -262,18 +268,18 @@ private Token ReadBlockString()

if (_currentIndex >= _source.Length)
{
Throw_From_ReadString2();
Throw_From_ReadBlockString2();
}
_currentIndex += 2;
_currentIndex += 2; // skip ""

if (sb != null)
{
for (int i = 0; i < index; ++i)
sb.Append(buffer[i]);
}

//at this point, if sb != null, then sb has the whole string, otherwise buffer (of length index) has the whole string
//also, all line termination combinations have been replaced with LF
// at this point, if sb != null, then sb has the whole string, otherwise buffer (of length index) has the whole string
// also, all line termination combinations have been replaced with LF

ROM value;
if (sb != null)
Expand All @@ -297,11 +303,11 @@ private Token ReadBlockString()

static ROM ProcessBuffer(Span<char> buffer)
{
//scan string to determine maximum valid commonIndent value,
//number of initial blank lines, and number of trailing blank lines
// scan string to determine maximum valid commonIndent value,
// number of initial blank lines, and number of trailing blank lines
int commonIndent = int.MaxValue;
int initialBlankLines = 1;
int skipLinesAfter; //skip all text after line ###, as determined by the number of trailing blank lines
int skipLinesAfter; // skip all text after line ###, as determined by the number of trailing blank lines
{
int trailingBlankLines = 0;
int line = 0;
Expand Down Expand Up @@ -347,8 +353,8 @@ static ROM ProcessBuffer(Span<char> buffer)
skipLinesAfter = lines - trailingBlankLines;
}

//step through the input, skipping the initial blank lines and the trailing blank lines,
//and skipping the initial blank characters from the start of each line
// step through the input, skipping the initial blank lines and the trailing blank lines,
// and skipping the initial blank characters from the start of each line
Span<char> output = buffer.Length <= 4096 ? stackalloc char[buffer.Length] : new char[buffer.Length];
int outputIndex = 0;
{
Expand All @@ -373,7 +379,7 @@ static ROM ProcessBuffer(Span<char> buffer)
}
}

//return the string value from the output buffer
// return the string value from the output buffer
return output.Slice(0, outputIndex).ToString();
}
}
Expand All @@ -383,7 +389,7 @@ private Token ReadString()
int start = _currentIndex;
char code = NextCode();

Span<char> buffer = stackalloc char[4096];
Span<char> buffer = stackalloc char[Math.Min(_source.Length - _currentIndex + 32, 4096)];
StringBuilder? sb = null;

int index = 0;
Expand All @@ -404,8 +410,7 @@ private Token ReadString()
}
catch (IndexOutOfRangeException) // fallback to StringBuilder in case of buffer overflow
{
if (sb == null)
sb = new StringBuilder(buffer.Length * 2);
sb ??= new StringBuilder(buffer.Length * 2);

for (int i = 0; i < buffer.Length; ++i)
sb.Append(buffer[i]);
Expand Down Expand Up @@ -453,7 +458,12 @@ private void Throw_From_ReadString2()

private void Throw_From_ReadBlockString1(char code)
{
throw new GraphQLSyntaxErrorException($"Invalid character within BlockString: \\u{(int)code:D4}.", _source, _currentIndex);
throw new GraphQLSyntaxErrorException($"Invalid character within block string: \\u{(int)code:D4}.", _source, _currentIndex);
}

private void Throw_From_ReadBlockString2()
{
throw new GraphQLSyntaxErrorException("Unterminated block string.", _source, _currentIndex);
}

// sets escaped only to true
Expand Down