diff --git a/src/Directory.Build.props b/src/Directory.Build.props
index 87f8028b..f82dbc2d 100644
--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
@@ -1,7 +1,7 @@
- 7.1.0-preview
+ 7.2.0-preview
 8.0
 true
 true
diff --git a/src/GraphQLParser.Tests/LexerTests.cs b/src/GraphQLParser.Tests/LexerTests.cs
index c100f21a..5169e7f1 100644
--- a/src/GraphQLParser.Tests/LexerTests.cs
+++ b/src/GraphQLParser.Tests/LexerTests.cs
@@ -974,6 +974,48 @@ public void Lex_WhiteSpaceStringToken_HasStringKind()
             token.Kind.ShouldBe(TokenKind.STRING);
         }
 
+        [Theory]
+        [InlineData("test", "test")]
+        [InlineData("te\\\"\"\"st", "te\"\"\"st")]
+        [InlineData("te\\\"\"\"", "te\"\"\"")]
+        [InlineData("\ntest", "test")]
+        [InlineData("\r\ntest", "test")]
+        [InlineData(" \ntest", "test")]
+        [InlineData("\t\ntest", "test")]
+        [InlineData("\n\ntest", "test")]
+        [InlineData("test\nline2", "test\nline2")]
+        [InlineData("test\rline2", "test\nline2")]
+        [InlineData("test\r\nline2", "test\nline2")]
+        [InlineData("test\r\r\nline2", "test\n\nline2")]
+        [InlineData("test\r\n\nline2", "test\n\nline2")]
+        [InlineData("test\n", "test")]
+        [InlineData("test\n ", "test")]
+        [InlineData("test\n\t", "test")]
+        [InlineData("test\n\n", "test")]
+        [InlineData("test\n line2", "test\nline2")]
+        [InlineData("test\n\t\tline2", "test\nline2")]
+        [InlineData("test\n \tline2", "test\nline2")]
+        [InlineData(" test\nline2", " test\nline2")]
+        [InlineData(" test\n line2", " test\nline2")]
+        [InlineData("\n test\n line2", "test\nline2")]
+        [InlineData(" test\n line2\n\t\tline3\n line4", " test\nline2\n\tline3\n line4")]
+        [InlineData(" test\n Hello,\n\n world!\n ", " test\nHello,\n\n world!")]
+        [InlineData(" \n Hello,\r\n\n world!\n ", "Hello,\n\n world!")]
+        [InlineData(" \n Hello,\r\n\n wor___ld!\n ", "Hello,\n\n wor___ld!")]
+        [InlineData("\r\n Hello,\r\n World!\r\n\r\n Yours,\r\n GraphQL.\r\n ", "Hello,\n World!\n\nYours,\n GraphQL.")]
+        [InlineData("Test \\n escaping", "Test \\n escaping")]
+        [InlineData("Test \\u1234 escaping", "Test \\u1234 escaping")]
+        [InlineData("Test \\ escaping", "Test \\ escaping")]
+        public void Lex_BlockString_Tests(string input, string expected)
+        {
+            input = input.Replace("___", new string('_', 9000));
+            expected = expected.Replace("___", new string('_', 9000));
+            input = "\"\"\"" + input + "\"\"\"";
+            var actual = input.Lex();
+            actual.Kind.ShouldBe(TokenKind.STRING);
+            actual.Value.ToString().ShouldBe(expected);
+        }
+
         private static Token GetATPunctuationTokenLexer()
         {
             return "@".Lex();
diff --git a/src/GraphQLParser.Tests/Validation/LexerValidationTests.cs b/src/GraphQLParser.Tests/Validation/LexerValidationTests.cs
index f42cba49..98ea11a2 100644
--- a/src/GraphQLParser.Tests/Validation/LexerValidationTests.cs
+++ b/src/GraphQLParser.Tests/Validation/LexerValidationTests.cs
@@ -364,6 +364,20 @@ public void Lex_UnescapedControlChar_ThrowsExceptionWithCorrectMessage()
             exception.Column.ShouldBe(21);
         }
 
+        [Fact]
+        public void Lex_UnescapedControlChar_Blockstring_ThrowsExceptionWithCorrectMessage()
+        {
+            var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"contains unescaped \u0007 control char".Lex());
+
+            exception.Message.ShouldBe(
+                "Syntax Error GraphQL (1:23) Invalid character within BlockString: \\u0007.\n" +
+                "1: \"\"\"contains unescaped \\u0007 control char\n" +
+                " ^\n");
+            exception.Description.ShouldBe("Invalid character within BlockString: \\u0007.");
+            exception.Line.ShouldBe(1);
+            exception.Column.ShouldBe(23);
+        }
+
         [Fact]
         public void Lex_UnterminatedString_ThrowsExceptionWithCorrectMessage()
         {
@@ -391,5 +405,33 @@ public void Lex_UnterminatedStringWithText_ThrowsExceptionWithCorrectMessage()
             exception.Line.ShouldBe(1);
             exception.Column.ShouldBe(14);
         }
+
+        [Fact]
+        public void Lex_UnterminatedBlockString_ThrowsExceptionWithCorrectMessage()
+        {
+            var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"".Lex());
+
+            exception.Message.ShouldBe(
+                "Syntax Error GraphQL (1:4) Unterminated string.\n" +
+                "1: \"\"\"\n" +
+                " ^\n");
+            exception.Description.ShouldBe("Unterminated string.");
+            exception.Line.ShouldBe(1);
+            exception.Column.ShouldBe(4);
+        }
+
+        [Fact]
+        public void Lex_UnterminatedBlockStringWithText_ThrowsExceptionWithCorrectMessage()
+        {
+            var exception = Should.Throw<GraphQLSyntaxErrorException>(() => "\"\"\"no end triple-quote\"\"".Lex());
+
+            exception.Message.ShouldBe(
+                "Syntax Error GraphQL (1:25) Unterminated string.\n" +
+                "1: \"\"\"no end triple-quote\"\"\n" +
+                " ^\n");
+            exception.Description.ShouldBe("Unterminated string.");
+            exception.Line.ShouldBe(1);
+            exception.Column.ShouldBe(25);
+        }
     }
 }
diff --git a/src/GraphQLParser/LexerContext.cs b/src/GraphQLParser/LexerContext.cs
index 837c38cf..7e42a343 100644
--- a/src/GraphQLParser/LexerContext.cs
+++ b/src/GraphQLParser/LexerContext.cs
@@ -47,7 +47,16 @@ public Token GetToken()
                 return ReadNumber();
 
             if (code == '"')
-                return ReadString();
+            {
+                if (_currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"')
+                {
+                    return ReadBlockString();
+                }
+                else
+                {
+                    return ReadString();
+                }
+            }
 
             return Throw_From_GetToken2(code);
         }
@@ -172,6 +181,217 @@ private Token ReadComment()
             );
         }
 
+        /// <summary>
+        /// Reads a block string (text wrapped in triple double-quotes) and returns it as a
+        /// <see cref="TokenKind.STRING"/> token; <c>_currentIndex</c> must be at the first opening quote.
+        /// </summary>
+        /// <remarks>
+        /// Line terminators (LF, CR, CRLF) are normalized to LF and \""" is unescaped to """; no other
+        /// escape sequence is recognized. Common indentation and blank leading/trailing lines are removed.
+        /// </remarks>
+        private Token ReadBlockString()
+        {
+            //the token starts at the first quote of the opening """
+            int start = _currentIndex;
+            _currentIndex += 2;
+            char code = NextCode();
+
+            Span<char> buffer = stackalloc char[4096];
+            StringBuilder? sb = null;
+
+            int index = 0;
+            bool escape = false; //when the last character was \
+            bool lastWasCr = false;
+
+            while (_currentIndex < _source.Length)
+            {
+                if (code < 0x0020 && code != 0x0009 && code != 0x000A && code != 0x000D)
+                {
+                    Throw_From_ReadBlockString1(code);
+                }
+
+                //check for """
+                if (code == '"' && _currentIndex + 2 < _source.Length && _source.Span[_currentIndex + 1] == '"' && _source.Span[_currentIndex + 2] == '"')
+                {
+                    if (!escape)
+                    {
+                        //end of blockstring
+                        break;
+                    }
+
+                    //last character was \ so this is an escaped """: skip the \ and consume all three
+                    //quotes now, otherwise the remaining two could pair up with a following quote and be
+                    //mistaken for the closing delimiter (e.g. in \"""""")
+                    escape = false;
+                    AppendChar('"', buffer, ref index, ref sb);
+                    AppendChar('"', buffer, ref index, ref sb);
+                    _currentIndex += 2;
+                    //the third quote is still in code and is written below
+                }
+                else if (escape)
+                {
+                    //last character was \ so write the \ and then retry this character with escaped = false
+                    code = '\\';
+                    _currentIndex--;
+                    escape = false;
+                }
+                else if (code == '\\')
+                {
+                    //this character is a \ so don't write anything yet, but check the next character
+                    escape = true;
+                    code = NextCode();
+                    lastWasCr = false;
+                    continue;
+                }
+
+                //the LF of a CRLF pair is skipped because the CR was already written as LF
+                if (!(lastWasCr && code == '\n'))
+                {
+                    AppendChar(code == '\r' ? '\n' : code, buffer, ref index, ref sb);
+                }
+
+                lastWasCr = code == '\r';
+
+                code = NextCode();
+            }
+
+            if (_currentIndex >= _source.Length)
+            {
+                Throw_From_ReadString2();
+            }
+            //skip to the last quote of the closing """
+            _currentIndex += 2;
+
+            //all line termination combinations have been replaced with LF at this point
+            ROM value;
+            if (sb != null)
+            {
+                //the buffer overflowed: flush the remaining characters so that sb has the whole string
+                for (int i = 0; i < index; ++i)
+                    sb.Append(buffer[i]);
+
+                var chars = new char[sb.Length];
+                sb.CopyTo(0, chars, 0, sb.Length);
+                value = ProcessBuffer(chars);
+            }
+            else
+            {
+                //buffer (of length index) has the whole string
+                value = ProcessBuffer(buffer.Slice(0, index));
+            }
+
+            return new Token
+            (
+                TokenKind.STRING,
+                value,
+                start,
+                _currentIndex + 1
+            );
+
+            //appends c to buffer; once buffer is full its content is moved to sb (created on demand)
+            static void AppendChar(char c, Span<char> buffer, ref int index, ref StringBuilder? sb)
+            {
+                if (index < buffer.Length)
+                {
+                    buffer[index++] = c;
+                    return;
+                }
+
+                // fallback to StringBuilder in case of buffer overflow
+                if (sb == null)
+                    sb = new StringBuilder(buffer.Length * 2);
+
+                for (int i = 0; i < buffer.Length; ++i)
+                    sb.Append(buffer[i]);
+
+                sb.Append(c);
+                index = 0;
+            }
+
+            //removes common indentation and blank initial/trailing lines from a LF-normalized string
+            static ROM ProcessBuffer(Span<char> buffer)
+            {
+                //scan string to determine maximum valid commonIndent value,
+                //number of initial blank lines, and number of trailing blank lines
+                int commonIndent = int.MaxValue;
+                int initialBlankLines = 1;
+                int skipLinesAfter; //skip all text after line ###, as determined by the number of trailing blank lines
+                {
+                    int trailingBlankLines = 0;
+                    int line = 0;
+                    int whitespace = 0;
+                    bool allWhitespace = true;
+                    bool reachedCharacter = false;
+                    for (int index = 0; index < buffer.Length; index++)
+                    {
+                        char code = buffer[index];
+                        if (code == '\n')
+                        {
+                            if (allWhitespace)
+                                trailingBlankLines += 1;
+                            if (line != 0 && !allWhitespace && whitespace < commonIndent)
+                                commonIndent = whitespace;
+                            line++;
+                            whitespace = 0;
+                            allWhitespace = true;
+                            if (!reachedCharacter)
+                                initialBlankLines++;
+                        }
+                        else if (code == ' ' || code == '\t')
+                        {
+                            if (allWhitespace)
+                                whitespace++;
+                        }
+                        else
+                        {
+                            allWhitespace = false;
+                            if (!reachedCharacter)
+                                initialBlankLines--;
+                            reachedCharacter = true;
+                            trailingBlankLines = 0;
+                        }
+                    }
+                    if (allWhitespace)
+                        trailingBlankLines += 1;
+                    if (line != 0 && !allWhitespace && whitespace < commonIndent)
+                        commonIndent = whitespace;
+                    if (commonIndent == int.MaxValue)
+                        commonIndent = 0;
+                    int lines = line + 1;
+                    skipLinesAfter = lines - trailingBlankLines;
+                }
+
+                //step through the input, skipping the initial blank lines and the trailing blank lines,
+                //and skipping the initial blank characters from the start of each line
+                Span<char> output = buffer.Length <= 4096 ? stackalloc char[buffer.Length] : new char[buffer.Length];
+                int outputIndex = 0;
+                {
+                    int line = 0;
+                    int col = 0;
+                    for (int index = 0; index < buffer.Length; index++)
+                    {
+                        char code = buffer[index];
+                        if (code == '\n')
+                        {
+                            if (++line >= skipLinesAfter)
+                                break;
+                            col = 0;
+                            if (line > initialBlankLines)
+                                output[outputIndex++] = code;
+                        }
+                        else
+                        {
+                            if (line >= initialBlankLines && (line == 0 || col++ >= commonIndent))
+                                output[outputIndex++] = code;
+                        }
+                    }
+                }
+
+                //return the string value from the output buffer
+                return output.Slice(0, outputIndex).ToString();
+            }
+        }
+
         private Token ReadString()
         {
             int start = _currentIndex;
@@ -245,6 +465,15 @@ private void Throw_From_ReadString2()
             throw new GraphQLSyntaxErrorException("Unterminated string.", _source, _currentIndex);
         }
 
+        /// <summary>
+        /// Throws a syntax error reporting a character that is not allowed within a block string.
+        /// </summary>
+        private void Throw_From_ReadBlockString1(char code)
+        {
+            //\u escapes are hexadecimal, so format the code as four hex digits
+            throw new GraphQLSyntaxErrorException($"Invalid character within BlockString: \\u{(int)code:X4}.", _source, _currentIndex);
+        }
+
         // sets escaped only to true
         private char ReadCharacterFromString(char currentCharacter, ref bool escaped)
         {
diff --git a/src/GraphQLParser/TokenKind.cs b/src/GraphQLParser/TokenKind.cs
index 9c66b8ab..2f5951d1 100644
--- a/src/GraphQLParser/TokenKind.cs
+++ b/src/GraphQLParser/TokenKind.cs
@@ -93,9 +93,21 @@ public enum TokenKind
         FLOAT = 17,
 
         /// <summary>
+        /// A string value, encoded as either a 'string' or 'block string'.
+        /// <br/><br/>
         /// Strings are sequences of characters wrapped in double‐quotes ("). (ex. "Hello World").
         /// White space and other otherwise‐ignored characters are significant within a string value.
-        /// </summary>
+        /// <br/><br/>
+        /// Block strings are sequences of characters wrapped in triple‐quotes ("""). White space, line terminators,
+        /// quote, and backslash characters may all be used unescaped to enable verbatim text.
+        /// Since block strings represent freeform text often used in indented positions, the string value semantics
+        /// of a block string excludes uniform indentation and blank initial and trailing lines.
+        /// Triple-quotes (""") may be escaped as \""" within the block string. No other escape sequences may be used
+        /// within a block string.
+        /// </summary>
+        /// <remarks>
+        /// Within a block string, line termination sequences (LF, CR, or CRLF) are always replaced with a line-feed (LF) character.
+        /// </remarks>
         STRING = 18,
 
         /// <summary>
@@ -115,6 +127,6 @@
         /// <summary>
         /// &amp;
         /// </summary>
-        AMPERSAND = 21
+        AMPERSAND = 21,
     }
 }