@@ -840,6 +840,56 @@ FormatToken *FormatTokenLexer::getStashedToken() {
840
840
return FormatTok;
841
841
}
842
842
843
+ // / Truncate the current token to the new length and make the lexer continue
844
+ // / from the end of the truncated token. Used for other languages that have
845
+ // / different token boundaries, like JavaScript in which a comment ends at a
846
+ // / line break regardless of whether the line break follows a backslash. Also
847
+ // / used to set the lexer to the end of whitespace if the lexer regards
848
+ // / whitespace and an unrecognized symbol as one token.
849
+ void FormatTokenLexer::truncateToken (size_t NewLen) {
850
+ assert (NewLen <= FormatTok->TokenText .size ());
851
+ resetLexer (SourceMgr.getFileOffset (Lex->getSourceLocation (
852
+ Lex->getBufferLocation () - FormatTok->TokenText .size () + NewLen)));
853
+ FormatTok->TokenText = FormatTok->TokenText .substr (0 , NewLen);
854
+ FormatTok->ColumnWidth = encoding::columnWidthWithTabs (
855
+ FormatTok->TokenText , FormatTok->OriginalColumn , Style .TabWidth ,
856
+ Encoding);
857
+ FormatTok->Tok .setLength (NewLen);
858
+ }
859
+
860
+ // / Count the length of leading whitespace in a token.
861
+ static size_t countLeadingWhitespace (StringRef Text) {
862
+ // Basically counting the length matched by this regex.
863
+ // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
864
+ // Directly using the regex turned out to be slow. With the regex
865
+ // version formatting all files in this directory took about 1.25
866
+ // seconds. This version took about 0.5 seconds.
867
+ const char *Cur = Text.begin ();
868
+ while (Cur < Text.end ()) {
869
+ if (isspace (Cur[0 ])) {
870
+ ++Cur;
871
+ } else if (Cur[0 ] == ' \\ ' && (Cur[1 ] == ' \n ' || Cur[1 ] == ' \r ' )) {
872
+ // A '\' followed by a newline always escapes the newline, regardless
873
+ // of whether there is another '\' before it.
874
+ // The source has a null byte at the end. So the end of the entire input
875
+ // isn't reached yet. Also the lexer doesn't break apart an escaped
876
+ // newline.
877
+ assert (Text.end () - Cur >= 2 );
878
+ Cur += 2 ;
879
+ } else if (Cur[0 ] == ' ?' && Cur[1 ] == ' ?' && Cur[2 ] == ' /' &&
880
+ (Cur[3 ] == ' \n ' || Cur[3 ] == ' \r ' )) {
881
+ // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
882
+ // characters are quoted individually in this comment because if we write
883
+ // them together some compilers warn that we have a trigraph in the code.
884
+ assert (Text.end () - Cur >= 4 );
885
+ Cur += 4 ;
886
+ } else {
887
+ break ;
888
+ }
889
+ }
890
+ return Cur - Text.begin ();
891
+ }
892
+
843
893
FormatToken *FormatTokenLexer::getNextToken () {
844
894
if (StateStack.top () == LexerState::TOKEN_STASHED) {
845
895
StateStack.pop ();
@@ -854,34 +904,33 @@ FormatToken *FormatTokenLexer::getNextToken() {
854
904
IsFirstToken = false ;
855
905
856
906
// Consume and record whitespace until we find a significant token.
907
+ // Some tok::unknown tokens are not just whitespace, e.g. whitespace
908
+ // followed by a symbol such as backtick. Those symbols may be
909
+ // significant in other languages.
857
910
unsigned WhitespaceLength = TrailingWhitespace;
858
- while (FormatTok->is (tok::unknown)) {
911
+ while (FormatTok->isNot (tok::eof)) {
912
+ auto LeadingWhitespace = countLeadingWhitespace (FormatTok->TokenText );
913
+ if (LeadingWhitespace == 0 )
914
+ break ;
915
+ if (LeadingWhitespace < FormatTok->TokenText .size ())
916
+ truncateToken (LeadingWhitespace);
859
917
StringRef Text = FormatTok->TokenText ;
860
- auto EscapesNewline = [&](int pos) {
861
- // A '\r' here is just part of '\r\n'. Skip it.
862
- if (pos >= 0 && Text[pos] == ' \r ' )
863
- --pos;
864
- // See whether there is an odd number of '\' before this.
865
- // FIXME: This is wrong. A '\' followed by a newline is always removed,
866
- // regardless of whether there is another '\' before it.
867
- // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
868
- unsigned count = 0 ;
869
- for (; pos >= 0 ; --pos, ++count)
870
- if (Text[pos] != ' \\ ' )
871
- break ;
872
- return count & 1 ;
873
- };
874
- // FIXME: This miscounts tok:unknown tokens that are not just
875
- // whitespace, e.g. a '`' character.
918
+ bool InEscape = false ;
876
919
for (int i = 0 , e = Text.size (); i != e; ++i) {
877
920
switch (Text[i]) {
921
+ case ' \r ' :
922
+ // If this is a CRLF sequence, break here and the LF will be handled on
923
+ // the next loop iteration. Otherwise, this is a single Mac CR, treat it
924
+ // the same as a single LF.
925
+ if (i + 1 < e && Text[i + 1 ] == ' \n ' )
926
+ break ;
927
+ LLVM_FALLTHROUGH;
878
928
case ' \n ' :
879
929
++FormatTok->NewlinesBefore ;
880
- FormatTok->HasUnescapedNewline = !EscapesNewline (i - 1 );
881
- FormatTok->LastNewlineOffset = WhitespaceLength + i + 1 ;
882
- Column = 0 ;
883
- break ;
884
- case ' \r ' :
930
+ if (!InEscape)
931
+ FormatTok->HasUnescapedNewline = true ;
932
+ else
933
+ InEscape = false ;
885
934
FormatTok->LastNewlineOffset = WhitespaceLength + i + 1 ;
886
935
Column = 0 ;
887
936
break ;
@@ -897,24 +946,32 @@ FormatToken *FormatTokenLexer::getNextToken() {
897
946
Style .TabWidth - (Style .TabWidth ? Column % Style .TabWidth : 0 );
898
947
break ;
899
948
case ' \\ ' :
900
- if (i + 1 == e || (Text[i + 1 ] != ' \r ' && Text[i + 1 ] != ' \n ' ))
901
- FormatTok->setType (TT_ImplicitStringLiteral);
949
+ case ' ?' :
950
+ case ' /' :
951
+ // The text was entirely whitespace when this loop was entered. Thus
952
+ // this has to be an escape sequence.
953
+ assert (Text.substr (i, 2 ) == " \\\r " || Text.substr (i, 2 ) == " \\\n " ||
954
+ Text.substr (i, 4 ) == " \?\? /\r " ||
955
+ Text.substr (i, 4 ) == " \?\? /\n " ||
956
+ (i >= 1 && (Text.substr (i - 1 , 4 ) == " \?\? /\r " ||
957
+ Text.substr (i - 1 , 4 ) == " \?\? /\n " )) ||
958
+ (i >= 2 && (Text.substr (i - 2 , 4 ) == " \?\? /\r " ||
959
+ Text.substr (i - 2 , 4 ) == " \?\? /\n " )));
960
+ InEscape = true ;
902
961
break ;
903
962
default :
904
- FormatTok->setType (TT_ImplicitStringLiteral);
963
+ // This shouldn't happen.
964
+ assert (false );
905
965
break ;
906
966
}
907
- if (FormatTok->getType () == TT_ImplicitStringLiteral)
908
- break ;
909
967
}
910
-
911
- if (FormatTok->is (TT_ImplicitStringLiteral))
912
- break ;
913
- WhitespaceLength += FormatTok->Tok .getLength ();
914
-
968
+ WhitespaceLength += Text.size ();
915
969
readRawToken (*FormatTok);
916
970
}
917
971
972
+ if (FormatTok->is (tok::unknown))
973
+ FormatTok->setType (TT_ImplicitStringLiteral);
974
+
918
975
// JavaScript and Java do not allow to escape the end of the line with a
919
976
// backslash. Backslashes are syntax errors in plain source, but can occur in
920
977
// comments. When a single line comment ends with a \, it'll cause the next
@@ -928,42 +985,13 @@ FormatToken *FormatTokenLexer::getNextToken() {
928
985
while (BackslashPos != StringRef::npos) {
929
986
if (BackslashPos + 1 < FormatTok->TokenText .size () &&
930
987
FormatTok->TokenText [BackslashPos + 1 ] == ' \n ' ) {
931
- const char *Offset = Lex->getBufferLocation ();
932
- Offset -= FormatTok->TokenText .size ();
933
- Offset += BackslashPos + 1 ;
934
- resetLexer (SourceMgr.getFileOffset (Lex->getSourceLocation (Offset)));
935
- FormatTok->TokenText = FormatTok->TokenText .substr (0 , BackslashPos + 1 );
936
- FormatTok->ColumnWidth = encoding::columnWidthWithTabs (
937
- FormatTok->TokenText , FormatTok->OriginalColumn , Style .TabWidth ,
938
- Encoding);
988
+ truncateToken (BackslashPos + 1 );
939
989
break ;
940
990
}
941
991
BackslashPos = FormatTok->TokenText .find (' \\ ' , BackslashPos + 1 );
942
992
}
943
993
}
944
994
945
- // In case the token starts with escaped newlines, we want to
946
- // take them into account as whitespace - this pattern is quite frequent
947
- // in macro definitions.
948
- // FIXME: Add a more explicit test.
949
- while (FormatTok->TokenText .size () > 1 && FormatTok->TokenText [0 ] == ' \\ ' ) {
950
- unsigned SkippedWhitespace = 0 ;
951
- if (FormatTok->TokenText .size () > 2 &&
952
- (FormatTok->TokenText [1 ] == ' \r ' && FormatTok->TokenText [2 ] == ' \n ' )) {
953
- SkippedWhitespace = 3 ;
954
- } else if (FormatTok->TokenText [1 ] == ' \n ' ) {
955
- SkippedWhitespace = 2 ;
956
- } else {
957
- break ;
958
- }
959
-
960
- ++FormatTok->NewlinesBefore ;
961
- WhitespaceLength += SkippedWhitespace;
962
- FormatTok->LastNewlineOffset = SkippedWhitespace;
963
- Column = 0 ;
964
- FormatTok->TokenText = FormatTok->TokenText .substr (SkippedWhitespace);
965
- }
966
-
967
995
FormatTok->WhitespaceRange = SourceRange (
968
996
WhitespaceStart, WhitespaceStart.getLocWithOffset (WhitespaceLength));
969
997
0 commit comments