Skip to content

Commit 370bee4

Browse files
committed
[clang-format] Fix whitespace counting stuff
The current way of counting whitespace would count backticks as whitespace. For Verilog stuff we need backticks to be handled correctly. For JavaScript the current way is to compare the entire token text to see if it's a backtick. However, when the backtick is the first token following an escaped newline, the escaped newline will be part of the tok::unknown token. Verilog has macros and escaped newlines unlike JavaScript. So we can't regard an entire tok::unknown token as whitespace. Previously, the start of every token would be matched for newlines. Now, it is all whitespace instead of just newlines. The column counting problem has already been fixed for JavaScript in e71b4cb by counting columns elsewhere. Reviewed By: HazardyKnusperkeks Differential Revision: https://reviews.llvm.org/D124748
1 parent b2cb7e8 commit 370bee4

File tree

2 files changed

+92
-62
lines changed

2 files changed

+92
-62
lines changed

clang/lib/Format/FormatTokenLexer.cpp

Lines changed: 90 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,56 @@ FormatToken *FormatTokenLexer::getStashedToken() {
840840
return FormatTok;
841841
}
842842

843+
/// Truncate the current token to the new length and make the lexer continue
844+
/// from the end of the truncated token. Used for other languages that have
845+
/// different token boundaries, like JavaScript in which a comment ends at a
846+
/// line break regardless of whether the line break follows a backslash. Also
847+
/// used to set the lexer to the end of whitespace if the lexer regards
848+
/// whitespace and an unrecognized symbol as one token.
849+
void FormatTokenLexer::truncateToken(size_t NewLen) {
850+
assert(NewLen <= FormatTok->TokenText.size());
851+
resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
852+
Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
853+
FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
854+
FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
855+
FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
856+
Encoding);
857+
FormatTok->Tok.setLength(NewLen);
858+
}
859+
860+
/// Count the length of leading whitespace in a token.
861+
static size_t countLeadingWhitespace(StringRef Text) {
862+
// Basically counting the length matched by this regex.
863+
// "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
864+
// Directly using the regex turned out to be slow. With the regex
865+
// version formatting all files in this directory took about 1.25
866+
// seconds. This version took about 0.5 seconds.
867+
const char *Cur = Text.begin();
868+
while (Cur < Text.end()) {
869+
if (isspace(Cur[0])) {
870+
++Cur;
871+
} else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
872+
// A '\' followed by a newline always escapes the newline, regardless
873+
// of whether there is another '\' before it.
874+
// The source has a null byte at the end. So the end of the entire input
875+
// isn't reached yet. Also the lexer doesn't break apart an escaped
876+
// newline.
877+
assert(Text.end() - Cur >= 2);
878+
Cur += 2;
879+
} else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
880+
(Cur[3] == '\n' || Cur[3] == '\r')) {
881+
// Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
882+
// characters are quoted individually in this comment because if we write
883+
// them together some compilers warn that we have a trigraph in the code.
884+
assert(Text.end() - Cur >= 4);
885+
Cur += 4;
886+
} else {
887+
break;
888+
}
889+
}
890+
return Cur - Text.begin();
891+
}
892+
843893
FormatToken *FormatTokenLexer::getNextToken() {
844894
if (StateStack.top() == LexerState::TOKEN_STASHED) {
845895
StateStack.pop();
@@ -854,34 +904,33 @@ FormatToken *FormatTokenLexer::getNextToken() {
854904
IsFirstToken = false;
855905

856906
// Consume and record whitespace until we find a significant token.
907+
// Some tok::unknown tokens are not just whitespace, e.g. whitespace
908+
// followed by a symbol such as backtick. Those symbols may be
909+
// significant in other languages.
857910
unsigned WhitespaceLength = TrailingWhitespace;
858-
while (FormatTok->is(tok::unknown)) {
911+
while (FormatTok->isNot(tok::eof)) {
912+
auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
913+
if (LeadingWhitespace == 0)
914+
break;
915+
if (LeadingWhitespace < FormatTok->TokenText.size())
916+
truncateToken(LeadingWhitespace);
859917
StringRef Text = FormatTok->TokenText;
860-
auto EscapesNewline = [&](int pos) {
861-
// A '\r' here is just part of '\r\n'. Skip it.
862-
if (pos >= 0 && Text[pos] == '\r')
863-
--pos;
864-
// See whether there is an odd number of '\' before this.
865-
// FIXME: This is wrong. A '\' followed by a newline is always removed,
866-
// regardless of whether there is another '\' before it.
867-
// FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
868-
unsigned count = 0;
869-
for (; pos >= 0; --pos, ++count)
870-
if (Text[pos] != '\\')
871-
break;
872-
return count & 1;
873-
};
874-
// FIXME: This miscounts tok:unknown tokens that are not just
875-
// whitespace, e.g. a '`' character.
918+
bool InEscape = false;
876919
for (int i = 0, e = Text.size(); i != e; ++i) {
877920
switch (Text[i]) {
921+
case '\r':
922+
// If this is a CRLF sequence, break here and the LF will be handled on
923+
// the next loop iteration. Otherwise, this is a single Mac CR, treat it
924+
// the same as a single LF.
925+
if (i + 1 < e && Text[i + 1] == '\n')
926+
break;
927+
LLVM_FALLTHROUGH;
878928
case '\n':
879929
++FormatTok->NewlinesBefore;
880-
FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
881-
FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
882-
Column = 0;
883-
break;
884-
case '\r':
930+
if (!InEscape)
931+
FormatTok->HasUnescapedNewline = true;
932+
else
933+
InEscape = false;
885934
FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
886935
Column = 0;
887936
break;
@@ -897,24 +946,32 @@ FormatToken *FormatTokenLexer::getNextToken() {
897946
Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
898947
break;
899948
case '\\':
900-
if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
901-
FormatTok->setType(TT_ImplicitStringLiteral);
949+
case '?':
950+
case '/':
951+
// The text was entirely whitespace when this loop was entered. Thus
952+
// this has to be an escape sequence.
953+
assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
954+
Text.substr(i, 4) == "\?\?/\r" ||
955+
Text.substr(i, 4) == "\?\?/\n" ||
956+
(i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
957+
Text.substr(i - 1, 4) == "\?\?/\n")) ||
958+
(i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
959+
Text.substr(i - 2, 4) == "\?\?/\n")));
960+
InEscape = true;
902961
break;
903962
default:
904-
FormatTok->setType(TT_ImplicitStringLiteral);
963+
// This shouldn't happen.
964+
assert(false);
905965
break;
906966
}
907-
if (FormatTok->getType() == TT_ImplicitStringLiteral)
908-
break;
909967
}
910-
911-
if (FormatTok->is(TT_ImplicitStringLiteral))
912-
break;
913-
WhitespaceLength += FormatTok->Tok.getLength();
914-
968+
WhitespaceLength += Text.size();
915969
readRawToken(*FormatTok);
916970
}
917971

972+
if (FormatTok->is(tok::unknown))
973+
FormatTok->setType(TT_ImplicitStringLiteral);
974+
918975
// JavaScript and Java do not allow to escape the end of the line with a
919976
// backslash. Backslashes are syntax errors in plain source, but can occur in
920977
// comments. When a single line comment ends with a \, it'll cause the next
@@ -928,42 +985,13 @@ FormatToken *FormatTokenLexer::getNextToken() {
928985
while (BackslashPos != StringRef::npos) {
929986
if (BackslashPos + 1 < FormatTok->TokenText.size() &&
930987
FormatTok->TokenText[BackslashPos + 1] == '\n') {
931-
const char *Offset = Lex->getBufferLocation();
932-
Offset -= FormatTok->TokenText.size();
933-
Offset += BackslashPos + 1;
934-
resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
935-
FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
936-
FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
937-
FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
938-
Encoding);
988+
truncateToken(BackslashPos + 1);
939989
break;
940990
}
941991
BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
942992
}
943993
}
944994

945-
// In case the token starts with escaped newlines, we want to
946-
// take them into account as whitespace - this pattern is quite frequent
947-
// in macro definitions.
948-
// FIXME: Add a more explicit test.
949-
while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
950-
unsigned SkippedWhitespace = 0;
951-
if (FormatTok->TokenText.size() > 2 &&
952-
(FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n')) {
953-
SkippedWhitespace = 3;
954-
} else if (FormatTok->TokenText[1] == '\n') {
955-
SkippedWhitespace = 2;
956-
} else {
957-
break;
958-
}
959-
960-
++FormatTok->NewlinesBefore;
961-
WhitespaceLength += SkippedWhitespace;
962-
FormatTok->LastNewlineOffset = SkippedWhitespace;
963-
Column = 0;
964-
FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
965-
}
966-
967995
FormatTok->WhitespaceRange = SourceRange(
968996
WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
969997

clang/lib/Format/FormatTokenLexer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ class FormatTokenLexer {
9292

9393
bool tryMergeConflictMarkers();
9494

95+
void truncateToken(size_t NewLen);
96+
9597
FormatToken *getStashedToken();
9698

9799
FormatToken *getNextToken();

0 commit comments

Comments
 (0)