-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[ELF] ScriptLexer: generate tokens lazily #100493
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ELF] ScriptLexer: generate tokens lazily #100493
Conversation
Created using spr 1.3.5-bogner
@llvm/pr-subscribers-lld-elf @llvm/pr-subscribers-lld Author: Fangrui Song (MaskRay) Changes: The current tokenize-whole-file approach has a few limitations.
The patch switches to a traditional lexer that generates tokens lazily.
Patch is 21.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100493.diff 7 Files Affected:
diff --git a/lld/ELF/ScriptLexer.cpp b/lld/ELF/ScriptLexer.cpp
index 65055086d6bc2..70c7169973cb6 100644
--- a/lld/ELF/ScriptLexer.cpp
+++ b/lld/ELF/ScriptLexer.cpp
@@ -44,9 +44,8 @@ using namespace lld::elf;
// Returns a whole line containing the current token.
StringRef ScriptLexer::getLine() {
StringRef s = getCurrentMB().getBuffer();
- StringRef tok = tokens[pos - 1];
- size_t pos = s.rfind('\n', tok.data() - s.data());
+ size_t pos = s.rfind('\n', prevTok.data() - s.data());
if (pos != StringRef::npos)
s = s.substr(pos + 1);
return s.substr(0, s.find_first_of("\r\n"));
@@ -54,11 +53,10 @@ StringRef ScriptLexer::getLine() {
// Returns 1-based line number of the current token.
size_t ScriptLexer::getLineNumber() {
- if (pos == 0)
+ if (prevTok.empty())
return 1;
StringRef s = getCurrentMB().getBuffer();
- StringRef tok = tokens[pos - 1];
- const size_t tokOffset = tok.data() - s.data();
+ const size_t tokOffset = prevTok.data() - s.data();
// For the first token, or when going backwards, start from the beginning of
// the buffer. If this token is after the previous token, start from the
@@ -81,8 +79,7 @@ size_t ScriptLexer::getLineNumber() {
// Returns 0-based column number of the current token.
size_t ScriptLexer::getColumnNumber() {
- StringRef tok = tokens[pos - 1];
- return tok.data() - getLine().data();
+ return prevTok.data() - getLine().data();
}
std::string ScriptLexer::getCurrentLocation() {
@@ -90,7 +87,11 @@ std::string ScriptLexer::getCurrentLocation() {
return (filename + ":" + Twine(getLineNumber())).str();
}
-ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
+ScriptLexer::ScriptLexer(MemoryBufferRef mb) {
+ cur.s = mb.getBuffer();
+ cur.filename = mb.getBufferIdentifier();
+ mbs.push_back(mb);
+}
// We don't want to record cascading errors. Keep only the first one.
void ScriptLexer::setError(const Twine &msg) {
@@ -98,70 +99,86 @@ void ScriptLexer::setError(const Twine &msg) {
return;
std::string s = (getCurrentLocation() + ": " + msg).str();
- if (pos)
+ if (prevTok.size())
s += "\n>>> " + getLine().str() + "\n>>> " +
std::string(getColumnNumber(), ' ') + "^";
error(s);
}
-// Split S into linker script tokens.
-void ScriptLexer::tokenize(MemoryBufferRef mb) {
+void ScriptLexer::lexToken() {
std::vector<StringRef> vec;
- mbs.push_back(mb);
- StringRef s = mb.getBuffer();
- StringRef begin = s;
+ StringRef begin = cur.s;
for (;;) {
- s = skipSpace(s);
- if (s.empty())
- break;
+ cur.s = skipSpace(cur.s);
+ if (cur.s.empty()) {
+ // If this buffer is from an INCLUDE command, switch to the "return
+ // value"; otherwise, mark EOF.
+ if (buffers.empty()) {
+ eof = true;
+ return;
+ }
+ cur = buffers.pop_back_val();
+ continue;
+ }
+ curTokState = inExpr;
// Quoted token. Note that double-quote characters are parts of a token
// because, in a glob match context, only unquoted tokens are interpreted
// as glob patterns. Double-quoted tokens are literal patterns in that
// context.
- if (s.starts_with("\"")) {
- size_t e = s.find("\"", 1);
+ if (cur.s.starts_with("\"")) {
+ size_t e = cur.s.find("\"", 1);
if (e == StringRef::npos) {
- StringRef filename = mb.getBufferIdentifier();
- size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
- error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
+ size_t lineno =
+ begin.substr(0, cur.s.data() - begin.data()).count('\n');
+ error(cur.filename + ":" + Twine(lineno + 1) + ": unclosed quote");
return;
}
- vec.push_back(s.take_front(e + 1));
- s = s.substr(e + 1);
- continue;
+ curTok = cur.s.take_front(e + 1);
+ cur.s = cur.s.substr(e + 1);
+ return;
}
// Some operators form separate tokens.
- if (s.starts_with("<<=") || s.starts_with(">>=")) {
- vec.push_back(s.substr(0, 3));
- s = s.substr(3);
- continue;
+ if (cur.s.starts_with("<<=") || cur.s.starts_with(">>=")) {
+ curTok = cur.s.substr(0, 3);
+ cur.s = cur.s.substr(3);
+ return;
}
- if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
- (s[0] == s[1] && strchr("<>&|", s[0])))) {
- vec.push_back(s.substr(0, 2));
- s = s.substr(2);
- continue;
+ if (cur.s.size() > 1 &&
+ ((cur.s[1] == '=' && strchr("*/+-!<>&^|", cur.s[0])) ||
+ (cur.s[0] == cur.s[1] && strchr("<>&|", cur.s[0])))) {
+ curTok = cur.s.substr(0, 2);
+ cur.s = cur.s.substr(2);
+ return;
}
- // Unquoted token. This is more relaxed than tokens in C-like language,
- // so that you can write "file-name.cpp" as one bare token, for example.
- size_t pos = s.find_first_not_of(
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
- "0123456789_.$/\\~=+[]*?-!^:");
+ // Unquoted token. The non-expression token is more relaxed than tokens in
+ // C-like languages, so that you can write "file-name.cpp" as one bare
+ // token.
+ size_t pos;
+ if (inExpr) {
+ pos = cur.s.find_first_not_of(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+ "0123456789_.$");
+ if (pos == 0 && cur.s.size() >= 2 &&
+ is_contained({"==", "!=", "<=", ">=", "<<", ">>"},
+ cur.s.substr(0, 2)))
+ pos = 2;
+ } else {
+ pos = cur.s.find_first_not_of(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+ "0123456789_.$/\\~=+[]*?-!^:");
+ }
- // A character that cannot start a word (which is usually a
- // punctuation) forms a single character token.
if (pos == 0)
pos = 1;
- vec.push_back(s.substr(0, pos));
- s = s.substr(pos);
+ curTok = cur.s.substr(0, pos);
+ cur.s = cur.s.substr(pos);
+ break;
}
-
- tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
}
// Skip leading whitespace characters or comments.
@@ -191,92 +208,31 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
}
// An erroneous token is handled as if it were the last token before EOF.
-bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
-
-// Split a given string as an expression.
-// This function returns "3", "*" and "5" for "3*5" for example.
-static std::vector<StringRef> tokenizeExpr(StringRef s) {
- StringRef ops = "!~*/+-<>?^:="; // List of operators
-
- // Quoted strings are literal strings, so we don't want to split it.
- if (s.starts_with("\""))
- return {s};
-
- // Split S with operators as separators.
- std::vector<StringRef> ret;
- while (!s.empty()) {
- size_t e = s.find_first_of(ops);
-
- // No need to split if there is no operator.
- if (e == StringRef::npos) {
- ret.push_back(s);
- break;
- }
-
- // Get a token before the operator.
- if (e != 0)
- ret.push_back(s.substr(0, e));
-
- // Get the operator as a token.
- // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
- if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
- s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
- s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
- ret.push_back(s.substr(e, 2));
- s = s.substr(e + 2);
- } else {
- ret.push_back(s.substr(e, 1));
- s = s.substr(e + 1);
- }
- }
- return ret;
-}
-
-// In contexts where expressions are expected, the lexer should apply
-// different tokenization rules than the default one. By default,
-// arithmetic operator characters are regular characters, but in the
-// expression context, they should be independent tokens.
-//
-// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
-// in the expression context.
-//
-// This function may split the current token into multiple tokens.
-void ScriptLexer::maybeSplitExpr() {
- if (!inExpr || errorCount() || atEOF())
- return;
-
- std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
- if (v.size() == 1)
- return;
- tokens.erase(tokens.begin() + pos);
- tokens.insert(tokens.begin() + pos, v.begin(), v.end());
-}
+bool ScriptLexer::atEOF() { return eof || errorCount(); }
StringRef ScriptLexer::next() {
- maybeSplitExpr();
-
if (errorCount())
return "";
- if (atEOF()) {
- setError("unexpected EOF");
- return "";
- }
- return tokens[pos++];
+ prevTok = peek();
+ return std::exchange(curTok, StringRef(cur.s.data(), 0));
}
StringRef ScriptLexer::peek() {
- StringRef tok = next();
- if (errorCount())
- return "";
- pos = pos - 1;
- return tok;
+ // curTok is invalid if curTokState and inExpr mismatch.
+ if (curTok.size() && curTokState != inExpr) {
+ cur.s = StringRef(curTok.data(), cur.s.end() - curTok.data());
+ curTok = {};
+ }
+ if (curTok.empty())
+ lexToken();
+ return curTok;
}
bool ScriptLexer::consume(StringRef tok) {
- if (next() == tok)
- return true;
- --pos;
- return false;
+ if (peek() != tok)
+ return false;
+ next();
+ return true;
}
void ScriptLexer::skip() { (void)next(); }
@@ -285,8 +241,12 @@ void ScriptLexer::expect(StringRef expect) {
if (errorCount())
return;
StringRef tok = next();
- if (tok != expect)
- setError(expect + " expected, but got " + tok);
+ if (tok != expect) {
+ if (atEOF())
+ setError("unexpected EOF");
+ else
+ setError(expect + " expected, but got " + tok);
+ }
}
// Returns true if S encloses T.
@@ -297,10 +257,10 @@ static bool encloses(StringRef s, StringRef t) {
MemoryBufferRef ScriptLexer::getCurrentMB() {
// Find input buffer containing the current token.
assert(!mbs.empty());
- if (pos == 0)
+ if (prevTok.empty())
return mbs.back();
for (MemoryBufferRef mb : mbs)
- if (encloses(mb.getBuffer(), tokens[pos - 1]))
+ if (encloses(mb.getBuffer(), cur.s))
return mb;
llvm_unreachable("getCurrentMB: failed to find a token");
}
diff --git a/lld/ELF/ScriptLexer.h b/lld/ELF/ScriptLexer.h
index 7d945d8f570c3..9a92eea947a0f 100644
--- a/lld/ELF/ScriptLexer.h
+++ b/lld/ELF/ScriptLexer.h
@@ -10,6 +10,7 @@
#define LLD_ELF_SCRIPT_LEXER_H
#include "lld/Common/LLVM.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MemoryBufferRef.h"
#include <vector>
@@ -17,11 +18,28 @@
namespace lld::elf {
class ScriptLexer {
+protected:
+ struct Buffer {
+ // The unparsed buffer and the filename.
+ StringRef s, filename;
+ };
+ // The current buffer and parent buffers due to INCLUDE.
+ Buffer cur;
+ SmallVector<Buffer, 0> buffers;
+
+ // The token before the last next().
+ StringRef prevTok;
+ // The cache value of peek(). This is valid if curTokState and inExpr match.
+ StringRef curTok;
+ // The inExpr state when curTok is cached.
+ bool curTokState = false;
+ bool eof = false;
+
public:
explicit ScriptLexer(MemoryBufferRef mb);
void setError(const Twine &msg);
- void tokenize(MemoryBufferRef mb);
+ void lexToken();
StringRef skipSpace(StringRef s);
bool atEOF();
StringRef next();
@@ -33,9 +51,7 @@ class ScriptLexer {
MemoryBufferRef getCurrentMB();
std::vector<MemoryBufferRef> mbs;
- std::vector<StringRef> tokens;
bool inExpr = false;
- size_t pos = 0;
size_t lastLineNumber = 0;
size_t lastLineNumberOffset = 0;
diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp
index 8637a8b0b2167..8420a3c11aa72 100644
--- a/lld/ELF/ScriptParser.cpp
+++ b/lld/ELF/ScriptParser.cpp
@@ -200,7 +200,7 @@ void ScriptParser::readDynamicList() {
std::tie(locals, globals) = readSymbols();
expect(";");
- if (!atEOF()) {
+ if (peek() != "") {
setError("EOF expected, but got " + next());
return;
}
@@ -215,7 +215,7 @@ void ScriptParser::readDynamicList() {
void ScriptParser::readVersionScript() {
readVersionScriptCommand();
- if (!atEOF())
+ if (peek().size())
setError("EOF expected, but got " + next());
}
@@ -225,7 +225,9 @@ void ScriptParser::readVersionScriptCommand() {
return;
}
- while (!atEOF() && !errorCount() && peek() != "}") {
+ if (atEOF())
+ setError("unexpected EOF");
+ while (peek() != "}" && !atEOF()) {
StringRef verStr = next();
if (verStr == "{") {
setError("anonymous version definition is used in "
@@ -246,6 +248,8 @@ void ScriptParser::readVersion() {
void ScriptParser::readLinkerScript() {
while (!atEOF()) {
StringRef tok = next();
+ if (atEOF())
+ break;
if (tok == ";")
continue;
@@ -307,8 +311,9 @@ void ScriptParser::readDefsym(StringRef name) {
void ScriptParser::readNoCrossRefs(bool to) {
expect("(");
NoCrossRefCommand cmd{{}, to};
- while (!errorCount() && !consume(")"))
+ while (peek() != ")" && !atEOF())
cmd.outputSections.push_back(unquote(next()));
+ expect(")");
if (cmd.outputSections.size() < 2)
warn(getCurrentLocation() + ": ignored with fewer than 2 output sections");
else
@@ -368,9 +373,10 @@ void ScriptParser::readAsNeeded() {
expect("(");
bool orig = config->asNeeded;
config->asNeeded = true;
- while (!errorCount() && !consume(")"))
+ while (peek() != ")" && !atEOF())
addFile(unquote(next()));
config->asNeeded = orig;
+ expect(")");
}
void ScriptParser::readEntry() {
@@ -384,8 +390,9 @@ void ScriptParser::readEntry() {
void ScriptParser::readExtern() {
expect("(");
- while (!errorCount() && !consume(")"))
+ while (peek() != ")" && !atEOF())
config->undefined.push_back(unquote(next()));
+ expect(")");
}
void ScriptParser::readGroup() {
@@ -406,8 +413,12 @@ void ScriptParser::readInclude() {
}
if (std::optional<std::string> path = searchScript(tok)) {
- if (std::optional<MemoryBufferRef> mb = readFile(*path))
- tokenize(*mb);
+ if (std::optional<MemoryBufferRef> mb = readFile(*path)) {
+ buffers.push_back(cur);
+ cur.s = mb->getBuffer();
+ cur.filename = mb->getBufferIdentifier();
+ mbs.push_back(*mb);
+ }
return;
}
setError("cannot find linker script " + tok);
@@ -415,12 +426,13 @@ void ScriptParser::readInclude() {
void ScriptParser::readInput() {
expect("(");
- while (!errorCount() && !consume(")")) {
+ while (peek() != ")" && !atEOF()) {
if (consume("AS_NEEDED"))
readAsNeeded();
else
addFile(unquote(next()));
}
+ expect(")");
}
void ScriptParser::readOutput() {
@@ -435,8 +447,8 @@ void ScriptParser::readOutput() {
void ScriptParser::readOutputArch() {
// OUTPUT_ARCH is ignored for now.
expect("(");
- while (!errorCount() && !consume(")"))
- skip();
+ while (next() != ")" && !atEOF())
+ ;
}
static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) {
@@ -702,8 +714,9 @@ static int precedence(StringRef op) {
StringMatcher ScriptParser::readFilePatterns() {
StringMatcher Matcher;
- while (!errorCount() && !consume(")"))
+ while (peek() != ")" && !atEOF())
Matcher.addPattern(SingleStringMatcher(next()));
+ expect(")");
return Matcher;
}
@@ -790,7 +803,7 @@ ScriptParser::readInputSectionRules(StringRef filePattern, uint64_t withFlags,
make<InputSectionDescription>(filePattern, withFlags, withoutFlags);
expect("(");
- while (!errorCount() && !consume(")")) {
+ while (peek() != ")" && !atEOF()) {
SortSectionPolicy outer = readSortKind();
SortSectionPolicy inner = SortSectionPolicy::Default;
SmallVector<SectionPattern, 0> v;
@@ -816,6 +829,7 @@ ScriptParser::readInputSectionRules(StringRef filePattern, uint64_t withFlags,
std::move(v.begin(), v.end(), std::back_inserter(cmd->sectionPatterns));
}
+ expect(")");
return cmd;
}
@@ -1098,12 +1112,31 @@ SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) {
return cmd;
}
+// Replace whitespace sequence with one single space. The output is used by
+// -Map.
+static void screezeSpaces(std::string &str) {
+ std::string ret;
+ bool flag = false;
+ auto it = str.begin();
+ for (char c : str) {
+ if (isSpace(c)) {
+ if (!flag)
+ *it++ = ' ';
+ flag = true;
+ } else {
+ *it++ = c;
+ flag = false;
+ }
+ }
+ str.erase(it, str.end());
+}
+
SymbolAssignment *ScriptParser::readAssignment(StringRef tok) {
// Assert expression returns Dot, so this is equal to ".=."
if (tok == "ASSERT")
return make<SymbolAssignment>(".", readAssert(), 0, getCurrentLocation());
- size_t oldPos = pos;
+ const char *oldS = prevTok.data();
SymbolAssignment *cmd = nullptr;
bool savedSeenRelroEnd = script->seenRelroEnd;
const StringRef op = peek();
@@ -1127,9 +1160,8 @@ SymbolAssignment *ScriptParser::readAssignment(StringRef tok) {
if (cmd) {
cmd->dataSegmentRelroEnd = !savedSeenRelroEnd && script->seenRelroEnd;
- cmd->commandString =
- tok.str() + " " +
- llvm::join(tokens.begin() + oldPos, tokens.begin() + pos, " ");
+ cmd->commandString = StringRef(oldS, curTok.data() - oldS).str();
+ screezeSpaces(cmd->commandString);
expect(";");
}
return cmd;
@@ -1333,11 +1365,10 @@ ByteCommand *ScriptParser::readByteCommand(StringRef tok) {
if (size == -1)
return nullptr;
- size_t oldPos = pos;
+ const char *oldS = prevTok.data();
Expr e = readParenExpr();
- std::string commandString =
- tok.str() + " " +
- llvm::join(tokens.begin() + oldPos, tokens.begin() + pos, " ");
+ std::string commandString = StringRef(oldS, cur.s.data() - oldS).str();
+ screezeSpaces(commandString);
return make<ByteCommand>(e, size, commandString);
}
diff --git a/lld/test/ELF/invalid-linkerscript.test b/lld/test/ELF/invalid-linkerscript.test
index 4cbedf639cb1a..73b761ce4d571 100644
--- a/lld/test/ELF/invalid-linkerscript.test
+++ b/lld/test/ELF/invalid-linkerscript.test
@@ -15,7 +15,7 @@
# RUN: echo foobar > %t1
# RUN: not ld.lld %t1 no-such-file 2>&1 | FileCheck -check-prefix=ERR1 %s
-# ERR1: unexpected EOF
+# ERR1: error: {{.*}}1:1: unknown directive: foobar
# ERR1: cannot open no-such-file:
# RUN: echo "foo \"bar" > %t2
diff --git a/lld/test/ELF/linkerscript/diag6.test b/lld/test/ELF/linkerscript/diag6.test
index 0ec0400040b54..dc130b451f225 100644
--- a/lld/test/ELF/linkerscript/diag6.test
+++ b/lld/test/ELF/linkerscript/diag6.test
@@ -4,4 +4,4 @@
SECTIONS /*
-CHECK: error: {{.*}}diag6.test:1: unclosed comment in a linker script
+CHECK: error: {{.*}}diag6.test:5: unclosed comment in a linker script
diff --git a/lld/test/ELF/linkerscript/map-file.test b/lld/test/ELF/linkerscript/map-file.test
index 6ec8bafc42b16..cca7a34fb3ac0 100644
--- a/lld/test/ELF/linkerscript/map-file.test
+++ b/lld/test/ELF/linkerscript/map-file.test
@@ -9,15 +9,15 @@
SECTIONS {
. = 0x1000;
.foo : {
- BYTE(0x11)
- SHORT(0x1122)
+ BYTE ( 0x11 )
+ SHORT (0x1122)
LONG(0x11223344)
QUAD(0x1122334455667788)
PROVIDE_HIDDEN(sym4 = .);
. += 0x1000;
*(.foo.1)
PROVIDE(unused1 = 0xff);
- HIDDEN(sym6 = .);
+ HIDDEN( sym6 = . );
. += 0x123 *
(1 + 1);
foo = .;
@@ -34,20 +34,20 @@ SECTIONS {
# CHECK-NEXT: 0 0 1000 1 . = 0x1000
# CHECK-NEXT: 1000 1000 125d 1 .foo
# CHECK-NEXT: 1000 1000 1 1 BYTE ( 0x11 )
-# CHECK-NEXT: 1001 1001 2 1 SHORT ( 0x1122 )
-# CHECK-NEXT: 1003 1003 4 1 LONG ( 0x11223344 )
-# CHECK-NEXT: 1007 1007 8 1 QUAD ( 0x1122334455667788 )
-# CHECK-NEXT: 100f 100f 0 1 PROVIDE_HIDDEN ( sym4 = . )
+# CHECK-NEXT: 1001 1001 2 1 SHORT (0x1122)
+# CHECK-NEXT: 1003 1003 4 1 LONG(0x11223344)
+# CHECK-NEXT: 1007 1007 8 1 QUAD(0x1122334455667788)
+# CHECK-NEXT: 100f 100f 0 1 PROVIDE_HIDDEN(sym4 = .)
# CHECK-NEXT: 100f 100f 1000 1 . += 0x1000
# CHECK-NEXT: 200f 200f 8 1 {{.*}}{{/|\\}}map-file.test.tmp.o:(.foo.1)
-# CHECK-NEXT: 2017 2017 0 1 HIDDEN ( sym6 = . )
-# CHECK-...
[truncated]
|
(@smithp35 I apologize for the increased workload that this and other recent patches from me may place on you. I appreciate your continued support and expertise. Given the significance of this patch, I wanted to give you this heads-up.) |
Created using spr 1.3.5-bogner
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for working on this. I've only got a few small suggestions, the remainder of the changes look fine and I was able to easily match them with the previous implementation.
One suggestion I'm in two minds about is whether Buffer cur
would be better off named curBuf
as cur
on its own could mean anything. However it is almost always used as cur.s and curBuf.s
isn't much better.
return ""; | ||
pos = pos - 1; | ||
return tok; | ||
// curTok is invalid if curTokState and inExpr mismatch. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps worth explaining why. I think something like:
// Rules for what is a token are different when we are in an expression.
// The cached curTok is invalid when the expression state changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the suggestion!
// The cache value of peek(). This is valid if curTokState and inExpr match. | ||
StringRef curTok; | ||
// The inExpr state when curTok is cached. | ||
bool curTokState = false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Was thinking if curTokInExpr
might work better given that these are just booleans.
if (curTokInExpr == inExpr) ...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The idea is that I believe we will soon introduce new states, so I avoided inExpr
in the name here.
lld/ELF/ScriptParser.cpp
Outdated
@@ -1098,12 +1112,31 @@ SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) { | |||
return cmd; | |||
} | |||
|
|||
// Replace whitespace sequence with one single space. The output is used by | |||
// -Map. | |||
static void screezeSpaces(std::string &str) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you mean squeezeSpaces?
Created using spr 1.3.5-bogner
Thanks for the review! I've added |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you think it would be possible to split this patch into several more atomic changes?
Created using spr 1.3.5-bogner
I'd love to make smaller changes, but I don't find a way to do so. The bullet points describe why these functions are so intertwined that make smaller changes infeasible... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the updates. I don't have any more comments. Happy when igorkudrin is fine with the changes.
@@ -1098,12 +1113,31 @@ SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) { | |||
return cmd; | |||
} | |||
|
|||
// Replace whitespace sequence (including \n) with one single space. The output | |||
// is used by -Map. | |||
static void squeezeSpaces(std::string &str) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was wondering if there was a way of writing this using algorithms, but I think the end result was worse; it also has the property of removing all leading white-space (which may not be a problem if that has been skipped already).
Best I came up with:
std::replace_if(
s.begin(), s.end(), [](char c) { return isspace(c); }, ' ');
s.erase(std::unique(s.begin(), s.end(),
                    [](char lhs, char rhs) {
                      return lhs == ' ' && rhs == ' '; }),
        s.end());
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree that <algorithm>
functions might make the algorithm more difficult to read. It will likely require two passes. I've now changed this to a more clever loop:
static void squeezeSpaces(std::string &str) {
char prev = '\0';
auto it = str.begin();
for (char c : str)
if (!isSpace(c) || (c = ' ') != prev)
*it++ = prev = c;
str.erase(it, str.end());
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
After #100493, the idiom `while (!errorCount() && !consume("}"))` could lead to inaccurate diagnostics or dead loops. Introduce till to change the code pattern.
`getLineNumber` is both imprecise (when `INCLUDE` is used) and inefficient (see https://reviews.llvm.org/D104137). Track line number precisely now that we have `struct Buffer` abstraction from #100493.
Fix #93947: the cycle detection mechanism added by https://reviews.llvm.org/D37524 also disallowed including a file twice, which is an unnecessary limitation. Now that we have an include stack #100493, supporting multiple inclusion is trivial. Note: a filename can be referenced with many different paths, e.g. a.lds, ./a.lds, ././a.lds. We don't attempt to detect the cycle in the earliest point.
} | ||
return tokens[pos++]; | ||
prevTok = peek(); | ||
return std::exchange(curTok, StringRef(curBuf.s.data(), 0)); | ||
} | ||
|
||
StringRef ScriptLexer::peek() { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Apologies; didn't get a chance to ask this before it went out.
The lexers I've typically seen preemptively read in the first token; peek()
is a no-op that returns the previously read token, while next()
is what reads the next token. By contrast, this patch has an additional bit of state in the lexer: whether or not a token has been read in yet. That creates the issue with atEOF
: unless peek()
has been called at least once, it isn't yet known.
Could the LLD lexer be written that way, or is there a concern that I'm missing? It seems like it would have made less of an upset to the rest of the parser, and it seems simpler to reason about, having less state. (Somewhat subjective, for sure.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The lexer is indeed different from traditional implementations. The reason is to support states (currently just inExpr
or not).
peek
caches the token at curBuf
. curBuf
might not be used if the state changes. Therefore, peek
has to check curBufState
- and prepares to drop the cache if the state changed by the parser.
If next
is made to read the next token, the token might get stale as well when the state changes.
I think atEOF
is still like traditional parsers, which require a peek
to know whether EOF has been encountered.
Some setError
was added in the current way to have minimum impact to the test suite. The setError
uses could be changed, but many tests would require updates.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The lexer is indeed different from traditional implementations. The reason is to support states (currently just
inExpr
or not).peek
caches the token atcurBuf
.curBuf
might not be used if the state changes. Therefore,peek
has to checkcurBufState
- and prepares to drop the cache if the state changed by the parser.If
next
is made to read the next token, the token might get stale as well when the state changes.
It seems like this could be addressed by having state changes do a little bit more work: discarding the current token and re-lexing a token in the new state.
I think
atEOF
is still like traditional parsers, which require apeek
to know whether EOF has been encountered.
This is surprising to me; I had thought that most either maintained a valid atEOF as an invariant, or even called it out as a designated token type. The latter is what I've more typically seen.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems like this could be addressed by having state changes do a little bit more work: discarding the current token and re-lexing a token in the new state.
This is exactly what the current peek()
does.
Are you suggesting that next()
should consume the current token and peek the next one?
Since we don't always call peek()
before next()
, this would require next()
to call peek twice, which would be awkward.
Currently, when peek()
or next()
returns the empty string, they indicate an EOF token.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you suggesting that
next()
should consume the current token and peek the next one? Since we don't always callpeek()
beforenext()
, this would requirenext()
to call peek twice, which would be awkward.
Not exactly; I'd expect peek()
to make no changes to internal state. next()
would lex a token and set it as the current one, update atEOF, and return empty string if atEOF
. It might even be possible to remove atEOF
by representing this as an empty curTok
.
I haven't actually tried to put together a patch for this idea yet, so it may well fall apart in the details. I've also admittedly not done a deep swim through the lexer; mostly just wanted to gauge if this kind of approach is desirable before investing more time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I get what you meant and I've tried such a solution. Unfortunately, at least with the current uses, peek() { return curTok; }
isn't feasible. Several places have intricate state transition before peek
The most "strange" places:
- readSectionAddressType:
inExpr
is just changed beforepeek()
peek() == "=" || peek().starts_with("=")
inreadOutputSectionDescription
- ...
If we want to make peek
return curTok, we have to change some consume
and expect
to switch the state and there might be some places that are infeasible. I believe the current expect("("); change state; peek()
way is more readable.
[NFC][HeterogeneousDWARF] Remove MachineModuleInfo references Remove superfluous indirection through MMI member of MF to prepare for it being removed. CodeGen: Remove MachineModuleInfo reference from MachineFunction (llvm#100357) This avoids another unserializable field. Move the DbgInfoAvailable field into the AsmPrinter, which is only really a cache/convenience bit for checking a direct IR module metadata check. [Comgr] Updated expected error message for compile_log test Upstream updates to the ScriptLexer have modified the expected error message generated when linking the invalid file to a relocatable. Instead of an unexpected EOF, we now hit an unknown directive: llvm#100493 Change-Id: I315072967c4c6d1b94b3f6370bb59d4308efe238
The current tokenize-whole-file approach has a few limitations.
- maybeSplitExpr is needed to parse expressions. It's infeasible to add new states to behave more like GNU ld.
- readInclude may insert tokens in the middle, leading to a time complexity issue with N-nested INCLUDE, and to incorrect state after an INCLUDE.
- getLineNumber cannot be made more efficient without significant code complexity and memory consumption. https://reviews.llvm.org/D104137
The patch switches to a traditional lexer that generates tokens lazily.
- atEOF behavior is modified: we need to call peek to determine EOF. peek and next cannot call setError upon atEOF.
- Since consume no longer reports an error upon atEOF, the idiom while (!errorCount() && !consume(")")) would cause a dead loop. Use while (peek() != ")" && !atEOF()) { ... } expect(")") instead.
- readInclude now pushes the current buffer onto a stack and switches to the included file. This can be utilized to address "[LLD] Invalid 'there is a cycle in linker script INCLUDEs' error" #93947 properly.
andpos
are removed.commandString
is reimplemented. Since it is used in -Map output,\n
needs to be replaced with space.