-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[ELF] ScriptLexer: generate tokens lazily #100493
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,24 +36,24 @@ using namespace llvm; | |
using namespace lld; | ||
using namespace lld::elf; | ||
|
||
ScriptLexer::ScriptLexer(MemoryBufferRef mb) : curBuf(mb), mbs(1, mb) {} | ||
|
||
// Returns a whole line containing the current token. | ||
StringRef ScriptLexer::getLine() { | ||
StringRef s = getCurrentMB().getBuffer(); | ||
StringRef tok = tokens[pos - 1]; | ||
|
||
size_t pos = s.rfind('\n', tok.data() - s.data()); | ||
size_t pos = s.rfind('\n', prevTok.data() - s.data()); | ||
if (pos != StringRef::npos) | ||
s = s.substr(pos + 1); | ||
return s.substr(0, s.find_first_of("\r\n")); | ||
} | ||
|
||
// Returns 1-based line number of the current token. | ||
size_t ScriptLexer::getLineNumber() { | ||
if (pos == 0) | ||
if (prevTok.empty()) | ||
return 1; | ||
StringRef s = getCurrentMB().getBuffer(); | ||
StringRef tok = tokens[pos - 1]; | ||
const size_t tokOffset = tok.data() - s.data(); | ||
const size_t tokOffset = prevTok.data() - s.data(); | ||
|
||
// For the first token, or when going backwards, start from the beginning of | ||
// the buffer. If this token is after the previous token, start from the | ||
|
@@ -76,40 +76,41 @@ size_t ScriptLexer::getLineNumber() { | |
|
||
// Returns 0-based column number of the current token. | ||
size_t ScriptLexer::getColumnNumber() { | ||
StringRef tok = tokens[pos - 1]; | ||
return tok.data() - getLine().data(); | ||
return prevTok.data() - getLine().data(); | ||
} | ||
|
||
std::string ScriptLexer::getCurrentLocation() { | ||
std::string filename = std::string(getCurrentMB().getBufferIdentifier()); | ||
return (filename + ":" + Twine(getLineNumber())).str(); | ||
} | ||
|
||
ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); } | ||
|
||
// We don't want to record cascading errors. Keep only the first one. | ||
void ScriptLexer::setError(const Twine &msg) { | ||
if (errorCount()) | ||
return; | ||
|
||
std::string s = (getCurrentLocation() + ": " + msg).str(); | ||
if (pos) | ||
if (prevTok.size()) | ||
s += "\n>>> " + getLine().str() + "\n>>> " + | ||
std::string(getColumnNumber(), ' ') + "^"; | ||
error(s); | ||
} | ||
|
||
// Split S into linker script tokens. | ||
void ScriptLexer::tokenize(MemoryBufferRef mb) { | ||
std::vector<StringRef> vec; | ||
mbs.push_back(mb); | ||
StringRef s = mb.getBuffer(); | ||
StringRef begin = s; | ||
|
||
void ScriptLexer::lex() { | ||
for (;;) { | ||
StringRef &s = curBuf.s; | ||
s = skipSpace(s); | ||
if (s.empty()) | ||
break; | ||
if (s.empty()) { | ||
// If this buffer is from an INCLUDE command, switch to the "return | ||
// value"; otherwise, mark EOF. | ||
if (buffers.empty()) { | ||
eof = true; | ||
return; | ||
} | ||
curBuf = buffers.pop_back_val(); | ||
continue; | ||
} | ||
curTokState = inExpr; | ||
|
||
// Quoted token. Note that double-quote characters are parts of a token | ||
// because, in a glob match context, only unquoted tokens are interpreted | ||
|
@@ -118,45 +119,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) { | |
if (s.starts_with("\"")) { | ||
size_t e = s.find("\"", 1); | ||
if (e == StringRef::npos) { | ||
StringRef filename = mb.getBufferIdentifier(); | ||
size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n'); | ||
error(filename + ":" + Twine(lineno + 1) + ": unclosed quote"); | ||
size_t lineno = | ||
StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n'); | ||
error(curBuf.filename + ":" + Twine(lineno + 1) + ": unclosed quote"); | ||
return; | ||
} | ||
|
||
vec.push_back(s.take_front(e + 1)); | ||
curTok = s.take_front(e + 1); | ||
s = s.substr(e + 1); | ||
continue; | ||
return; | ||
} | ||
|
||
// Some operators form separate tokens. | ||
if (s.starts_with("<<=") || s.starts_with(">>=")) { | ||
vec.push_back(s.substr(0, 3)); | ||
curTok = s.substr(0, 3); | ||
s = s.substr(3); | ||
continue; | ||
return; | ||
} | ||
if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) || | ||
(s[0] == s[1] && strchr("<>&|", s[0])))) { | ||
vec.push_back(s.substr(0, 2)); | ||
if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) { | ||
curTok = s.substr(0, 2); | ||
s = s.substr(2); | ||
continue; | ||
return; | ||
} | ||
|
||
// Unquoted token. This is more relaxed than tokens in C-like language, | ||
// so that you can write "file-name.cpp" as one bare token, for example. | ||
size_t pos = s.find_first_not_of( | ||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" | ||
"0123456789_.$/\\~=+[]*?-!^:"); | ||
// Unquoted token. The non-expression token is more relaxed than tokens in | ||
// C-like languages, so that you can write "file-name.cpp" as one bare | ||
// token. | ||
size_t pos; | ||
if (inExpr) { | ||
pos = s.find_first_not_of( | ||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" | ||
"0123456789_.$"); | ||
if (pos == 0 && s.size() >= 2 && | ||
((s[0] == s[1] && strchr("<>&|", s[0])) || | ||
is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2)))) | ||
pos = 2; | ||
} else { | ||
pos = s.find_first_not_of( | ||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" | ||
"0123456789_.$/\\~=+[]*?-!^:"); | ||
} | ||
|
||
// A character that cannot start a word (which is usually a | ||
// punctuation) forms a single character token. | ||
if (pos == 0) | ||
pos = 1; | ||
vec.push_back(s.substr(0, pos)); | ||
curTok = s.substr(0, pos); | ||
s = s.substr(pos); | ||
break; | ||
} | ||
|
||
tokens.insert(tokens.begin() + pos, vec.begin(), vec.end()); | ||
} | ||
|
||
// Skip leading whitespace characters or comments. | ||
|
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) { | |
} | ||
} | ||
|
||
// An erroneous token is handled as if it were the last token before EOF. | ||
bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; } | ||
|
||
// Split a given string as an expression. | ||
// This function returns "3", "*" and "5" for "3*5" for example. | ||
static std::vector<StringRef> tokenizeExpr(StringRef s) { | ||
StringRef ops = "!~*/+-<>?^:="; // List of operators | ||
|
||
// Quoted strings are literal strings, so we don't want to split it. | ||
if (s.starts_with("\"")) | ||
return {s}; | ||
|
||
// Split S with operators as separators. | ||
std::vector<StringRef> ret; | ||
while (!s.empty()) { | ||
size_t e = s.find_first_of(ops); | ||
|
||
// No need to split if there is no operator. | ||
if (e == StringRef::npos) { | ||
ret.push_back(s); | ||
break; | ||
} | ||
|
||
// Get a token before the operator. | ||
if (e != 0) | ||
ret.push_back(s.substr(0, e)); | ||
|
||
// Get the operator as a token. | ||
// Keep !=, ==, >=, <=, << and >> operators as a single tokens. | ||
if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") || | ||
s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") || | ||
s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) { | ||
ret.push_back(s.substr(e, 2)); | ||
s = s.substr(e + 2); | ||
} else { | ||
ret.push_back(s.substr(e, 1)); | ||
s = s.substr(e + 1); | ||
} | ||
} | ||
return ret; | ||
} | ||
|
||
// In contexts where expressions are expected, the lexer should apply | ||
// different tokenization rules than the default one. By default, | ||
// arithmetic operator characters are regular characters, but in the | ||
// expression context, they should be independent tokens. | ||
// | ||
// For example, "foo*3" should be tokenized to "foo", "*" and "3" only | ||
// in the expression context. | ||
// | ||
// This function may split the current token into multiple tokens. | ||
void ScriptLexer::maybeSplitExpr() { | ||
if (!inExpr || errorCount() || atEOF()) | ||
return; | ||
|
||
std::vector<StringRef> v = tokenizeExpr(tokens[pos]); | ||
if (v.size() == 1) | ||
return; | ||
tokens.erase(tokens.begin() + pos); | ||
tokens.insert(tokens.begin() + pos, v.begin(), v.end()); | ||
} | ||
// Used to determine whether to stop parsing. Treat errors like EOF. | ||
bool ScriptLexer::atEOF() { return eof || errorCount(); } | ||
|
||
StringRef ScriptLexer::next() { | ||
maybeSplitExpr(); | ||
|
||
if (errorCount()) | ||
return ""; | ||
if (atEOF()) { | ||
setError("unexpected EOF"); | ||
return ""; | ||
} | ||
return tokens[pos++]; | ||
prevTok = peek(); | ||
return std::exchange(curTok, StringRef(curBuf.s.data(), 0)); | ||
} | ||
|
||
StringRef ScriptLexer::peek() { | ||
StringRef tok = next(); | ||
if (errorCount()) | ||
return ""; | ||
pos = pos - 1; | ||
return tok; | ||
// curTok is invalid if curTokState and inExpr mismatch. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps worth explaining why. I think something like: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the suggestion! |
||
if (curTok.size() && curTokState != inExpr) { | ||
curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data()); | ||
curTok = {}; | ||
} | ||
if (curTok.empty()) | ||
lex(); | ||
return curTok; | ||
} | ||
|
||
bool ScriptLexer::consume(StringRef tok) { | ||
if (next() == tok) | ||
return true; | ||
--pos; | ||
return false; | ||
if (peek() != tok) | ||
return false; | ||
next(); | ||
return true; | ||
} | ||
|
||
void ScriptLexer::skip() { (void)next(); } | ||
|
@@ -280,8 +226,12 @@ void ScriptLexer::expect(StringRef expect) { | |
if (errorCount()) | ||
return; | ||
StringRef tok = next(); | ||
if (tok != expect) | ||
setError(expect + " expected, but got " + tok); | ||
if (tok != expect) { | ||
if (atEOF()) | ||
setError("unexpected EOF"); | ||
else | ||
setError(expect + " expected, but got " + tok); | ||
} | ||
} | ||
|
||
// Returns true if S encloses T. | ||
|
@@ -292,10 +242,8 @@ static bool encloses(StringRef s, StringRef t) { | |
MemoryBufferRef ScriptLexer::getCurrentMB() { | ||
// Find input buffer containing the current token. | ||
assert(!mbs.empty()); | ||
if (pos == 0) | ||
return mbs.back(); | ||
for (MemoryBufferRef mb : mbs) | ||
if (encloses(mb.getBuffer(), tokens[pos - 1])) | ||
if (encloses(mb.getBuffer(), curBuf.s)) | ||
return mb; | ||
llvm_unreachable("getCurrentMB: failed to find a token"); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,18 +10,43 @@ | |
#define LLD_ELF_SCRIPT_LEXER_H | ||
|
||
#include "lld/Common/LLVM.h" | ||
#include "llvm/ADT/SmallVector.h" | ||
#include "llvm/ADT/StringRef.h" | ||
#include "llvm/Support/MemoryBufferRef.h" | ||
#include <vector> | ||
|
||
namespace lld::elf { | ||
|
||
class ScriptLexer { | ||
protected: | ||
struct Buffer { | ||
// The remaining content to parse and the filename. | ||
StringRef s, filename; | ||
const char *begin = nullptr; | ||
Buffer() = default; | ||
Buffer(MemoryBufferRef mb) | ||
: s(mb.getBuffer()), filename(mb.getBufferIdentifier()), | ||
begin(mb.getBufferStart()) {} | ||
}; | ||
// The current buffer and parent buffers due to INCLUDE. | ||
Buffer curBuf; | ||
SmallVector<Buffer, 0> buffers; | ||
|
||
// The token before the last next(). | ||
StringRef prevTok; | ||
// Rules for what is a token are different when we are in an expression. | ||
// curTok holds the cached return value of peek() and is invalid when the | ||
// expression state changes. | ||
StringRef curTok; | ||
// The inExpr state when curTok is cached. | ||
bool curTokState = false; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was thinking if
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The idea is that I believe we will soon introduce new states, so I avoided |
||
bool eof = false; | ||
|
||
public: | ||
explicit ScriptLexer(MemoryBufferRef mb); | ||
|
||
void setError(const Twine &msg); | ||
void tokenize(MemoryBufferRef mb); | ||
void lex(); | ||
StringRef skipSpace(StringRef s); | ||
bool atEOF(); | ||
StringRef next(); | ||
|
@@ -33,15 +58,12 @@ class ScriptLexer { | |
MemoryBufferRef getCurrentMB(); | ||
|
||
std::vector<MemoryBufferRef> mbs; | ||
std::vector<StringRef> tokens; | ||
bool inExpr = false; | ||
size_t pos = 0; | ||
|
||
size_t lastLineNumber = 0; | ||
size_t lastLineNumberOffset = 0; | ||
|
||
private: | ||
void maybeSplitExpr(); | ||
StringRef getLine(); | ||
size_t getLineNumber(); | ||
size_t getColumnNumber(); | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Apologies; didn't get a chance to ask this before it went out.
The lexers I've typically seen preemptively read in the first token;
peek()
is a no-op that returns the previously read token, whilenext()
is what reads the next token. By contrast, this patch has an additional bit of state in the lexer: whether or not a token has been read in yet. That creates the issue withatEOF
: unlesspeek()
has been called at least once, it isn't yet known.Could the LLD lexer be written that way, or is there a concern that I'm missing? It seems like it would have made less of an upset to the rest of the parser, and it seems simpler to reason about, having less state. (Somewhat subjective, for sure.)
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The lexer is indeed different from traditional implementations. The reason is to support states (currently just
inExpr
or not).peek
caches the token atcurBuf
.curBuf
might not be used if the state changes. Therefore,peek
has to checkcurBufState
- and prepares to drop the cache if the state changed by the parser.If
next
is made to read the next token, the token might get stale as well when the state changes.I think
atEOF
is still like traditional parsers, which require apeek
to know whether EOF has been encountered.Some
setError
was added in the current way to have minimum impact to the test suite. ThesetError
uses could be changed, but many tests would require updates.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems like this could be addressed by having state changes do a little bit more work: discarding the current token and re-lexing a token in the new state.
This is surprising to me; I had thought that most either maintained a valid atEOF as an invariant, or even called it out as a designated token type. The latter is what I've more typically seen.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is exactly what the current
peek()
does.Are you suggesting that
next()
should consume the current token and peek the next one?Since we don't always call
peek()
beforenext()
, this would requirenext()
to call peek twice, which would be awkward.Currently, when
peek()
ornext()
returns the empty string, they indicate an EOF token.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not exactly; I'd expect
peek()
to make no changes to internal state.next()
would lex a token and set it as the current one, update atEOF, and return empty string ifatEOF
. It might even be possible to removeatEOF
by representing this as an emptycurTok
.I haven't actually tried to put together a patch for this idea yet, so it may well fall apart in the details. I've also admittedly not done a deep swim through the lexer; mostly just wanted to gauge if this kind of approach is desirable before investing more time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I get what you meant and I've tried such a solution. Unfortunately, at least with the current uses,
peek() { return curTok; }
isn't feasible. Several places have intricate state transition beforepeek
The most "strange" places:
inExpr
is just changed beforepeek()
peek() == "=" || peek().starts_with("=")
inreadOutputSectionDescription
If we want to make
peek
return curTok, we have to change someconsume
andexpect
to switch the state and there might be some places that are infeasible. I believe the currentexpect("("); change state; peek()
way is more readable.