[ELF] ScriptLexer: generate tokens lazily #100493

Merged
196 changes: 72 additions & 124 deletions lld/ELF/ScriptLexer.cpp
@@ -36,24 +36,24 @@ using namespace llvm;
using namespace lld;
using namespace lld::elf;

ScriptLexer::ScriptLexer(MemoryBufferRef mb) : curBuf(mb), mbs(1, mb) {}

// Returns a whole line containing the current token.
StringRef ScriptLexer::getLine() {
StringRef s = getCurrentMB().getBuffer();
StringRef tok = tokens[pos - 1];

size_t pos = s.rfind('\n', tok.data() - s.data());
size_t pos = s.rfind('\n', prevTok.data() - s.data());
if (pos != StringRef::npos)
s = s.substr(pos + 1);
return s.substr(0, s.find_first_of("\r\n"));
}

// Returns 1-based line number of the current token.
size_t ScriptLexer::getLineNumber() {
if (pos == 0)
if (prevTok.empty())
return 1;
StringRef s = getCurrentMB().getBuffer();
StringRef tok = tokens[pos - 1];
const size_t tokOffset = tok.data() - s.data();
const size_t tokOffset = prevTok.data() - s.data();

// For the first token, or when going backwards, start from the beginning of
// the buffer. If this token is after the previous token, start from the
@@ -76,40 +76,41 @@ size_t ScriptLexer::getLineNumber() {

// Returns 0-based column number of the current token.
size_t ScriptLexer::getColumnNumber() {
StringRef tok = tokens[pos - 1];
return tok.data() - getLine().data();
return prevTok.data() - getLine().data();
}

std::string ScriptLexer::getCurrentLocation() {
std::string filename = std::string(getCurrentMB().getBufferIdentifier());
return (filename + ":" + Twine(getLineNumber())).str();
}

ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }

// We don't want to record cascading errors. Keep only the first one.
void ScriptLexer::setError(const Twine &msg) {
if (errorCount())
return;

std::string s = (getCurrentLocation() + ": " + msg).str();
if (pos)
if (prevTok.size())
s += "\n>>> " + getLine().str() + "\n>>> " +
std::string(getColumnNumber(), ' ') + "^";
error(s);
}

// Split S into linker script tokens.
void ScriptLexer::tokenize(MemoryBufferRef mb) {
std::vector<StringRef> vec;
mbs.push_back(mb);
StringRef s = mb.getBuffer();
StringRef begin = s;

void ScriptLexer::lex() {
for (;;) {
StringRef &s = curBuf.s;
s = skipSpace(s);
if (s.empty())
break;
if (s.empty()) {
// If this buffer is from an INCLUDE command, switch to the "return
// value"; otherwise, mark EOF.
if (buffers.empty()) {
eof = true;
return;
}
curBuf = buffers.pop_back_val();
continue;
}
curTokState = inExpr;

// Quoted token. Note that double-quote characters are parts of a token
// because, in a glob match context, only unquoted tokens are interpreted
@@ -118,45 +119,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) {
if (s.starts_with("\"")) {
size_t e = s.find("\"", 1);
if (e == StringRef::npos) {
StringRef filename = mb.getBufferIdentifier();
size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
size_t lineno =
StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n');
error(curBuf.filename + ":" + Twine(lineno + 1) + ": unclosed quote");
return;
}

vec.push_back(s.take_front(e + 1));
curTok = s.take_front(e + 1);
s = s.substr(e + 1);
continue;
return;
}

// Some operators form separate tokens.
if (s.starts_with("<<=") || s.starts_with(">>=")) {
vec.push_back(s.substr(0, 3));
curTok = s.substr(0, 3);
s = s.substr(3);
continue;
return;
}
if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
(s[0] == s[1] && strchr("<>&|", s[0])))) {
vec.push_back(s.substr(0, 2));
if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) {
curTok = s.substr(0, 2);
s = s.substr(2);
continue;
return;
}

// Unquoted token. This is more relaxed than tokens in C-like language,
// so that you can write "file-name.cpp" as one bare token, for example.
size_t pos = s.find_first_not_of(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$/\\~=+[]*?-!^:");
// Unquoted token. The non-expression token is more relaxed than tokens in
// C-like languages, so that you can write "file-name.cpp" as one bare
// token.
size_t pos;
if (inExpr) {
pos = s.find_first_not_of(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$");
if (pos == 0 && s.size() >= 2 &&
((s[0] == s[1] && strchr("<>&|", s[0])) ||
is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2))))
pos = 2;
} else {
pos = s.find_first_not_of(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$/\\~=+[]*?-!^:");
}

// A character that cannot start a word (which is usually a
// punctuation) forms a single character token.
if (pos == 0)
pos = 1;
vec.push_back(s.substr(0, pos));
curTok = s.substr(0, pos);
s = s.substr(pos);
break;
}

tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
}

// Skip leading whitespace characters or comments.
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
}
}

// An erroneous token is handled as if it were the last token before EOF.
bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }

// Split a given string as an expression.
// This function returns "3", "*" and "5" for "3*5" for example.
static std::vector<StringRef> tokenizeExpr(StringRef s) {
StringRef ops = "!~*/+-<>?^:="; // List of operators

// Quoted strings are literal strings, so we don't want to split it.
if (s.starts_with("\""))
return {s};

// Split S with operators as separators.
std::vector<StringRef> ret;
while (!s.empty()) {
size_t e = s.find_first_of(ops);

// No need to split if there is no operator.
if (e == StringRef::npos) {
ret.push_back(s);
break;
}

// Get a token before the operator.
if (e != 0)
ret.push_back(s.substr(0, e));

// Get the operator as a token.
// Keep !=, ==, >=, <=, << and >> operators as a single tokens.
if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
ret.push_back(s.substr(e, 2));
s = s.substr(e + 2);
} else {
ret.push_back(s.substr(e, 1));
s = s.substr(e + 1);
}
}
return ret;
}

// In contexts where expressions are expected, the lexer should apply
// different tokenization rules than the default one. By default,
// arithmetic operator characters are regular characters, but in the
// expression context, they should be independent tokens.
//
// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
// in the expression context.
//
// This function may split the current token into multiple tokens.
void ScriptLexer::maybeSplitExpr() {
if (!inExpr || errorCount() || atEOF())
return;

std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
if (v.size() == 1)
return;
tokens.erase(tokens.begin() + pos);
tokens.insert(tokens.begin() + pos, v.begin(), v.end());
}
// Used to determine whether to stop parsing. Treat errors like EOF.
bool ScriptLexer::atEOF() { return eof || errorCount(); }

StringRef ScriptLexer::next() {
maybeSplitExpr();

if (errorCount())
return "";
if (atEOF()) {
setError("unexpected EOF");
return "";
}
return tokens[pos++];
prevTok = peek();
return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
}

StringRef ScriptLexer::peek() {
@mysterymath (Contributor) commented on Jul 29, 2024:

Apologies; didn't get a chance to ask this before it went out.

The lexers I've typically seen preemptively read in the first token; peek() is a no-op that returns the previously read token, while next() is what reads the next token. By contrast, this patch has an additional bit of state in the lexer: whether or not a token has been read in yet. That creates the issue with atEOF: unless peek() has been called at least once, it isn't yet known.

Could the LLD lexer be written that way, or is there a concern that I'm missing? It seems like it would have made less of an upset to the rest of the parser, and it seems simpler to reason about, having less state. (Somewhat subjective, for sure.)

@MaskRay (Member, Author) commented on Jul 30, 2024:

The lexer is indeed different from traditional implementations. The reason is to support states (currently just inExpr or not).
peek() caches the token lexed from curBuf. That cached token might not be usable if the state changes, so peek() has to check curTokState and is prepared to drop the cache when the parser has changed the state.

If next() were made to read the next token eagerly, that token might get stale as well when the state changes.

I think atEOF() is still like traditional parsers, which require a peek to know whether EOF has been encountered.

Some setError calls were added in the current way to minimize the impact on the test suite. The setError uses could be changed, but many tests would require updates.
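
To make the cache-plus-state idea concrete, here is a minimal standalone sketch. It is not the lld code: TinyLexer and its token rules are invented for the example, and only the shape of peek()/next() mirrors the patch. peek() remembers the state the cached token was lexed under and re-lexes after the parser flips inExpr.

```cpp
#include <cassert>
#include <cctype>
#include <string_view>

// Illustrative only: one cached lookahead token plus the state it was lexed in.
struct TinyLexer {
  std::string_view buf;     // remaining input
  std::string_view curTok;  // cached lookahead token
  bool inExpr = false;      // state the parser may toggle at any time
  bool curTokState = false; // state under which curTok was lexed

  std::string_view peek() {
    if (!curTok.empty() && curTokState != inExpr) {
      // The cache was produced under a different state: rewind and re-lex.
      const char *end = buf.data() + buf.size();
      buf = std::string_view(curTok.data(), end - curTok.data());
      curTok = {};
    }
    if (curTok.empty())
      lexOne();
    return curTok;
  }

  std::string_view next() {
    std::string_view tok = peek();
    curTok = {};
    return tok;
  }

private:
  void lexOne() {
    curTokState = inExpr;
    while (!buf.empty() && buf.front() == ' ')
      buf.remove_prefix(1);
    if (buf.empty()) {
      curTok = {};
      return;
    }
    // Toy rule: in expression state '-' is an operator and ends a bare token;
    // outside expressions it may be part of a name like "file-name".
    auto isWord = [&](char c) {
      return std::isalnum(static_cast<unsigned char>(c)) || (!inExpr && c == '-');
    };
    size_t n = 1;
    if (isWord(buf[0]))
      while (n < buf.size() && isWord(buf[n]))
        ++n;
    curTok = buf.substr(0, n);
    buf.remove_prefix(n);
  }
};

int main() {
  TinyLexer lx{"file-name"};
  assert(lx.peek() == "file-name"); // lexed with inExpr == false
  lx.inExpr = true;                 // parser enters an expression context
  assert(lx.peek() == "file");      // stale cache is dropped and re-lexed
}
```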

@mysterymath (Contributor) replied:

> The lexer is indeed different from traditional implementations. The reason is to support states (currently just inExpr or not). peek() caches the token lexed from curBuf. That cached token might not be usable if the state changes, so peek() has to check curTokState and is prepared to drop the cache when the parser has changed the state.
>
> If next() were made to read the next token eagerly, that token might get stale as well when the state changes.

It seems like this could be addressed by having state changes do a little bit more work: discarding the current token and re-lexing a token in the new state.

> I think atEOF() is still like traditional parsers, which require a peek to know whether EOF has been encountered.

This is surprising to me; I had thought that most either maintained a valid atEOF as an invariant, or even called it out as a designated token type. The latter is what I've more typically seen.

@MaskRay (Member, Author) replied:

> It seems like this could be addressed by having state changes do a little bit more work: discarding the current token and re-lexing a token in the new state.

This is exactly what the current peek() does.

Are you suggesting that next() should consume the current token and peek the next one?
Since we don't always call peek() before next(), this would require next() to call peek twice, which would be awkward.


Currently, when peek() or next() returns the empty string, that indicates an EOF token.

@mysterymath (Contributor) commented on Jul 31, 2024:

> Are you suggesting that next() should consume the current token and peek the next one? Since we don't always call peek() before next(), this would require next() to call peek twice, which would be awkward.

Not exactly; I'd expect peek() to make no changes to internal state. next() would lex a token and set it as the current one, update atEOF, and return empty string if atEOF. It might even be possible to remove atEOF by representing this as an empty curTok.

I haven't actually tried to put together a patch for this idea yet, so it may well fall apart in the details. I've also admittedly not done a deep swim through the lexer; mostly just wanted to gauge if this kind of approach is desirable before investing more time.
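
For contrast, here is a minimal standalone sketch of the eager design proposed above. It is illustrative only, not a patch: EagerLexer is an invented name, it lexes whitespace-separated tokens, and it sidesteps the inExpr problem discussed in this thread precisely because it has a single tokenization rule. The point it shows is that with one pre-read token, peek() never mutates state and atEOF() is a plain flag read.

```cpp
#include <cassert>
#include <string_view>

struct EagerLexer {
  explicit EagerLexer(std::string_view b) : buf(b) { advance(); }

  std::string_view peek() const { return curTok; } // never mutates state
  bool atEOF() const { return eof; }               // always valid

  std::string_view next() {
    std::string_view tok = curTok;
    advance(); // eagerly lex the following token
    return tok;
  }

private:
  void advance() {
    while (!buf.empty() && buf.front() == ' ')
      buf.remove_prefix(1);
    if (buf.empty()) {
      curTok = {};
      eof = true;
      return;
    }
    size_t n = 1;
    while (n < buf.size() && buf[n] != ' ')
      ++n;
    curTok = buf.substr(0, n);
    buf.remove_prefix(n);
  }

  std::string_view buf;     // remaining input
  std::string_view curTok;  // the one pre-read token
  bool eof = false;
};

int main() {
  EagerLexer lx("SECTIONS {");
  assert(!lx.atEOF());             // known without any peek()
  assert(lx.next() == "SECTIONS");
  assert(lx.next() == "{");
  assert(lx.atEOF());
}
```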

@MaskRay (Member, Author) replied:

I think I get what you meant, and I've tried such a solution. Unfortunately, at least with the current uses, peek() { return curTok; } isn't feasible. Several places have intricate state transitions before peek().

The most "strange" places:

  • readSectionAddressType: inExpr is changed just before peek()
  • peek() == "=" || peek().starts_with("=") in readOutputSectionDescription
  • ...

If we want to make peek() return curTok, we have to change some consume and expect calls to switch the state, and there might be some places where that is infeasible. I believe the current expect("("); change state; peek() approach is more readable.
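
A tiny, hypothetical illustration of the expect("("); change state; peek() pattern referred to above. StubLexer and its canned tokens are invented for this sketch; the real state changes and lookahead live in the linker-script parser. The parser flips inExpr before calling peek(), and peek() answers under the new tokenization rules (faked here with a hard-coded result).

```cpp
#include <cassert>
#include <string_view>

struct StubLexer {
  bool inExpr = false;
  // Pretend the remaining input is "foo*3". Outside an expression that is one
  // bare token; inside an expression '*' is an operator, so the token is "foo".
  std::string_view peek() const { return inExpr ? "foo" : "foo*3"; }
};

// Parser fragment in the spirit of readSectionAddressType: switch the state
// first, then look ahead under expression rules.
static bool startsWithExprWord(StubLexer &lex) {
  lex.inExpr = true;          // change state before peeking
  return lex.peek() == "foo"; // lookahead already follows expression rules
}

int main() {
  StubLexer lex;
  assert(lex.peek() == "foo*3"); // non-expression rules
  assert(startsWithExprWord(lex));
}
```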

StringRef tok = next();
if (errorCount())
return "";
pos = pos - 1;
return tok;
// curTok is invalid if curTokState and inExpr mismatch.
Collaborator commented:

Perhaps worth explaining why. I think something like:
// Rules for what is a token are different when we are in an expression.
// The cached curTok is invalid when the expression state changes.

@MaskRay (Member, Author) replied:

Thanks for the suggestion!

if (curTok.size() && curTokState != inExpr) {
curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data());
curTok = {};
}
if (curTok.empty())
lex();
return curTok;
}

bool ScriptLexer::consume(StringRef tok) {
if (next() == tok)
return true;
--pos;
return false;
if (peek() != tok)
return false;
next();
return true;
}

void ScriptLexer::skip() { (void)next(); }
@@ -280,8 +226,12 @@ void ScriptLexer::expect(StringRef expect) {
if (errorCount())
return;
StringRef tok = next();
if (tok != expect)
setError(expect + " expected, but got " + tok);
if (tok != expect) {
if (atEOF())
setError("unexpected EOF");
else
setError(expect + " expected, but got " + tok);
}
}

// Returns true if S encloses T.
@@ -292,10 +242,8 @@ static bool encloses(StringRef s, StringRef t) {
MemoryBufferRef ScriptLexer::getCurrentMB() {
// Find input buffer containing the current token.
assert(!mbs.empty());
if (pos == 0)
return mbs.back();
for (MemoryBufferRef mb : mbs)
if (encloses(mb.getBuffer(), tokens[pos - 1]))
if (encloses(mb.getBuffer(), curBuf.s))
return mb;
llvm_unreachable("getCurrentMB: failed to find a token");
}
30 changes: 26 additions & 4 deletions lld/ELF/ScriptLexer.h
@@ -10,18 +10,43 @@
#define LLD_ELF_SCRIPT_LEXER_H

#include "lld/Common/LLVM.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MemoryBufferRef.h"
#include <vector>

namespace lld::elf {

class ScriptLexer {
protected:
struct Buffer {
// The remaining content to parse and the filename.
StringRef s, filename;
const char *begin = nullptr;
Buffer() = default;
Buffer(MemoryBufferRef mb)
: s(mb.getBuffer()), filename(mb.getBufferIdentifier()),
begin(mb.getBufferStart()) {}
};
// The current buffer and parent buffers due to INCLUDE.
Buffer curBuf;
SmallVector<Buffer, 0> buffers;

// The token before the last next().
StringRef prevTok;
// Rules for what is a token are different when we are in an expression.
// curTok holds the cached return value of peek() and is invalid when the
// expression state changes.
StringRef curTok;
// The inExpr state when curTok is cached.
bool curTokState = false;
Collaborator commented:

Was thinking if curTokInExpr might work better given that these are just booleans.

if (curTokInExpr == inExpr) ... 

@MaskRay (Member, Author) replied:

The idea is that I believe we will soon introduce new states, so I avoided inExpr in the name here.

bool eof = false;

public:
explicit ScriptLexer(MemoryBufferRef mb);

void setError(const Twine &msg);
void tokenize(MemoryBufferRef mb);
void lex();
StringRef skipSpace(StringRef s);
bool atEOF();
StringRef next();
@@ -33,15 +58,12 @@ class ScriptLexer {
MemoryBufferRef getCurrentMB();

std::vector<MemoryBufferRef> mbs;
std::vector<StringRef> tokens;
bool inExpr = false;
size_t pos = 0;

size_t lastLineNumber = 0;
size_t lastLineNumberOffset = 0;

private:
void maybeSplitExpr();
StringRef getLine();
size_t getLineNumber();
size_t getColumnNumber();