Skip to content

Commit 1978c21

Browse files
authored
[ELF] ScriptLexer: generate tokens lazily
The current tokenize-whole-file approach has a few limitations.

* Lack of state information: `maybeSplitExpr` is needed to parse expressions. It's infeasible to add new states to behave more like GNU ld.
* `readInclude` may insert tokens in the middle, leading to a time complexity issue with N-nested `INCLUDE`.
* Line/column information for diagnostics is inaccurate, especially after an `INCLUDE`.
* `getLineNumber` cannot be made more efficient without significant code complexity and memory consumption. https://reviews.llvm.org/D104137

The patch switches to a traditional lexer that generates tokens lazily.

* `atEOF` behavior is modified: we need to call `peek` to determine EOF.
* `peek` and `next` cannot call `setError` upon `atEOF`.
* Since `consume` no longer reports an error upon `atEOF`, the idiom `while (!errorCount() && !consume(")"))` would cause an infinite loop. Use `while (peek() != ")" && !atEOF()) { ... } expect(")")` instead.
* An include stack is introduced to handle `readInclude`. This can be utilized to address #93947 properly.
* `tokens` and `pos` are removed.
* `commandString` is reimplemented. Since it is used in -Map output, `\n` needs to be replaced with space.

Pull Request: #100493
1 parent b33ef5b commit 1978c21

File tree

7 files changed

+163
-170
lines changed

7 files changed

+163
-170
lines changed

lld/ELF/ScriptLexer.cpp

Lines changed: 72 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -36,24 +36,24 @@ using namespace llvm;
3636
using namespace lld;
3737
using namespace lld::elf;
3838

39+
ScriptLexer::ScriptLexer(MemoryBufferRef mb) : curBuf(mb), mbs(1, mb) {}
40+
3941
// Returns a whole line containing the current token.
4042
StringRef ScriptLexer::getLine() {
4143
StringRef s = getCurrentMB().getBuffer();
42-
StringRef tok = tokens[pos - 1];
4344

44-
size_t pos = s.rfind('\n', tok.data() - s.data());
45+
size_t pos = s.rfind('\n', prevTok.data() - s.data());
4546
if (pos != StringRef::npos)
4647
s = s.substr(pos + 1);
4748
return s.substr(0, s.find_first_of("\r\n"));
4849
}
4950

5051
// Returns 1-based line number of the current token.
5152
size_t ScriptLexer::getLineNumber() {
52-
if (pos == 0)
53+
if (prevTok.empty())
5354
return 1;
5455
StringRef s = getCurrentMB().getBuffer();
55-
StringRef tok = tokens[pos - 1];
56-
const size_t tokOffset = tok.data() - s.data();
56+
const size_t tokOffset = prevTok.data() - s.data();
5757

5858
// For the first token, or when going backwards, start from the beginning of
5959
// the buffer. If this token is after the previous token, start from the
@@ -76,40 +76,41 @@ size_t ScriptLexer::getLineNumber() {
7676

7777
// Returns 0-based column number of the current token.
7878
size_t ScriptLexer::getColumnNumber() {
79-
StringRef tok = tokens[pos - 1];
80-
return tok.data() - getLine().data();
79+
return prevTok.data() - getLine().data();
8180
}
8281

8382
std::string ScriptLexer::getCurrentLocation() {
8483
std::string filename = std::string(getCurrentMB().getBufferIdentifier());
8584
return (filename + ":" + Twine(getLineNumber())).str();
8685
}
8786

88-
ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
89-
9087
// We don't want to record cascading errors. Keep only the first one.
9188
void ScriptLexer::setError(const Twine &msg) {
9289
if (errorCount())
9390
return;
9491

9592
std::string s = (getCurrentLocation() + ": " + msg).str();
96-
if (pos)
93+
if (prevTok.size())
9794
s += "\n>>> " + getLine().str() + "\n>>> " +
9895
std::string(getColumnNumber(), ' ') + "^";
9996
error(s);
10097
}
10198

102-
// Split S into linker script tokens.
103-
void ScriptLexer::tokenize(MemoryBufferRef mb) {
104-
std::vector<StringRef> vec;
105-
mbs.push_back(mb);
106-
StringRef s = mb.getBuffer();
107-
StringRef begin = s;
108-
99+
void ScriptLexer::lex() {
109100
for (;;) {
101+
StringRef &s = curBuf.s;
110102
s = skipSpace(s);
111-
if (s.empty())
112-
break;
103+
if (s.empty()) {
104+
// If this buffer is from an INCLUDE command, switch to the "return
105+
// value"; otherwise, mark EOF.
106+
if (buffers.empty()) {
107+
eof = true;
108+
return;
109+
}
110+
curBuf = buffers.pop_back_val();
111+
continue;
112+
}
113+
curTokState = inExpr;
113114

114115
// Quoted token. Note that double-quote characters are parts of a token
115116
// because, in a glob match context, only unquoted tokens are interpreted
@@ -118,45 +119,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) {
118119
if (s.starts_with("\"")) {
119120
size_t e = s.find("\"", 1);
120121
if (e == StringRef::npos) {
121-
StringRef filename = mb.getBufferIdentifier();
122-
size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
123-
error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
122+
size_t lineno =
123+
StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n');
124+
error(curBuf.filename + ":" + Twine(lineno + 1) + ": unclosed quote");
124125
return;
125126
}
126127

127-
vec.push_back(s.take_front(e + 1));
128+
curTok = s.take_front(e + 1);
128129
s = s.substr(e + 1);
129-
continue;
130+
return;
130131
}
131132

132133
// Some operators form separate tokens.
133134
if (s.starts_with("<<=") || s.starts_with(">>=")) {
134-
vec.push_back(s.substr(0, 3));
135+
curTok = s.substr(0, 3);
135136
s = s.substr(3);
136-
continue;
137+
return;
137138
}
138-
if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
139-
(s[0] == s[1] && strchr("<>&|", s[0])))) {
140-
vec.push_back(s.substr(0, 2));
139+
if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) {
140+
curTok = s.substr(0, 2);
141141
s = s.substr(2);
142-
continue;
142+
return;
143143
}
144144

145-
// Unquoted token. This is more relaxed than tokens in C-like language,
146-
// so that you can write "file-name.cpp" as one bare token, for example.
147-
size_t pos = s.find_first_not_of(
148-
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
149-
"0123456789_.$/\\~=+[]*?-!^:");
145+
// Unquoted token. The non-expression token is more relaxed than tokens in
146+
// C-like languages, so that you can write "file-name.cpp" as one bare
147+
// token.
148+
size_t pos;
149+
if (inExpr) {
150+
pos = s.find_first_not_of(
151+
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
152+
"0123456789_.$");
153+
if (pos == 0 && s.size() >= 2 &&
154+
((s[0] == s[1] && strchr("<>&|", s[0])) ||
155+
is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2))))
156+
pos = 2;
157+
} else {
158+
pos = s.find_first_not_of(
159+
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
160+
"0123456789_.$/\\~=+[]*?-!^:");
161+
}
150162

151-
// A character that cannot start a word (which is usually a
152-
// punctuation) forms a single character token.
153163
if (pos == 0)
154164
pos = 1;
155-
vec.push_back(s.substr(0, pos));
165+
curTok = s.substr(0, pos);
156166
s = s.substr(pos);
167+
break;
157168
}
158-
159-
tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
160169
}
161170

162171
// Skip leading whitespace characters or comments.
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
185194
}
186195
}
187196

188-
// An erroneous token is handled as if it were the last token before EOF.
189-
bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
190-
191-
// Split a given string as an expression.
192-
// This function returns "3", "*" and "5" for "3*5" for example.
193-
static std::vector<StringRef> tokenizeExpr(StringRef s) {
194-
StringRef ops = "!~*/+-<>?^:="; // List of operators
195-
196-
// Quoted strings are literal strings, so we don't want to split it.
197-
if (s.starts_with("\""))
198-
return {s};
199-
200-
// Split S with operators as separators.
201-
std::vector<StringRef> ret;
202-
while (!s.empty()) {
203-
size_t e = s.find_first_of(ops);
204-
205-
// No need to split if there is no operator.
206-
if (e == StringRef::npos) {
207-
ret.push_back(s);
208-
break;
209-
}
210-
211-
// Get a token before the operator.
212-
if (e != 0)
213-
ret.push_back(s.substr(0, e));
214-
215-
// Get the operator as a token.
216-
// Keep !=, ==, >=, <=, << and >> operators as a single tokens.
217-
if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
218-
s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
219-
s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
220-
ret.push_back(s.substr(e, 2));
221-
s = s.substr(e + 2);
222-
} else {
223-
ret.push_back(s.substr(e, 1));
224-
s = s.substr(e + 1);
225-
}
226-
}
227-
return ret;
228-
}
229-
230-
// In contexts where expressions are expected, the lexer should apply
231-
// different tokenization rules than the default one. By default,
232-
// arithmetic operator characters are regular characters, but in the
233-
// expression context, they should be independent tokens.
234-
//
235-
// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
236-
// in the expression context.
237-
//
238-
// This function may split the current token into multiple tokens.
239-
void ScriptLexer::maybeSplitExpr() {
240-
if (!inExpr || errorCount() || atEOF())
241-
return;
242-
243-
std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
244-
if (v.size() == 1)
245-
return;
246-
tokens.erase(tokens.begin() + pos);
247-
tokens.insert(tokens.begin() + pos, v.begin(), v.end());
248-
}
197+
// Used to determine whether to stop parsing. Treat errors like EOF.
198+
bool ScriptLexer::atEOF() { return eof || errorCount(); }
249199

250200
StringRef ScriptLexer::next() {
251-
maybeSplitExpr();
252-
253-
if (errorCount())
254-
return "";
255-
if (atEOF()) {
256-
setError("unexpected EOF");
257-
return "";
258-
}
259-
return tokens[pos++];
201+
prevTok = peek();
202+
return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
260203
}
261204

262205
StringRef ScriptLexer::peek() {
263-
StringRef tok = next();
264-
if (errorCount())
265-
return "";
266-
pos = pos - 1;
267-
return tok;
206+
// curTok is invalid if curTokState and inExpr mismatch.
207+
if (curTok.size() && curTokState != inExpr) {
208+
curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data());
209+
curTok = {};
210+
}
211+
if (curTok.empty())
212+
lex();
213+
return curTok;
268214
}
269215

270216
bool ScriptLexer::consume(StringRef tok) {
271-
if (next() == tok)
272-
return true;
273-
--pos;
274-
return false;
217+
if (peek() != tok)
218+
return false;
219+
next();
220+
return true;
275221
}
276222

277223
void ScriptLexer::skip() { (void)next(); }
@@ -280,8 +226,12 @@ void ScriptLexer::expect(StringRef expect) {
280226
if (errorCount())
281227
return;
282228
StringRef tok = next();
283-
if (tok != expect)
284-
setError(expect + " expected, but got " + tok);
229+
if (tok != expect) {
230+
if (atEOF())
231+
setError("unexpected EOF");
232+
else
233+
setError(expect + " expected, but got " + tok);
234+
}
285235
}
286236

287237
// Returns true if S encloses T.
@@ -292,10 +242,8 @@ static bool encloses(StringRef s, StringRef t) {
292242
MemoryBufferRef ScriptLexer::getCurrentMB() {
293243
// Find input buffer containing the current token.
294244
assert(!mbs.empty());
295-
if (pos == 0)
296-
return mbs.back();
297245
for (MemoryBufferRef mb : mbs)
298-
if (encloses(mb.getBuffer(), tokens[pos - 1]))
246+
if (encloses(mb.getBuffer(), curBuf.s))
299247
return mb;
300248
llvm_unreachable("getCurrentMB: failed to find a token");
301249
}

lld/ELF/ScriptLexer.h

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,43 @@
1010
#define LLD_ELF_SCRIPT_LEXER_H
1111

1212
#include "lld/Common/LLVM.h"
13+
#include "llvm/ADT/SmallVector.h"
1314
#include "llvm/ADT/StringRef.h"
1415
#include "llvm/Support/MemoryBufferRef.h"
1516
#include <vector>
1617

1718
namespace lld::elf {
1819

1920
class ScriptLexer {
21+
protected:
22+
struct Buffer {
23+
// The remaining content to parse and the filename.
24+
StringRef s, filename;
25+
const char *begin = nullptr;
26+
Buffer() = default;
27+
Buffer(MemoryBufferRef mb)
28+
: s(mb.getBuffer()), filename(mb.getBufferIdentifier()),
29+
begin(mb.getBufferStart()) {}
30+
};
31+
// The current buffer and parent buffers due to INCLUDE.
32+
Buffer curBuf;
33+
SmallVector<Buffer, 0> buffers;
34+
35+
// The token before the last next().
36+
StringRef prevTok;
37+
// Rules for what is a token are different when we are in an expression.
38+
// curTok holds the cached return value of peek() and is invalid when the
39+
// expression state changes.
40+
StringRef curTok;
41+
// The inExpr state when curTok is cached.
42+
bool curTokState = false;
43+
bool eof = false;
44+
2045
public:
2146
explicit ScriptLexer(MemoryBufferRef mb);
2247

2348
void setError(const Twine &msg);
24-
void tokenize(MemoryBufferRef mb);
49+
void lex();
2550
StringRef skipSpace(StringRef s);
2651
bool atEOF();
2752
StringRef next();
@@ -33,15 +58,12 @@ class ScriptLexer {
3358
MemoryBufferRef getCurrentMB();
3459

3560
std::vector<MemoryBufferRef> mbs;
36-
std::vector<StringRef> tokens;
3761
bool inExpr = false;
38-
size_t pos = 0;
3962

4063
size_t lastLineNumber = 0;
4164
size_t lastLineNumberOffset = 0;
4265

4366
private:
44-
void maybeSplitExpr();
4567
StringRef getLine();
4668
size_t getLineNumber();
4769
size_t getColumnNumber();

0 commit comments

Comments
 (0)