@@ -36,24 +36,24 @@ using namespace llvm;
36
36
using namespace lld ;
37
37
using namespace lld ::elf;
38
38
39
+ ScriptLexer::ScriptLexer (MemoryBufferRef mb) : curBuf(mb), mbs(1 , mb) {}
40
+
39
41
// Returns a whole line containing the current token.
40
42
StringRef ScriptLexer::getLine () {
41
43
StringRef s = getCurrentMB ().getBuffer ();
42
- StringRef tok = tokens[pos - 1 ];
43
44
44
- size_t pos = s.rfind (' \n ' , tok .data () - s.data ());
45
+ size_t pos = s.rfind (' \n ' , prevTok .data () - s.data ());
45
46
if (pos != StringRef::npos)
46
47
s = s.substr (pos + 1 );
47
48
return s.substr (0 , s.find_first_of (" \r\n " ));
48
49
}
49
50
50
51
// Returns 1-based line number of the current token.
51
52
size_t ScriptLexer::getLineNumber () {
52
- if (pos == 0 )
53
+ if (prevTok. empty () )
53
54
return 1 ;
54
55
StringRef s = getCurrentMB ().getBuffer ();
55
- StringRef tok = tokens[pos - 1 ];
56
- const size_t tokOffset = tok.data () - s.data ();
56
+ const size_t tokOffset = prevTok.data () - s.data ();
57
57
58
58
// For the first token, or when going backwards, start from the beginning of
59
59
// the buffer. If this token is after the previous token, start from the
@@ -76,40 +76,41 @@ size_t ScriptLexer::getLineNumber() {
76
76
77
77
// Returns 0-based column number of the current token.
78
78
size_t ScriptLexer::getColumnNumber () {
79
- StringRef tok = tokens[pos - 1 ];
80
- return tok.data () - getLine ().data ();
79
+ return prevTok.data () - getLine ().data ();
81
80
}
82
81
83
82
std::string ScriptLexer::getCurrentLocation () {
84
83
std::string filename = std::string (getCurrentMB ().getBufferIdentifier ());
85
84
return (filename + " :" + Twine (getLineNumber ())).str ();
86
85
}
87
86
88
- ScriptLexer::ScriptLexer (MemoryBufferRef mb) { tokenize (mb); }
89
-
90
87
// We don't want to record cascading errors. Keep only the first one.
91
88
void ScriptLexer::setError (const Twine &msg) {
92
89
if (errorCount ())
93
90
return ;
94
91
95
92
std::string s = (getCurrentLocation () + " : " + msg).str ();
96
- if (pos )
93
+ if (prevTok. size () )
97
94
s += " \n >>> " + getLine ().str () + " \n >>> " +
98
95
std::string (getColumnNumber (), ' ' ) + " ^" ;
99
96
error (s);
100
97
}
101
98
102
- // Split S into linker script tokens.
103
- void ScriptLexer::tokenize (MemoryBufferRef mb) {
104
- std::vector<StringRef> vec;
105
- mbs.push_back (mb);
106
- StringRef s = mb.getBuffer ();
107
- StringRef begin = s;
108
-
99
+ void ScriptLexer::lex () {
109
100
for (;;) {
101
+ StringRef &s = curBuf.s ;
110
102
s = skipSpace (s);
111
- if (s.empty ())
112
- break ;
103
+ if (s.empty ()) {
104
+ // If this buffer is from an INCLUDE command, switch to the "return
105
+ // value"; otherwise, mark EOF.
106
+ if (buffers.empty ()) {
107
+ eof = true ;
108
+ return ;
109
+ }
110
+ curBuf = buffers.pop_back_val ();
111
+ continue ;
112
+ }
113
+ curTokState = inExpr;
113
114
114
115
// Quoted token. Note that double-quote characters are parts of a token
115
116
// because, in a glob match context, only unquoted tokens are interpreted
@@ -118,45 +119,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) {
118
119
if (s.starts_with (" \" " )) {
119
120
size_t e = s.find (" \" " , 1 );
120
121
if (e == StringRef::npos) {
121
- StringRef filename = mb. getBufferIdentifier ();
122
- size_t lineno = begin. substr ( 0 , s.data () - begin. data () ).count (' \n ' );
123
- error (filename + " :" + Twine (lineno + 1 ) + " : unclosed quote" );
122
+ size_t lineno =
123
+ StringRef (curBuf. begin , s.data () - curBuf. begin ).count (' \n ' );
124
+ error (curBuf. filename + " :" + Twine (lineno + 1 ) + " : unclosed quote" );
124
125
return ;
125
126
}
126
127
127
- vec. push_back ( s.take_front (e + 1 ) );
128
+ curTok = s.take_front (e + 1 );
128
129
s = s.substr (e + 1 );
129
- continue ;
130
+ return ;
130
131
}
131
132
132
133
// Some operators form separate tokens.
133
134
if (s.starts_with (" <<=" ) || s.starts_with (" >>=" )) {
134
- vec. push_back ( s.substr (0 , 3 ) );
135
+ curTok = s.substr (0 , 3 );
135
136
s = s.substr (3 );
136
- continue ;
137
+ return ;
137
138
}
138
- if (s.size () > 1 && ((s[1 ] == ' =' && strchr (" */+-<>&^|" , s[0 ])) ||
139
- (s[0 ] == s[1 ] && strchr (" <>&|" , s[0 ])))) {
140
- vec.push_back (s.substr (0 , 2 ));
139
+ if (s.size () > 1 && (s[1 ] == ' =' && strchr (" +-*/!&^|" , s[0 ]))) {
140
+ curTok = s.substr (0 , 2 );
141
141
s = s.substr (2 );
142
- continue ;
142
+ return ;
143
143
}
144
144
145
- // Unquoted token. This is more relaxed than tokens in C-like language,
146
- // so that you can write "file-name.cpp" as one bare token, for example.
147
- size_t pos = s.find_first_not_of (
148
- " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
149
- " 0123456789_.$/\\ ~=+[]*?-!^:" );
145
+ // Unquoted token. The non-expression token is more relaxed than tokens in
146
+ // C-like languages, so that you can write "file-name.cpp" as one bare
147
+ // token.
148
+ size_t pos;
149
+ if (inExpr) {
150
+ pos = s.find_first_not_of (
151
+ " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
152
+ " 0123456789_.$" );
153
+ if (pos == 0 && s.size () >= 2 &&
154
+ ((s[0 ] == s[1 ] && strchr (" <>&|" , s[0 ])) ||
155
+ is_contained ({" ==" , " !=" , " <=" , " >=" , " <<" , " >>" }, s.substr (0 , 2 ))))
156
+ pos = 2 ;
157
+ } else {
158
+ pos = s.find_first_not_of (
159
+ " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
160
+ " 0123456789_.$/\\ ~=+[]*?-!^:" );
161
+ }
150
162
151
- // A character that cannot start a word (which is usually a
152
- // punctuation) forms a single character token.
153
163
if (pos == 0 )
154
164
pos = 1 ;
155
- vec. push_back ( s.substr (0 , pos) );
165
+ curTok = s.substr (0 , pos);
156
166
s = s.substr (pos);
167
+ break ;
157
168
}
158
-
159
- tokens.insert (tokens.begin () + pos, vec.begin (), vec.end ());
160
169
}
161
170
162
171
// Skip leading whitespace characters or comments.
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
185
194
}
186
195
}
187
196
188
- // An erroneous token is handled as if it were the last token before EOF.
189
- bool ScriptLexer::atEOF () { return errorCount () || tokens.size () == pos; }
190
-
191
- // Split a given string as an expression.
192
- // This function returns "3", "*" and "5" for "3*5" for example.
193
- static std::vector<StringRef> tokenizeExpr (StringRef s) {
194
- StringRef ops = " !~*/+-<>?^:=" ; // List of operators
195
-
196
- // Quoted strings are literal strings, so we don't want to split it.
197
- if (s.starts_with (" \" " ))
198
- return {s};
199
-
200
- // Split S with operators as separators.
201
- std::vector<StringRef> ret;
202
- while (!s.empty ()) {
203
- size_t e = s.find_first_of (ops);
204
-
205
- // No need to split if there is no operator.
206
- if (e == StringRef::npos) {
207
- ret.push_back (s);
208
- break ;
209
- }
210
-
211
- // Get a token before the operator.
212
- if (e != 0 )
213
- ret.push_back (s.substr (0 , e));
214
-
215
- // Get the operator as a token.
216
- // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
217
- if (s.substr (e).starts_with (" !=" ) || s.substr (e).starts_with (" ==" ) ||
218
- s.substr (e).starts_with (" >=" ) || s.substr (e).starts_with (" <=" ) ||
219
- s.substr (e).starts_with (" <<" ) || s.substr (e).starts_with (" >>" )) {
220
- ret.push_back (s.substr (e, 2 ));
221
- s = s.substr (e + 2 );
222
- } else {
223
- ret.push_back (s.substr (e, 1 ));
224
- s = s.substr (e + 1 );
225
- }
226
- }
227
- return ret;
228
- }
229
-
230
- // In contexts where expressions are expected, the lexer should apply
231
- // different tokenization rules than the default one. By default,
232
- // arithmetic operator characters are regular characters, but in the
233
- // expression context, they should be independent tokens.
234
- //
235
- // For example, "foo*3" should be tokenized to "foo", "*" and "3" only
236
- // in the expression context.
237
- //
238
- // This function may split the current token into multiple tokens.
239
- void ScriptLexer::maybeSplitExpr () {
240
- if (!inExpr || errorCount () || atEOF ())
241
- return ;
242
-
243
- std::vector<StringRef> v = tokenizeExpr (tokens[pos]);
244
- if (v.size () == 1 )
245
- return ;
246
- tokens.erase (tokens.begin () + pos);
247
- tokens.insert (tokens.begin () + pos, v.begin (), v.end ());
248
- }
197
+ // Used to determine whether to stop parsing. Treat errors like EOF.
198
+ bool ScriptLexer::atEOF () { return eof || errorCount (); }
249
199
250
200
StringRef ScriptLexer::next () {
251
- maybeSplitExpr ();
252
-
253
- if (errorCount ())
254
- return " " ;
255
- if (atEOF ()) {
256
- setError (" unexpected EOF" );
257
- return " " ;
258
- }
259
- return tokens[pos++];
201
+ prevTok = peek ();
202
+ return std::exchange (curTok, StringRef (curBuf.s .data (), 0 ));
260
203
}
261
204
262
205
StringRef ScriptLexer::peek () {
263
- StringRef tok = next ();
264
- if (errorCount ())
265
- return " " ;
266
- pos = pos - 1 ;
267
- return tok;
206
+ // curTok is invalid if curTokState and inExpr mismatch.
207
+ if (curTok.size () && curTokState != inExpr) {
208
+ curBuf.s = StringRef (curTok.data (), curBuf.s .end () - curTok.data ());
209
+ curTok = {};
210
+ }
211
+ if (curTok.empty ())
212
+ lex ();
213
+ return curTok;
268
214
}
269
215
270
216
bool ScriptLexer::consume (StringRef tok) {
271
- if (next () = = tok)
272
- return true ;
273
- --pos ;
274
- return false ;
217
+ if (peek () ! = tok)
218
+ return false ;
219
+ next () ;
220
+ return true ;
275
221
}
276
222
277
223
void ScriptLexer::skip () { (void )next (); }
@@ -280,8 +226,12 @@ void ScriptLexer::expect(StringRef expect) {
280
226
if (errorCount ())
281
227
return ;
282
228
StringRef tok = next ();
283
- if (tok != expect)
284
- setError (expect + " expected, but got " + tok);
229
+ if (tok != expect) {
230
+ if (atEOF ())
231
+ setError (" unexpected EOF" );
232
+ else
233
+ setError (expect + " expected, but got " + tok);
234
+ }
285
235
}
286
236
287
237
// Returns true if S encloses T.
@@ -292,10 +242,8 @@ static bool encloses(StringRef s, StringRef t) {
292
242
MemoryBufferRef ScriptLexer::getCurrentMB () {
293
243
// Find input buffer containing the current token.
294
244
assert (!mbs.empty ());
295
- if (pos == 0 )
296
- return mbs.back ();
297
245
for (MemoryBufferRef mb : mbs)
298
- if (encloses (mb.getBuffer (), tokens[pos - 1 ] ))
246
+ if (encloses (mb.getBuffer (), curBuf. s ))
299
247
return mb;
300
248
llvm_unreachable (" getCurrentMB: failed to find a token" );
301
249
}
0 commit comments