[ELF] ScriptLexer: generate tokens lazily #100493

Merged
196 changes: 72 additions & 124 deletions lld/ELF/ScriptLexer.cpp
@@ -36,24 +36,24 @@ using namespace llvm;
using namespace lld;
using namespace lld::elf;

ScriptLexer::ScriptLexer(MemoryBufferRef mb) : curBuf(mb), mbs(1, mb) {}

// Returns a whole line containing the current token.
StringRef ScriptLexer::getLine() {
StringRef s = getCurrentMB().getBuffer();
StringRef tok = tokens[pos - 1];

size_t pos = s.rfind('\n', tok.data() - s.data());
size_t pos = s.rfind('\n', prevTok.data() - s.data());
if (pos != StringRef::npos)
s = s.substr(pos + 1);
return s.substr(0, s.find_first_of("\r\n"));
}

// Returns 1-based line number of the current token.
size_t ScriptLexer::getLineNumber() {
if (pos == 0)
if (prevTok.empty())
return 1;
StringRef s = getCurrentMB().getBuffer();
StringRef tok = tokens[pos - 1];
const size_t tokOffset = tok.data() - s.data();
const size_t tokOffset = prevTok.data() - s.data();

// For the first token, or when going backwards, start from the beginning of
// the buffer. If this token is after the previous token, start from the
@@ -76,40 +76,41 @@ size_t ScriptLexer::getLineNumber() {

// Returns 0-based column number of the current token.
size_t ScriptLexer::getColumnNumber() {
StringRef tok = tokens[pos - 1];
return tok.data() - getLine().data();
return prevTok.data() - getLine().data();
}

std::string ScriptLexer::getCurrentLocation() {
std::string filename = std::string(getCurrentMB().getBufferIdentifier());
return (filename + ":" + Twine(getLineNumber())).str();
}

ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }

// We don't want to record cascading errors. Keep only the first one.
void ScriptLexer::setError(const Twine &msg) {
if (errorCount())
return;

std::string s = (getCurrentLocation() + ": " + msg).str();
if (pos)
if (prevTok.size())
s += "\n>>> " + getLine().str() + "\n>>> " +
std::string(getColumnNumber(), ' ') + "^";
error(s);
}

// Split S into linker script tokens.
void ScriptLexer::tokenize(MemoryBufferRef mb) {
std::vector<StringRef> vec;
mbs.push_back(mb);
StringRef s = mb.getBuffer();
StringRef begin = s;

void ScriptLexer::lex() {
for (;;) {
StringRef &s = curBuf.s;
s = skipSpace(s);
if (s.empty())
break;
if (s.empty()) {
// If this buffer is from an INCLUDE command, switch to the "return
// value"; otherwise, mark EOF.
if (buffers.empty()) {
eof = true;
return;
}
curBuf = buffers.pop_back_val();
continue;
}
curTokState = inExpr;

// Quoted token. Note that double-quote characters are parts of a token
// because, in a glob match context, only unquoted tokens are interpreted
@@ -118,45 +119,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) {
if (s.starts_with("\"")) {
size_t e = s.find("\"", 1);
if (e == StringRef::npos) {
StringRef filename = mb.getBufferIdentifier();
size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
size_t lineno =
StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n');
error(curBuf.filename + ":" + Twine(lineno + 1) + ": unclosed quote");
return;
}

vec.push_back(s.take_front(e + 1));
curTok = s.take_front(e + 1);
s = s.substr(e + 1);
continue;
return;
}

// Some operators form separate tokens.
if (s.starts_with("<<=") || s.starts_with(">>=")) {
vec.push_back(s.substr(0, 3));
curTok = s.substr(0, 3);
s = s.substr(3);
continue;
return;
}
if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
(s[0] == s[1] && strchr("<>&|", s[0])))) {
vec.push_back(s.substr(0, 2));
if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) {
curTok = s.substr(0, 2);
s = s.substr(2);
continue;
return;
}

// Unquoted token. This is more relaxed than tokens in C-like language,
// so that you can write "file-name.cpp" as one bare token, for example.
size_t pos = s.find_first_not_of(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$/\\~=+[]*?-!^:");
// Unquoted token. The non-expression token is more relaxed than tokens in
// C-like languages, so that you can write "file-name.cpp" as one bare
// token.
size_t pos;
if (inExpr) {
pos = s.find_first_not_of(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$");
if (pos == 0 && s.size() >= 2 &&
((s[0] == s[1] && strchr("<>&|", s[0])) ||
is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2))))
pos = 2;
} else {
pos = s.find_first_not_of(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$/\\~=+[]*?-!^:");
}

// A character that cannot start a word (which is usually a
// punctuation) forms a single character token.
if (pos == 0)
pos = 1;
vec.push_back(s.substr(0, pos));
curTok = s.substr(0, pos);
s = s.substr(pos);
break;
}

tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
}

// Skip leading whitespace characters or comments.
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
}
}

// An erroneous token is handled as if it were the last token before EOF.
bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }

// Split a given string as an expression.
// This function returns "3", "*" and "5" for "3*5" for example.
static std::vector<StringRef> tokenizeExpr(StringRef s) {
StringRef ops = "!~*/+-<>?^:="; // List of operators

// Quoted strings are literal strings, so we don't want to split it.
if (s.starts_with("\""))
return {s};

// Split S with operators as separators.
std::vector<StringRef> ret;
while (!s.empty()) {
size_t e = s.find_first_of(ops);

// No need to split if there is no operator.
if (e == StringRef::npos) {
ret.push_back(s);
break;
}

// Get a token before the operator.
if (e != 0)
ret.push_back(s.substr(0, e));

// Get the operator as a token.
// Keep !=, ==, >=, <=, << and >> operators as a single tokens.
if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
ret.push_back(s.substr(e, 2));
s = s.substr(e + 2);
} else {
ret.push_back(s.substr(e, 1));
s = s.substr(e + 1);
}
}
return ret;
}

// In contexts where expressions are expected, the lexer should apply
// different tokenization rules than the default one. By default,
// arithmetic operator characters are regular characters, but in the
// expression context, they should be independent tokens.
//
// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
// in the expression context.
//
// This function may split the current token into multiple tokens.
void ScriptLexer::maybeSplitExpr() {
if (!inExpr || errorCount() || atEOF())
return;

std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
if (v.size() == 1)
return;
tokens.erase(tokens.begin() + pos);
tokens.insert(tokens.begin() + pos, v.begin(), v.end());
}
// Used to determine whether to stop parsing. Treat errors like EOF.
bool ScriptLexer::atEOF() { return eof || errorCount(); }

StringRef ScriptLexer::next() {
maybeSplitExpr();

if (errorCount())
return "";
if (atEOF()) {
setError("unexpected EOF");
return "";
}
return tokens[pos++];
prevTok = peek();
return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
}

StringRef ScriptLexer::peek() {
@mysterymath (Contributor) commented on Jul 29, 2024:

Apologies; didn't get a chance to ask this before it went out.

The lexers I've typically seen preemptively read in the first token; peek() is a no-op that returns the previously read token, while next() is what reads the next token. By contrast, this patch has an additional bit of state in the lexer: whether or not a token has been read in yet. That creates the issue with atEOF: unless peek() has been called at least once, it isn't yet known.

Could the LLD lexer be written that way, or is there a concern that I'm missing? It seems like it would have made less of an upset to the rest of the parser, and it seems simpler to reason about, having less state. (Somewhat subjective, for sure.)

@MaskRay (Member, Author) commented on Jul 30, 2024:

The lexer is indeed different from traditional implementations. The reason is to support states (currently just inExpr or not).
peek() caches the token lexed from curBuf. That cached token might not be usable if the state changes, so peek() has to check curTokState and is prepared to drop the cache when the parser has changed the state.

If next() were made to read the next token eagerly, that token might get stale as well when the state changes.

I think atEOF() is still like traditional parsers, which require a peek to know whether EOF has been encountered.

Some setError calls were added in the current way to minimize the impact on the test suite. The setError uses could be changed, but many tests would require updates.
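
To make the cache-plus-state idea concrete, here is a minimal standalone sketch. It is not the lld code: TinyLexer and its token rules are invented for the example, and only the shape of peek()/next() mirrors the patch. peek() remembers the state the cached token was lexed under and re-lexes after the parser flips inExpr.

```cpp
#include <cassert>
#include <cctype>
#include <string_view>

// Illustrative only: one cached lookahead token plus the state it was lexed in.
struct TinyLexer {
  std::string_view buf;     // remaining input
  std::string_view curTok;  // cached lookahead token
  bool inExpr = false;      // state the parser may toggle at any time
  bool curTokState = false; // state under which curTok was lexed

  std::string_view peek() {
    if (!curTok.empty() && curTokState != inExpr) {
      // The cache was produced under a different state: rewind and re-lex.
      const char *end = buf.data() + buf.size();
      buf = std::string_view(curTok.data(), end - curTok.data());
      curTok = {};
    }
    if (curTok.empty())
      lexOne();
    return curTok;
  }

  std::string_view next() {
    std::string_view tok = peek();
    curTok = {};
    return tok;
  }

private:
  void lexOne() {
    curTokState = inExpr;
    while (!buf.empty() && buf.front() == ' ')
      buf.remove_prefix(1);
    if (buf.empty()) {
      curTok = {};
      return;
    }
    // Toy rule: in expression state '-' is an operator and ends a bare token;
    // outside expressions it may be part of a name like "file-name".
    auto isWord = [&](char c) {
      return std::isalnum(static_cast<unsigned char>(c)) || (!inExpr && c == '-');
    };
    size_t n = 1;
    if (isWord(buf[0]))
      while (n < buf.size() && isWord(buf[n]))
        ++n;
    curTok = buf.substr(0, n);
    buf.remove_prefix(n);
  }
};

int main() {
  TinyLexer lx{"file-name"};
  assert(lx.peek() == "file-name"); // lexed with inExpr == false
  lx.inExpr = true;                 // parser enters an expression context
  assert(lx.peek() == "file");      // stale cache is dropped and re-lexed
}
```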

@mysterymath (Contributor) replied:

> The lexer is indeed different from traditional implementations. The reason is to support states (currently just inExpr or not). peek() caches the token lexed from curBuf. That cached token might not be usable if the state changes, so peek() has to check curTokState and is prepared to drop the cache when the parser has changed the state.
>
> If next() were made to read the next token eagerly, that token might get stale as well when the state changes.

It seems like this could be addressed by having state changes do a little bit more work: discarding the current token and re-lexing a token in the new state.

> I think atEOF() is still like traditional parsers, which require a peek to know whether EOF has been encountered.

This is surprising to me; I had thought that most either maintained a valid atEOF as an invariant, or even called it out as a designated token type. The latter is what I've more typically seen.

@MaskRay (Member, Author) replied:

> It seems like this could be addressed by having state changes do a little bit more work: discarding the current token and re-lexing a token in the new state.

This is exactly what the current peek() does.

Are you suggesting that next() should consume the current token and peek the next one?
Since we don't always call peek() before next(), this would require next() to call peek twice, which would be awkward.


Currently, when peek() or next() returns the empty string, that indicates an EOF token.

@mysterymath (Contributor) commented on Jul 31, 2024:

> Are you suggesting that next() should consume the current token and peek the next one? Since we don't always call peek() before next(), this would require next() to call peek twice, which would be awkward.

Not exactly; I'd expect peek() to make no changes to internal state. next() would lex a token and set it as the current one, update atEOF, and return empty string if atEOF. It might even be possible to remove atEOF by representing this as an empty curTok.

I haven't actually tried to put together a patch for this idea yet, so it may well fall apart in the details. I've also admittedly not done a deep swim through the lexer; mostly just wanted to gauge if this kind of approach is desirable before investing more time.
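
For contrast, here is a minimal standalone sketch of the eager design proposed above. It is illustrative only, not a patch: EagerLexer is an invented name, it lexes whitespace-separated tokens, and it sidesteps the inExpr problem discussed in this thread precisely because it has a single tokenization rule. The point it shows is that with one pre-read token, peek() never mutates state and atEOF() is a plain flag read.

```cpp
#include <cassert>
#include <string_view>

struct EagerLexer {
  explicit EagerLexer(std::string_view b) : buf(b) { advance(); }

  std::string_view peek() const { return curTok; } // never mutates state
  bool atEOF() const { return eof; }               // always valid

  std::string_view next() {
    std::string_view tok = curTok;
    advance(); // eagerly lex the following token
    return tok;
  }

private:
  void advance() {
    while (!buf.empty() && buf.front() == ' ')
      buf.remove_prefix(1);
    if (buf.empty()) {
      curTok = {};
      eof = true;
      return;
    }
    size_t n = 1;
    while (n < buf.size() && buf[n] != ' ')
      ++n;
    curTok = buf.substr(0, n);
    buf.remove_prefix(n);
  }

  std::string_view buf;     // remaining input
  std::string_view curTok;  // the one pre-read token
  bool eof = false;
};

int main() {
  EagerLexer lx("SECTIONS {");
  assert(!lx.atEOF());             // known without any peek()
  assert(lx.next() == "SECTIONS");
  assert(lx.next() == "{");
  assert(lx.atEOF());
}
```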

@MaskRay (Member, Author) replied:

I think I get what you meant, and I've tried such a solution. Unfortunately, at least with the current uses, peek() { return curTok; } isn't feasible. Several places have intricate state transitions before peek().

The most "strange" places:

  • readSectionAddressType: inExpr is changed just before peek()
  • peek() == "=" || peek().starts_with("=") in readOutputSectionDescription
  • ...

If we want to make peek() return curTok, we have to change some consume and expect calls to switch the state, and there might be some places where that is infeasible. I believe the current expect("("); change state; peek() approach is more readable.
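
A tiny, hypothetical illustration of the expect("("); change state; peek() pattern referred to above. StubLexer and its canned tokens are invented for this sketch; the real state changes and lookahead live in the linker-script parser. The parser flips inExpr before calling peek(), and peek() answers under the new tokenization rules (faked here with a hard-coded result).

```cpp
#include <cassert>
#include <string_view>

struct StubLexer {
  bool inExpr = false;
  // Pretend the remaining input is "foo*3". Outside an expression that is one
  // bare token; inside an expression '*' is an operator, so the token is "foo".
  std::string_view peek() const { return inExpr ? "foo" : "foo*3"; }
};

// Parser fragment in the spirit of readSectionAddressType: switch the state
// first, then look ahead under expression rules.
static bool startsWithExprWord(StubLexer &lex) {
  lex.inExpr = true;          // change state before peeking
  return lex.peek() == "foo"; // lookahead already follows expression rules
}

int main() {
  StubLexer lex;
  assert(lex.peek() == "foo*3"); // non-expression rules
  assert(startsWithExprWord(lex));
}
```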

StringRef tok = next();
if (errorCount())
return "";
pos = pos - 1;
return tok;
// curTok is invalid if curTokState and inExpr mismatch.
Collaborator commented:

Perhaps worth explaining why. I think something like:
// Rules for what is a token are different when we are in an expression.
// The cached curTok is invalid when the expression state changes.

@MaskRay (Member, Author) replied:

Thanks for the suggestion!

if (curTok.size() && curTokState != inExpr) {
curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data());
curTok = {};
}
if (curTok.empty())
lex();
return curTok;
}

bool ScriptLexer::consume(StringRef tok) {
if (next() == tok)
return true;
--pos;
return false;
if (peek() != tok)
return false;
next();
return true;
}

void ScriptLexer::skip() { (void)next(); }
@@ -280,8 +226,12 @@ void ScriptLexer::expect(StringRef expect) {
if (errorCount())
return;
StringRef tok = next();
if (tok != expect)
setError(expect + " expected, but got " + tok);
if (tok != expect) {
if (atEOF())
setError("unexpected EOF");
else
setError(expect + " expected, but got " + tok);
}
}

// Returns true if S encloses T.
@@ -292,10 +242,8 @@ static bool encloses(StringRef s, StringRef t) {
MemoryBufferRef ScriptLexer::getCurrentMB() {
// Find input buffer containing the current token.
assert(!mbs.empty());
if (pos == 0)
return mbs.back();
for (MemoryBufferRef mb : mbs)
if (encloses(mb.getBuffer(), tokens[pos - 1]))
if (encloses(mb.getBuffer(), curBuf.s))
return mb;
llvm_unreachable("getCurrentMB: failed to find a token");
}
30 changes: 26 additions & 4 deletions lld/ELF/ScriptLexer.h
@@ -10,18 +10,43 @@
#define LLD_ELF_SCRIPT_LEXER_H

#include "lld/Common/LLVM.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MemoryBufferRef.h"
#include <vector>

namespace lld::elf {

class ScriptLexer {
protected:
struct Buffer {
// The remaining content to parse and the filename.
StringRef s, filename;
const char *begin = nullptr;
Buffer() = default;
Buffer(MemoryBufferRef mb)
: s(mb.getBuffer()), filename(mb.getBufferIdentifier()),
begin(mb.getBufferStart()) {}
};
// The current buffer and parent buffers due to INCLUDE.
Buffer curBuf;
SmallVector<Buffer, 0> buffers;

// The token before the last next().
StringRef prevTok;
// Rules for what is a token are different when we are in an expression.
// curTok holds the cached return value of peek() and is invalid when the
// expression state changes.
StringRef curTok;
// The inExpr state when curTok is cached.
bool curTokState = false;
Collaborator commented:

Was thinking if curTokInExpr might work better given that these are just booleans.

if (curTokInExpr == inExpr) ... 

@MaskRay (Member, Author) replied:

The idea is that I believe we will soon introduce new states, so I avoided inExpr in the name here.

bool eof = false;

public:
explicit ScriptLexer(MemoryBufferRef mb);

void setError(const Twine &msg);
void tokenize(MemoryBufferRef mb);
void lex();
StringRef skipSpace(StringRef s);
bool atEOF();
StringRef next();
@@ -33,15 +58,12 @@ class ScriptLexer {
MemoryBufferRef getCurrentMB();

std::vector<MemoryBufferRef> mbs;
std::vector<StringRef> tokens;
bool inExpr = false;
size_t pos = 0;

size_t lastLineNumber = 0;
size_t lastLineNumberOffset = 0;

private:
void maybeSplitExpr();
StringRef getLine();
size_t getLineNumber();
size_t getColumnNumber();