highlightjs · joshgoebel · Sep 17, 2020 · Sep 9, 2020 · Sep 9, 2020 · Sep 9, 2020
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -32,7 +32,8 @@
     "test": "mocha test",
     "test-markup": "mocha test/markup",
     "test-detect": "mocha test/detect",
-    "test-browser": "mocha test/browser"
+    "test-browser": "mocha test/browser",
+    "test-parser": "mocha test/parser"
   },
   "engines": {
     "node": "*"

diff --git a/src/highlight.js b/src/highlight.js
@@ -498,20 +498,13 @@ const HLJS = function(hljs) {
           // considered for a potential match
           resumeScanAtSamePosition = false;
         } else {
-          top.matcher.lastIndex = index;
           top.matcher.considerAll();
         }
+        top.matcher.lastIndex = index;
 
         const match = top.matcher.exec(codeToHighlight);
         // console.log("match", match[0], match.rule && match.rule.begin)
 
-        // if our failure to match was the result of a "resumed scan" then we
-        // need to advance one position and revert to full scanning before we
-        // decide there are truly no more matches at all to be had
-        if (!match && top.matcher.resumingScanAtSamePosition()) {
-          advanceOne();
-          continue;
-        }
         if (!match) break;
 
         const beforeMatch = codeToHighlight.substring(index, match.index);

diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js
@@ -143,7 +143,7 @@ export function compileLanguage(language) {
     }
 
     resumingScanAtSamePosition() {
-      return this.regexIndex != 0;
+      return this.regexIndex !== 0;
     }
 
     considerAll() {
@@ -160,15 +160,58 @@ export function compileLanguage(language) {
     exec(s) {
       const m = this.getMatcher(this.regexIndex);
       m.lastIndex = this.lastIndex;
-      const result = m.exec(s);
+      let result = m.exec(s);
+
+      // The following is because we have no easy way to say "resume scanning at the
+      // existing position but also skip the current rule ONLY". What happens is
+      // all prior rules are also skipped which can result in matching the wrong
+      // thing. Example of matching "booger":
+
+      // our matcher is [string, "booger", number]
+      //
+      // ....booger....
+
+      // if "booger" is ignored then we'd really need a regex to scan from the
+      // SAME position for only: [string, number] but ignoring "booger" (if it
+      // was the first match), a simple resume would scan ahead who knows how
+      // far looking only for "number", ignoring potential string matches (or
+      // future "booger" matches that might be valid.)
+
+      // So what we do: We execute two matchers, one resuming at the same
+      // position, but the second full matcher starting at the position after:
+
+      //     /--- resume first regex match here (for [number])
+      //     |/---- full match here for [string, "booger", number]
+      //     vv
+      // ....booger....
+
+      // Which ever results in a match first is then used. So this 3-4 step
+      // process essentially allows us to say "match at this position, excluding
+      // a prior rule that was ignored".
+      //
+      // 1. Match "booger" first, ignore. Also proves that [string] does non match.
+      // 2. Resume matching for [number]
+      // 3. Match at index + 1 for [string, "booger", number]
+      // 4. If #2 and #3 result in matches, which came first?
+      if (this.resumingScanAtSamePosition()) {
+        if (result && result.index === this.lastIndex) {
+          // result is position +0 and therefore a valid
+          // "resume" match so result stays result
+        } else { // use the second matcher result
+          const m2 = this.getMatcher(0);
+          m2.lastIndex = this.lastIndex + 1;
+          result = m2.exec(s);
+        }
+      }
+
       if (result) {
         this.regexIndex += result.position + 1;
-        if (this.regexIndex === this.count) { // wrap-around
-          this.regexIndex = 0;
+        if (this.regexIndex === this.count) {
+          // wrap-around to considering all matches again
+          this.considerAll();
         }
       }
 
-      // this.regexIndex = 0;
       return result;
     }
   }

diff --git a/test/index.js b/test/index.js
@@ -1,14 +1,14 @@
 'use strict';
 
-const hljs     = require('../build');
+const hljs = require('../build');
 hljs.debugMode(); // tests run in debug mode so errors are raised
 
 // Tests specific to the API exposed inside the hljs object.
 // Right now, that only includes tests for several common regular expressions.
 require('./api');
 
 // Test weird bugs we've fixed over time
-require("./parser")
+require("./parser");
 
 // Tests for auto detection of languages via `highlightAuto`.
 require('./detect');
@@ -26,4 +26,3 @@ require('./markup');
 // isn't actually used to test inside a browser but `jsdom` acts as a virtual
 // browser inside of node.js and runs together with all the other tests.
 require('./special');
-
diff --git a/test/parser/index.js b/test/parser/index.js
@@ -1,5 +1,7 @@
 'use strict';
 
+const should = require('should');
+
 describe('hljs', function() {
   require('./look-ahead-end-matchers');
   require('./resume-scan');

diff --git a/test/parser/resume-scan.js b/test/parser/resume-scan.js
@@ -1,13 +1,26 @@
-const hljs = require('../../build');
+'use strict';
 
-describe("bugs", function () {
+const hljs = require('../../build');
+hljs.debugMode(); // tests run in debug mode so errors are raised
 
+describe("bugs", function() {
   describe("resume scan when a match is ignored", () => {
     it("should continue to highlight later matches", () => {
-      hljs.highlight('java', 'ImmutablePair.of(Stuff.class, "bar")').value
-        .should.equal(
-          'ImmutablePair.of(Stuff.class, <span class="hljs-string">&quot;bar&quot;</span>)'
-      )
-    })
-  })
-})
+      const result = hljs.highlight('java', 'ImmutablePair.of(Stuff.class, "bar")');
+      result.value.should.equal(
+        'ImmutablePair.of(Stuff.class, <span class="hljs-string">&quot;bar&quot;</span>)'
+      );
+    });
+    // previously the match rule was resumed but it would scan ahead too far and ignore
+    // later matches that matched the PRIOR rules... this is because when we "skip" (ignore) a
+    // rule we really only want to skip searching for THAT rule at that same location, we
+    // do not want to stop searching for ALL the prior rules at that location...
+    it("BUT should not skip ahead too far", () => {
+      const result = hljs.highlight('java', 'ImmutablePair.of(Stuff.class, "bar");\n23');
+      result.value.should.equal(
+        'ImmutablePair.of(Stuff.class, <span class="hljs-string">&quot;bar&quot;</span>);\n' +
+        '<span class="hljs-number">23</span>'
+      );
+    });
+  });
+});