Choose diff-sequences for diff algorithm

quark-zju · facebook-github-bot · commit bce27b2980c6 · 2023-03-06T17:20:22.000-08:00
Summary: Before this change, there are 3 Myers diff algorithms used in the dependency tree: - diff-match-patch (1.0.5) - diff (4.0.1) - diff-sequences (via jest -> jest-diff -> diff-sequences) We'd like to simplify the dependency tree. The short answer is: - Use `diff-sequences`, or `jest-diff` which uses `diff-sequences` internally. For best performance, do: - Strip common prefix and suffix. - Make line comparison O(1), avoid `line1 === line2` which can be O(line length). - Consider skipping "cleanup" in `jest-diff` for long input. ---- Long answer of picking a diff library: I wrote a benchmark script to get some idea about their performance: const fs = require('fs') const dmp = new (require('diff-match-patch').diff_match_patch)(); const diff = require('diff'); const ds = require('diff-sequences').default; const jd = require('jest-diff'); dmp.Diff_Timeout = 120; // Diff functions. Output format: Chunk[] // Chunk is one of: // [0, n]: n common lines (same on both side) // [-1, n]: n left-side-only lines // [1, n]: n right-side-only lines function diff1(chars1, chars2) { return dmp.diff_main(chars1, chars2).map(v => [v[0], v[1].length]); } function diff1a(chars1, chars2) { return dmp.diff_main(chars1, chars2, false).map(v => [v[0], v[1].length]); } function diff2(chars1, chars2) { return diff.diffChars(chars1, chars2).map(v => { const d = v.added ? 1 : (v.removed ? -1 : 0); return [d, v.count]; }); } function diff3(chars1, chars2) { function isCommon(ai, bi) { return chars1[ai] == chars2[bi]; } const r = []; let lastA = 0, lastB = 0; function foundSequence(n, na, nb) { if (na > lastA) { r.push([-1, na - lastA]); lastA = na; } if (nb > lastB) { r.push([1, nb - lastB]); lastB = nb; } if (n > 0) { r.push([0, n]); lastA += n; lastB += n; } } ds(chars1.length, chars2.length, isCommon, foundSequence); foundSequence(0, chars1.length, chars2.length); return r; } function diff3a(chars1, chars2) { return jd.diffStringsRaw(chars1, chars2, false).map((d) => [d[0], d[1].length]); } function diff3b(chars1, chars2) { return jd.diffStringsRaw(chars1, chars2, true).map((d) => [d[0], d[1].length]); } function bench(a, b) { const {chars1, chars2} = dmp.diff_linesToChars_(a, b); function stringify(obj) { if (obj.length > 20) { return `${obj.length} items`; } else { return JSON.stringify(obj); } } [ ['diff-match-patch', diff1], ['diff-match-patch (checklines=false)', diff1a], ['diff-sequences', diff3], ['jest-diff (diff-sequences), no cleanup', diff3a], ['jest-diff (diff-sequences), with cleanup', diff3b], ['jsdiff', diff2], ].forEach(([name, diffFunc]) => { // node --expose_gc if (global.gc) { gc(); } const label = ` ${name}`; console.time(label); console.log(' ', stringify(diffFunc(chars1, chars2))); console.timeEnd(label); }); } let a, b; console.log('\nwith common prefix and suffix 1'); a = 'aaaaaaa\n'.repeat(50000) + 'bbbb\n' + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50000) + 'cccc\n' + 'dddd\n'.repeat(50000); bench(a, b); console.log('\nwith common prefix and suffix 2'); a = 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900); bench(a, b); console.log('\nwithout common prefix or suffix 1'); a = 'c\n' + 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50100) + 'z\n'; bench(a, b); console.log('\nwithout common prefix or suffix 2'); a = 'cccc\n' + 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000) + 'z\n'; b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900) + 'z\ny\n'; bench(a, b); // Hearthstone cards.json in different languages. // This is somewhat challenging since many lines are changed. // wget https://api.hearthstonejson.com/v1/168129/enUS/cards.json -O 1 // wget https://api.hearthstonejson.com/v1/168129/zhCN/cards.json -O 2 // python3 -m json.tool < 1 > 1.json // python3 -m json.tool < 2 > 2.json console.log('\ncards.json with different languages'); a = fs.readFileSync('1.json', {encoding: 'utf-8'}); b = fs.readFileSync('2.json', {encoding: 'utf-8'}); bench(a, b); The output looks like: with common prefix and suffix 1 [[0,50000],[-1,1],[1,1],[0,50000]] diff-match-patch: 5.073ms [[0,50000],[-1,1],[1,1],[0,50000]] diff-match-patch (checklines=false): 0.481ms [[0,50000],[-1,1],[1,1],[0,50000]] diff-sequences: 7.589ms [[0,50000],[-1,1],[1,1],[0,50000]] jest-diff (diff-sequences), no cleanup: 10.915ms [[0,50000],[-1,1],[1,1],[0,50000]] jest-diff (diff-sequences), with cleanup: 10.588ms [[0,50000],[-1,1],[1,1],[0,50000]] jsdiff: 22.664ms with common prefix and suffix 2 [[0,50000],[-1,101],[1,101],[0,49900]] diff-match-patch: 10.688ms [[0,50000],[-1,101],[1,101],[0,49900]] diff-match-patch (checklines=false): 2.619ms [[0,50000],[-1,101],[1,101],[0,49900]] diff-sequences: 12.687ms [[0,50000],[-1,101],[1,101],[0,49900]] jest-diff (diff-sequences), no cleanup: 11.055ms [[0,50000],[-1,101],[1,101],[0,49900]] jest-diff (diff-sequences), with cleanup: 4.356ms [[0,50000],[-1,1],[1,101],[0,49900],[-1,100]] jsdiff: 59.359ms without common prefix or suffix 1 [[-1,1],[0,100000],[1,101]] diff-match-patch: 632.863ms [[-1,1],[0,100000],[1,101]] diff-match-patch (checklines=false): 607.796ms [[-1,1],[0,50000],[1,51],[0,50000],[1,50]] diff-sequences: 12.366ms [[-1,1],[0,50000],[1,51],[0,50000],[1,50]] jest-diff (diff-sequences), no cleanup: 11.096ms [[-1,1],[0,100000],[1,51],[1,50]] jest-diff (diff-sequences), with cleanup: 1.029s [[-1,1],[0,100000],[1,101]] jsdiff: 13.163ms without common prefix or suffix 2 [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-match-patch: 2.773s [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-match-patch (checklines=false): 1.402s [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-sequences: 22.216ms [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] jest-diff (diff-sequences), no cleanup: 20.546ms [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] jest-diff (diff-sequences), with cleanup: 19.222ms [[-1,1],[0,50000],[-1,1],[1,101],[0,49900],[-1,100],[0,1],[1,1]] jsdiff: 33.82ms cards.json with different languages 67781 items diff-match-patch: 1:04.122 (m:ss.mmm) 57514 items diff-match-patch (checklines=false): 2:00.283 (m:ss.mmm) 67781 items diff-sequences: 1:09.486 (m:ss.mmm) 67781 items jest-diff (diff-sequences), no cleanup: 1:06.452 (m:ss.mmm) 52937 items jest-diff (diff-sequences), with cleanup: 1:09.118 (m:ss.mmm) ... (jsdiff cannot complete this test case in 20+ minutes) Observations: - In the last test case, `jsdiff` does not implement O(D^2) -> O(D) space optimization so it is practically unusable (reported as kpdecker/jsdiff#396). `diff-match-patch` and `jest-diff` both implement the linear space optimization, and have similar performance. - `diff-match-patch` strips common prefix and suffix, which makes it faster than `jest-diff` in "common prefix and suffix" test cases. - Both `diff-match-patch` and `jest-diff` can take a long time on "cleanup". See the "without common prefix or suffix 1" test case. We probably want to only enable cleanup for smaller input. - `diff-match-patch` performs visibly worse on the "without common prefix or suffix 2" test case. From the code it looks like `diff-match-patch` uses some kind of heuristics that tries to speed up things but ends up slowing it down. - Without cleanup, `jest-diff` might output `[1,51],[1,50]` that can be "obviously" merged to `[1,101]`. We might use a lightweight cleanup logic for that. - Reading the code, `diff-match-patch` turns lines into char codes. It cannot handle 65536 unique lines. (https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L503) Conclusions: - `jest-diff` (and `diff-sequences` under the hood) is overall the best choice. It has expected time and space complexities, and provides flexibility to skip the potentially slow "cleanup", and can support >65k unique lines. - `jest-diff` misses the "skip common prefix / suffix" optimization that `diff-match-patch` has, and seems practically important (editing a line in the editor - all lines are common prefixes and suffixes except for the line being edited). The optimization is not hard to implement. This diff implements it. - For certain use-cases (ex. linelog) where the diff content is not needed (at least for the left / "a" side), it should use `diff-sequences` to avoid overhead preparing the diff content. - `jest-diff`'s `diffLines` outputs one line per `Diff` but we want one chunk per `Diff`. - `jest-diff`'s `diffStringsRaw` produces one `Diff` per chunk, and because [`string.slice` is O(1) in V8](https://stackoverflow.com/a/72545403), it has acceptable performance. But mapping lines to chars would introduce the 65535 unique line limit undesirably. Reviewed By: evangrayk Differential Revision: D43857949 fbshipit-source-id: 9a3d85ebf10c9b82da8ab5cba4e14e519bbf264d
diff --git a/addons/isl/package.json b/addons/isl/package.json
@@ -6,14 +6,12 @@
     "@testing-library/jest-dom": "^5.14.1",
     "@testing-library/react": "^13.0.0",
     "@testing-library/user-event": "^13.2.1",
-    "@types/diff": "^5.0.2",
     "@types/jest": "^27.0.1",
     "@types/node": "^16.7.13",
     "@types/react": "^18.0.0",
     "@types/react-dom": "^18.0.0",
     "@vscode/webview-ui-toolkit": "^1.0.0",
-    "diff": "^5.0.0",
-    "diff-match-patch": "^1.0.5",
+    "diff-sequences": "^29.4.3",
     "isl-server": "0.0.0",
     "react": "^18.1.0",
     "react-dom": "^18.1.0",
@@ -23,7 +21,6 @@
     "typescript": "^4.4.2"
   },
   "devDependencies": {
-    "@types/diff-match-patch": "^1.0.32",
     "rewire": "^6.0.0",
     "ts-loader": "^9.3.1"
   },
diff --git a/addons/isl/src/linelog.ts b/addons/isl/src/linelog.ts
@@ -29,13 +29,8 @@ SOFTWARE.
 
 */
 
-import {diff_match_patch} from 'diff-match-patch';
-
-const dmp = new diff_match_patch();
-
-// The timeout does not seem to affect dmp performance.
-// But bumping it produces better diff results.
-dmp.Diff_Timeout = 10000;
+// Read D43857949 about the choice of the diff library.
+import diffSequences from 'diff-sequences';
 
 /** Operation code. */
 enum Op {
@@ -400,13 +395,13 @@ class LineLog {
     const [aRev, bRev] = rev ? [rev, rev] : [this.maxRev, this.maxRev + 1];
     const b = text;
 
-    const lines = splitLines(b);
+    const bLines = splitLines(b);
     this.checkOut(aRev);
-    const a = this.content;
-    const blocks = diffLines(a, b);
+    const aLines = splitLines(this.content);
+    const blocks = diffLines(aLines, bLines);
 
     blocks.reverse().forEach(([a1, a2, b1, b2]) => {
-      this.editChunk(a1, a2, bRev, lines.slice(b1, b2));
+      this.editChunk(a1, a2, bRev, bLines.slice(b1, b2));
     });
     this.content = b;
     this.lastCheckoutKey = `${bRev},null`;
@@ -429,41 +424,59 @@ class LineLog {
 }
 
 /**
- * Calculate the differences.
+ * Calculate the line differences. For performance, this function only
+ * returns the line indexes for different chunks. The line contents
+ * are not returned.
  *
- * @param a Content of the "a" side.
- * @param b Content of the "b" side.
+ * @param aLines lines on the "a" side.
+ * @param bLines lines on the "b" side.
  * @returns A list of `(a1, a2, b1, b2)` tuples for the line ranges that
  * are different between "a" and "b".
  */
-function diffLines(a: string, b: string): [LineIdx, LineIdx, LineIdx, LineIdx][] {
-  const {chars1, chars2} = dmp.diff_linesToChars_(a, b);
+function diffLines(aLines: string[], bLines: string[]): [LineIdx, LineIdx, LineIdx, LineIdx][] {
+  // Avoid O(string length) comparison.
+  const [aList, bList] = stringsToInts([aLines, bLines]);
+
+  // Skip common prefix and suffix.
+  let aLen = aList.length;
+  let bLen = bList.length;
+  const minLen = Math.min(aLen, bLen);
+  let commonPrefixLen = 0;
+  while (commonPrefixLen < minLen && aList[commonPrefixLen] === bList[commonPrefixLen]) {
+    commonPrefixLen += 1;
+  }
+  while (aLen > commonPrefixLen && bLen > commonPrefixLen && aList[aLen - 1] === bList[bLen - 1]) {
+    aLen -= 1;
+    bLen -= 1;
+  }
+  aLen -= commonPrefixLen;
+  bLen -= commonPrefixLen;
+
+  // Run the diff algorithm.
   const blocks: [LineIdx, LineIdx, LineIdx, LineIdx][] = [];
-  let a1 = 0,
-    a2 = 0,
-    b1 = 0,
-    b2 = 0;
-  const push = (len: number) => {
+  let a1 = 0;
+  let b1 = 0;
+
+  function isCommon(aIndex: number, bIndex: number) {
+    return aList[aIndex + commonPrefixLen] === bList[bIndex + commonPrefixLen];
+  }
+
+  function foundSequence(n: LineIdx, a2: LineIdx, b2: LineIdx) {
     if (a1 !== a2 || b1 !== b2) {
-      blocks.push([a1, a2, b1, b2]);
-    }
-    a1 = a2 = a2 + len;
-    b1 = b2 = b2 + len;
-  };
-  dmp.diff_main(chars1, chars2, false).forEach(x => {
-    const [op, chars] = x;
-    const len = chars.length;
-    if (op === 0) {
-      push(len);
-    }
-    if (op < 0) {
-      a2 += len;
+      blocks.push([
+        a1 + commonPrefixLen,
+        a2 + commonPrefixLen,
+        b1 + commonPrefixLen,
+        b2 + commonPrefixLen,
+      ]);
     }
-    if (op > 0) {
-      b2 += len;
-    }
-  });
-  push(0);
+    a1 = a2 + n;
+    b1 = b2 + n;
+  }
+
+  diffSequences(aLen, bLen, isCommon, foundSequence);
+  foundSequence(0, aLen, bLen);
+
   return blocks;
 }
 
@@ -485,6 +498,28 @@ function splitLines(s: string): string[] {
   return result;
 }
 
+/**
+ * Make strings with the same content use the same integer
+ * for fast comparasion.
+ */
+function stringsToInts(linesArray: string[][]): number[][] {
+  // This is similar to diff-match-patch's diff_linesToChars_ but is not
+  // limited to 65536 unique lines.
+  const lineMap = new Map<string, number>();
+  return linesArray.map(lines =>
+    lines.map(line => {
+      const existingId = lineMap.get(line);
+      if (existingId != null) {
+        return existingId;
+      } else {
+        const id = lineMap.size;
+        lineMap.set(line, id);
+        return id;
+      }
+    }),
+  );
+}
+
 /** If the assertion fails, throw an `Error` with the given `message`. */
 function assert(condition: boolean, message: string) {
   if (!condition) {
diff --git a/addons/yarn.lock b/addons/yarn.lock
@@ -2517,11 +2517,6 @@
   dependencies:
     "@types/node" "*"
 
-"@types/diff-match-patch@^1.0.32":
-  version "1.0.32"
-  resolved "https://registry.yarnpkg.com/@types/diff-match-patch/-/diff-match-patch-1.0.32.tgz#d9c3b8c914aa8229485351db4865328337a3d09f"
-  integrity sha512-bPYT5ECFiblzsVzyURaNhljBH2Gh1t9LowgUwciMrNAhFewLkHT2H0Mto07Y4/3KCOGZHRQll3CTtQZ0X11D/A==
-
 "@types/diff@^5.0.2":
   version "5.0.2"
   resolved "https://registry.yarnpkg.com/@types/diff/-/diff-5.0.2.tgz#dd565e0086ccf8bc6522c6ebafd8a3125c91c12b"
@@ -4813,11 +4808,6 @@ didyoumean@^1.2.2:
   resolved "https://registry.yarnpkg.com/didyoumean/-/didyoumean-1.2.2.tgz#989346ffe9e839b4555ecf5666edea0d3e8ad037"
   integrity sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==
 
-diff-match-patch@^1.0.5:
-  version "1.0.5"
-  resolved "https://registry.yarnpkg.com/diff-match-patch/-/diff-match-patch-1.0.5.tgz#abb584d5f10cd1196dfc55aa03701592ae3f7b37"
-  integrity sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==
-
 diff-sequences@^27.5.1:
   version "27.5.1"
   resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-27.5.1.tgz#eaecc0d327fd68c8d9672a1e64ab8dccb2ef5327"
@@ -4828,6 +4818,11 @@ diff-sequences@^28.1.1:
   resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-28.1.1.tgz#9989dc731266dc2903457a70e996f3a041913ac6"
   integrity sha512-FU0iFaH/E23a+a718l8Qa/19bF9p06kgE0KipMOMadwa3SjnaElKzPaUC0vnibs6/B/9ni97s61mcejk8W1fQw==
 
+diff-sequences@^29.4.3:
+  version "29.4.3"
+  resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-29.4.3.tgz#9314bc1fabe09267ffeca9cbafc457d8499a13f2"
+  integrity sha512-ofrBgwpPhCD85kMKtE9RYFFq6OC1A89oW2vvgWZNCwxrUpRUILopY7lsYyMDSjc8g6U6aiO0Qubg6r4Wgt5ZnA==
+
 diff@^4.0.1:
   version "4.0.2"
   resolved "https://registry.yarnpkg.com/diff/-/diff-4.0.2.tgz#60f3aecb89d5fae520c11aa19efc2bb982aade7d"