Skip to content

Commit bce27b2

Browse files
quark-zjufacebook-github-bot
authored andcommitted
Choose diff-sequences for diff algorithm
Summary: Before this change, there are 3 Myers diff algorithms used in the dependency tree: - diff-match-patch (1.0.5) - diff (4.0.1) - diff-sequences (via jest -> jest-diff -> diff-sequences) We'd like to simplify the dependency tree. The short answer is: - Use `diff-sequences`, or `jest-diff` which uses `diff-sequences` internally. For best performance, do: - Strip common prefix and suffix. - Make line comparison O(1), avoid `line1 === line2` which can be O(line length). - Consider skipping "cleanup" in `jest-diff` for long input. ---- Long answer of picking a diff library: I wrote a benchmark script to get some idea about their performance: const fs = require('fs') const dmp = new (require('diff-match-patch').diff_match_patch)(); const diff = require('diff'); const ds = require('diff-sequences').default; const jd = require('jest-diff'); dmp.Diff_Timeout = 120; // Diff functions. Output format: Chunk[] // Chunk is one of: // [0, n]: n common lines (same on both side) // [-1, n]: n left-side-only lines // [1, n]: n right-side-only lines function diff1(chars1, chars2) { return dmp.diff_main(chars1, chars2).map(v => [v[0], v[1].length]); } function diff1a(chars1, chars2) { return dmp.diff_main(chars1, chars2, false).map(v => [v[0], v[1].length]); } function diff2(chars1, chars2) { return diff.diffChars(chars1, chars2).map(v => { const d = v.added ? 1 : (v.removed ? -1 : 0); return [d, v.count]; }); } function diff3(chars1, chars2) { function isCommon(ai, bi) { return chars1[ai] == chars2[bi]; } const r = []; let lastA = 0, lastB = 0; function foundSequence(n, na, nb) { if (na > lastA) { r.push([-1, na - lastA]); lastA = na; } if (nb > lastB) { r.push([1, nb - lastB]); lastB = nb; } if (n > 0) { r.push([0, n]); lastA += n; lastB += n; } } ds(chars1.length, chars2.length, isCommon, foundSequence); foundSequence(0, chars1.length, chars2.length); return r; } function diff3a(chars1, chars2) { return jd.diffStringsRaw(chars1, chars2, false).map((d) => [d[0], d[1].length]); } function diff3b(chars1, chars2) { return jd.diffStringsRaw(chars1, chars2, true).map((d) => [d[0], d[1].length]); } function bench(a, b) { const {chars1, chars2} = dmp.diff_linesToChars_(a, b); function stringify(obj) { if (obj.length > 20) { return `${obj.length} items`; } else { return JSON.stringify(obj); } } [ ['diff-match-patch', diff1], ['diff-match-patch (checklines=false)', diff1a], ['diff-sequences', diff3], ['jest-diff (diff-sequences), no cleanup', diff3a], ['jest-diff (diff-sequences), with cleanup', diff3b], ['jsdiff', diff2], ].forEach(([name, diffFunc]) => { // node --expose_gc if (global.gc) { gc(); } const label = ` ${name}`; console.time(label); console.log(' ', stringify(diffFunc(chars1, chars2))); console.timeEnd(label); }); } let a, b; console.log('\nwith common prefix and suffix 1'); a = 'aaaaaaa\n'.repeat(50000) + 'bbbb\n' + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50000) + 'cccc\n' + 'dddd\n'.repeat(50000); bench(a, b); console.log('\nwith common prefix and suffix 2'); a = 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900); bench(a, b); console.log('\nwithout common prefix or suffix 1'); a = 'c\n' + 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50100) + 'z\n'; bench(a, b); console.log('\nwithout common prefix or suffix 2'); a = 'cccc\n' + 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000) + 'z\n'; b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900) + 'z\ny\n'; bench(a, b); // Hearthstone cards.json in different languages. // This is somewhat challenging since many lines are changed. // wget https://api.hearthstonejson.com/v1/168129/enUS/cards.json -O 1 // wget https://api.hearthstonejson.com/v1/168129/zhCN/cards.json -O 2 // python3 -m json.tool < 1 > 1.json // python3 -m json.tool < 2 > 2.json console.log('\ncards.json with different languages'); a = fs.readFileSync('1.json', {encoding: 'utf-8'}); b = fs.readFileSync('2.json', {encoding: 'utf-8'}); bench(a, b); The output looks like: with common prefix and suffix 1 [[0,50000],[-1,1],[1,1],[0,50000]] diff-match-patch: 5.073ms [[0,50000],[-1,1],[1,1],[0,50000]] diff-match-patch (checklines=false): 0.481ms [[0,50000],[-1,1],[1,1],[0,50000]] diff-sequences: 7.589ms [[0,50000],[-1,1],[1,1],[0,50000]] jest-diff (diff-sequences), no cleanup: 10.915ms [[0,50000],[-1,1],[1,1],[0,50000]] jest-diff (diff-sequences), with cleanup: 10.588ms [[0,50000],[-1,1],[1,1],[0,50000]] jsdiff: 22.664ms with common prefix and suffix 2 [[0,50000],[-1,101],[1,101],[0,49900]] diff-match-patch: 10.688ms [[0,50000],[-1,101],[1,101],[0,49900]] diff-match-patch (checklines=false): 2.619ms [[0,50000],[-1,101],[1,101],[0,49900]] diff-sequences: 12.687ms [[0,50000],[-1,101],[1,101],[0,49900]] jest-diff (diff-sequences), no cleanup: 11.055ms [[0,50000],[-1,101],[1,101],[0,49900]] jest-diff (diff-sequences), with cleanup: 4.356ms [[0,50000],[-1,1],[1,101],[0,49900],[-1,100]] jsdiff: 59.359ms without common prefix or suffix 1 [[-1,1],[0,100000],[1,101]] diff-match-patch: 632.863ms [[-1,1],[0,100000],[1,101]] diff-match-patch (checklines=false): 607.796ms [[-1,1],[0,50000],[1,51],[0,50000],[1,50]] diff-sequences: 12.366ms [[-1,1],[0,50000],[1,51],[0,50000],[1,50]] jest-diff (diff-sequences), no cleanup: 11.096ms [[-1,1],[0,100000],[1,51],[1,50]] jest-diff (diff-sequences), with cleanup: 1.029s [[-1,1],[0,100000],[1,101]] jsdiff: 13.163ms without common prefix or suffix 2 [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-match-patch: 2.773s [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-match-patch (checklines=false): 1.402s [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-sequences: 22.216ms [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] jest-diff (diff-sequences), no cleanup: 20.546ms [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] jest-diff (diff-sequences), with cleanup: 19.222ms [[-1,1],[0,50000],[-1,1],[1,101],[0,49900],[-1,100],[0,1],[1,1]] jsdiff: 33.82ms cards.json with different languages 67781 items diff-match-patch: 1:04.122 (m:ss.mmm) 57514 items diff-match-patch (checklines=false): 2:00.283 (m:ss.mmm) 67781 items diff-sequences: 1:09.486 (m:ss.mmm) 67781 items jest-diff (diff-sequences), no cleanup: 1:06.452 (m:ss.mmm) 52937 items jest-diff (diff-sequences), with cleanup: 1:09.118 (m:ss.mmm) ... (jsdiff cannot complete this test case in 20+ minutes) Observations: - In the last test case, `jsdiff` does not implement O(D^2) -> O(D) space optimization so it is practically unusable (reported as kpdecker/jsdiff#396). `diff-match-patch` and `jest-diff` both implement the linear space optimization, and have similar performance. - `diff-match-patch` strips common prefix and suffix, which makes it faster than `jest-diff` in "common prefix and suffix" test cases. - Both `diff-match-patch` and `jest-diff` can take a long time on "cleanup". See the "without common prefix or suffix 1" test case. We probably want to only enable cleanup for smaller input. - `diff-match-patch` performs visibly worse on the "without common prefix or suffix 2" test case. From the code it looks like `diff-match-patch` uses some kind of heuristics that tries to speed up things but ends up slowing it down. - Without cleanup, `jest-diff` might output `[1,51],[1,50]` that can be "obviously" merged to `[1,101]`. We might use a lightweight cleanup logic for that. - Reading the code, `diff-match-patch` turns lines into char codes. It cannot handle 65536 unique lines. (https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L503) Conclusions: - `jest-diff` (and `diff-sequences` under the hood) is overall the best choice. It has expected time and space complexities, and provides flexibility to skip the potentially slow "cleanup", and can support >65k unique lines. - `jest-diff` misses the "skip common prefix / suffix" optimization that `diff-match-patch` has, and seems practically important (editing a line in the editor - all lines are common prefixes and suffixes except for the line being edited). The optimization is not hard to implement. This diff implements it. - For certain use-cases (ex. linelog) where the diff content is not needed (at least for the left / "a" side), it should use `diff-sequences` to avoid overhead preparing the diff content. - `jest-diff`'s `diffLines` outputs one line per `Diff` but we want one chunk per `Diff`. - `jest-diff`'s `diffStringsRaw` produces one `Diff` per chunk, and because [`string.slice` is O(1) in V8](https://stackoverflow.com/a/72545403), it has acceptable performance. But mapping lines to chars would introduce the 65535 unique line limit undesirably. Reviewed By: evangrayk Differential Revision: D43857949 fbshipit-source-id: 9a3d85ebf10c9b82da8ab5cba4e14e519bbf264d
1 parent 41391d8 commit bce27b2

File tree

3 files changed

+80
-53
lines changed

3 files changed

+80
-53
lines changed

addons/isl/package.json

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,12 @@
66
"@testing-library/jest-dom": "^5.14.1",
77
"@testing-library/react": "^13.0.0",
88
"@testing-library/user-event": "^13.2.1",
9-
"@types/diff": "^5.0.2",
109
"@types/jest": "^27.0.1",
1110
"@types/node": "^16.7.13",
1211
"@types/react": "^18.0.0",
1312
"@types/react-dom": "^18.0.0",
1413
"@vscode/webview-ui-toolkit": "^1.0.0",
15-
"diff": "^5.0.0",
16-
"diff-match-patch": "^1.0.5",
14+
"diff-sequences": "^29.4.3",
1715
"isl-server": "0.0.0",
1816
"react": "^18.1.0",
1917
"react-dom": "^18.1.0",
@@ -23,7 +21,6 @@
2321
"typescript": "^4.4.2"
2422
},
2523
"devDependencies": {
26-
"@types/diff-match-patch": "^1.0.32",
2724
"rewire": "^6.0.0",
2825
"ts-loader": "^9.3.1"
2926
},

addons/isl/src/linelog.ts

Lines changed: 74 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,8 @@ SOFTWARE.
2929
3030
*/
3131

32-
import {diff_match_patch} from 'diff-match-patch';
33-
34-
const dmp = new diff_match_patch();
35-
36-
// The timeout does not seem to affect dmp performance.
37-
// But bumping it produces better diff results.
38-
dmp.Diff_Timeout = 10000;
32+
// Read D43857949 about the choice of the diff library.
33+
import diffSequences from 'diff-sequences';
3934

4035
/** Operation code. */
4136
enum Op {
@@ -400,13 +395,13 @@ class LineLog {
400395
const [aRev, bRev] = rev ? [rev, rev] : [this.maxRev, this.maxRev + 1];
401396
const b = text;
402397

403-
const lines = splitLines(b);
398+
const bLines = splitLines(b);
404399
this.checkOut(aRev);
405-
const a = this.content;
406-
const blocks = diffLines(a, b);
400+
const aLines = splitLines(this.content);
401+
const blocks = diffLines(aLines, bLines);
407402

408403
blocks.reverse().forEach(([a1, a2, b1, b2]) => {
409-
this.editChunk(a1, a2, bRev, lines.slice(b1, b2));
404+
this.editChunk(a1, a2, bRev, bLines.slice(b1, b2));
410405
});
411406
this.content = b;
412407
this.lastCheckoutKey = `${bRev},null`;
@@ -429,41 +424,59 @@ class LineLog {
429424
}
430425

431426
/**
432-
* Calculate the differences.
427+
* Calculate the line differences. For performance, this function only
428+
* returns the line indexes for different chunks. The line contents
429+
* are not returned.
433430
*
434-
* @param a Content of the "a" side.
435-
* @param b Content of the "b" side.
431+
* @param aLines lines on the "a" side.
432+
* @param bLines lines on the "b" side.
436433
* @returns A list of `(a1, a2, b1, b2)` tuples for the line ranges that
437434
* are different between "a" and "b".
438435
*/
439-
function diffLines(a: string, b: string): [LineIdx, LineIdx, LineIdx, LineIdx][] {
440-
const {chars1, chars2} = dmp.diff_linesToChars_(a, b);
436+
function diffLines(aLines: string[], bLines: string[]): [LineIdx, LineIdx, LineIdx, LineIdx][] {
437+
// Avoid O(string length) comparison.
438+
const [aList, bList] = stringsToInts([aLines, bLines]);
439+
440+
// Skip common prefix and suffix.
441+
let aLen = aList.length;
442+
let bLen = bList.length;
443+
const minLen = Math.min(aLen, bLen);
444+
let commonPrefixLen = 0;
445+
while (commonPrefixLen < minLen && aList[commonPrefixLen] === bList[commonPrefixLen]) {
446+
commonPrefixLen += 1;
447+
}
448+
while (aLen > commonPrefixLen && bLen > commonPrefixLen && aList[aLen - 1] === bList[bLen - 1]) {
449+
aLen -= 1;
450+
bLen -= 1;
451+
}
452+
aLen -= commonPrefixLen;
453+
bLen -= commonPrefixLen;
454+
455+
// Run the diff algorithm.
441456
const blocks: [LineIdx, LineIdx, LineIdx, LineIdx][] = [];
442-
let a1 = 0,
443-
a2 = 0,
444-
b1 = 0,
445-
b2 = 0;
446-
const push = (len: number) => {
457+
let a1 = 0;
458+
let b1 = 0;
459+
460+
function isCommon(aIndex: number, bIndex: number) {
461+
return aList[aIndex + commonPrefixLen] === bList[bIndex + commonPrefixLen];
462+
}
463+
464+
function foundSequence(n: LineIdx, a2: LineIdx, b2: LineIdx) {
447465
if (a1 !== a2 || b1 !== b2) {
448-
blocks.push([a1, a2, b1, b2]);
449-
}
450-
a1 = a2 = a2 + len;
451-
b1 = b2 = b2 + len;
452-
};
453-
dmp.diff_main(chars1, chars2, false).forEach(x => {
454-
const [op, chars] = x;
455-
const len = chars.length;
456-
if (op === 0) {
457-
push(len);
458-
}
459-
if (op < 0) {
460-
a2 += len;
466+
blocks.push([
467+
a1 + commonPrefixLen,
468+
a2 + commonPrefixLen,
469+
b1 + commonPrefixLen,
470+
b2 + commonPrefixLen,
471+
]);
461472
}
462-
if (op > 0) {
463-
b2 += len;
464-
}
465-
});
466-
push(0);
473+
a1 = a2 + n;
474+
b1 = b2 + n;
475+
}
476+
477+
diffSequences(aLen, bLen, isCommon, foundSequence);
478+
foundSequence(0, aLen, bLen);
479+
467480
return blocks;
468481
}
469482

@@ -485,6 +498,28 @@ function splitLines(s: string): string[] {
485498
return result;
486499
}
487500

501+
/**
502+
* Make strings with the same content use the same integer
503+
* for fast comparasion.
504+
*/
505+
function stringsToInts(linesArray: string[][]): number[][] {
506+
// This is similar to diff-match-patch's diff_linesToChars_ but is not
507+
// limited to 65536 unique lines.
508+
const lineMap = new Map<string, number>();
509+
return linesArray.map(lines =>
510+
lines.map(line => {
511+
const existingId = lineMap.get(line);
512+
if (existingId != null) {
513+
return existingId;
514+
} else {
515+
const id = lineMap.size;
516+
lineMap.set(line, id);
517+
return id;
518+
}
519+
}),
520+
);
521+
}
522+
488523
/** If the assertion fails, throw an `Error` with the given `message`. */
489524
function assert(condition: boolean, message: string) {
490525
if (!condition) {

addons/yarn.lock

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2517,11 +2517,6 @@
25172517
dependencies:
25182518
"@types/node" "*"
25192519

2520-
"@types/diff-match-patch@^1.0.32":
2521-
version "1.0.32"
2522-
resolved "https://registry.yarnpkg.com/@types/diff-match-patch/-/diff-match-patch-1.0.32.tgz#d9c3b8c914aa8229485351db4865328337a3d09f"
2523-
integrity sha512-bPYT5ECFiblzsVzyURaNhljBH2Gh1t9LowgUwciMrNAhFewLkHT2H0Mto07Y4/3KCOGZHRQll3CTtQZ0X11D/A==
2524-
25252520
"@types/diff@^5.0.2":
25262521
version "5.0.2"
25272522
resolved "https://registry.yarnpkg.com/@types/diff/-/diff-5.0.2.tgz#dd565e0086ccf8bc6522c6ebafd8a3125c91c12b"
@@ -4813,11 +4808,6 @@ didyoumean@^1.2.2:
48134808
resolved "https://registry.yarnpkg.com/didyoumean/-/didyoumean-1.2.2.tgz#989346ffe9e839b4555ecf5666edea0d3e8ad037"
48144809
integrity sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==
48154810

4816-
diff-match-patch@^1.0.5:
4817-
version "1.0.5"
4818-
resolved "https://registry.yarnpkg.com/diff-match-patch/-/diff-match-patch-1.0.5.tgz#abb584d5f10cd1196dfc55aa03701592ae3f7b37"
4819-
integrity sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==
4820-
48214811
diff-sequences@^27.5.1:
48224812
version "27.5.1"
48234813
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-27.5.1.tgz#eaecc0d327fd68c8d9672a1e64ab8dccb2ef5327"
@@ -4828,6 +4818,11 @@ diff-sequences@^28.1.1:
48284818
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-28.1.1.tgz#9989dc731266dc2903457a70e996f3a041913ac6"
48294819
integrity sha512-FU0iFaH/E23a+a718l8Qa/19bF9p06kgE0KipMOMadwa3SjnaElKzPaUC0vnibs6/B/9ni97s61mcejk8W1fQw==
48304820

4821+
diff-sequences@^29.4.3:
4822+
version "29.4.3"
4823+
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-29.4.3.tgz#9314bc1fabe09267ffeca9cbafc457d8499a13f2"
4824+
integrity sha512-ofrBgwpPhCD85kMKtE9RYFFq6OC1A89oW2vvgWZNCwxrUpRUILopY7lsYyMDSjc8g6U6aiO0Qubg6r4Wgt5ZnA==
4825+
48314826
diff@^4.0.1:
48324827
version "4.0.2"
48334828
resolved "https://registry.yarnpkg.com/diff/-/diff-4.0.2.tgz#60f3aecb89d5fae520c11aa19efc2bb982aade7d"

0 commit comments

Comments
 (0)