You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Summary:
Before this change, the dependency tree contains three implementations of the Myers diff algorithm:
- diff-match-patch (1.0.5)
- diff (4.0.1)
- diff-sequences (via jest -> jest-diff -> diff-sequences)
We'd like to simplify the dependency tree. The short answer is:
- Use `diff-sequences`, or `jest-diff` which uses `diff-sequences` internally.
For best performance, do:
- Strip common prefix and suffix.
- Make line comparison O(1), avoid `line1 === line2` which can be O(line
length).
- Consider skipping "cleanup" in `jest-diff` for long input.
----
Long answer of picking a diff library:
I wrote a benchmark script to get some idea about their performance:
const fs = require('fs')
const dmp = new (require('diff-match-patch').diff_match_patch)();
const diff = require('diff');
const ds = require('diff-sequences').default;
const jd = require('jest-diff');
// Time budget diff-match-patch gets for a single diff. NOTE(review): the unit is
// presumably seconds per the upstream documentation, i.e. two minutes — confirm.
dmp.Diff_Timeout = 120;
// Diff functions. Output format: Chunk[]
// Chunk is one of:
// [0, n]: n common lines (same on both sides)
// [-1, n]: n left-side-only lines
// [1, n]: n right-side-only lines
/**
 * Diff with diff-match-patch, calling `diff_main` with only the two inputs.
 *
 * @param {string} chars1 - Left-side input.
 * @param {string} chars2 - Right-side input.
 * @returns {Array<[number, number]>} One `[op, length]` pair per entry that
 *   `diff_main` returns, where `length` is the length of the entry's text.
 */
function diff1(chars1, chars2) {
  const entries = dmp.diff_main(chars1, chars2);
  const chunks = [];
  for (const entry of entries) {
    // Entries are read by index; they are not assumed to be iterable.
    const op = entry[0];
    const text = entry[1];
    chunks.push([op, text.length]);
  }
  return chunks;
}
/**
 * Diff with diff-match-patch, passing `false` as the third argument of
 * `diff_main` (the "checklines=false" variant of the benchmark).
 *
 * @param {string} chars1 - Left-side input.
 * @param {string} chars2 - Right-side input.
 * @returns {Array<[number, number]>} One `[op, length]` pair per entry that
 *   `diff_main` returns, where `length` is the length of the entry's text.
 */
function diff1a(chars1, chars2) {
  const toChunk = (entry) => {
    const textLength = entry[1].length;
    return [entry[0], textLength];
  };
  const entries = dmp.diff_main(chars1, chars2, false);
  return entries.map(toChunk);
}
/**
 * Diff with jsdiff's `diffChars`.
 *
 * @param {string} chars1 - Left-side input.
 * @param {string} chars2 - Right-side input.
 * @returns {Array<[number, number]>} One `[op, count]` pair per change object:
 *   op is 1 when `added` is truthy, otherwise -1 when `removed` is truthy,
 *   otherwise 0; count is the change object's `count`.
 */
function diff2(chars1, chars2) {
  const toChunk = (change) => {
    let op;
    if (change.added) {
      op = 1;
    } else if (change.removed) {
      op = -1;
    } else {
      op = 0;
    }
    return [op, change.count];
  };
  const changes = diff.diffChars(chars1, chars2);
  return changes.map(toChunk);
}
/**
 * Diff with diff-sequences directly, turning its callbacks into chunks.
 *
 * Items are compared by index with strict equality, so values of different
 * types (e.g. `1` and `'1'`) are never treated as common.
 *
 * @param {string|Array} chars1 - Left-side sequence (indexable, with `length`).
 * @param {string|Array} chars2 - Right-side sequence (indexable, with `length`).
 * @returns {Array<[number, number]>} Chunks `[op, n]`: `[-1, n]` for n
 *   left-only items, `[1, n]` for n right-only items, `[0, n]` for n common
 *   items.
 */
function diff3(chars1, chars2) {
  const isCommon = (ai, bi) => chars1[ai] === chars2[bi];

  const r = [];
  // Positions in chars1 / chars2 up to which chunks have already been emitted.
  let lastA = 0;
  let lastB = 0;

  // Handles a common run of `n` items starting at index `na` in chars1 and
  // `nb` in chars2. Anything between the previous position and the start of
  // the run is emitted first as a left-only and/or right-only chunk.
  function foundSequence(n, na, nb) {
    if (na > lastA) {
      r.push([-1, na - lastA]);
      lastA = na;
    }
    if (nb > lastB) {
      r.push([1, nb - lastB]);
      lastB = nb;
    }
    if (n > 0) {
      r.push([0, n]);
      lastA += n;
      lastB += n;
    }
  }

  ds(chars1.length, chars2.length, isCommon, foundSequence);
  // Flush whatever remains after the last common run: a zero-length run placed
  // at the end of both inputs emits the trailing left-only / right-only chunks.
  foundSequence(0, chars1.length, chars2.length);
  return r;
}
/**
 * Diff with jest-diff's `diffStringsRaw`, passing `false` as the third
 * argument (the "no cleanup" variant of the benchmark).
 *
 * @param {string} chars1 - Left-side input.
 * @param {string} chars2 - Right-side input.
 * @returns {Array<[number, number]>} One `[op, length]` pair per returned
 *   diff, where `length` is the length of the diff's text.
 */
function diff3a(chars1, chars2) {
  const rawDiffs = jd.diffStringsRaw(chars1, chars2, false);
  return rawDiffs.map((rawDiff) => {
    const op = rawDiff[0];
    const text = rawDiff[1];
    return [op, text.length];
  });
}
/**
 * Diff with jest-diff's `diffStringsRaw`, passing `true` as the third
 * argument (the "with cleanup" variant of the benchmark).
 *
 * @param {string} chars1 - Left-side input.
 * @param {string} chars2 - Right-side input.
 * @returns {Array<[number, number]>} One `[op, length]` pair per returned
 *   diff, where `length` is the length of the diff's text.
 */
function diff3b(chars1, chars2) {
  const chunks = [];
  const rawDiffs = jd.diffStringsRaw(chars1, chars2, true);
  for (const rawDiff of rawDiffs) {
    chunks.push([rawDiff[0], rawDiff[1].length]);
  }
  return chunks;
}
/**
 * Run every diff implementation on the same pair of texts and print, for each
 * one, a short form of its result followed by the elapsed time.
 *
 * Note that the timed region covers the diff call as well as formatting and
 * printing its result.
 *
 * @param {string} a - Left-side text.
 * @param {string} b - Right-side text.
 */
function bench(a, b) {
  // All candidates receive the strings produced by diff_linesToChars_, not the
  // raw texts.
  const encoded = dmp.diff_linesToChars_(a, b);
  const chars1 = encoded.chars1;
  const chars2 = encoded.chars2;

  // Results with more than 20 entries are printed as a count only.
  const summarize = (obj) =>
    obj.length > 20 ? `${obj.length} items` : JSON.stringify(obj);

  const candidates = [
    ['diff-match-patch', diff1],
    ['diff-match-patch (checklines=false)', diff1a],
    ['diff-sequences', diff3],
    ['jest-diff (diff-sequences), no cleanup', diff3a],
    ['jest-diff (diff-sequences), with cleanup', diff3b],
    ['jsdiff', diff2],
  ];

  for (const [name, diffFunc] of candidates) {
    // node --expose_gc
    // Collect garbage before each run when the hook is available.
    if (global.gc) {
      gc();
    }
    const label = ` ${name}`;
    console.time(label);
    const result = diffFunc(chars1, chars2);
    console.log(' ', summarize(result));
    console.timeEnd(label);
  }
}
let a, b;

// Benchmark scenarios as [title, makeInputs] pairs. makeInputs builds the
// [a, b] texts lazily, so each title is printed before its inputs are created.
const scenarios = [
  [
    '\nwith common prefix and suffix 1',
    () => [
      'aaaaaaa\n'.repeat(50000) + 'bbbb\n' + 'dddd\n'.repeat(50000),
      'aaaaaaa\n'.repeat(50000) + 'cccc\n' + 'dddd\n'.repeat(50000),
    ],
  ],
  [
    '\nwith common prefix and suffix 2',
    () => [
      'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000),
      'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900),
    ],
  ],
  [
    '\nwithout common prefix or suffix 1',
    () => [
      'c\n' + 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50000),
      'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50100) + 'z\n',
    ],
  ],
  [
    '\nwithout common prefix or suffix 2',
    () => [
      'cccc\n' + 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000) + 'z\n',
      'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900) + 'z\ny\n',
    ],
  ],
  // Hearthstone cards.json in different languages.
  // This is somewhat challenging since many lines are changed.
  // wget https://api.hearthstonejson.com/v1/168129/enUS/cards.json -O 1
  // wget https://api.hearthstonejson.com/v1/168129/zhCN/cards.json -O 2
  // python3 -m json.tool < 1 > 1.json
  // python3 -m json.tool < 2 > 2.json
  [
    '\ncards.json with different languages',
    () => [
      fs.readFileSync('1.json', {encoding: 'utf-8'}),
      fs.readFileSync('2.json', {encoding: 'utf-8'}),
    ],
  ],
];

for (const [title, makeInputs] of scenarios) {
  console.log(title);
  [a, b] = makeInputs();
  bench(a, b);
}
The output looks like:
with common prefix and suffix 1
[[0,50000],[-1,1],[1,1],[0,50000]]
diff-match-patch: 5.073ms
[[0,50000],[-1,1],[1,1],[0,50000]]
diff-match-patch (checklines=false): 0.481ms
[[0,50000],[-1,1],[1,1],[0,50000]]
diff-sequences: 7.589ms
[[0,50000],[-1,1],[1,1],[0,50000]]
jest-diff (diff-sequences), no cleanup: 10.915ms
[[0,50000],[-1,1],[1,1],[0,50000]]
jest-diff (diff-sequences), with cleanup: 10.588ms
[[0,50000],[-1,1],[1,1],[0,50000]]
jsdiff: 22.664ms
with common prefix and suffix 2
[[0,50000],[-1,101],[1,101],[0,49900]]
diff-match-patch: 10.688ms
[[0,50000],[-1,101],[1,101],[0,49900]]
diff-match-patch (checklines=false): 2.619ms
[[0,50000],[-1,101],[1,101],[0,49900]]
diff-sequences: 12.687ms
[[0,50000],[-1,101],[1,101],[0,49900]]
jest-diff (diff-sequences), no cleanup: 11.055ms
[[0,50000],[-1,101],[1,101],[0,49900]]
jest-diff (diff-sequences), with cleanup: 4.356ms
[[0,50000],[-1,1],[1,101],[0,49900],[-1,100]]
jsdiff: 59.359ms
without common prefix or suffix 1
[[-1,1],[0,100000],[1,101]]
diff-match-patch: 632.863ms
[[-1,1],[0,100000],[1,101]]
diff-match-patch (checklines=false): 607.796ms
[[-1,1],[0,50000],[1,51],[0,50000],[1,50]]
diff-sequences: 12.366ms
[[-1,1],[0,50000],[1,51],[0,50000],[1,50]]
jest-diff (diff-sequences), no cleanup: 11.096ms
[[-1,1],[0,100000],[1,51],[1,50]]
jest-diff (diff-sequences), with cleanup: 1.029s
[[-1,1],[0,100000],[1,101]]
jsdiff: 13.163ms
without common prefix or suffix 2
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
diff-match-patch: 2.773s
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
diff-match-patch (checklines=false): 1.402s
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
diff-sequences: 22.216ms
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
jest-diff (diff-sequences), no cleanup: 20.546ms
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
jest-diff (diff-sequences), with cleanup: 19.222ms
[[-1,1],[0,50000],[-1,1],[1,101],[0,49900],[-1,100],[0,1],[1,1]]
jsdiff: 33.82ms
cards.json with different languages
67781 items
diff-match-patch: 1:04.122 (m:ss.mmm)
57514 items
diff-match-patch (checklines=false): 2:00.283 (m:ss.mmm)
67781 items
diff-sequences: 1:09.486 (m:ss.mmm)
67781 items
jest-diff (diff-sequences), no cleanup: 1:06.452 (m:ss.mmm)
52937 items
jest-diff (diff-sequences), with cleanup: 1:09.118 (m:ss.mmm)
...
(jsdiff cannot complete this test case in 20+ minutes)
Observations:
- `jsdiff` does not implement the O(D^2) -> O(D) space optimization, so it is
practically unusable in the last test case (reported as kpdecker/jsdiff#396).
`diff-match-patch` and `jest-diff` both implement the linear space
optimization, and have similar performance.
- `diff-match-patch` strips common prefix and suffix, which makes it faster
than `jest-diff` in "common prefix and suffix" test cases.
- Both `diff-match-patch` and `jest-diff` can take a long time on "cleanup".
See the "without common prefix or suffix 1" test case. We probably want
to only enable cleanup for smaller input.
- `diff-match-patch` performs visibly worse on the "without common prefix
or suffix 2" test case. From the code, it looks like `diff-match-patch` uses
some kind of heuristic that tries to speed things up but ends up slowing it
down.
- Without cleanup, `jest-diff` might output `[1,51],[1,50]` that can be
"obviously" merged to `[1,101]`. We might use a lightweight cleanup logic
for that.
- Reading the code, `diff-match-patch` turns lines into char codes, so it cannot
handle 65536 or more unique lines.
(https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L503)
Conclusions:
- `jest-diff` (and `diff-sequences` under the hood) is overall the best choice.
It has expected time and space complexities, and provides flexibility to skip
the potentially slow "cleanup", and can support >65k unique lines.
- `jest-diff` misses the "skip common prefix / suffix" optimization that
`diff-match-patch` has, which seems practically important (when editing a line
in the editor, all lines are common prefixes and suffixes except for the line
being edited). The optimization is not hard to implement. This diff
implements it.
- For certain use-cases (e.g. linelog) where the diff content is not needed
(at least for the left / "a" side), the caller should use `diff-sequences`
directly to avoid the overhead of preparing the diff content.
- `jest-diff`'s `diffLines` outputs one line per `Diff` but we want
one chunk per `Diff`.
- `jest-diff`'s `diffStringsRaw` produces one `Diff` per chunk, and because
[`string.slice` is O(1) in V8](https://stackoverflow.com/a/72545403), it has
acceptable performance. But mapping lines to chars would introduce the
65535 unique line limit undesirably.
Reviewed By: evangrayk
Differential Revision: D43857949
fbshipit-source-id: 9a3d85ebf10c9b82da8ab5cba4e14e519bbf264d
0 commit comments