Skip to content

Commit de19694

Browse files
authored
gh-119105: Differ.compare is too slow [for degenerate cases] (#119492)
``_fancy_replace()`` is no longer recursive, and a single call does a worst-case linear number of ratio() computations instead of quadratic. This renders toothless a universe of pathological cases. Some inputs may produce different output, but that's rare, and I didn't find a case where the final diff appeared to be of materially worse quality. To the contrary, by refusing to even consider synching on lines "far apart", there was more easy-to-digest locality in the output.
1 parent 84be524 commit de19694

File tree

2 files changed

+46
-71
lines changed

2 files changed

+46
-71
lines changed

Lib/difflib.py

Lines changed: 45 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -908,79 +908,52 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
908908
+ abcdefGhijkl
909909
? ^ ^ ^
910910
"""
911-
from operator import ge, gt
912-
# Don't synch up unless the lines have a similarity score of at
913-
# least cutoff; best_ratio tracks the best score seen so far.
914-
# Keep track of all index pairs achieving the best ratio and
915-
# deal with them here. Previously only the smallest pair was
916-
# handled here, and if there are many pairs with the best ratio,
917-
# recursion could grow very deep, and runtime cubic. See:
911+
# Don't synch up unless the lines have a similarity score above
912+
# cutoff. Previously only the smallest pair was handled here,
913+
# and if there are many pairs with the best ratio, recursion
914+
# could grow very deep, and runtime cubic. See:
918915
# https://github.com/python/cpython/issues/119105
919-
best_ratio, cutoff = 0.74, 0.75
916+
#
917+
# Later, more pathological cases prompted removing recursion
918+
# entirely.
919+
cutoff = 0.74999
920920
cruncher = SequenceMatcher(self.charjunk)
921-
eqi, eqj = None, None # 1st indices of equal lines (if any)
922-
# List of index pairs achieving best_ratio. Strictly increasing
923-
# in both index positions.
924-
max_pairs = []
925-
maxi = -1 # `i` index of last pair in max_pairs
926-
927-
# search for the pair that matches best without being identical
928-
# (identical lines must be junk lines, & we don't want to synch
929-
# up on junk -- unless we have to)
930921
crqr = cruncher.real_quick_ratio
931922
cqr = cruncher.quick_ratio
932923
cr = cruncher.ratio
924+
925+
WINDOW = 10
926+
best_i = best_j = None
927+
dump_i, dump_j = alo, blo # smallest indices not yet resolved
933928
for j in range(blo, bhi):
934-
bj = b[j]
935-
cruncher.set_seq2(bj)
936-
# Find new best, if possible. Else search for the smallest i
937-
# (if any) > maxi that equals the best ratio
938-
search_equal = True
939-
for i in range(alo, ahi):
940-
ai = a[i]
941-
if ai == bj:
942-
if eqi is None:
943-
eqi, eqj = i, j
944-
continue
945-
cruncher.set_seq1(ai)
946-
# computing similarity is expensive, so use the quick
947-
# upper bounds first -- have seen this speed up messy
948-
# compares by a factor of 3.
949-
cmp = ge if search_equal and i > maxi else gt
950-
if (cmp(crqr(), best_ratio)
951-
and cmp(cqr(), best_ratio)
952-
and cmp((ratio := cr()), best_ratio)):
953-
if ratio > best_ratio:
954-
best_ratio = ratio
955-
max_pairs.clear()
956-
else:
957-
assert best_ratio == ratio and search_equal
958-
assert i > maxi
959-
max_pairs.append((i, j))
960-
maxi = i
961-
search_equal = False
962-
if best_ratio < cutoff:
963-
assert not max_pairs
964-
# no non-identical "pretty close" pair
965-
if eqi is None:
966-
# no identical pair either -- treat it as a straight replace
967-
yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
968-
return
969-
# no close pair, but an identical pair -- synch up on that
970-
max_pairs = [(eqi, eqj)]
971-
else:
972-
# there's a close pair, so forget the identical pair (if any)
973-
assert max_pairs
974-
eqi = None
975-
976-
last_i, last_j = alo, blo
977-
for this_i, this_j in max_pairs:
978-
# pump out diffs from before the synch point
979-
yield from self._fancy_helper(a, last_i, this_i,
980-
b, last_j, this_j)
929+
cruncher.set_seq2(b[j])
930+
# Search the corresponding i's within WINDOW for the highest
931+
# ratio greater than `cutoff`.
932+
aequiv = alo + (j - blo)
933+
arange = range(max(aequiv - WINDOW, dump_i),
934+
min(aequiv + WINDOW + 1, ahi))
935+
if not arange: # likely exit if `a` is shorter than `b`
936+
break
937+
best_ratio = cutoff
938+
for i in arange:
939+
cruncher.set_seq1(a[i])
940+
# Ordering by cheapest to most expensive ratio is very
941+
# valuable, most often getting out early.
942+
if (crqr() > best_ratio
943+
and cqr() > best_ratio
944+
and cr() > best_ratio):
945+
best_i, best_j, best_ratio = i, j, cr()
946+
947+
if best_i is None:
948+
# found nothing to synch on yet - move to next j
949+
continue
950+
951+
# pump out straight replace from before this synch pair
952+
yield from self._fancy_helper(a, dump_i, best_i,
953+
b, dump_j, best_j)
981954
# do intraline marking on the synch pair
982-
aelt, belt = a[this_i], b[this_j]
983-
if eqi is None:
955+
aelt, belt = a[best_i], b[best_j]
956+
if aelt != belt:
984957
# pump out a '-', '?', '+', '?' quad for the synched lines
985958
atags = btags = ""
986959
cruncher.set_seqs(aelt, belt)
@@ -1002,17 +975,18 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
1002975
else:
1003976
# the synch pair is identical
1004977
yield ' ' + aelt
1005-
last_i, last_j = this_i + 1, this_j + 1
978+
dump_i, dump_j = best_i + 1, best_j + 1
979+
best_i = best_j = None
1006980

1007-
# pump out diffs from after the last synch point
1008-
yield from self._fancy_helper(a, last_i, ahi,
1009-
b, last_j, bhi)
981+
# pump out straight replace from after the last synch pair
982+
yield from self._fancy_helper(a, dump_i, ahi,
983+
b, dump_j, bhi)
1010984

1011985
def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
1012986
g = []
1013987
if alo < ahi:
1014988
if blo < bhi:
1015-
g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
989+
g = self._plain_replace(a, alo, ahi, b, blo, bhi)
1016990
else:
1017991
g = self._dump('-', a, alo, ahi)
1018992
elif blo < bhi:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
``difflib``'s ``Differ.compare()`` (and so also ``ndiff``) can no longer be provoked into cubic-time behavior, or into unbounded recursion, and should generally be faster in ordinary cases too. Results may change in some cases, although that should be rare. Correctness of diffs is not affected. Some similar lines far apart may be reported as deleting one and adding the other, where before they were displayed on adjacent output lines with markup showing the intraline differences.

0 commit comments

Comments
 (0)