@@ -908,79 +908,52 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
908
908
+ abcdefGhijkl
909
909
? ^ ^ ^
910
910
"""
911
- from operator import ge , gt
912
- # Don't synch up unless the lines have a similarity score of at
913
- # least cutoff; best_ratio tracks the best score seen so far.
914
- # Keep track of all index pairs achieving the best ratio and
915
- # deal with them here. Previously only the smallest pair was
916
- # handled here, and if there are many pairs with the best ratio,
917
- # recursion could grow very deep, and runtime cubic. See:
911
+ # Don't synch up unless the lines have a similarity score above
912
+ # cutoff. Previously only the smallest pair was handled here,
913
+ # and if there are many pairs with the best ratio, recursion
914
+ # could grow very deep, and runtime cubic. See:
918
915
# https://github.com/python/cpython/issues/119105
919
- best_ratio , cutoff = 0.74 , 0.75
916
+ #
917
+ # Later, more pathological cases prompted removing recursion
918
+ # entirely.
919
+ cutoff = 0.74999
920
920
cruncher = SequenceMatcher (self .charjunk )
921
- eqi , eqj = None , None # 1st indices of equal lines (if any)
922
- # List of index pairs achieving best_ratio. Strictly increasing
923
- # in both index positions.
924
- max_pairs = []
925
- maxi = - 1 # `i` index of last pair in max_pairs
926
-
927
- # search for the pair that matches best without being identical
928
- # (identical lines must be junk lines, & we don't want to synch
929
- # up on junk -- unless we have to)
930
921
crqr = cruncher .real_quick_ratio
931
922
cqr = cruncher .quick_ratio
932
923
cr = cruncher .ratio
924
+
925
+ WINDOW = 10
926
+ best_i = best_j = None
927
+ dump_i , dump_j = alo , blo # smallest indices not yet resolved
933
928
for j in range (blo , bhi ):
934
- bj = b [j ]
935
- cruncher .set_seq2 (bj )
936
- # Find new best, if possible. Else search for the smallest i
937
- # (if any) > maxi that equals the best ratio
938
- search_equal = True
939
- for i in range (alo , ahi ):
940
- ai = a [i ]
941
- if ai == bj :
942
- if eqi is None :
943
- eqi , eqj = i , j
944
- continue
945
- cruncher .set_seq1 (ai )
946
- # computing similarity is expensive, so use the quick
947
- # upper bounds first -- have seen this speed up messy
948
- # compares by a factor of 3.
949
- cmp = ge if search_equal and i > maxi else gt
950
- if (cmp (crqr (), best_ratio )
951
- and cmp (cqr (), best_ratio )
952
- and cmp ((ratio := cr ()), best_ratio )):
953
- if ratio > best_ratio :
954
- best_ratio = ratio
955
- max_pairs .clear ()
956
- else :
957
- assert best_ratio == ratio and search_equal
958
- assert i > maxi
959
- max_pairs .append ((i , j ))
960
- maxi = i
961
- search_equal = False
962
- if best_ratio < cutoff :
963
- assert not max_pairs
964
- # no non-identical "pretty close" pair
965
- if eqi is None :
966
- # no identical pair either -- treat it as a straight replace
967
- yield from self ._plain_replace (a , alo , ahi , b , blo , bhi )
968
- return
969
- # no close pair, but an identical pair -- synch up on that
970
- max_pairs = [(eqi , eqj )]
971
- else :
972
- # there's a close pair, so forget the identical pair (if any)
973
- assert max_pairs
974
- eqi = None
975
-
976
- last_i , last_j = alo , blo
977
- for this_i , this_j in max_pairs :
978
- # pump out diffs from before the synch point
979
- yield from self ._fancy_helper (a , last_i , this_i ,
980
- b , last_j , this_j )
929
+ cruncher .set_seq2 (b [j ])
930
+ # Search the corresponding i's within WINDOW for the highest
931
+ # ratio greater than `cutoff`.
932
+ aequiv = alo + (j - blo )
933
+ arange = range (max (aequiv - WINDOW , dump_i ),
934
+ min (aequiv + WINDOW + 1 , ahi ))
935
+ if not arange : # likely exit if `a` is shorter than `b`
936
+ break
937
+ best_ratio = cutoff
938
+ for i in arange :
939
+ cruncher .set_seq1 (a [i ])
940
+ # Ordering by cheapest to most expensive ratio is very
941
+ # valuable, most often getting out early.
942
+ if (crqr () > best_ratio
943
+ and cqr () > best_ratio
944
+ and cr () > best_ratio ):
945
+ best_i , best_j , best_ratio = i , j , cr ()
946
+
947
+ if best_i is None :
948
+ # found nothing to synch on yet - move to next j
949
+ continue
950
+
951
+ # pump out straight replace from before this synch pair
952
+ yield from self ._fancy_helper (a , dump_i , best_i ,
953
+ b , dump_j , best_j )
981
954
# do intraline marking on the synch pair
982
- aelt , belt = a [this_i ], b [this_j ]
983
- if eqi is None :
955
+ aelt , belt = a [best_i ], b [best_j ]
956
+ if aelt != belt :
984
957
# pump out a '-', '?', '+', '?' quad for the synched lines
985
958
atags = btags = ""
986
959
cruncher .set_seqs (aelt , belt )
@@ -1002,17 +975,18 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
1002
975
else :
1003
976
# the synch pair is identical
1004
977
yield ' ' + aelt
1005
- last_i , last_j = this_i + 1 , this_j + 1
978
+ dump_i , dump_j = best_i + 1 , best_j + 1
979
+ best_i = best_j = None
1006
980
1007
- # pump out diffs from after the last synch point
1008
- yield from self ._fancy_helper (a , last_i , ahi ,
1009
- b , last_j , bhi )
981
+ # pump out straight replace from after the last synch pair
982
+ yield from self ._fancy_helper (a , dump_i , ahi ,
983
+ b , dump_j , bhi )
1010
984
1011
985
def _fancy_helper (self , a , alo , ahi , b , blo , bhi ):
1012
986
g = []
1013
987
if alo < ahi :
1014
988
if blo < bhi :
1015
- g = self ._fancy_replace (a , alo , ahi , b , blo , bhi )
989
+ g = self ._plain_replace (a , alo , ahi , b , blo , bhi )
1016
990
else :
1017
991
g = self ._dump ('-' , a , alo , ahi )
1018
992
elif blo < bhi :
0 commit comments