diff --git a/Lib/difflib.py b/Lib/difflib.py index 7c7e233b013a76..9f419e5351a2be 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -19,6 +19,11 @@ Class SequenceMatcher: A flexible class for comparing pairs of sequences of any type. +Class ExactSequenceMatcher: + Class for comparing pairs of sequences that uses SuffixAutomaton. + It does not have autojunk option and always calculates exact result. + Additionally, it has balancing "knob" to improve quality of diffs. + Class Differ: For producing human-readable deltas from sequences of lines of text. @@ -28,21 +33,448 @@ __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', - 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match'] + 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match', + 'ExactSequenceMatcher'] from _colorize import can_colorize, get_theme from heapq import nlargest as _nlargest -from collections import namedtuple as _namedtuple +from collections import Counter as _Counter, namedtuple as _namedtuple from types import GenericAlias +from sys import maxsize as _MAXSIZE + + +######################################################################## +### Utilities +######################################################################## + + +def _adjust_indices(size, start, stop): + if start < 0: + raise ValueError('Starting index can not be negative') + if stop is None or stop > size: + stop = size + return start, stop + + +def _collapse_adjacent_blocks(blocks): + """Collapses adjacent blocks + """ + i1 = j1 = k1 = 0 + for i2, j2, k2 in blocks: + # Is this block adjacent to i1, j1, k1? + if i1 + k1 == i2 and j1 + k1 == j2: + # Yes, so collapse them -- this just increases the length of + # the first block by the length of the second, and the first + # block so lengthened remains the block to compare against. + k1 += k2 + else: + # Not adjacent. 
Remember the first block (k1==0 means it's + # the dummy we started with), and make the second block the + # new block to compare against. + if k1: + yield (i1, j1, k1) + i1, j1, k1 = i2, j2, k2 + if k1: + yield (i1, j1, k1) + + +def _expand_block_to_junk(junk, block, a, b, alo, ahi, blo, bhi, *, inverse=False): + """ + Expands block for consecutive matches at both sides if: + a) characters match + b) matching characters are in junk + If inverse == True, (b) condition is inverted to: "are not in junk" + """ + i, j, k = block + while i > alo and j > blo: + el2 = b[j - 1] + ok = el2 not in junk if inverse else el2 in junk + if not ok or a[i - 1] != el2: + break + i -= 1 + j -= 1 + k += 1 + while i + k < ahi and j + k < bhi: + el2 = b[j + k] + ok = el2 not in junk if inverse else el2 in junk + if not ok or a[i + k] != el2: + break + k += 1 + return (i, j, k) + + +######################################################################## +### SequenceMatcherBase +######################################################################## + Match = _namedtuple('Match', 'a b size') + def _calculate_ratio(matches, length): if length: return 2.0 * matches / length return 1.0 -class SequenceMatcher: + +class SequenceMatcherBase: + def __init__(self, isjunk=None, a='', b=''): + """ + Optional arg isjunk is None (the default), or a one-argument + function that takes a sequence element and returns true iff the + element is junk. None is equivalent to passing "lambda x: 0", i.e. + no elements are considered to be junk. For example, pass + lambda x: x in " \\t" + if you're comparing lines as sequences of characters, and don't + want to synch up on blanks or hard tabs. + + Optional arg a is the first of two sequences to be compared. By + default, an empty string. The elements of a must be hashable. See + also .set_seqs() and .set_seq1(). + + Optional arg b is the second of two sequences to be compared. By + default, an empty string. The elements of b must be hashable. 
See + also .set_seqs() and .set_seq2(). + + Members: + a : Sequence + first sequence + b : Sequence + second sequence; differences are computed as "what do + we need to do to 'a' to change it into 'b'?" + isjunk : Callable | None + a user-supplied function taking a sequence element and + returning true iff the element is "junk" + "junk" elements are unmatchable elements + matching_blocks : list + a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; + ascending & non-overlapping in i and in j; terminated by + a dummy (len(a), len(b), 0) sentinel + opcodes : list + a list of (tag, i1, i2, j1, j2) tuples, where tag is + one of + 'replace' a[i1:i2] should be replaced by b[j1:j2] + 'delete' a[i1:i2] should be deleted + 'insert' b[j1:j2] should be inserted + 'equal' a[i1:i2] == b[j1:j2] + """ + self.isjunk = isjunk + self.a = None + self.b = None + self.set_seqs(a, b) + + def set_seqs(self, a, b): + """Set the two sequences to be compared.""" + self.set_seq1(a) + self.set_seq2(b) + + def set_seq1(self, a): + """Set the first sequence to be compared. + + The second sequence to be compared is not changed. + + >>> s = SequenceMatcher(None, "abcd", "bcde") + >>> s.ratio() + 0.75 + >>> s.set_seq1("bcde") + >>> s.ratio() + 1.0 + >>> + + SequenceMatcher computes and caches detailed information about the + second sequence, so if you want to compare one sequence S against + many sequences, use .set_seq2(S) once and call .set_seq1(x) + repeatedly for each of the other sequences. + + See also set_seqs() and set_seq2(). + """ + + if a is self.a: + return + self.a = a + self.matching_blocks = self.opcodes = None + + def set_seq2(self, b): + """Set the second sequence to be compared. + + The first sequence to be compared is not changed. 
+ + >>> s = SequenceMatcher(None, "abcd", "bcde") + >>> s.ratio() + 0.75 + >>> s.set_seq2("abcd") + >>> s.ratio() + 1.0 + >>> + + SequenceMatcher computes and caches detailed information about the + second sequence, so if you want to compare one sequence S against + many sequences, use .set_seq2(S) once and call .set_seq1(x) + repeatedly for each of the other sequences. + + See also set_seqs() and set_seq1(). + """ + + if b is self.b: + return + self.b = b + self.matching_blocks = self.opcodes = None + self.fullbcount = None + self._prepare_seq2() + + def _prepare_seq2(self): + """Preparation function that is called at the end of `set_seq2`. + It is usually used to: + a) Process junk + b) Pre-compile eligible parts of algorithm + """ + pass + + # Abstract Methods ---------------- + # --------------------------------- + + def _get_matching_blocks(self): + """Return list of triples describing matching subsequences. + Implement this to return list[tuple[int, int, int]] and + let `get_matching_blocks` take care of maintenance + """ + raise NotImplementedError + + # Implemented Methods ------------- + # --------------------------------- + + def get_matching_blocks(self): + """Return list of triples describing matching subsequences. + + Each triple is of the form (i, j, n), and means that + a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in + i and in j. New in Python 2.5, it's also guaranteed that if + (i, j, n) and (i', j', n') are adjacent triples in the list, and + the second is not the last triple in the list, then i+n != i' or + j+n != j'. IOW, adjacent triples never describe adjacent equal + blocks. + + The last triple is a dummy, (len(a), len(b), 0), and is the only + triple with n==0. + + When `_get_matching_blocks` is implemented, this method takes care of: + 1. Appending last dummy triple + 2. Collapsing adjacent blocks + 3. 
Caching + """ + blocks = self.matching_blocks + if blocks is None: + blocks = self._get_matching_blocks() + blocks = _collapse_adjacent_blocks(blocks) + blocks = list(map(Match._make, blocks)) + # Append dummy at the end + blocks.append(Match(len(self.a), len(self.b), 0)) + # Cache + self.matching_blocks = blocks + return blocks + + def get_opcodes(self): + """Return list of 5-tuples describing how to turn a into b. + + Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple + has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the + tuple preceding it, and likewise for j1 == the previous j2. + + The tags are strings, with these meanings: + + 'replace': a[i1:i2] should be replaced by b[j1:j2] + 'delete': a[i1:i2] should be deleted. + Note that j1==j2 in this case. + 'insert': b[j1:j2] should be inserted at a[i1:i1]. + Note that i1==i2 in this case. + 'equal': a[i1:i2] == b[j1:j2] + + >>> a = "qabxcd" + >>> b = "abycdf" + >>> s = SequenceMatcher(None, a, b) + >>> for tag, i1, i2, j1, j2 in s.get_opcodes(): + ... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % + ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))) + delete a[0:1] (q) b[0:0] () + equal a[1:3] (ab) b[0:2] (ab) + replace a[3:4] (x) b[2:3] (y) + equal a[4:6] (cd) b[3:5] (cd) + insert a[6:6] () b[5:6] (f) + """ + + if self.opcodes is not None: + return self.opcodes + i = j = 0 + self.opcodes = answer = [] + for ai, bj, size in self.get_matching_blocks(): + # invariant: we've pumped out correct diffs to change + # a[:i] into b[:j], and the next matching block is + # a[ai:ai+size] == b[bj:bj+size]. 
So we need to pump + # out a diff to change a[i:ai] into b[j:bj], pump out + # the matching block, and move (i,j) beyond the match + tag = '' + if i < ai and j < bj: + tag = 'replace' + elif i < ai: + tag = 'delete' + elif j < bj: + tag = 'insert' + if tag: + answer.append( (tag, i, ai, j, bj) ) + i, j = ai+size, bj+size + # the list of matching blocks is terminated by a + # sentinel with size 0 + if size: + answer.append( ('equal', ai, i, bj, j) ) + return answer + + def get_grouped_opcodes(self, n=3): + """ Isolate change clusters by eliminating ranges with no changes. + + Return a generator of groups with up to n lines of context. + Each group is in the same format as returned by get_opcodes(). + + >>> from pprint import pprint + >>> a = list(map(str, range(1,40))) + >>> b = a[:] + >>> b[8:8] = ['i'] # Make an insertion + >>> b[20] += 'x' # Make a replacement + >>> b[23:28] = [] # Make a deletion + >>> b[30] += 'y' # Make another replacement + >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes())) + [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)], + [('equal', 16, 19, 17, 20), + ('replace', 19, 20, 20, 21), + ('equal', 20, 22, 21, 23), + ('delete', 22, 27, 23, 23), + ('equal', 27, 30, 23, 26)], + [('equal', 31, 34, 27, 30), + ('replace', 34, 35, 30, 31), + ('equal', 35, 38, 31, 34)]] + """ + + codes = self.get_opcodes() + if not codes: + codes = [("equal", 0, 1, 0, 1)] + # Fixup leading and trailing groups if they show no changes. + if codes[0][0] == 'equal': + tag, i1, i2, j1, j2 = codes[0] + codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 + if codes[-1][0] == 'equal': + tag, i1, i2, j1, j2 = codes[-1] + codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) + + nn = n + n + group = [] + for tag, i1, i2, j1, j2 in codes: + # End the current group and start a new one whenever + # there is a large range with no changes. 
+ if tag == 'equal' and i2-i1 > nn: + group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) + yield group + group = [] + i1, j1 = max(i1, i2-n), max(j1, j2-n) + group.append((tag, i1, i2, j1 ,j2)) + if group and not (len(group)==1 and group[0][0] == 'equal'): + yield group + + def ratio(self): + """Return a measure of the sequences' similarity (float in [0,1]). + + Where T is the total number of elements in both sequences, and + M is the number of matches, this is 2.0*M / T. + Note that this is 1 if the sequences are identical, and 0 if + they have nothing in common. + + .ratio() is expensive to compute if you haven't already computed + .get_matching_blocks() or .get_opcodes(), in which case you may + want to try .quick_ratio() or .real_quick_ratio() first to get an + upper bound. + + >>> s = SequenceMatcher(None, "abcd", "bcde") + >>> s.ratio() + 0.75 + >>> s.quick_ratio() + 0.75 + >>> s.real_quick_ratio() + 1.0 + """ + + matches = sum(triple[-1] for triple in self.get_matching_blocks()) + return _calculate_ratio(matches, len(self.a) + len(self.b)) + + def quick_ratio(self): + """Return an upper bound on ratio() relatively quickly. + + This isn't defined beyond that it is an upper bound on .ratio(), and + is faster to compute. + """ + + # viewing a and b as multisets, set matches to the cardinality + # of their intersection; this counts the number of matches + # without regard to order, so is clearly an upper bound + if self.fullbcount is None: + self.fullbcount = fullbcount = {} + for elt in self.b: + fullbcount[elt] = fullbcount.get(elt, 0) + 1 + fullbcount = self.fullbcount + # avail[x] is the number of times x appears in 'b' less the + # number of times we've seen it in 'a' so far ... 
kinda + avail = {} + matches = 0 + for elt in self.a: + if elt in avail: + numb = avail[elt] + else: + numb = fullbcount.get(elt, 0) + avail[elt] = numb - 1 + if numb > 0: + matches += 1 + return _calculate_ratio(matches, len(self.a) + len(self.b)) + + def real_quick_ratio(self): + """Return an upper bound on ratio() very quickly. + + This isn't defined beyond that it is an upper bound on .ratio(), and + is faster to compute than either .ratio() or .quick_ratio(). + """ + + la, lb = len(self.a), len(self.b) + # can't have more matches than the number of elements in the + # shorter sequence + return _calculate_ratio(min(la, lb), la + lb) + + def ratio_if_above(self, cutoff, equal_ok=False): + """Returns ratio if it is higher than cutoff. + + Otherwise, returns None. + + Note, this is the main ratio function that is + used by applications in this module. + """ + # Ordering by cheapest to most expensive ratio is very + # valuable, most often getting out early. + rqr = self.real_quick_ratio() + if equal_ok: + if rqr >= cutoff and self.quick_ratio() >= cutoff: + ratio = self.ratio() + if ratio >= cutoff: + return ratio + else: + if rqr > cutoff and self.quick_ratio() > cutoff: + ratio = self.ratio() + if ratio > cutoff: + return ratio + + __class_getitem__ = classmethod(GenericAlias) + + +######################################################################## +### SequenceMatcher +######################################################################## + + +class SequenceMatcher(SequenceMatcherBase): """ SequenceMatcher is a flexible class for comparing pairs of sequences of @@ -121,33 +553,12 @@ class SequenceMatcher: def __init__(self, isjunk=None, a='', b='', autojunk=True): """Construct a SequenceMatcher. - Optional arg isjunk is None (the default), or a one-argument - function that takes a sequence element and returns true iff the - element is junk. None is equivalent to passing "lambda x: 0", i.e. - no elements are considered to be junk. 
For example, pass - lambda x: x in " \\t" - if you're comparing lines as sequences of characters, and don't - want to synch up on blanks or hard tabs. - - Optional arg a is the first of two sequences to be compared. By - default, an empty string. The elements of a must be hashable. See - also .set_seqs() and .set_seq1(). - - Optional arg b is the second of two sequences to be compared. By - default, an empty string. The elements of b must be hashable. See - also .set_seqs() and .set_seq2(). - Optional arg autojunk should be set to False to disable the "automatic junk heuristic" that treats popular elements as junk (see module documentation for more information). """ - # Members: - # a - # first sequence - # b - # second sequence; differences are computed as "what do - # we need to do to 'a' to change it into 'b'?" + # Members specific to Sequence Matcher: # b2j # for x in b, b2j[x] is a list of the indices (into b) # at which x appears; junk and popular elements do not appear @@ -155,17 +566,6 @@ def __init__(self, isjunk=None, a='', b='', autojunk=True): # for x in b, fullbcount[x] == the number of times x # appears in b; only materialized if really needed (used # only for computing quick_ratio()) - # matching_blocks - # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; - # ascending & non-overlapping in i and in j; terminated by - # a dummy (len(a), len(b), 0) sentinel - # opcodes - # a list of (tag, i1, i2, j1, j2) tuples, where tag is - # one of - # 'replace' a[i1:i2] should be replaced by b[j1:j2] - # 'delete' a[i1:i2] should be deleted - # 'insert' b[j1:j2] should be inserted - # 'equal' a[i1:i2] == b[j1:j2] # isjunk # a user-supplied function taking a sequence element and # returning true iff the element is "junk" -- this has @@ -176,77 +576,8 @@ def __init__(self, isjunk=None, a='', b='', autojunk=True): # the items in b for which isjunk is True. # bpopular # nonjunk items in b treated as junk by the heuristic (if used). 
- - self.isjunk = isjunk - self.a = self.b = None self.autojunk = autojunk - self.set_seqs(a, b) - - def set_seqs(self, a, b): - """Set the two sequences to be compared. - - >>> s = SequenceMatcher() - >>> s.set_seqs("abcd", "bcde") - >>> s.ratio() - 0.75 - """ - - self.set_seq1(a) - self.set_seq2(b) - - def set_seq1(self, a): - """Set the first sequence to be compared. - - The second sequence to be compared is not changed. - - >>> s = SequenceMatcher(None, "abcd", "bcde") - >>> s.ratio() - 0.75 - >>> s.set_seq1("bcde") - >>> s.ratio() - 1.0 - >>> - - SequenceMatcher computes and caches detailed information about the - second sequence, so if you want to compare one sequence S against - many sequences, use .set_seq2(S) once and call .set_seq1(x) - repeatedly for each of the other sequences. - - See also set_seqs() and set_seq2(). - """ - - if a is self.a: - return - self.a = a - self.matching_blocks = self.opcodes = None - - def set_seq2(self, b): - """Set the second sequence to be compared. - - The first sequence to be compared is not changed. - - >>> s = SequenceMatcher(None, "abcd", "bcde") - >>> s.ratio() - 0.75 - >>> s.set_seq2("abcd") - >>> s.ratio() - 1.0 - >>> - - SequenceMatcher computes and caches detailed information about the - second sequence, so if you want to compare one sequence S against - many sequences, use .set_seq2(S) once and call .set_seq1(x) - repeatedly for each of the other sequences. - - See also set_seqs() and set_seq1(). 
- """ - - if b is self.b: - return - self.b = b - self.matching_blocks = self.opcodes = None - self.fullbcount = None - self.__chain_b() + super().__init__(isjunk, a, b) # For each element x in b, set b2j[x] to a list of the indices in # b where x appears; the indices are in increasing order; note that @@ -264,6 +595,9 @@ def set_seq2(self, b): # kinds of matches, it's best to call set_seq2 once, then set_seq1 # repeatedly + def _prepare_seq2(self): + self.__chain_b() + def __chain_b(self): # Because isjunk is a user-defined (not C) function, and we test # for junk a LOT, it's important to minimize the number of calls. @@ -361,7 +695,7 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): # Windiff ends up at the same place as diff, but by pairing up # the unique 'b's and then matching the first two 'a's. - a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ + a, b, b2j, bjunk = self.a, self.b, self.b2j, self.bjunk if ahi is None: ahi = len(a) if bhi is None: @@ -381,290 +715,86 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): # a[i] matches b[j] if j < blo: continue - if j >= bhi: - break - k = newj2len[j] = j2lenget(j-1, 0) + 1 - if k > bestsize: - besti, bestj, bestsize = i-k+1, j-k+1, k - j2len = newj2len - - # Extend the best by non-junk elements on each end. In particular, - # "popular" non-junk elements aren't in b2j, which greatly speeds - # the inner loop above, but also means "the best" match so far - # doesn't contain any junk *or* popular non-junk elements. 
- while besti > alo and bestj > blo and \ - not isbjunk(b[bestj-1]) and \ - a[besti-1] == b[bestj-1]: - besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 - while besti+bestsize < ahi and bestj+bestsize < bhi and \ - not isbjunk(b[bestj+bestsize]) and \ - a[besti+bestsize] == b[bestj+bestsize]: - bestsize += 1 - - # Now that we have a wholly interesting match (albeit possibly - # empty!), we may as well suck up the matching junk on each - # side of it too. Can't think of a good reason not to, and it - # saves post-processing the (possibly considerable) expense of - # figuring out what to do with it. In the case of an empty - # interesting match, this is clearly the right thing to do, - # because no other kind of match is possible in the regions. - while besti > alo and bestj > blo and \ - isbjunk(b[bestj-1]) and \ - a[besti-1] == b[bestj-1]: - besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 - while besti+bestsize < ahi and bestj+bestsize < bhi and \ - isbjunk(b[bestj+bestsize]) and \ - a[besti+bestsize] == b[bestj+bestsize]: - bestsize = bestsize + 1 - - return Match(besti, bestj, bestsize) - - def get_matching_blocks(self): - """Return list of triples describing matching subsequences. - - Each triple is of the form (i, j, n), and means that - a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in - i and in j. New in Python 2.5, it's also guaranteed that if - (i, j, n) and (i', j', n') are adjacent triples in the list, and - the second is not the last triple in the list, then i+n != i' or - j+n != j'. IOW, adjacent triples never describe adjacent equal - blocks. - - The last triple is a dummy, (len(a), len(b), 0), and is the only - triple with n==0. 
- - >>> s = SequenceMatcher(None, "abxcd", "abcd") - >>> list(s.get_matching_blocks()) - [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] - """ - - if self.matching_blocks is not None: - return self.matching_blocks - la, lb = len(self.a), len(self.b) - - # This is most naturally expressed as a recursive algorithm, but - # at least one user bumped into extreme use cases that exceeded - # the recursion limit on their box. So, now we maintain a list - # ('queue`) of blocks we still need to look at, and append partial - # results to `matching_blocks` in a loop; the matches are sorted - # at the end. - queue = [(0, la, 0, lb)] - matching_blocks = [] - while queue: - alo, ahi, blo, bhi = queue.pop() - i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) - # a[alo:i] vs b[blo:j] unknown - # a[i:i+k] same as b[j:j+k] - # a[i+k:ahi] vs b[j+k:bhi] unknown - if k: # if k is 0, there was no matching block - matching_blocks.append(x) - if alo < i and blo < j: - queue.append((alo, i, blo, j)) - if i+k < ahi and j+k < bhi: - queue.append((i+k, ahi, j+k, bhi)) - matching_blocks.sort() - - # It's possible that we have adjacent equal blocks in the - # matching_blocks list now. Starting with 2.5, this code was added - # to collapse them. - i1 = j1 = k1 = 0 - non_adjacent = [] - for i2, j2, k2 in matching_blocks: - # Is this block adjacent to i1, j1, k1? - if i1 + k1 == i2 and j1 + k1 == j2: - # Yes, so collapse them -- this just increases the length of - # the first block by the length of the second, and the first - # block so lengthened remains the block to compare against. - k1 += k2 - else: - # Not adjacent. Remember the first block (k1==0 means it's - # the dummy we started with), and make the second block the - # new block to compare against. 
- if k1: - non_adjacent.append((i1, j1, k1)) - i1, j1, k1 = i2, j2, k2 - if k1: - non_adjacent.append((i1, j1, k1)) - - non_adjacent.append( (la, lb, 0) ) - self.matching_blocks = list(map(Match._make, non_adjacent)) - return self.matching_blocks - - def get_opcodes(self): - """Return list of 5-tuples describing how to turn a into b. - - Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple - has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the - tuple preceding it, and likewise for j1 == the previous j2. - - The tags are strings, with these meanings: - - 'replace': a[i1:i2] should be replaced by b[j1:j2] - 'delete': a[i1:i2] should be deleted. - Note that j1==j2 in this case. - 'insert': b[j1:j2] should be inserted at a[i1:i1]. - Note that i1==i2 in this case. - 'equal': a[i1:i2] == b[j1:j2] - - >>> a = "qabxcd" - >>> b = "abycdf" - >>> s = SequenceMatcher(None, a, b) - >>> for tag, i1, i2, j1, j2 in s.get_opcodes(): - ... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % - ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))) - delete a[0:1] (q) b[0:0] () - equal a[1:3] (ab) b[0:2] (ab) - replace a[3:4] (x) b[2:3] (y) - equal a[4:6] (cd) b[3:5] (cd) - insert a[6:6] () b[5:6] (f) - """ - - if self.opcodes is not None: - return self.opcodes - i = j = 0 - self.opcodes = answer = [] - for ai, bj, size in self.get_matching_blocks(): - # invariant: we've pumped out correct diffs to change - # a[:i] into b[:j], and the next matching block is - # a[ai:ai+size] == b[bj:bj+size]. 
So we need to pump - # out a diff to change a[i:ai] into b[j:bj], pump out - # the matching block, and move (i,j) beyond the match - tag = '' - if i < ai and j < bj: - tag = 'replace' - elif i < ai: - tag = 'delete' - elif j < bj: - tag = 'insert' - if tag: - answer.append( (tag, i, ai, j, bj) ) - i, j = ai+size, bj+size - # the list of matching blocks is terminated by a - # sentinel with size 0 - if size: - answer.append( ('equal', ai, i, bj, j) ) - return answer - - def get_grouped_opcodes(self, n=3): - """ Isolate change clusters by eliminating ranges with no changes. - - Return a generator of groups with up to n lines of context. - Each group is in the same format as returned by get_opcodes(). - - >>> from pprint import pprint - >>> a = list(map(str, range(1,40))) - >>> b = a[:] - >>> b[8:8] = ['i'] # Make an insertion - >>> b[20] += 'x' # Make a replacement - >>> b[23:28] = [] # Make a deletion - >>> b[30] += 'y' # Make another replacement - >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes())) - [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)], - [('equal', 16, 19, 17, 20), - ('replace', 19, 20, 20, 21), - ('equal', 20, 22, 21, 23), - ('delete', 22, 27, 23, 23), - ('equal', 27, 30, 23, 26)], - [('equal', 31, 34, 27, 30), - ('replace', 34, 35, 30, 31), - ('equal', 35, 38, 31, 34)]] - """ - - codes = self.get_opcodes() - if not codes: - codes = [("equal", 0, 1, 0, 1)] - # Fixup leading and trailing groups if they show no changes. - if codes[0][0] == 'equal': - tag, i1, i2, j1, j2 = codes[0] - codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 - if codes[-1][0] == 'equal': - tag, i1, i2, j1, j2 = codes[-1] - codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) - - nn = n + n - group = [] - for tag, i1, i2, j1, j2 in codes: - # End the current group and start a new one whenever - # there is a large range with no changes. 
- if tag == 'equal' and i2-i1 > nn: - group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) - yield group - group = [] - i1, j1 = max(i1, i2-n), max(j1, j2-n) - group.append((tag, i1, i2, j1 ,j2)) - if group and not (len(group)==1 and group[0][0] == 'equal'): - yield group - - def ratio(self): - """Return a measure of the sequences' similarity (float in [0,1]). - - Where T is the total number of elements in both sequences, and - M is the number of matches, this is 2.0*M / T. - Note that this is 1 if the sequences are identical, and 0 if - they have nothing in common. - - .ratio() is expensive to compute if you haven't already computed - .get_matching_blocks() or .get_opcodes(), in which case you may - want to try .quick_ratio() or .real_quick_ratio() first to get an - upper bound. + if j >= bhi: + break + k = newj2len[j] = j2lenget(j-1, 0) + 1 + if k > bestsize: + besti, bestj, bestsize = i-k+1, j-k+1, k + j2len = newj2len - >>> s = SequenceMatcher(None, "abcd", "bcde") - >>> s.ratio() - 0.75 - >>> s.quick_ratio() - 0.75 - >>> s.real_quick_ratio() - 1.0 - """ + block = besti, bestj, bestsize + if self.autojunk: + # Extend the best by non-junk elements on each end. In particular, + # "popular" non-junk elements aren't in b2j, which greatly speeds + # the inner loop above, but also means "the best" match so far + # doesn't contain any junk *or* popular non-junk elements. + block = _expand_block_to_junk( + bjunk, block, a, b, alo, ahi, blo, bhi, inverse=True) + + if bjunk: + # Now that we have a wholly interesting match (albeit possibly + # empty!), we may as well suck up the matching junk on each + # side of it too. Can't think of a good reason not to, and it + # saves post-processing the (possibly considerable) expense of + # figuring out what to do with it. In the case of an empty + # interesting match, this is clearly the right thing to do, + # because no other kind of match is possible in the regions. 
+ block = _expand_block_to_junk( + bjunk, block, a, b, alo, ahi, blo, bhi, inverse=False) + + return Match._make(block) + + def _get_matching_blocks(self): + """Return list of triples describing matching subsequences. - matches = sum(triple[-1] for triple in self.get_matching_blocks()) - return _calculate_ratio(matches, len(self.a) + len(self.b)) + Each triple is of the form (i, j, n), and means that + a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in + i and in j. New in Python 2.5, it's also guaranteed that if + (i, j, n) and (i', j', n') are adjacent triples in the list, and + the second is not the last triple in the list, then i+n != i' or + j+n != j'. IOW, adjacent triples never describe adjacent equal + blocks. - def quick_ratio(self): - """Return an upper bound on ratio() relatively quickly. + The last triple is a dummy, (len(a), len(b), 0), and is the only + triple with n==0. - This isn't defined beyond that it is an upper bound on .ratio(), and - is faster to compute. + >>> s = SequenceMatcher(None, "abxcd", "abcd") + >>> list(s.get_matching_blocks()) + [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] """ - # viewing a and b as multisets, set matches to the cardinality - # of their intersection; this counts the number of matches - # without regard to order, so is clearly an upper bound - if self.fullbcount is None: - self.fullbcount = fullbcount = {} - for elt in self.b: - fullbcount[elt] = fullbcount.get(elt, 0) + 1 - fullbcount = self.fullbcount - # avail[x] is the number of times x appears in 'b' less the - # number of times we've seen it in 'a' so far ... kinda - avail = {} - matches = 0 - for elt in self.a: - if elt in avail: - numb = avail[elt] - else: - numb = fullbcount.get(elt, 0) - avail[elt] = numb - 1 - if numb > 0: - matches += 1 - return _calculate_ratio(matches, len(self.a) + len(self.b)) - - def real_quick_ratio(self): - """Return an upper bound on ratio() very quickly. 
+ la, lb = len(self.a), len(self.b) - This isn't defined beyond that it is an upper bound on .ratio(), and - is faster to compute than either .ratio() or .quick_ratio(). - """ + # This is most naturally expressed as a recursive algorithm, but + # at least one user bumped into extreme use cases that exceeded + # the recursion limit on their box. So, now we maintain a list + # ('queue`) of blocks we still need to look at, and append partial + # results to `matching_blocks` in a loop; the matches are sorted + # at the end. + queue = [(0, la, 0, lb)] + matching_blocks = [] + while queue: + alo, ahi, blo, bhi = queue.pop() + i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) + # a[alo:i] vs b[blo:j] unknown + # a[i:i+k] same as b[j:j+k] + # a[i+k:ahi] vs b[j+k:bhi] unknown + if k: # if k is 0, there was no matching block + matching_blocks.append(x) + if alo < i and blo < j: + queue.append((alo, i, blo, j)) + if i+k < ahi and j+k < bhi: + queue.append((i+k, ahi, j+k, bhi)) + matching_blocks.sort() + return matching_blocks - la, lb = len(self.a), len(self.b) - # can't have more matches than the number of elements in the - # shorter sequence - return _calculate_ratio(min(la, lb), la + lb) - __class_getitem__ = classmethod(GenericAlias) +######################################################################## +### get_close_matches +######################################################################## -def get_close_matches(word, possibilities, n=3, cutoff=0.6): +def get_close_matches(word, possibilities, n=3, cutoff=0.6, matcher=None): """Use SequenceMatcher to return list of the best "good enough" matches. word is a sequence for which close matches are desired (typically a @@ -679,6 +809,10 @@ def get_close_matches(word, possibilities, n=3, cutoff=0.6): Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities that don't score at least that similar to word are ignored. + Optional arg matcher is a callable that takes 3 positional arguments. + i.e. 
matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + The best (no more than n) matches among the possibilities are returned in a list, sorted by similarity score, most similar first. @@ -697,16 +831,19 @@ def get_close_matches(word, possibilities, n=3, cutoff=0.6): raise ValueError("n must be > 0: %r" % (n,)) if not 0.0 <= cutoff <= 1.0: raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) + if matcher is None: + matcher = SequenceMatcher + elif not callable(matcher): + raise TypeError("matcher must be callable: %r" % (matcher,)) result = [] - s = SequenceMatcher() + s = matcher() s.set_seq2(word) + set_seq1 = s.set_seq1 + ratio_if_above = s.ratio_if_above for x in possibilities: - s.set_seq1(x) - if s.real_quick_ratio() < cutoff or s.quick_ratio() < cutoff: - continue - - ratio = s.ratio() - if ratio >= cutoff: + set_seq1(x) + ratio = ratio_if_above(cutoff, equal_ok=True) + if ratio is not None: result.append((ratio, x)) # Move the best scorers to head of list @@ -715,6 +852,11 @@ def get_close_matches(word, possibilities, n=3, cutoff=0.6): return [x for score, x in result] +######################################################################## +### Differ +######################################################################## + + def _keep_original_ws(s, tag_s): """Replace whitespace with the original whitespace characters in `s`""" return ''.join( @@ -723,7 +865,6 @@ def _keep_original_ws(s, tag_s): ) - class Differ: r""" Differ is a class for comparing sequences of lines of text, and @@ -810,7 +951,8 @@ class Differ: + 5. Flat is better than nested. """ - def __init__(self, linejunk=None, charjunk=None): + def __init__(self, linejunk=None, charjunk=None, + linematcher=None, charmatcher=None): """ Construct a text differencer, with optional filters. 
@@ -828,10 +970,27 @@ def __init__(self, linejunk=None, charjunk=None): module-level function `IS_CHARACTER_JUNK` may be used to filter out whitespace characters (a blank or tab; **note**: bad idea to include newline in this!). Use of IS_CHARACTER_JUNK is recommended. - """ + - `linematcher`: callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + + - `charmatcher`: callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + """ + if linematcher is None: + linematcher = SequenceMatcher + elif not callable(linematcher): + raise TypeError("linematcher must be callable: %r" % (linematcher,)) + if charmatcher is None: + charmatcher = SequenceMatcher + elif not callable(charmatcher): + raise TypeError("charmatcher must be callable: %r" % (charmatcher,)) self.linejunk = linejunk self.charjunk = charjunk + self.linematcher = linematcher + self.charmatcher = charmatcher def compare(self, a, b): r""" @@ -859,7 +1018,7 @@ def compare(self, a, b): + emu """ - cruncher = SequenceMatcher(self.linejunk, a, b) + cruncher = self.linematcher(self.linejunk, a, b) for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): if tag == 'replace': g = self._fancy_replace(a, alo, ahi, b, blo, bhi) @@ -920,10 +1079,9 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # Later, more pathological cases prompted removing recursion # entirely. 
cutoff = 0.74999 - cruncher = SequenceMatcher(self.charjunk) - crqr = cruncher.real_quick_ratio - cqr = cruncher.quick_ratio - cr = cruncher.ratio + cruncher = self.charmatcher(self.charjunk) + set_seq1 = cruncher.set_seq1 + ratio_if_above = cruncher.ratio_if_above WINDOW = 10 best_i = best_j = None @@ -939,13 +1097,10 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): break best_ratio = cutoff for i in arange: - cruncher.set_seq1(a[i]) - # Ordering by cheapest to most expensive ratio is very - # valuable, most often getting out early. - if (crqr() > best_ratio - and cqr() > best_ratio - and cr() > best_ratio): - best_i, best_j, best_ratio = i, j, cr() + set_seq1(a[i]) + ratio = ratio_if_above(best_ratio, equal_ok=False) + if ratio is not None: + best_i, best_j, best_ratio = i, j, ratio if best_i is None: # found nothing to synch on yet - move to next j @@ -1097,7 +1252,8 @@ def _format_range_unified(start, stop): return '{},{}'.format(beginning, length) def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', - tofiledate='', n=3, lineterm='\n', *, color=False): + tofiledate='', n=3, lineterm='\n', *, color=False, + matcher=None): r""" Compare two sequences of lines; generate the delta as a unified diff. @@ -1118,6 +1274,10 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', 'git diff --color'. Even if enabled, it can be controlled using environment variables such as 'NO_COLOR'. + Optional arg matcher is a callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + The unidiff format normally has a header for filenames and modification times. Any or all of these may be specified using strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 
@@ -1140,6 +1300,10 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', +tree four """ + if matcher is None: + matcher = SequenceMatcher + elif not callable(matcher): + raise TypeError("matcher must be callable: %r" % (matcher,)) if color and can_colorize(): t = get_theme(force_color=True).difflib @@ -1148,7 +1312,7 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) started = False - for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): + for group in matcher(None, a, b).get_grouped_opcodes(n): if not started: started = True fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' @@ -1190,8 +1354,8 @@ def _format_range_context(start, stop): return '{},{}'.format(beginning, beginning + length - 1) # See http://www.unix.org/single_unix_specification/ -def context_diff(a, b, fromfile='', tofile='', - fromfiledate='', tofiledate='', n=3, lineterm='\n'): +def context_diff(a, b, fromfile='', tofile='', fromfiledate='', tofiledate='', + n=3, lineterm='\n', matcher=None): r""" Compare two sequences of lines; generate the delta as a context diff. @@ -1208,6 +1372,10 @@ def context_diff(a, b, fromfile='', tofile='', For inputs that do not have trailing newlines, set the lineterm argument to "" so that the output will be uniformly newline free. + Optional arg matcher is a callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + The context diff format normally has a header for filenames and modification times. Any or all of these may be specified using strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. @@ -1233,11 +1401,15 @@ def context_diff(a, b, fromfile='', tofile='', ! 
tree four """ + if matcher is None: + matcher = SequenceMatcher + elif not callable(matcher): + raise TypeError("matcher must be callable: %r" % (matcher,)) _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ') started = False - for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): + for group in matcher(None, a, b).get_grouped_opcodes(n): if not started: started = True fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' @@ -1319,7 +1491,7 @@ def decode(s): for line in lines: yield line.encode('ascii', 'surrogateescape') -def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): +def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK, differ=None): r""" Compare `a` and `b` (lists of strings); return a `Differ`-style delta. @@ -1337,6 +1509,10 @@ def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): whitespace characters (a blank or tab; note: it's a bad idea to include newline in this!). + - `differ`: callable that takes 2 positional arguments. + i.e. differ(linejunk, charjunk) which returns `Differ` instance + Default (if None) is Differ class. + Tools/scripts/ndiff.py is a command-line front-end to this function. Example: @@ -1354,10 +1530,15 @@ def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): + tree + emu """ - return Differ(linejunk, charjunk).compare(a, b) + if differ is None: + differ = Differ + elif not callable(differ): + raise TypeError("differ must be callable: %r" % (differ,)) + + return differ(linejunk, charjunk).compare(a, b) -def _mdiff(fromlines, tolines, context=None, linejunk=None, - charjunk=IS_CHARACTER_JUNK): +def _mdiff(fromlines, tolines, context=None, + linejunk=None, charjunk=IS_CHARACTER_JUNK, differ=None): r"""Returns generator yielding marked up from/to side by side differences. 
Arguments: @@ -1367,6 +1548,8 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, if None, all from/to text lines will be generated. linejunk -- passed on to ndiff (see ndiff documentation) charjunk -- passed on to ndiff (see ndiff documentation) + differ -- passed on to ndiff (see ndiff documentation) + charmatcher -- passed on to ndiff (see ndiff documentation) This function returns an iterator which returns a tuple: (from line tuple, to line tuple, boolean flag) @@ -1396,7 +1579,7 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, change_re = re.compile(r'(\++|\-+|\^+)') # create the difference iterator to generate the differences - diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk) + diff_lines_iterator = ndiff(fromlines, tolines, linejunk, charjunk, differ) def _make_line(lines, format_key, side, num_lines=[0,0]): """Returns line of text with user's change markup and line formatting. @@ -1626,6 +1809,11 @@ def _line_pair_iterator(): return +######################################################################## +### HtmlDiff +######################################################################## + + _file_template = """ @@ -1735,22 +1923,24 @@ class HtmlDiff(object): _legend = _legend _default_prefix = 0 - def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None, - charjunk=IS_CHARACTER_JUNK): + def __init__(self,tabsize=8, wrapcolumn=None, + linejunk=None, charjunk=IS_CHARACTER_JUNK, differ=None): """HtmlDiff instance initializer Arguments: tabsize -- tab stop spacing, defaults to 8. wrapcolumn -- column number where lines are broken and wrapped, defaults to None where lines are not wrapped. - linejunk,charjunk -- keyword arguments passed into ndiff() (used by - HtmlDiff() to generate the side by side HTML differences). See - ndiff() documentation for argument default values and descriptions. 
+ linejunk,charjunk,differ -- keyword arguments + passed into ndiff() (used by HtmlDiff() to generate the side + by side HTML differences). See ndiff() documentation for + argument default values and descriptions. """ self._tabsize = tabsize self._wrapcolumn = wrapcolumn self._linejunk = linejunk self._charjunk = charjunk + self._differ = differ def make_file(self, fromlines, tolines, fromdesc='', todesc='', context=False, numlines=5, *, charset='utf-8'): @@ -2021,8 +2211,9 @@ def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False, context_lines = numlines else: context_lines = None - diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk, - charjunk=self._charjunk) + diffs = _mdiff(fromlines, tolines, context_lines, + linejunk=self._linejunk, charjunk=self._charjunk, + differ=self._differ) # set up iterator to wrap lines that exceed desired width if self._wrapcolumn: @@ -2099,3 +2290,864 @@ def restore(delta, which): for line in delta: if line[:2] in prefixes: yield line[2:] + + +######################################################################## +### DivideAndConquerMatcherMixin +######################################################################## + + +class _Sentinel: + def __init__(self, name): + self.name = name + + def __repr__(self): + return self.name + + __reduce__ = None + + +# Private sentinels +_RANGE = _Sentinel('RANGE') # Range to process +_BLOCK = _Sentinel('BLOCK') # Block to return +_RANGEWITHBLOCKS = _Sentinel('RANGEWITHBLOCKS') # Range to process & pre-evaluated blocks + +# Modifier sentinels. 
These are returned as first tuple item from `_modifier` +ANCHORBLOCKS = _Sentinel('ANCHORBLOCKS') # List of blocks (not subject to balancing) +RESULTBLOCKS = _Sentinel('RESULTBLOCKS') # List of blocks that terminate recursion + + +_ERR_MSG_DTYPE = 'Unknown data type: {!r}' + + +class DivideAndConquerMatcherMixin: + def _process_range(self, depth, alo, ahi, blo, bhi): + raise NotImplementedError + + def _preprocess_range(self, depth, alo, ahi, blo, bhi): + return None + + def _validate_blocks(self, blocks, alo, ahi, blo, bhi): + # 2.1.1. Prepare for validation + blocks = list(blocks) + if len(blocks) > 1: + blocks.sort() + + # 2.1.2. Validate modifier output + new_blocks = [] + i0, j0 = alo, blo + for ii, jj, kk in blocks: + if not kk: + continue + if not (i0 <= ii <= ii + kk <= ahi and j0 <= jj <= jj + kk <= bhi): + msg = ( + '`self._modifier(...)` returned invalid block, which ' + 'is either out of bounds or overlaps with a nearby one' + 'block={}, last_bound={}, current_interval={}' + ) + raise RuntimeError(msg.format(blocks, (i0, j0), bounds)) + yield (ii, jj, kk) + i0 = ii + kk + j0 = jj + kk + + def _get_matching_blocks(self): + """Return list of triples describing matching subsequences. + + Each triple is of the form (i, j, n), s.t. a[i:i+n] == b[j:j+n]. + + The last triple is a dummy, (len(a), len(b), 0), and is the only + triple with n==0. + + >>> gsm = ExactSequenceMatcher(None, "abxcd", "abcd") + >>> list(gsm.get_matching_blocks()) + [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] + """ + alo, ahi, blo, bhi = 0, len(self.a), 0, len(self.b) + if alo >= ahi or blo >= bhi: + return + + # 3-element tuples: (data_type, depth, data) + q = [(_RANGE, 1, (alo, ahi, blo, bhi))] + while q: + dtype, depth, data = q.pop() + + # 1. 
Decision logic for q items + if dtype is _BLOCK: + # Just a block to yield + yield data + continue + + elif dtype is _RANGE: + # Just the range to process + bounds = data + rtype, blocks, validated = self._process_range(depth, *bounds) + + elif dtype is _RANGEWITHBLOCKS: + # Range & pre-evaluated block + bounds, data = data + rtype, blocks, validated = data + + else: + raise RuntimeError(_ERR_MSG_DTYPE.format(dtype)) + + if rtype not in (ANCHORBLOCKS, RESULTBLOCKS): + msg = 'Unknown result type from processed range: {!r}' + raise RuntimeError(msg.format(rtype)) + + if not validated: + blocks = list(self._validate_blocks(blocks, *bounds)) + if not blocks: + continue + + if rtype is RESULTBLOCKS: + yield from blocks + continue + + # 2.1. Interpolate `blocks` with ranges + alo, ahi, blo, bhi = bounds + q_tail = [] + i0, j0 = alo, blo + for block in blocks: + i, j, k = block + if not k: + continue + if i0 < i and j0 < j: + q_tail.append((_RANGE, (i0, i, j0, j))) + q_tail.append((_BLOCK, block)) + i0, j0 = i + k, j + k + + if q_tail: + if i0 < ahi and j0 < bhi: + q_tail.append((_RANGE, (i0, ahi, j0, bhi))) + else: + # No blocks identified. Do not recurse further. + continue + + # 2.2. Yield what is possible straight away + q_tail.reverse() + while q_tail: + dtype, data = q_tail.pop() + if dtype is _BLOCK: + yield data + elif dtype is _RANGE: + q_tail.append((dtype, data)) + q_tail.reverse() + break + else: + raise RuntimeError(_ERR_MSG_DTYPE.format(dtype)) + + # 2.3. 
append to Q what is not + d = depth + 1 + while q_tail: + dtype, data = q_tail.pop() + if dtype is _BLOCK: + q.append((dtype, d, data)) + elif dtype is _RANGE: + # Try quick evaluation without re-building + # Before cache was overriden + bounds = data + result = self._preprocess_range(d, *bounds) + if result is not None: + q.append((_RANGEWITHBLOCKS, d, (bounds, result))) + else: + q.append((dtype, d, data)) + else: + raise RuntimeError(_ERR_MSG_DTYPE.format(dtype)) + + def _modifier(self, depth, block, alo, ahi, blo, bhi): + """An entry point for intercepting `_get_matching_blocks` algorithm, + which can be implemented by derived class. + + It can be used for: + a) quick peak into what algorithm is doing + b) modification of divide-and-conquer algorithm + + Args: + depth : int + depth 1 is the initial one + block : tuple[start_in_1: int, start_in_b: int, length: int] + Candidate block for recursion loop. It is obtained by + calling find_longest_match for current recursion range + alo, ahi, blo, bhi : int, int, int, int + range of current recursion iteration + + This method returns None for no action. Otherwise, a tuple of 2 items: + 1. rtype : _Sentinel + 2. data : object + + rtype can take 2 sentinel values found in `difflib`. + It indicates what the type of return is and what it means: + + ANCHORBLOCKS - List of anchor blocks. All ranges around + these blocks are subject to further recursion. + e.g. (ANCHORBLOCKS, [(0, 0, 10), (10, 10, 10)]) + + RESULTBLOCKS - List of blocks that terminate recursion + e.g. (RESULTBLOCKS, [(0, 0, 10), (10, 10, 10)]) + + If data contains no blocks or only blocks of 0 length, + the algorithm does not recurse further. 
+ + Note, one can get `a`, `b`, `automaton`, etc from self + """ + return None + + +######################################################################## +### _LCSUBAutomaton +######################################################################## + + +_EARLY_EXIT = _Sentinel('EARLY_EXIT') + + +def _simple_find(pattern, text, start=0, stop=None, *, overlapping=True, stopif=None): + """ + Examples: + >>> list(_simple_find('aa', 'aaaa')) + [0, 1, 2] + >>> list(_simple_find('aa', 'aaaa', overlapping=False)) + [0, 2] + >>> list(_simple_find('_____', '__x__x__x__x__x__x_____')) + [18] + >>> list(_simple_find('_____', '__x__x__x__x__x__x_____', stopif=1)) + [EARLY_EXIT] + """ + if overlapping not in (0, 1): + raise ValueError(f'{overlapping=} not in (0, 1)') + if not text or not pattern: + return + if stop is None: + stop = len(text) + n = stop - start + m = len(pattern) + if m > n: + return + + # 1 element fast path + first = pattern[0] + if m == 1: + for i in range(start, stop): + if text[i] == first: + yield i + return + + # 2 element fast path + last = pattern[-1] + inc = 1 if overlapping else m + i = start + m_m1 = m - 1 + end = stop - m_m1 + if m == 2: + while i < end: + if text[i] == first and text[i + 1] == last: + yield i + i += inc + else: + i += 1 + return + + # 3. Two-way + if stopif is not None: + max_miss = int(n * stopif / m) + + three = m == 3 + four = m == 4 + if three or four: + mid1 = pattern[1] + mid2 = pattern[2] + else: + mid = pattern[1:-1] + k = 0 + while i < end: + if text[i] != first or text[(i_last := i + m_m1)] != last: + i += 1 + continue + + if three: + hit = text[i + 1] == mid1 + elif four: + hit = text[i + 1] == mid1 and text[i + 2] == mid2 + else: + hit = text[i + 1:i_last] == mid + if hit: + yield i + if stopif is not None: + k = 0 + i += inc + else: + if stopif is not None: + k += 1 + if k >= max_miss: + yield _EARLY_EXIT + break + i += 1 + + +class _LCSUBAutomaton: + """Suffix Automaton for finding longest common substring. 
+ + Complexity: + T: O(n1 + n2) ~ n1 + 5 × n2 + S: O(n2) : max_nstates = 2 × n if n <= 1 else 2 × n - 1 + n1 = len(seq1) - the one that is being scanned + n2 = len(seq2) - the one that is being built + + Node Structure: + nodes: [ + lengths: list[int], # length of a match + links: list[int], # link to roll back on mismatch + next1s: list[object], # See "Next logic below" + next2s: list[int], # See "Next logic below" + eposs: list[int], # index of last match position + ] + + Next logic (Memory optimization as > 50% of state have only 1 transition): + next2 == -1 -> empty + next2 == -3 -> next1: dict + next2 >= 0 -> next2 - index, next1 - key + + Examples: + >>> aut = _LCSUBAutomaton('abc') + >>> aut + <_LCSUBAutomaton object; seq2_size=3> + >>> aut.build() + >>> aut.print_states() + 0 (0, 0, {'a': 1, 'b': 2, 'c': 3}, -3, 0) + 1 (1, 0, 'b', 2, 0) + 2 (2, 0, 'c', 3, 1) + 3 (3, 0, None, -1, 2) + """ + def __init__(self, seq2, *, junk=()): + """ + Args: + seq2 : Sequence + Automaton will be built for this sequence. 
+ Note, building is ~5x slower than scanning + junk : Iterable + Items in this set will be treated as unmatchable elements + """ + if not isinstance(junk, frozenset): + junk = frozenset(junk) + self.seq2 = seq2 + self.size2 = len(seq2) + self.junk = junk + self.nodes = None + self.cache = (0, 0) + + def __repr__(self): + kwstring = f'seq2_size={self.size2}' + if self.junk: + kwstring += f', junk_size={len(self.junk)}' + return f'<{type(self).__name__} object; {kwstring}>' + + # CORE ---------------------------- + # --------------------------------- + + def _make_nodes(self, n): + if n <= 0: + raise ValueError(f'{n=} <= 0') + lengths = [0] * n + links = [0] * n + next1s = [None] * n + next2s = [-1] * n + eposs = [0] * n + return lengths, links, next1s, next2s, eposs + + def _build(self, start2, stop2): + """Automaton builder""" + seq2 = self.seq2 + junk = self.junk + # Make Nodes + size = (stop2 - start2) + n_nodes = 4 * size // 3 + 1 # Maximum 25% overallocation + inc = size // 10 + 1 # Then, 10% increments + nodes = self._make_nodes(n_nodes) + lengths, links, next1s, next2s, eposs = nodes + nstates = 1 + # Loop + last_len = 0 + last = 0 + for j in range(start2, stop2): + el = seq2[j] + if el in junk: + last_len = 0 + last = 0 + continue + + if nstates == n_nodes: + for a, b in zip(nodes, self._make_nodes(inc)): + a.extend(b) + n_nodes += inc + + curr = nstates + nstates += 1 + last_len += 1 + # New Node + lengths[curr] = last_len + eposs[curr] = j + + p = last + px1 = next1s[p] + px2 = next2s[p] + cont = True + while 1: + if px2 == -1: + next1s[p] = el + next2s[p] = curr + elif px2 == -3: + if el not in px1: + px1[el] = curr + else: + break + else: + if el != px1: + next1s[p] = {px1: px2, el: curr} + next2s[p] = -3 + else: + break + if not p: + # p is root! 
+ cont = False + break + p = links[p] + px1 = next1s[p] + px2 = next2s[p] + if cont: + if px2 == -3: + q = px1[el] + else: + q = px2 + p_len_p1 = lengths[p] + 1 + if p_len_p1 == lengths[q]: + links[curr] = q + else: + + if nstates == n_nodes: + for a, b in zip(nodes, self._make_nodes(inc)): + a.extend(b) + n_nodes += inc + + clone = nstates + nstates += 1 + # Clone + lengths[clone] = p_len_p1 + links[clone] = links[q] + qx2 = next2s[q] + if qx2 != -1: + qx1 = next1s[q] + if qx2 == -3: + next1s[clone] = qx1.copy() + next2s[clone] = -3 + else: + next1s[clone] = qx1 + next2s[clone] = qx2 + # Copy `eposs[q]` to ensure leftmost match in seq2 + eposs[clone] = eposs[q] + while 1: + if px2 == -3: + if px1.get(el) is q: + px1[el] = clone + else: + break + else: + if px1 == el and px2 == q: + next2s[p] = clone + else: + break + if not p: + # p is root! + break + p = links[p] + px1 = next1s[p] + px2 = next2s[p] + + links[q] = links[curr] = clone + + last = curr + + # Trim unused state space + if nstates < n_nodes: + for item in nodes: + del item[nstates:] + return nodes + + def _finditer(self, seq1, start1, stop1, best=False): + """Core scanning routine. 
+ + Args: + best : bool + False - return all matches, including non-maximal + True - return all matches of maximum length + all these will naturally be maximal + Returns: + generator of tuples (e1, e2, k), where + e1, e2 are ending positions in seq1 and seq2 respectively + k is length of a match + Thus, starting position is: e1 + 1 - k + And stop for a slice is: e1 + 1 + """ + if best not in (0, 1): + raise ValueError(f'{best=} not in (0, 1)') + lengths, links, next1s, next2s, eposs = self.nodes + junk = self.junk + v = 0 + k = 0 + best_len = 0 + results = [] + root_x1 = vx1 = next1s[v] + root_x2 = vx2 = next2s[v] + start2, stop2 = self.cache + size1 = stop1 - start1 + size2 = stop2 - start2 + for i in range(start1, stop1): + el = seq1[i] + if el in junk: + v = 0 + k = 0 + vx1 = root_x1 + vx2 = root_x2 + continue + + while v and (vx2 == -1 or (el not in vx1 if vx2 == -3 else vx1 != el)): + v = links[v] + k = lengths[v] + vx1 = next1s[v] + vx2 = next2s[v] + + if vx2 == -3: + v_new = vx1.get(el) + elif vx2 == -1: + v_new = None + else: + v_new = vx2 if vx1 == el else None + if v_new is not None: + v = v_new + vx1 = next1s[v] + vx2 = next2s[v] + k += 1 + if not best: + yield (i, eposs[v], k) + else: + if k > best_len: + best_len = k + results = [(i, v, k)] + elif k == best_len: + results.append((i, v, k)) + if results: + for i, v, k in results: + yield (i, eposs[v], k) + + # Private API --------------------- + # --------------------------------- + + def _try_find(self, seq1, start1, stop1, start2, stop2): + """Attempts to find match without building automaton. 
+ + Querying in exactly the same range will always succeed + Also, it might be possible if (start2, stop2) is within cached range + + returns None on fail + """ + if start1 >= stop1 or start2 >= stop2: + return (start1, start2, 0) + + c_start, c_stop = self.cache + if c_start <= start2 and stop2 <= c_stop: + it = self._finditer(seq1, start1, stop1, best=True) + for res in it: + break + else: + return (start1, start2, 0) + + e1, e2, k = res + stop_in_seq2 = e2 + 1 + start_in_seq2 = stop_in_seq2 - k + if start_in_seq2 >= start2 and stop_in_seq2 <= stop2: + return (e1 + 1 - k, start_in_seq2, k) + + def _find(self, seq1, start1, stop1, start2, stop2): + """Returns lefmost longest match. + + Does not attempt to retrieve from inexactly built range + Always returns an answer + """ + if start1 >= stop1 or start2 >= stop2: + return (start1, start2, 0) + + if self.cache != (start2, stop2): + self.build(start2, stop2) + + it = self._finditer(seq1, start1, stop1, best=True) + for res in it: + break + else: + return (start1, start2, 0) + + e1, e2, k = res + one_mk = 1 - k + return (e1 + one_mk, e2 + one_mk, k) + + # API ----------------------------- + # --------------------------------- + + def print_states(self, slc=slice(None)): + assert isinstance(slc, slice) + nodes = self.nodes + if nodes is None: + nodes = self.build(0, self.size2) + if slc != slice(None): + nodes = [item[slc] for item in nodes] + for i, state in enumerate(zip(*nodes)): + print(i, state) + + def build(self, start2=0, stop2=None): + """Build automaton for specified range of seq2""" + start2, stop2 = _adjust_indices(self.size2, start2, stop2) + key = (start2, stop2) + if self.cache != key: + self.nodes = None + self.key = (0, 0) + self.nodes = self._build(start2, stop2) + self.cache = key + + def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): + """Find leftmost longest match. 
+ + Firstly, it will be leftmost in seq1 + Secondly, it will be leftmost in seq2 if more than one occurrence + + Returns: + match: (start_in_seq1, start_in_seq2, match_length) + """ + start1, stop1 = _adjust_indices(len(seq1), start1, stop1) + start2, stop2 = _adjust_indices(self.size2, start2, stop2) + res = self._try_find(seq1, start1, stop1, start2, stop2) + if res is None: + res = self._find(seq1, start1, stop1, start2, stop2) + return res + + def batchfind(self, seq1, bounds_list): + """Performance method for many `find` calls. + + It calls `find` in order that aims to minimize builds needed + Also, does not evaluate same range twice + + Args: + bounds_list : list[tuple[int, int, int, int]] + list of tuples: (start1, stop1, start2, stop2) + """ + if not bounds_list: + return [] + + result = [None] * len(bounds_list) + c_lo, c_hi = self.cache + jobs = list(enumerate(bounds_list)) + jobs.sort(key=lambda x: abs((b := x[1])[2] - c_lo) + abs(b[3] - c_hi)) + evaluated = {} + for i, bounds in jobs: + res = evaluated.get(bounds) + if res is None: + res = self.find(seq1, *bounds) + evaluated[bounds] = res + result[i] = res + return result + + # Leftmost Sequential API --------- + # --------------------------------- + + def _try_find_sequential(self, seq1, start1, stop1, start2, stop2): + if start1 >= stop1 or start2 >= stop2: + return [] + + c_start, c_stop = self.cache + if c_start > start2 or stop2 > c_stop: + return None + + it = self._finditer(seq1, start1, stop1, best=True) + last = next(it, None) + if last is None: + return [] + + e1, e2, k = last + one_mk = 1 - k + j = e2 + one_mk + j2 = e2 + 1 + if j < start2 or stop2 < j2: + return None + + blocks = [(e1 + one_mk, j, k)] + if k * 2 > min(stop1 - e1 - 1, stop2 - e2 - 1): + return blocks + + for block in it: + e1 = block[0] + e2 = block[1] + if e1 - k < last[0]: + continue + elif e2 + 1 > stop2: + break + elif e2 - k < last[1]: + i = e1 + one_mk + patt = seq1[i:i+k] + find_it = _simple_find(patt, self.seq2, 
last[1] + 1, stop2, stopif=1) + j = next(find_it, None) + if j is _EARLY_EXIT: + break + elif j is not None: + e2 = j - one_mk + blocks.append((i, j, k)) + last = (e1, e2, k) + else: + blocks.append((e1 + one_mk, e2 + one_mk, k)) + last = block + return blocks + + def _find_sequential(self, seq1, start1, stop1, start2, stop2): + blocks = self._try_find_sequential(seq1, start1, stop1, start2, stop2) + if blocks is not None: + return blocks + + if self.cache != (start2, stop2): + self.build(start2, stop2) + it = self._finditer(seq1, start1, stop1, best=True) + last = next(it, None) + if last is None: + return [] + + e1, e2, k = last + one_mk = 1 - k + blocks = [(e1 + one_mk, e2 + one_mk, k)] + if k * 2 > min(stop1 - e1 - 1, stop2 - e2 - 1): + return blocks + + for block in it: + e1 = block[0] + e2 = block[1] + if e1 - k < last[0]: + continue + elif e2 - k < last[1]: + i = e1 + one_mk + patt = seq1[i:i+k] + find_it = _simple_find(patt, self.seq2, last[1] + 1, stop2, stopif=1) + j = next(find_it, None) + if j is _EARLY_EXIT: + break + elif j is not None: + e2 = j - one_mk + blocks.append((i, j, k)) + last = (e1, e2, k) + else: + blocks.append((e1 + one_mk, e2 + one_mk, k)) + last = block + + return blocks + + +######################################################################## +### ExactSequenceMatcher +######################################################################## + + + +class ExactSequenceMatcher(DivideAndConquerMatcherMixin, SequenceMatcherBase): + """ + ExactSequenceMatcher is a flexible class for comparing pairs + of sequences of any type, so long as the sequence elements are hashable. + + It builds upon the same idea as `SequenceMatcher` and with its defaults + its results are exactly the same as the ones of `SequenceMatcher` with + `autojunk` parameter set to False. 
+ + However, while `SequenceMatcher` is able to obtain same result, + it is only practical to use with `autojunk` set to False due to + quadratic worst case complexity of Longest Common Substring algorithm. + + `ExactSequenceMatcher`, on the other hand, uses Suffix Automaton, + which has O(n) complexity guaranteed, making it possible to use exact + calculation on long sequences. + + Comparison to SequenceMatcher: + In terms of results, the following 2 are equivalent: + a) SequenceMatcher(..., autojunk=False) + b) ExactSequenceMatcher(...) + + Time Complexity: + find_longest_match : O(n) + get_matching_blocks : O(n) - O(nlogn) for average diff case + O(n^2) worst case. + + Example of worst case complexity `get_matching_blocks` case: + chars = ['ab'[i % 2] for i in range(100)] + seq1 = '+'.join(chars) + seq2 = '-'.join(chars) + + Space Complexity: + find_longest_match: c × O(n), c ~ 3x (compared to `SequenceMatcher`) + get_matching_blocks: c × O(n), c ~ 3x (compared to `SequenceMatcher`) + + NOTE:TODO:Worst case: + aa-bb-cc-dd-... + bb+aa+dd+cc-... + """ + + def _prepare_seq2(self): + b = self.b + self.bjunk = bjunk = set() + if self.isjunk: + bjunk.update(filter(self.isjunk, _Counter(b))) + self.automaton = _LCSUBAutomaton(b, junk=bjunk) + + def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): + """ + Find longest matching block in a[alo:ahi] and b[blo:bhi]. + By default it will find the longest match in the entirety of a and b. + + Look up docstring of SequenceMatcher.find_longest_match + for more information. 
+ """ + a, b, bjunk = self.a, self.b, self.bjunk + block = self.automaton.find(self.a, alo, ahi, blo, bhi) + + if bjunk: + # Extend match to surrounding junk + block = _expand_block_to_junk( + bjunk, block, a, b, alo, ahi, blo, bhi, inverse=False) + + return Match._make(block) + + def _extend_junk_for_many(self, blocks, alo, ahi, blo, bhi): + a, b, bjunk = self.a, self.b, self.bjunk + if not bjunk: + return blocks + + if not blocks: + # NOTE: Backwards compatibility + block = (alo, blo, 0) + block = _expand_block_to_junk( + bjunk, block, a, b, alo, ahi, blo, bhi, inverse=False) + return [block] + + result = [] + i0, j0 = alo, blo + for block in blocks: + block = _expand_block_to_junk( + bjunk, block, a, b, i0, ahi, j0, bhi, inverse=False) + i, j, k = block + i0 = i + k + j0 = j + k + result.append(block) + return result + + def _preprocess_range(self, depth, alo, ahi, blo, bhi): + blocks = self.automaton._try_find_sequential(self.a, alo, ahi, blo, bhi) + if blocks is not None: + if self.bjunk: + blocks = self._extend_junk_for_many(blocks, alo, ahi, blo, bhi) + return ANCHORBLOCKS, blocks, True + + def _process_range(self, depth, alo, ahi, blo, bhi): + blocks = self.automaton._find_sequential(self.a, alo, ahi, blo, bhi) + if self.bjunk: + blocks = self._extend_junk_for_many(blocks, alo, ahi, blo, bhi) + return ANCHORBLOCKS, blocks, True diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py index 771fd46e042a41..316eb40f02561d 100644 --- a/Lib/test/test_difflib.py +++ b/Lib/test/test_difflib.py @@ -283,6 +283,7 @@ def test_make_file_usascii_charset_with_nonascii_input(self): self.assertIn('ımplıcıt', output) class TestDiffer(unittest.TestCase): + def test_close_matches_aligned(self): # Of the 4 closely matching pairs, we want 1 to match with 3, # and 2 with 4, to align with a "top to bottom" mental model. 
@@ -640,6 +641,85 @@ def test_invalid_input(self):
         ''.join(difflib.restore([], 3))
 
 
+class TestLCSUBAutomaton(unittest.TestCase):
+    # In the cases below the automaton is built from seq2 and queried with
+    # seq1; the expected value is the (seq1 index, seq2 index, size) tuple.
+
+    def test_find(self):
+        cases = [
+            ('abd', 'abcabd', (0, 3, 3)),
+            ('dab', 'abcabd', (1, 0, 2)),
+        ]
+        for seq1, seq2, expect in cases:
+            with self.subTest(seq1=seq1, seq2=seq2):
+                result = difflib._LCSUBAutomaton(seq2).find(seq1)
+                self.assertEqual(result, expect)
+
+    def test_find_with_junk(self):
+        cases = [
+            ('ab_abd', 'abcabd', (3, 3, 3)),
+            ('abd_', 'ab_abd_', (0, 3, 3)),
+            ('abcbd', 'abc_bd', (0, 0, 3)),
+            ('cbd', 'abc_bd', (1, 4, 2)),
+        ]
+        for seq1, seq2, expect in cases:
+            with self.subTest(seq1=seq1, seq2=seq2):
+                automaton = difflib._LCSUBAutomaton(seq2, junk='_')
+                self.assertEqual(automaton.find(seq1), expect)
+
+    def test_batchfind(self):
+        # batchfind() must agree with calling find() once per bounds tuple.
+        seq1 = 'fgfedabacba'
+        seq2 = seq1[::-1]
+        n = len(seq1)
+
+        # All (lo, hi) pairs with 1 <= hi - lo <= 4 and hi < n, then every
+        # combination of one such pair for seq1 with one for seq2.
+        intervals = [
+            (i, j)
+            for i in range(n - 1)
+            for j in range(i + 1, min(i + 5, n))
+        ]
+        bounds_list = [
+            (alo, ahi, blo, bhi)
+            for alo, ahi in intervals
+            for blo, bhi in intervals
+        ]
+
+        aut = difflib._LCSUBAutomaton(seq2)
+        results1 = [aut.find(seq1, *bounds) for bounds in bounds_list]
+        results2 = aut.batchfind(seq1, bounds_list)
+        self.assertEqual(results1, results2)
+
+
+class TestExactSequenceMatcher(unittest.TestCase):
+    def test_cross_test_with_autojunk_false(self):
+        # ExactSequenceMatcher must reproduce SequenceMatcher(autojunk=False).
+        cases = [
+            ("ABCDEFGHIJKLMNOP" * 10, "ACEGIKMOQBDFHJLNP" * 10),
+            (
+                "".join(chr(ord('a') + i % 10) * (i + 1) for i in range(30)),
+                "".join(chr(ord('a') + i % 10) * (30 - i) for i in range(30))
+            ),
+            (
+                "A" + "X"*99 + "BCDEFGHIJKLMNOPQRSTUVWXYZ"*2,
+                "BCDEFGHIJKLMNOPQRSTUVWXYZ"*2 + "A" + "X"*99
+            )
+        ]
+        for case, (seq1, seq2) in enumerate(cases):
+            for isjunk in [None, lambda x: x in 'aeAE']:
+                with self.subTest(case=case, junk=isjunk is not None):
+                    sm1 = difflib.SequenceMatcher(
+                        isjunk, seq1, seq2, autojunk=False)
+                    sm2 = difflib.ExactSequenceMatcher(isjunk, seq1, seq2)
+                    self.assertEqual(sm1.bjunk, sm2.bjunk)
+                    blocks1 = sm1.get_matching_blocks()
+                    blocks2 = sm2.get_matching_blocks()
+                    self.assertEqual(blocks1, blocks2)
+                    self.assertAlmostEqual(
+                        sm1.ratio(), sm2.ratio(), places=3)
+
+
 def setUpModule():
     difflib.HtmlDiff._default_prefix = 0
 