github.com/grumpyhome/grumpy@v0.3.1-0.20201208125205-7b775405bdf1/grumpy-runtime-src/third_party/stdlib/difflib.py (about)

     1  """
     2  Module difflib -- helpers for computing deltas between objects.
     3  
     4  Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
     5      Use SequenceMatcher to return list of the best "good enough" matches.
     6  
     7  Function context_diff(a, b):
     8      For two lists of strings, return a delta in context diff format.
     9  
    10  Function ndiff(a, b):
    11      Return a delta: the difference between `a` and `b` (lists of strings).
    12  
    13  Function restore(delta, which):
    14      Return one of the two sequences that generated an ndiff delta.
    15  
    16  Function unified_diff(a, b):
    17      For two lists of strings, return a delta in unified diff format.
    18  
    19  Class SequenceMatcher:
    20      A flexible class for comparing pairs of sequences of any type.
    21  
    22  Class Differ:
    23      For producing human-readable deltas from sequences of lines of text.
    24  
    25  Class HtmlDiff:
    26      For producing HTML side by side comparison with change highlights.
    27  """
    28  
# Names exported by "from difflib import *".
__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
           'unified_diff', 'HtmlDiff', 'Match']

import heapq
# The two from-imports below are disabled; Match is written out by hand
# further down instead of being generated by namedtuple, and reduce is bound
# through attribute access on the functools module.
# from collections import namedtuple as _namedtuple
# from functools import reduce
import functools
reduce = functools.reduce

import operator
# Private aliases referenced by the hand-written Match class below.
_itemgetter = operator.itemgetter
_property = property
_tuple = tuple
    43  
    44  def setdefault(d, k, default=None):
    45      if k not in d:
    46          d[k] = default
    47      return d[k]
    48  
    49  # Match = _namedtuple('Match', 'a b size')
    50  class Match(tuple):
    51      'Match(a, b, size)'
    52  
    53      __slots__ = ()
    54  
    55      _fields = ('a', 'b', 'size')
    56  
    57      def __new__(_cls, a, b, size):
    58          'Create new instance of Match(a, b, size)'
    59          return _tuple.__new__(_cls, (a, b, size))
    60  
    61      # @classmethod
    62      def _make(cls, iterable, new=tuple.__new__, len=len):
    63          'Make a new Match object from a sequence or iterable'
    64          result = new(cls, iterable)
    65          if len(result) != 3:
    66              raise TypeError('Expected 3 arguments, got %d' % len(result))
    67          return result
    68      _make = classmethod(_make)
    69  
    70      def __repr__(self):
    71          'Return a nicely formatted representation string'
    72          return 'Match(a=%r, b=%r, size=%r)' % self
    73  
    74      def _asdict(self):
    75          'Return a new OrderedDict which maps field names to their values'
    76          return OrderedDict(zip(self._fields, self))
    77  
    78      def _replace(_self, **kwds):
    79          'Return a new Match object replacing specified fields with new values'
    80          result = _self._make(map(kwds.pop, ('a', 'b', 'size'), _self))
    81          if kwds:
    82              raise ValueError('Got unexpected field names: %r' % kwds.keys())
    83          return result
    84  
    85      def __getnewargs__(self):
    86          'Return self as a plain tuple.  Used by copy and pickle.'
    87          return tuple(self)
    88  
    89      __dict__ = _property(_asdict)
    90  
    91      def __getstate__(self):
    92          'Exclude the OrderedDict from pickling'
    93          pass
    94  
    95      a = _property(_itemgetter(0), doc='Alias for field number 0')
    96  
    97      b = _property(_itemgetter(1), doc='Alias for field number 1')
    98  
    99      size = _property(_itemgetter(2), doc='Alias for field number 2')
   100  
   101  def _calculate_ratio(matches, length):
   102      if length:
   103          return 2.0 * matches / length
   104      return 1.0
   105  
   106  class SequenceMatcher(object):
   107  
   108      """
   109      SequenceMatcher is a flexible class for comparing pairs of sequences of
   110      any type, so long as the sequence elements are hashable.  The basic
   111      algorithm predates, and is a little fancier than, an algorithm
   112      published in the late 1980's by Ratcliff and Obershelp under the
   113      hyperbolic name "gestalt pattern matching".  The basic idea is to find
   114      the longest contiguous matching subsequence that contains no "junk"
   115      elements (R-O doesn't address junk).  The same idea is then applied
   116      recursively to the pieces of the sequences to the left and to the right
   117      of the matching subsequence.  This does not yield minimal edit
   118      sequences, but does tend to yield matches that "look right" to people.
   119  
   120      SequenceMatcher tries to compute a "human-friendly diff" between two
   121      sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
   122      longest *contiguous* & junk-free matching subsequence.  That's what
   123      catches peoples' eyes.  The Windows(tm) windiff has another interesting
   124      notion, pairing up elements that appear uniquely in each sequence.
   125      That, and the method here, appear to yield more intuitive difference
   126      reports than does diff.  This method appears to be the least vulnerable
   127      to synching up on blocks of "junk lines", though (like blank lines in
   128      ordinary text files, or maybe "<P>" lines in HTML files).  That may be
   129      because this is the only method of the 3 that has a *concept* of
   130      "junk" <wink>.
   131  
   132      Example, comparing two strings, and considering blanks to be "junk":
   133  
   134      >>> s = SequenceMatcher(lambda x: x == " ",
   135      ...                     "private Thread currentThread;",
   136      ...                     "private volatile Thread currentThread;")
   137      >>>
   138  
   139      .ratio() returns a float in [0, 1], measuring the "similarity" of the
   140      sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
   141      sequences are close matches:
   142  
   143      >>> print round(s.ratio(), 3)
   144      0.866
   145      >>>
   146  
   147      If you're only interested in where the sequences match,
   148      .get_matching_blocks() is handy:
   149  
   150      >>> for block in s.get_matching_blocks():
   151      ...     print "a[%d] and b[%d] match for %d elements" % block
   152      a[0] and b[0] match for 8 elements
   153      a[8] and b[17] match for 21 elements
   154      a[29] and b[38] match for 0 elements
   155  
   156      Note that the last tuple returned by .get_matching_blocks() is always a
   157      dummy, (len(a), len(b), 0), and this is the only case in which the last
   158      tuple element (number of elements matched) is 0.
   159  
   160      If you want to know how to change the first sequence into the second,
   161      use .get_opcodes():
   162  
   163      >>> for opcode in s.get_opcodes():
   164      ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
   165       equal a[0:8] b[0:8]
   166      insert a[8:8] b[8:17]
   167       equal a[8:29] b[17:38]
   168  
   169      See the Differ class for a fancy human-friendly file differencer, which
   170      uses SequenceMatcher both to compare sequences of lines, and to compare
   171      sequences of characters within similar (near-matching) lines.
   172  
   173      See also function get_close_matches() in this module, which shows how
   174      simple code building on SequenceMatcher can be used to do useful work.
   175  
   176      Timing:  Basic R-O is cubic time worst case and quadratic time expected
   177      case.  SequenceMatcher is quadratic time for the worst case and has
   178      expected-case behavior dependent in a complicated way on how many
   179      elements the sequences have in common; best case time is linear.
   180  
   181      Methods:
   182  
   183      __init__(isjunk=None, a='', b='')
   184          Construct a SequenceMatcher.
   185  
   186      set_seqs(a, b)
   187          Set the two sequences to be compared.
   188  
   189      set_seq1(a)
   190          Set the first sequence to be compared.
   191  
   192      set_seq2(b)
   193          Set the second sequence to be compared.
   194  
   195      find_longest_match(alo, ahi, blo, bhi)
   196          Find longest matching block in a[alo:ahi] and b[blo:bhi].
   197  
   198      get_matching_blocks()
   199          Return list of triples describing matching subsequences.
   200  
   201      get_opcodes()
   202          Return list of 5-tuples describing how to turn a into b.
   203  
   204      ratio()
   205          Return a measure of the sequences' similarity (float in [0,1]).
   206  
   207      quick_ratio()
   208          Return an upper bound on .ratio() relatively quickly.
   209  
   210      real_quick_ratio()
   211          Return an upper bound on ratio() very quickly.
   212      """
   213  
   214      def __init__(self, isjunk=None, a='', b='', autojunk=True):
   215          """Construct a SequenceMatcher.
   216  
   217          Optional arg isjunk is None (the default), or a one-argument
   218          function that takes a sequence element and returns true iff the
   219          element is junk.  None is equivalent to passing "lambda x: 0", i.e.
   220          no elements are considered to be junk.  For example, pass
   221              lambda x: x in " \\t"
   222          if you're comparing lines as sequences of characters, and don't
   223          want to synch up on blanks or hard tabs.
   224  
   225          Optional arg a is the first of two sequences to be compared.  By
   226          default, an empty string.  The elements of a must be hashable.  See
   227          also .set_seqs() and .set_seq1().
   228  
   229          Optional arg b is the second of two sequences to be compared.  By
   230          default, an empty string.  The elements of b must be hashable. See
   231          also .set_seqs() and .set_seq2().
   232  
   233          Optional arg autojunk should be set to False to disable the
   234          "automatic junk heuristic" that treats popular elements as junk
   235          (see module documentation for more information).
   236          """
   237  
   238          # Members:
   239          # a
   240          #      first sequence
   241          # b
   242          #      second sequence; differences are computed as "what do
   243          #      we need to do to 'a' to change it into 'b'?"
   244          # b2j
   245          #      for x in b, b2j[x] is a list of the indices (into b)
   246          #      at which x appears; junk elements do not appear
   247          # fullbcount
   248          #      for x in b, fullbcount[x] == the number of times x
   249          #      appears in b; only materialized if really needed (used
   250          #      only for computing quick_ratio())
   251          # matching_blocks
   252          #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
   253          #      ascending & non-overlapping in i and in j; terminated by
   254          #      a dummy (len(a), len(b), 0) sentinel
   255          # opcodes
   256          #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
   257          #      one of
   258          #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
   259          #          'delete'    a[i1:i2] should be deleted
   260          #          'insert'    b[j1:j2] should be inserted
   261          #          'equal'     a[i1:i2] == b[j1:j2]
   262          # isjunk
   263          #      a user-supplied function taking a sequence element and
   264          #      returning true iff the element is "junk" -- this has
   265          #      subtle but helpful effects on the algorithm, which I'll
   266          #      get around to writing up someday <0.9 wink>.
   267          #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
   268          # isbjunk
   269          #      for x in b, isbjunk(x) == isjunk(x) but much faster;
   270          #      it's really the __contains__ method of a hidden dict.
   271          #      DOES NOT WORK for x in a!
   272          # isbpopular
   273          #      for x in b, isbpopular(x) is true iff b is reasonably long
   274          #      (at least 200 elements) and x accounts for more than 1 + 1% of
   275          #      its elements (when autojunk is enabled).
   276          #      DOES NOT WORK for x in a!
   277  
   278          self.isjunk = isjunk
   279          self.a = self.b = None
   280          self.autojunk = autojunk
   281          self.set_seqs(a, b)
   282  
   283      def set_seqs(self, a, b):
   284          """Set the two sequences to be compared.
   285  
   286          >>> s = SequenceMatcher()
   287          >>> s.set_seqs("abcd", "bcde")
   288          >>> s.ratio()
   289          0.75
   290          """
   291  
   292          self.set_seq1(a)
   293          self.set_seq2(b)
   294  
   295      def set_seq1(self, a):
   296          """Set the first sequence to be compared.
   297  
   298          The second sequence to be compared is not changed.
   299  
   300          >>> s = SequenceMatcher(None, "abcd", "bcde")
   301          >>> s.ratio()
   302          0.75
   303          >>> s.set_seq1("bcde")
   304          >>> s.ratio()
   305          1.0
   306          >>>
   307  
   308          SequenceMatcher computes and caches detailed information about the
   309          second sequence, so if you want to compare one sequence S against
   310          many sequences, use .set_seq2(S) once and call .set_seq1(x)
   311          repeatedly for each of the other sequences.
   312  
   313          See also set_seqs() and set_seq2().
   314          """
   315  
   316          if a is self.a:
   317              return
   318          self.a = a
   319          self.matching_blocks = self.opcodes = None
   320  
   321      def set_seq2(self, b):
   322          """Set the second sequence to be compared.
   323  
   324          The first sequence to be compared is not changed.
   325  
   326          >>> s = SequenceMatcher(None, "abcd", "bcde")
   327          >>> s.ratio()
   328          0.75
   329          >>> s.set_seq2("abcd")
   330          >>> s.ratio()
   331          1.0
   332          >>>
   333  
   334          SequenceMatcher computes and caches detailed information about the
   335          second sequence, so if you want to compare one sequence S against
   336          many sequences, use .set_seq2(S) once and call .set_seq1(x)
   337          repeatedly for each of the other sequences.
   338  
   339          See also set_seqs() and set_seq1().
   340          """
   341  
   342          if b is self.b:
   343              return
   344          self.b = b
   345          self.matching_blocks = self.opcodes = None
   346          self.fullbcount = None
   347          self.__chain_b()
   348  
   349      # For each element x in b, set b2j[x] to a list of the indices in
   350      # b where x appears; the indices are in increasing order; note that
   351      # the number of times x appears in b is len(b2j[x]) ...
   352      # when self.isjunk is defined, junk elements don't show up in this
   353      # map at all, which stops the central find_longest_match method
   354      # from starting any matching block at a junk element ...
   355      # also creates the fast isbjunk function ...
   356      # b2j also does not contain entries for "popular" elements, meaning
   357      # elements that account for more than 1 + 1% of the total elements, and
   358      # when the sequence is reasonably large (>= 200 elements); this can
   359      # be viewed as an adaptive notion of semi-junk, and yields an enormous
   360      # speedup when, e.g., comparing program files with hundreds of
   361      # instances of "return NULL;" ...
   362      # note that this is only called when b changes; so for cross-product
   363      # kinds of matches, it's best to call set_seq2 once, then set_seq1
   364      # repeatedly
   365  
   366      def __chain_b(self):
   367          # Because isjunk is a user-defined (not C) function, and we test
   368          # for junk a LOT, it's important to minimize the number of calls.
   369          # Before the tricks described here, __chain_b was by far the most
   370          # time-consuming routine in the whole module!  If anyone sees
   371          # Jim Roskind, thank him again for profile.py -- I never would
   372          # have guessed that.
   373          # The first trick is to build b2j ignoring the possibility
   374          # of junk.  I.e., we don't call isjunk at all yet.  Throwing
   375          # out the junk later is much cheaper than building b2j "right"
   376          # from the start.
   377          b = self.b
   378          self.b2j = b2j = {}
   379  
   380          for i, elt in enumerate(b):
   381              indices = setdefault(b2j, elt, [])
   382              # indices = b2j.setdefault(elt, [])
   383              indices.append(i)
   384  
   385          # Purge junk elements
   386          junk = set()
   387          isjunk = self.isjunk
   388          if isjunk:
   389              for elt in list(b2j.keys()):  # using list() since b2j is modified
   390                  if isjunk(elt):
   391                      junk.add(elt)
   392                      del b2j[elt]
   393  
   394          # Purge popular elements that are not junk
   395          popular = set()
   396          n = len(b)
   397          if self.autojunk and n >= 200:
   398              ntest = n // 100 + 1
   399              for elt, idxs in list(b2j.items()):
   400                  if len(idxs) > ntest:
   401                      popular.add(elt)
   402                      del b2j[elt]
   403  
   404          # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
   405          # Sicne the number of *unique* junk elements is probably small, the
   406          # memory burden of keeping this set alive is likely trivial compared to
   407          # the size of b2j.
   408          self.isbjunk = junk.__contains__
   409          self.isbpopular = popular.__contains__
   410  
   411      def find_longest_match(self, alo, ahi, blo, bhi):
   412          """Find longest matching block in a[alo:ahi] and b[blo:bhi].
   413  
   414          If isjunk is not defined:
   415  
   416          Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
   417              alo <= i <= i+k <= ahi
   418              blo <= j <= j+k <= bhi
   419          and for all (i',j',k') meeting those conditions,
   420              k >= k'
   421              i <= i'
   422              and if i == i', j <= j'
   423  
   424          In other words, of all maximal matching blocks, return one that
   425          starts earliest in a, and of all those maximal matching blocks that
   426          start earliest in a, return the one that starts earliest in b.
   427  
   428          >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
   429          >>> s.find_longest_match(0, 5, 0, 9)
   430          Match(a=0, b=4, size=5)
   431  
   432          If isjunk is defined, first the longest matching block is
   433          determined as above, but with the additional restriction that no
   434          junk element appears in the block.  Then that block is extended as
   435          far as possible by matching (only) junk elements on both sides.  So
   436          the resulting block never matches on junk except as identical junk
   437          happens to be adjacent to an "interesting" match.
   438  
   439          Here's the same example as before, but considering blanks to be
   440          junk.  That prevents " abcd" from matching the " abcd" at the tail
   441          end of the second sequence directly.  Instead only the "abcd" can
   442          match, and matches the leftmost "abcd" in the second sequence:
   443  
   444          >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
   445          >>> s.find_longest_match(0, 5, 0, 9)
   446          Match(a=1, b=0, size=4)
   447  
   448          If no blocks match, return (alo, blo, 0).
   449  
   450          >>> s = SequenceMatcher(None, "ab", "c")
   451          >>> s.find_longest_match(0, 2, 0, 1)
   452          Match(a=0, b=0, size=0)
   453          """
   454  
   455          # CAUTION:  stripping common prefix or suffix would be incorrect.
   456          # E.g.,
   457          #    ab
   458          #    acab
   459          # Longest matching block is "ab", but if common prefix is
   460          # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
   461          # strip, so ends up claiming that ab is changed to acab by
   462          # inserting "ca" in the middle.  That's minimal but unintuitive:
   463          # "it's obvious" that someone inserted "ac" at the front.
   464          # Windiff ends up at the same place as diff, but by pairing up
   465          # the unique 'b's and then matching the first two 'a's.
   466  
   467          a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
   468          besti, bestj, bestsize = alo, blo, 0
   469          # find longest junk-free match
   470          # during an iteration of the loop, j2len[j] = length of longest
   471          # junk-free match ending with a[i-1] and b[j]
   472          j2len = {}
   473          nothing = []
   474          for i in xrange(alo, ahi):
   475              # look at all instances of a[i] in b; note that because
   476              # b2j has no junk keys, the loop is skipped if a[i] is junk
   477              j2lenget = j2len.get
   478              newj2len = {}
   479              for j in b2j.get(a[i], nothing):
   480                  # a[i] matches b[j]
   481                  if j < blo:
   482                      continue
   483                  if j >= bhi:
   484                      break
   485                  k = newj2len[j] = j2lenget(j-1, 0) + 1
   486                  if k > bestsize:
   487                      besti, bestj, bestsize = i-k+1, j-k+1, k
   488              j2len = newj2len
   489  
   490          # Extend the best by non-junk elements on each end.  In particular,
   491          # "popular" non-junk elements aren't in b2j, which greatly speeds
   492          # the inner loop above, but also means "the best" match so far
   493          # doesn't contain any junk *or* popular non-junk elements.
   494          while besti > alo and bestj > blo and \
   495                not isbjunk(b[bestj-1]) and \
   496                a[besti-1] == b[bestj-1]:
   497              besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
   498          while besti+bestsize < ahi and bestj+bestsize < bhi and \
   499                not isbjunk(b[bestj+bestsize]) and \
   500                a[besti+bestsize] == b[bestj+bestsize]:
   501              bestsize += 1
   502  
   503          # Now that we have a wholly interesting match (albeit possibly
   504          # empty!), we may as well suck up the matching junk on each
   505          # side of it too.  Can't think of a good reason not to, and it
   506          # saves post-processing the (possibly considerable) expense of
   507          # figuring out what to do with it.  In the case of an empty
   508          # interesting match, this is clearly the right thing to do,
   509          # because no other kind of match is possible in the regions.
   510          while besti > alo and bestj > blo and \
   511                isbjunk(b[bestj-1]) and \
   512                a[besti-1] == b[bestj-1]:
   513              besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
   514          while besti+bestsize < ahi and bestj+bestsize < bhi and \
   515                isbjunk(b[bestj+bestsize]) and \
   516                a[besti+bestsize] == b[bestj+bestsize]:
   517              bestsize = bestsize + 1
   518  
   519          return Match(besti, bestj, bestsize)
   520  
   521      def get_matching_blocks(self):
   522          """Return list of triples describing matching subsequences.
   523  
   524          Each triple is of the form (i, j, n), and means that
   525          a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
   526          i and in j.  New in Python 2.5, it's also guaranteed that if
   527          (i, j, n) and (i', j', n') are adjacent triples in the list, and
   528          the second is not the last triple in the list, then i+n != i' or
   529          j+n != j'.  IOW, adjacent triples never describe adjacent equal
   530          blocks.
   531  
   532          The last triple is a dummy, (len(a), len(b), 0), and is the only
   533          triple with n==0.
   534  
   535          >>> s = SequenceMatcher(None, "abxcd", "abcd")
   536          >>> s.get_matching_blocks()
   537          [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
   538          """
   539  
   540          if self.matching_blocks is not None:
   541              return self.matching_blocks
   542          la, lb = len(self.a), len(self.b)
   543  
   544          # This is most naturally expressed as a recursive algorithm, but
   545          # at least one user bumped into extreme use cases that exceeded
   546          # the recursion limit on their box.  So, now we maintain a list
   547          # ('queue`) of blocks we still need to look at, and append partial
   548          # results to `matching_blocks` in a loop; the matches are sorted
   549          # at the end.
   550          queue = [(0, la, 0, lb)]
   551          matching_blocks = []
   552          while queue:
   553              alo, ahi, blo, bhi = queue.pop()
   554              i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
   555              # a[alo:i] vs b[blo:j] unknown
   556              # a[i:i+k] same as b[j:j+k]
   557              # a[i+k:ahi] vs b[j+k:bhi] unknown
   558              if k:   # if k is 0, there was no matching block
   559                  matching_blocks.append(x)
   560                  if alo < i and blo < j:
   561                      queue.append((alo, i, blo, j))
   562                  if i+k < ahi and j+k < bhi:
   563                      queue.append((i+k, ahi, j+k, bhi))
   564          matching_blocks.sort()
   565  
   566          # It's possible that we have adjacent equal blocks in the
   567          # matching_blocks list now.  Starting with 2.5, this code was added
   568          # to collapse them.
   569          i1 = j1 = k1 = 0
   570          non_adjacent = []
   571          for i2, j2, k2 in matching_blocks:
   572              # Is this block adjacent to i1, j1, k1?
   573              if i1 + k1 == i2 and j1 + k1 == j2:
   574                  # Yes, so collapse them -- this just increases the length of
   575                  # the first block by the length of the second, and the first
   576                  # block so lengthened remains the block to compare against.
   577                  k1 += k2
   578              else:
   579                  # Not adjacent.  Remember the first block (k1==0 means it's
   580                  # the dummy we started with), and make the second block the
   581                  # new block to compare against.
   582                  if k1:
   583                      non_adjacent.append((i1, j1, k1))
   584                  i1, j1, k1 = i2, j2, k2
   585          if k1:
   586              non_adjacent.append((i1, j1, k1))
   587  
   588          non_adjacent.append( (la, lb, 0) )
   589          self.matching_blocks = map(Match._make, non_adjacent)
   590          return self.matching_blocks
   591  
   592      def get_opcodes(self):
   593          """Return list of 5-tuples describing how to turn a into b.
   594  
   595          Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
   596          has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
   597          tuple preceding it, and likewise for j1 == the previous j2.
   598  
   599          The tags are strings, with these meanings:
   600  
   601          'replace':  a[i1:i2] should be replaced by b[j1:j2]
   602          'delete':   a[i1:i2] should be deleted.
   603                      Note that j1==j2 in this case.
   604          'insert':   b[j1:j2] should be inserted at a[i1:i1].
   605                      Note that i1==i2 in this case.
   606          'equal':    a[i1:i2] == b[j1:j2]
   607  
   608          >>> a = "qabxcd"
   609          >>> b = "abycdf"
   610          >>> s = SequenceMatcher(None, a, b)
   611          >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
   612          ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
   613          ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
   614           delete a[0:1] (q) b[0:0] ()
   615            equal a[1:3] (ab) b[0:2] (ab)
   616          replace a[3:4] (x) b[2:3] (y)
   617            equal a[4:6] (cd) b[3:5] (cd)
   618           insert a[6:6] () b[5:6] (f)
   619          """
   620  
   621          if self.opcodes is not None:
   622              return self.opcodes
   623          i = j = 0
   624          self.opcodes = answer = []
   625          for ai, bj, size in self.get_matching_blocks():
   626              # invariant:  we've pumped out correct diffs to change
   627              # a[:i] into b[:j], and the next matching block is
   628              # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
   629              # out a diff to change a[i:ai] into b[j:bj], pump out
   630              # the matching block, and move (i,j) beyond the match
   631              tag = ''
   632              if i < ai and j < bj:
   633                  tag = 'replace'
   634              elif i < ai:
   635                  tag = 'delete'
   636              elif j < bj:
   637                  tag = 'insert'
   638              if tag:
   639                  answer.append( (tag, i, ai, j, bj) )
   640              i, j = ai+size, bj+size
   641              # the list of matching blocks is terminated by a
   642              # sentinel with size 0
   643              if size:
   644                  answer.append( ('equal', ai, i, bj, j) )
   645          return answer
   646  
   647      def get_grouped_opcodes(self, n=3):
   648          """ Isolate change clusters by eliminating ranges with no changes.
   649  
   650          Return a generator of groups with up to n lines of context.
   651          Each group is in the same format as returned by get_opcodes().
   652  
   653          >>> from pprint import pprint
   654          >>> a = map(str, range(1,40))
   655          >>> b = a[:]
   656          >>> b[8:8] = ['i']     # Make an insertion
   657          >>> b[20] += 'x'       # Make a replacement
   658          >>> b[23:28] = []      # Make a deletion
   659          >>> b[30] += 'y'       # Make another replacement
   660          >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
   661          [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
   662           [('equal', 16, 19, 17, 20),
   663            ('replace', 19, 20, 20, 21),
   664            ('equal', 20, 22, 21, 23),
   665            ('delete', 22, 27, 23, 23),
   666            ('equal', 27, 30, 23, 26)],
   667           [('equal', 31, 34, 27, 30),
   668            ('replace', 34, 35, 30, 31),
   669            ('equal', 35, 38, 31, 34)]]
   670          """
   671  
   672          codes = self.get_opcodes()
   673          if not codes:
   674              codes = [("equal", 0, 1, 0, 1)]
   675          # Fixup leading and trailing groups if they show no changes.
   676          if codes[0][0] == 'equal':
   677              tag, i1, i2, j1, j2 = codes[0]
   678              codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
   679          if codes[-1][0] == 'equal':
   680              tag, i1, i2, j1, j2 = codes[-1]
   681              codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
   682  
   683          nn = n + n
   684          group = []
   685          for tag, i1, i2, j1, j2 in codes:
   686              # End the current group and start a new one whenever
   687              # there is a large range with no changes.
   688              if tag == 'equal' and i2-i1 > nn:
   689                  group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
   690                  yield group
   691                  group = []
   692                  i1, j1 = max(i1, i2-n), max(j1, j2-n)
   693              group.append((tag, i1, i2, j1 ,j2))
   694          if group and not (len(group)==1 and group[0][0] == 'equal'):
   695              yield group
   696  
   697      def ratio(self):
   698          """Return a measure of the sequences' similarity (float in [0,1]).
   699  
   700          Where T is the total number of elements in both sequences, and
   701          M is the number of matches, this is 2.0*M / T.
   702          Note that this is 1 if the sequences are identical, and 0 if
   703          they have nothing in common.
   704  
   705          .ratio() is expensive to compute if you haven't already computed
   706          .get_matching_blocks() or .get_opcodes(), in which case you may
   707          want to try .quick_ratio() or .real_quick_ratio() first to get an
   708          upper bound.
   709  
   710          >>> s = SequenceMatcher(None, "abcd", "bcde")
   711          >>> s.ratio()
   712          0.75
   713          >>> s.quick_ratio()
   714          0.75
   715          >>> s.real_quick_ratio()
   716          1.0
   717          """
   718  
   719          matches = reduce(lambda sum, triple: sum + triple[-1],
   720                           self.get_matching_blocks(), 0)
   721          return _calculate_ratio(matches, len(self.a) + len(self.b))
   722  
   723      def quick_ratio(self):
   724          """Return an upper bound on ratio() relatively quickly.
   725  
   726          This isn't defined beyond that it is an upper bound on .ratio(), and
   727          is faster to compute.
   728          """
   729  
   730          # viewing a and b as multisets, set matches to the cardinality
   731          # of their intersection; this counts the number of matches
   732          # without regard to order, so is clearly an upper bound
   733          if self.fullbcount is None:
   734              self.fullbcount = fullbcount = {}
   735              for elt in self.b:
   736                  fullbcount[elt] = fullbcount.get(elt, 0) + 1
   737          fullbcount = self.fullbcount
   738          # avail[x] is the number of times x appears in 'b' less the
   739          # number of times we've seen it in 'a' so far ... kinda
   740          avail = {}
   741          availhas, matches = avail.__contains__, 0
   742          for elt in self.a:
   743              if availhas(elt):
   744                  numb = avail[elt]
   745              else:
   746                  numb = fullbcount.get(elt, 0)
   747              avail[elt] = numb - 1
   748              if numb > 0:
   749                  matches = matches + 1
   750          return _calculate_ratio(matches, len(self.a) + len(self.b))
   751  
   752      def real_quick_ratio(self):
   753          """Return an upper bound on ratio() very quickly.
   754  
   755          This isn't defined beyond that it is an upper bound on .ratio(), and
   756          is faster to compute than either .ratio() or .quick_ratio().
   757          """
   758  
   759          la, lb = len(self.a), len(self.b)
   760          # can't have more matches than the number of elements in the
   761          # shorter sequence
   762          return _calculate_ratio(min(la, lb), la + lb)
   763  
   764  def get_close_matches(word, possibilities, n=3, cutoff=0.6):
   765      """Use SequenceMatcher to return list of the best "good enough" matches.
   766  
   767      word is a sequence for which close matches are desired (typically a
   768      string).
   769  
   770      possibilities is a list of sequences against which to match word
   771      (typically a list of strings).
   772  
   773      Optional arg n (default 3) is the maximum number of close matches to
   774      return.  n must be > 0.
   775  
   776      Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
   777      that don't score at least that similar to word are ignored.
   778  
   779      The best (no more than n) matches among the possibilities are returned
   780      in a list, sorted by similarity score, most similar first.
   781  
   782      >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
   783      ['apple', 'ape']
   784      >>> import keyword as _keyword
   785      >>> get_close_matches("wheel", _keyword.kwlist)
   786      ['while']
   787      >>> get_close_matches("apple", _keyword.kwlist)
   788      []
   789      >>> get_close_matches("accept", _keyword.kwlist)
   790      ['except']
   791      """
   792  
   793      if not n >  0:
   794          raise ValueError("n must be > 0: %r" % (n,))
   795      if not 0.0 <= cutoff <= 1.0:
   796          raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
   797      result = []
   798      s = SequenceMatcher()
   799      s.set_seq2(word)
   800      for x in possibilities:
   801          s.set_seq1(x)
   802          if s.real_quick_ratio() >= cutoff and \
   803             s.quick_ratio() >= cutoff and \
   804             s.ratio() >= cutoff:
   805              result.append((s.ratio(), x))
   806  
   807      # Move the best scorers to head of list
   808      result = heapq.nlargest(n, result)
   809      # Strip scores for the best n matches
   810      return [x for score, x in result]
   811  
   812  def _count_leading(line, ch):
   813      """
   814      Return number of `ch` characters at the start of `line`.
   815  
   816      Example:
   817  
   818      >>> _count_leading('   abc', ' ')
   819      3
   820      """
   821  
   822      i, n = 0, len(line)
   823      while i < n and line[i] == ch:
   824          i += 1
   825      return i
   826  
   827  class Differ(object):
   828      r"""
   829      Differ is a class for comparing sequences of lines of text, and
   830      producing human-readable differences or deltas.  Differ uses
   831      SequenceMatcher both to compare sequences of lines, and to compare
   832      sequences of characters within similar (near-matching) lines.
   833  
   834      Each line of a Differ delta begins with a two-letter code:
   835  
   836          '- '    line unique to sequence 1
   837          '+ '    line unique to sequence 2
   838          '  '    line common to both sequences
   839          '? '    line not present in either input sequence
   840  
   841      Lines beginning with '? ' attempt to guide the eye to intraline
   842      differences, and were not present in either input sequence.  These lines
   843      can be confusing if the sequences contain tab characters.
   844  
   845      Note that Differ makes no claim to produce a *minimal* diff.  To the
   846      contrary, minimal diffs are often counter-intuitive, because they synch
   847      up anywhere possible, sometimes accidental matches 100 pages apart.
   848      Restricting synch points to contiguous matches preserves some notion of
   849      locality, at the occasional cost of producing a longer diff.
   850  
   851      Example: Comparing two texts.
   852  
   853      First we set up the texts, sequences of individual single-line strings
   854      ending with newlines (such sequences can also be obtained from the
   855      `readlines()` method of file-like objects):
   856  
   857      >>> text1 = '''  1. Beautiful is better than ugly.
   858      ...   2. Explicit is better than implicit.
   859      ...   3. Simple is better than complex.
   860      ...   4. Complex is better than complicated.
   861      ... '''.splitlines(1)
   862      >>> len(text1)
   863      4
   864      >>> text1[0][-1]
   865      '\n'
   866      >>> text2 = '''  1. Beautiful is better than ugly.
   867      ...   3.   Simple is better than complex.
   868      ...   4. Complicated is better than complex.
   869      ...   5. Flat is better than nested.
   870      ... '''.splitlines(1)
   871  
   872      Next we instantiate a Differ object:
   873  
   874      >>> d = Differ()
   875  
   876      Note that when instantiating a Differ object we may pass functions to
   877      filter out line and character 'junk'.  See Differ.__init__ for details.
   878  
   879      Finally, we compare the two:
   880  
   881      >>> result = list(d.compare(text1, text2))
   882  
   883      'result' is a list of strings, so let's pretty-print it:
   884  
   885      >>> from pprint import pprint as _pprint
   886      >>> _pprint(result)
   887      ['    1. Beautiful is better than ugly.\n',
   888       '-   2. Explicit is better than implicit.\n',
   889       '-   3. Simple is better than complex.\n',
   890       '+   3.   Simple is better than complex.\n',
   891       '?     ++\n',
   892       '-   4. Complex is better than complicated.\n',
   893       '?            ^                     ---- ^\n',
   894       '+   4. Complicated is better than complex.\n',
   895       '?           ++++ ^                      ^\n',
   896       '+   5. Flat is better than nested.\n']
   897  
   898      As a single multi-line string it looks like this:
   899  
   900      >>> print ''.join(result),
   901          1. Beautiful is better than ugly.
   902      -   2. Explicit is better than implicit.
   903      -   3. Simple is better than complex.
   904      +   3.   Simple is better than complex.
   905      ?     ++
   906      -   4. Complex is better than complicated.
   907      ?            ^                     ---- ^
   908      +   4. Complicated is better than complex.
   909      ?           ++++ ^                      ^
   910      +   5. Flat is better than nested.
   911  
   912      Methods:
   913  
   914      __init__(linejunk=None, charjunk=None)
   915          Construct a text differencer, with optional filters.
   916  
   917      compare(a, b)
   918          Compare two sequences of lines; generate the resulting delta.
   919      """
   920  
   921      def __init__(self, linejunk=None, charjunk=None):
   922          """
   923          Construct a text differencer, with optional filters.
   924  
   925          The two optional keyword parameters are for filter functions:
   926  
   927          - `linejunk`: A function that should accept a single string argument,
   928            and return true iff the string is junk. The module-level function
   929            `IS_LINE_JUNK` may be used to filter out lines without visible
   930            characters, except for at most one splat ('#').  It is recommended
   931            to leave linejunk None; as of Python 2.3, the underlying
   932            SequenceMatcher class has grown an adaptive notion of "noise" lines
   933            that's better than any static definition the author has ever been
   934            able to craft.
   935  
   936          - `charjunk`: A function that should accept a string of length 1. The
   937            module-level function `IS_CHARACTER_JUNK` may be used to filter out
   938            whitespace characters (a blank or tab; **note**: bad idea to include
   939            newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
   940          """
   941  
   942          self.linejunk = linejunk
   943          self.charjunk = charjunk
   944  
   945      def compare(self, a, b):
   946          r"""
   947          Compare two sequences of lines; generate the resulting delta.
   948  
   949          Each sequence must contain individual single-line strings ending with
   950          newlines. Such sequences can be obtained from the `readlines()` method
   951          of file-like objects.  The delta generated also consists of newline-
   952          terminated strings, ready to be printed as-is via the writeline()
   953          method of a file-like object.
   954  
   955          Example:
   956  
   957          >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
   958          ...                                'ore\ntree\nemu\n'.splitlines(1))),
   959          - one
   960          ?  ^
   961          + ore
   962          ?  ^
   963          - two
   964          - three
   965          ?  -
   966          + tree
   967          + emu
   968          """
   969  
   970          cruncher = SequenceMatcher(self.linejunk, a, b)
   971          for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
   972              if tag == 'replace':
   973                  g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
   974              elif tag == 'delete':
   975                  g = self._dump('-', a, alo, ahi)
   976              elif tag == 'insert':
   977                  g = self._dump('+', b, blo, bhi)
   978              elif tag == 'equal':
   979                  g = self._dump(' ', a, alo, ahi)
   980              else:
   981                  raise ValueError, 'unknown tag %r' % (tag,)
   982  
   983              for line in g:
   984                  yield line
   985  
   986      def _dump(self, tag, x, lo, hi):
   987          """Generate comparison results for a same-tagged range."""
   988          for i in xrange(lo, hi):
   989              yield '%s %s' % (tag, x[i])
   990  
   991      def _plain_replace(self, a, alo, ahi, b, blo, bhi):
   992          assert alo < ahi and blo < bhi
   993          # dump the shorter block first -- reduces the burden on short-term
   994          # memory if the blocks are of very different sizes
   995          if bhi - blo < ahi - alo:
   996              first  = self._dump('+', b, blo, bhi)
   997              second = self._dump('-', a, alo, ahi)
   998          else:
   999              first  = self._dump('-', a, alo, ahi)
  1000              second = self._dump('+', b, blo, bhi)
  1001  
  1002          for g in first, second:
  1003              for line in g:
  1004                  yield line
  1005  
  1006      def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
  1007          r"""
  1008          When replacing one block of lines with another, search the blocks
  1009          for *similar* lines; the best-matching pair (if any) is used as a
  1010          synch point, and intraline difference marking is done on the
  1011          similar pair. Lots of work, but often worth it.
  1012  
  1013          Example:
  1014  
  1015          >>> d = Differ()
  1016          >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
  1017          ...                            ['abcdefGhijkl\n'], 0, 1)
  1018          >>> print ''.join(results),
  1019          - abcDefghiJkl
  1020          ?    ^  ^  ^
  1021          + abcdefGhijkl
  1022          ?    ^  ^  ^
  1023          """
  1024  
  1025          # don't synch up unless the lines have a similarity score of at
  1026          # least cutoff; best_ratio tracks the best score seen so far
  1027          best_ratio, cutoff = 0.74, 0.75
  1028          cruncher = SequenceMatcher(self.charjunk)
  1029          eqi, eqj = None, None   # 1st indices of equal lines (if any)
  1030  
  1031          # search for the pair that matches best without being identical
  1032          # (identical lines must be junk lines, & we don't want to synch up
  1033          # on junk -- unless we have to)
  1034          for j in xrange(blo, bhi):
  1035              bj = b[j]
  1036              cruncher.set_seq2(bj)
  1037              for i in xrange(alo, ahi):
  1038                  ai = a[i]
  1039                  if ai == bj:
  1040                      if eqi is None:
  1041                          eqi, eqj = i, j
  1042                      continue
  1043                  cruncher.set_seq1(ai)
  1044                  # computing similarity is expensive, so use the quick
  1045                  # upper bounds first -- have seen this speed up messy
  1046                  # compares by a factor of 3.
  1047                  # note that ratio() is only expensive to compute the first
  1048                  # time it's called on a sequence pair; the expensive part
  1049                  # of the computation is cached by cruncher
  1050                  if cruncher.real_quick_ratio() > best_ratio and \
  1051                        cruncher.quick_ratio() > best_ratio and \
  1052                        cruncher.ratio() > best_ratio:
  1053                      best_ratio, best_i, best_j = cruncher.ratio(), i, j
  1054          if best_ratio < cutoff:
  1055              # no non-identical "pretty close" pair
  1056              if eqi is None:
  1057                  # no identical pair either -- treat it as a straight replace
  1058                  for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
  1059                      yield line
  1060                  return
  1061              # no close pair, but an identical pair -- synch up on that
  1062              best_i, best_j, best_ratio = eqi, eqj, 1.0
  1063          else:
  1064              # there's a close pair, so forget the identical pair (if any)
  1065              eqi = None
  1066  
  1067          # a[best_i] very similar to b[best_j]; eqi is None iff they're not
  1068          # identical
  1069  
  1070          # pump out diffs from before the synch point
  1071          for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
  1072              yield line
  1073  
  1074          # do intraline marking on the synch pair
  1075          aelt, belt = a[best_i], b[best_j]
  1076          if eqi is None:
  1077              # pump out a '-', '?', '+', '?' quad for the synched lines
  1078              atags = btags = ""
  1079              cruncher.set_seqs(aelt, belt)
  1080              for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
  1081                  la, lb = ai2 - ai1, bj2 - bj1
  1082                  if tag == 'replace':
  1083                      atags += '^' * la
  1084                      btags += '^' * lb
  1085                  elif tag == 'delete':
  1086                      atags += '-' * la
  1087                  elif tag == 'insert':
  1088                      btags += '+' * lb
  1089                  elif tag == 'equal':
  1090                      atags += ' ' * la
  1091                      btags += ' ' * lb
  1092                  else:
  1093                      raise ValueError, 'unknown tag %r' % (tag,)
  1094              for line in self._qformat(aelt, belt, atags, btags):
  1095                  yield line
  1096          else:
  1097              # the synch pair is identical
  1098              yield '  ' + aelt
  1099  
  1100          # pump out diffs from after the synch point
  1101          for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
  1102              yield line
  1103  
  1104      def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
  1105          g = []
  1106          if alo < ahi:
  1107              if blo < bhi:
  1108                  g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
  1109              else:
  1110                  g = self._dump('-', a, alo, ahi)
  1111          elif blo < bhi:
  1112              g = self._dump('+', b, blo, bhi)
  1113  
  1114          for line in g:
  1115              yield line
  1116  
  1117      def _qformat(self, aline, bline, atags, btags):
  1118          r"""
  1119          Format "?" output and deal with leading tabs.
  1120  
  1121          Example:
  1122  
  1123          >>> d = Differ()
  1124          >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
  1125          ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
  1126          >>> for line in results: print repr(line)
  1127          ...
  1128          '- \tabcDefghiJkl\n'
  1129          '? \t ^ ^  ^\n'
  1130          '+ \tabcdefGhijkl\n'
  1131          '? \t ^ ^  ^\n'
  1132          """
  1133  
  1134          # Can hurt, but will probably help most of the time.
  1135          common = min(_count_leading(aline, "\t"),
  1136                       _count_leading(bline, "\t"))
  1137          common = min(common, _count_leading(atags[:common], " "))
  1138          common = min(common, _count_leading(btags[:common], " "))
  1139          atags = atags[common:].rstrip()
  1140          btags = btags[common:].rstrip()
  1141  
  1142          yield "- " + aline
  1143          if atags:
  1144              yield "? %s%s\n" % ("\t" * common, atags)
  1145  
  1146          yield "+ " + bline
  1147          if btags:
  1148              yield "? %s%s\n" % ("\t" * common, btags)
  1149  
  1150  # With respect to junk, an earlier version of ndiff simply refused to
  1151  # *start* a match with a junk element.  The result was cases like this:
  1152  #     before: private Thread currentThread;
  1153  #     after:  private volatile Thread currentThread;
  1154  # If you consider whitespace to be junk, the longest contiguous match
  1155  # not starting with junk is "e Thread currentThread".  So ndiff reported
  1156  # that "e volatil" was inserted between the 't' and the 'e' in "private".
  1157  # While an accurate view, to people that's absurd.  The current version
  1158  # looks for matching blocks that are entirely junk-free, then extends the
  1159  # longest one of those as far as possible but only with matching junk.
  1160  # So now "currentThread" is matched, then extended to suck up the
  1161  # preceding blank; then "private" is matched, and extended to suck up the
  1162  # following blank; then "Thread" is matched; and finally ndiff reports
  1163  # that "volatile " was inserted before "Thread".  The only quibble
  1164  # remaining is that perhaps it was really the case that " volatile"
  1165  # was inserted after "private".  I can live with that <wink>.
  1166  
  1167  import re
  1168  
  1169  def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
  1170      r"""
  1171      Return 1 for ignorable line: iff `line` is blank or contains a single '#'.
  1172  
  1173      Examples:
  1174  
  1175      >>> IS_LINE_JUNK('\n')
  1176      True
  1177      >>> IS_LINE_JUNK('  #   \n')
  1178      True
  1179      >>> IS_LINE_JUNK('hello\n')
  1180      False
  1181      """
  1182  
  1183      return pat(line) is not None
  1184  
  1185  def IS_CHARACTER_JUNK(ch, ws=" \t"):
  1186      r"""
  1187      Return 1 for ignorable character: iff `ch` is a space or tab.
  1188  
  1189      Examples:
  1190  
  1191      >>> IS_CHARACTER_JUNK(' ')
  1192      True
  1193      >>> IS_CHARACTER_JUNK('\t')
  1194      True
  1195      >>> IS_CHARACTER_JUNK('\n')
  1196      False
  1197      >>> IS_CHARACTER_JUNK('x')
  1198      False
  1199      """
  1200  
  1201      return ch in ws
  1202  
  1203  
  1204  ########################################################################
  1205  ###  Unified Diff
  1206  ########################################################################
  1207  
  1208  def _format_range_unified(start, stop):
  1209      'Convert range to the "ed" format'
  1210      # Per the diff spec at http://www.unix.org/single_unix_specification/
  1211      beginning = start + 1     # lines start numbering with one
  1212      length = stop - start
  1213      if length == 1:
  1214          # return '{}'.format(beginning)
  1215          return '%s' % (beginning)
  1216      if not length:
  1217          beginning -= 1        # empty ranges begin at line just before the range
  1218      return '%s,%s' % (beginning, length)
  1219  
  1220  def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
  1221                   tofiledate='', n=3, lineterm='\n'):
  1222      r"""
  1223      Compare two sequences of lines; generate the delta as a unified diff.
  1224  
  1225      Unified diffs are a compact way of showing line changes and a few
  1226      lines of context.  The number of context lines is set by 'n' which
  1227      defaults to three.
  1228  
  1229      By default, the diff control lines (those with ---, +++, or @@) are
  1230      created with a trailing newline.  This is helpful so that inputs
  1231      created from file.readlines() result in diffs that are suitable for
  1232      file.writelines() since both the inputs and outputs have trailing
  1233      newlines.
  1234  
  1235      For inputs that do not have trailing newlines, set the lineterm
  1236      argument to "" so that the output will be uniformly newline free.
  1237  
  1238      The unidiff format normally has a header for filenames and modification
  1239      times.  Any or all of these may be specified using strings for
  1240      'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
  1241      The modification times are normally expressed in the ISO 8601 format.
  1242  
  1243      Example:
  1244  
  1245      >>> for line in unified_diff('one two three four'.split(),
  1246      ...             'zero one tree four'.split(), 'Original', 'Current',
  1247      ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
  1248      ...             lineterm=''):
  1249      ...     print line                  # doctest: +NORMALIZE_WHITESPACE
  1250      --- Original        2005-01-26 23:30:50
  1251      +++ Current         2010-04-02 10:20:52
  1252      @@ -1,4 +1,4 @@
  1253      +zero
  1254       one
  1255      -two
  1256      -three
  1257      +tree
  1258       four
  1259      """
  1260  
  1261      started = False
  1262      for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
  1263          if not started:
  1264              started = True
  1265              # fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
  1266              fromdate = '\t%s' % (fromfiledate) if fromfiledate else ''
  1267              # todate = '\t{}'.format(tofiledate) if tofiledate else ''
  1268              todate = '\t%s' % (tofiledate) if tofiledate else ''
  1269              # yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
  1270              yield '--- %s%s%s' % (fromfile, fromdate, lineterm)
  1271              # yield '+++ {}{}{}'.format(tofile, todate, lineterm)
  1272              yield '+++ %s%s%s' % (tofile, todate, lineterm)
  1273  
  1274          first, last = group[0], group[-1]
  1275          file1_range = _format_range_unified(first[1], last[2])
  1276          file2_range = _format_range_unified(first[3], last[4])
  1277          # yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
  1278          yield '@@ -%s +%s @@%s' % (file1_range, file2_range, lineterm)
  1279  
  1280          for tag, i1, i2, j1, j2 in group:
  1281              if tag == 'equal':
  1282                  for line in a[i1:i2]:
  1283                      yield ' ' + line
  1284                  continue
  1285              if tag in ('replace', 'delete'):
  1286                  for line in a[i1:i2]:
  1287                      yield '-' + line
  1288              if tag in ('replace', 'insert'):
  1289                  for line in b[j1:j2]:
  1290                      yield '+' + line
  1291  
  1292  
  1293  ########################################################################
  1294  ###  Context Diff
  1295  ########################################################################
  1296  
  1297  def _format_range_context(start, stop):
  1298      'Convert range to the "ed" format'
  1299      # Per the diff spec at http://www.unix.org/single_unix_specification/
  1300      beginning = start + 1     # lines start numbering with one
  1301      length = stop - start
  1302      if not length:
  1303          beginning -= 1        # empty ranges begin at line just before the range
  1304      if length <= 1:
  1305          # return '{}'.format(beginning)
  1306          return '%s' % (beginning)
  1307      # return '{},{}'.format(beginning, beginning + length - 1)
  1308      return '%s,%s' % (beginning, beginning + length - 1)
  1309  
  1310  # See http://www.unix.org/single_unix_specification/
  1311  def context_diff(a, b, fromfile='', tofile='',
  1312                   fromfiledate='', tofiledate='', n=3, lineterm='\n'):
  1313      r"""
  1314      Compare two sequences of lines; generate the delta as a context diff.
  1315  
  1316      Context diffs are a compact way of showing line changes and a few
  1317      lines of context.  The number of context lines is set by 'n' which
  1318      defaults to three.
  1319  
  1320      By default, the diff control lines (those with *** or ---) are
  1321      created with a trailing newline.  This is helpful so that inputs
  1322      created from file.readlines() result in diffs that are suitable for
  1323      file.writelines() since both the inputs and outputs have trailing
  1324      newlines.
  1325  
  1326      For inputs that do not have trailing newlines, set the lineterm
  1327      argument to "" so that the output will be uniformly newline free.
  1328  
  1329      The context diff format normally has a header for filenames and
  1330      modification times.  Any or all of these may be specified using
  1331      strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
  1332      The modification times are normally expressed in the ISO 8601 format.
  1333      If not specified, the strings default to blanks.
  1334  
  1335      Example:
  1336  
  1337      >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
  1338      ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')),
  1339      *** Original
  1340      --- Current
  1341      ***************
  1342      *** 1,4 ****
  1343        one
  1344      ! two
  1345      ! three
  1346        four
  1347      --- 1,4 ----
  1348      + zero
  1349        one
  1350      ! tree
  1351        four
  1352      """
  1353  
  1354      prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
  1355      started = False
  1356      for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
  1357          if not started:
  1358              started = True
  1359              # fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
  1360              fromdate = '\t%s' % (fromfiledate) if fromfiledate else ''
  1361              # todate = '\t{}'.format(tofiledate) if tofiledate else ''
  1362              todate = '\t%s' % (tofiledate) if tofiledate else ''
  1363              # yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
  1364              yield '*** %s%s%s' % (fromfile, fromdate, lineterm)
  1365              # yield '--- {}{}{}'.format(tofile, todate, lineterm)
  1366              yield '--- %s%s%s' % (tofile, todate, lineterm)
  1367  
  1368          first, last = group[0], group[-1]
  1369          yield '***************' + lineterm
  1370  
  1371          file1_range = _format_range_context(first[1], last[2])
  1372          # yield '*** {} ****{}'.format(file1_range, lineterm)
  1373          yield '*** %s ****%s' % (file1_range, lineterm)
  1374  
  1375          if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group):
  1376              for tag, i1, i2, _, _ in group:
  1377                  if tag != 'insert':
  1378                      for line in a[i1:i2]:
  1379                          yield prefix[tag] + line
  1380  
  1381          file2_range = _format_range_context(first[3], last[4])
  1382          # yield '--- {} ----{}'.format(file2_range, lineterm)
  1383          yield '--- %s ----%s' % (file2_range, lineterm)
  1384  
  1385          if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group):
  1386              for tag, _, _, j1, j2 in group:
  1387                  if tag != 'delete':
  1388                      for line in b[j1:j2]:
  1389                          yield prefix[tag] + line
  1390  
  1391  def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
  1392      r"""
  1393      Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
  1394  
  1395      Optional keyword parameters `linejunk` and `charjunk` are for filter
  1396      functions (or None):
  1397  
  1398      - linejunk: A function that should accept a single string argument, and
  1399        return true iff the string is junk.  The default is None, and is
  1400        recommended; as of Python 2.3, an adaptive notion of "noise" lines is
  1401        used that does a good job on its own.
  1402  
  1403      - charjunk: A function that should accept a string of length 1. The
  1404        default is module-level function IS_CHARACTER_JUNK, which filters out
  1405        whitespace characters (a blank or tab; note: bad idea to include newline
  1406        in this!).
  1407  
  1408      Tools/scripts/ndiff.py is a command-line front-end to this function.
  1409  
  1410      Example:
  1411  
  1412      >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
  1413      ...              'ore\ntree\nemu\n'.splitlines(1))
  1414      >>> print ''.join(diff),
  1415      - one
  1416      ?  ^
  1417      + ore
  1418      ?  ^
  1419      - two
  1420      - three
  1421      ?  -
  1422      + tree
  1423      + emu
  1424      """
  1425      return Differ(linejunk, charjunk).compare(a, b)
  1426  
  1427  def _mdiff(fromlines, tolines, context=None, linejunk=None,
  1428             charjunk=IS_CHARACTER_JUNK):
  1429      r"""Returns generator yielding marked up from/to side by side differences.
  1430  
  1431      Arguments:
  1432      fromlines -- list of text lines to compared to tolines
  1433      tolines -- list of text lines to be compared to fromlines
  1434      context -- number of context lines to display on each side of difference,
  1435                 if None, all from/to text lines will be generated.
  1436      linejunk -- passed on to ndiff (see ndiff documentation)
  1437      charjunk -- passed on to ndiff (see ndiff documentation)
  1438  
  1439      This function returns an iterator which returns a tuple:
  1440      (from line tuple, to line tuple, boolean flag)
  1441  
  1442      from/to line tuple -- (line num, line text)
  1443          line num -- integer or None (to indicate a context separation)
  1444          line text -- original line text with following markers inserted:
  1445              '\0+' -- marks start of added text
  1446              '\0-' -- marks start of deleted text
  1447              '\0^' -- marks start of changed text
  1448              '\1' -- marks end of added/deleted/changed text
  1449  
  1450      boolean flag -- None indicates context separation, True indicates
  1451          either "from" or "to" line contains a change, otherwise False.
  1452  
  1453      This function/iterator was originally developed to generate side by side
  1454      file difference for making HTML pages (see HtmlDiff class for example
  1455      usage).
  1456  
  1457      Note, this function utilizes the ndiff function to generate the side by
  1458      side difference markup.  Optional ndiff arguments may be passed to this
  1459      function and they in turn will be passed to ndiff.
  1460      """
  1461      import re
  1462  
  1463      # regular expression for finding intraline change indices
  1464      change_re = re.compile('(\++|\-+|\^+)')
  1465  
  1466      # create the difference iterator to generate the differences
  1467      diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
  1468  
  1469      def _make_line(lines, format_key, side, num_lines=[0,0]):
  1470          """Returns line of text with user's change markup and line formatting.
  1471  
  1472          lines -- list of lines from the ndiff generator to produce a line of
  1473                   text from.  When producing the line of text to return, the
  1474                   lines used are removed from this list.
  1475          format_key -- '+' return first line in list with "add" markup around
  1476                            the entire line.
  1477                        '-' return first line in list with "delete" markup around
  1478                            the entire line.
  1479                        '?' return first line in list with add/delete/change
  1480                            intraline markup (indices obtained from second line)
  1481                        None return first line in list with no markup
  1482          side -- indice into the num_lines list (0=from,1=to)
  1483          num_lines -- from/to current line number.  This is NOT intended to be a
  1484                       passed parameter.  It is present as a keyword argument to
  1485                       maintain memory of the current line numbers between calls
  1486                       of this function.
  1487  
  1488          Note, this function is purposefully not defined at the module scope so
  1489          that data it needs from its parent function (within whose context it
  1490          is defined) does not need to be of module scope.
  1491          """
  1492          num_lines[side] += 1
  1493          # Handle case where no user markup is to be added, just return line of
  1494          # text with user's line format to allow for usage of the line number.
  1495          if format_key is None:
  1496              return (num_lines[side],lines.pop(0)[2:])
  1497          # Handle case of intraline changes
  1498          if format_key == '?':
  1499              text, markers = lines.pop(0), lines.pop(0)
  1500              # find intraline changes (store change type and indices in tuples)
  1501              sub_info = []
  1502              def record_sub_info(match_object,sub_info=sub_info):
  1503                  sub_info.append([match_object.group(1)[0],match_object.span()])
  1504                  return match_object.group(1)
  1505              change_re.sub(record_sub_info,markers)
  1506              # process each tuple inserting our special marks that won't be
  1507              # noticed by an xml/html escaper.
  1508              for key,(begin,end) in sub_info[::-1]:
  1509                  text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
  1510              text = text[2:]
  1511          # Handle case of add/delete entire line
  1512          else:
  1513              text = lines.pop(0)[2:]
  1514              # if line of text is just a newline, insert a space so there is
  1515              # something for the user to highlight and see.
  1516              if not text:
  1517                  text = ' '
  1518              # insert marks that won't be noticed by an xml/html escaper.
  1519              text = '\0' + format_key + text + '\1'
  1520          # Return line of text, first allow user's line formatter to do its
  1521          # thing (such as adding the line number) then replace the special
  1522          # marks with what the user's change markup.
  1523          return (num_lines[side],text)
  1524  
  1525      def _line_iterator():
  1526          """Yields from/to lines of text with a change indication.
  1527  
  1528          This function is an iterator.  It itself pulls lines from a
  1529          differencing iterator, processes them and yields them.  When it can
  1530          it yields both a "from" and a "to" line, otherwise it will yield one
  1531          or the other.  In addition to yielding the lines of from/to text, a
  1532          boolean flag is yielded to indicate if the text line(s) have
  1533          differences in them.
  1534  
  1535          Note, this function is purposefully not defined at the module scope so
  1536          that data it needs from its parent function (within whose context it
  1537          is defined) does not need to be of module scope.
  1538          """
  1539          lines = []
  1540          num_blanks_pending, num_blanks_to_yield = 0, 0
  1541          while True:
  1542              # Load up next 4 lines so we can look ahead, create strings which
  1543              # are a concatenation of the first character of each of the 4 lines
  1544              # so we can do some very readable comparisons.
  1545              while len(lines) < 4:
  1546                  try:
  1547                      lines.append(diff_lines_iterator.next())
  1548                  except StopIteration:
  1549                      lines.append('X')
  1550              s = ''.join([line[0] for line in lines])
  1551              if s.startswith('X'):
  1552                  # When no more lines, pump out any remaining blank lines so the
  1553                  # corresponding add/delete lines get a matching blank line so
  1554                  # all line pairs get yielded at the next level.
  1555                  num_blanks_to_yield = num_blanks_pending
  1556              elif s.startswith('-?+?'):
  1557                  # simple intraline change
  1558                  yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
  1559                  continue
  1560              elif s.startswith('--++'):
  1561                  # in delete block, add block coming: we do NOT want to get
  1562                  # caught up on blank lines yet, just process the delete line
  1563                  num_blanks_pending -= 1
  1564                  yield _make_line(lines,'-',0), None, True
  1565                  continue
  1566              elif s.startswith(('--?+', '--+', '- ')):
  1567                  # in delete block and see an intraline change or unchanged line
  1568                  # coming: yield the delete line and then blanks
  1569                  from_line,to_line = _make_line(lines,'-',0), None
  1570                  num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
  1571              elif s.startswith('-+?'):
  1572                  # intraline change
  1573                  yield _make_line(lines,None,0), _make_line(lines,'?',1), True
  1574                  continue
  1575              elif s.startswith('-?+'):
  1576                  # intraline change
  1577                  yield _make_line(lines,'?',0), _make_line(lines,None,1), True
  1578                  continue
  1579              elif s.startswith('-'):
  1580                  # delete FROM line
  1581                  num_blanks_pending -= 1
  1582                  yield _make_line(lines,'-',0), None, True
  1583                  continue
  1584              elif s.startswith('+--'):
  1585                  # in add block, delete block coming: we do NOT want to get
  1586                  # caught up on blank lines yet, just process the add line
  1587                  num_blanks_pending += 1
  1588                  yield None, _make_line(lines,'+',1), True
  1589                  continue
  1590              elif s.startswith(('+ ', '+-')):
  1591                  # will be leaving an add block: yield blanks then add line
  1592                  from_line, to_line = None, _make_line(lines,'+',1)
  1593                  num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
  1594              elif s.startswith('+'):
  1595                  # inside an add block, yield the add line
  1596                  num_blanks_pending += 1
  1597                  yield None, _make_line(lines,'+',1), True
  1598                  continue
  1599              elif s.startswith(' '):
  1600                  # unchanged text, yield it to both sides
  1601                  yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
  1602                  continue
  1603              # Catch up on the blank lines so when we yield the next from/to
  1604              # pair, they are lined up.
  1605              while(num_blanks_to_yield < 0):
  1606                  num_blanks_to_yield += 1
  1607                  yield None,('','\n'),True
  1608              while(num_blanks_to_yield > 0):
  1609                  num_blanks_to_yield -= 1
  1610                  yield ('','\n'),None,True
  1611              if s.startswith('X'):
  1612                  raise StopIteration
  1613              else:
  1614                  yield from_line,to_line,True
  1615  
  1616      def _line_pair_iterator():
  1617          """Yields from/to lines of text with a change indication.
  1618  
  1619          This function is an iterator.  It itself pulls lines from the line
  1620          iterator.  Its difference from that iterator is that this function
  1621          always yields a pair of from/to text lines (with the change
  1622          indication).  If necessary it will collect single from/to lines
  1623          until it has a matching pair from/to pair to yield.
  1624  
  1625          Note, this function is purposefully not defined at the module scope so
  1626          that data it needs from its parent function (within whose context it
  1627          is defined) does not need to be of module scope.
  1628          """
  1629          line_iterator = _line_iterator()
  1630          fromlines,tolines=[],[]
  1631          while True:
  1632              # Collecting lines of text until we have a from/to pair
  1633              while (len(fromlines)==0 or len(tolines)==0):
  1634                  from_line, to_line, found_diff =line_iterator.next()
  1635                  if from_line is not None:
  1636                      fromlines.append((from_line,found_diff))
  1637                  if to_line is not None:
  1638                      tolines.append((to_line,found_diff))
  1639              # Once we have a pair, remove them from the collection and yield it
  1640              from_line, fromDiff = fromlines.pop(0)
  1641              to_line, to_diff = tolines.pop(0)
  1642              yield (from_line,to_line,fromDiff or to_diff)
  1643  
  1644      # Handle case where user does not want context differencing, just yield
  1645      # them up without doing anything else with them.
  1646      line_pair_iterator = _line_pair_iterator()
  1647      if context is None:
  1648          while True:
  1649              yield line_pair_iterator.next()
  1650      # Handle case where user wants context differencing.  We must do some
  1651      # storage of lines until we know for sure that they are to be yielded.
  1652      else:
  1653          context += 1
  1654          lines_to_write = 0
  1655          while True:
  1656              # Store lines up until we find a difference, note use of a
  1657              # circular queue because we only need to keep around what
  1658              # we need for context.
  1659              index, contextLines = 0, [None]*(context)
  1660              found_diff = False
  1661              while(found_diff is False):
  1662                  from_line, to_line, found_diff = line_pair_iterator.next()
  1663                  i = index % context
  1664                  contextLines[i] = (from_line, to_line, found_diff)
  1665                  index += 1
  1666              # Yield lines that we have collected so far, but first yield
  1667              # the user's separator.
  1668              if index > context:
  1669                  yield None, None, None
  1670                  lines_to_write = context
  1671              else:
  1672                  lines_to_write = index
  1673                  index = 0
  1674              while(lines_to_write):
  1675                  i = index % context
  1676                  index += 1
  1677                  yield contextLines[i]
  1678                  lines_to_write -= 1
  1679              # Now yield the context lines after the change
  1680              lines_to_write = context-1
  1681              while(lines_to_write):
  1682                  from_line, to_line, found_diff = line_pair_iterator.next()
  1683                  # If another change within the context, extend the context
  1684                  if found_diff:
  1685                      lines_to_write = context-1
  1686                  else:
  1687                      lines_to_write -= 1
  1688                  yield from_line, to_line, found_diff
  1689  
  1690  
  # Skeleton of a complete HTML page.  The %(styles)s, %(table)s and
  # %(legend)s placeholders are filled in by HtmlDiff.make_file().
  1691  _file_template = """
  1692  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  1693            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  1694  
  1695  <html>
  1696  
  1697  <head>
  1698      <meta http-equiv="Content-Type"
  1699            content="text/html; charset=ISO-8859-1" />
  1700      <title></title>
  1701      <style type="text/css">%(styles)s
  1702      </style>
  1703  </head>
  1704  
  1705  <body>
  1706      %(table)s%(legend)s
  1707  </body>
  1708  
  1709  </html>"""
  1710  
  # CSS rules for the classes used by the generated markup: the table
  # itself (table.diff), header cells (diff_header), the "next" link column
  # (diff_next) and the added/changed/deleted highlights (diff_add,
  # diff_chg, diff_sub).
  1711  _styles = """
  1712          table.diff {font-family:Courier; border:medium;}
  1713          .diff_header {background-color:#e0e0e0}
  1714          td.diff_header {text-align:right}
  1715          .diff_next {background-color:#c0c0c0}
  1716          .diff_add {background-color:#aaffaa}
  1717          .diff_chg {background-color:#ffff77}
  1718          .diff_sub {background-color:#ffaaaa}"""
  1719  
  # Markup of one side by side table.  Placeholders: %(prefix)s (makes the
  # table's "top" anchor id unique), %(header_row)s and %(data_rows)s.
  # NOTE(review): presumably filled in by HtmlDiff.make_table() -- confirm
  # against that method.
  1720  _table_template = """
  1721      <table class="diff" id="difflib_chg_%(prefix)s_top"
  1722             cellspacing="0" cellpadding="0" rules="groups" >
  1723          <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
  1724          <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
  1725          %(header_row)s
  1726          <tbody>
  1727  %(data_rows)s        </tbody>
  1728      </table>"""
  1729  
  # Static legend table (no placeholders) explaining the highlight colors
  # and the (f)irst / (n)ext / (t)op navigation links.
  1730  _legend = """
  1731      <table class="diff" summary="Legends">
  1732          <tr> <th colspan="2"> Legends </th> </tr>
  1733          <tr> <td> <table border="" summary="Colors">
  1734                        <tr><th> Colors </th> </tr>
  1735                        <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
  1736                        <tr><td class="diff_chg">Changed</td> </tr>
  1737                        <tr><td class="diff_sub">Deleted</td> </tr>
  1738                    </table></td>
  1739               <td> <table border="" summary="Links">
  1740                        <tr><th colspan="2"> Links </th> </tr>
  1741                        <tr><td>(f)irst change</td> </tr>
  1742                        <tr><td>(n)ext change</td> </tr>
  1743                        <tr><td>(t)op</td> </tr>
  1744                    </table></td> </tr>
  1745      </table>"""
  1746  
  1747  class HtmlDiff(object):
  1748      """For producing HTML side by side comparison with change highlights.
  1749  
  1750      This class can be used to create an HTML table (or a complete HTML file
  1751      containing the table) showing a side by side, line by line comparison
  1752      of text with inter-line and intra-line change highlights.  The table can
  1753      be generated in either full or contextual difference mode.
  1754  
  1755      The following methods are provided for HTML generation:
  1756  
  1757      make_table -- generates HTML for a single side by side table
  1758      make_file -- generates complete HTML file with a single side by side table
  1759  
  1760      See tools/scripts/diff.py for an example usage of this class.
  1761      """
  1762  
  1763      _file_template = _file_template
  1764      _styles = _styles
  1765      _table_template = _table_template
  1766      _legend = _legend
  1767      _default_prefix = 0
  1768  
  1769      def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
  1770                   charjunk=IS_CHARACTER_JUNK):
  1771          """HtmlDiff instance initializer
  1772  
  1773          Arguments:
  1774          tabsize -- tab stop spacing, defaults to 8.
  1775          wrapcolumn -- column number where lines are broken and wrapped,
  1776              defaults to None where lines are not wrapped.
  1777          linejunk,charjunk -- keyword arguments passed into ndiff() (used to by
  1778              HtmlDiff() to generate the side by side HTML differences).  See
  1779              ndiff() documentation for argument default values and descriptions.
  1780          """
  1781          self._tabsize = tabsize
  1782          self._wrapcolumn = wrapcolumn
  1783          self._linejunk = linejunk
  1784          self._charjunk = charjunk
  1785  
  1786      def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
  1787                    numlines=5):
  1788          """Returns HTML file of side by side comparison with change highlights
  1789  
  1790          Arguments:
  1791          fromlines -- list of "from" lines
  1792          tolines -- list of "to" lines
  1793          fromdesc -- "from" file column header string
  1794          todesc -- "to" file column header string
  1795          context -- set to True for contextual differences (defaults to False
  1796              which shows full differences).
  1797          numlines -- number of context lines.  When context is set True,
  1798              controls number of lines displayed before and after the change.
  1799              When context is False, controls the number of lines to place
  1800              the "next" link anchors before the next change (so click of
  1801              "next" link jumps to just before the change).
  1802          """
  1803  
  1804          return self._file_template % dict(
  1805              styles = self._styles,
  1806              legend = self._legend,
  1807              table = self.make_table(fromlines,tolines,fromdesc,todesc,
  1808                                      context=context,numlines=numlines))
  1809  
  1810      def _tab_newline_replace(self,fromlines,tolines):
  1811          """Returns from/to line lists with tabs expanded and newlines removed.
  1812  
  1813          Instead of tab characters being replaced by the number of spaces
  1814          needed to fill in to the next tab stop, this function will fill
  1815          the space with tab characters.  This is done so that the difference
  1816          algorithms can identify changes in a file when tabs are replaced by
  1817          spaces and vice versa.  At the end of the HTML generation, the tab
  1818          characters will be replaced with a nonbreakable space.
  1819          """
  1820          def expand_tabs(line):
  1821              # hide real spaces
  1822              line = line.replace(' ','\0')
  1823              # expand tabs into spaces
  1824              line = line.expandtabs(self._tabsize)
  1825              # replace spaces from expanded tabs back into tab characters
  1826              # (we'll replace them with markup after we do differencing)
  1827              line = line.replace(' ','\t')
  1828              return line.replace('\0',' ').rstrip('\n')
  1829          fromlines = [expand_tabs(line) for line in fromlines]
  1830          tolines = [expand_tabs(line) for line in tolines]
  1831          return fromlines,tolines
  1832  
  1833      def _split_line(self,data_list,line_num,text):
  1834          """Builds list of text lines by splitting text lines at wrap point
  1835  
  1836          This function will determine if the input text line needs to be
  1837          wrapped (split) into separate lines.  If so, the first wrap point
  1838          will be determined and the first line appended to the output
  1839          text line list.  This function is used recursively to handle
  1840          the second part of the split line to further split it.
  1841          """
  1842          # if blank line or context separator, just add it to the output list
  1843          if not line_num:
  1844              data_list.append((line_num,text))
  1845              return
  1846  
  1847          # if line text doesn't need wrapping, just add it to the output list
  1848          size = len(text)
  1849          max = self._wrapcolumn
  1850          if (size <= max) or ((size -(text.count('\0')*3)) <= max):
  1851              data_list.append((line_num,text))
  1852              return
  1853  
  1854          # scan text looking for the wrap point, keeping track if the wrap
  1855          # point is inside markers
  1856          i = 0
  1857          n = 0
  1858          mark = ''
  1859          while n < max and i < size:
  1860              if text[i] == '\0':
  1861                  i += 1
  1862                  mark = text[i]
  1863                  i += 1
  1864              elif text[i] == '\1':
  1865                  i += 1
  1866                  mark = ''
  1867              else:
  1868                  i += 1
  1869                  n += 1
  1870  
  1871          # wrap point is inside text, break it up into separate lines
  1872          line1 = text[:i]
  1873          line2 = text[i:]
  1874  
  1875          # if wrap point is inside markers, place end marker at end of first
  1876          # line and start marker at beginning of second line because each
  1877          # line will have its own table tag markup around it.
  1878          if mark:
  1879              line1 = line1 + '\1'
  1880              line2 = '\0' + mark + line2
  1881  
  1882          # tack on first line onto the output list
  1883          data_list.append((line_num,line1))
  1884  
  1885          # use this routine again to wrap the remaining text
  1886          self._split_line(data_list,'>',line2)
  1887  
  1888      def _line_wrapper(self,diffs):
  1889          """Returns iterator that splits (wraps) mdiff text lines"""
  1890  
  1891          # pull from/to data and flags from mdiff iterator
  1892          for fromdata,todata,flag in diffs:
  1893              # check for context separators and pass them through
  1894              if flag is None:
  1895                  yield fromdata,todata,flag
  1896                  continue
  1897              (fromline,fromtext),(toline,totext) = fromdata,todata
  1898              # for each from/to line split it at the wrap column to form
  1899              # list of text lines.
  1900              fromlist,tolist = [],[]
  1901              self._split_line(fromlist,fromline,fromtext)
  1902              self._split_line(tolist,toline,totext)
  1903              # yield from/to line in pairs inserting blank lines as
  1904              # necessary when one side has more wrapped lines
  1905              while fromlist or tolist:
  1906                  if fromlist:
  1907                      fromdata = fromlist.pop(0)
  1908                  else:
  1909                      fromdata = ('',' ')
  1910                  if tolist:
  1911                      todata = tolist.pop(0)
  1912                  else:
  1913                      todata = ('',' ')
  1914                  yield fromdata,todata,flag
  1915  
  1916      def _collect_lines(self,diffs):
  1917          """Collects mdiff output into separate lists
  1918  
  1919          Before storing the mdiff from/to data into a list, it is converted
  1920          into a single line of text with HTML markup.
  1921          """
  1922  
  1923          fromlist,tolist,flaglist = [],[],[]
  1924          # pull from/to data and flags from mdiff style iterator
  1925          for fromdata,todata,flag in diffs:
  1926              try:
  1927                  # store HTML markup of the lines into the lists
  1928                  fromlist.append(self._format_line(0,flag,*fromdata))
  1929                  tolist.append(self._format_line(1,flag,*todata))
  1930              except TypeError:
  1931                  # exceptions occur for lines where context separators go
  1932                  fromlist.append(None)
  1933                  tolist.append(None)
  1934              flaglist.append(flag)
  1935          return fromlist,tolist,flaglist
  1936  
  1937      def _format_line(self,side,flag,linenum,text):
  1938          """Returns HTML markup of "from" / "to" text lines
  1939  
  1940          side -- 0 or 1 indicating "from" or "to" text
  1941          flag -- indicates if difference on line
  1942          linenum -- line number (used for line number column)
  1943          text -- line text to be marked up
  1944          """
  1945          try:
  1946              linenum = '%d' % linenum
  1947              id = ' id="%s%s"' % (self._prefix[side],linenum)
  1948          except TypeError:
  1949              # handle blank lines where linenum is '>' or ''
  1950              id = ''
  1951          # replace those things that would get confused with HTML symbols
  1952          text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
  1953  
  1954          # make space non-breakable so they don't get compressed or line wrapped
  1955          text = text.replace(' ','&nbsp;').rstrip()
  1956  
  1957          return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
  1958                 % (id,linenum,text)
  1959  
  1960      def _make_prefix(self):
  1961          """Create unique anchor prefixes"""
  1962  
  1963          # Generate a unique anchor prefix so multiple tables
  1964          # can exist on the same HTML page without conflicts.
  1965          fromprefix = "from%d_" % HtmlDiff._default_prefix
  1966          toprefix = "to%d_" % HtmlDiff._default_prefix
  1967          HtmlDiff._default_prefix += 1
  1968          # store prefixes so line format method has access
  1969          self._prefix = [fromprefix,toprefix]
  1970  
  1971      def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
  1972          """Makes list of "next" links"""
  1973  
  1974          # all anchor names will be generated using the unique "to" prefix
  1975          toprefix = self._prefix[1]
  1976  
  1977          # process change flags, generating middle column of next anchors/links
  1978          next_id = ['']*len(flaglist)
  1979          next_href = ['']*len(flaglist)
  1980          num_chg, in_change = 0, False
  1981          last = 0
  1982          for i,flag in enumerate(flaglist):
  1983              if flag:
  1984                  if not in_change:
  1985                      in_change = True
  1986                      last = i
  1987                      # at the beginning of a change, drop an anchor a few lines
  1988                      # (the context lines) before the change for the previous
  1989                      # link
  1990                      i = max([0,i-numlines])
  1991                      next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
  1992                      # at the beginning of a change, drop a link to the next
  1993                      # change
  1994                      num_chg += 1
  1995                      next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
  1996                           toprefix,num_chg)
  1997              else:
  1998                  in_change = False
  1999          # check for cases where there is no content to avoid exceptions
  2000          if not flaglist:
  2001              flaglist = [False]
  2002              next_id = ['']
  2003              next_href = ['']
  2004              last = 0
  2005              if context:
  2006                  fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
  2007                  tolist = fromlist
  2008              else:
  2009                  fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
  2010          # if not a change on first line, drop a link
  2011          if not flaglist[0]:
  2012              next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
  2013          # redo the last link to link to the top
  2014          next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
  2015  
  2016          return fromlist,tolist,flaglist,next_href,next_id
  2017  
  2018      def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
  2019                     numlines=5):
  2020          """Returns HTML table of side by side comparison with change highlights
  2021  
  2022          Arguments:
  2023          fromlines -- list of "from" lines
  2024          tolines -- list of "to" lines
  2025          fromdesc -- "from" file column header string
  2026          todesc -- "to" file column header string
  2027          context -- set to True for contextual differences (defaults to False
  2028              which shows full differences).
  2029          numlines -- number of context lines.  When context is set True,
  2030              controls number of lines displayed before and after the change.
  2031              When context is False, controls the number of lines to place
  2032              the "next" link anchors before the next change (so click of
  2033              "next" link jumps to just before the change).
  2034          """
  2035  
  2036          # make unique anchor prefixes so that multiple tables may exist
  2037          # on the same page without conflict.
  2038          self._make_prefix()
  2039  
  2040          # change tabs to spaces before it gets more difficult after we insert
  2041          # markup
  2042          fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
  2043  
  2044          # create diffs iterator which generates side by side from/to data
  2045          if context:
  2046              context_lines = numlines
  2047          else:
  2048              context_lines = None
  2049          diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
  2050                        charjunk=self._charjunk)
  2051  
  2052          # set up iterator to wrap lines that exceed desired width
  2053          if self._wrapcolumn:
  2054              diffs = self._line_wrapper(diffs)
  2055  
  2056          # collect up from/to lines and flags into lists (also format the lines)
  2057          fromlist,tolist,flaglist = self._collect_lines(diffs)
  2058  
  2059          # process change flags, generating middle column of next anchors/links
  2060          fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
  2061              fromlist,tolist,flaglist,context,numlines)
  2062  
  2063          s = []
  2064          fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
  2065                '<td class="diff_next">%s</td>%s</tr>\n'
  2066          for i in range(len(flaglist)):
  2067              if flaglist[i] is None:
  2068                  # mdiff yields None on separator lines skip the bogus ones
  2069                  # generated for the first line
  2070                  if i > 0:
  2071                      s.append('        </tbody>        \n        <tbody>\n')
  2072              else:
  2073                  s.append( fmt % (next_id[i],next_href[i],fromlist[i],
  2074                                             next_href[i],tolist[i]))
  2075          if fromdesc or todesc:
  2076              header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
  2077                  '<th class="diff_next"><br /></th>',
  2078                  '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
  2079                  '<th class="diff_next"><br /></th>',
  2080                  '<th colspan="2" class="diff_header">%s</th>' % todesc)
  2081          else:
  2082              header_row = ''
  2083  
  2084          table = self._table_template % dict(
  2085              data_rows=''.join(s),
  2086              header_row=header_row,
  2087              prefix=self._prefix[1])
  2088  
  2089          return table.replace('\0+','<span class="diff_add">'). \
  2090                       replace('\0-','<span class="diff_sub">'). \
  2091                       replace('\0^','<span class="diff_chg">'). \
  2092                       replace('\1','</span>'). \
  2093                       replace('\t','&nbsp;')
  2094  
# Remove the name `re` from this module's namespace.
# NOTE(review): presumably `re` is imported earlier in this file and is only
# needed while the definitions above are being created -- confirm before
# adding any later code that refers to `re`.
del re
  2096  
  2097  def restore(delta, which):
  2098      r"""
  2099      Generate one of the two sequences that generated a delta.
  2100  
  2101      Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
  2102      lines originating from file 1 or 2 (parameter `which`), stripping off line
  2103      prefixes.
  2104  
  2105      Examples:
  2106  
  2107      >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
  2108      ...              'ore\ntree\nemu\n'.splitlines(1))
  2109      >>> diff = list(diff)
  2110      >>> print ''.join(restore(diff, 1)),
  2111      one
  2112      two
  2113      three
  2114      >>> print ''.join(restore(diff, 2)),
  2115      ore
  2116      tree
  2117      emu
  2118      """
  2119      try:
  2120          tag = {1: "- ", 2: "+ "}[int(which)]
  2121      except KeyError:
  2122          raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
  2123                             % which)
  2124      prefixes = ("  ", tag)
  2125      for line in delta:
  2126          if line[:2] in prefixes:
  2127              yield line[2:]
  2128  
  2129  # def _test():
  2130  #     import doctest, difflib
  2131  #     return doctest.testmod(difflib)
  2132  
  2133  # if __name__ == "__main__":
  2134  #     _test()