github.com/grumpyhome/grumpy@v0.3.1-0.20201208125205-7b775405bdf1/grumpy-runtime-src/third_party/stdlib/difflib.py (about) 1 """ 2 Module difflib -- helpers for computing deltas between objects. 3 4 Function get_close_matches(word, possibilities, n=3, cutoff=0.6): 5 Use SequenceMatcher to return list of the best "good enough" matches. 6 7 Function context_diff(a, b): 8 For two lists of strings, return a delta in context diff format. 9 10 Function ndiff(a, b): 11 Return a delta: the difference between `a` and `b` (lists of strings). 12 13 Function restore(delta, which): 14 Return one of the two sequences that generated an ndiff delta. 15 16 Function unified_diff(a, b): 17 For two lists of strings, return a delta in unified diff format. 18 19 Class SequenceMatcher: 20 A flexible class for comparing pairs of sequences of any type. 21 22 Class Differ: 23 For producing human-readable deltas from sequences of lines of text. 24 25 Class HtmlDiff: 26 For producing HTML side by side comparison with change highlights. 27 """ 28 29 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 30 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', 31 'unified_diff', 'HtmlDiff', 'Match'] 32 33 import heapq 34 # from collections import namedtuple as _namedtuple 35 # from functools import reduce 36 import functools 37 reduce = functools.reduce 38 39 import operator 40 _itemgetter = operator.itemgetter 41 _property = property 42 _tuple = tuple 43 44 def setdefault(d, k, default=None): 45 if k not in d: 46 d[k] = default 47 return d[k] 48 49 # Match = _namedtuple('Match', 'a b size') 50 class Match(tuple): 51 'Match(a, b, size)' 52 53 __slots__ = () 54 55 _fields = ('a', 'b', 'size') 56 57 def __new__(_cls, a, b, size): 58 'Create new instance of Match(a, b, size)' 59 return _tuple.__new__(_cls, (a, b, size)) 60 61 # @classmethod 62 def _make(cls, iterable, new=tuple.__new__, len=len): 63 'Make a new Match object from a sequence or iterable' 64 result = new(cls, iterable) 65 if len(result) != 3: 66 raise TypeError('Expected 3 arguments, got %d' % len(result)) 67 return result 68 _make = classmethod(_make) 69 70 def __repr__(self): 71 'Return a nicely formatted representation string' 72 return 'Match(a=%r, b=%r, size=%r)' % self 73 74 def _asdict(self): 75 'Return a new OrderedDict which maps field names to their values' 76 return OrderedDict(zip(self._fields, self)) 77 78 def _replace(_self, **kwds): 79 'Return a new Match object replacing specified fields with new values' 80 result = _self._make(map(kwds.pop, ('a', 'b', 'size'), _self)) 81 if kwds: 82 raise ValueError('Got unexpected field names: %r' % kwds.keys()) 83 return result 84 85 def __getnewargs__(self): 86 'Return self as a plain tuple. Used by copy and pickle.' 87 return tuple(self) 88 89 __dict__ = _property(_asdict) 90 91 def __getstate__(self): 92 'Exclude the OrderedDict from pickling' 93 pass 94 95 a = _property(_itemgetter(0), doc='Alias for field number 0') 96 97 b = _property(_itemgetter(1), doc='Alias for field number 1') 98 99 size = _property(_itemgetter(2), doc='Alias for field number 2') 100 101 def _calculate_ratio(matches, length): 102 if length: 103 return 2.0 * matches / length 104 return 1.0 105 106 class SequenceMatcher(object): 107 108 """ 109 SequenceMatcher is a flexible class for comparing pairs of sequences of 110 any type, so long as the sequence elements are hashable. The basic 111 algorithm predates, and is a little fancier than, an algorithm 112 published in the late 1980's by Ratcliff and Obershelp under the 113 hyperbolic name "gestalt pattern matching". The basic idea is to find 114 the longest contiguous matching subsequence that contains no "junk" 115 elements (R-O doesn't address junk). The same idea is then applied 116 recursively to the pieces of the sequences to the left and to the right 117 of the matching subsequence. This does not yield minimal edit 118 sequences, but does tend to yield matches that "look right" to people. 119 120 SequenceMatcher tries to compute a "human-friendly diff" between two 121 sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the 122 longest *contiguous* & junk-free matching subsequence. That's what 123 catches peoples' eyes. The Windows(tm) windiff has another interesting 124 notion, pairing up elements that appear uniquely in each sequence. 125 That, and the method here, appear to yield more intuitive difference 126 reports than does diff. This method appears to be the least vulnerable 127 to synching up on blocks of "junk lines", though (like blank lines in 128 ordinary text files, or maybe "<P>" lines in HTML files). That may be 129 because this is the only method of the 3 that has a *concept* of 130 "junk" <wink>. 131 132 Example, comparing two strings, and considering blanks to be "junk": 133 134 >>> s = SequenceMatcher(lambda x: x == " ", 135 ... "private Thread currentThread;", 136 ... "private volatile Thread currentThread;") 137 >>> 138 139 .ratio() returns a float in [0, 1], measuring the "similarity" of the 140 sequences. As a rule of thumb, a .ratio() value over 0.6 means the 141 sequences are close matches: 142 143 >>> print round(s.ratio(), 3) 144 0.866 145 >>> 146 147 If you're only interested in where the sequences match, 148 .get_matching_blocks() is handy: 149 150 >>> for block in s.get_matching_blocks(): 151 ... print "a[%d] and b[%d] match for %d elements" % block 152 a[0] and b[0] match for 8 elements 153 a[8] and b[17] match for 21 elements 154 a[29] and b[38] match for 0 elements 155 156 Note that the last tuple returned by .get_matching_blocks() is always a 157 dummy, (len(a), len(b), 0), and this is the only case in which the last 158 tuple element (number of elements matched) is 0. 159 160 If you want to know how to change the first sequence into the second, 161 use .get_opcodes(): 162 163 >>> for opcode in s.get_opcodes(): 164 ... print "%6s a[%d:%d] b[%d:%d]" % opcode 165 equal a[0:8] b[0:8] 166 insert a[8:8] b[8:17] 167 equal a[8:29] b[17:38] 168 169 See the Differ class for a fancy human-friendly file differencer, which 170 uses SequenceMatcher both to compare sequences of lines, and to compare 171 sequences of characters within similar (near-matching) lines. 172 173 See also function get_close_matches() in this module, which shows how 174 simple code building on SequenceMatcher can be used to do useful work. 175 176 Timing: Basic R-O is cubic time worst case and quadratic time expected 177 case. SequenceMatcher is quadratic time for the worst case and has 178 expected-case behavior dependent in a complicated way on how many 179 elements the sequences have in common; best case time is linear. 180 181 Methods: 182 183 __init__(isjunk=None, a='', b='') 184 Construct a SequenceMatcher. 185 186 set_seqs(a, b) 187 Set the two sequences to be compared. 188 189 set_seq1(a) 190 Set the first sequence to be compared. 191 192 set_seq2(b) 193 Set the second sequence to be compared. 194 195 find_longest_match(alo, ahi, blo, bhi) 196 Find longest matching block in a[alo:ahi] and b[blo:bhi]. 197 198 get_matching_blocks() 199 Return list of triples describing matching subsequences. 200 201 get_opcodes() 202 Return list of 5-tuples describing how to turn a into b. 203 204 ratio() 205 Return a measure of the sequences' similarity (float in [0,1]). 206 207 quick_ratio() 208 Return an upper bound on .ratio() relatively quickly. 209 210 real_quick_ratio() 211 Return an upper bound on ratio() very quickly. 212 """ 213 214 def __init__(self, isjunk=None, a='', b='', autojunk=True): 215 """Construct a SequenceMatcher. 216 217 Optional arg isjunk is None (the default), or a one-argument 218 function that takes a sequence element and returns true iff the 219 element is junk. None is equivalent to passing "lambda x: 0", i.e. 220 no elements are considered to be junk. For example, pass 221 lambda x: x in " \\t" 222 if you're comparing lines as sequences of characters, and don't 223 want to synch up on blanks or hard tabs. 224 225 Optional arg a is the first of two sequences to be compared. By 226 default, an empty string. The elements of a must be hashable. See 227 also .set_seqs() and .set_seq1(). 228 229 Optional arg b is the second of two sequences to be compared. By 230 default, an empty string. The elements of b must be hashable. See 231 also .set_seqs() and .set_seq2(). 232 233 Optional arg autojunk should be set to False to disable the 234 "automatic junk heuristic" that treats popular elements as junk 235 (see module documentation for more information). 236 """ 237 238 # Members: 239 # a 240 # first sequence 241 # b 242 # second sequence; differences are computed as "what do 243 # we need to do to 'a' to change it into 'b'?" 244 # b2j 245 # for x in b, b2j[x] is a list of the indices (into b) 246 # at which x appears; junk elements do not appear 247 # fullbcount 248 # for x in b, fullbcount[x] == the number of times x 249 # appears in b; only materialized if really needed (used 250 # only for computing quick_ratio()) 251 # matching_blocks 252 # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; 253 # ascending & non-overlapping in i and in j; terminated by 254 # a dummy (len(a), len(b), 0) sentinel 255 # opcodes 256 # a list of (tag, i1, i2, j1, j2) tuples, where tag is 257 # one of 258 # 'replace' a[i1:i2] should be replaced by b[j1:j2] 259 # 'delete' a[i1:i2] should be deleted 260 # 'insert' b[j1:j2] should be inserted 261 # 'equal' a[i1:i2] == b[j1:j2] 262 # isjunk 263 # a user-supplied function taking a sequence element and 264 # returning true iff the element is "junk" -- this has 265 # subtle but helpful effects on the algorithm, which I'll 266 # get around to writing up someday <0.9 wink>. 267 # DON'T USE! Only __chain_b uses this. Use isbjunk. 268 # isbjunk 269 # for x in b, isbjunk(x) == isjunk(x) but much faster; 270 # it's really the __contains__ method of a hidden dict. 271 # DOES NOT WORK for x in a! 272 # isbpopular 273 # for x in b, isbpopular(x) is true iff b is reasonably long 274 # (at least 200 elements) and x accounts for more than 1 + 1% of 275 # its elements (when autojunk is enabled). 276 # DOES NOT WORK for x in a! 277 278 self.isjunk = isjunk 279 self.a = self.b = None 280 self.autojunk = autojunk 281 self.set_seqs(a, b) 282 283 def set_seqs(self, a, b): 284 """Set the two sequences to be compared. 285 286 >>> s = SequenceMatcher() 287 >>> s.set_seqs("abcd", "bcde") 288 >>> s.ratio() 289 0.75 290 """ 291 292 self.set_seq1(a) 293 self.set_seq2(b) 294 295 def set_seq1(self, a): 296 """Set the first sequence to be compared. 297 298 The second sequence to be compared is not changed. 299 300 >>> s = SequenceMatcher(None, "abcd", "bcde") 301 >>> s.ratio() 302 0.75 303 >>> s.set_seq1("bcde") 304 >>> s.ratio() 305 1.0 306 >>> 307 308 SequenceMatcher computes and caches detailed information about the 309 second sequence, so if you want to compare one sequence S against 310 many sequences, use .set_seq2(S) once and call .set_seq1(x) 311 repeatedly for each of the other sequences. 312 313 See also set_seqs() and set_seq2(). 314 """ 315 316 if a is self.a: 317 return 318 self.a = a 319 self.matching_blocks = self.opcodes = None 320 321 def set_seq2(self, b): 322 """Set the second sequence to be compared. 323 324 The first sequence to be compared is not changed. 325 326 >>> s = SequenceMatcher(None, "abcd", "bcde") 327 >>> s.ratio() 328 0.75 329 >>> s.set_seq2("abcd") 330 >>> s.ratio() 331 1.0 332 >>> 333 334 SequenceMatcher computes and caches detailed information about the 335 second sequence, so if you want to compare one sequence S against 336 many sequences, use .set_seq2(S) once and call .set_seq1(x) 337 repeatedly for each of the other sequences. 338 339 See also set_seqs() and set_seq1(). 340 """ 341 342 if b is self.b: 343 return 344 self.b = b 345 self.matching_blocks = self.opcodes = None 346 self.fullbcount = None 347 self.__chain_b() 348 349 # For each element x in b, set b2j[x] to a list of the indices in 350 # b where x appears; the indices are in increasing order; note that 351 # the number of times x appears in b is len(b2j[x]) ... 352 # when self.isjunk is defined, junk elements don't show up in this 353 # map at all, which stops the central find_longest_match method 354 # from starting any matching block at a junk element ... 355 # also creates the fast isbjunk function ... 356 # b2j also does not contain entries for "popular" elements, meaning 357 # elements that account for more than 1 + 1% of the total elements, and 358 # when the sequence is reasonably large (>= 200 elements); this can 359 # be viewed as an adaptive notion of semi-junk, and yields an enormous 360 # speedup when, e.g., comparing program files with hundreds of 361 # instances of "return NULL;" ... 362 # note that this is only called when b changes; so for cross-product 363 # kinds of matches, it's best to call set_seq2 once, then set_seq1 364 # repeatedly 365 366 def __chain_b(self): 367 # Because isjunk is a user-defined (not C) function, and we test 368 # for junk a LOT, it's important to minimize the number of calls. 369 # Before the tricks described here, __chain_b was by far the most 370 # time-consuming routine in the whole module! If anyone sees 371 # Jim Roskind, thank him again for profile.py -- I never would 372 # have guessed that. 373 # The first trick is to build b2j ignoring the possibility 374 # of junk. I.e., we don't call isjunk at all yet. Throwing 375 # out the junk later is much cheaper than building b2j "right" 376 # from the start. 377 b = self.b 378 self.b2j = b2j = {} 379 380 for i, elt in enumerate(b): 381 indices = setdefault(b2j, elt, []) 382 # indices = b2j.setdefault(elt, []) 383 indices.append(i) 384 385 # Purge junk elements 386 junk = set() 387 isjunk = self.isjunk 388 if isjunk: 389 for elt in list(b2j.keys()): # using list() since b2j is modified 390 if isjunk(elt): 391 junk.add(elt) 392 del b2j[elt] 393 394 # Purge popular elements that are not junk 395 popular = set() 396 n = len(b) 397 if self.autojunk and n >= 200: 398 ntest = n // 100 + 1 399 for elt, idxs in list(b2j.items()): 400 if len(idxs) > ntest: 401 popular.add(elt) 402 del b2j[elt] 403 404 # Now for x in b, isjunk(x) == x in junk, but the latter is much faster. 405 # Sicne the number of *unique* junk elements is probably small, the 406 # memory burden of keeping this set alive is likely trivial compared to 407 # the size of b2j. 408 self.isbjunk = junk.__contains__ 409 self.isbpopular = popular.__contains__ 410 411 def find_longest_match(self, alo, ahi, blo, bhi): 412 """Find longest matching block in a[alo:ahi] and b[blo:bhi]. 413 414 If isjunk is not defined: 415 416 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where 417 alo <= i <= i+k <= ahi 418 blo <= j <= j+k <= bhi 419 and for all (i',j',k') meeting those conditions, 420 k >= k' 421 i <= i' 422 and if i == i', j <= j' 423 424 In other words, of all maximal matching blocks, return one that 425 starts earliest in a, and of all those maximal matching blocks that 426 start earliest in a, return the one that starts earliest in b. 427 428 >>> s = SequenceMatcher(None, " abcd", "abcd abcd") 429 >>> s.find_longest_match(0, 5, 0, 9) 430 Match(a=0, b=4, size=5) 431 432 If isjunk is defined, first the longest matching block is 433 determined as above, but with the additional restriction that no 434 junk element appears in the block. Then that block is extended as 435 far as possible by matching (only) junk elements on both sides. So 436 the resulting block never matches on junk except as identical junk 437 happens to be adjacent to an "interesting" match. 438 439 Here's the same example as before, but considering blanks to be 440 junk. That prevents " abcd" from matching the " abcd" at the tail 441 end of the second sequence directly. Instead only the "abcd" can 442 match, and matches the leftmost "abcd" in the second sequence: 443 444 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd") 445 >>> s.find_longest_match(0, 5, 0, 9) 446 Match(a=1, b=0, size=4) 447 448 If no blocks match, return (alo, blo, 0). 449 450 >>> s = SequenceMatcher(None, "ab", "c") 451 >>> s.find_longest_match(0, 2, 0, 1) 452 Match(a=0, b=0, size=0) 453 """ 454 455 # CAUTION: stripping common prefix or suffix would be incorrect. 456 # E.g., 457 # ab 458 # acab 459 # Longest matching block is "ab", but if common prefix is 460 # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so 461 # strip, so ends up claiming that ab is changed to acab by 462 # inserting "ca" in the middle. That's minimal but unintuitive: 463 # "it's obvious" that someone inserted "ac" at the front. 464 # Windiff ends up at the same place as diff, but by pairing up 465 # the unique 'b's and then matching the first two 'a's. 466 467 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk 468 besti, bestj, bestsize = alo, blo, 0 469 # find longest junk-free match 470 # during an iteration of the loop, j2len[j] = length of longest 471 # junk-free match ending with a[i-1] and b[j] 472 j2len = {} 473 nothing = [] 474 for i in xrange(alo, ahi): 475 # look at all instances of a[i] in b; note that because 476 # b2j has no junk keys, the loop is skipped if a[i] is junk 477 j2lenget = j2len.get 478 newj2len = {} 479 for j in b2j.get(a[i], nothing): 480 # a[i] matches b[j] 481 if j < blo: 482 continue 483 if j >= bhi: 484 break 485 k = newj2len[j] = j2lenget(j-1, 0) + 1 486 if k > bestsize: 487 besti, bestj, bestsize = i-k+1, j-k+1, k 488 j2len = newj2len 489 490 # Extend the best by non-junk elements on each end. In particular, 491 # "popular" non-junk elements aren't in b2j, which greatly speeds 492 # the inner loop above, but also means "the best" match so far 493 # doesn't contain any junk *or* popular non-junk elements. 494 while besti > alo and bestj > blo and \ 495 not isbjunk(b[bestj-1]) and \ 496 a[besti-1] == b[bestj-1]: 497 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 498 while besti+bestsize < ahi and bestj+bestsize < bhi and \ 499 not isbjunk(b[bestj+bestsize]) and \ 500 a[besti+bestsize] == b[bestj+bestsize]: 501 bestsize += 1 502 503 # Now that we have a wholly interesting match (albeit possibly 504 # empty!), we may as well suck up the matching junk on each 505 # side of it too. Can't think of a good reason not to, and it 506 # saves post-processing the (possibly considerable) expense of 507 # figuring out what to do with it. In the case of an empty 508 # interesting match, this is clearly the right thing to do, 509 # because no other kind of match is possible in the regions. 510 while besti > alo and bestj > blo and \ 511 isbjunk(b[bestj-1]) and \ 512 a[besti-1] == b[bestj-1]: 513 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 514 while besti+bestsize < ahi and bestj+bestsize < bhi and \ 515 isbjunk(b[bestj+bestsize]) and \ 516 a[besti+bestsize] == b[bestj+bestsize]: 517 bestsize = bestsize + 1 518 519 return Match(besti, bestj, bestsize) 520 521 def get_matching_blocks(self): 522 """Return list of triples describing matching subsequences. 523 524 Each triple is of the form (i, j, n), and means that 525 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in 526 i and in j. New in Python 2.5, it's also guaranteed that if 527 (i, j, n) and (i', j', n') are adjacent triples in the list, and 528 the second is not the last triple in the list, then i+n != i' or 529 j+n != j'. IOW, adjacent triples never describe adjacent equal 530 blocks. 531 532 The last triple is a dummy, (len(a), len(b), 0), and is the only 533 triple with n==0. 534 535 >>> s = SequenceMatcher(None, "abxcd", "abcd") 536 >>> s.get_matching_blocks() 537 [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] 538 """ 539 540 if self.matching_blocks is not None: 541 return self.matching_blocks 542 la, lb = len(self.a), len(self.b) 543 544 # This is most naturally expressed as a recursive algorithm, but 545 # at least one user bumped into extreme use cases that exceeded 546 # the recursion limit on their box. So, now we maintain a list 547 # ('queue`) of blocks we still need to look at, and append partial 548 # results to `matching_blocks` in a loop; the matches are sorted 549 # at the end. 550 queue = [(0, la, 0, lb)] 551 matching_blocks = [] 552 while queue: 553 alo, ahi, blo, bhi = queue.pop() 554 i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) 555 # a[alo:i] vs b[blo:j] unknown 556 # a[i:i+k] same as b[j:j+k] 557 # a[i+k:ahi] vs b[j+k:bhi] unknown 558 if k: # if k is 0, there was no matching block 559 matching_blocks.append(x) 560 if alo < i and blo < j: 561 queue.append((alo, i, blo, j)) 562 if i+k < ahi and j+k < bhi: 563 queue.append((i+k, ahi, j+k, bhi)) 564 matching_blocks.sort() 565 566 # It's possible that we have adjacent equal blocks in the 567 # matching_blocks list now. Starting with 2.5, this code was added 568 # to collapse them. 569 i1 = j1 = k1 = 0 570 non_adjacent = [] 571 for i2, j2, k2 in matching_blocks: 572 # Is this block adjacent to i1, j1, k1? 573 if i1 + k1 == i2 and j1 + k1 == j2: 574 # Yes, so collapse them -- this just increases the length of 575 # the first block by the length of the second, and the first 576 # block so lengthened remains the block to compare against. 577 k1 += k2 578 else: 579 # Not adjacent. Remember the first block (k1==0 means it's 580 # the dummy we started with), and make the second block the 581 # new block to compare against. 582 if k1: 583 non_adjacent.append((i1, j1, k1)) 584 i1, j1, k1 = i2, j2, k2 585 if k1: 586 non_adjacent.append((i1, j1, k1)) 587 588 non_adjacent.append( (la, lb, 0) ) 589 self.matching_blocks = map(Match._make, non_adjacent) 590 return self.matching_blocks 591 592 def get_opcodes(self): 593 """Return list of 5-tuples describing how to turn a into b. 594 595 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple 596 has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the 597 tuple preceding it, and likewise for j1 == the previous j2. 598 599 The tags are strings, with these meanings: 600 601 'replace': a[i1:i2] should be replaced by b[j1:j2] 602 'delete': a[i1:i2] should be deleted. 603 Note that j1==j2 in this case. 604 'insert': b[j1:j2] should be inserted at a[i1:i1]. 605 Note that i1==i2 in this case. 606 'equal': a[i1:i2] == b[j1:j2] 607 608 >>> a = "qabxcd" 609 >>> b = "abycdf" 610 >>> s = SequenceMatcher(None, a, b) 611 >>> for tag, i1, i2, j1, j2 in s.get_opcodes(): 612 ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % 613 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])) 614 delete a[0:1] (q) b[0:0] () 615 equal a[1:3] (ab) b[0:2] (ab) 616 replace a[3:4] (x) b[2:3] (y) 617 equal a[4:6] (cd) b[3:5] (cd) 618 insert a[6:6] () b[5:6] (f) 619 """ 620 621 if self.opcodes is not None: 622 return self.opcodes 623 i = j = 0 624 self.opcodes = answer = [] 625 for ai, bj, size in self.get_matching_blocks(): 626 # invariant: we've pumped out correct diffs to change 627 # a[:i] into b[:j], and the next matching block is 628 # a[ai:ai+size] == b[bj:bj+size]. So we need to pump 629 # out a diff to change a[i:ai] into b[j:bj], pump out 630 # the matching block, and move (i,j) beyond the match 631 tag = '' 632 if i < ai and j < bj: 633 tag = 'replace' 634 elif i < ai: 635 tag = 'delete' 636 elif j < bj: 637 tag = 'insert' 638 if tag: 639 answer.append( (tag, i, ai, j, bj) ) 640 i, j = ai+size, bj+size 641 # the list of matching blocks is terminated by a 642 # sentinel with size 0 643 if size: 644 answer.append( ('equal', ai, i, bj, j) ) 645 return answer 646 647 def get_grouped_opcodes(self, n=3): 648 """ Isolate change clusters by eliminating ranges with no changes. 649 650 Return a generator of groups with up to n lines of context. 651 Each group is in the same format as returned by get_opcodes(). 652 653 >>> from pprint import pprint 654 >>> a = map(str, range(1,40)) 655 >>> b = a[:] 656 >>> b[8:8] = ['i'] # Make an insertion 657 >>> b[20] += 'x' # Make a replacement 658 >>> b[23:28] = [] # Make a deletion 659 >>> b[30] += 'y' # Make another replacement 660 >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes())) 661 [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)], 662 [('equal', 16, 19, 17, 20), 663 ('replace', 19, 20, 20, 21), 664 ('equal', 20, 22, 21, 23), 665 ('delete', 22, 27, 23, 23), 666 ('equal', 27, 30, 23, 26)], 667 [('equal', 31, 34, 27, 30), 668 ('replace', 34, 35, 30, 31), 669 ('equal', 35, 38, 31, 34)]] 670 """ 671 672 codes = self.get_opcodes() 673 if not codes: 674 codes = [("equal", 0, 1, 0, 1)] 675 # Fixup leading and trailing groups if they show no changes. 676 if codes[0][0] == 'equal': 677 tag, i1, i2, j1, j2 = codes[0] 678 codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 679 if codes[-1][0] == 'equal': 680 tag, i1, i2, j1, j2 = codes[-1] 681 codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) 682 683 nn = n + n 684 group = [] 685 for tag, i1, i2, j1, j2 in codes: 686 # End the current group and start a new one whenever 687 # there is a large range with no changes. 688 if tag == 'equal' and i2-i1 > nn: 689 group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) 690 yield group 691 group = [] 692 i1, j1 = max(i1, i2-n), max(j1, j2-n) 693 group.append((tag, i1, i2, j1 ,j2)) 694 if group and not (len(group)==1 and group[0][0] == 'equal'): 695 yield group 696 697 def ratio(self): 698 """Return a measure of the sequences' similarity (float in [0,1]). 699 700 Where T is the total number of elements in both sequences, and 701 M is the number of matches, this is 2.0*M / T. 702 Note that this is 1 if the sequences are identical, and 0 if 703 they have nothing in common. 704 705 .ratio() is expensive to compute if you haven't already computed 706 .get_matching_blocks() or .get_opcodes(), in which case you may 707 want to try .quick_ratio() or .real_quick_ratio() first to get an 708 upper bound. 709 710 >>> s = SequenceMatcher(None, "abcd", "bcde") 711 >>> s.ratio() 712 0.75 713 >>> s.quick_ratio() 714 0.75 715 >>> s.real_quick_ratio() 716 1.0 717 """ 718 719 matches = reduce(lambda sum, triple: sum + triple[-1], 720 self.get_matching_blocks(), 0) 721 return _calculate_ratio(matches, len(self.a) + len(self.b)) 722 723 def quick_ratio(self): 724 """Return an upper bound on ratio() relatively quickly. 725 726 This isn't defined beyond that it is an upper bound on .ratio(), and 727 is faster to compute. 728 """ 729 730 # viewing a and b as multisets, set matches to the cardinality 731 # of their intersection; this counts the number of matches 732 # without regard to order, so is clearly an upper bound 733 if self.fullbcount is None: 734 self.fullbcount = fullbcount = {} 735 for elt in self.b: 736 fullbcount[elt] = fullbcount.get(elt, 0) + 1 737 fullbcount = self.fullbcount 738 # avail[x] is the number of times x appears in 'b' less the 739 # number of times we've seen it in 'a' so far ... kinda 740 avail = {} 741 availhas, matches = avail.__contains__, 0 742 for elt in self.a: 743 if availhas(elt): 744 numb = avail[elt] 745 else: 746 numb = fullbcount.get(elt, 0) 747 avail[elt] = numb - 1 748 if numb > 0: 749 matches = matches + 1 750 return _calculate_ratio(matches, len(self.a) + len(self.b)) 751 752 def real_quick_ratio(self): 753 """Return an upper bound on ratio() very quickly. 754 755 This isn't defined beyond that it is an upper bound on .ratio(), and 756 is faster to compute than either .ratio() or .quick_ratio(). 757 """ 758 759 la, lb = len(self.a), len(self.b) 760 # can't have more matches than the number of elements in the 761 # shorter sequence 762 return _calculate_ratio(min(la, lb), la + lb) 763 764 def get_close_matches(word, possibilities, n=3, cutoff=0.6): 765 """Use SequenceMatcher to return list of the best "good enough" matches. 766 767 word is a sequence for which close matches are desired (typically a 768 string). 769 770 possibilities is a list of sequences against which to match word 771 (typically a list of strings). 772 773 Optional arg n (default 3) is the maximum number of close matches to 774 return. n must be > 0. 775 776 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities 777 that don't score at least that similar to word are ignored. 778 779 The best (no more than n) matches among the possibilities are returned 780 in a list, sorted by similarity score, most similar first. 781 782 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"]) 783 ['apple', 'ape'] 784 >>> import keyword as _keyword 785 >>> get_close_matches("wheel", _keyword.kwlist) 786 ['while'] 787 >>> get_close_matches("apple", _keyword.kwlist) 788 [] 789 >>> get_close_matches("accept", _keyword.kwlist) 790 ['except'] 791 """ 792 793 if not n > 0: 794 raise ValueError("n must be > 0: %r" % (n,)) 795 if not 0.0 <= cutoff <= 1.0: 796 raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) 797 result = [] 798 s = SequenceMatcher() 799 s.set_seq2(word) 800 for x in possibilities: 801 s.set_seq1(x) 802 if s.real_quick_ratio() >= cutoff and \ 803 s.quick_ratio() >= cutoff and \ 804 s.ratio() >= cutoff: 805 result.append((s.ratio(), x)) 806 807 # Move the best scorers to head of list 808 result = heapq.nlargest(n, result) 809 # Strip scores for the best n matches 810 return [x for score, x in result] 811 812 def _count_leading(line, ch): 813 """ 814 Return number of `ch` characters at the start of `line`. 815 816 Example: 817 818 >>> _count_leading(' abc', ' ') 819 3 820 """ 821 822 i, n = 0, len(line) 823 while i < n and line[i] == ch: 824 i += 1 825 return i 826 827 class Differ(object): 828 r""" 829 Differ is a class for comparing sequences of lines of text, and 830 producing human-readable differences or deltas. Differ uses 831 SequenceMatcher both to compare sequences of lines, and to compare 832 sequences of characters within similar (near-matching) lines. 833 834 Each line of a Differ delta begins with a two-letter code: 835 836 '- ' line unique to sequence 1 837 '+ ' line unique to sequence 2 838 ' ' line common to both sequences 839 '? ' line not present in either input sequence 840 841 Lines beginning with '? ' attempt to guide the eye to intraline 842 differences, and were not present in either input sequence. These lines 843 can be confusing if the sequences contain tab characters. 844 845 Note that Differ makes no claim to produce a *minimal* diff. To the 846 contrary, minimal diffs are often counter-intuitive, because they synch 847 up anywhere possible, sometimes accidental matches 100 pages apart. 848 Restricting synch points to contiguous matches preserves some notion of 849 locality, at the occasional cost of producing a longer diff. 850 851 Example: Comparing two texts. 852 853 First we set up the texts, sequences of individual single-line strings 854 ending with newlines (such sequences can also be obtained from the 855 `readlines()` method of file-like objects): 856 857 >>> text1 = ''' 1. Beautiful is better than ugly. 858 ... 2. Explicit is better than implicit. 859 ... 3. Simple is better than complex. 860 ... 4. Complex is better than complicated. 861 ... '''.splitlines(1) 862 >>> len(text1) 863 4 864 >>> text1[0][-1] 865 '\n' 866 >>> text2 = ''' 1. Beautiful is better than ugly. 867 ... 3. Simple is better than complex. 868 ... 4. Complicated is better than complex. 869 ... 5. Flat is better than nested. 870 ... '''.splitlines(1) 871 872 Next we instantiate a Differ object: 873 874 >>> d = Differ() 875 876 Note that when instantiating a Differ object we may pass functions to 877 filter out line and character 'junk'. See Differ.__init__ for details. 878 879 Finally, we compare the two: 880 881 >>> result = list(d.compare(text1, text2)) 882 883 'result' is a list of strings, so let's pretty-print it: 884 885 >>> from pprint import pprint as _pprint 886 >>> _pprint(result) 887 [' 1. Beautiful is better than ugly.\n', 888 '- 2. Explicit is better than implicit.\n', 889 '- 3. Simple is better than complex.\n', 890 '+ 3. Simple is better than complex.\n', 891 '? ++\n', 892 '- 4. Complex is better than complicated.\n', 893 '? ^ ---- ^\n', 894 '+ 4. Complicated is better than complex.\n', 895 '? ++++ ^ ^\n', 896 '+ 5. Flat is better than nested.\n'] 897 898 As a single multi-line string it looks like this: 899 900 >>> print ''.join(result), 901 1. Beautiful is better than ugly. 902 - 2. Explicit is better than implicit. 903 - 3. Simple is better than complex. 904 + 3. Simple is better than complex. 905 ? ++ 906 - 4. Complex is better than complicated. 907 ? ^ ---- ^ 908 + 4. Complicated is better than complex. 909 ? ++++ ^ ^ 910 + 5. Flat is better than nested. 911 912 Methods: 913 914 __init__(linejunk=None, charjunk=None) 915 Construct a text differencer, with optional filters. 916 917 compare(a, b) 918 Compare two sequences of lines; generate the resulting delta. 919 """ 920 921 def __init__(self, linejunk=None, charjunk=None): 922 """ 923 Construct a text differencer, with optional filters. 924 925 The two optional keyword parameters are for filter functions: 926 927 - `linejunk`: A function that should accept a single string argument, 928 and return true iff the string is junk. The module-level function 929 `IS_LINE_JUNK` may be used to filter out lines without visible 930 characters, except for at most one splat ('#'). It is recommended 931 to leave linejunk None; as of Python 2.3, the underlying 932 SequenceMatcher class has grown an adaptive notion of "noise" lines 933 that's better than any static definition the author has ever been 934 able to craft. 935 936 - `charjunk`: A function that should accept a string of length 1. The 937 module-level function `IS_CHARACTER_JUNK` may be used to filter out 938 whitespace characters (a blank or tab; **note**: bad idea to include 939 newline in this!). Use of IS_CHARACTER_JUNK is recommended. 940 """ 941 942 self.linejunk = linejunk 943 self.charjunk = charjunk 944 945 def compare(self, a, b): 946 r""" 947 Compare two sequences of lines; generate the resulting delta. 948 949 Each sequence must contain individual single-line strings ending with 950 newlines. Such sequences can be obtained from the `readlines()` method 951 of file-like objects. The delta generated also consists of newline- 952 terminated strings, ready to be printed as-is via the writeline() 953 method of a file-like object. 954 955 Example: 956 957 >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1), 958 ... 'ore\ntree\nemu\n'.splitlines(1))), 959 - one 960 ? ^ 961 + ore 962 ? ^ 963 - two 964 - three 965 ? - 966 + tree 967 + emu 968 """ 969 970 cruncher = SequenceMatcher(self.linejunk, a, b) 971 for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): 972 if tag == 'replace': 973 g = self._fancy_replace(a, alo, ahi, b, blo, bhi) 974 elif tag == 'delete': 975 g = self._dump('-', a, alo, ahi) 976 elif tag == 'insert': 977 g = self._dump('+', b, blo, bhi) 978 elif tag == 'equal': 979 g = self._dump(' ', a, alo, ahi) 980 else: 981 raise ValueError, 'unknown tag %r' % (tag,) 982 983 for line in g: 984 yield line 985 986 def _dump(self, tag, x, lo, hi): 987 """Generate comparison results for a same-tagged range.""" 988 for i in xrange(lo, hi): 989 yield '%s %s' % (tag, x[i]) 990 991 def _plain_replace(self, a, alo, ahi, b, blo, bhi): 992 assert alo < ahi and blo < bhi 993 # dump the shorter block first -- reduces the burden on short-term 994 # memory if the blocks are of very different sizes 995 if bhi - blo < ahi - alo: 996 first = self._dump('+', b, blo, bhi) 997 second = self._dump('-', a, alo, ahi) 998 else: 999 first = self._dump('-', a, alo, ahi) 1000 second = self._dump('+', b, blo, bhi) 1001 1002 for g in first, second: 1003 for line in g: 1004 yield line 1005 1006 def _fancy_replace(self, a, alo, ahi, b, blo, bhi): 1007 r""" 1008 When replacing one block of lines with another, search the blocks 1009 for *similar* lines; the best-matching pair (if any) is used as a 1010 synch point, and intraline difference marking is done on the 1011 similar pair. Lots of work, but often worth it. 1012 1013 Example: 1014 1015 >>> d = Differ() 1016 >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1, 1017 ... ['abcdefGhijkl\n'], 0, 1) 1018 >>> print ''.join(results), 1019 - abcDefghiJkl 1020 ? ^ ^ ^ 1021 + abcdefGhijkl 1022 ? ^ ^ ^ 1023 """ 1024 1025 # don't synch up unless the lines have a similarity score of at 1026 # least cutoff; best_ratio tracks the best score seen so far 1027 best_ratio, cutoff = 0.74, 0.75 1028 cruncher = SequenceMatcher(self.charjunk) 1029 eqi, eqj = None, None # 1st indices of equal lines (if any) 1030 1031 # search for the pair that matches best without being identical 1032 # (identical lines must be junk lines, & we don't want to synch up 1033 # on junk -- unless we have to) 1034 for j in xrange(blo, bhi): 1035 bj = b[j] 1036 cruncher.set_seq2(bj) 1037 for i in xrange(alo, ahi): 1038 ai = a[i] 1039 if ai == bj: 1040 if eqi is None: 1041 eqi, eqj = i, j 1042 continue 1043 cruncher.set_seq1(ai) 1044 # computing similarity is expensive, so use the quick 1045 # upper bounds first -- have seen this speed up messy 1046 # compares by a factor of 3. 1047 # note that ratio() is only expensive to compute the first 1048 # time it's called on a sequence pair; the expensive part 1049 # of the computation is cached by cruncher 1050 if cruncher.real_quick_ratio() > best_ratio and \ 1051 cruncher.quick_ratio() > best_ratio and \ 1052 cruncher.ratio() > best_ratio: 1053 best_ratio, best_i, best_j = cruncher.ratio(), i, j 1054 if best_ratio < cutoff: 1055 # no non-identical "pretty close" pair 1056 if eqi is None: 1057 # no identical pair either -- treat it as a straight replace 1058 for line in self._plain_replace(a, alo, ahi, b, blo, bhi): 1059 yield line 1060 return 1061 # no close pair, but an identical pair -- synch up on that 1062 best_i, best_j, best_ratio = eqi, eqj, 1.0 1063 else: 1064 # there's a close pair, so forget the identical pair (if any) 1065 eqi = None 1066 1067 # a[best_i] very similar to b[best_j]; eqi is None iff they're not 1068 # identical 1069 1070 # pump out diffs from before the synch point 1071 for line in self._fancy_helper(a, alo, best_i, b, blo, best_j): 1072 yield line 1073 1074 # do intraline marking on the synch pair 1075 aelt, belt = a[best_i], b[best_j] 1076 if eqi is None: 1077 # pump out a '-', '?', '+', '?' quad for the synched lines 1078 atags = btags = "" 1079 cruncher.set_seqs(aelt, belt) 1080 for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes(): 1081 la, lb = ai2 - ai1, bj2 - bj1 1082 if tag == 'replace': 1083 atags += '^' * la 1084 btags += '^' * lb 1085 elif tag == 'delete': 1086 atags += '-' * la 1087 elif tag == 'insert': 1088 btags += '+' * lb 1089 elif tag == 'equal': 1090 atags += ' ' * la 1091 btags += ' ' * lb 1092 else: 1093 raise ValueError, 'unknown tag %r' % (tag,) 1094 for line in self._qformat(aelt, belt, atags, btags): 1095 yield line 1096 else: 1097 # the synch pair is identical 1098 yield ' ' + aelt 1099 1100 # pump out diffs from after the synch point 1101 for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi): 1102 yield line 1103 1104 def _fancy_helper(self, a, alo, ahi, b, blo, bhi): 1105 g = [] 1106 if alo < ahi: 1107 if blo < bhi: 1108 g = self._fancy_replace(a, alo, ahi, b, blo, bhi) 1109 else: 1110 g = self._dump('-', a, alo, ahi) 1111 elif blo < bhi: 1112 g = self._dump('+', b, blo, bhi) 1113 1114 for line in g: 1115 yield line 1116 1117 def _qformat(self, aline, bline, atags, btags): 1118 r""" 1119 Format "?" output and deal with leading tabs. 1120 1121 Example: 1122 1123 >>> d = Differ() 1124 >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n', 1125 ... ' ^ ^ ^ ', ' ^ ^ ^ ') 1126 >>> for line in results: print repr(line) 1127 ... 1128 '- \tabcDefghiJkl\n' 1129 '? \t ^ ^ ^\n' 1130 '+ \tabcdefGhijkl\n' 1131 '? \t ^ ^ ^\n' 1132 """ 1133 1134 # Can hurt, but will probably help most of the time. 1135 common = min(_count_leading(aline, "\t"), 1136 _count_leading(bline, "\t")) 1137 common = min(common, _count_leading(atags[:common], " ")) 1138 common = min(common, _count_leading(btags[:common], " ")) 1139 atags = atags[common:].rstrip() 1140 btags = btags[common:].rstrip() 1141 1142 yield "- " + aline 1143 if atags: 1144 yield "? %s%s\n" % ("\t" * common, atags) 1145 1146 yield "+ " + bline 1147 if btags: 1148 yield "? %s%s\n" % ("\t" * common, btags) 1149 1150 # With respect to junk, an earlier version of ndiff simply refused to 1151 # *start* a match with a junk element. The result was cases like this: 1152 # before: private Thread currentThread; 1153 # after: private volatile Thread currentThread; 1154 # If you consider whitespace to be junk, the longest contiguous match 1155 # not starting with junk is "e Thread currentThread". So ndiff reported 1156 # that "e volatil" was inserted between the 't' and the 'e' in "private". 1157 # While an accurate view, to people that's absurd. The current version 1158 # looks for matching blocks that are entirely junk-free, then extends the 1159 # longest one of those as far as possible but only with matching junk. 1160 # So now "currentThread" is matched, then extended to suck up the 1161 # preceding blank; then "private" is matched, and extended to suck up the 1162 # following blank; then "Thread" is matched; and finally ndiff reports 1163 # that "volatile " was inserted before "Thread". The only quibble 1164 # remaining is that perhaps it was really the case that " volatile" 1165 # was inserted after "private". I can live with that <wink>. 1166 1167 import re 1168 1169 def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match): 1170 r""" 1171 Return 1 for ignorable line: iff `line` is blank or contains a single '#'. 1172 1173 Examples: 1174 1175 >>> IS_LINE_JUNK('\n') 1176 True 1177 >>> IS_LINE_JUNK(' # \n') 1178 True 1179 >>> IS_LINE_JUNK('hello\n') 1180 False 1181 """ 1182 1183 return pat(line) is not None 1184 1185 def IS_CHARACTER_JUNK(ch, ws=" \t"): 1186 r""" 1187 Return 1 for ignorable character: iff `ch` is a space or tab. 1188 1189 Examples: 1190 1191 >>> IS_CHARACTER_JUNK(' ') 1192 True 1193 >>> IS_CHARACTER_JUNK('\t') 1194 True 1195 >>> IS_CHARACTER_JUNK('\n') 1196 False 1197 >>> IS_CHARACTER_JUNK('x') 1198 False 1199 """ 1200 1201 return ch in ws 1202 1203 1204 ######################################################################## 1205 ### Unified Diff 1206 ######################################################################## 1207 1208 def _format_range_unified(start, stop): 1209 'Convert range to the "ed" format' 1210 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1211 beginning = start + 1 # lines start numbering with one 1212 length = stop - start 1213 if length == 1: 1214 # return '{}'.format(beginning) 1215 return '%s' % (beginning) 1216 if not length: 1217 beginning -= 1 # empty ranges begin at line just before the range 1218 return '%s,%s' % (beginning, length) 1219 1220 def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', 1221 tofiledate='', n=3, lineterm='\n'): 1222 r""" 1223 Compare two sequences of lines; generate the delta as a unified diff. 1224 1225 Unified diffs are a compact way of showing line changes and a few 1226 lines of context. The number of context lines is set by 'n' which 1227 defaults to three. 1228 1229 By default, the diff control lines (those with ---, +++, or @@) are 1230 created with a trailing newline. This is helpful so that inputs 1231 created from file.readlines() result in diffs that are suitable for 1232 file.writelines() since both the inputs and outputs have trailing 1233 newlines. 1234 1235 For inputs that do not have trailing newlines, set the lineterm 1236 argument to "" so that the output will be uniformly newline free. 1237 1238 The unidiff format normally has a header for filenames and modification 1239 times. Any or all of these may be specified using strings for 1240 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1241 The modification times are normally expressed in the ISO 8601 format. 1242 1243 Example: 1244 1245 >>> for line in unified_diff('one two three four'.split(), 1246 ... 'zero one tree four'.split(), 'Original', 'Current', 1247 ... '2005-01-26 23:30:50', '2010-04-02 10:20:52', 1248 ... lineterm=''): 1249 ... print line # doctest: +NORMALIZE_WHITESPACE 1250 --- Original 2005-01-26 23:30:50 1251 +++ Current 2010-04-02 10:20:52 1252 @@ -1,4 +1,4 @@ 1253 +zero 1254 one 1255 -two 1256 -three 1257 +tree 1258 four 1259 """ 1260 1261 started = False 1262 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1263 if not started: 1264 started = True 1265 # fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1266 fromdate = '\t%s' % (fromfiledate) if fromfiledate else '' 1267 # todate = '\t{}'.format(tofiledate) if tofiledate else '' 1268 todate = '\t%s' % (tofiledate) if tofiledate else '' 1269 # yield '--- {}{}{}'.format(fromfile, fromdate, lineterm) 1270 yield '--- %s%s%s' % (fromfile, fromdate, lineterm) 1271 # yield '+++ {}{}{}'.format(tofile, todate, lineterm) 1272 yield '+++ %s%s%s' % (tofile, todate, lineterm) 1273 1274 first, last = group[0], group[-1] 1275 file1_range = _format_range_unified(first[1], last[2]) 1276 file2_range = _format_range_unified(first[3], last[4]) 1277 # yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm) 1278 yield '@@ -%s +%s @@%s' % (file1_range, file2_range, lineterm) 1279 1280 for tag, i1, i2, j1, j2 in group: 1281 if tag == 'equal': 1282 for line in a[i1:i2]: 1283 yield ' ' + line 1284 continue 1285 if tag in ('replace', 'delete'): 1286 for line in a[i1:i2]: 1287 yield '-' + line 1288 if tag in ('replace', 'insert'): 1289 for line in b[j1:j2]: 1290 yield '+' + line 1291 1292 1293 ######################################################################## 1294 ### Context Diff 1295 ######################################################################## 1296 1297 def _format_range_context(start, stop): 1298 'Convert range to the "ed" format' 1299 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1300 beginning = start + 1 # lines start numbering with one 1301 length = stop - start 1302 if not length: 1303 beginning -= 1 # empty ranges begin at line just before the range 1304 if length <= 1: 1305 # return '{}'.format(beginning) 1306 return '%s' % (beginning) 1307 # return '{},{}'.format(beginning, beginning + length - 1) 1308 return '%s,%s' % (beginning, beginning + length - 1) 1309 1310 # See http://www.unix.org/single_unix_specification/ 1311 def context_diff(a, b, fromfile='', tofile='', 1312 fromfiledate='', tofiledate='', n=3, lineterm='\n'): 1313 r""" 1314 Compare two sequences of lines; generate the delta as a context diff. 1315 1316 Context diffs are a compact way of showing line changes and a few 1317 lines of context. The number of context lines is set by 'n' which 1318 defaults to three. 1319 1320 By default, the diff control lines (those with *** or ---) are 1321 created with a trailing newline. This is helpful so that inputs 1322 created from file.readlines() result in diffs that are suitable for 1323 file.writelines() since both the inputs and outputs have trailing 1324 newlines. 1325 1326 For inputs that do not have trailing newlines, set the lineterm 1327 argument to "" so that the output will be uniformly newline free. 1328 1329 The context diff format normally has a header for filenames and 1330 modification times. Any or all of these may be specified using 1331 strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1332 The modification times are normally expressed in the ISO 8601 format. 1333 If not specified, the strings default to blanks. 1334 1335 Example: 1336 1337 >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1), 1338 ... 'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')), 1339 *** Original 1340 --- Current 1341 *************** 1342 *** 1,4 **** 1343 one 1344 ! two 1345 ! three 1346 four 1347 --- 1,4 ---- 1348 + zero 1349 one 1350 ! tree 1351 four 1352 """ 1353 1354 prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ') 1355 started = False 1356 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1357 if not started: 1358 started = True 1359 # fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1360 fromdate = '\t%s' % (fromfiledate) if fromfiledate else '' 1361 # todate = '\t{}'.format(tofiledate) if tofiledate else '' 1362 todate = '\t%s' % (tofiledate) if tofiledate else '' 1363 # yield '*** {}{}{}'.format(fromfile, fromdate, lineterm) 1364 yield '*** %s%s%s' % (fromfile, fromdate, lineterm) 1365 # yield '--- {}{}{}'.format(tofile, todate, lineterm) 1366 yield '--- %s%s%s' % (tofile, todate, lineterm) 1367 1368 first, last = group[0], group[-1] 1369 yield '***************' + lineterm 1370 1371 file1_range = _format_range_context(first[1], last[2]) 1372 # yield '*** {} ****{}'.format(file1_range, lineterm) 1373 yield '*** %s ****%s' % (file1_range, lineterm) 1374 1375 if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group): 1376 for tag, i1, i2, _, _ in group: 1377 if tag != 'insert': 1378 for line in a[i1:i2]: 1379 yield prefix[tag] + line 1380 1381 file2_range = _format_range_context(first[3], last[4]) 1382 # yield '--- {} ----{}'.format(file2_range, lineterm) 1383 yield '--- %s ----%s' % (file2_range, lineterm) 1384 1385 if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group): 1386 for tag, _, _, j1, j2 in group: 1387 if tag != 'delete': 1388 for line in b[j1:j2]: 1389 yield prefix[tag] + line 1390 1391 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): 1392 r""" 1393 Compare `a` and `b` (lists of strings); return a `Differ`-style delta. 1394 1395 Optional keyword parameters `linejunk` and `charjunk` are for filter 1396 functions (or None): 1397 1398 - linejunk: A function that should accept a single string argument, and 1399 return true iff the string is junk. The default is None, and is 1400 recommended; as of Python 2.3, an adaptive notion of "noise" lines is 1401 used that does a good job on its own. 1402 1403 - charjunk: A function that should accept a string of length 1. The 1404 default is module-level function IS_CHARACTER_JUNK, which filters out 1405 whitespace characters (a blank or tab; note: bad idea to include newline 1406 in this!). 1407 1408 Tools/scripts/ndiff.py is a command-line front-end to this function. 1409 1410 Example: 1411 1412 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), 1413 ... 'ore\ntree\nemu\n'.splitlines(1)) 1414 >>> print ''.join(diff), 1415 - one 1416 ? ^ 1417 + ore 1418 ? ^ 1419 - two 1420 - three 1421 ? - 1422 + tree 1423 + emu 1424 """ 1425 return Differ(linejunk, charjunk).compare(a, b) 1426 1427 def _mdiff(fromlines, tolines, context=None, linejunk=None, 1428 charjunk=IS_CHARACTER_JUNK): 1429 r"""Returns generator yielding marked up from/to side by side differences. 1430 1431 Arguments: 1432 fromlines -- list of text lines to compared to tolines 1433 tolines -- list of text lines to be compared to fromlines 1434 context -- number of context lines to display on each side of difference, 1435 if None, all from/to text lines will be generated. 1436 linejunk -- passed on to ndiff (see ndiff documentation) 1437 charjunk -- passed on to ndiff (see ndiff documentation) 1438 1439 This function returns an iterator which returns a tuple: 1440 (from line tuple, to line tuple, boolean flag) 1441 1442 from/to line tuple -- (line num, line text) 1443 line num -- integer or None (to indicate a context separation) 1444 line text -- original line text with following markers inserted: 1445 '\0+' -- marks start of added text 1446 '\0-' -- marks start of deleted text 1447 '\0^' -- marks start of changed text 1448 '\1' -- marks end of added/deleted/changed text 1449 1450 boolean flag -- None indicates context separation, True indicates 1451 either "from" or "to" line contains a change, otherwise False. 1452 1453 This function/iterator was originally developed to generate side by side 1454 file difference for making HTML pages (see HtmlDiff class for example 1455 usage). 1456 1457 Note, this function utilizes the ndiff function to generate the side by 1458 side difference markup. Optional ndiff arguments may be passed to this 1459 function and they in turn will be passed to ndiff. 1460 """ 1461 import re 1462 1463 # regular expression for finding intraline change indices 1464 change_re = re.compile('(\++|\-+|\^+)') 1465 1466 # create the difference iterator to generate the differences 1467 diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk) 1468 1469 def _make_line(lines, format_key, side, num_lines=[0,0]): 1470 """Returns line of text with user's change markup and line formatting. 1471 1472 lines -- list of lines from the ndiff generator to produce a line of 1473 text from. When producing the line of text to return, the 1474 lines used are removed from this list. 1475 format_key -- '+' return first line in list with "add" markup around 1476 the entire line. 1477 '-' return first line in list with "delete" markup around 1478 the entire line. 1479 '?' return first line in list with add/delete/change 1480 intraline markup (indices obtained from second line) 1481 None return first line in list with no markup 1482 side -- indice into the num_lines list (0=from,1=to) 1483 num_lines -- from/to current line number. This is NOT intended to be a 1484 passed parameter. It is present as a keyword argument to 1485 maintain memory of the current line numbers between calls 1486 of this function. 1487 1488 Note, this function is purposefully not defined at the module scope so 1489 that data it needs from its parent function (within whose context it 1490 is defined) does not need to be of module scope. 1491 """ 1492 num_lines[side] += 1 1493 # Handle case where no user markup is to be added, just return line of 1494 # text with user's line format to allow for usage of the line number. 1495 if format_key is None: 1496 return (num_lines[side],lines.pop(0)[2:]) 1497 # Handle case of intraline changes 1498 if format_key == '?': 1499 text, markers = lines.pop(0), lines.pop(0) 1500 # find intraline changes (store change type and indices in tuples) 1501 sub_info = [] 1502 def record_sub_info(match_object,sub_info=sub_info): 1503 sub_info.append([match_object.group(1)[0],match_object.span()]) 1504 return match_object.group(1) 1505 change_re.sub(record_sub_info,markers) 1506 # process each tuple inserting our special marks that won't be 1507 # noticed by an xml/html escaper. 1508 for key,(begin,end) in sub_info[::-1]: 1509 text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:] 1510 text = text[2:] 1511 # Handle case of add/delete entire line 1512 else: 1513 text = lines.pop(0)[2:] 1514 # if line of text is just a newline, insert a space so there is 1515 # something for the user to highlight and see. 1516 if not text: 1517 text = ' ' 1518 # insert marks that won't be noticed by an xml/html escaper. 1519 text = '\0' + format_key + text + '\1' 1520 # Return line of text, first allow user's line formatter to do its 1521 # thing (such as adding the line number) then replace the special 1522 # marks with what the user's change markup. 1523 return (num_lines[side],text) 1524 1525 def _line_iterator(): 1526 """Yields from/to lines of text with a change indication. 1527 1528 This function is an iterator. It itself pulls lines from a 1529 differencing iterator, processes them and yields them. When it can 1530 it yields both a "from" and a "to" line, otherwise it will yield one 1531 or the other. In addition to yielding the lines of from/to text, a 1532 boolean flag is yielded to indicate if the text line(s) have 1533 differences in them. 1534 1535 Note, this function is purposefully not defined at the module scope so 1536 that data it needs from its parent function (within whose context it 1537 is defined) does not need to be of module scope. 1538 """ 1539 lines = [] 1540 num_blanks_pending, num_blanks_to_yield = 0, 0 1541 while True: 1542 # Load up next 4 lines so we can look ahead, create strings which 1543 # are a concatenation of the first character of each of the 4 lines 1544 # so we can do some very readable comparisons. 1545 while len(lines) < 4: 1546 try: 1547 lines.append(diff_lines_iterator.next()) 1548 except StopIteration: 1549 lines.append('X') 1550 s = ''.join([line[0] for line in lines]) 1551 if s.startswith('X'): 1552 # When no more lines, pump out any remaining blank lines so the 1553 # corresponding add/delete lines get a matching blank line so 1554 # all line pairs get yielded at the next level. 1555 num_blanks_to_yield = num_blanks_pending 1556 elif s.startswith('-?+?'): 1557 # simple intraline change 1558 yield _make_line(lines,'?',0), _make_line(lines,'?',1), True 1559 continue 1560 elif s.startswith('--++'): 1561 # in delete block, add block coming: we do NOT want to get 1562 # caught up on blank lines yet, just process the delete line 1563 num_blanks_pending -= 1 1564 yield _make_line(lines,'-',0), None, True 1565 continue 1566 elif s.startswith(('--?+', '--+', '- ')): 1567 # in delete block and see an intraline change or unchanged line 1568 # coming: yield the delete line and then blanks 1569 from_line,to_line = _make_line(lines,'-',0), None 1570 num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0 1571 elif s.startswith('-+?'): 1572 # intraline change 1573 yield _make_line(lines,None,0), _make_line(lines,'?',1), True 1574 continue 1575 elif s.startswith('-?+'): 1576 # intraline change 1577 yield _make_line(lines,'?',0), _make_line(lines,None,1), True 1578 continue 1579 elif s.startswith('-'): 1580 # delete FROM line 1581 num_blanks_pending -= 1 1582 yield _make_line(lines,'-',0), None, True 1583 continue 1584 elif s.startswith('+--'): 1585 # in add block, delete block coming: we do NOT want to get 1586 # caught up on blank lines yet, just process the add line 1587 num_blanks_pending += 1 1588 yield None, _make_line(lines,'+',1), True 1589 continue 1590 elif s.startswith(('+ ', '+-')): 1591 # will be leaving an add block: yield blanks then add line 1592 from_line, to_line = None, _make_line(lines,'+',1) 1593 num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0 1594 elif s.startswith('+'): 1595 # inside an add block, yield the add line 1596 num_blanks_pending += 1 1597 yield None, _make_line(lines,'+',1), True 1598 continue 1599 elif s.startswith(' '): 1600 # unchanged text, yield it to both sides 1601 yield _make_line(lines[:],None,0),_make_line(lines,None,1),False 1602 continue 1603 # Catch up on the blank lines so when we yield the next from/to 1604 # pair, they are lined up. 1605 while(num_blanks_to_yield < 0): 1606 num_blanks_to_yield += 1 1607 yield None,('','\n'),True 1608 while(num_blanks_to_yield > 0): 1609 num_blanks_to_yield -= 1 1610 yield ('','\n'),None,True 1611 if s.startswith('X'): 1612 raise StopIteration 1613 else: 1614 yield from_line,to_line,True 1615 1616 def _line_pair_iterator(): 1617 """Yields from/to lines of text with a change indication. 1618 1619 This function is an iterator. It itself pulls lines from the line 1620 iterator. Its difference from that iterator is that this function 1621 always yields a pair of from/to text lines (with the change 1622 indication). If necessary it will collect single from/to lines 1623 until it has a matching pair from/to pair to yield. 1624 1625 Note, this function is purposefully not defined at the module scope so 1626 that data it needs from its parent function (within whose context it 1627 is defined) does not need to be of module scope. 1628 """ 1629 line_iterator = _line_iterator() 1630 fromlines,tolines=[],[] 1631 while True: 1632 # Collecting lines of text until we have a from/to pair 1633 while (len(fromlines)==0 or len(tolines)==0): 1634 from_line, to_line, found_diff =line_iterator.next() 1635 if from_line is not None: 1636 fromlines.append((from_line,found_diff)) 1637 if to_line is not None: 1638 tolines.append((to_line,found_diff)) 1639 # Once we have a pair, remove them from the collection and yield it 1640 from_line, fromDiff = fromlines.pop(0) 1641 to_line, to_diff = tolines.pop(0) 1642 yield (from_line,to_line,fromDiff or to_diff) 1643 1644 # Handle case where user does not want context differencing, just yield 1645 # them up without doing anything else with them. 1646 line_pair_iterator = _line_pair_iterator() 1647 if context is None: 1648 while True: 1649 yield line_pair_iterator.next() 1650 # Handle case where user wants context differencing. We must do some 1651 # storage of lines until we know for sure that they are to be yielded. 1652 else: 1653 context += 1 1654 lines_to_write = 0 1655 while True: 1656 # Store lines up until we find a difference, note use of a 1657 # circular queue because we only need to keep around what 1658 # we need for context. 1659 index, contextLines = 0, [None]*(context) 1660 found_diff = False 1661 while(found_diff is False): 1662 from_line, to_line, found_diff = line_pair_iterator.next() 1663 i = index % context 1664 contextLines[i] = (from_line, to_line, found_diff) 1665 index += 1 1666 # Yield lines that we have collected so far, but first yield 1667 # the user's separator. 1668 if index > context: 1669 yield None, None, None 1670 lines_to_write = context 1671 else: 1672 lines_to_write = index 1673 index = 0 1674 while(lines_to_write): 1675 i = index % context 1676 index += 1 1677 yield contextLines[i] 1678 lines_to_write -= 1 1679 # Now yield the context lines after the change 1680 lines_to_write = context-1 1681 while(lines_to_write): 1682 from_line, to_line, found_diff = line_pair_iterator.next() 1683 # If another change within the context, extend the context 1684 if found_diff: 1685 lines_to_write = context-1 1686 else: 1687 lines_to_write -= 1 1688 yield from_line, to_line, found_diff 1689 1690 1691 _file_template = """ 1692 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 1693 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 1694 1695 <html> 1696 1697 <head> 1698 <meta http-equiv="Content-Type" 1699 content="text/html; charset=ISO-8859-1" /> 1700 <title></title> 1701 <style type="text/css">%(styles)s 1702 </style> 1703 </head> 1704 1705 <body> 1706 %(table)s%(legend)s 1707 </body> 1708 1709 </html>""" 1710 1711 _styles = """ 1712 table.diff {font-family:Courier; border:medium;} 1713 .diff_header {background-color:#e0e0e0} 1714 td.diff_header {text-align:right} 1715 .diff_next {background-color:#c0c0c0} 1716 .diff_add {background-color:#aaffaa} 1717 .diff_chg {background-color:#ffff77} 1718 .diff_sub {background-color:#ffaaaa}""" 1719 1720 _table_template = """ 1721 <table class="diff" id="difflib_chg_%(prefix)s_top" 1722 cellspacing="0" cellpadding="0" rules="groups" > 1723 <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> 1724 <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> 1725 %(header_row)s 1726 <tbody> 1727 %(data_rows)s </tbody> 1728 </table>""" 1729 1730 _legend = """ 1731 <table class="diff" summary="Legends"> 1732 <tr> <th colspan="2"> Legends </th> </tr> 1733 <tr> <td> <table border="" summary="Colors"> 1734 <tr><th> Colors </th> </tr> 1735 <tr><td class="diff_add"> Added </td></tr> 1736 <tr><td class="diff_chg">Changed</td> </tr> 1737 <tr><td class="diff_sub">Deleted</td> </tr> 1738 </table></td> 1739 <td> <table border="" summary="Links"> 1740 <tr><th colspan="2"> Links </th> </tr> 1741 <tr><td>(f)irst change</td> </tr> 1742 <tr><td>(n)ext change</td> </tr> 1743 <tr><td>(t)op</td> </tr> 1744 </table></td> </tr> 1745 </table>""" 1746 1747 class HtmlDiff(object): 1748 """For producing HTML side by side comparison with change highlights. 1749 1750 This class can be used to create an HTML table (or a complete HTML file 1751 containing the table) showing a side by side, line by line comparison 1752 of text with inter-line and intra-line change highlights. The table can 1753 be generated in either full or contextual difference mode. 1754 1755 The following methods are provided for HTML generation: 1756 1757 make_table -- generates HTML for a single side by side table 1758 make_file -- generates complete HTML file with a single side by side table 1759 1760 See tools/scripts/diff.py for an example usage of this class. 1761 """ 1762 1763 _file_template = _file_template 1764 _styles = _styles 1765 _table_template = _table_template 1766 _legend = _legend 1767 _default_prefix = 0 1768 1769 def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None, 1770 charjunk=IS_CHARACTER_JUNK): 1771 """HtmlDiff instance initializer 1772 1773 Arguments: 1774 tabsize -- tab stop spacing, defaults to 8. 1775 wrapcolumn -- column number where lines are broken and wrapped, 1776 defaults to None where lines are not wrapped. 1777 linejunk,charjunk -- keyword arguments passed into ndiff() (used to by 1778 HtmlDiff() to generate the side by side HTML differences). See 1779 ndiff() documentation for argument default values and descriptions. 1780 """ 1781 self._tabsize = tabsize 1782 self._wrapcolumn = wrapcolumn 1783 self._linejunk = linejunk 1784 self._charjunk = charjunk 1785 1786 def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False, 1787 numlines=5): 1788 """Returns HTML file of side by side comparison with change highlights 1789 1790 Arguments: 1791 fromlines -- list of "from" lines 1792 tolines -- list of "to" lines 1793 fromdesc -- "from" file column header string 1794 todesc -- "to" file column header string 1795 context -- set to True for contextual differences (defaults to False 1796 which shows full differences). 1797 numlines -- number of context lines. When context is set True, 1798 controls number of lines displayed before and after the change. 1799 When context is False, controls the number of lines to place 1800 the "next" link anchors before the next change (so click of 1801 "next" link jumps to just before the change). 1802 """ 1803 1804 return self._file_template % dict( 1805 styles = self._styles, 1806 legend = self._legend, 1807 table = self.make_table(fromlines,tolines,fromdesc,todesc, 1808 context=context,numlines=numlines)) 1809 1810 def _tab_newline_replace(self,fromlines,tolines): 1811 """Returns from/to line lists with tabs expanded and newlines removed. 1812 1813 Instead of tab characters being replaced by the number of spaces 1814 needed to fill in to the next tab stop, this function will fill 1815 the space with tab characters. This is done so that the difference 1816 algorithms can identify changes in a file when tabs are replaced by 1817 spaces and vice versa. At the end of the HTML generation, the tab 1818 characters will be replaced with a nonbreakable space. 1819 """ 1820 def expand_tabs(line): 1821 # hide real spaces 1822 line = line.replace(' ','\0') 1823 # expand tabs into spaces 1824 line = line.expandtabs(self._tabsize) 1825 # replace spaces from expanded tabs back into tab characters 1826 # (we'll replace them with markup after we do differencing) 1827 line = line.replace(' ','\t') 1828 return line.replace('\0',' ').rstrip('\n') 1829 fromlines = [expand_tabs(line) for line in fromlines] 1830 tolines = [expand_tabs(line) for line in tolines] 1831 return fromlines,tolines 1832 1833 def _split_line(self,data_list,line_num,text): 1834 """Builds list of text lines by splitting text lines at wrap point 1835 1836 This function will determine if the input text line needs to be 1837 wrapped (split) into separate lines. If so, the first wrap point 1838 will be determined and the first line appended to the output 1839 text line list. This function is used recursively to handle 1840 the second part of the split line to further split it. 1841 """ 1842 # if blank line or context separator, just add it to the output list 1843 if not line_num: 1844 data_list.append((line_num,text)) 1845 return 1846 1847 # if line text doesn't need wrapping, just add it to the output list 1848 size = len(text) 1849 max = self._wrapcolumn 1850 if (size <= max) or ((size -(text.count('\0')*3)) <= max): 1851 data_list.append((line_num,text)) 1852 return 1853 1854 # scan text looking for the wrap point, keeping track if the wrap 1855 # point is inside markers 1856 i = 0 1857 n = 0 1858 mark = '' 1859 while n < max and i < size: 1860 if text[i] == '\0': 1861 i += 1 1862 mark = text[i] 1863 i += 1 1864 elif text[i] == '\1': 1865 i += 1 1866 mark = '' 1867 else: 1868 i += 1 1869 n += 1 1870 1871 # wrap point is inside text, break it up into separate lines 1872 line1 = text[:i] 1873 line2 = text[i:] 1874 1875 # if wrap point is inside markers, place end marker at end of first 1876 # line and start marker at beginning of second line because each 1877 # line will have its own table tag markup around it. 1878 if mark: 1879 line1 = line1 + '\1' 1880 line2 = '\0' + mark + line2 1881 1882 # tack on first line onto the output list 1883 data_list.append((line_num,line1)) 1884 1885 # use this routine again to wrap the remaining text 1886 self._split_line(data_list,'>',line2) 1887 1888 def _line_wrapper(self,diffs): 1889 """Returns iterator that splits (wraps) mdiff text lines""" 1890 1891 # pull from/to data and flags from mdiff iterator 1892 for fromdata,todata,flag in diffs: 1893 # check for context separators and pass them through 1894 if flag is None: 1895 yield fromdata,todata,flag 1896 continue 1897 (fromline,fromtext),(toline,totext) = fromdata,todata 1898 # for each from/to line split it at the wrap column to form 1899 # list of text lines. 1900 fromlist,tolist = [],[] 1901 self._split_line(fromlist,fromline,fromtext) 1902 self._split_line(tolist,toline,totext) 1903 # yield from/to line in pairs inserting blank lines as 1904 # necessary when one side has more wrapped lines 1905 while fromlist or tolist: 1906 if fromlist: 1907 fromdata = fromlist.pop(0) 1908 else: 1909 fromdata = ('',' ') 1910 if tolist: 1911 todata = tolist.pop(0) 1912 else: 1913 todata = ('',' ') 1914 yield fromdata,todata,flag 1915 1916 def _collect_lines(self,diffs): 1917 """Collects mdiff output into separate lists 1918 1919 Before storing the mdiff from/to data into a list, it is converted 1920 into a single line of text with HTML markup. 1921 """ 1922 1923 fromlist,tolist,flaglist = [],[],[] 1924 # pull from/to data and flags from mdiff style iterator 1925 for fromdata,todata,flag in diffs: 1926 try: 1927 # store HTML markup of the lines into the lists 1928 fromlist.append(self._format_line(0,flag,*fromdata)) 1929 tolist.append(self._format_line(1,flag,*todata)) 1930 except TypeError: 1931 # exceptions occur for lines where context separators go 1932 fromlist.append(None) 1933 tolist.append(None) 1934 flaglist.append(flag) 1935 return fromlist,tolist,flaglist 1936 1937 def _format_line(self,side,flag,linenum,text): 1938 """Returns HTML markup of "from" / "to" text lines 1939 1940 side -- 0 or 1 indicating "from" or "to" text 1941 flag -- indicates if difference on line 1942 linenum -- line number (used for line number column) 1943 text -- line text to be marked up 1944 """ 1945 try: 1946 linenum = '%d' % linenum 1947 id = ' id="%s%s"' % (self._prefix[side],linenum) 1948 except TypeError: 1949 # handle blank lines where linenum is '>' or '' 1950 id = '' 1951 # replace those things that would get confused with HTML symbols 1952 text=text.replace("&","&").replace(">",">").replace("<","<") 1953 1954 # make space non-breakable so they don't get compressed or line wrapped 1955 text = text.replace(' ',' ').rstrip() 1956 1957 return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \ 1958 % (id,linenum,text) 1959 1960 def _make_prefix(self): 1961 """Create unique anchor prefixes""" 1962 1963 # Generate a unique anchor prefix so multiple tables 1964 # can exist on the same HTML page without conflicts. 1965 fromprefix = "from%d_" % HtmlDiff._default_prefix 1966 toprefix = "to%d_" % HtmlDiff._default_prefix 1967 HtmlDiff._default_prefix += 1 1968 # store prefixes so line format method has access 1969 self._prefix = [fromprefix,toprefix] 1970 1971 def _convert_flags(self,fromlist,tolist,flaglist,context,numlines): 1972 """Makes list of "next" links""" 1973 1974 # all anchor names will be generated using the unique "to" prefix 1975 toprefix = self._prefix[1] 1976 1977 # process change flags, generating middle column of next anchors/links 1978 next_id = ['']*len(flaglist) 1979 next_href = ['']*len(flaglist) 1980 num_chg, in_change = 0, False 1981 last = 0 1982 for i,flag in enumerate(flaglist): 1983 if flag: 1984 if not in_change: 1985 in_change = True 1986 last = i 1987 # at the beginning of a change, drop an anchor a few lines 1988 # (the context lines) before the change for the previous 1989 # link 1990 i = max([0,i-numlines]) 1991 next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg) 1992 # at the beginning of a change, drop a link to the next 1993 # change 1994 num_chg += 1 1995 next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % ( 1996 toprefix,num_chg) 1997 else: 1998 in_change = False 1999 # check for cases where there is no content to avoid exceptions 2000 if not flaglist: 2001 flaglist = [False] 2002 next_id = [''] 2003 next_href = [''] 2004 last = 0 2005 if context: 2006 fromlist = ['<td></td><td> No Differences Found </td>'] 2007 tolist = fromlist 2008 else: 2009 fromlist = tolist = ['<td></td><td> Empty File </td>'] 2010 # if not a change on first line, drop a link 2011 if not flaglist[0]: 2012 next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix 2013 # redo the last link to link to the top 2014 next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix) 2015 2016 return fromlist,tolist,flaglist,next_href,next_id 2017 2018 def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False, 2019 numlines=5): 2020 """Returns HTML table of side by side comparison with change highlights 2021 2022 Arguments: 2023 fromlines -- list of "from" lines 2024 tolines -- list of "to" lines 2025 fromdesc -- "from" file column header string 2026 todesc -- "to" file column header string 2027 context -- set to True for contextual differences (defaults to False 2028 which shows full differences). 2029 numlines -- number of context lines. When context is set True, 2030 controls number of lines displayed before and after the change. 2031 When context is False, controls the number of lines to place 2032 the "next" link anchors before the next change (so click of 2033 "next" link jumps to just before the change). 2034 """ 2035 2036 # make unique anchor prefixes so that multiple tables may exist 2037 # on the same page without conflict. 2038 self._make_prefix() 2039 2040 # change tabs to spaces before it gets more difficult after we insert 2041 # markup 2042 fromlines,tolines = self._tab_newline_replace(fromlines,tolines) 2043 2044 # create diffs iterator which generates side by side from/to data 2045 if context: 2046 context_lines = numlines 2047 else: 2048 context_lines = None 2049 diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk, 2050 charjunk=self._charjunk) 2051 2052 # set up iterator to wrap lines that exceed desired width 2053 if self._wrapcolumn: 2054 diffs = self._line_wrapper(diffs) 2055 2056 # collect up from/to lines and flags into lists (also format the lines) 2057 fromlist,tolist,flaglist = self._collect_lines(diffs) 2058 2059 # process change flags, generating middle column of next anchors/links 2060 fromlist,tolist,flaglist,next_href,next_id = self._convert_flags( 2061 fromlist,tolist,flaglist,context,numlines) 2062 2063 s = [] 2064 fmt = ' <tr><td class="diff_next"%s>%s</td>%s' + \ 2065 '<td class="diff_next">%s</td>%s</tr>\n' 2066 for i in range(len(flaglist)): 2067 if flaglist[i] is None: 2068 # mdiff yields None on separator lines skip the bogus ones 2069 # generated for the first line 2070 if i > 0: 2071 s.append(' </tbody> \n <tbody>\n') 2072 else: 2073 s.append( fmt % (next_id[i],next_href[i],fromlist[i], 2074 next_href[i],tolist[i])) 2075 if fromdesc or todesc: 2076 header_row = '<thead><tr>%s%s%s%s</tr></thead>' % ( 2077 '<th class="diff_next"><br /></th>', 2078 '<th colspan="2" class="diff_header">%s</th>' % fromdesc, 2079 '<th class="diff_next"><br /></th>', 2080 '<th colspan="2" class="diff_header">%s</th>' % todesc) 2081 else: 2082 header_row = '' 2083 2084 table = self._table_template % dict( 2085 data_rows=''.join(s), 2086 header_row=header_row, 2087 prefix=self._prefix[1]) 2088 2089 return table.replace('\0+','<span class="diff_add">'). \ 2090 replace('\0-','<span class="diff_sub">'). \ 2091 replace('\0^','<span class="diff_chg">'). \ 2092 replace('\1','</span>'). \ 2093 replace('\t',' ') 2094 2095 del re 2096 2097 def restore(delta, which): 2098 r""" 2099 Generate one of the two sequences that generated a delta. 2100 2101 Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract 2102 lines originating from file 1 or 2 (parameter `which`), stripping off line 2103 prefixes. 2104 2105 Examples: 2106 2107 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), 2108 ... 'ore\ntree\nemu\n'.splitlines(1)) 2109 >>> diff = list(diff) 2110 >>> print ''.join(restore(diff, 1)), 2111 one 2112 two 2113 three 2114 >>> print ''.join(restore(diff, 2)), 2115 ore 2116 tree 2117 emu 2118 """ 2119 try: 2120 tag = {1: "- ", 2: "+ "}[int(which)] 2121 except KeyError: 2122 raise ValueError, ('unknown delta choice (must be 1 or 2): %r' 2123 % which) 2124 prefixes = (" ", tag) 2125 for line in delta: 2126 if line[:2] in prefixes: 2127 yield line[2:] 2128 2129 # def _test(): 2130 # import doctest, difflib 2131 # return doctest.testmod(difflib) 2132 2133 # if __name__ == "__main__": 2134 # _test()