github.com/blend/go-sdk@v1.20220411.3/diff/match_patch.go (about) 1 /* 2 3 Copyright (c) 2022 - Present. Blend Labs, Inc. All rights reserved 4 Use of this source code is governed by a MIT license that can be found in the LICENSE file. 5 6 */ 7 8 package diff 9 10 import ( 11 "math" 12 "regexp" 13 "strconv" 14 "strings" 15 "time" 16 "unicode/utf8" 17 ) 18 19 // New creates a new MatchPatch object with default parameters. 20 func New() *MatchPatch { 21 // Defaults. 22 return &MatchPatch{ 23 Timeout: time.Second, 24 EditCost: 4, 25 MatchThreshold: 0.5, 26 MatchDistance: 1000, 27 PatchDeleteThreshold: 0.5, 28 PatchMargin: 4, 29 MatchMaxBits: 32, 30 } 31 } 32 33 // MatchPatch holds the configuration for diff-match-patch operations. 34 type MatchPatch struct { 35 // Number of seconds to map a diff before giving up (0 for infinity). 36 Timeout time.Duration 37 // Cost of an empty edit operation in terms of edit characters. 38 EditCost int 39 // How far to search for a match (0 = exact location, 1000+ = broad match). A match this many characters away from the expected location will add 1.0 to the score (0.0 is a perfect match). 40 MatchDistance int 41 // When deleting a large block of text (over ~64 characters), how close do the contents have to be to match the expected contents. (0.0 = perfection, 1.0 = very loose). Note that MatchThreshold controls how closely the end points of a delete need to match. 42 PatchDeleteThreshold float64 43 // Chunk size for context length. 44 PatchMargin int 45 // The number of bits in an int. 46 MatchMaxBits int 47 // At what point is no match declared (0.0 = perfection, 1.0 = very loose). 48 MatchThreshold float64 49 } 50 51 // Diff finds the differences between two texts. 52 // 53 // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. 54 // 55 // `checklines` indicates if we should do a line level diff, or treat the text as an atomic unit. 56 func (dmp *MatchPatch) Diff(text1, text2 string, checklines bool) []Diff { 57 return dmp.DiffRunes([]rune(text1), []rune(text2), checklines) 58 } 59 60 // DiffRunes finds the differences between two rune sequences. 61 // 62 // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. 63 // 64 // `checklines` indicates if we should do a line level diff, or treat the text as an atomic unit. 65 func (dmp *MatchPatch) DiffRunes(text1, text2 []rune, checklines bool) []Diff { 66 var deadline time.Time 67 if dmp.Timeout > 0 { 68 deadline = time.Now().Add(dmp.Timeout) 69 } 70 return dmp.diffMainRunes(text1, text2, checklines, deadline) 71 } 72 73 func (dmp *MatchPatch) diffMainRunes(text1, text2 []rune, checklines bool, deadline time.Time) []Diff { 74 if runesEqual(text1, text2) { 75 var diffs []Diff 76 if len(text1) > 0 { 77 diffs = append(diffs, Diff{DiffEqual, string(text1)}) 78 } 79 return diffs 80 } 81 82 // Trim off common prefix (speedup). 83 commonlength := commonPrefixLength(text1, text2) 84 commonprefix := text1[:commonlength] 85 text1 = text1[commonlength:] 86 text2 = text2[commonlength:] 87 88 // Trim off common suffix (speedup). 89 commonlength = commonSuffixLength(text1, text2) 90 commonsuffix := text1[len(text1)-commonlength:] 91 text1 = text1[:len(text1)-commonlength] 92 text2 = text2[:len(text2)-commonlength] 93 94 // Compute the diff on the middle block. 95 diffs := dmp.diffCompute(text1, text2, checklines, deadline) 96 97 // Restore the prefix and suffix. 98 if len(commonprefix) != 0 { 99 diffs = append([]Diff{{DiffEqual, string(commonprefix)}}, diffs...) 100 } 101 if len(commonsuffix) != 0 { 102 diffs = append(diffs, Diff{DiffEqual, string(commonsuffix)}) 103 } 104 105 return dmp.diffCleanupMerge(diffs) 106 } 107 108 // diffCompute finds the differences between two rune slices. Assumes that the texts do not have any common prefix or suffix. 109 func (dmp *MatchPatch) diffCompute(text1, text2 []rune, checklines bool, deadline time.Time) []Diff { 110 diffs := []Diff{} 111 if len(text1) == 0 { 112 // Just add some text (speedup). 113 return append(diffs, Diff{DiffInsert, string(text2)}) 114 } else if len(text2) == 0 { 115 // Just delete some text (speedup). 116 return append(diffs, Diff{DiffDelete, string(text1)}) 117 } 118 119 var longtext, shorttext []rune 120 if len(text1) > len(text2) { 121 longtext = text1 122 shorttext = text2 123 } else { 124 longtext = text2 125 shorttext = text1 126 } 127 128 if i := runesIndex(longtext, shorttext); i != -1 { 129 op := DiffInsert 130 // Swap insertions for deletions if diff is reversed. 131 if len(text1) > len(text2) { 132 op = DiffDelete 133 } 134 // Shorter text is inside the longer text (speedup). 135 return []Diff{ 136 {op, string(longtext[:i])}, 137 {DiffEqual, string(shorttext)}, 138 {op, string(longtext[i+len(shorttext):])}, 139 } 140 } else if len(shorttext) == 1 { 141 // Single character string. 142 // After the previous speedup, the character can't be an equality. 143 return []Diff{ 144 {DiffDelete, string(text1)}, 145 {DiffInsert, string(text2)}, 146 } 147 // Check to see if the problem can be split in two. 148 } else if hm := dmp.diffHalfMatch(text1, text2); hm != nil { 149 // A half-match was found, sort out the return data. 150 text1A := hm[0] 151 text1B := hm[1] 152 text2A := hm[2] 153 text2B := hm[3] 154 midCommon := hm[4] 155 // Send both pairs off for separate processing. 156 diffsA := dmp.diffMainRunes(text1A, text2A, checklines, deadline) 157 diffsB := dmp.diffMainRunes(text1B, text2B, checklines, deadline) 158 // Merge the results. 159 diffs := diffsA 160 diffs = append(diffs, Diff{DiffEqual, string(midCommon)}) 161 diffs = append(diffs, diffsB...) 162 return diffs 163 } else if checklines && len(text1) > 100 && len(text2) > 100 { 164 return dmp.diffLineMode(text1, text2, deadline) 165 } 166 return dmp.diffBisectRunes(text1, text2, deadline) 167 } 168 169 // diffLineMode does a quick line-level diff on both []runes, then rediff the parts for greater accuracy. This speedup can produce non-minimal diffs. 170 func (dmp *MatchPatch) diffLineMode(text1, text2 []rune, deadline time.Time) []Diff { 171 // Scan the text on a line-by-line basis first. 172 text1, text2, linearray := dmp.diffLinesToRunes(string(text1), string(text2)) 173 174 diffs := dmp.diffMainRunes(text1, text2, false, deadline) 175 176 // Convert the diff back to original text. 177 diffs = dmp.diffCharsToLines(diffs, linearray) 178 // Eliminate freak matches (e.g. blank lines) 179 diffs = dmp.diffCleanupSemantic(diffs) 180 181 // Rediff any replacement blocks, this time character-by-character. 182 // Add a dummy entry at the end. 183 diffs = append(diffs, Diff{DiffEqual, ""}) 184 185 pointer := 0 186 countDelete := 0 187 countInsert := 0 188 189 // NOTE: Rune slices are slower than using strings in this case. 190 textDelete := "" 191 textInsert := "" 192 193 for pointer < len(diffs) { 194 switch diffs[pointer].Type { 195 case DiffInsert: 196 countInsert++ 197 textInsert += diffs[pointer].Text 198 case DiffDelete: 199 countDelete++ 200 textDelete += diffs[pointer].Text 201 case DiffEqual: 202 // Upon reaching an equality, check for prior redundancies. 203 if countDelete >= 1 && countInsert >= 1 { 204 // Delete the offending records and add the merged ones. 205 diffs = splice(diffs, pointer-countDelete-countInsert, 206 countDelete+countInsert) 207 208 pointer = pointer - countDelete - countInsert 209 a := dmp.diffMainRunes([]rune(textDelete), []rune(textInsert), false, deadline) 210 for j := len(a) - 1; j >= 0; j-- { 211 diffs = splice(diffs, pointer, 0, a[j]) 212 } 213 pointer = pointer + len(a) 214 } 215 216 countInsert = 0 217 countDelete = 0 218 textDelete = "" 219 textInsert = "" 220 } 221 pointer++ 222 } 223 224 return diffs[:len(diffs)-1] // Remove the dummy entry at the end. 225 } 226 227 // diffBisect finds the 'middle snake' of a diff, split the problem in two and return the recursively constructed diff. 228 // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. 229 // See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. 230 func (dmp *MatchPatch) diffBisect(text1, text2 string, deadline time.Time) []Diff { 231 // Unused in this code, but retained for interface compatibility. 232 return dmp.diffBisectRunes([]rune(text1), []rune(text2), deadline) 233 } 234 235 // diffBisect finds the 'middle snake' of a diff, splits the problem in two and returns the recursively constructed diff. 236 // See Myers's 1986 paper: An O(ND) Difference Algorithm and Its Variations. 237 func (dmp *MatchPatch) diffBisectRunes(runes1, runes2 []rune, deadline time.Time) []Diff { 238 // Cache the text lengths to prevent multiple calls. 239 runes1Len, runes2Len := len(runes1), len(runes2) 240 241 maxD := (runes1Len + runes2Len + 1) / 2 242 vOffset := maxD 243 vLength := 2 * maxD 244 245 v1 := make([]int, vLength) 246 v2 := make([]int, vLength) 247 for i := range v1 { 248 v1[i] = -1 249 v2[i] = -1 250 } 251 v1[vOffset+1] = 0 252 v2[vOffset+1] = 0 253 254 delta := runes1Len - runes2Len 255 // If the total number of characters is odd, then the front path will collide with the reverse path. 256 front := (delta%2 != 0) 257 // Offsets for start and end of k loop. Prevents mapping of space beyond the grid. 258 k1start := 0 259 k1end := 0 260 k2start := 0 261 k2end := 0 262 for d := 0; d < maxD; d++ { 263 // Bail out if deadline is reached. 264 if !deadline.IsZero() && d%16 == 0 && time.Now().After(deadline) { 265 break 266 } 267 268 // Walk the front path one step. 269 for k1 := -d + k1start; k1 <= d-k1end; k1 += 2 { 270 k1Offset := vOffset + k1 271 var x1 int 272 273 if k1 == -d || (k1 != d && v1[k1Offset-1] < v1[k1Offset+1]) { 274 x1 = v1[k1Offset+1] 275 } else { 276 x1 = v1[k1Offset-1] + 1 277 } 278 279 y1 := x1 - k1 280 for x1 < runes1Len && y1 < runes2Len { 281 if runes1[x1] != runes2[y1] { 282 break 283 } 284 x1++ 285 y1++ 286 } 287 v1[k1Offset] = x1 288 if x1 > runes1Len { 289 // Ran off the right of the graph. 290 k1end += 2 291 } else if y1 > runes2Len { 292 // Ran off the bottom of the graph. 293 k1start += 2 294 } else if front { 295 k2Offset := vOffset + delta - k1 296 if k2Offset >= 0 && k2Offset < vLength && v2[k2Offset] != -1 { 297 // Mirror x2 onto top-left coordinate system. 298 x2 := runes1Len - v2[k2Offset] 299 if x1 >= x2 { 300 // Overlap detected. 301 return dmp.diffBisectSplit(runes1, runes2, x1, y1, deadline) 302 } 303 } 304 } 305 } 306 // Walk the reverse path one step. 307 for k2 := -d + k2start; k2 <= d-k2end; k2 += 2 { 308 k2Offset := vOffset + k2 309 var x2 int 310 if k2 == -d || (k2 != d && v2[k2Offset-1] < v2[k2Offset+1]) { 311 x2 = v2[k2Offset+1] 312 } else { 313 x2 = v2[k2Offset-1] + 1 314 } 315 var y2 = x2 - k2 316 for x2 < runes1Len && y2 < runes2Len { 317 if runes1[runes1Len-x2-1] != runes2[runes2Len-y2-1] { 318 break 319 } 320 x2++ 321 y2++ 322 } 323 v2[k2Offset] = x2 324 if x2 > runes1Len { 325 // Ran off the left of the graph. 326 k2end += 2 327 } else if y2 > runes2Len { 328 // Ran off the top of the graph. 329 k2start += 2 330 } else if !front { 331 k1Offset := vOffset + delta - k2 332 if k1Offset >= 0 && k1Offset < vLength && v1[k1Offset] != -1 { 333 x1 := v1[k1Offset] 334 y1 := vOffset + x1 - k1Offset 335 // Mirror x2 onto top-left coordinate system. 336 x2 = runes1Len - x2 337 if x1 >= x2 { 338 // Overlap detected. 339 return dmp.diffBisectSplit(runes1, runes2, x1, y1, deadline) 340 } 341 } 342 } 343 } 344 } 345 // Diff took too long and hit the deadline or number of diffs equals number of characters, no commonality at all. 346 return []Diff{ 347 {DiffDelete, string(runes1)}, 348 {DiffInsert, string(runes2)}, 349 } 350 } 351 352 func (dmp *MatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int, 353 deadline time.Time) []Diff { 354 runes1a := runes1[:x] 355 runes2a := runes2[:y] 356 runes1b := runes1[x:] 357 runes2b := runes2[y:] 358 359 // Compute both diffs serially. 360 diffs := dmp.diffMainRunes(runes1a, runes2a, false, deadline) 361 diffsb := dmp.diffMainRunes(runes1b, runes2b, false, deadline) 362 363 return append(diffs, diffsb...) 364 } 365 366 // diffLinesToChars splits two texts into a list of strings, and educes the texts to a string of hashes where each Unicode character represents one line. 367 // It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes. 368 func (dmp *MatchPatch) diffLinesToChars(text1, text2 string) (string, string, []string) { 369 chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) 370 return chars1, chars2, lineArray 371 } 372 373 // diffLinesToRunes splits two texts into a list of runes. 374 func (dmp *MatchPatch) diffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) { 375 chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) 376 return []rune(chars1), []rune(chars2), lineArray 377 } 378 379 // diffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text. 380 func (dmp *MatchPatch) diffCharsToLines(diffs []Diff, lineArray []string) []Diff { 381 hydrated := make([]Diff, 0, len(diffs)) 382 for _, aDiff := range diffs { 383 chars := strings.Split(aDiff.Text, IndexSeparator) 384 text := make([]string, len(chars)) 385 386 for i, r := range chars { 387 i1, err := strconv.Atoi(r) 388 if err == nil { 389 text[i] = lineArray[i1] 390 } 391 } 392 393 aDiff.Text = strings.Join(text, "") 394 hydrated = append(hydrated, aDiff) 395 } 396 return hydrated 397 } 398 399 // DiffCommonPrefix determines the common prefix length of two strings. 400 func (dmp *MatchPatch) diffCommonPrefix(text1, text2 string) int { 401 // Unused in this code, but retained for interface compatibility. 402 return commonPrefixLength([]rune(text1), []rune(text2)) 403 } 404 405 // diffCommonSuffix determines the common suffix length of two strings. 406 func (dmp *MatchPatch) diffCommonSuffix(text1, text2 string) int { 407 // Unused in this code, but retained for interface compatibility. 408 return commonSuffixLength([]rune(text1), []rune(text2)) 409 } 410 411 // diffCommonOverlap determines if the suffix of one string is the prefix of another. 412 func (dmp *MatchPatch) diffCommonOverlap(text1 string, text2 string) int { 413 // Cache the text lengths to prevent multiple calls. 414 text1Length := len(text1) 415 text2Length := len(text2) 416 // Eliminate the null case. 417 if text1Length == 0 || text2Length == 0 { 418 return 0 419 } 420 // Truncate the longer string. 421 if text1Length > text2Length { 422 text1 = text1[text1Length-text2Length:] 423 } else if text1Length < text2Length { 424 text2 = text2[0:text1Length] 425 } 426 textLength := int(math.Min(float64(text1Length), float64(text2Length))) 427 // Quick check for the worst case. 428 if text1 == text2 { 429 return textLength 430 } 431 432 // Start by looking for a single character match and increase length until no match is found. Performance analysis: http://neil.fraser.name/news/2010/11/04/ 433 best := 0 434 length := 1 435 for { 436 pattern := text1[textLength-length:] 437 found := strings.Index(text2, pattern) 438 if found == -1 { 439 break 440 } 441 length += found 442 if found == 0 || text1[textLength-length:] == text2[0:length] { 443 best = length 444 length++ 445 } 446 } 447 448 return best 449 } 450 451 // DiffHalfMatch checks whether the two texts share a substring which is at least half the length of the longer text. This speedup can produce non-minimal diffs. 452 func (dmp *MatchPatch) DiffHalfMatch(text1, text2 string) []string { 453 // Unused in this code, but retained for interface compatibility. 454 runeSlices := dmp.diffHalfMatch([]rune(text1), []rune(text2)) 455 if runeSlices == nil { 456 return nil 457 } 458 459 result := make([]string, len(runeSlices)) 460 for i, r := range runeSlices { 461 result[i] = string(r) 462 } 463 return result 464 } 465 466 func (dmp *MatchPatch) diffHalfMatch(text1, text2 []rune) [][]rune { 467 if dmp.Timeout <= 0 { 468 // Don't risk returning a non-optimal diff if we have unlimited time. 469 return nil 470 } 471 472 var longtext, shorttext []rune 473 if len(text1) > len(text2) { 474 longtext = text1 475 shorttext = text2 476 } else { 477 longtext = text2 478 shorttext = text1 479 } 480 481 if len(longtext) < 4 || len(shorttext)*2 < len(longtext) { 482 return nil // Pointless. 483 } 484 485 // First check if the second quarter is the seed for a half-match. 486 hm1 := dmp.diffHalfMatchI(longtext, shorttext, int(float64(len(longtext)+3)/4)) 487 488 // Check again based on the third quarter. 489 hm2 := dmp.diffHalfMatchI(longtext, shorttext, int(float64(len(longtext)+1)/2)) 490 491 if hm1 == nil && hm2 == nil { 492 return nil 493 } 494 495 var hm [][]rune 496 if hm2 == nil { 497 hm = hm1 498 } else if hm1 == nil { 499 hm = hm2 500 } else { 501 // Both matched. Select the longest. 502 if len(hm1[4]) > len(hm2[4]) { 503 hm = hm1 504 } else { 505 hm = hm2 506 } 507 } 508 509 // A half-match was found, sort out the return data. 510 if len(text1) > len(text2) { 511 return hm 512 } 513 514 return [][]rune{hm[2], hm[3], hm[0], hm[1], hm[4]} 515 } 516 517 // diffHalfMatchI checks if a substring of shorttext exist within longtext such that the substring is at least half the length of longtext? 518 // Returns a slice containing the prefix of longtext, the suffix of longtext, the prefix of shorttext, the suffix of shorttext and the common middle, or null if there was no match. 519 func (dmp *MatchPatch) diffHalfMatchI(l, s []rune, i int) [][]rune { 520 var bestCommonA []rune 521 var bestCommonB []rune 522 var bestCommonLen int 523 var bestLongtextA []rune 524 var bestLongtextB []rune 525 var bestShorttextA []rune 526 var bestShorttextB []rune 527 528 // Start with a 1/4 length substring at position i as a seed. 529 seed := l[i : i+len(l)/4] 530 531 for j := runesIndexOf(s, seed, 0); j != -1; j = runesIndexOf(s, seed, j+1) { 532 prefixLength := commonPrefixLength(l[i:], s[j:]) 533 suffixLength := commonSuffixLength(l[:i], s[:j]) 534 535 if bestCommonLen < suffixLength+prefixLength { 536 bestCommonA = s[j-suffixLength : j] 537 bestCommonB = s[j : j+prefixLength] 538 bestCommonLen = len(bestCommonA) + len(bestCommonB) 539 bestLongtextA = l[:i-suffixLength] 540 bestLongtextB = l[i+prefixLength:] 541 bestShorttextA = s[:j-suffixLength] 542 bestShorttextB = s[j+prefixLength:] 543 } 544 } 545 546 if bestCommonLen*2 < len(l) { 547 return nil 548 } 549 550 return [][]rune{ 551 bestLongtextA, 552 bestLongtextB, 553 bestShorttextA, 554 bestShorttextB, 555 append(bestCommonA, bestCommonB...), 556 } 557 } 558 559 // diffCleanupSemantic reduces the number of edits by eliminating semantically trivial equalities. 560 func (dmp *MatchPatch) diffCleanupSemantic(diffs []Diff) []Diff { 561 changes := false 562 // Stack of indices where equalities are found. 563 equalities := make([]int, 0, len(diffs)) 564 565 var lastequality string 566 // Always equal to diffs[equalities[equalitiesLength - 1]][1] 567 var pointer int // Index of current position. 568 // Number of characters that changed prior to the equality. 569 var lengthInsertions1, lengthDeletions1 int 570 // Number of characters that changed after the equality. 571 var lengthInsertions2, lengthDeletions2 int 572 573 for pointer < len(diffs) { 574 if diffs[pointer].Type == DiffEqual { 575 // Equality found. 576 equalities = append(equalities, pointer) 577 lengthInsertions1 = lengthInsertions2 578 lengthDeletions1 = lengthDeletions2 579 lengthInsertions2 = 0 580 lengthDeletions2 = 0 581 lastequality = diffs[pointer].Text 582 } else { 583 // An insertion or deletion. 584 585 if diffs[pointer].Type == DiffInsert { 586 lengthInsertions2 += utf8.RuneCountInString(diffs[pointer].Text) 587 } else { 588 lengthDeletions2 += utf8.RuneCountInString(diffs[pointer].Text) 589 } 590 // Eliminate an equality that is smaller or equal to the edits on both sides of it. 591 difference1 := int(math.Max(float64(lengthInsertions1), float64(lengthDeletions1))) 592 difference2 := int(math.Max(float64(lengthInsertions2), float64(lengthDeletions2))) 593 if utf8.RuneCountInString(lastequality) > 0 && 594 (utf8.RuneCountInString(lastequality) <= difference1) && 595 (utf8.RuneCountInString(lastequality) <= difference2) { 596 // Duplicate record. 597 insPoint := equalities[len(equalities)-1] 598 diffs = splice(diffs, insPoint, 0, Diff{DiffDelete, lastequality}) 599 600 // Change second copy to insert. 601 diffs[insPoint+1].Type = DiffInsert 602 // Throw away the equality we just deleted. 603 equalities = equalities[:len(equalities)-1] 604 605 if len(equalities) > 0 { 606 equalities = equalities[:len(equalities)-1] 607 } 608 pointer = -1 609 if len(equalities) > 0 { 610 pointer = equalities[len(equalities)-1] 611 } 612 613 lengthInsertions1 = 0 // Reset the counters. 614 lengthDeletions1 = 0 615 lengthInsertions2 = 0 616 lengthDeletions2 = 0 617 lastequality = "" 618 changes = true 619 } 620 } 621 pointer++ 622 } 623 624 // Normalize the diff. 625 if changes { 626 diffs = dmp.diffCleanupMerge(diffs) 627 } 628 diffs = dmp.diffCleanupSemanticLossless(diffs) 629 // Find any overlaps between deletions and insertions. 630 // e.g: <del>abcxxx</del><ins>xxxdef</ins> 631 // -> <del>abc</del>xxx<ins>def</ins> 632 // e.g: <del>xxxabc</del><ins>defxxx</ins> 633 // -> <ins>def</ins>xxx<del>abc</del> 634 // Only extract an overlap if it is as big as the edit ahead or behind it. 635 pointer = 1 636 for pointer < len(diffs) { 637 if diffs[pointer-1].Type == DiffDelete && 638 diffs[pointer].Type == DiffInsert { 639 deletion := diffs[pointer-1].Text 640 insertion := diffs[pointer].Text 641 overlapLength1 := dmp.diffCommonOverlap(deletion, insertion) 642 overlapLength2 := dmp.diffCommonOverlap(insertion, deletion) 643 if overlapLength1 >= overlapLength2 { 644 if float64(overlapLength1) >= float64(utf8.RuneCountInString(deletion))/2 || 645 float64(overlapLength1) >= float64(utf8.RuneCountInString(insertion))/2 { 646 647 // Overlap found. Insert an equality and trim the surrounding edits. 648 diffs = splice(diffs, pointer, 0, Diff{DiffEqual, insertion[:overlapLength1]}) 649 diffs[pointer-1].Text = 650 deletion[0 : len(deletion)-overlapLength1] 651 diffs[pointer+1].Text = insertion[overlapLength1:] 652 pointer++ 653 } 654 } else { 655 if float64(overlapLength2) >= float64(utf8.RuneCountInString(deletion))/2 || 656 float64(overlapLength2) >= float64(utf8.RuneCountInString(insertion))/2 { 657 // Reverse overlap found. Insert an equality and swap and trim the surrounding edits. 658 overlap := Diff{DiffEqual, deletion[:overlapLength2]} 659 diffs = splice(diffs, pointer, 0, overlap) 660 diffs[pointer-1].Type = DiffInsert 661 diffs[pointer-1].Text = insertion[0 : len(insertion)-overlapLength2] 662 diffs[pointer+1].Type = DiffDelete 663 diffs[pointer+1].Text = deletion[overlapLength2:] 664 pointer++ 665 } 666 } 667 pointer++ 668 } 669 pointer++ 670 } 671 672 return diffs 673 } 674 675 // Define some regex patterns for matching boundaries. 676 var ( 677 nonAlphaNumericRegex = regexp.MustCompile(`[^a-zA-Z0-9]`) 678 whitespaceRegex = regexp.MustCompile(`\s`) 679 linebreakRegex = regexp.MustCompile(`[\r\n]`) 680 blanklineEndRegex = regexp.MustCompile(`\n\r?\n$`) 681 blanklineStartRegex = regexp.MustCompile(`^\r?\n\r?\n`) 682 ) 683 684 // diffCleanupSemanticScore computes a score representing whether the internal boundary falls on logical boundaries. 685 // Scores range from 6 (best) to 0 (worst). Closure, but does not reference any external variables. 686 func (dmp *MatchPatch) diffCleanupSemanticScore(one, two string) int { 687 if len(one) == 0 || len(two) == 0 { 688 // Edges are the best. 689 return 6 690 } 691 692 // Each port of this function behaves slightly differently due to subtle differences in each language's definition of things like 'whitespace'. Since this function's purpose is largely cosmetic, the choice has been made to use each language's native features rather than force total conformity. 693 rune1, _ := utf8.DecodeLastRuneInString(one) 694 rune2, _ := utf8.DecodeRuneInString(two) 695 char1 := string(rune1) 696 char2 := string(rune2) 697 698 nonAlphaNumeric1 := nonAlphaNumericRegex.MatchString(char1) 699 nonAlphaNumeric2 := nonAlphaNumericRegex.MatchString(char2) 700 whitespace1 := nonAlphaNumeric1 && whitespaceRegex.MatchString(char1) 701 whitespace2 := nonAlphaNumeric2 && whitespaceRegex.MatchString(char2) 702 lineBreak1 := whitespace1 && linebreakRegex.MatchString(char1) 703 lineBreak2 := whitespace2 && linebreakRegex.MatchString(char2) 704 blankLine1 := lineBreak1 && blanklineEndRegex.MatchString(one) 705 blankLine2 := lineBreak2 && blanklineEndRegex.MatchString(two) 706 707 if blankLine1 || blankLine2 { 708 // Five points for blank lines. 709 return 5 710 } else if lineBreak1 || lineBreak2 { 711 // Four points for line breaks. 712 return 4 713 } else if nonAlphaNumeric1 && !whitespace1 && whitespace2 { 714 // Three points for end of sentences. 715 return 3 716 } else if whitespace1 || whitespace2 { 717 // Two points for whitespace. 718 return 2 719 } else if nonAlphaNumeric1 || nonAlphaNumeric2 { 720 // One point for non-alphanumeric. 721 return 1 722 } 723 return 0 724 } 725 726 // diffCleanupSemanticLossless looks for single edits surrounded on both sides by equalities which can be shifted sideways to align the edit to a word boundary. 727 // E.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. 728 func (dmp *MatchPatch) diffCleanupSemanticLossless(diffs []Diff) []Diff { 729 pointer := 1 730 731 // Intentionally ignore the first and last element (don't need checking). 732 for pointer < len(diffs)-1 { 733 if diffs[pointer-1].Type == DiffEqual && 734 diffs[pointer+1].Type == DiffEqual { 735 736 // This is a single edit surrounded by equalities. 737 equality1 := diffs[pointer-1].Text 738 edit := diffs[pointer].Text 739 equality2 := diffs[pointer+1].Text 740 741 // First, shift the edit as far left as possible. 742 commonOffset := dmp.diffCommonSuffix(equality1, edit) 743 if commonOffset > 0 { 744 commonString := edit[len(edit)-commonOffset:] 745 equality1 = equality1[0 : len(equality1)-commonOffset] 746 edit = commonString + edit[:len(edit)-commonOffset] 747 equality2 = commonString + equality2 748 } 749 750 // Second, step character by character right, looking for the best fit. 751 bestEquality1 := equality1 752 bestEdit := edit 753 bestEquality2 := equality2 754 bestScore := dmp.diffCleanupSemanticScore(equality1, edit) + 755 dmp.diffCleanupSemanticScore(edit, equality2) 756 757 for len(edit) != 0 && len(equality2) != 0 { 758 _, sz := utf8.DecodeRuneInString(edit) 759 if len(equality2) < sz || edit[:sz] != equality2[:sz] { 760 break 761 } 762 equality1 += edit[:sz] 763 edit = edit[sz:] + equality2[:sz] 764 equality2 = equality2[sz:] 765 score := dmp.diffCleanupSemanticScore(equality1, edit) + 766 dmp.diffCleanupSemanticScore(edit, equality2) 767 // The >= encourages trailing rather than leading whitespace on edits. 768 if score >= bestScore { 769 bestScore = score 770 bestEquality1 = equality1 771 bestEdit = edit 772 bestEquality2 = equality2 773 } 774 } 775 776 if diffs[pointer-1].Text != bestEquality1 { 777 // We have an improvement, save it back to the diff. 778 if len(bestEquality1) != 0 { 779 diffs[pointer-1].Text = bestEquality1 780 } else { 781 diffs = splice(diffs, pointer-1, 1) 782 pointer-- 783 } 784 785 diffs[pointer].Text = bestEdit 786 if len(bestEquality2) != 0 { 787 diffs[pointer+1].Text = bestEquality2 788 } else { 789 diffs = append(diffs[:pointer+1], diffs[pointer+2:]...) 790 pointer-- 791 } 792 } 793 } 794 pointer++ 795 } 796 797 return diffs 798 } 799 800 // diffCleanupEfficiency reduces the number of edits by eliminating operationally trivial equalities. 801 func (dmp *MatchPatch) diffCleanupEfficiency(diffs []Diff) []Diff { 802 changes := false 803 // Stack of indices where equalities are found. 804 type equality struct { 805 data int 806 next *equality 807 } 808 var equalities *equality 809 // Always equal to equalities[equalitiesLength-1][1] 810 lastequality := "" 811 pointer := 0 // Index of current position. 812 // Is there an insertion operation before the last equality. 813 preIns := false 814 // Is there a deletion operation before the last equality. 815 preDel := false 816 // Is there an insertion operation after the last equality. 817 postIns := false 818 // Is there a deletion operation after the last equality. 819 postDel := false 820 for pointer < len(diffs) { 821 if diffs[pointer].Type == DiffEqual { // Equality found. 822 if len(diffs[pointer].Text) < dmp.EditCost && 823 (postIns || postDel) { 824 // Candidate found. 825 equalities = &equality{ 826 data: pointer, 827 next: equalities, 828 } 829 preIns = postIns 830 preDel = postDel 831 lastequality = diffs[pointer].Text 832 } else { 833 // Not a candidate, and can never become one. 834 equalities = nil 835 lastequality = "" 836 } 837 postIns = false 838 postDel = false 839 } else { // An insertion or deletion. 840 if diffs[pointer].Type == DiffDelete { 841 postDel = true 842 } else { 843 postIns = true 844 } 845 846 // Five types to be split: 847 // <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del> 848 // <ins>A</ins>X<ins>C</ins><del>D</del> 849 // <ins>A</ins><del>B</del>X<ins>C</ins> 850 // <ins>A</del>X<ins>C</ins><del>D</del> 851 // <ins>A</ins><del>B</del>X<del>C</del> 852 var sumPres int 853 if preIns { 854 sumPres++ 855 } 856 if preDel { 857 sumPres++ 858 } 859 if postIns { 860 sumPres++ 861 } 862 if postDel { 863 sumPres++ 864 } 865 if len(lastequality) > 0 && 866 ((preIns && preDel && postIns && postDel) || 867 ((len(lastequality) < dmp.EditCost/2) && sumPres == 3)) { 868 869 insPoint := equalities.data 870 871 // Duplicate record. 872 diffs = splice(diffs, insPoint, 0, Diff{DiffDelete, lastequality}) 873 874 // Change second copy to insert. 875 diffs[insPoint+1].Type = DiffInsert 876 // Throw away the equality we just deleted. 877 equalities = equalities.next 878 lastequality = "" 879 880 if preIns && preDel { 881 // No changes made which could affect previous entry, keep going. 882 postIns = true 883 postDel = true 884 equalities = nil 885 } else { 886 if equalities != nil { 887 equalities = equalities.next 888 } 889 if equalities != nil { 890 pointer = equalities.data 891 } else { 892 pointer = -1 893 } 894 postIns = false 895 postDel = false 896 } 897 changes = true 898 } 899 } 900 pointer++ 901 } 902 903 if changes { 904 diffs = dmp.diffCleanupMerge(diffs) 905 } 906 907 return diffs 908 } 909 910 // diffCleanupMerge reorders and merges like edit sections. Merge equalities. 911 // Any edit section can move as long as it doesn't cross an equality. 912 func (dmp *MatchPatch) diffCleanupMerge(diffs []Diff) []Diff { 913 // Add a dummy entry at the end. 914 diffs = append(diffs, Diff{DiffEqual, ""}) 915 pointer := 0 916 countDelete := 0 917 countInsert := 0 918 commonlength := 0 919 textDelete := []rune(nil) 920 textInsert := []rune(nil) 921 922 for pointer < len(diffs) { 923 switch diffs[pointer].Type { 924 case DiffInsert: 925 countInsert++ 926 textInsert = append(textInsert, []rune(diffs[pointer].Text)...) 927 pointer++ 928 break 929 case DiffDelete: 930 countDelete++ 931 textDelete = append(textDelete, []rune(diffs[pointer].Text)...) 932 pointer++ 933 break 934 case DiffEqual: 935 // Upon reaching an equality, check for prior redundancies. 936 if countDelete+countInsert > 1 { 937 if countDelete != 0 && countInsert != 0 { 938 // Factor out any common prefixies. 939 commonlength = commonPrefixLength(textInsert, textDelete) 940 if commonlength != 0 { 941 x := pointer - countDelete - countInsert 942 if x > 0 && diffs[x-1].Type == DiffEqual { 943 diffs[x-1].Text += string(textInsert[:commonlength]) 944 } else { 945 diffs = append([]Diff{{DiffEqual, string(textInsert[:commonlength])}}, diffs...) 946 pointer++ 947 } 948 textInsert = textInsert[commonlength:] 949 textDelete = textDelete[commonlength:] 950 } 951 // Factor out any common suffixies. 952 commonlength = commonSuffixLength(textInsert, textDelete) 953 if commonlength != 0 { 954 insertIndex := len(textInsert) - commonlength 955 deleteIndex := len(textDelete) - commonlength 956 diffs[pointer].Text = string(textInsert[insertIndex:]) + diffs[pointer].Text 957 textInsert = textInsert[:insertIndex] 958 textDelete = textDelete[:deleteIndex] 959 } 960 } 961 // Delete the offending records and add the merged ones. 962 if countDelete == 0 { 963 diffs = splice(diffs, pointer-countInsert, 964 countDelete+countInsert, 965 Diff{DiffInsert, string(textInsert)}) 966 } else if countInsert == 0 { 967 diffs = splice(diffs, pointer-countDelete, 968 countDelete+countInsert, 969 Diff{DiffDelete, string(textDelete)}) 970 } else { 971 diffs = splice(diffs, pointer-countDelete-countInsert, 972 countDelete+countInsert, 973 Diff{DiffDelete, string(textDelete)}, 974 Diff{DiffInsert, string(textInsert)}) 975 } 976 977 pointer = pointer - countDelete - countInsert + 1 978 if countDelete != 0 { 979 pointer++ 980 } 981 if countInsert != 0 { 982 pointer++ 983 } 984 } else if pointer != 0 && diffs[pointer-1].Type == DiffEqual { 985 // Merge this equality with the previous one. 986 diffs[pointer-1].Text += diffs[pointer].Text 987 diffs = append(diffs[:pointer], diffs[pointer+1:]...) 988 } else { 989 pointer++ 990 } 991 countInsert = 0 992 countDelete = 0 993 textDelete = nil 994 textInsert = nil 995 break 996 } 997 } 998 999 if len(diffs[len(diffs)-1].Text) == 0 { 1000 diffs = diffs[0 : len(diffs)-1] // Remove the dummy entry at the end. 1001 } 1002 1003 // Second pass: look for single edits surrounded on both sides by equalities which can be shifted sideways to eliminate an equality. E.g: A<ins>BA</ins>C -> <ins>AB</ins>AC 1004 changes := false 1005 pointer = 1 1006 // Intentionally ignore the first and last element (don't need checking). 1007 for pointer < (len(diffs) - 1) { 1008 if diffs[pointer-1].Type == DiffEqual && 1009 diffs[pointer+1].Type == DiffEqual { 1010 // This is a single edit surrounded by equalities. 1011 if strings.HasSuffix(diffs[pointer].Text, diffs[pointer-1].Text) { 1012 // Shift the edit over the previous equality. 1013 diffs[pointer].Text = diffs[pointer-1].Text + 1014 diffs[pointer].Text[:len(diffs[pointer].Text)-len(diffs[pointer-1].Text)] 1015 diffs[pointer+1].Text = diffs[pointer-1].Text + diffs[pointer+1].Text 1016 diffs = splice(diffs, pointer-1, 1) 1017 changes = true 1018 } else if strings.HasPrefix(diffs[pointer].Text, diffs[pointer+1].Text) { 1019 // Shift the edit over the next equality. 1020 diffs[pointer-1].Text += diffs[pointer+1].Text 1021 diffs[pointer].Text = 1022 diffs[pointer].Text[len(diffs[pointer+1].Text):] + diffs[pointer+1].Text 1023 diffs = splice(diffs, pointer+1, 1) 1024 changes = true 1025 } 1026 } 1027 pointer++ 1028 } 1029 1030 // If shifts were made, the diff needs reordering and another shift sweep. 1031 if changes { 1032 diffs = dmp.diffCleanupMerge(diffs) 1033 } 1034 1035 return diffs 1036 } 1037 1038 // diffXIndex returns the equivalent location in s2. 1039 func (dmp *MatchPatch) diffXIndex(diffs []Diff, loc int) int { 1040 chars1 := 0 1041 chars2 := 0 1042 lastChars1 := 0 1043 lastChars2 := 0 1044 lastDiff := Diff{} 1045 for i := 0; i < len(diffs); i++ { 1046 aDiff := diffs[i] 1047 if aDiff.Type != DiffInsert { 1048 // Equality or deletion. 1049 chars1 += len(aDiff.Text) 1050 } 1051 if aDiff.Type != DiffDelete { 1052 // Equality or insertion. 1053 chars2 += len(aDiff.Text) 1054 } 1055 if chars1 > loc { 1056 // Overshot the location. 1057 lastDiff = aDiff 1058 break 1059 } 1060 lastChars1 = chars1 1061 lastChars2 = chars2 1062 } 1063 if lastDiff.Type == DiffDelete { 1064 // The location was deleted. 1065 return lastChars2 1066 } 1067 // Add the remaining character length. 1068 return lastChars2 + (loc - lastChars1) 1069 } 1070 1071 // diffLinesToStrings splits two texts into a list of strings. Each string represents one line. 1072 func (dmp *MatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) { 1073 // '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. 1074 lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' 1075 1076 //Each string has the index of lineArray which it points to 1077 strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray) 1078 strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray) 1079 1080 return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray 1081 } 1082 1083 // diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string. 1084 func (dmp *MatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []uint32 { 1085 // Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. 1086 lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4 1087 lineStart := 0 1088 lineEnd := -1 1089 strs := []uint32{} 1090 1091 for lineEnd < len(text)-1 { 1092 lineEnd = indexOf(text, "\n", lineStart) 1093 1094 if lineEnd == -1 { 1095 lineEnd = len(text) - 1 1096 } 1097 1098 line := text[lineStart : lineEnd+1] 1099 lineStart = lineEnd + 1 1100 lineValue, ok := lineHash[line] 1101 1102 if ok { 1103 strs = append(strs, uint32(lineValue)) 1104 } else { 1105 *lineArray = append(*lineArray, line) 1106 lineHash[line] = len(*lineArray) - 1 1107 strs = append(strs, uint32(len(*lineArray)-1)) 1108 } 1109 } 1110 1111 return strs 1112 } 1113 1114 // runesIndex is the equivalent of strings.Index for rune slices. 1115 func runesIndex(r1, r2 []rune) int { 1116 last := len(r1) - len(r2) 1117 for i := 0; i <= last; i++ { 1118 if runesEqual(r1[i:i+len(r2)], r2) { 1119 return i 1120 } 1121 } 1122 return -1 1123 } 1124 1125 // runesIndexOf returns the index of pattern in target, starting at target[i]. 1126 func runesIndexOf(target, pattern []rune, i int) int { 1127 if i > len(target)-1 { 1128 return -1 1129 } 1130 if i <= 0 { 1131 return runesIndex(target, pattern) 1132 } 1133 ind := runesIndex(target[i:], pattern) 1134 if ind == -1 { 1135 return -1 1136 } 1137 return ind + i 1138 } 1139 1140 func runesEqual(r1, r2 []rune) bool { 1141 if len(r1) != len(r2) { 1142 return false 1143 } 1144 for i, c := range r1 { 1145 if c != r2[i] { 1146 return false 1147 } 1148 } 1149 return true 1150 } 1151 1152 // indexOf returns the first index of pattern in str, starting at str[i]. 1153 func indexOf(str string, pattern string, i int) int { 1154 if i > len(str)-1 { 1155 return -1 1156 } 1157 if i <= 0 { 1158 return strings.Index(str, pattern) 1159 } 1160 ind := strings.Index(str[i:], pattern) 1161 if ind == -1 { 1162 return -1 1163 } 1164 return ind + i 1165 } 1166 1167 // lastIndexOf returns the last index of pattern in str, starting at str[i]. 1168 func lastIndexOf(str string, pattern string, i int) int { 1169 if i < 0 { 1170 return -1 1171 } 1172 if i >= len(str) { 1173 return strings.LastIndex(str, pattern) 1174 } 1175 _, size := utf8.DecodeRuneInString(str[i:]) 1176 return strings.LastIndex(str[:i+size], pattern) 1177 } 1178 1179 func intArrayToString(ns []uint32) string { 1180 if len(ns) == 0 { 1181 return "" 1182 } 1183 1184 indexSeparator := IndexSeparator[0] 1185 1186 // Appr. 3 chars per num plus the comma. 1187 b := []byte{} 1188 for _, n := range ns { 1189 b = strconv.AppendInt(b, int64(n), 10) 1190 b = append(b, indexSeparator) 1191 } 1192 b = b[:len(b)-1] 1193 return string(b) 1194 } 1195 1196 // unescaper unescapes selected chars for compatibility with JavaScript's encodeURI. 1197 // In speed critical applications this could be dropped since the receiving application will certainly decode these fine. Note that this function is case-sensitive. Thus "%3F" would not be unescaped. But this is ok because it is only called with the output of HttpUtility.UrlEncode which returns lowercase hex. Example: "%3f" -> "?", "%24" -> "$", etc. 1198 var unescaper = strings.NewReplacer( 1199 "%21", "!", "%7E", "~", "%27", "'", 1200 "%28", "(", "%29", ")", "%3B", ";", 1201 "%2F", "/", "%3F", "?", "%3A", ":", 1202 "%40", "@", "%26", "&", "%3D", "=", 1203 "%2B", "+", "%24", "$", "%2C", ",", "%23", "#", "%2A", "*") 1204 1205 func min(x, y int) int { 1206 if x < y { 1207 return x 1208 } 1209 return y 1210 } 1211 1212 func max(x, y int) int { 1213 if x > y { 1214 return x 1215 } 1216 return y 1217 } 1218 1219 // splice removes amount elements from slice at index index, replacing them with elements. 1220 func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff { 1221 if len(elements) == amount { 1222 // Easy case: overwrite the relevant items. 1223 copy(slice[index:], elements) 1224 return slice 1225 } 1226 if len(elements) < amount { 1227 // Fewer new items than old. 1228 // Copy in the new items. 1229 copy(slice[index:], elements) 1230 // Shift the remaining items left. 1231 copy(slice[index+len(elements):], slice[index+amount:]) 1232 // Calculate the new end of the slice. 1233 end := len(slice) - amount + len(elements) 1234 // Zero stranded elements at end so that they can be garbage collected. 1235 tail := slice[end:] 1236 for i := range tail { 1237 tail[i] = Diff{} 1238 } 1239 return slice[:end] 1240 } 1241 // More new items than old. 1242 // Make room in slice for new elements. 1243 // There's probably an even more efficient way to do this, 1244 // but this is simple and clear. 1245 need := len(slice) - amount + len(elements) 1246 for len(slice) < need { 1247 slice = append(slice, Diff{}) 1248 } 1249 // Shift slice elements right to make room for new elements. 1250 copy(slice[index+len(elements):], slice[index+amount:]) 1251 // Copy in new elements. 1252 copy(slice[index:], elements) 1253 return slice 1254 } 1255 1256 // commonPrefixLength returns the length of the common prefix of two rune slices. 1257 func commonPrefixLength(text1, text2 []rune) int { 1258 // Linear search. See comment in commonSuffixLength. 1259 n := 0 1260 for ; n < len(text1) && n < len(text2); n++ { 1261 if text1[n] != text2[n] { 1262 return n 1263 } 1264 } 1265 return n 1266 } 1267 1268 // commonSuffixLength returns the length of the common suffix of two rune slices. 1269 func commonSuffixLength(text1, text2 []rune) int { 1270 // Use linear search rather than the binary search discussed at https://neil.fraser.name/news/2007/10/09/. 1271 // See discussion at https://github.com/sergi/go-diff/issues/54. 1272 i1 := len(text1) 1273 i2 := len(text2) 1274 for n := 0; ; n++ { 1275 i1-- 1276 i2-- 1277 if i1 < 0 || i2 < 0 || text1[i1] != text2[i2] { 1278 return n 1279 } 1280 } 1281 }