// Source: go.charczuk.com@v0.0.0-20240327042549-bc490516bd1a/sdk/diff/match_patch.go

/*

Copyright (c) 2023 - Present. Will Charczuk. All rights reserved.
Use of this source code is governed by a MIT license that can be found in the LICENSE file at the root of the repository.

*/

package diff

import (
	"math"
	"regexp"
	"strconv"
	"strings"
	"time"
	"unicode/utf8"
)

// New creates a new MatchPatch object with default parameters.
//
// The defaults are a one second diff timeout, an edit cost of 4, match and
// patch-delete thresholds of 0.5, a match distance of 1000, a patch margin
// of 4 and 32 match bits.
func New() *MatchPatch {
	// Defaults.
	return &MatchPatch{
		Timeout:              time.Second,
		EditCost:             4,
		MatchThreshold:       0.5,
		MatchDistance:        1000,
		PatchDeleteThreshold: 0.5,
		PatchMargin:          4,
		MatchMaxBits:         32,
	}
}

// MatchPatch holds the configuration for diff-match-patch operations.
type MatchPatch struct {
	// Maximum time to spend computing a diff before giving up. A value of zero
	// or less disables the deadline (and also disables the half-match speedup).
	Timeout time.Duration
	// Cost of an empty edit operation in terms of edit characters.
	EditCost int
	// How far to search for a match (0 = exact location, 1000+ = broad match). A match this many characters away from the expected location will add 1.0 to the score (0.0 is a perfect match).
	MatchDistance int
	// When deleting a large block of text (over ~64 characters), how close do the contents have to be to match the expected contents. (0.0 = perfection, 1.0 = very loose). Note that MatchThreshold controls how closely the end points of a delete need to match.
	PatchDeleteThreshold float64
	// Chunk size for context length.
	PatchMargin int
	// The number of bits in an int.
	MatchMaxBits int
	// At what point is no match declared (0.0 = perfection, 1.0 = very loose).
	MatchThreshold float64
}

// Diff finds the differences between two texts.
//
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
//
// `checklines` indicates if we should do a line level diff, or treat the text as an atomic unit.
56 func (dmp *MatchPatch) Diff(text1, text2 string, checklines bool) []Diff { 57 return dmp.DiffRunes([]rune(text1), []rune(text2), checklines) 58 } 59 60 // DiffRunes finds the differences between two rune sequences. 61 // 62 // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. 63 // 64 // `checklines` indicates if we should do a line level diff, or treat the text as an atomic unit. 65 func (dmp *MatchPatch) DiffRunes(text1, text2 []rune, checklines bool) []Diff { 66 var deadline time.Time 67 if dmp.Timeout > 0 { 68 deadline = time.Now().Add(dmp.Timeout) 69 } 70 return dmp.diffMainRunes(text1, text2, checklines, deadline) 71 } 72 73 func (dmp *MatchPatch) diffMainRunes(text1, text2 []rune, checklines bool, deadline time.Time) []Diff { 74 if runesEqual(text1, text2) { 75 var diffs []Diff 76 if len(text1) > 0 { 77 diffs = append(diffs, Diff{DiffEqual, string(text1)}) 78 } 79 return diffs 80 } 81 82 // Trim off common prefix (speedup). 83 commonlength := commonPrefixLength(text1, text2) 84 commonprefix := text1[:commonlength] 85 text1 = text1[commonlength:] 86 text2 = text2[commonlength:] 87 88 // Trim off common suffix (speedup). 89 commonlength = commonSuffixLength(text1, text2) 90 commonsuffix := text1[len(text1)-commonlength:] 91 text1 = text1[:len(text1)-commonlength] 92 text2 = text2[:len(text2)-commonlength] 93 94 // Compute the diff on the middle block. 95 diffs := dmp.diffCompute(text1, text2, checklines, deadline) 96 97 // Restore the prefix and suffix. 98 if len(commonprefix) != 0 { 99 diffs = append([]Diff{{DiffEqual, string(commonprefix)}}, diffs...) 100 } 101 if len(commonsuffix) != 0 { 102 diffs = append(diffs, Diff{DiffEqual, string(commonsuffix)}) 103 } 104 105 return dmp.diffCleanupMerge(diffs) 106 } 107 108 // diffCompute finds the differences between two rune slices. Assumes that the texts do not have any common prefix or suffix. 
func (dmp *MatchPatch) diffCompute(text1, text2 []rune, checklines bool, deadline time.Time) []Diff {
	diffs := []Diff{}
	if len(text1) == 0 {
		// Just add some text (speedup).
		return append(diffs, Diff{DiffInsert, string(text2)})
	} else if len(text2) == 0 {
		// Just delete some text (speedup).
		return append(diffs, Diff{DiffDelete, string(text1)})
	}

	// On equal lengths text2 is treated as the long text.
	var longtext, shorttext []rune
	if len(text1) > len(text2) {
		longtext = text1
		shorttext = text2
	} else {
		longtext = text2
		shorttext = text1
	}

	if i := runesIndex(longtext, shorttext); i != -1 {
		op := DiffInsert
		// Swap insertions for deletions if diff is reversed.
		if len(text1) > len(text2) {
			op = DiffDelete
		}
		// Shorter text is inside the longer text (speedup).
		return []Diff{
			{op, string(longtext[:i])},
			{DiffEqual, string(shorttext)},
			{op, string(longtext[i+len(shorttext):])},
		}
	} else if len(shorttext) == 1 {
		// Single character string.
		// After the previous speedup, the character can't be an equality.
		return []Diff{
			{DiffDelete, string(text1)},
			{DiffInsert, string(text2)},
		}
		// Check to see if the problem can be split in two.
	} else if hm := dmp.diffHalfMatch(text1, text2); hm != nil {
		// A half-match was found, sort out the return data.
		text1A := hm[0]
		text1B := hm[1]
		text2A := hm[2]
		text2B := hm[3]
		midCommon := hm[4]
		// Send both pairs off for separate processing.
		diffsA := dmp.diffMainRunes(text1A, text2A, checklines, deadline)
		diffsB := dmp.diffMainRunes(text1B, text2B, checklines, deadline)
		// Merge the results.
		diffs := diffsA
		diffs = append(diffs, Diff{DiffEqual, string(midCommon)})
		diffs = append(diffs, diffsB...)
		return diffs
	} else if checklines && len(text1) > 100 && len(text2) > 100 {
		// Line mode is only attempted when both texts exceed 100 runes.
		return dmp.diffLineMode(text1, text2, deadline)
	}
	return dmp.diffBisectRunes(text1, text2, deadline)
}

// diffLineMode does a quick line-level diff on both []runes, then rediffs the parts for greater accuracy. This speedup can produce non-minimal diffs.
func (dmp *MatchPatch) diffLineMode(text1, text2 []rune, deadline time.Time) []Diff {
	// Scan the text on a line-by-line basis first.
	text1, text2, linearray := dmp.diffLinesToRunes(string(text1), string(text2))

	// checklines is false here so the encoded texts are never line-encoded a second time.
	diffs := dmp.diffMainRunes(text1, text2, false, deadline)

	// Convert the diff back to original text.
	diffs = dmp.diffCharsToLines(diffs, linearray)
	// Eliminate freak matches (e.g. blank lines)
	diffs = dmp.diffCleanupSemantic(diffs)

	// Rediff any replacement blocks, this time character-by-character.
	// Add a dummy entry at the end.
	diffs = append(diffs, Diff{DiffEqual, ""})

	pointer := 0
	countDelete := 0
	countInsert := 0

	// NOTE: Rune slices are slower than using strings in this case.
	textDelete := ""
	textInsert := ""

	for pointer < len(diffs) {
		switch diffs[pointer].Type {
		case DiffInsert:
			countInsert++
			textInsert += diffs[pointer].Text
		case DiffDelete:
			countDelete++
			textDelete += diffs[pointer].Text
		case DiffEqual:
			// Upon reaching an equality, check for prior redundancies.
			if countDelete >= 1 && countInsert >= 1 {
				// Delete the offending records and add the merged ones.
				diffs = splice(diffs, pointer-countDelete-countInsert,
					countDelete+countInsert)

				pointer = pointer - countDelete - countInsert
				a := dmp.diffMainRunes([]rune(textDelete), []rune(textInsert), false, deadline)
				// Inserting back to front at the same index keeps the entries of a in order.
				for j := len(a) - 1; j >= 0; j-- {
					diffs = splice(diffs, pointer, 0, a[j])
				}
				pointer = pointer + len(a)
			}

			countInsert = 0
			countDelete = 0
			textDelete = ""
			textInsert = ""
		}
		pointer++
	}

	if len(diffs) > 0 {
		return diffs[:len(diffs)-1] // Remove the dummy entry at the end.
	}
	return diffs
}

// diffBisect finds the 'middle snake' of a diff, splits the problem in two and returns the recursively constructed diff.
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
func (dmp *MatchPatch) diffBisect(text1, text2 string, deadline time.Time) []Diff {
	// Unused in this code, but retained for interface compatibility.
	return dmp.diffBisectRunes([]rune(text1), []rune(text2), deadline)
}

// diffBisectRunes finds the 'middle snake' of a diff, splits the problem in two and returns the recursively constructed diff.
// See Myers's 1986 paper: An O(ND) Difference Algorithm and Its Variations.
//
// If the deadline passes, or no overlap between the forward and reverse paths
// is found, the result is a single deletion of runes1 followed by a single
// insertion of runes2.
//
// NOTE(review): v1 and v2 hold 2*maxD entries and index vOffset+1 is written
// unconditionally, so inputs whose combined length is below 3 index out of
// range. diffCompute only gets here with both texts at least 2 runes long;
// confirm before calling this directly (e.g. through diffBisect).
func (dmp *MatchPatch) diffBisectRunes(runes1, runes2 []rune, deadline time.Time) []Diff {
	// Cache the text lengths to prevent multiple calls.
	runes1Len, runes2Len := len(runes1), len(runes2)

	maxD := (runes1Len + runes2Len + 1) / 2
	vOffset := maxD
	vLength := 2 * maxD

	// v1 and v2 record the furthest x reached on each diagonal k (stored at
	// vOffset+k) by the forward and reverse paths; -1 marks "not visited".
	v1 := make([]int, vLength)
	v2 := make([]int, vLength)
	for i := range v1 {
		v1[i] = -1
		v2[i] = -1
	}
	v1[vOffset+1] = 0
	v2[vOffset+1] = 0

	delta := runes1Len - runes2Len
	// If the total number of characters is odd, then the front path will collide with the reverse path.
	front := (delta%2 != 0)
	// Offsets for start and end of k loop. Prevents mapping of space beyond the grid.
	k1start := 0
	k1end := 0
	k2start := 0
	k2end := 0
	for d := 0; d < maxD; d++ {
		// Bail out if deadline is reached.
		// The clock is only consulted every 16th iteration.
		if !deadline.IsZero() && d%16 == 0 && time.Now().After(deadline) {
			break
		}

		// Walk the front path one step.
		for k1 := -d + k1start; k1 <= d-k1end; k1 += 2 {
			k1Offset := vOffset + k1
			var x1 int

			if k1 == -d || (k1 != d && v1[k1Offset-1] < v1[k1Offset+1]) {
				x1 = v1[k1Offset+1]
			} else {
				x1 = v1[k1Offset-1] + 1
			}

			y1 := x1 - k1
			// Follow the diagonal while the runes match.
			for x1 < runes1Len && y1 < runes2Len {
				if runes1[x1] != runes2[y1] {
					break
				}
				x1++
				y1++
			}
			v1[k1Offset] = x1
			if x1 > runes1Len {
				// Ran off the right of the graph.
				k1end += 2
			} else if y1 > runes2Len {
				// Ran off the bottom of the graph.
				k1start += 2
			} else if front {
				k2Offset := vOffset + delta - k1
				if k2Offset >= 0 && k2Offset < vLength && v2[k2Offset] != -1 {
					// Mirror x2 onto top-left coordinate system.
					x2 := runes1Len - v2[k2Offset]
					if x1 >= x2 {
						// Overlap detected.
						return dmp.diffBisectSplit(runes1, runes2, x1, y1, deadline)
					}
				}
			}
		}
		// Walk the reverse path one step.
		for k2 := -d + k2start; k2 <= d-k2end; k2 += 2 {
			k2Offset := vOffset + k2
			var x2 int
			if k2 == -d || (k2 != d && v2[k2Offset-1] < v2[k2Offset+1]) {
				x2 = v2[k2Offset+1]
			} else {
				x2 = v2[k2Offset-1] + 1
			}
			var y2 = x2 - k2
			// Follow the diagonal backwards from the ends of both texts.
			for x2 < runes1Len && y2 < runes2Len {
				if runes1[runes1Len-x2-1] != runes2[runes2Len-y2-1] {
					break
				}
				x2++
				y2++
			}
			v2[k2Offset] = x2
			if x2 > runes1Len {
				// Ran off the left of the graph.
				k2end += 2
			} else if y2 > runes2Len {
				// Ran off the top of the graph.
				k2start += 2
			} else if !front {
				k1Offset := vOffset + delta - k2
				if k1Offset >= 0 && k1Offset < vLength && v1[k1Offset] != -1 {
					x1 := v1[k1Offset]
					y1 := vOffset + x1 - k1Offset
					// Mirror x2 onto top-left coordinate system.
					x2 = runes1Len - x2
					if x1 >= x2 {
						// Overlap detected.
						return dmp.diffBisectSplit(runes1, runes2, x1, y1, deadline)
					}
				}
			}
		}
	}
	// Diff took too long and hit the deadline or number of diffs equals number of characters, no commonality at all.
	return []Diff{
		{DiffDelete, string(runes1)},
		{DiffInsert, string(runes2)},
	}
}

// diffBisectSplit cuts runes1 at x and runes2 at y, diffs the two leading
// parts and the two trailing parts independently, and concatenates the
// results.
func (dmp *MatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int,
	deadline time.Time) []Diff {
	runes1a := runes1[:x]
	runes2a := runes2[:y]
	runes1b := runes1[x:]
	runes2b := runes2[y:]

	// Compute both diffs serially.
	diffs := dmp.diffMainRunes(runes1a, runes2a, false, deadline)
	diffsb := dmp.diffMainRunes(runes1b, runes2b, false, deadline)

	return append(diffs, diffsb...)
}

// diffLinesToChars splits two texts into a list of strings, and reduces the texts to encoded strings in which each line is represented by its index in the returned line list.
// It is a thin wrapper that returns the result of diffLinesToStrings unchanged.
func (dmp *MatchPatch) diffLinesToChars(text1, text2 string) (string, string, []string) {
	chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
	return chars1, chars2, lineArray
}

// diffLinesToRunes encodes two texts with diffLinesToStrings and returns the encoded texts as rune slices, together with the list of unique lines.
//
// NOTE(review): the encoded strings are converted rune by rune, while
// diffCharsToLines decodes separator-delimited decimal indices. A rune-level
// diff can therefore cut through a multi-digit index and hydrate the wrong
// line — confirm against the encoding produced by intArrayToString.
func (dmp *MatchPatch) diffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
	chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
	return []rune(chars1), []rune(chars2), lineArray
}

// diffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
383 func (dmp *MatchPatch) diffCharsToLines(diffs []Diff, lineArray []string) []Diff { 384 hydrated := make([]Diff, 0, len(diffs)) 385 for _, aDiff := range diffs { 386 chars := strings.Split(aDiff.Text, IndexSeparator) 387 text := make([]string, len(chars)) 388 389 for i, r := range chars { 390 i1, err := strconv.Atoi(r) 391 if err == nil { 392 text[i] = lineArray[i1] 393 } 394 } 395 396 aDiff.Text = strings.Join(text, "") 397 hydrated = append(hydrated, aDiff) 398 } 399 return hydrated 400 } 401 402 // DiffCommonPrefix determines the common prefix length of two strings. 403 func (dmp *MatchPatch) diffCommonPrefix(text1, text2 string) int { 404 // Unused in this code, but retained for interface compatibility. 405 return commonPrefixLength([]rune(text1), []rune(text2)) 406 } 407 408 // diffCommonSuffix determines the common suffix length of two strings. 409 func (dmp *MatchPatch) diffCommonSuffix(text1, text2 string) int { 410 // Unused in this code, but retained for interface compatibility. 411 return commonSuffixLength([]rune(text1), []rune(text2)) 412 } 413 414 // diffCommonOverlap determines if the suffix of one string is the prefix of another. 415 func (dmp *MatchPatch) diffCommonOverlap(text1 string, text2 string) int { 416 // Cache the text lengths to prevent multiple calls. 417 text1Length := len(text1) 418 text2Length := len(text2) 419 // Eliminate the null case. 420 if text1Length == 0 || text2Length == 0 { 421 return 0 422 } 423 // Truncate the longer string. 424 if text1Length > text2Length { 425 text1 = text1[text1Length-text2Length:] 426 } else if text1Length < text2Length { 427 text2 = text2[0:text1Length] 428 } 429 textLength := int(math.Min(float64(text1Length), float64(text2Length))) 430 // Quick check for the worst case. 431 if text1 == text2 { 432 return textLength 433 } 434 435 // Start by looking for a single character match and increase length until no match is found. 
Performance analysis: http://neil.fraser.name/news/2010/11/04/ 436 best := 0 437 length := 1 438 for { 439 pattern := text1[textLength-length:] 440 found := strings.Index(text2, pattern) 441 if found == -1 { 442 break 443 } 444 length += found 445 if found == 0 || text1[textLength-length:] == text2[0:length] { 446 best = length 447 length++ 448 } 449 } 450 451 return best 452 } 453 454 // DiffHalfMatch checks whether the two texts share a substring which is at least half the length of the longer text. This speedup can produce non-minimal diffs. 455 func (dmp *MatchPatch) DiffHalfMatch(text1, text2 string) []string { 456 // Unused in this code, but retained for interface compatibility. 457 runeSlices := dmp.diffHalfMatch([]rune(text1), []rune(text2)) 458 if runeSlices == nil { 459 return nil 460 } 461 462 result := make([]string, len(runeSlices)) 463 for i, r := range runeSlices { 464 result[i] = string(r) 465 } 466 return result 467 } 468 469 func (dmp *MatchPatch) diffHalfMatch(text1, text2 []rune) [][]rune { 470 if dmp.Timeout <= 0 { 471 // Don't risk returning a non-optimal diff if we have unlimited time. 472 return nil 473 } 474 475 var longtext, shorttext []rune 476 if len(text1) > len(text2) { 477 longtext = text1 478 shorttext = text2 479 } else { 480 longtext = text2 481 shorttext = text1 482 } 483 484 if len(longtext) < 4 || len(shorttext)*2 < len(longtext) { 485 return nil // Pointless. 486 } 487 488 // First check if the second quarter is the seed for a half-match. 489 hm1 := dmp.diffHalfMatchI(longtext, shorttext, int(float64(len(longtext)+3)/4)) 490 491 // Check again based on the third quarter. 492 hm2 := dmp.diffHalfMatchI(longtext, shorttext, int(float64(len(longtext)+1)/2)) 493 494 if hm1 == nil && hm2 == nil { 495 return nil 496 } 497 498 var hm [][]rune 499 if hm2 == nil { 500 hm = hm1 501 } else if hm1 == nil { 502 hm = hm2 503 } else { 504 // Both matched. Select the longest. 
505 if len(hm1[4]) > len(hm2[4]) { 506 hm = hm1 507 } else { 508 hm = hm2 509 } 510 } 511 512 // A half-match was found, sort out the return data. 513 if len(text1) > len(text2) { 514 return hm 515 } 516 517 return [][]rune{hm[2], hm[3], hm[0], hm[1], hm[4]} 518 } 519 520 // diffHalfMatchI checks if a substring of shorttext exist within longtext such that the substring is at least half the length of longtext? 521 // Returns a slice containing the prefix of longtext, the suffix of longtext, the prefix of shorttext, the suffix of shorttext and the common middle, or null if there was no match. 522 func (dmp *MatchPatch) diffHalfMatchI(l, s []rune, i int) [][]rune { 523 var bestCommonA []rune 524 var bestCommonB []rune 525 var bestCommonLen int 526 var bestLongtextA []rune 527 var bestLongtextB []rune 528 var bestShorttextA []rune 529 var bestShorttextB []rune 530 531 // Start with a 1/4 length substring at position i as a seed. 532 seed := l[i : i+len(l)/4] 533 534 for j := runesIndexOf(s, seed, 0); j != -1; j = runesIndexOf(s, seed, j+1) { 535 prefixLength := commonPrefixLength(l[i:], s[j:]) 536 suffixLength := commonSuffixLength(l[:i], s[:j]) 537 538 if bestCommonLen < suffixLength+prefixLength { 539 bestCommonA = s[j-suffixLength : j] 540 bestCommonB = s[j : j+prefixLength] 541 bestCommonLen = len(bestCommonA) + len(bestCommonB) 542 bestLongtextA = l[:i-suffixLength] 543 bestLongtextB = l[i+prefixLength:] 544 bestShorttextA = s[:j-suffixLength] 545 bestShorttextB = s[j+prefixLength:] 546 } 547 } 548 549 if bestCommonLen*2 < len(l) { 550 return nil 551 } 552 553 return [][]rune{ 554 bestLongtextA, 555 bestLongtextB, 556 bestShorttextA, 557 bestShorttextB, 558 append(bestCommonA, bestCommonB...), 559 } 560 } 561 562 // diffCleanupSemantic reduces the number of edits by eliminating semantically trivial equalities. 563 func (dmp *MatchPatch) diffCleanupSemantic(diffs []Diff) []Diff { 564 changes := false 565 // Stack of indices where equalities are found. 
566 equalities := make([]int, 0, len(diffs)) 567 568 var lastequality string 569 // Always equal to diffs[equalities[equalitiesLength - 1]][1] 570 var pointer int // Index of current position. 571 // Number of characters that changed prior to the equality. 572 var lengthInsertions1, lengthDeletions1 int 573 // Number of characters that changed after the equality. 574 var lengthInsertions2, lengthDeletions2 int 575 576 for pointer < len(diffs) { 577 if diffs[pointer].Type == DiffEqual { 578 // Equality found. 579 equalities = append(equalities, pointer) 580 lengthInsertions1 = lengthInsertions2 581 lengthDeletions1 = lengthDeletions2 582 lengthInsertions2 = 0 583 lengthDeletions2 = 0 584 lastequality = diffs[pointer].Text 585 } else { 586 // An insertion or deletion. 587 588 if diffs[pointer].Type == DiffInsert { 589 lengthInsertions2 += utf8.RuneCountInString(diffs[pointer].Text) 590 } else { 591 lengthDeletions2 += utf8.RuneCountInString(diffs[pointer].Text) 592 } 593 // Eliminate an equality that is smaller or equal to the edits on both sides of it. 594 difference1 := int(math.Max(float64(lengthInsertions1), float64(lengthDeletions1))) 595 difference2 := int(math.Max(float64(lengthInsertions2), float64(lengthDeletions2))) 596 if utf8.RuneCountInString(lastequality) > 0 && 597 (utf8.RuneCountInString(lastequality) <= difference1) && 598 (utf8.RuneCountInString(lastequality) <= difference2) { 599 // Duplicate record. 600 insPoint := equalities[len(equalities)-1] 601 diffs = splice(diffs, insPoint, 0, Diff{DiffDelete, lastequality}) 602 603 // Change second copy to insert. 604 diffs[insPoint+1].Type = DiffInsert 605 // Throw away the equality we just deleted. 606 equalities = equalities[:len(equalities)-1] 607 608 if len(equalities) > 0 { 609 equalities = equalities[:len(equalities)-1] 610 } 611 pointer = -1 612 if len(equalities) > 0 { 613 pointer = equalities[len(equalities)-1] 614 } 615 616 lengthInsertions1 = 0 // Reset the counters. 
617 lengthDeletions1 = 0 618 lengthInsertions2 = 0 619 lengthDeletions2 = 0 620 lastequality = "" 621 changes = true 622 } 623 } 624 pointer++ 625 } 626 627 // Normalize the diff. 628 if changes { 629 diffs = dmp.diffCleanupMerge(diffs) 630 } 631 diffs = dmp.diffCleanupSemanticLossless(diffs) 632 // Find any overlaps between deletions and insertions. 633 // e.g: <del>abcxxx</del><ins>xxxdef</ins> 634 // -> <del>abc</del>xxx<ins>def</ins> 635 // e.g: <del>xxxabc</del><ins>defxxx</ins> 636 // -> <ins>def</ins>xxx<del>abc</del> 637 // Only extract an overlap if it is as big as the edit ahead or behind it. 638 pointer = 1 639 for pointer < len(diffs) { 640 if diffs[pointer-1].Type == DiffDelete && 641 diffs[pointer].Type == DiffInsert { 642 deletion := diffs[pointer-1].Text 643 insertion := diffs[pointer].Text 644 overlapLength1 := dmp.diffCommonOverlap(deletion, insertion) 645 overlapLength2 := dmp.diffCommonOverlap(insertion, deletion) 646 if overlapLength1 >= overlapLength2 { 647 if float64(overlapLength1) >= float64(utf8.RuneCountInString(deletion))/2 || 648 float64(overlapLength1) >= float64(utf8.RuneCountInString(insertion))/2 { 649 650 // Overlap found. Insert an equality and trim the surrounding edits. 651 diffs = splice(diffs, pointer, 0, Diff{DiffEqual, insertion[:overlapLength1]}) 652 diffs[pointer-1].Text = 653 deletion[0 : len(deletion)-overlapLength1] 654 diffs[pointer+1].Text = insertion[overlapLength1:] 655 pointer++ 656 } 657 } else { 658 if float64(overlapLength2) >= float64(utf8.RuneCountInString(deletion))/2 || 659 float64(overlapLength2) >= float64(utf8.RuneCountInString(insertion))/2 { 660 // Reverse overlap found. Insert an equality and swap and trim the surrounding edits. 
661 overlap := Diff{DiffEqual, deletion[:overlapLength2]} 662 diffs = splice(diffs, pointer, 0, overlap) 663 diffs[pointer-1].Type = DiffInsert 664 diffs[pointer-1].Text = insertion[0 : len(insertion)-overlapLength2] 665 diffs[pointer+1].Type = DiffDelete 666 diffs[pointer+1].Text = deletion[overlapLength2:] 667 pointer++ 668 } 669 } 670 pointer++ 671 } 672 pointer++ 673 } 674 675 return diffs 676 } 677 678 // Define some regex patterns for matching boundaries. 679 var ( 680 nonAlphaNumericRegex = regexp.MustCompile(`[^a-zA-Z0-9]`) 681 whitespaceRegex = regexp.MustCompile(`\s`) 682 linebreakRegex = regexp.MustCompile(`[\r\n]`) 683 blanklineEndRegex = regexp.MustCompile(`\n\r?\n$`) 684 blanklineStartRegex = regexp.MustCompile(`^\r?\n\r?\n`) 685 ) 686 687 // diffCleanupSemanticScore computes a score representing whether the internal boundary falls on logical boundaries. 688 // Scores range from 6 (best) to 0 (worst). Closure, but does not reference any external variables. 689 func (dmp *MatchPatch) diffCleanupSemanticScore(one, two string) int { 690 if len(one) == 0 || len(two) == 0 { 691 // Edges are the best. 692 return 6 693 } 694 695 // Each port of this function behaves slightly differently due to subtle differences in each language's definition of things like 'whitespace'. Since this function's purpose is largely cosmetic, the choice has been made to use each language's native features rather than force total conformity. 
696 rune1, _ := utf8.DecodeLastRuneInString(one) 697 rune2, _ := utf8.DecodeRuneInString(two) 698 char1 := string(rune1) 699 char2 := string(rune2) 700 701 nonAlphaNumeric1 := nonAlphaNumericRegex.MatchString(char1) 702 nonAlphaNumeric2 := nonAlphaNumericRegex.MatchString(char2) 703 whitespace1 := nonAlphaNumeric1 && whitespaceRegex.MatchString(char1) 704 whitespace2 := nonAlphaNumeric2 && whitespaceRegex.MatchString(char2) 705 lineBreak1 := whitespace1 && linebreakRegex.MatchString(char1) 706 lineBreak2 := whitespace2 && linebreakRegex.MatchString(char2) 707 blankLine1 := lineBreak1 && blanklineEndRegex.MatchString(one) 708 blankLine2 := lineBreak2 && blanklineEndRegex.MatchString(two) 709 710 if blankLine1 || blankLine2 { 711 // Five points for blank lines. 712 return 5 713 } else if lineBreak1 || lineBreak2 { 714 // Four points for line breaks. 715 return 4 716 } else if nonAlphaNumeric1 && !whitespace1 && whitespace2 { 717 // Three points for end of sentences. 718 return 3 719 } else if whitespace1 || whitespace2 { 720 // Two points for whitespace. 721 return 2 722 } else if nonAlphaNumeric1 || nonAlphaNumeric2 { 723 // One point for non-alphanumeric. 724 return 1 725 } 726 return 0 727 } 728 729 // diffCleanupSemanticLossless looks for single edits surrounded on both sides by equalities which can be shifted sideways to align the edit to a word boundary. 730 // E.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. 731 func (dmp *MatchPatch) diffCleanupSemanticLossless(diffs []Diff) []Diff { 732 pointer := 1 733 734 // Intentionally ignore the first and last element (don't need checking). 735 for pointer < len(diffs)-1 { 736 if diffs[pointer-1].Type == DiffEqual && 737 diffs[pointer+1].Type == DiffEqual { 738 739 // This is a single edit surrounded by equalities. 740 equality1 := diffs[pointer-1].Text 741 edit := diffs[pointer].Text 742 equality2 := diffs[pointer+1].Text 743 744 // First, shift the edit as far left as possible. 
745 commonOffset := dmp.diffCommonSuffix(equality1, edit) 746 if commonOffset > 0 { 747 commonString := edit[len(edit)-commonOffset:] 748 equality1 = equality1[0 : len(equality1)-commonOffset] 749 edit = commonString + edit[:len(edit)-commonOffset] 750 equality2 = commonString + equality2 751 } 752 753 // Second, step character by character right, looking for the best fit. 754 bestEquality1 := equality1 755 bestEdit := edit 756 bestEquality2 := equality2 757 bestScore := dmp.diffCleanupSemanticScore(equality1, edit) + 758 dmp.diffCleanupSemanticScore(edit, equality2) 759 760 for len(edit) != 0 && len(equality2) != 0 { 761 _, sz := utf8.DecodeRuneInString(edit) 762 if len(equality2) < sz || edit[:sz] != equality2[:sz] { 763 break 764 } 765 equality1 += edit[:sz] 766 edit = edit[sz:] + equality2[:sz] 767 equality2 = equality2[sz:] 768 score := dmp.diffCleanupSemanticScore(equality1, edit) + 769 dmp.diffCleanupSemanticScore(edit, equality2) 770 // The >= encourages trailing rather than leading whitespace on edits. 771 if score >= bestScore { 772 bestScore = score 773 bestEquality1 = equality1 774 bestEdit = edit 775 bestEquality2 = equality2 776 } 777 } 778 779 if diffs[pointer-1].Text != bestEquality1 { 780 // We have an improvement, save it back to the diff. 781 if len(bestEquality1) != 0 { 782 diffs[pointer-1].Text = bestEquality1 783 } else { 784 diffs = splice(diffs, pointer-1, 1) 785 pointer-- 786 } 787 788 diffs[pointer].Text = bestEdit 789 if len(bestEquality2) != 0 { 790 diffs[pointer+1].Text = bestEquality2 791 } else { 792 diffs = append(diffs[:pointer+1], diffs[pointer+2:]...) 793 pointer-- 794 } 795 } 796 } 797 pointer++ 798 } 799 800 return diffs 801 } 802 803 // diffCleanupEfficiency reduces the number of edits by eliminating operationally trivial equalities. 804 func (dmp *MatchPatch) diffCleanupEfficiency(diffs []Diff) []Diff { 805 changes := false 806 // Stack of indices where equalities are found. 
807 type equality struct { 808 data int 809 next *equality 810 } 811 var equalities *equality 812 // Always equal to equalities[equalitiesLength-1][1] 813 lastequality := "" 814 pointer := 0 // Index of current position. 815 // Is there an insertion operation before the last equality. 816 preIns := false 817 // Is there a deletion operation before the last equality. 818 preDel := false 819 // Is there an insertion operation after the last equality. 820 postIns := false 821 // Is there a deletion operation after the last equality. 822 postDel := false 823 for pointer < len(diffs) { 824 if diffs[pointer].Type == DiffEqual { // Equality found. 825 if len(diffs[pointer].Text) < dmp.EditCost && 826 (postIns || postDel) { 827 // Candidate found. 828 equalities = &equality{ 829 data: pointer, 830 next: equalities, 831 } 832 preIns = postIns 833 preDel = postDel 834 lastequality = diffs[pointer].Text 835 } else { 836 // Not a candidate, and can never become one. 837 equalities = nil 838 lastequality = "" 839 } 840 postIns = false 841 postDel = false 842 } else { // An insertion or deletion. 843 if diffs[pointer].Type == DiffDelete { 844 postDel = true 845 } else { 846 postIns = true 847 } 848 849 // Five types to be split: 850 // <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del> 851 // <ins>A</ins>X<ins>C</ins><del>D</del> 852 // <ins>A</ins><del>B</del>X<ins>C</ins> 853 // <ins>A</del>X<ins>C</ins><del>D</del> 854 // <ins>A</ins><del>B</del>X<del>C</del> 855 var sumPres int 856 if preIns { 857 sumPres++ 858 } 859 if preDel { 860 sumPres++ 861 } 862 if postIns { 863 sumPres++ 864 } 865 if postDel { 866 sumPres++ 867 } 868 if len(lastequality) > 0 && 869 ((preIns && preDel && postIns && postDel) || 870 ((len(lastequality) < dmp.EditCost/2) && sumPres == 3)) { 871 872 insPoint := equalities.data 873 874 // Duplicate record. 875 diffs = splice(diffs, insPoint, 0, Diff{DiffDelete, lastequality}) 876 877 // Change second copy to insert. 
878 diffs[insPoint+1].Type = DiffInsert 879 // Throw away the equality we just deleted. 880 equalities = equalities.next 881 lastequality = "" 882 883 if preIns && preDel { 884 // No changes made which could affect previous entry, keep going. 885 postIns = true 886 postDel = true 887 equalities = nil 888 } else { 889 if equalities != nil { 890 equalities = equalities.next 891 } 892 if equalities != nil { 893 pointer = equalities.data 894 } else { 895 pointer = -1 896 } 897 postIns = false 898 postDel = false 899 } 900 changes = true 901 } 902 } 903 pointer++ 904 } 905 906 if changes { 907 diffs = dmp.diffCleanupMerge(diffs) 908 } 909 910 return diffs 911 } 912 913 // diffCleanupMerge reorders and merges like edit sections. Merge equalities. 914 // Any edit section can move as long as it doesn't cross an equality. 915 func (dmp *MatchPatch) diffCleanupMerge(diffs []Diff) []Diff { 916 // Add a dummy entry at the end. 917 diffs = append(diffs, Diff{DiffEqual, ""}) 918 pointer := 0 919 countDelete := 0 920 countInsert := 0 921 commonlength := 0 922 textDelete := []rune(nil) 923 textInsert := []rune(nil) 924 925 for pointer < len(diffs) { 926 switch diffs[pointer].Type { 927 case DiffInsert: 928 countInsert++ 929 textInsert = append(textInsert, []rune(diffs[pointer].Text)...) 930 pointer++ 931 case DiffDelete: 932 countDelete++ 933 textDelete = append(textDelete, []rune(diffs[pointer].Text)...) 934 pointer++ 935 case DiffEqual: 936 // Upon reaching an equality, check for prior redundancies. 937 if countDelete+countInsert > 1 { 938 if countDelete != 0 && countInsert != 0 { 939 // Factor out any common prefixies. 940 commonlength = commonPrefixLength(textInsert, textDelete) 941 if commonlength != 0 { 942 x := pointer - countDelete - countInsert 943 if x > 0 && diffs[x-1].Type == DiffEqual { 944 diffs[x-1].Text += string(textInsert[:commonlength]) 945 } else { 946 diffs = append([]Diff{{DiffEqual, string(textInsert[:commonlength])}}, diffs...) 
947 pointer++ 948 } 949 textInsert = textInsert[commonlength:] 950 textDelete = textDelete[commonlength:] 951 } 952 // Factor out any common suffixies. 953 commonlength = commonSuffixLength(textInsert, textDelete) 954 if commonlength != 0 { 955 insertIndex := len(textInsert) - commonlength 956 deleteIndex := len(textDelete) - commonlength 957 diffs[pointer].Text = string(textInsert[insertIndex:]) + diffs[pointer].Text 958 textInsert = textInsert[:insertIndex] 959 textDelete = textDelete[:deleteIndex] 960 } 961 } 962 // Delete the offending records and add the merged ones. 963 if countDelete == 0 { 964 diffs = splice(diffs, pointer-countInsert, 965 countDelete+countInsert, 966 Diff{DiffInsert, string(textInsert)}) 967 } else if countInsert == 0 { 968 diffs = splice(diffs, pointer-countDelete, 969 countDelete+countInsert, 970 Diff{DiffDelete, string(textDelete)}) 971 } else { 972 diffs = splice(diffs, pointer-countDelete-countInsert, 973 countDelete+countInsert, 974 Diff{DiffDelete, string(textDelete)}, 975 Diff{DiffInsert, string(textInsert)}) 976 } 977 978 pointer = pointer - countDelete - countInsert + 1 979 if countDelete != 0 { 980 pointer++ 981 } 982 if countInsert != 0 { 983 pointer++ 984 } 985 } else if pointer != 0 && diffs[pointer-1].Type == DiffEqual { 986 // Merge this equality with the previous one. 987 diffs[pointer-1].Text += diffs[pointer].Text 988 diffs = append(diffs[:pointer], diffs[pointer+1:]...) 989 } else { 990 pointer++ 991 } 992 countInsert = 0 993 countDelete = 0 994 textDelete = nil 995 textInsert = nil 996 } 997 } 998 999 if len(diffs[len(diffs)-1].Text) == 0 { 1000 diffs = diffs[0 : len(diffs)-1] // Remove the dummy entry at the end. 1001 } 1002 1003 // Second pass: look for single edits surrounded on both sides by equalities which can be shifted sideways to eliminate an equality. E.g: A<ins>BA</ins>C -> <ins>AB</ins>AC 1004 changes := false 1005 pointer = 1 1006 // Intentionally ignore the first and last element (don't need checking). 
1007 for pointer < (len(diffs) - 1) { 1008 if diffs[pointer-1].Type == DiffEqual && 1009 diffs[pointer+1].Type == DiffEqual { 1010 // This is a single edit surrounded by equalities. 1011 if strings.HasSuffix(diffs[pointer].Text, diffs[pointer-1].Text) { 1012 // Shift the edit over the previous equality. 1013 diffs[pointer].Text = diffs[pointer-1].Text + 1014 diffs[pointer].Text[:len(diffs[pointer].Text)-len(diffs[pointer-1].Text)] 1015 diffs[pointer+1].Text = diffs[pointer-1].Text + diffs[pointer+1].Text 1016 diffs = splice(diffs, pointer-1, 1) 1017 changes = true 1018 } else if strings.HasPrefix(diffs[pointer].Text, diffs[pointer+1].Text) { 1019 // Shift the edit over the next equality. 1020 diffs[pointer-1].Text += diffs[pointer+1].Text 1021 diffs[pointer].Text = 1022 diffs[pointer].Text[len(diffs[pointer+1].Text):] + diffs[pointer+1].Text 1023 diffs = splice(diffs, pointer+1, 1) 1024 changes = true 1025 } 1026 } 1027 pointer++ 1028 } 1029 1030 // If shifts were made, the diff needs reordering and another shift sweep. 1031 if changes { 1032 diffs = dmp.diffCleanupMerge(diffs) 1033 } 1034 1035 return diffs 1036 } 1037 1038 // diffXIndex returns the equivalent location in s2. 1039 func (dmp *MatchPatch) diffXIndex(diffs []Diff, loc int) int { 1040 chars1 := 0 1041 chars2 := 0 1042 lastChars1 := 0 1043 lastChars2 := 0 1044 lastDiff := Diff{} 1045 for i := 0; i < len(diffs); i++ { 1046 aDiff := diffs[i] 1047 if aDiff.Type != DiffInsert { 1048 // Equality or deletion. 1049 chars1 += len(aDiff.Text) 1050 } 1051 if aDiff.Type != DiffDelete { 1052 // Equality or insertion. 1053 chars2 += len(aDiff.Text) 1054 } 1055 if chars1 > loc { 1056 // Overshot the location. 1057 lastDiff = aDiff 1058 break 1059 } 1060 lastChars1 = chars1 1061 lastChars2 = chars2 1062 } 1063 if lastDiff.Type == DiffDelete { 1064 // The location was deleted. 1065 return lastChars2 1066 } 1067 // Add the remaining character length. 
1068 return lastChars2 + (loc - lastChars1) 1069 } 1070 1071 // diffLinesToStrings splits two texts into a list of strings. Each string represents one line. 1072 func (dmp *MatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) { 1073 // '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. 1074 lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' 1075 1076 //Each string has the index of lineArray which it points to 1077 strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray) 1078 strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray) 1079 1080 return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray 1081 } 1082 1083 // diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string. 1084 func (dmp *MatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []uint32 { 1085 // Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. 1086 lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4 1087 lineStart := 0 1088 lineEnd := -1 1089 strs := []uint32{} 1090 1091 for lineEnd < len(text)-1 { 1092 lineEnd = indexOf(text, "\n", lineStart) 1093 1094 if lineEnd == -1 { 1095 lineEnd = len(text) - 1 1096 } 1097 1098 line := text[lineStart : lineEnd+1] 1099 lineStart = lineEnd + 1 1100 lineValue, ok := lineHash[line] 1101 1102 if ok { 1103 strs = append(strs, uint32(lineValue)) 1104 } else { 1105 *lineArray = append(*lineArray, line) 1106 lineHash[line] = len(*lineArray) - 1 1107 strs = append(strs, uint32(len(*lineArray)-1)) 1108 } 1109 } 1110 1111 return strs 1112 } 1113 1114 // runesIndex is the equivalent of strings.Index for rune slices. 
1115 func runesIndex(r1, r2 []rune) int { 1116 last := len(r1) - len(r2) 1117 for i := 0; i <= last; i++ { 1118 if runesEqual(r1[i:i+len(r2)], r2) { 1119 return i 1120 } 1121 } 1122 return -1 1123 } 1124 1125 // runesIndexOf returns the index of pattern in target, starting at target[i]. 1126 func runesIndexOf(target, pattern []rune, i int) int { 1127 if i > len(target)-1 { 1128 return -1 1129 } 1130 if i <= 0 { 1131 return runesIndex(target, pattern) 1132 } 1133 ind := runesIndex(target[i:], pattern) 1134 if ind == -1 { 1135 return -1 1136 } 1137 return ind + i 1138 } 1139 1140 func runesEqual(r1, r2 []rune) bool { 1141 if len(r1) != len(r2) { 1142 return false 1143 } 1144 for i, c := range r1 { 1145 if c != r2[i] { 1146 return false 1147 } 1148 } 1149 return true 1150 } 1151 1152 // indexOf returns the first index of pattern in str, starting at str[i]. 1153 func indexOf(str string, pattern string, i int) int { 1154 if i > len(str)-1 { 1155 return -1 1156 } 1157 if i <= 0 { 1158 return strings.Index(str, pattern) 1159 } 1160 ind := strings.Index(str[i:], pattern) 1161 if ind == -1 { 1162 return -1 1163 } 1164 return ind + i 1165 } 1166 1167 // lastIndexOf returns the last index of pattern in str, starting at str[i]. 1168 func lastIndexOf(str string, pattern string, i int) int { 1169 if i < 0 { 1170 return -1 1171 } 1172 if i >= len(str) { 1173 return strings.LastIndex(str, pattern) 1174 } 1175 _, size := utf8.DecodeRuneInString(str[i:]) 1176 return strings.LastIndex(str[:i+size], pattern) 1177 } 1178 1179 func intArrayToString(ns []uint32) string { 1180 if len(ns) == 0 { 1181 return "" 1182 } 1183 1184 indexSeparator := IndexSeparator[0] 1185 1186 // Appr. 3 chars per num plus the comma. 1187 b := []byte{} 1188 for _, n := range ns { 1189 b = strconv.AppendInt(b, int64(n), 10) 1190 b = append(b, indexSeparator) 1191 } 1192 b = b[:len(b)-1] 1193 return string(b) 1194 } 1195 1196 // unescaper unescapes selected chars for compatibility with JavaScript's encodeURI. 
1197 // In speed critical applications this could be dropped since the receiving application will certainly decode these fine. Note that this function is case-sensitive. Thus "%3F" would not be unescaped. But this is ok because it is only called with the output of HttpUtility.UrlEncode which returns lowercase hex. Example: "%3f" -> "?", "%24" -> "$", etc. 1198 var unescaper = strings.NewReplacer( 1199 "%21", "!", "%7E", "~", "%27", "'", 1200 "%28", "(", "%29", ")", "%3B", ";", 1201 "%2F", "/", "%3F", "?", "%3A", ":", 1202 "%40", "@", "%26", "&", "%3D", "=", 1203 "%2B", "+", "%24", "$", "%2C", ",", "%23", "#", "%2A", "*") 1204 1205 func min(x, y int) int { 1206 if x < y { 1207 return x 1208 } 1209 return y 1210 } 1211 1212 func max(x, y int) int { 1213 if x > y { 1214 return x 1215 } 1216 return y 1217 } 1218 1219 // splice removes amount elements from slice at index index, replacing them with elements. 1220 func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff { 1221 if len(elements) == amount { 1222 // Easy case: overwrite the relevant items. 1223 copy(slice[index:], elements) 1224 return slice 1225 } 1226 if len(elements) < amount { 1227 // Fewer new items than old. 1228 // Copy in the new items. 1229 copy(slice[index:], elements) 1230 // Shift the remaining items left. 1231 copy(slice[index+len(elements):], slice[index+amount:]) 1232 // Calculate the new end of the slice. 1233 end := len(slice) - amount + len(elements) 1234 // Zero stranded elements at end so that they can be garbage collected. 1235 tail := slice[end:] 1236 for i := range tail { 1237 tail[i] = Diff{} 1238 } 1239 return slice[:end] 1240 } 1241 // More new items than old. 1242 // Make room in slice for new elements. 1243 // There's probably an even more efficient way to do this, 1244 // but this is simple and clear. 
1245 need := len(slice) - amount + len(elements) 1246 for len(slice) < need { 1247 slice = append(slice, Diff{}) 1248 } 1249 // Shift slice elements right to make room for new elements. 1250 copy(slice[index+len(elements):], slice[index+amount:]) 1251 // Copy in new elements. 1252 copy(slice[index:], elements) 1253 return slice 1254 } 1255 1256 // commonPrefixLength returns the length of the common prefix of two rune slices. 1257 func commonPrefixLength(text1, text2 []rune) int { 1258 // Linear search. See comment in commonSuffixLength. 1259 n := 0 1260 for ; n < len(text1) && n < len(text2); n++ { 1261 if text1[n] != text2[n] { 1262 return n 1263 } 1264 } 1265 return n 1266 } 1267 1268 // commonSuffixLength returns the length of the common suffix of two rune slices. 1269 func commonSuffixLength(text1, text2 []rune) int { 1270 // Use linear search rather than the binary search discussed at https://neil.fraser.name/news/2007/10/09/. 1271 // See discussion at https://github.com/sergi/go-diff/issues/54. 1272 i1 := len(text1) 1273 i2 := len(text2) 1274 for n := 0; ; n++ { 1275 i1-- 1276 i2-- 1277 if i1 < 0 || i2 < 0 || text1[i1] != text2[i2] { 1278 return n 1279 } 1280 } 1281 }