github.com/cockroachdb/tools@v0.0.0-20230222021103-a6d27438930d/internal/diff/lcs/old.go (about) 1 // Copyright 2022 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package lcs 6 7 // TODO(adonovan): remove unclear references to "old" in this package. 8 9 import ( 10 "fmt" 11 ) 12 13 // A Diff is a replacement of a portion of A by a portion of B. 14 type Diff struct { 15 Start, End int // offsets of portion to delete in A 16 ReplStart, ReplEnd int // offset of replacement text in B 17 } 18 19 // DiffStrings returns the differences between two strings. 20 // It does not respect rune boundaries. 21 func DiffStrings(a, b string) []Diff { return diff(stringSeqs{a, b}) } 22 23 // DiffBytes returns the differences between two byte sequences. 24 // It does not respect rune boundaries. 25 func DiffBytes(a, b []byte) []Diff { return diff(bytesSeqs{a, b}) } 26 27 // DiffRunes returns the differences between two rune sequences. 28 func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) } 29 30 func diff(seqs sequences) []Diff { 31 // A limit on how deeply the LCS algorithm should search. The value is just a guess. 32 const maxDiffs = 30 33 diff, _ := compute(seqs, twosided, maxDiffs/2) 34 return diff 35 } 36 37 // compute computes the list of differences between two sequences, 38 // along with the LCS. It is exercised directly by tests. 39 // The algorithm is one of {forward, backward, twosided}. 40 func compute(seqs sequences, algo func(*editGraph) lcs, limit int) ([]Diff, lcs) { 41 if limit <= 0 { 42 limit = 1 << 25 // effectively infinity 43 } 44 alen, blen := seqs.lengths() 45 g := &editGraph{ 46 seqs: seqs, 47 vf: newtriang(limit), 48 vb: newtriang(limit), 49 limit: limit, 50 ux: alen, 51 uy: blen, 52 delta: alen - blen, 53 } 54 lcs := algo(g) 55 diffs := lcs.toDiffs(alen, blen) 56 return diffs, lcs 57 } 58 59 // editGraph carries the information for computing the lcs of two sequences. 60 type editGraph struct { 61 seqs sequences 62 vf, vb label // forward and backward labels 63 64 limit int // maximal value of D 65 // the bounding rectangle of the current edit graph 66 lx, ly, ux, uy int 67 delta int // common subexpression: (ux-lx)-(uy-ly) 68 } 69 70 // toDiffs converts an LCS to a list of edits. 71 func (lcs lcs) toDiffs(alen, blen int) []Diff { 72 var diffs []Diff 73 var pa, pb int // offsets in a, b 74 for _, l := range lcs { 75 if pa < l.X || pb < l.Y { 76 diffs = append(diffs, Diff{pa, l.X, pb, l.Y}) 77 } 78 pa = l.X + l.Len 79 pb = l.Y + l.Len 80 } 81 if pa < alen || pb < blen { 82 diffs = append(diffs, Diff{pa, alen, pb, blen}) 83 } 84 return diffs 85 } 86 87 // --- FORWARD --- 88 89 // fdone decides if the forwward path has reached the upper right 90 // corner of the rectangle. If so, it also returns the computed lcs. 91 func (e *editGraph) fdone(D, k int) (bool, lcs) { 92 // x, y, k are relative to the rectangle 93 x := e.vf.get(D, k) 94 y := x - k 95 if x == e.ux && y == e.uy { 96 return true, e.forwardlcs(D, k) 97 } 98 return false, nil 99 } 100 101 // run the forward algorithm, until success or up to the limit on D. 102 func forward(e *editGraph) lcs { 103 e.setForward(0, 0, e.lx) 104 if ok, ans := e.fdone(0, 0); ok { 105 return ans 106 } 107 // from D to D+1 108 for D := 0; D < e.limit; D++ { 109 e.setForward(D+1, -(D + 1), e.getForward(D, -D)) 110 if ok, ans := e.fdone(D+1, -(D + 1)); ok { 111 return ans 112 } 113 e.setForward(D+1, D+1, e.getForward(D, D)+1) 114 if ok, ans := e.fdone(D+1, D+1); ok { 115 return ans 116 } 117 for k := -D + 1; k <= D-1; k += 2 { 118 // these are tricky and easy to get backwards 119 lookv := e.lookForward(k, e.getForward(D, k-1)+1) 120 lookh := e.lookForward(k, e.getForward(D, k+1)) 121 if lookv > lookh { 122 e.setForward(D+1, k, lookv) 123 } else { 124 e.setForward(D+1, k, lookh) 125 } 126 if ok, ans := e.fdone(D+1, k); ok { 127 return ans 128 } 129 } 130 } 131 // D is too large 132 // find the D path with maximal x+y inside the rectangle and 133 // use that to compute the found part of the lcs 134 kmax := -e.limit - 1 135 diagmax := -1 136 for k := -e.limit; k <= e.limit; k += 2 { 137 x := e.getForward(e.limit, k) 138 y := x - k 139 if x+y > diagmax && x <= e.ux && y <= e.uy { 140 diagmax, kmax = x+y, k 141 } 142 } 143 return e.forwardlcs(e.limit, kmax) 144 } 145 146 // recover the lcs by backtracking from the farthest point reached 147 func (e *editGraph) forwardlcs(D, k int) lcs { 148 var ans lcs 149 for x := e.getForward(D, k); x != 0 || x-k != 0; { 150 if ok(D-1, k-1) && x-1 == e.getForward(D-1, k-1) { 151 // if (x-1,y) is labelled D-1, x--,D--,k--,continue 152 D, k, x = D-1, k-1, x-1 153 continue 154 } else if ok(D-1, k+1) && x == e.getForward(D-1, k+1) { 155 // if (x,y-1) is labelled D-1, x, D--,k++, continue 156 D, k = D-1, k+1 157 continue 158 } 159 // if (x-1,y-1)--(x,y) is a diagonal, prepend,x--,y--, continue 160 y := x - k 161 ans = ans.prepend(x+e.lx-1, y+e.ly-1) 162 x-- 163 } 164 return ans 165 } 166 167 // start at (x,y), go up the diagonal as far as possible, 168 // and label the result with d 169 func (e *editGraph) lookForward(k, relx int) int { 170 rely := relx - k 171 x, y := relx+e.lx, rely+e.ly 172 if x < e.ux && y < e.uy { 173 x += e.seqs.commonPrefixLen(x, e.ux, y, e.uy) 174 } 175 return x 176 } 177 178 func (e *editGraph) setForward(d, k, relx int) { 179 x := e.lookForward(k, relx) 180 e.vf.set(d, k, x-e.lx) 181 } 182 183 func (e *editGraph) getForward(d, k int) int { 184 x := e.vf.get(d, k) 185 return x 186 } 187 188 // --- BACKWARD --- 189 190 // bdone decides if the backward path has reached the lower left corner 191 func (e *editGraph) bdone(D, k int) (bool, lcs) { 192 // x, y, k are relative to the rectangle 193 x := e.vb.get(D, k) 194 y := x - (k + e.delta) 195 if x == 0 && y == 0 { 196 return true, e.backwardlcs(D, k) 197 } 198 return false, nil 199 } 200 201 // run the backward algorithm, until success or up to the limit on D. 202 func backward(e *editGraph) lcs { 203 e.setBackward(0, 0, e.ux) 204 if ok, ans := e.bdone(0, 0); ok { 205 return ans 206 } 207 // from D to D+1 208 for D := 0; D < e.limit; D++ { 209 e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1) 210 if ok, ans := e.bdone(D+1, -(D + 1)); ok { 211 return ans 212 } 213 e.setBackward(D+1, D+1, e.getBackward(D, D)) 214 if ok, ans := e.bdone(D+1, D+1); ok { 215 return ans 216 } 217 for k := -D + 1; k <= D-1; k += 2 { 218 // these are tricky and easy to get wrong 219 lookv := e.lookBackward(k, e.getBackward(D, k-1)) 220 lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) 221 if lookv < lookh { 222 e.setBackward(D+1, k, lookv) 223 } else { 224 e.setBackward(D+1, k, lookh) 225 } 226 if ok, ans := e.bdone(D+1, k); ok { 227 return ans 228 } 229 } 230 } 231 232 // D is too large 233 // find the D path with minimal x+y inside the rectangle and 234 // use that to compute the part of the lcs found 235 kmax := -e.limit - 1 236 diagmin := 1 << 25 237 for k := -e.limit; k <= e.limit; k += 2 { 238 x := e.getBackward(e.limit, k) 239 y := x - (k + e.delta) 240 if x+y < diagmin && x >= 0 && y >= 0 { 241 diagmin, kmax = x+y, k 242 } 243 } 244 if kmax < -e.limit { 245 panic(fmt.Sprintf("no paths when limit=%d?", e.limit)) 246 } 247 return e.backwardlcs(e.limit, kmax) 248 } 249 250 // recover the lcs by backtracking 251 func (e *editGraph) backwardlcs(D, k int) lcs { 252 var ans lcs 253 for x := e.getBackward(D, k); x != e.ux || x-(k+e.delta) != e.uy; { 254 if ok(D-1, k-1) && x == e.getBackward(D-1, k-1) { 255 // D--, k--, x unchanged 256 D, k = D-1, k-1 257 continue 258 } else if ok(D-1, k+1) && x+1 == e.getBackward(D-1, k+1) { 259 // D--, k++, x++ 260 D, k, x = D-1, k+1, x+1 261 continue 262 } 263 y := x - (k + e.delta) 264 ans = ans.append(x+e.lx, y+e.ly) 265 x++ 266 } 267 return ans 268 } 269 270 // start at (x,y), go down the diagonal as far as possible, 271 func (e *editGraph) lookBackward(k, relx int) int { 272 rely := relx - (k + e.delta) // forward k = k + e.delta 273 x, y := relx+e.lx, rely+e.ly 274 if x > 0 && y > 0 { 275 x -= e.seqs.commonSuffixLen(0, x, 0, y) 276 } 277 return x 278 } 279 280 // convert to rectangle, and label the result with d 281 func (e *editGraph) setBackward(d, k, relx int) { 282 x := e.lookBackward(k, relx) 283 e.vb.set(d, k, x-e.lx) 284 } 285 286 func (e *editGraph) getBackward(d, k int) int { 287 x := e.vb.get(d, k) 288 return x 289 } 290 291 // -- TWOSIDED --- 292 293 func twosided(e *editGraph) lcs { 294 // The termination condition could be improved, as either the forward 295 // or backward pass could succeed before Myers' Lemma applies. 296 // Aside from questions of efficiency (is the extra testing cost-effective) 297 // this is more likely to matter when e.limit is reached. 298 e.setForward(0, 0, e.lx) 299 e.setBackward(0, 0, e.ux) 300 301 // from D to D+1 302 for D := 0; D < e.limit; D++ { 303 // just finished a backwards pass, so check 304 if got, ok := e.twoDone(D, D); ok { 305 return e.twolcs(D, D, got) 306 } 307 // do a forwards pass (D to D+1) 308 e.setForward(D+1, -(D + 1), e.getForward(D, -D)) 309 e.setForward(D+1, D+1, e.getForward(D, D)+1) 310 for k := -D + 1; k <= D-1; k += 2 { 311 // these are tricky and easy to get backwards 312 lookv := e.lookForward(k, e.getForward(D, k-1)+1) 313 lookh := e.lookForward(k, e.getForward(D, k+1)) 314 if lookv > lookh { 315 e.setForward(D+1, k, lookv) 316 } else { 317 e.setForward(D+1, k, lookh) 318 } 319 } 320 // just did a forward pass, so check 321 if got, ok := e.twoDone(D+1, D); ok { 322 return e.twolcs(D+1, D, got) 323 } 324 // do a backward pass, D to D+1 325 e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1) 326 e.setBackward(D+1, D+1, e.getBackward(D, D)) 327 for k := -D + 1; k <= D-1; k += 2 { 328 // these are tricky and easy to get wrong 329 lookv := e.lookBackward(k, e.getBackward(D, k-1)) 330 lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) 331 if lookv < lookh { 332 e.setBackward(D+1, k, lookv) 333 } else { 334 e.setBackward(D+1, k, lookh) 335 } 336 } 337 } 338 339 // D too large. combine a forward and backward partial lcs 340 // first, a forward one 341 kmax := -e.limit - 1 342 diagmax := -1 343 for k := -e.limit; k <= e.limit; k += 2 { 344 x := e.getForward(e.limit, k) 345 y := x - k 346 if x+y > diagmax && x <= e.ux && y <= e.uy { 347 diagmax, kmax = x+y, k 348 } 349 } 350 if kmax < -e.limit { 351 panic(fmt.Sprintf("no forward paths when limit=%d?", e.limit)) 352 } 353 lcs := e.forwardlcs(e.limit, kmax) 354 // now a backward one 355 // find the D path with minimal x+y inside the rectangle and 356 // use that to compute the lcs 357 diagmin := 1 << 25 // infinity 358 for k := -e.limit; k <= e.limit; k += 2 { 359 x := e.getBackward(e.limit, k) 360 y := x - (k + e.delta) 361 if x+y < diagmin && x >= 0 && y >= 0 { 362 diagmin, kmax = x+y, k 363 } 364 } 365 if kmax < -e.limit { 366 panic(fmt.Sprintf("no backward paths when limit=%d?", e.limit)) 367 } 368 lcs = append(lcs, e.backwardlcs(e.limit, kmax)...) 369 // These may overlap (e.forwardlcs and e.backwardlcs return sorted lcs) 370 ans := lcs.fix() 371 return ans 372 } 373 374 // Does Myers' Lemma apply? 375 func (e *editGraph) twoDone(df, db int) (int, bool) { 376 if (df+db+e.delta)%2 != 0 { 377 return 0, false // diagonals cannot overlap 378 } 379 kmin := -db + e.delta 380 if -df > kmin { 381 kmin = -df 382 } 383 kmax := db + e.delta 384 if df < kmax { 385 kmax = df 386 } 387 for k := kmin; k <= kmax; k += 2 { 388 x := e.vf.get(df, k) 389 u := e.vb.get(db, k-e.delta) 390 if u <= x { 391 // is it worth looking at all the other k? 392 for l := k; l <= kmax; l += 2 { 393 x := e.vf.get(df, l) 394 y := x - l 395 u := e.vb.get(db, l-e.delta) 396 v := u - l 397 if x == u || u == 0 || v == 0 || y == e.uy || x == e.ux { 398 return l, true 399 } 400 } 401 return k, true 402 } 403 } 404 return 0, false 405 } 406 407 func (e *editGraph) twolcs(df, db, kf int) lcs { 408 // db==df || db+1==df 409 x := e.vf.get(df, kf) 410 y := x - kf 411 kb := kf - e.delta 412 u := e.vb.get(db, kb) 413 v := u - kf 414 415 // Myers proved there is a df-path from (0,0) to (u,v) 416 // and a db-path from (x,y) to (N,M). 417 // In the first case the overall path is the forward path 418 // to (u,v) followed by the backward path to (N,M). 419 // In the second case the path is the backward path to (x,y) 420 // followed by the forward path to (x,y) from (0,0). 421 422 // Look for some special cases to avoid computing either of these paths. 423 if x == u { 424 // "babaab" "cccaba" 425 // already patched together 426 lcs := e.forwardlcs(df, kf) 427 lcs = append(lcs, e.backwardlcs(db, kb)...) 428 return lcs.sort() 429 } 430 431 // is (u-1,v) or (u,v-1) labelled df-1? 432 // if so, that forward df-1-path plus a horizontal or vertical edge 433 // is the df-path to (u,v), then plus the db-path to (N,M) 434 if u > 0 && ok(df-1, u-1-v) && e.vf.get(df-1, u-1-v) == u-1 { 435 // "aabbab" "cbcabc" 436 lcs := e.forwardlcs(df-1, u-1-v) 437 lcs = append(lcs, e.backwardlcs(db, kb)...) 438 return lcs.sort() 439 } 440 if v > 0 && ok(df-1, (u-(v-1))) && e.vf.get(df-1, u-(v-1)) == u { 441 // "abaabb" "bcacab" 442 lcs := e.forwardlcs(df-1, u-(v-1)) 443 lcs = append(lcs, e.backwardlcs(db, kb)...) 444 return lcs.sort() 445 } 446 447 // The path can't possibly contribute to the lcs because it 448 // is all horizontal or vertical edges 449 if u == 0 || v == 0 || x == e.ux || y == e.uy { 450 // "abaabb" "abaaaa" 451 if u == 0 || v == 0 { 452 return e.backwardlcs(db, kb) 453 } 454 return e.forwardlcs(df, kf) 455 } 456 457 // is (x+1,y) or (x,y+1) labelled db-1? 458 if x+1 <= e.ux && ok(db-1, x+1-y-e.delta) && e.vb.get(db-1, x+1-y-e.delta) == x+1 { 459 // "bababb" "baaabb" 460 lcs := e.backwardlcs(db-1, kb+1) 461 lcs = append(lcs, e.forwardlcs(df, kf)...) 462 return lcs.sort() 463 } 464 if y+1 <= e.uy && ok(db-1, x-(y+1)-e.delta) && e.vb.get(db-1, x-(y+1)-e.delta) == x { 465 // "abbbaa" "cabacc" 466 lcs := e.backwardlcs(db-1, kb-1) 467 lcs = append(lcs, e.forwardlcs(df, kf)...) 468 return lcs.sort() 469 } 470 471 // need to compute another path 472 // "aabbaa" "aacaba" 473 lcs := e.backwardlcs(db, kb) 474 oldx, oldy := e.ux, e.uy 475 e.ux = u 476 e.uy = v 477 lcs = append(lcs, forward(e)...) 478 e.ux, e.uy = oldx, oldy 479 return lcs.sort() 480 }