github.com/cockroachdb/tools@v0.0.0-20230222021103-a6d27438930d/internal/diff/lcs/old.go

github.com/cockroachdb/tools@v0.0.0-20230222021103-a6d27438930d/internal/diff/lcs/old.go (about)

     1  // Copyright 2022 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package lcs
     6  
     7  // TODO(adonovan): remove unclear references to "old" in this package.
     8  
     9  import (
    10  	"fmt"
    11  )
    12  
    13  // A Diff is a replacement of a portion of A by a portion of B.
    14  type Diff struct {
    15  	Start, End         int // offsets of portion to delete in A
    16  	ReplStart, ReplEnd int // offset of replacement text in B
    17  }
    18  
    19  // DiffStrings returns the differences between two strings.
    20  // It does not respect rune boundaries.
    21  func DiffStrings(a, b string) []Diff { return diff(stringSeqs{a, b}) }
    22  
    23  // DiffBytes returns the differences between two byte sequences.
    24  // It does not respect rune boundaries.
    25  func DiffBytes(a, b []byte) []Diff { return diff(bytesSeqs{a, b}) }
    26  
    27  // DiffRunes returns the differences between two rune sequences.
    28  func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) }
    29  
    30  func diff(seqs sequences) []Diff {
    31  	// A limit on how deeply the LCS algorithm should search. The value is just a guess.
    32  	const maxDiffs = 30
    33  	diff, _ := compute(seqs, twosided, maxDiffs/2)
    34  	return diff
    35  }
    36  
    37  // compute computes the list of differences between two sequences,
    38  // along with the LCS. It is exercised directly by tests.
    39  // The algorithm is one of {forward, backward, twosided}.
    40  func compute(seqs sequences, algo func(*editGraph) lcs, limit int) ([]Diff, lcs) {
    41  	if limit <= 0 {
    42  		limit = 1 << 25 // effectively infinity
    43  	}
    44  	alen, blen := seqs.lengths()
    45  	g := &editGraph{
    46  		seqs:  seqs,
    47  		vf:    newtriang(limit),
    48  		vb:    newtriang(limit),
    49  		limit: limit,
    50  		ux:    alen,
    51  		uy:    blen,
    52  		delta: alen - blen,
    53  	}
    54  	lcs := algo(g)
    55  	diffs := lcs.toDiffs(alen, blen)
    56  	return diffs, lcs
    57  }
    58  
    59  // editGraph carries the information for computing the lcs of two sequences.
    60  type editGraph struct {
    61  	seqs   sequences
    62  	vf, vb label // forward and backward labels
    63  
    64  	limit int // maximal value of D
    65  	// the bounding rectangle of the current edit graph
    66  	lx, ly, ux, uy int
    67  	delta          int // common subexpression: (ux-lx)-(uy-ly)
    68  }
    69  
    70  // toDiffs converts an LCS to a list of edits.
    71  func (lcs lcs) toDiffs(alen, blen int) []Diff {
    72  	var diffs []Diff
    73  	var pa, pb int // offsets in a, b
    74  	for _, l := range lcs {
    75  		if pa < l.X || pb < l.Y {
    76  			diffs = append(diffs, Diff{pa, l.X, pb, l.Y})
    77  		}
    78  		pa = l.X + l.Len
    79  		pb = l.Y + l.Len
    80  	}
    81  	if pa < alen || pb < blen {
    82  		diffs = append(diffs, Diff{pa, alen, pb, blen})
    83  	}
    84  	return diffs
    85  }
    86  
    87  // --- FORWARD ---
    88  
    89  // fdone decides if the forwward path has reached the upper right
    90  // corner of the rectangle. If so, it also returns the computed lcs.
    91  func (e *editGraph) fdone(D, k int) (bool, lcs) {
    92  	// x, y, k are relative to the rectangle
    93  	x := e.vf.get(D, k)
    94  	y := x - k
    95  	if x == e.ux && y == e.uy {
    96  		return true, e.forwardlcs(D, k)
    97  	}
    98  	return false, nil
    99  }
   100  
// forward runs the forward algorithm until it succeeds or until D
// reaches e.limit.
//
// On success it returns the lcs produced by fdone. If the limit is
// reached first, it returns the part of the lcs recovered from the
// D=e.limit label that has the largest x+y while still lying inside
// the rectangle.
func forward(e *editGraph) lcs {
	// D=0: start on diagonal 0 at the lower x bound.
	e.setForward(0, 0, e.lx)
	if ok, ans := e.fdone(0, 0); ok {
		return ans
	}
	// from D to D+1
	for D := 0; D < e.limit; D++ {
		// New lowest diagonal -(D+1): derived from diagonal -D with x
		// unchanged (so y = x-k grows by one).
		e.setForward(D+1, -(D + 1), e.getForward(D, -D))
		if ok, ans := e.fdone(D+1, -(D + 1)); ok {
			return ans
		}
		// New highest diagonal D+1: derived from diagonal D with x+1.
		e.setForward(D+1, D+1, e.getForward(D, D)+1)
		if ok, ans := e.fdone(D+1, D+1); ok {
			return ans
		}
		// Interior diagonals can be entered from either neighbour;
		// keep whichever candidate gets further (larger x).
		for k := -D + 1; k <= D-1; k += 2 {
			// these are tricky and easy to get backwards
			lookv := e.lookForward(k, e.getForward(D, k-1)+1) // from k-1, advancing x
			lookh := e.lookForward(k, e.getForward(D, k+1))   // from k+1, x unchanged
			if lookv > lookh {
				e.setForward(D+1, k, lookv)
			} else {
				e.setForward(D+1, k, lookh)
			}
			if ok, ans := e.fdone(D+1, k); ok {
				return ans
			}
		}
	}
	// D is too large
	// find the D path with maximal x+y inside the rectangle and
	// use that to compute the found part of the lcs
	kmax := -e.limit - 1 // below every diagonal visited by the loop
	diagmax := -1
	for k := -e.limit; k <= e.limit; k += 2 {
		x := e.getForward(e.limit, k)
		y := x - k
		if x+y > diagmax && x <= e.ux && y <= e.uy {
			diagmax, kmax = x+y, k
		}
	}
	// NOTE(review): unlike backward and twosided, there is no check here
	// that some diagonal was actually selected (kmax >= -e.limit).
	return e.forwardlcs(e.limit, kmax)
}
   145  
// forwardlcs recovers the lcs by backtracking from the point labelled
// (D, k) by the forward search, the farthest point reached, until the
// relative point (0, 0) is reached.
//
// At each step it either undoes one edit (moving to a D-1 label on a
// neighbouring diagonal) or, when neither neighbour matches, treats the
// step as a diagonal and prepends the matched position to the result.
func (e *editGraph) forwardlcs(D, k int) lcs {
	var ans lcs
	// The loop ends when both x and y (= x-k) are zero.
	for x := e.getForward(D, k); x != 0 || x-k != 0; {
		// ok is defined elsewhere in the package; presumably it reports
		// whether (D-1, k±1) is a valid label index — confirm.
		if ok(D-1, k-1) && x-1 == e.getForward(D-1, k-1) {
			// if (x-1,y) is labelled D-1, x--,D--,k--,continue
			D, k, x = D-1, k-1, x-1
			continue
		} else if ok(D-1, k+1) && x == e.getForward(D-1, k+1) {
			// if (x,y-1) is labelled D-1, x, D--,k++, continue
			D, k = D-1, k+1
			continue
		}
		// if (x-1,y-1)--(x,y) is a diagonal, prepend,x--,y--, continue
		y := x - k
		// Record the matched element, converted from rectangle-relative
		// coordinates by adding (lx, ly).
		ans = ans.prepend(x+e.lx-1, y+e.ly-1)
		x-- // k is unchanged, so y = x-k decreases as well
	}
	return ans
}
   166  
   167  // start at (x,y), go up the diagonal as far as possible,
   168  // and label the result with d
   169  func (e *editGraph) lookForward(k, relx int) int {
   170  	rely := relx - k
   171  	x, y := relx+e.lx, rely+e.ly
   172  	if x < e.ux && y < e.uy {
   173  		x += e.seqs.commonPrefixLen(x, e.ux, y, e.uy)
   174  	}
   175  	return x
   176  }
   177  
   178  func (e *editGraph) setForward(d, k, relx int) {
   179  	x := e.lookForward(k, relx)
   180  	e.vf.set(d, k, x-e.lx)
   181  }
   182  
   183  func (e *editGraph) getForward(d, k int) int {
   184  	x := e.vf.get(d, k)
   185  	return x
   186  }
   187  
   188  // --- BACKWARD ---
   189  
   190  // bdone decides if the backward path has reached the lower left corner
   191  func (e *editGraph) bdone(D, k int) (bool, lcs) {
   192  	// x, y, k are relative to the rectangle
   193  	x := e.vb.get(D, k)
   194  	y := x - (k + e.delta)
   195  	if x == 0 && y == 0 {
   196  		return true, e.backwardlcs(D, k)
   197  	}
   198  	return false, nil
   199  }
   200  
// backward runs the backward algorithm until it succeeds or until D
// reaches e.limit.
//
// On success it returns the lcs produced by bdone. If the limit is
// reached first, it returns the part of the lcs recovered from the
// D=e.limit label that has the smallest x+y with x and y both
// non-negative. It panics if no such label exists.
func backward(e *editGraph) lcs {
	// D=0: start on backward diagonal 0 at the upper x bound.
	e.setBackward(0, 0, e.ux)
	if ok, ans := e.bdone(0, 0); ok {
		return ans
	}
	// from D to D+1
	for D := 0; D < e.limit; D++ {
		// New lowest diagonal -(D+1): derived from diagonal -D with x-1.
		e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1)
		if ok, ans := e.bdone(D+1, -(D + 1)); ok {
			return ans
		}
		// New highest diagonal D+1: derived from diagonal D with x
		// unchanged (so y = x-(k+delta) shrinks by one).
		e.setBackward(D+1, D+1, e.getBackward(D, D))
		if ok, ans := e.bdone(D+1, D+1); ok {
			return ans
		}
		// Interior diagonals can be entered from either neighbour;
		// keep whichever candidate gets further back (smaller x).
		for k := -D + 1; k <= D-1; k += 2 {
			// these are tricky and easy to get wrong
			lookv := e.lookBackward(k, e.getBackward(D, k-1))   // from k-1, x unchanged
			lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) // from k+1, retreating x
			if lookv < lookh {
				e.setBackward(D+1, k, lookv)
			} else {
				e.setBackward(D+1, k, lookh)
			}
			if ok, ans := e.bdone(D+1, k); ok {
				return ans
			}
		}
	}

	// D is too large
	// find the D path with minimal x+y inside the rectangle and
	// use that to compute the part of the lcs found
	kmax := -e.limit - 1 // sentinel: below every diagonal visited by the loop
	diagmin := 1 << 25   // larger than any x+y expected here
	for k := -e.limit; k <= e.limit; k += 2 {
		x := e.getBackward(e.limit, k)
		y := x - (k + e.delta)
		if x+y < diagmin && x >= 0 && y >= 0 {
			diagmin, kmax = x+y, k
		}
	}
	// The sentinel surviving means no label satisfied the test above.
	if kmax < -e.limit {
		panic(fmt.Sprintf("no paths when limit=%d?", e.limit))
	}
	return e.backwardlcs(e.limit, kmax)
}
   249  
// backwardlcs recovers the lcs by backtracking from the point labelled
// (D, k) by the backward search until the point (e.ux, e.uy) is reached.
//
// At each step it either undoes one edit (moving to a D-1 label on a
// neighbouring diagonal) or, when neither neighbour matches, treats the
// step as a diagonal and appends the matched position to the result.
func (e *editGraph) backwardlcs(D, k int) lcs {
	var ans lcs
	// The loop ends when x == e.ux and y (= x-(k+e.delta)) == e.uy.
	for x := e.getBackward(D, k); x != e.ux || x-(k+e.delta) != e.uy; {
		// ok is defined elsewhere in the package; presumably it reports
		// whether (D-1, k±1) is a valid label index — confirm.
		if ok(D-1, k-1) && x == e.getBackward(D-1, k-1) {
			// D--, k--, x unchanged
			D, k = D-1, k-1
			continue
		} else if ok(D-1, k+1) && x+1 == e.getBackward(D-1, k+1) {
			// D--, k++, x++
			D, k, x = D-1, k+1, x+1
			continue
		}
		// Otherwise (x,y)--(x+1,y+1) is taken to be a diagonal: record
		// the match, converted from rectangle-relative coordinates by
		// adding (lx, ly), then step forward along the diagonal.
		y := x - (k + e.delta)
		ans = ans.append(x+e.lx, y+e.ly)
		x++ // k is unchanged, so y increases as well
	}
	return ans
}
   269  
   270  // start at (x,y), go down the diagonal as far as possible,
   271  func (e *editGraph) lookBackward(k, relx int) int {
   272  	rely := relx - (k + e.delta) // forward k = k + e.delta
   273  	x, y := relx+e.lx, rely+e.ly
   274  	if x > 0 && y > 0 {
   275  		x -= e.seqs.commonSuffixLen(0, x, 0, y)
   276  	}
   277  	return x
   278  }
   279  
   280  // convert to rectangle, and label the result with d
   281  func (e *editGraph) setBackward(d, k, relx int) {
   282  	x := e.lookBackward(k, relx)
   283  	e.vb.set(d, k, x-e.lx)
   284  }
   285  
   286  func (e *editGraph) getBackward(d, k int) int {
   287  	x := e.vb.get(d, k)
   288  	return x
   289  }
   290  
   291  // -- TWOSIDED ---
   292  
   293  func twosided(e *editGraph) lcs {
   294  	// The termination condition could be improved, as either the forward
   295  	// or backward pass could succeed before Myers' Lemma applies.
   296  	// Aside from questions of efficiency (is the extra testing cost-effective)
   297  	// this is more likely to matter when e.limit is reached.
   298  	e.setForward(0, 0, e.lx)
   299  	e.setBackward(0, 0, e.ux)
   300  
   301  	// from D to D+1
   302  	for D := 0; D < e.limit; D++ {
   303  		// just finished a backwards pass, so check
   304  		if got, ok := e.twoDone(D, D); ok {
   305  			return e.twolcs(D, D, got)
   306  		}
   307  		// do a forwards pass (D to D+1)
   308  		e.setForward(D+1, -(D + 1), e.getForward(D, -D))
   309  		e.setForward(D+1, D+1, e.getForward(D, D)+1)
   310  		for k := -D + 1; k <= D-1; k += 2 {
   311  			// these are tricky and easy to get backwards
   312  			lookv := e.lookForward(k, e.getForward(D, k-1)+1)
   313  			lookh := e.lookForward(k, e.getForward(D, k+1))
   314  			if lookv > lookh {
   315  				e.setForward(D+1, k, lookv)
   316  			} else {
   317  				e.setForward(D+1, k, lookh)
   318  			}
   319  		}
   320  		// just did a forward pass, so check
   321  		if got, ok := e.twoDone(D+1, D); ok {
   322  			return e.twolcs(D+1, D, got)
   323  		}
   324  		// do a backward pass, D to D+1
   325  		e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1)
   326  		e.setBackward(D+1, D+1, e.getBackward(D, D))
   327  		for k := -D + 1; k <= D-1; k += 2 {
   328  			// these are tricky and easy to get wrong
   329  			lookv := e.lookBackward(k, e.getBackward(D, k-1))
   330  			lookh := e.lookBackward(k, e.getBackward(D, k+1)-1)
   331  			if lookv < lookh {
   332  				e.setBackward(D+1, k, lookv)
   333  			} else {
   334  				e.setBackward(D+1, k, lookh)
   335  			}
   336  		}
   337  	}
   338  
   339  	// D too large. combine a forward and backward partial lcs
   340  	// first, a forward one
   341  	kmax := -e.limit - 1
   342  	diagmax := -1
   343  	for k := -e.limit; k <= e.limit; k += 2 {
   344  		x := e.getForward(e.limit, k)
   345  		y := x - k
   346  		if x+y > diagmax && x <= e.ux && y <= e.uy {
   347  			diagmax, kmax = x+y, k
   348  		}
   349  	}
   350  	if kmax < -e.limit {
   351  		panic(fmt.Sprintf("no forward paths when limit=%d?", e.limit))
   352  	}
   353  	lcs := e.forwardlcs(e.limit, kmax)
   354  	// now a backward one
   355  	// find the D path with minimal x+y inside the rectangle and
   356  	// use that to compute the lcs
   357  	diagmin := 1 << 25 // infinity
   358  	for k := -e.limit; k <= e.limit; k += 2 {
   359  		x := e.getBackward(e.limit, k)
   360  		y := x - (k + e.delta)
   361  		if x+y < diagmin && x >= 0 && y >= 0 {
   362  			diagmin, kmax = x+y, k
   363  		}
   364  	}
   365  	if kmax < -e.limit {
   366  		panic(fmt.Sprintf("no backward paths when limit=%d?", e.limit))
   367  	}
   368  	lcs = append(lcs, e.backwardlcs(e.limit, kmax)...)
   369  	// These may overlap (e.forwardlcs and e.backwardlcs return sorted lcs)
   370  	ans := lcs.fix()
   371  	return ans
   372  }
   373  
// twoDone reports whether Myers' Lemma applies to the forward labels for
// df and the backward labels for db: that is, whether on some diagonal
// covered by both searches the backward x is at or before the forward x.
//
// On success it returns a forward diagonal number and true; otherwise it
// returns 0 and false.
func (e *editGraph) twoDone(df, db int) (int, bool) {
	if (df+db+e.delta)%2 != 0 {
		return 0, false // diagonals cannot overlap
	}
	// [kmin, kmax] is the intersection, in forward diagonal numbers, of
	// the forward range [-df, df] and the backward range
	// [-db+delta, db+delta].
	kmin := -db + e.delta
	if -df > kmin {
		kmin = -df
	}
	kmax := db + e.delta
	if df < kmax {
		kmax = df
	}
	for k := kmin; k <= kmax; k += 2 {
		x := e.vf.get(df, k)
		u := e.vb.get(db, k-e.delta) // same diagonal, backward numbering
		if u <= x {
			// is it worth looking at all the other k?
			// Scan this and the remaining diagonals for one where the
			// labels coincide or lie on an edge of the rectangle. These
			// are the same conditions twolcs tests for before it
			// resorts to computing another path.
			for l := k; l <= kmax; l += 2 {
				x := e.vf.get(df, l)
				y := x - l
				u := e.vb.get(db, l-e.delta)
				v := u - l
				if x == u || u == 0 || v == 0 || y == e.uy || x == e.ux {
					return l, true
				}
			}
			// No such diagonal: fall back to the first overlapping one.
			return k, true
		}
	}
	return 0, false
}
   406  
// twolcs assembles the lcs after twoDone has found that the forward
// labels for df and the backward labels for db overlap on forward
// diagonal kf.
//
// (x, y) is the forward point on that diagonal and (u, v) the backward
// point. Several special cases let the result be stitched together from
// already-computed labels; only the last case runs another forward search.
func (e *editGraph) twolcs(df, db, kf int) lcs {
	// db==df || db+1==df
	x := e.vf.get(df, kf)
	y := x - kf
	kb := kf - e.delta // the same diagonal in backward numbering
	u := e.vb.get(db, kb)
	v := u - kf

	// Myers proved there is a df-path from (0,0) to (u,v)
	// and a db-path from (x,y) to (N,M).
	// In the first case the overall path is the forward path
	// to (u,v) followed by the backward path to (N,M).
	// In the second case the path is the backward path to (x,y)
	// followed by the forward path to (x,y) from (0,0).

	// Look for some special cases to avoid computing either of these paths.
	if x == u {
		// "babaab" "cccaba"
		// already patched together
		lcs := e.forwardlcs(df, kf)
		lcs = append(lcs, e.backwardlcs(db, kb)...)
		return lcs.sort()
	}

	// is (u-1,v) or (u,v-1) labelled df-1?
	// if so, that forward df-1-path plus a horizontal or vertical edge
	// is the df-path to (u,v), then plus the db-path to (N,M)
	if u > 0 && ok(df-1, u-1-v) && e.vf.get(df-1, u-1-v) == u-1 {
		//  "aabbab" "cbcabc"
		lcs := e.forwardlcs(df-1, u-1-v)
		lcs = append(lcs, e.backwardlcs(db, kb)...)
		return lcs.sort()
	}
	if v > 0 && ok(df-1, (u-(v-1))) && e.vf.get(df-1, u-(v-1)) == u {
		//  "abaabb" "bcacab"
		lcs := e.forwardlcs(df-1, u-(v-1))
		lcs = append(lcs, e.backwardlcs(db, kb)...)
		return lcs.sort()
	}

	// The path can't possibly contribute to the lcs because it
	// is all horizontal or vertical edges
	if u == 0 || v == 0 || x == e.ux || y == e.uy {
		// "abaabb" "abaaaa"
		if u == 0 || v == 0 {
			// The backward point is on the lower or left edge, so the
			// backward path alone supplies the lcs.
			return e.backwardlcs(db, kb)
		}
		// Otherwise the forward point is on the upper or right edge, so
		// the forward path alone supplies the lcs.
		return e.forwardlcs(df, kf)
	}

	// is (x+1,y) or (x,y+1) labelled db-1?
	if x+1 <= e.ux && ok(db-1, x+1-y-e.delta) && e.vb.get(db-1, x+1-y-e.delta) == x+1 {
		// "bababb" "baaabb"
		lcs := e.backwardlcs(db-1, kb+1)
		lcs = append(lcs, e.forwardlcs(df, kf)...)
		return lcs.sort()
	}
	if y+1 <= e.uy && ok(db-1, x-(y+1)-e.delta) && e.vb.get(db-1, x-(y+1)-e.delta) == x {
		// "abbbaa" "cabacc"
		lcs := e.backwardlcs(db-1, kb-1)
		lcs = append(lcs, e.forwardlcs(df, kf)...)
		return lcs.sort()
	}

	// need to compute another path
	// "aabbaa" "aacaba"
	lcs := e.backwardlcs(db, kb)
	// Temporarily shrink the rectangle so that its upper corner is (u,v),
	// run a fresh forward search to that corner, then restore the bounds.
	// e.delta is not adjusted for the shrunken rectangle; the forward
	// functions in this file do not read it.
	oldx, oldy := e.ux, e.uy
	e.ux = u
	e.uy = v
	lcs = append(lcs, forward(e)...)
	e.ux, e.uy = oldx, oldy
	return lcs.sort()
}