src.elv.sh@v0.21.0-dev.0.20240515223629-06979efb9a2a/pkg/diff/diff.go (about)

     1  // Copyright 2022 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package diff
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"sort"
    11  	"strings"
    12  )
    13  
// A pair is a pair of values tracked for both the x and y side of a diff.
// It is typically a pair of line indexes into the old (x) and new (y) texts.
type pair struct{ x, y int }
    17  
    18  // Diff returns an anchored diff of the two texts old and new
    19  // in the “unified diff” format. If old and new are identical,
    20  // Diff returns a nil slice (no output).
    21  //
    22  // Unix diff implementations typically look for a diff with
    23  // the smallest number of lines inserted and removed,
    24  // which can in the worst case take time quadratic in the
    25  // number of lines in the texts. As a result, many implementations
    26  // either can be made to run for a long time or cut off the search
    27  // after a predetermined amount of work.
    28  //
    29  // In contrast, this implementation looks for a diff with the
    30  // smallest number of “unique” lines inserted and removed,
    31  // where unique means a line that appears just once in both old and new.
    32  // We call this an “anchored diff” because the unique lines anchor
    33  // the chosen matching regions. An anchored diff is usually clearer
    34  // than a standard diff, because the algorithm does not try to
    35  // reuse unrelated blank lines or closing braces.
    36  // The algorithm also guarantees to run in O(n log n) time
    37  // instead of the standard O(n²) time.
    38  //
    39  // Some systems call this approach a “patience diff,” named for
    40  // the “patience sorting” algorithm, itself named for a solitaire card game.
    41  // We avoid that name for two reasons. First, the name has been used
    42  // for a few different variants of the algorithm, so it is imprecise.
    43  // Second, the name is frequently interpreted as meaning that you have
    44  // to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
    45  // when in fact the algorithm is faster than the standard one.
    46  func Diff(oldName, old, newName, new string) []byte {
    47  	if old == new {
    48  		return nil
    49  	}
    50  	// Print diff header.
    51  	var out bytes.Buffer
    52  	fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
    53  	fmt.Fprintf(&out, "--- %s\n", oldName)
    54  	fmt.Fprintf(&out, "+++ %s\n", newName)
    55  	out.Write(DiffNoHeader(old, new))
    56  	return out.Bytes()
    57  }
    58  
// DiffNoHeader returns the body of an anchored unified diff of old and new:
// the "@@ -l,n +l,n @@" chunks only, without the "diff/---/+++" header lines
// that Diff prepends. If old and new are identical it produces no output.
func DiffNoHeader(old, new string) []byte {
	x := lines(old)
	y := lines(new)

	var out bytes.Buffer

	// Loop over matches to consider,
	// expanding each match to include surrounding lines,
	// and then printing diff chunks.
	// To avoid setup/teardown cases outside the loop,
	// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
	// in the sequence of matches.
	var (
		done  pair     // printed up to x[:done.x] and y[:done.y]
		chunk pair     // start lines of current chunk
		count pair     // number of lines from each side in current chunk
		ctext []string // lines for current chunk
	)
	for _, m := range tgs(x, y) {
		if m.x < done.x {
			// Already handled scanning forward from earlier match.
			continue
		}

		// Expand matching lines as far as possible,
		// establishing that x[start.x:end.x] == y[start.y:end.y].
		// Note that on the first (or last) iteration we may (or definitely do)
		// have an empty match: start.x==end.x and start.y==end.y.
		start := m
		for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
			start.x--
			start.y--
		}
		end := m
		for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
			end.x++
			end.y++
		}

		// Emit the mismatched lines before start into this chunk.
		// (No effect on first sentinel iteration, when start = {0,0}.)
		for _, s := range x[done.x:start.x] {
			ctext = append(ctext, "-"+s)
			count.x++
		}
		for _, s := range y[done.y:start.y] {
			ctext = append(ctext, "+"+s)
			count.y++
		}

		// If we're not at EOF and have too few common lines,
		// the chunk includes all the common lines and continues.
		// (Fewer than C lines means this match cannot supply both the
		// trailing context of the previous edit and the leading context
		// of the next; 2*C is the threshold once a chunk is open.)
		const C = 3 // number of context lines
		if (end.x < len(x) || end.y < len(y)) &&
			(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
			for _, s := range x[start.x:end.x] {
				ctext = append(ctext, " "+s)
				count.x++
				count.y++
			}
			done = end
			continue
		}

		// End chunk with common lines for context.
		if len(ctext) > 0 {
			// At most C common lines of trailing context.
			n := end.x - start.x
			if n > C {
				n = C
			}
			for _, s := range x[start.x : start.x+n] {
				ctext = append(ctext, " "+s)
				count.x++
				count.y++
			}
			done = pair{start.x + n, start.y + n}

			// Format and emit chunk.
			// Convert line numbers to 1-indexed.
			// Special case: empty file shows up as 0,0 not 1,0.
			if count.x > 0 {
				chunk.x++
			}
			if count.y > 0 {
				chunk.y++
			}
			fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
			for _, s := range ctext {
				out.WriteString(s)
			}
			// Reset chunk state for the next chunk.
			count.x = 0
			count.y = 0
			ctext = ctext[:0]
		}

		// If we reached EOF, we're done.
		if end.x >= len(x) && end.y >= len(y) {
			break
		}

		// Otherwise start a new chunk with C leading context lines
		// taken from the end of the current matching region.
		chunk = pair{end.x - C, end.y - C}
		for _, s := range x[chunk.x:end.x] {
			ctext = append(ctext, " "+s)
			count.x++
			count.y++
		}
		done = end
	}

	return out.Bytes()
}
   171  
   172  // lines returns the lines in the file x, including newlines.
   173  // If the file does not end in a newline, one is supplied
   174  // along with a warning about the missing newline.
   175  func lines(x string) []string {
   176  	l := strings.SplitAfter(x, "\n")
   177  	if l[len(l)-1] == "" {
   178  		l = l[:len(l)-1]
   179  	} else {
   180  		// Treat last line as having a message about the missing newline attached,
   181  		// using the same text as BSD/GNU diff (including the leading backslash).
   182  		l[len(l)-1] += "\n\\ No newline at end of file\n"
   183  	}
   184  	return l
   185  }
   186  
// tgs returns the pairs of indexes of the longest common subsequence
// of unique lines in x and y, where a unique line is one that appears
// once in x and once in y.
//
// The longest common subsequence algorithm is as described in
// Thomas G. Szymanski, “A Special Case of the Maximal Common
// Subsequence Problem,” Princeton TR #170 (January 1975),
// available at https://research.swtch.com/tgs170.pdf.
//
// The returned sequence always begins with the sentinel {0, 0} and ends
// with the sentinel {len(x), len(y)}, so callers can process matches
// without special setup/teardown cases.
func tgs(x, y []string) []pair {
	// Count the number of times each string appears in a and b.
	// We only care about 0, 1, many, counted as 0, -1, -2
	// for the x side and 0, -4, -8 for the y side.
	// Using negative numbers now lets us distinguish positive line numbers later.
	m := make(map[string]int)
	for _, s := range x {
		if c := m[s]; c > -2 {
			m[s] = c - 1
		}
	}
	for _, s := range y {
		if c := m[s]; c > -8 {
			m[s] = c - 4
		}
	}

	// Now unique strings can be identified by m[s] = -1+-4.
	//
	// Gather the indexes of those strings in x and y, building:
	//	xi[i] = increasing indexes of unique strings in x.
	//	yi[i] = increasing indexes of unique strings in y.
	//	inv[i] = index j such that x[xi[i]] = y[yi[j]].
	var xi, yi, inv []int
	for i, s := range y {
		if m[s] == -1+-4 {
			// Repurpose the map entry: now a non-negative index into yi.
			m[s] = len(yi)
			yi = append(yi, i)
		}
	}
	for i, s := range x {
		if j, ok := m[s]; ok && j >= 0 {
			xi = append(xi, i)
			inv = append(inv, j)
		}
	}

	// Apply Algorithm A from Szymanski's paper.
	// In those terms, A = J = inv and B = [0, n).
	// We add sentinel pairs {0,0}, and {len(x),len(y)}
	// to the returned sequence, to help the processing loop.
	J := inv
	n := len(xi)
	// T[k] = smallest J value ending an increasing subsequence of
	// length k+1 seen so far (n+1 acts as infinity).
	// L[i] = length of the longest increasing subsequence of J ending at i.
	T := make([]int, n)
	L := make([]int, n)
	for i := range T {
		T[i] = n + 1
	}
	for i := 0; i < n; i++ {
		// Binary search keeps the overall running time O(n log n).
		k := sort.Search(n, func(k int) bool {
			return T[k] >= J[i]
		})
		T[k] = J[i]
		L[i] = k + 1
	}
	// k = length of the longest increasing subsequence overall.
	k := 0
	for _, v := range L {
		if k < v {
			k = v
		}
	}
	// Recover one longest subsequence by walking backward, picking for
	// each length the latest i achieving it.
	seq := make([]pair, 2+k)
	seq[1+k] = pair{len(x), len(y)} // sentinel at end
	lastj := n
	for i := n - 1; i >= 0; i-- {
		if L[i] == k && J[i] < lastj {
			seq[k] = pair{xi[i], yi[J[i]]}
			k--
		}
	}
	seq[0] = pair{0, 0} // sentinel at start
	return seq
}