github.com/code-to-go/safepool.lib@v0.0.0-20221205180519-ee25e63c226e/algo/hashsplit.go (about)

     1  package algo
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"hash"
     7  	"io"
     8  	"strconv"
     9  	"strings"
    10  
    11  	"github.com/code-to-go/safepool.lib/core"
    12  
    13  	"github.com/chmduquesne/rollinghash/buzhash32"
    14  	"golang.org/x/crypto/blake2b"
    15  )
    16  
// windowSize is the length, in bytes, of the zero-filled array that HashSplit
// writes into the rolling hash before it starts consuming input.
const windowSize = 32
    18  
// HashBlock describes one block produced by HashSplit.
type HashBlock struct {
	// Hash is the field the diff functions in this file compare to decide
	// whether two blocks are the same; it is meant to carry the block digest.
	Hash []byte
	// Length is the block size; HashSplit sets it to the number of bytes
	// buffered for the block.
	Length uint32
}
    23  
    24  func getHashBlock(hashFun hash.Hash, length uint32, bufs ...[]byte) HashBlock {
    25  	hashFun.Reset()
    26  	for _, buf := range bufs {
    27  		hashFun.Write(buf)
    28  	}
    29  	block := HashBlock{
    30  		Length: length,
    31  	}
    32  	copy(block.Hash[:], hashFun.Sum(nil))
    33  	return block
    34  }
    35  
    36  func HashSplit(r io.Reader, splitBits uint, hashFun hash.Hash) (blocks []HashBlock, err error) {
    37  	var zeroes [windowSize]byte
    38  
    39  	if hashFun == nil {
    40  		hashFun, err = blake2b.New256(nil)
    41  		if core.IsErr(err, "cannot create blake2b hash function: %v") {
    42  			return nil, err
    43  		}
    44  	}
    45  
    46  	h := buzhash32.New()
    47  	h.Write(zeroes[:])
    48  	mask := uint32(0xffffffff)
    49  	mask = mask >> uint32(32-splitBits)
    50  
    51  	buf := make([]byte, 0, mask*2)
    52  	inp := make([]byte, 1024)
    53  
    54  	for {
    55  		n, err := r.Read(inp)
    56  		if err == io.EOF {
    57  			if len(buf) > 0 {
    58  				blocks = append(blocks, getHashBlock(hashFun, uint32(len(buf)), buf))
    59  			}
    60  			break
    61  		} else if err != nil {
    62  			return nil, err
    63  		}
    64  
    65  		step := 1
    66  		for i := 0; i < n; i += step {
    67  			b := inp[i]
    68  			h.Roll(inp[i])
    69  			buf = append(buf, b)
    70  
    71  			sum32 := h.Sum32()
    72  			if sum32&mask == mask {
    73  				blocks = append(blocks, getHashBlock(hashFun, uint32(len(buf)), buf))
    74  				buf = buf[:0]
    75  			}
    76  		}
    77  	}
    78  
    79  	return blocks, err
    80  }
    81  
// EditOp names a kind of edit operation. Nothing in the visible part of this
// file refers to it.
type EditOp int

// Known EditOp values.
const (
	EditOpInsert EditOp = iota
	EditOpDelete
)
    88  
// Range is a span given by its start offset and its length. HashDiff derives
// both values from accumulated HashBlock lengths.
type Range struct {
	Start  uint32
	Length uint32
}
    93  
// Edit pairs two ranges. As filled in by HashDiff, Slice is a range of the
// source sequence and With is the range of the destination sequence that
// takes its place; a zero Length on either side marks a pure insertion or a
// pure deletion.
type Edit struct {
	Slice Range
	With  Range
}
    98  
    99  func (h *HashBlock) String() string {
   100  	return fmt.Sprintf("%d", h.Length)
   101  }
   102  
   103  //ab b
   104  
   105  func HashDiff2(source, dest []HashBlock) []Edit {
   106  	sLen := len(source)
   107  	dLen := len(dest)
   108  	column := make([]int, sLen+1)
   109  	actions := make([][]Edit, sLen)
   110  
   111  	var diffs []Edit
   112  	var sOffset, dOffset uint32
   113  
   114  	for y := 1; y <= sLen; y++ {
   115  		column[y] = y
   116  	}
   117  
   118  	for x := 1; x <= dLen; x++ {
   119  		column[0] = x
   120  		lastkey := x - 1
   121  		for y := 1; y <= sLen; y++ {
   122  			oldkey := column[y]
   123  			var incr int
   124  
   125  			if bytes.Compare(source[y-1].Hash[:], dest[x-1].Hash[:]) != 0 {
   126  				incr = 1
   127  			}
   128  
   129  			insert := column[y] + 1
   130  			delete := column[y-1] + 1
   131  			if insert <= delete && insert <= lastkey+incr {
   132  				column[y] = insert
   133  			} else if delete <= lastkey+incr {
   134  				column[y] = delete
   135  			} else {
   136  				column[y] = lastkey + incr
   137  				if incr > 0 {
   138  				}
   139  			}
   140  			lastkey = oldkey
   141  			sOffset += source[y-1].Length
   142  		}
   143  		dOffset += dest[x-1].Length
   144  	}
   145  	println(actions)
   146  	return diffs
   147  }
   148  
   149  func sameBlock(a, b HashBlock) bool {
   150  	return bytes.Equal(a.Hash[:], b.Hash[:])
   151  }
   152  
   153  func HashDiff(source, dest []HashBlock) []Edit {
   154  	var i, j int
   155  	sLen := len(source)
   156  	dLen := len(dest)
   157  	// ln := min(sLen, dLen)
   158  
   159  	// for i < ln && bytes.Compare(source[i].Hash[:], dest[i].Hash[:]) == 0 {
   160  	// 	i++
   161  	// }
   162  	// for j < ln && bytes.Compare(source[sLen-j-1].Hash[:], dest[dLen-j-1].Hash[:]) == 0 {
   163  	// 	j++
   164  	// }
   165  
   166  	var sOffset, dOffset uint32
   167  	var edits []Edit
   168  	for i < sLen && j < dLen {
   169  		s := source[i]
   170  		d := dest[j]
   171  		switch {
   172  		case sameBlock(s, d):
   173  			i++
   174  			j++
   175  			sOffset += s.Length
   176  			dOffset += d.Length
   177  		case i+1 < sLen && sameBlock(source[i+1], d):
   178  			//Delete operation
   179  			edits = append(edits, Edit{
   180  				Slice: Range{sOffset, s.Length},
   181  				With:  Range{dOffset, 0},
   182  			})
   183  			j++
   184  			sOffset += s.Length
   185  		case j+1 < dLen && sameBlock(s, dest[j+1]):
   186  			edits = append(edits, Edit{
   187  				Slice: Range{sOffset, 0},
   188  				With:  Range{dOffset, d.Length},
   189  			})
   190  			i++
   191  			dOffset += d.Length
   192  		default:
   193  			edits = append(edits, Edit{
   194  				Slice: Range{sOffset, s.Length},
   195  				With:  Range{dOffset, d.Length},
   196  			})
   197  			i++
   198  			j++
   199  			sOffset += s.Length
   200  			dOffset += d.Length
   201  		}
   202  	}
   203  
   204  	return edits
   205  
   206  }
   207  
// Step codes that levenshteinEditDistance stores in its trace matrix and that
// reconstructEdit switches on. traceMatch is zero, so trace cells that were
// never written read as traceMatch.
const (
	traceMatch = iota
	traceReplace
	traceInsert
	traceDelete
)
   214  
   215  func traceMatrixToString(edits [][]int) string {
   216  	b := strings.Builder{}
   217  
   218  	for i := 1; i < len(edits); i++ {
   219  		b.WriteRune('|')
   220  		for j := 1; j < len(edits[i]); j++ {
   221  			b.WriteString(strconv.Itoa(edits[i][j]))
   222  			b.WriteRune(' ')
   223  		}
   224  		b.WriteRune('|')
   225  		b.WriteRune('\n')
   226  	}
   227  	return b.String()
   228  }
   229  
   230  func levenshteinEditDistance(dest, source []HashBlock) []Edit {
   231  	sLen := len(source)
   232  	dLen := len(dest)
   233  	column := make([]int, sLen+1)
   234  	trace := make([][]int, dLen+1)
   235  
   236  	var sOffset, dOffset uint32
   237  
   238  	for y := 1; y <= sLen; y++ {
   239  		column[y] = y
   240  	}
   241  
   242  	for x := 1; x <= dLen; x++ {
   243  		trace[x] = make([]int, sLen+1)
   244  		column[0] = x
   245  		lastkey := x - 1
   246  		for y := 1; y <= sLen; y++ {
   247  			oldkey := column[y]
   248  			i := 0
   249  
   250  			tr := traceMatch
   251  			if bytes.Compare(source[y-1].Hash[:], dest[x-1].Hash[:]) != 0 {
   252  				i = 1
   253  				tr = traceReplace
   254  			}
   255  
   256  			cost := lastkey + i
   257  			tr = traceReplace
   258  			if column[y]+1 < cost {
   259  				cost = column[y] + 1
   260  				tr = traceInsert
   261  			}
   262  			if column[y-1]+1 < cost {
   263  				cost = column[y-1] + 1
   264  				tr = traceDelete
   265  			}
   266  
   267  			column[y] = cost
   268  			trace[x][y] = tr
   269  
   270  			lastkey = oldkey
   271  			sOffset += source[y-1].Length
   272  		}
   273  		dOffset += dest[x-1].Length
   274  	}
   275  	print(traceMatrixToString(trace))
   276  	return reconstructEdit(source, dest, trace)
   277  }
   278  
   279  func reconstructEdit(source, dest []HashBlock, trace [][]int) []Edit {
   280  	i := len(trace) - 1
   281  	j := len(trace[i]) - 1
   282  
   283  	var edits []Edit
   284  	for i > 0 && j > 0 {
   285  		switch trace[i][j] {
   286  		case traceMatch:
   287  			i, j = i-1, j-1
   288  			println("skip", i, j)
   289  		case traceInsert:
   290  			j -= 1
   291  			println("insert", i, j)
   292  			edits = append(edits, Edit{})
   293  		case traceDelete:
   294  			i -= 1
   295  			println("delete", i, j)
   296  			edits = append(edits, Edit{})
   297  
   298  		case traceReplace:
   299  			i, j = i-1, j-1
   300  			println("replace", i, j)
   301  			edits = append(edits, Edit{})
   302  		default:
   303  			i = 0
   304  		}
   305  	}
   306  	return edits
   307  }