github.com/code-to-go/safepool.lib@v0.0.0-20221205180519-ee25e63c226e/algo/hashsplit.go (about) 1 package algo 2 3 import ( 4 "bytes" 5 "fmt" 6 "hash" 7 "io" 8 "strconv" 9 "strings" 10 11 "github.com/code-to-go/safepool.lib/core" 12 13 "github.com/chmduquesne/rollinghash/buzhash32" 14 "golang.org/x/crypto/blake2b" 15 ) 16 17 const windowSize = 32 18 19 type HashBlock struct { 20 Hash []byte 21 Length uint32 22 } 23 24 func getHashBlock(hashFun hash.Hash, length uint32, bufs ...[]byte) HashBlock { 25 hashFun.Reset() 26 for _, buf := range bufs { 27 hashFun.Write(buf) 28 } 29 block := HashBlock{ 30 Length: length, 31 } 32 copy(block.Hash[:], hashFun.Sum(nil)) 33 return block 34 } 35 36 func HashSplit(r io.Reader, splitBits uint, hashFun hash.Hash) (blocks []HashBlock, err error) { 37 var zeroes [windowSize]byte 38 39 if hashFun == nil { 40 hashFun, err = blake2b.New256(nil) 41 if core.IsErr(err, "cannot create blake2b hash function: %v") { 42 return nil, err 43 } 44 } 45 46 h := buzhash32.New() 47 h.Write(zeroes[:]) 48 mask := uint32(0xffffffff) 49 mask = mask >> uint32(32-splitBits) 50 51 buf := make([]byte, 0, mask*2) 52 inp := make([]byte, 1024) 53 54 for { 55 n, err := r.Read(inp) 56 if err == io.EOF { 57 if len(buf) > 0 { 58 blocks = append(blocks, getHashBlock(hashFun, uint32(len(buf)), buf)) 59 } 60 break 61 } else if err != nil { 62 return nil, err 63 } 64 65 step := 1 66 for i := 0; i < n; i += step { 67 b := inp[i] 68 h.Roll(inp[i]) 69 buf = append(buf, b) 70 71 sum32 := h.Sum32() 72 if sum32&mask == mask { 73 blocks = append(blocks, getHashBlock(hashFun, uint32(len(buf)), buf)) 74 buf = buf[:0] 75 } 76 } 77 } 78 79 return blocks, err 80 } 81 82 type EditOp int 83 84 const ( 85 EditOpInsert EditOp = iota 86 EditOpDelete 87 ) 88 89 type Range struct { 90 Start uint32 91 Length uint32 92 } 93 94 type Edit struct { 95 Slice Range 96 With Range 97 } 98 99 func (h *HashBlock) String() string { 100 return fmt.Sprintf("%d", h.Length) 101 } 102 103 //ab b 104 105 func HashDiff2(source, dest []HashBlock) []Edit { 106 sLen := len(source) 107 dLen := len(dest) 108 column := make([]int, sLen+1) 109 actions := make([][]Edit, sLen) 110 111 var diffs []Edit 112 var sOffset, dOffset uint32 113 114 for y := 1; y <= sLen; y++ { 115 column[y] = y 116 } 117 118 for x := 1; x <= dLen; x++ { 119 column[0] = x 120 lastkey := x - 1 121 for y := 1; y <= sLen; y++ { 122 oldkey := column[y] 123 var incr int 124 125 if bytes.Compare(source[y-1].Hash[:], dest[x-1].Hash[:]) != 0 { 126 incr = 1 127 } 128 129 insert := column[y] + 1 130 delete := column[y-1] + 1 131 if insert <= delete && insert <= lastkey+incr { 132 column[y] = insert 133 } else if delete <= lastkey+incr { 134 column[y] = delete 135 } else { 136 column[y] = lastkey + incr 137 if incr > 0 { 138 } 139 } 140 lastkey = oldkey 141 sOffset += source[y-1].Length 142 } 143 dOffset += dest[x-1].Length 144 } 145 println(actions) 146 return diffs 147 } 148 149 func sameBlock(a, b HashBlock) bool { 150 return bytes.Equal(a.Hash[:], b.Hash[:]) 151 } 152 153 func HashDiff(source, dest []HashBlock) []Edit { 154 var i, j int 155 sLen := len(source) 156 dLen := len(dest) 157 // ln := min(sLen, dLen) 158 159 // for i < ln && bytes.Compare(source[i].Hash[:], dest[i].Hash[:]) == 0 { 160 // i++ 161 // } 162 // for j < ln && bytes.Compare(source[sLen-j-1].Hash[:], dest[dLen-j-1].Hash[:]) == 0 { 163 // j++ 164 // } 165 166 var sOffset, dOffset uint32 167 var edits []Edit 168 for i < sLen && j < dLen { 169 s := source[i] 170 d := dest[j] 171 switch { 172 case sameBlock(s, d): 173 i++ 174 j++ 175 sOffset += s.Length 176 dOffset += d.Length 177 case i+1 < sLen && sameBlock(source[i+1], d): 178 //Delete operation 179 edits = append(edits, Edit{ 180 Slice: Range{sOffset, s.Length}, 181 With: Range{dOffset, 0}, 182 }) 183 j++ 184 sOffset += s.Length 185 case j+1 < dLen && sameBlock(s, dest[j+1]): 186 edits = append(edits, Edit{ 187 Slice: Range{sOffset, 0}, 188 With: Range{dOffset, d.Length}, 189 }) 190 i++ 191 dOffset += d.Length 192 default: 193 edits = append(edits, Edit{ 194 Slice: Range{sOffset, s.Length}, 195 With: Range{dOffset, d.Length}, 196 }) 197 i++ 198 j++ 199 sOffset += s.Length 200 dOffset += d.Length 201 } 202 } 203 204 return edits 205 206 } 207 208 const ( 209 traceMatch = iota 210 traceReplace 211 traceInsert 212 traceDelete 213 ) 214 215 func traceMatrixToString(edits [][]int) string { 216 b := strings.Builder{} 217 218 for i := 1; i < len(edits); i++ { 219 b.WriteRune('|') 220 for j := 1; j < len(edits[i]); j++ { 221 b.WriteString(strconv.Itoa(edits[i][j])) 222 b.WriteRune(' ') 223 } 224 b.WriteRune('|') 225 b.WriteRune('\n') 226 } 227 return b.String() 228 } 229 230 func levenshteinEditDistance(dest, source []HashBlock) []Edit { 231 sLen := len(source) 232 dLen := len(dest) 233 column := make([]int, sLen+1) 234 trace := make([][]int, dLen+1) 235 236 var sOffset, dOffset uint32 237 238 for y := 1; y <= sLen; y++ { 239 column[y] = y 240 } 241 242 for x := 1; x <= dLen; x++ { 243 trace[x] = make([]int, sLen+1) 244 column[0] = x 245 lastkey := x - 1 246 for y := 1; y <= sLen; y++ { 247 oldkey := column[y] 248 i := 0 249 250 tr := traceMatch 251 if bytes.Compare(source[y-1].Hash[:], dest[x-1].Hash[:]) != 0 { 252 i = 1 253 tr = traceReplace 254 } 255 256 cost := lastkey + i 257 tr = traceReplace 258 if column[y]+1 < cost { 259 cost = column[y] + 1 260 tr = traceInsert 261 } 262 if column[y-1]+1 < cost { 263 cost = column[y-1] + 1 264 tr = traceDelete 265 } 266 267 column[y] = cost 268 trace[x][y] = tr 269 270 lastkey = oldkey 271 sOffset += source[y-1].Length 272 } 273 dOffset += dest[x-1].Length 274 } 275 print(traceMatrixToString(trace)) 276 return reconstructEdit(source, dest, trace) 277 } 278 279 func reconstructEdit(source, dest []HashBlock, trace [][]int) []Edit { 280 i := len(trace) - 1 281 j := len(trace[i]) - 1 282 283 var edits []Edit 284 for i > 0 && j > 0 { 285 switch trace[i][j] { 286 case traceMatch: 287 i, j = i-1, j-1 288 println("skip", i, j) 289 case traceInsert: 290 j -= 1 291 println("insert", i, j) 292 edits = append(edits, Edit{}) 293 case traceDelete: 294 i -= 1 295 println("delete", i, j) 296 edits = append(edits, Edit{}) 297 298 case traceReplace: 299 i, j = i-1, j-1 300 println("replace", i, j) 301 edits = append(edits, Edit{}) 302 default: 303 i = 0 304 } 305 } 306 return edits 307 }