github.com/mutagen-io/mutagen@v0.18.0-rc1/pkg/synchronization/rsync/engine.go

package rsync

import (
	"bufio"
	"bytes"
	"crypto/sha1"
	"errors"
	"fmt"
	"hash"
	"io"
	"math"

	"google.golang.org/protobuf/proto"
)

// EnsureValid verifies that block hash invariants are respected.
func (h *BlockHash) EnsureValid() error {
	// A nil block hash is not valid.
	if h == nil {
		return errors.New("nil block hash")
	}

	// Ensure that the strong signature is valid.
	if len(h.Strong) == 0 {
		return errors.New("empty strong signature")
	}

	// Success.
	return nil
}

// EnsureValid verifies that signature invariants are respected.
func (s *Signature) EnsureValid() error {
	// A nil signature is not valid.
	if s == nil {
		return errors.New("nil signature")
	}

	// Ensure that all block hashes are valid.
	for _, h := range s.Hashes {
		if err := h.EnsureValid(); err != nil {
			return fmt.Errorf("invalid block hash: %w", err)
		}
	}

	// If the block size is 0, then the last block size should also be 0 and
	// there shouldn't be any hashes.
	if s.BlockSize == 0 {
		if s.LastBlockSize != 0 {
			return errors.New("block size of 0 with non-0 last block size")
		} else if len(s.Hashes) != 0 {
			return errors.New("block size of 0 with non-0 number of hashes")
		}
		return nil
	}

	// If the block size is non-0, then the last block size should be non-0
	// but less than or equal to the block size.
	if s.LastBlockSize == 0 {
		return errors.New("non-0 block size with last block size of 0")
	} else if s.LastBlockSize > s.BlockSize {
		return errors.New("last block size greater than block size")
	}

	// If the block size is non-0, then a non-zero number of blocks should
	// have been hashed.
	if len(s.Hashes) == 0 {
		return errors.New("non-0 block size with no block hashes")
	}

	// Success.
	return nil
}

// isEmpty returns true if the signature represents an empty file.
func (s *Signature) isEmpty() bool {
	// In theory, we might also want to test that LastBlockSize == 0 and that
	// there aren't any hashes, but so long as the invariants of Signature are
	// maintained, this check is sufficient.
	return s.BlockSize == 0
}

// EnsureValid verifies that operation invariants are respected.
func (o *Operation) EnsureValid() error {
	// A nil operation is not valid.
	if o == nil {
		return errors.New("nil operation")
	}

	// Ensure that the operation parameters are valid.
	if len(o.Data) > 0 {
		if o.Start != 0 {
			return errors.New("data operation with non-0 block start index")
		} else if o.Count != 0 {
			return errors.New("data operation with non-0 block count")
		}
	} else if o.Count == 0 {
		return errors.New("block operation with 0 block count")
	}

	// Success.
	return nil
}

// resetToZeroMaintainingCapacity resets an Operation to its zero-value, but
// leaves capacity in the data slice. It's worth noting that the zero-value
// state is not a valid state for an Operation.
func (o *Operation) resetToZeroMaintainingCapacity() {
	// Reset the data slice, but maintain its capacity.
	o.Data = o.Data[:0]

	// Reset start and count.
	o.Start = 0
	o.Count = 0
}
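// operationValiditySketch is an illustrative sketch (the values are arbitrary
// and the function is not part of the engine's public interface) of the
// operation invariants enforced by EnsureValid above: a valid operation either
// carries raw data (with a zero start and count) or references a non-zero run
// of blocks, and the zero value is neither.
func operationValiditySketch() bool {
	dataOperation := &Operation{Data: []byte("raw target bytes")}
	blockOperation := &Operation{Start: 4, Count: 2}
	zeroOperation := &Operation{}
	return dataOperation.EnsureValid() == nil &&
		blockOperation.EnsureValid() == nil &&
		zeroOperation.EnsureValid() != nil
}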
// isZeroValue indicates whether or not an Operation has its zero-value. It's
// worth noting that the zero-value state is not a valid state for an Operation.
func (o *Operation) isZeroValue() bool {
	return len(o.Data) == 0 && o.Start == 0 && o.Count == 0
}

const (
	// minimumOptimalBlockSize is the minimum block size that will be returned
	// by OptimalBlockSizeForBaseLength. It has to be chosen so that it is at
	// least a few orders of magnitude larger than the size of a BlockHash.
	minimumOptimalBlockSize = 1 << 10
	// maximumOptimalBlockSize is the maximum block size that will be returned
	// by OptimalBlockSizeForBaseLength. It mostly just needs to be bounded by
	// what can fit into a reasonably sized in-memory buffer, particularly if
	// multiple rsync engines are running. maximumOptimalBlockSize also needs
	// to be less than or equal to (2^32)-1 for the weak hash algorithm to
	// work.
	maximumOptimalBlockSize = 1 << 16
	// DefaultBlockSize is the default block size that will be used if a zero
	// value is passed into Engine.Signature for the blockSize parameter.
	DefaultBlockSize = 1 << 13
	// DefaultMaximumDataOperationSize is the default maximum data size
	// permitted per operation. The optimal value for this isn't at all
	// correlated with block size - it's just what's reasonable to hold
	// in-memory and pass over the wire in a single transmission. This value
	// will be used if a zero value is passed into Engine.Deltify or
	// Engine.DeltifyBytes for the maxDataOpSize parameter.
	DefaultMaximumDataOperationSize = 1 << 16
)

// OptimalBlockSizeForBaseLength uses a simple heuristic to choose a block size
// based on the base length. It starts by choosing the optimal block length
// using the formula given in the rsync thesis. It then enforces that the block
// size is within a sensible range.
// TODO: Should we add rounding to "nice" values, e.g. the nearest multiple of
// 1024 bytes? Would this improve read throughput?
func OptimalBlockSizeForBaseLength(baseLength uint64) uint64 {
	// Compute the optimal block length (see the rsync thesis) assuming one
	// change per file.
	result := uint64(math.Sqrt(24.0 * float64(baseLength)))

	// Ensure it's within the allowed range.
	if result < minimumOptimalBlockSize {
		result = minimumOptimalBlockSize
	} else if result > maximumOptimalBlockSize {
		result = maximumOptimalBlockSize
	}

	// Done.
	return result
}

// OptimalBlockSizeForBase is a convenience function that will determine the
// optimal block size for a base that implements io.Seeker. It calls down to
// OptimalBlockSizeForBaseLength. After determining the base's length, it will
// attempt to reset the base to its original position.
func OptimalBlockSizeForBase(base io.Seeker) (uint64, error) {
	if currentOffset, err := base.Seek(0, io.SeekCurrent); err != nil {
		return 0, fmt.Errorf("unable to determine current base offset: %w", err)
	} else if currentOffset < 0 {
		return 0, errors.New("seek returned negative starting location")
	} else if length, err := base.Seek(0, io.SeekEnd); err != nil {
		return 0, fmt.Errorf("unable to compute base length: %w", err)
	} else if length < 0 {
		return 0, errors.New("seek returned negative offset")
	} else if _, err = base.Seek(currentOffset, io.SeekStart); err != nil {
		return 0, fmt.Errorf("unable to reset base: %w", err)
	} else {
		return OptimalBlockSizeForBaseLength(uint64(length)), nil
	}
}
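// optimalBlockSizeExample is a small illustrative sketch (the base lengths
// used here are arbitrary and the function is not part of the engine's public
// interface) showing how the sqrt(24*baseLength) heuristic above behaves once
// clamped to the allowed range.
func optimalBlockSizeExample() {
	// A 1 MiB base yields sqrt(24*2^20) ≈ 5016, which lies inside the allowed
	// range and is returned unmodified.
	fmt.Println(OptimalBlockSizeForBaseLength(1 << 20))

	// A 100-byte base would yield a block size of roughly 49 bytes, so the
	// result is clamped up to minimumOptimalBlockSize (1024).
	fmt.Println(OptimalBlockSizeForBaseLength(100))

	// A 1 TiB base would yield a block size of roughly 5 MB, so the result is
	// clamped down to maximumOptimalBlockSize (65536).
	fmt.Println(OptimalBlockSizeForBaseLength(1 << 40))
}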
// OperationTransmitter transmits an operation. Operation objects and their
// data buffers are re-used between calls to the transmitter, so the
// transmitter should not return until it has either transmitted the operation
// or copied it for later transmission.
type OperationTransmitter func(*Operation) error
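// wireTransmitterSketch is an illustrative sketch (the encoding choice here is
// arbitrary, not an established Mutagen wire format) of an OperationTransmitter
// that satisfies the contract above: each operation is fully serialized and
// written before the transmitter returns, so the engine is free to re-use the
// operation and its data buffer on the next call.
func wireTransmitterSketch(writer io.Writer) OperationTransmitter {
	return func(operation *Operation) error {
		// Serialize the operation. This copies its contents, so nothing from
		// the engine's internal operation object is retained.
		encoded, err := proto.Marshal(operation)
		if err != nil {
			return fmt.Errorf("unable to encode operation: %w", err)
		}

		// Write the encoded operation. A real transport would add framing
		// (e.g. a length prefix) so that the receiver can split operations.
		if _, err := writer.Write(encoded); err != nil {
			return fmt.Errorf("unable to write operation: %w", err)
		}

		// Success.
		return nil
	}
}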
// Engine provides rsync functionality without any notion of transport. It is
// designed to be re-used to avoid heavy buffer allocation.
type Engine struct {
	// buffer is a re-usable buffer that will be used for reading data and
	// setting up operations.
	buffer []byte
	// strongHasher is the strong hash function to use for the engine.
	strongHasher hash.Hash
	// strongHashBuffer is a re-usable buffer that can be used by methods to
	// receive digests.
	strongHashBuffer []byte
	// targetReader is a re-usable bufio.Reader that will be used for delta
	// creation operations.
	targetReader *bufio.Reader
	// operation is a re-usable operation object used for transmissions to
	// avoid allocations.
	operation *Operation
}

// NewEngine creates a new rsync engine.
func NewEngine() *Engine {
	// Create the strong hash function.
	// TODO: We might want to allow users to specify other strong hash
	// functions for the engine to use (e.g. BLAKE2 functions), but for now we
	// just use SHA-1 since it's a good balance of speed and robustness for
	// rsync purposes.
	strongHasher := sha1.New()

	// Create the engine.
	return &Engine{
		strongHasher:     strongHasher,
		strongHashBuffer: make([]byte, strongHasher.Size()),
		targetReader:     bufio.NewReader(nil),
		operation:        &Operation{},
	}
}

// bufferWithSize lazily allocates the engine's internal buffer, ensuring that
// it is the required size. The capacity of the internal buffer is retained
// between calls to avoid allocations if possible.
func (e *Engine) bufferWithSize(size uint64) []byte {
	// Check if the buffer currently has the required capacity. If it does,
	// then use that space. Note that we're checking *capacity* - you're
	// allowed to slice a buffer up to its capacity, not just its length. Of
	// course, if you don't own the buffer, you could run into problems with
	// accessing data outside the slice that was given to you, but this buffer
	// is completely internal, so that's not a concern.
	if uint64(cap(e.buffer)) >= size {
		return e.buffer[:size]
	}

	// If we couldn't use our existing buffer, create a new one, but store it
	// for later re-use.
	e.buffer = make([]byte, size)
	return e.buffer
}

const (
	// m is the weak hash modulus. I think they now recommend that it be the
	// largest prime less than 2^16, but this value is fine as well.
	m = 1 << 16
)

// weakHash computes a fast checksum that can be rolled (updated without full
// recomputation). This particular hash is detailed on page 55 of the rsync
// thesis. It is not theoretically optimal, but it's fine for our purposes.
func (e *Engine) weakHash(data []byte, blockSize uint64) (uint32, uint32, uint32) {
	// Compute hash components.
	var r1, r2 uint32
	for i, b := range data {
		r1 += uint32(b)
		r2 += (uint32(blockSize) - uint32(i)) * uint32(b)
	}
	r1 = r1 % m
	r2 = r2 % m

	// Compute the hash.
	result := r1 + m*r2

	// Done.
	return result, r1, r2
}

// rollWeakHash updates the checksum computed by weakHash by adding and
// removing a byte.
func (e *Engine) rollWeakHash(r1, r2 uint32, out, in byte, blockSize uint64) (uint32, uint32, uint32) {
	// Update components.
	r1 = (r1 - uint32(out) + uint32(in)) % m
	r2 = (r2 - uint32(blockSize)*uint32(out) + r1) % m

	// Compute the hash.
	result := r1 + m*r2

	// Done.
	return result, r1, r2
}

// strongHash computes a slow but strong hash for a block of data. If allocate
// is true, then a new byte slice will be allocated to receive the digest,
// otherwise the engine's internal digest buffer will be used, but then the
// digest will only be valid until the next call to strongHash.
func (e *Engine) strongHash(data []byte, allocate bool) []byte {
	// Reset the hasher.
	e.strongHasher.Reset()

	// Digest the data. The Hash interface guarantees that writes succeed.
	e.strongHasher.Write(data)

	// Compute the output location.
	var output []byte
	if !allocate {
		output = e.strongHashBuffer[:0]
	}

	// Compute the digest.
	return e.strongHasher.Sum(output)
}
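// weakHashRollingSketch is a minimal illustrative sketch (the sample data and
// block size are arbitrary) demonstrating the rolling property that Deltify
// relies on: updating the checksum with rollWeakHash as the window advances by
// one byte matches recomputing weakHash over the shifted window from scratch.
func weakHashRollingSketch(e *Engine) bool {
	data := []byte("the quick brown fox jumps over the lazy dog")
	blockSize := uint64(8)

	// Hash the initial window data[0:8] from scratch.
	_, r1, r2 := e.weakHash(data[:blockSize], blockSize)

	// Roll the window forward by one byte (drop data[0], add data[8]) and
	// compare against a full recomputation over data[1:9].
	rolled, _, _ := e.rollWeakHash(r1, r2, data[0], data[blockSize], blockSize)
	recomputed, _, _ := e.weakHash(data[1:blockSize+1], blockSize)
	return rolled == recomputed
}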
// Signature computes the signature for a base stream. If the provided block
// size is 0, this method will attempt to compute the optimal block size (which
// requires that base implement io.Seeker), and failing that will fall back to
// a default block size.
func (e *Engine) Signature(base io.Reader, blockSize uint64) (*Signature, error) {
	// Choose a block size if none is specified. If the base also implements
	// io.Seeker (which most will since they need to for Patch), then use the
	// optimal block size, otherwise use the default.
	if blockSize == 0 {
		if baseSeeker, ok := base.(io.Seeker); ok {
			if s, err := OptimalBlockSizeForBase(baseSeeker); err == nil {
				blockSize = s
			} else {
				blockSize = DefaultBlockSize
			}
		} else {
			blockSize = DefaultBlockSize
		}
	}

	// Create the result.
	result := &Signature{
		BlockSize: blockSize,
	}

	// Create a buffer with which to read blocks.
	buffer := e.bufferWithSize(blockSize)

	// Read blocks and append their hashes until we reach EOF.
	eof := false
	for !eof {
		// Read the next block and watch for errors. If we receive io.EOF,
		// then nothing was read, and we should break immediately. This means
		// that the base had a length that was a multiple of the block size.
		// If we receive io.ErrUnexpectedEOF, then something was read but
		// we're still at the end of the file, so we should hash this block
		// but not go through the loop again. All other errors are terminal.
		n, err := io.ReadFull(base, buffer)
		if err == io.EOF {
			result.LastBlockSize = blockSize
			break
		} else if err == io.ErrUnexpectedEOF {
			result.LastBlockSize = uint64(n)
			eof = true
		} else if err != nil {
			return nil, fmt.Errorf("unable to read data block: %w", err)
		}

		// Compute hashes for the block that was read. For short blocks, we
		// still use the full block size when computing the weak hash. We
		// could alternatively use the short block length, but it doesn't
		// matter - all that matters is that we keep consistency when we
		// compute the short block weak hash when searching in Deltify.
		weak, _, _ := e.weakHash(buffer[:n], blockSize)
		strong := e.strongHash(buffer[:n], true)

		// Add the block hash.
		result.Hashes = append(result.Hashes, &BlockHash{
			Weak:   weak,
			Strong: strong,
		})
	}

	// If there are no hashes, then clear out the block sizes.
	if len(result.Hashes) == 0 {
		result.BlockSize = 0
		result.LastBlockSize = 0
	}

	// Success.
	return result, nil
}

// BytesSignature computes the signature for a byte slice.
func (e *Engine) BytesSignature(base []byte, blockSize uint64) *Signature {
	// Perform the signature and watch for errors (which shouldn't be able to
	// occur in-memory).
	result, err := e.Signature(bytes.NewReader(base), blockSize)
	if err != nil {
		panic(fmt.Errorf("in-memory signature failure: %w", err))
	}

	// Success.
	return result
}
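// signatureShapeSketch is an illustrative sketch (the sizes are arbitrary) of
// the relationship between base length, block count, and the last block size
// recorded in a signature: a 20000-byte base hashed with the default 8192-byte
// block size yields three block hashes, with the last one covering the final
// 3616 bytes.
func signatureShapeSketch() bool {
	engine := NewEngine()
	signature := engine.BytesSignature(make([]byte, 20000), DefaultBlockSize)
	return len(signature.Hashes) == 3 &&
		signature.BlockSize == DefaultBlockSize &&
		signature.LastBlockSize == 20000-2*DefaultBlockSize
}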
// dualModeReader unifies the io.Reader and io.ByteReader interfaces. It is
// used in deltify operations to ensure that bytes can be efficiently extracted
// from targets.
type dualModeReader interface {
	io.Reader
	io.ByteReader
}

// min implements simple minimum finding for uint64 values.
func min(a, b uint64) uint64 {
	if a < b {
		return a
	}
	return b
}

// transmitData transmits a data operation using the engine's internal
// operation object.
func (e *Engine) transmitData(data []byte, transmit OperationTransmitter) error {
	// Set the operation parameters.
	*e.operation = Operation{
		Data: data,
	}

	// Transmit.
	return transmit(e.operation)
}

// transmitBlock transmits a block operation using the engine's internal
// operation object.
func (e *Engine) transmitBlock(start, count uint64, transmit OperationTransmitter) error {
	// Set the operation parameters.
	*e.operation = Operation{
		Start: start,
		Count: count,
	}

	// Transmit.
	return transmit(e.operation)
}

// chunkAndTransmitAll is a fast-path routine for simply transmitting all data
// in a target stream. This is used when there are no blocks to match because
// the base stream is empty.
func (e *Engine) chunkAndTransmitAll(target io.Reader, maxDataOpSize uint64, transmit OperationTransmitter) error {
	// Verify that maxDataOpSize is sane.
	if maxDataOpSize == 0 {
		maxDataOpSize = DefaultMaximumDataOperationSize
	}

	// Create a buffer to transmit data operations.
	buffer := e.bufferWithSize(maxDataOpSize)

	// Loop until the entire target has been transmitted as data operations.
	for {
		if n, err := io.ReadFull(target, buffer); err == io.EOF {
			return nil
		} else if err == io.ErrUnexpectedEOF {
			if err = e.transmitData(buffer[:n], transmit); err != nil {
				return fmt.Errorf("unable to transmit data operation: %w", err)
			}
			return nil
		} else if err != nil {
			return fmt.Errorf("unable to read target: %w", err)
		} else if err = e.transmitData(buffer, transmit); err != nil {
			return fmt.Errorf("unable to transmit data operation: %w", err)
		}
	}
}

// Deltify computes delta operations to reconstitute the target data stream
// using the base stream (based on the provided base signature). It streams
// operations to the provided transmission function. The internal engine buffer
// will be resized to the sum of the maximum data operation size and the block
// size, and retained for the lifetime of the engine, so a reasonable value for
// the maximum data operation size should be provided. For performance reasons,
// this method does not validate that the provided signature satisfies expected
// invariants. It is the responsibility of the caller to verify that the
// signature is valid by calling its EnsureValid method. This is not necessary
// for signatures generated in the same process, but should be done for
// signatures received from untrusted locations (e.g. over the network). An
// invalid signature can result in undefined behavior.
func (e *Engine) Deltify(target io.Reader, base *Signature, maxDataOpSize uint64, transmit OperationTransmitter) error {
	// Verify that the maximum data operation size is sane.
	if maxDataOpSize == 0 {
		maxDataOpSize = DefaultMaximumDataOperationSize
	}

	// If the base is empty, then there's no way we'll find any matching
	// blocks, so just send the entire file.
	if len(base.Hashes) == 0 {
		return e.chunkAndTransmitAll(target, maxDataOpSize, transmit)
	}

	// Create a set of block and data transmitters that efficiently coalesce
	// adjacent block operations and provide data chunking. Some corresponding
	// finalization logic is required at the end of this function.
	var coalescedStart, coalescedCount uint64
	sendBlock := func(index uint64) error {
		if coalescedCount > 0 {
			if coalescedStart+coalescedCount == index {
				coalescedCount++
				return nil
			} else if err := e.transmitBlock(coalescedStart, coalescedCount, transmit); err != nil {
				return err
			}
		}
		coalescedStart = index
		coalescedCount = 1
		return nil
	}
	sendData := func(data []byte) error {
		if len(data) > 0 && coalescedCount > 0 {
			if err := e.transmitBlock(coalescedStart, coalescedCount, transmit); err != nil {
				return err
			}
			coalescedStart = 0
			coalescedCount = 0
		}
		for len(data) > 0 {
			sendSize := min(uint64(len(data)), maxDataOpSize)
			if err := e.transmitData(data[:sendSize], transmit); err != nil {
				return err
			}
			data = data[sendSize:]
		}
		return nil
	}

	// Ensure that the target implements io.Reader and io.ByteReader. If it
	// can do this natively, great! If not, wrap it in our re-usable buffered
	// reader, but ensure that it is released when we're done so that we don't
	// retain it indefinitely.
	bufferedTarget, ok := target.(dualModeReader)
	if !ok {
		e.targetReader.Reset(target)
		bufferedTarget = e.targetReader
		defer func() {
			e.targetReader.Reset(nil)
		}()
	}

	// Create a lookup table that maps weak hashes to all matching block
	// hashes. If the last block is short, we extract it and hold it
	// separately, because when doing match searches, we assume that all
	// blocks in this map have a full block size worth of data.
	//
	// The rsync technical report (see the section titled "Checksum
	// searching") actually advocates a 3-tier search (i.e. an additional
	// 16-bit hash layer before the weak hash), but I think this probably
	// isn't necessary with modern hardware and hashing algorithms.
	//
	// TODO: This is currently a little expensive because it requires a slice
	// allocation for each map entry. I suspect that the collision rate for
	// weak hashes is actually sufficiently low that we could make each map
	// value a fixed array of int that would limit the number of matches we
	// could try, but save us a lot of allocating. We would have to use an
	// int, because invalid values would likely need to be -1. This might be
	// an unnecessary optimization though, because this map is only generated
	// for non-empty bases, which typically don't come in large numbers. For a
	// few files, generating these maps with slice values is fine. It also
	// might be a bit slow since each insertion would require a linear search
	// to find the insertion location within the array.
	hashes := base.Hashes
	haveShortLastBlock := false
	var lastBlockIndex uint64
	var shortLastBlock *BlockHash
	if base.LastBlockSize != base.BlockSize {
		haveShortLastBlock = true
		lastBlockIndex = uint64(len(hashes) - 1)
		shortLastBlock = hashes[lastBlockIndex]
		hashes = hashes[:lastBlockIndex]
	}
	weakToBlockHashes := make(map[uint32][]uint64, len(hashes))
	for i, h := range hashes {
		weakToBlockHashes[h.Weak] = append(weakToBlockHashes[h.Weak], uint64(i))
	}

	// Create a buffer that we can use to load data and search for matches. We
	// start by filling it with a block's worth of data and then continuously
	// appending bytes until we either fill the buffer (at which point we
	// transmit data preceding the block and truncate) or find a match (at
	// which point we transmit data preceding the block and then transmit the
	// block match). Once we're unable to append a new byte or refill with a
	// full block, we terminate our search and send the remaining data
	// (potentially searching for one last short block match at the end of the
	// buffer).
	//
	// We choose the buffer size to hold a chunk of data of the maximum
	// allowed transmission size and a block of data. This size choice is
	// somewhat arbitrary since we have a data chunking function and could
	// load more data before doing a truncation/transmission, but this is also
	// a reasonable amount of data to hold in memory at any given time. We
	// could choose a larger preceding data chunk size to have less frequent
	// truncations, but (a) truncations are cheap and (b) we'll probably be
	// doing a lot of sequential block matching cycles where we just
	// continuously match blocks at the beginning of the buffer and then
	// refill, so truncations won't be all that common.
	buffer := e.bufferWithSize(maxDataOpSize + base.BlockSize)

	// Track the occupancy of the buffer.
	var occupancy uint64

	// Track the weak hash and its parameters for the block at the end of the
	// buffer.
	var weak, r1, r2 uint32

	// Loop over the contents of the file and search for matches.
	for {
		// If the buffer is empty, then we need to read in a block's worth of
		// data (if possible) and calculate the weak hash and its parameters.
		// If the buffer is non-empty but less than a block's worth of data,
		// then we've broken an invariant in our code. Otherwise, we need to
		// move the search block one byte forward and roll the hash.
		if occupancy == 0 {
			if n, err := io.ReadFull(bufferedTarget, buffer[:base.BlockSize]); err == io.EOF || err == io.ErrUnexpectedEOF {
				occupancy = uint64(n)
				break
			} else if err != nil {
				return fmt.Errorf("unable to perform initial buffer fill: %w", err)
			} else {
				occupancy = base.BlockSize
				weak, r1, r2 = e.weakHash(buffer[:occupancy], base.BlockSize)
			}
		} else if occupancy < base.BlockSize {
			panic("buffer contains less than a block worth of data")
		} else {
			if b, err := bufferedTarget.ReadByte(); err == io.EOF {
				break
			} else if err != nil {
				return fmt.Errorf("unable to read target byte: %w", err)
			} else {
				weak, r1, r2 = e.rollWeakHash(r1, r2, buffer[occupancy-base.BlockSize], b, base.BlockSize)
				buffer[occupancy] = b
				occupancy++
			}
		}

		// Look for a block match for the block at the end of the buffer.
		potentials := weakToBlockHashes[weak]
		match := false
		var matchIndex uint64
		if len(potentials) > 0 {
			strong := e.strongHash(buffer[occupancy-base.BlockSize:occupancy], false)
			for _, p := range potentials {
				if bytes.Equal(base.Hashes[p].Strong, strong) {
					match = true
					matchIndex = p
					break
				}
			}
		}

		// If there's a match, send any data preceding the match and then send
		// the match. Otherwise, if we've reached buffer capacity, send the
		// data preceding the search block.
		if match {
			if err := sendData(buffer[:occupancy-base.BlockSize]); err != nil {
				return fmt.Errorf("unable to transmit data preceding match: %w", err)
			} else if err = sendBlock(matchIndex); err != nil {
				return fmt.Errorf("unable to transmit match: %w", err)
			}
			occupancy = 0
		} else if occupancy == uint64(len(buffer)) {
			if err := sendData(buffer[:occupancy-base.BlockSize]); err != nil {
				return fmt.Errorf("unable to transmit data before truncation: %w", err)
			}
			copy(buffer[:base.BlockSize], buffer[occupancy-base.BlockSize:occupancy])
			occupancy = base.BlockSize
		}
	}

	// If we have a short last block and the occupancy of the buffer is large
	// enough that it could match, then check for a match.
	if haveShortLastBlock && occupancy >= base.LastBlockSize {
		potentialLastBlockMatch := buffer[occupancy-base.LastBlockSize : occupancy]
		// For short blocks, we still use the full block size when computing
		// the weak hash. We could alternatively use the short block length,
		// but it doesn't matter - all that matters is that we keep
		// consistency when we compute the short block weak hash in Signature.
		if w, _, _ := e.weakHash(potentialLastBlockMatch, base.BlockSize); w == shortLastBlock.Weak {
			if bytes.Equal(e.strongHash(potentialLastBlockMatch, false), shortLastBlock.Strong) {
				if err := sendData(buffer[:occupancy-base.LastBlockSize]); err != nil {
					return fmt.Errorf("unable to transmit data: %w", err)
				} else if err = sendBlock(lastBlockIndex); err != nil {
					return fmt.Errorf("unable to transmit operation: %w", err)
				}
				occupancy = 0
			}
		}
	}

	// Send any data remaining in the buffer.
	if err := sendData(buffer[:occupancy]); err != nil {
		return fmt.Errorf("unable to send final data operation: %w", err)
	}

	// Send any final pending coalesced operation. This can't be done as a
	// defer because we need to watch for errors.
	if coalescedCount > 0 {
		if err := e.transmitBlock(coalescedStart, coalescedCount, transmit); err != nil {
			return fmt.Errorf("unable to send final block operation: %w", err)
		}
	}

	// Success.
	return nil
}
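// streamingDeltifySketch is an illustrative sketch (the transmitter here only
// aggregates statistics, which is an arbitrary choice) of driving Deltify
// directly with a streaming target and a transmitter, as a transport layer
// would, rather than collecting operations via DeltifyBytes. It returns the
// total number of literal data bytes that the delta would carry.
func streamingDeltifySketch(target io.Reader, base *Signature) (uint64, error) {
	engine := NewEngine()
	var totalDataBytes uint64
	transmit := func(operation *Operation) error {
		// Operations are re-used by the engine, so only aggregate statistics
		// are retained here; a real transmitter would encode and send the
		// operation (or copy it) before returning.
		totalDataBytes += uint64(len(operation.Data))
		return nil
	}
	if err := engine.Deltify(target, base, 0, transmit); err != nil {
		return 0, fmt.Errorf("unable to deltify target: %w", err)
	}
	return totalDataBytes, nil
}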
// DeltifyBytes computes delta operations for a byte slice. Unlike the
// streaming Deltify method, it returns a slice of operations, which should be
// reasonable since the target data can already fit into memory. The internal
// engine buffer will be resized to the sum of the maximum data operation size
// and the block size, and retained for the lifetime of the engine, so a
// reasonable value for the maximum data operation size should be provided. For
// performance reasons, this method does not validate that the provided
// signature satisfies expected invariants. It is the responsibility of the
// caller to verify that the signature is valid by calling its EnsureValid
// method. This is not necessary for signatures generated in the same process,
// but should be done for signatures received from untrusted locations (e.g.
// over the network). An invalid signature can result in undefined behavior.
func (e *Engine) DeltifyBytes(target []byte, base *Signature, maxDataOpSize uint64) []*Operation {
	// Create an empty result.
	var delta []*Operation

	// Create an operation transmitter to populate the result.
	transmit := func(o *Operation) error {
		delta = append(delta, proto.Clone(o).(*Operation))
		return nil
	}

	// Wrap up the bytes in a reader.
	reader := bytes.NewReader(target)

	// Compute the delta and watch for errors (which shouldn't occur for
	// in-memory data).
	if err := e.Deltify(reader, base, maxDataOpSize, transmit); err != nil {
		panic(fmt.Errorf("in-memory deltification failure: %w", err))
	}

	// Success.
	return delta
}
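// deltaCoalescingSketch is an illustrative sketch (the base contents are
// arbitrary, but each block is distinct) of the block coalescing performed by
// Deltify: when the target is identical to a multi-block base, the resulting
// delta should be a single block operation spanning all blocks rather than one
// operation per block.
func deltaCoalescingSketch() bool {
	engine := NewEngine()

	// Construct a base of four distinct blocks and use the base itself as the
	// target.
	base := make([]byte, 4*DefaultBlockSize)
	for i := range base {
		base[i] = byte(i / DefaultBlockSize)
	}

	// Compute the signature and the (self-)delta.
	signature := engine.BytesSignature(base, DefaultBlockSize)
	delta := engine.DeltifyBytes(base, signature, 0)

	// Expect a single coalesced block operation covering blocks [0, 4).
	return len(delta) == 1 && delta[0].Start == 0 && delta[0].Count == 4
}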
// Patch applies a single operation against a base stream to reconstitute the
// target into the destination stream. For performance reasons, this method
// does not validate that the provided signature and operation satisfy expected
// invariants. It is the responsibility of the caller to verify that the
// signature and operation are valid by calling their respective EnsureValid
// methods. This is not necessary for signatures and operations generated in
// the same process, but should be done for signatures and operations received
// from untrusted locations (e.g. over the network). An invalid signature or
// operation can result in undefined behavior.
func (e *Engine) Patch(destination io.Writer, base io.ReadSeeker, signature *Signature, operation *Operation) error {
	// Handle the operation based on type.
	if len(operation.Data) > 0 {
		// Write data operations directly to the destination.
		if _, err := destination.Write(operation.Data); err != nil {
			return fmt.Errorf("unable to write data: %w", err)
		}
	} else {
		// Seek to the start of the requested block in base.
		// TODO: We should technically validate that operation.Start
		// multiplied by the block size can't overflow an int64. Worst case at
		// the moment it will cause the seek operation to fail.
		if _, err := base.Seek(int64(operation.Start)*int64(signature.BlockSize), io.SeekStart); err != nil {
			return fmt.Errorf("unable to seek to base location: %w", err)
		}

		// Copy the requested number of blocks.
		for c := uint64(0); c < operation.Count; c++ {
			// Compute the size to copy.
			copyLength := signature.BlockSize
			if operation.Start+c == uint64(len(signature.Hashes)-1) {
				copyLength = signature.LastBlockSize
			}

			// Create a buffer of the required size.
			buffer := e.bufferWithSize(copyLength)

			// Copy the block.
			if _, err := io.ReadFull(base, buffer); err != nil {
				return fmt.Errorf("unable to read block data: %w", err)
			} else if _, err = destination.Write(buffer); err != nil {
				return fmt.Errorf("unable to write block data: %w", err)
			}
		}
	}

	// Success.
	return nil
}

// PatchBytes applies a series of operations against a base byte slice to
// reconstitute the target byte slice. For performance reasons, this method
// does not validate that the provided signature and operations satisfy
// expected invariants. It is the responsibility of the caller to verify that
// the signature and operations are valid by calling their respective
// EnsureValid methods. This is not necessary for signatures and operations
// generated in the same process, but should be done for signatures and
// operations received from untrusted locations (e.g. over the network). An
// invalid signature or operation can result in undefined behavior.
func (e *Engine) PatchBytes(base []byte, signature *Signature, delta []*Operation) ([]byte, error) {
	// Wrap up the base bytes in a reader.
	baseReader := bytes.NewReader(base)

	// Create an output buffer.
	output := bytes.NewBuffer(nil)

	// Perform application.
	for _, o := range delta {
		if err := e.Patch(output, baseReader, signature, o); err != nil {
			return nil, err
		}
	}

	// Success.
	return output.Bytes(), nil
}
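// roundTripSketch is a minimal illustrative usage sketch (the inputs are
// arbitrary): it computes a signature for a base, computes a delta that
// transforms the base into a target, and applies the delta against the base to
// reconstruct the target, returning an error if the reconstruction does not
// match.
func roundTripSketch() error {
	engine := NewEngine()

	// Construct an arbitrary base and a target that shares a prefix with it.
	base := bytes.Repeat([]byte("the base revision of some file\n"), 1024)
	target := append(bytes.Repeat([]byte("the base revision of some file\n"), 512),
		[]byte("content appended to the target\n")...)

	// Compute the base signature. A block size of 0 lets the engine choose
	// one (the optimal size for seekable bases, the default otherwise).
	signature := engine.BytesSignature(base, 0)

	// Compute the delta from base to target. A maximum data operation size of
	// 0 selects DefaultMaximumDataOperationSize.
	delta := engine.DeltifyBytes(target, signature, 0)

	// Apply the delta against the base to reconstruct the target.
	patched, err := engine.PatchBytes(base, signature, delta)
	if err != nil {
		return fmt.Errorf("unable to apply delta: %w", err)
	} else if !bytes.Equal(patched, target) {
		return errors.New("reconstructed target does not match original")
	}

	// Success.
	return nil
}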