github.com/mutagen-io/mutagen@v0.18.0-rc1/pkg/synchronization/rsync/engine.go

     1  package rsync
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"crypto/sha1"
     7  	"errors"
     8  	"fmt"
     9  	"hash"
    10  	"io"
    11  	"math"
    12  
    13  	"google.golang.org/protobuf/proto"
    14  )
    15  
    16  // EnsureValid verifies that block hash invariants are respected.
    17  func (h *BlockHash) EnsureValid() error {
    18  	// A nil block hash is not valid.
    19  	if h == nil {
    20  		return errors.New("nil block hash")
    21  	}
    22  
    23  	// Ensure that the strong signature is valid.
    24  	if len(h.Strong) == 0 {
    25  		return errors.New("empty strong signature")
    26  	}
    27  
    28  	// Success.
    29  	return nil
    30  }
    31  
    32  // EnsureValid verifies that signature invariants are respected.
    33  func (s *Signature) EnsureValid() error {
    34  	// A nil signature is not valid.
    35  	if s == nil {
    36  		return errors.New("nil signature")
    37  	}
    38  
    39  	// Ensure that all block hashes are valid.
    40  	for _, h := range s.Hashes {
    41  		if err := h.EnsureValid(); err != nil {
    42  			return fmt.Errorf("invalid block hash: %w", err)
    43  		}
    44  	}
    45  
    46  	// If the block size is 0, then the last block size should also be 0 and
    47  	// there shouldn't be any hashes.
    48  	if s.BlockSize == 0 {
    49  		if s.LastBlockSize != 0 {
    50  			return errors.New("block size of 0 with non-0 last block size")
    51  		} else if len(s.Hashes) != 0 {
    52  			return errors.New("block size of 0 with non-0 number of hashes")
    53  		}
    54  		return nil
    55  	}
    56  
    57  	// If block size is non-0, then the last block size should be non-0 but less
    58  	// than or equal to the block size.
    59  	if s.LastBlockSize == 0 {
    60  		return errors.New("non-0 block size with last block size of 0")
    61  	} else if s.LastBlockSize > s.BlockSize {
    62  		return errors.New("last block size greater than block size")
    63  	}
    64  
    65  	// If the block size is non-0, then a non-zero number of blocks should have
    66  	// been hashed.
    67  	if len(s.Hashes) == 0 {
    68  		return errors.New("non-0 block size with no block hashes")
    69  	}
    70  
    71  	// Success.
    72  	return nil
    73  }
    74  
    75  // isEmpty returns true if the signature represents an empty file.
    76  func (s *Signature) isEmpty() bool {
    77  	// In theory, we might also want to test that LastBlockSize == 0 and that
    78  	// there aren't any hashes, but so long as the invariants of signature are
    79  	// maintained, this check is sufficient.
    80  	return s.BlockSize == 0
    81  }
    82  
    83  // EnsureValid verifies that operation invariants are respected.
    84  func (o *Operation) EnsureValid() error {
    85  	// A nil operation is not valid.
    86  	if o == nil {
    87  		return errors.New("nil operation")
    88  	}
    89  
    90  	// Ensure that the operation parameters are valid.
    91  	if len(o.Data) > 0 {
    92  		if o.Start != 0 {
    93  			return errors.New("data operation with non-0 block start index")
    94  		} else if o.Count != 0 {
    95  			return errors.New("data operation with non-0 block count")
    96  		}
    97  	} else if o.Count == 0 {
    98  		return errors.New("block operation with 0 block count")
    99  	}
   100  
   101  	// Success.
   102  	return nil
   103  }
   104  
   105  // resetToZeroMaintainingCapacity resets an Operation to its zero-value, but
   106  // leaves capacity in the data slice. It's worth noting that the zero-value
   107  // state is not a valid state for an Operation.
   108  func (o *Operation) resetToZeroMaintainingCapacity() {
   109  	// Reset the data slice, but maintain its capacity.
   110  	o.Data = o.Data[:0]
   111  
   112  	// Reset start and count.
   113  	o.Start = 0
   114  	o.Count = 0
   115  }
   116  
   117  // isZeroValue indicates whether or not an Operation has its zero-value. It's
   118  // worth noting that the zero-value state is not a valid state for an Operation.
   119  func (o *Operation) isZeroValue() bool {
   120  	return len(o.Data) == 0 && o.Start == 0 && o.Count == 0
   121  }
   122  
   123  const (
   124  	// minimumOptimalBlockSize is the minimum block size that will be returned
   125  	// by OptimalBlockSizeForBaseLength. It has to be chosen so that it is at
   126  	// least a few orders of magnitude larger than the size of a BlockHash.
   127  	minimumOptimalBlockSize = 1 << 10
   128  	// maximumOptimalBlockSize is the maximum block size that will be returned
   129  	// by OptimalBlockSizeForBaseLength. It mostly just needs to be bounded by
   130  	// what can fit into a reasonably sized in-memory buffer, particularly if
   131  	// multiple rsync engines are running. maximumOptimalBlockSize also needs
   132  	// to be less than or equal to (2^32)-1 for the weak hash algorithm to work.
   133  	maximumOptimalBlockSize = 1 << 16
   134  	// DefaultBlockSize is the default block size that will be used if a zero
   135  	// value is passed into Engine.Signature for the blockSize parameter.
   136  	DefaultBlockSize = 1 << 13
   137  	// DefaultMaximumDataOperationSize is the default maximum data size
   138  	// permitted per operation. The optimal value for this isn't at all
   139  	// correlated with block size - it's just what's reasonable to hold
   140  	// in-memory and pass over the wire in a single transmission. This value
   141  	// will be used if a zero value is passed into Engine.Deltify or
   142  	// Engine.DeltifyBytes for the maxDataOpSize parameter.
   143  	DefaultMaximumDataOperationSize = 1 << 16
   144  )
   145  
   146  // OptimalBlockSizeForBaseLength uses a simple heuristic to choose a block
   147  // size based on the base length. It starts by choosing the optimal block length
   148  // using the formula given in the rsync thesis. It then enforces that the block
   149  // size is within a sensible range.
   150  // TODO: Should we add rounding to "nice" values, e.g. the nearest multiple of
   151  // 1024 bytes? Would this improve read throughput?
   152  func OptimalBlockSizeForBaseLength(baseLength uint64) uint64 {
   153  	// Compute the optimal block length (see the rsync thesis) assuming one
   154  	// change per file.
   155  	result := uint64(math.Sqrt(24.0 * float64(baseLength)))
   156  
   157  	// Ensure it's within the allowed range.
   158  	if result < minimumOptimalBlockSize {
   159  		result = minimumOptimalBlockSize
   160  	} else if result > maximumOptimalBlockSize {
   161  		result = maximumOptimalBlockSize
   162  	}
   163  
   164  	// Done.
   165  	return result
   166  }
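
// exampleOptimalBlockSizes is an illustrative sketch added for exposition and is
// not part of the upstream engine. It shows how the square-root heuristic above
// is clamped to the [minimumOptimalBlockSize, maximumOptimalBlockSize] range for
// small and large bases. The base lengths used here are arbitrary.
func exampleOptimalBlockSizes() {
	small := OptimalBlockSizeForBaseLength(4 * 1024)           // sqrt(24*4096) ≈ 313, clamped up to 1 << 10
	medium := OptimalBlockSizeForBaseLength(1024 * 1024)       // sqrt(24*2^20) ≈ 5016, within range
	large := OptimalBlockSizeForBaseLength(1024 * 1024 * 1024) // sqrt(24*2^30) ≈ 160530, clamped down to 1 << 16
	fmt.Println(small, medium, large)
}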
   167  
   168  // OptimalBlockSizeForBase is a convenience function that will determine the
   169  // optimal block size for a base that implements io.Seeker. It calls down to
   170  // OptimalBlockSizeForBaseLength. After determining the base's length, it will
   171  // attempt to reset the base to its original position.
   172  func OptimalBlockSizeForBase(base io.Seeker) (uint64, error) {
   173  	if currentOffset, err := base.Seek(0, io.SeekCurrent); err != nil {
   174  		return 0, fmt.Errorf("unable to determine current base offset: %w", err)
   175  	} else if currentOffset < 0 {
   176  		return 0, errors.New("seek returned negative starting location")
   177  	} else if length, err := base.Seek(0, io.SeekEnd); err != nil {
   178  		return 0, fmt.Errorf("unable to compute base length: %w", err)
   179  	} else if length < 0 {
   180  		return 0, errors.New("seek returned negative offset")
   181  	} else if _, err = base.Seek(currentOffset, io.SeekStart); err != nil {
   182  		return 0, fmt.Errorf("unable to reset base: %w", err)
   183  	} else {
   184  		return OptimalBlockSizeForBaseLength(uint64(length)), nil
   185  	}
   186  }
   187  
   188  // OperationTransmitter transmits an operation. Operation objects and their data
   189  // buffers are re-used between calls to the transmitter, so the transmitter
   190  // should not return until it has either transmitted the operation or copied it
   191  // for later transmission.
   192  type OperationTransmitter func(*Operation) error
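
// exampleCollectingTransmitter is an illustrative sketch added for exposition and
// is not part of the upstream engine. It shows one way to satisfy the contract
// described above: because the engine re-uses the Operation and its data buffer
// between transmissions, a transmitter that retains operations must copy them
// (here via proto.Clone, mirroring DeltifyBytes below) before returning.
func exampleCollectingTransmitter(collected *[]*Operation) OperationTransmitter {
	return func(operation *Operation) error {
		*collected = append(*collected, proto.Clone(operation).(*Operation))
		return nil
	}
}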
   193  
   194  // Engine provides rsync functionality without any notion of transport. It is
   195  // designed to be re-used to avoid heavy buffer allocation.
   196  type Engine struct {
   197  	// buffer is a re-usable buffer that will be used for reading data and
   198  	// setting up operations.
   199  	buffer []byte
   200  	// strongHasher is the strong hash function to use for the engine.
   201  	strongHasher hash.Hash
   202  	// strongHashBuffer is a re-usable buffer that can be used by methods to
   203  	// receive digests.
   204  	strongHashBuffer []byte
   205  	// targetReader is a re-usable bufio.Reader that will be used for delta
   206  	// creation operations.
   207  	targetReader *bufio.Reader
   208  	// operation is a re-usable operation object used for transmissions to avoid
   209  	// allocations.
   210  	operation *Operation
   211  }
   212  
   213  // NewEngine creates a new rsync engine.
   214  func NewEngine() *Engine {
   215  	// Create the strong hash function.
   216  	// TODO: We might want to allow users to specify other strong hash functions
   217  	// for the engine to use (e.g. BLAKE2 functions), but for now we just use
   218  	// SHA-1 since it's a good balance of speed and robustness for rsync
   219  	// purposes.
   220  	strongHasher := sha1.New()
   221  
   222  	// Create the engine.
   223  	return &Engine{
   224  		strongHasher:     strongHasher,
   225  		strongHashBuffer: make([]byte, strongHasher.Size()),
   226  		targetReader:     bufio.NewReader(nil),
   227  		operation:        &Operation{},
   228  	}
   229  }
   230  
   231  // bufferWithSize lazily allocates the engine's internal buffer, ensuring that
   232  // it is the required size. The capacity of the internal buffer is retained
   233  // between calls to avoid allocations if possible.
   234  func (e *Engine) bufferWithSize(size uint64) []byte {
   235  	// Check if the buffer currently has the required capacity. If it does, then
   236  	// use that space. Note that we're checking *capacity* - you're allowed to
   237  	// slice a buffer up to its capacity, not just its length. Of course, if you
   238  	// don't own the buffer, you could run into problems with accessing data
   239  	// outside the slice that was given to you, but this buffer is completely
   240  	// internal, so that's not a concern.
   241  	if uint64(cap(e.buffer)) >= size {
   242  		return e.buffer[:size]
   243  	}
   244  
   245  	// If we couldn't use our existing buffer, create a new one, but store it
   246  	// for later re-use.
   247  	e.buffer = make([]byte, size)
   248  	return e.buffer
   249  }
   250  
   251  const (
   252  	// m is the weak hash modulus. I think they now recommend that it be the
   253  	// largest prime less than 2^16, but this value is fine as well.
   254  	m = 1 << 16
   255  )
   256  
   257  // weakHash computes a fast checksum that can be rolled (updated without full
   258  // recomputation). This particular hash is detailed on page 55 of the rsync
   259  // thesis. It is not theoretically optimal, but it's fine for our purposes.
   260  func (e *Engine) weakHash(data []byte, blockSize uint64) (uint32, uint32, uint32) {
   261  	// Compute hash components.
   262  	var r1, r2 uint32
   263  	for i, b := range data {
   264  		r1 += uint32(b)
   265  		r2 += (uint32(blockSize) - uint32(i)) * uint32(b)
   266  	}
   267  	r1 = r1 % m
   268  	r2 = r2 % m
   269  
   270  	// Compute the hash.
   271  	result := r1 + m*r2
   272  
   273  	// Done.
   274  	return result, r1, r2
   275  }
   276  
   277  // rollWeakHash updates the checksum computed by weakHash by adding and removing
   278  // a byte.
   279  func (e *Engine) rollWeakHash(r1, r2 uint32, out, in byte, blockSize uint64) (uint32, uint32, uint32) {
   280  	// Update components.
   281  	r1 = (r1 - uint32(out) + uint32(in)) % m
   282  	r2 = (r2 - uint32(blockSize)*uint32(out) + r1) % m
   283  
   284  	// Compute the hash.
   285  	result := r1 + m*r2
   286  
   287  	// Done.
   288  	return result, r1, r2
   289  }
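
// exampleRollingWeakHash is an illustrative sketch added for exposition and is
// not part of the upstream engine. It demonstrates the property that Deltify
// relies on: rolling the weak hash forward by one byte gives the same result as
// recomputing it from scratch over the shifted window. The window contents and
// block size used here are arbitrary.
func exampleRollingWeakHash() {
	e := NewEngine()
	data := []byte("abcdefghij")
	blockSize := uint64(4)

	// Hash the first window ("abcd") directly.
	_, r1, r2 := e.weakHash(data[:4], blockSize)

	// Roll one byte forward (drop 'a', add 'e') and compare with a direct
	// recomputation over the shifted window ("bcde").
	rolled, _, _ := e.rollWeakHash(r1, r2, data[0], data[4], blockSize)
	direct, _, _ := e.weakHash(data[1:5], blockSize)
	fmt.Println(rolled == direct) // true
}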
   290  
   291  // strongHash computes a slow but strong hash for a block of data. If allocate
   292  // is true, then a new byte slice will be allocated to receive the digest,
   293  // otherwise the engine's internal digest buffer will be used, but then the
   294  // digest will only be valid until the next call to strongHash.
   295  func (e *Engine) strongHash(data []byte, allocate bool) []byte {
   296  	// Reset the hasher.
   297  	e.strongHasher.Reset()
   298  
   299  	// Digest the data. The Hash interface guarantees that writes succeed.
   300  	e.strongHasher.Write(data)
   301  
   302  	// Compute the output location.
   303  	var output []byte
   304  	if !allocate {
   305  		output = e.strongHashBuffer[:0]
   306  	}
   307  
   308  	// Compute the digest.
   309  	return e.strongHasher.Sum(output)
   310  }
   311  
   312  // Signature computes the signature for a base stream. If the provided block
   313  // size is 0, this method will attempt to compute the optimal block size (which
   314  // requires that base implement io.Seeker), and failing that will fall back to a
   315  // default block size.
   316  func (e *Engine) Signature(base io.Reader, blockSize uint64) (*Signature, error) {
   317  	// Choose a block size if none is specified. If the base also implements
   318  	// io.Seeker (which most will since they need to for Patch), then use the
   319  	// optimal block size, otherwise use the default.
   320  	if blockSize == 0 {
   321  		if baseSeeker, ok := base.(io.Seeker); ok {
   322  			if s, err := OptimalBlockSizeForBase(baseSeeker); err == nil {
   323  				blockSize = s
   324  			} else {
   325  				blockSize = DefaultBlockSize
   326  			}
   327  		} else {
   328  			blockSize = DefaultBlockSize
   329  		}
   330  	}
   331  
   332  	// Create the result.
   333  	result := &Signature{
   334  		BlockSize: blockSize,
   335  	}
   336  
   337  	// Create a buffer with which to read blocks.
   338  	buffer := e.bufferWithSize(blockSize)
   339  
   340  	// Read blocks and append their hashes until we reach EOF.
   341  	eof := false
   342  	for !eof {
   343  		// Read the next block and watch for errors. If we receive io.EOF, then
   344  		// nothing was read, and we should break immediately. This means that
   345  		// the base had a length that was a multiple of the block size. If we
   346  		// receive io.ErrUnexpectedEOF, then something was read but we're still
   347  		// at the end of the file, so we should hash this block but not go
   348  		// through the loop again. All other errors are terminal.
   349  		n, err := io.ReadFull(base, buffer)
   350  		if err == io.EOF {
   351  			result.LastBlockSize = blockSize
   352  			break
   353  		} else if err == io.ErrUnexpectedEOF {
   354  			result.LastBlockSize = uint64(n)
   355  			eof = true
   356  		} else if err != nil {
   357  			return nil, fmt.Errorf("unable to read data block: %w", err)
   358  		}
   359  
   360  		// Compute hashes for the block that was read. For short blocks, we
   361  		// still use the full block size when computing the weak hash. We could
   362  		// alternatively use the short block length, but it doesn't matter - all
   363  		// that matters is that we keep consistency when we compute the short
   364  		// block weak hash when searching in Deltify.
   365  		weak, _, _ := e.weakHash(buffer[:n], blockSize)
   366  		strong := e.strongHash(buffer[:n], true)
   367  
   368  		// Add the block hash.
   369  		result.Hashes = append(result.Hashes, &BlockHash{
   370  			Weak:   weak,
   371  			Strong: strong,
   372  		})
   373  	}
   374  
   375  	// If there are no hashes, then clear out the block sizes.
   376  	if len(result.Hashes) == 0 {
   377  		result.BlockSize = 0
   378  		result.LastBlockSize = 0
   379  	}
   380  
   381  	// Success.
   382  	return result, nil
   383  }
   384  
   385  // BytesSignature computes the signature for a byte slice.
   386  func (e *Engine) BytesSignature(base []byte, blockSize uint64) *Signature {
   387  	// Compute the signature and watch for errors (which shouldn't occur for
   388  	// in-memory data).
   389  	result, err := e.Signature(bytes.NewReader(base), blockSize)
   390  	if err != nil {
   391  		panic(fmt.Errorf("in-memory signature failure: %w", err))
   392  	}
   393  
   394  	// Success.
   395  	return result
   396  }
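
// exampleSignatureShape is an illustrative sketch added for exposition and is not
// part of the upstream engine. It shows the relationship between the base length,
// the block size, the number of block hashes, and the last block size. The sizes
// used here are arbitrary.
func exampleSignatureShape() {
	e := NewEngine()

	// 10000 bytes with a 4096-byte block size yields three block hashes: two
	// full blocks and a final short block of 10000 - 2*4096 = 1808 bytes.
	signature := e.BytesSignature(make([]byte, 10000), 4096)
	fmt.Println(len(signature.Hashes), signature.BlockSize, signature.LastBlockSize) // 3 4096 1808
}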
   397  
   398  // dualModeReader unifies the io.Reader and io.ByteReader interfaces. It is used
   399  // in deltify operations to ensure that bytes can be efficiently extracted from
   400  // targets.
   401  type dualModeReader interface {
   402  	io.Reader
   403  	io.ByteReader
   404  }
   405  
   406  // min implements simple minimum finding for uint64 values.
   407  func min(a, b uint64) uint64 {
   408  	if a < b {
   409  		return a
   410  	}
   411  	return b
   412  }
   413  
   414  // transmitData transmits a data operation using the engine's internal operation
   415  // object.
   416  func (e *Engine) transmitData(data []byte, transmit OperationTransmitter) error {
   417  	// Set the operation parameters.
   418  	*e.operation = Operation{
   419  		Data: data,
   420  	}
   421  
   422  	// Transmit.
   423  	return transmit(e.operation)
   424  }
   425  
   426  // transmitBlock transmits a block operation using the engine's internal
   427  // operation object.
   428  func (e *Engine) transmitBlock(start, count uint64, transmit OperationTransmitter) error {
   429  	// Set the operation parameters.
   430  	*e.operation = Operation{
   431  		Start: start,
   432  		Count: count,
   433  	}
   434  
   435  	// Transmit.
   436  	return transmit(e.operation)
   437  }
   438  
   439  // chunkAndTransmitAll is a fast-path routine for simply transmitting all data
   440  // in a target stream. This is used when there are no blocks to match because
   441  // the base stream is empty.
   442  func (e *Engine) chunkAndTransmitAll(target io.Reader, maxDataOpSize uint64, transmit OperationTransmitter) error {
   443  	// Verify that maxDataOpSize is sane.
   444  	if maxDataOpSize == 0 {
   445  		maxDataOpSize = DefaultMaximumDataOperationSize
   446  	}
   447  
   448  	// Create a buffer to transmit data operations.
   449  	buffer := e.bufferWithSize(maxDataOpSize)
   450  
   451  	// Loop until the entire target has been transmitted as data operations.
   452  	for {
   453  		if n, err := io.ReadFull(target, buffer); err == io.EOF {
   454  			return nil
   455  		} else if err == io.ErrUnexpectedEOF {
   456  			if err = e.transmitData(buffer[:n], transmit); err != nil {
   457  				return fmt.Errorf("unable to transmit data operation: %w", err)
   458  			}
   459  			return nil
   460  		} else if err != nil {
   461  			return fmt.Errorf("unable to read target: %w", err)
   462  		} else if err = e.transmitData(buffer, transmit); err != nil {
   463  			return fmt.Errorf("unable to transmit data operation: %w", err)
   464  		}
   465  	}
   466  }
   467  
   468  // Deltify computes delta operations to reconstitute the target data stream
   469  // using the base stream (based on the provided base signature). It streams
   470  // operations to the provided transmission function. The internal engine buffer
   471  // will be resized to the sum of the maximum data operation size plus the block
   472  // size, and retained for the lifetime of the engine, so a reasonable value
   473  // for the maximum data operation size should be provided. For performance
   474  // reasons, this method does not validate that the provided signature satisfies
   475  // expected invariants. It is the responsibility of the caller to verify that
   476  // the signature is valid by calling its EnsureValid method. This is not
   477  // necessary for signatures generated in the same process, but should be done
   478  // for signatures received from untrusted locations (e.g. over the network). An
   479  // invalid signature can result in undefined behavior.
   480  func (e *Engine) Deltify(target io.Reader, base *Signature, maxDataOpSize uint64, transmit OperationTransmitter) error {
   481  	// Verify that the maximum data operation size is sane.
   482  	if maxDataOpSize == 0 {
   483  		maxDataOpSize = DefaultMaximumDataOperationSize
   484  	}
   485  
   486  	// If the base is empty, then there's no way we'll find any matching blocks,
   487  	// so just send the entire file.
   488  	if len(base.Hashes) == 0 {
   489  		return e.chunkAndTransmitAll(target, maxDataOpSize, transmit)
   490  	}
   491  
   492  	// Create a set of block and data transmitters that efficiently coalesce
   493  	// adjacent block operations and provide data chunking. Some corresponding
   494  	// finalization logic is required at the end of this function.
   495  	var coalescedStart, coalescedCount uint64
   496  	sendBlock := func(index uint64) error {
   497  		if coalescedCount > 0 {
   498  			if coalescedStart+coalescedCount == index {
   499  				coalescedCount++
   500  				return nil
   501  			} else if err := e.transmitBlock(coalescedStart, coalescedCount, transmit); err != nil {
   502  				return err
   503  			}
   504  		}
   505  		coalescedStart = index
   506  		coalescedCount = 1
   507  		return nil
   508  	}
   509  	sendData := func(data []byte) error {
   510  		if len(data) > 0 && coalescedCount > 0 {
   511  			if err := e.transmitBlock(coalescedStart, coalescedCount, transmit); err != nil {
   512  				return err
   513  			}
   514  			coalescedStart = 0
   515  			coalescedCount = 0
   516  		}
   517  		for len(data) > 0 {
   518  			sendSize := min(uint64(len(data)), maxDataOpSize)
   519  			if err := e.transmitData(data[:sendSize], transmit); err != nil {
   520  				return err
   521  			}
   522  			data = data[sendSize:]
   523  		}
   524  		return nil
   525  	}
   526  
   527  	// Ensure that the target implements io.Reader and io.ByteReader. If it can
   528  	// do this natively, great! If not, wrap it in our re-usable buffered
   529  	// reader, but ensure that it is released when we're done so that we don't
   530  	// retain it indefinitely.
   531  	bufferedTarget, ok := target.(dualModeReader)
   532  	if !ok {
   533  		e.targetReader.Reset(target)
   534  		bufferedTarget = e.targetReader
   535  		defer func() {
   536  			e.targetReader.Reset(nil)
   537  		}()
   538  	}
   539  
   540  	// Create a lookup table that maps weak hashes to all matching block hashes.
   541  	// If the last block is short, we extract it and hold it separately, because
   542  	// when doing match searches, we assume that all blocks in this map have a
   543  	// full block size worth of data.
   544  	//
   545  	// The rsync technical report (see the section titled "Checksum searching")
   546  	// actually advocates a 3-tier search (i.e. an additional 16-bit hash layer
   547  	// before the weak hash), but I think this probably isn't necessary with
   548  	// modern hardware and hashing algorithms.
   549  	//
   550  	// TODO: This is currently a little expensive because it requires a slice
   551  	// allocation for each map entry. I suspect that the collision rate for weak
   552  	// hashes is actually sufficiently low that we could make each map value a
   553  	// fixed array of int that would limit the number of matches we could try,
   554  	// but save us a lot of allocating. We would have to use an int, because
   555  	// invalid values would likely need to be -1. This might be an unnecessary
   556  	// operation though, because this map is only generated for non-empty bases,
   557  	// which typically don't come in large numbers. For a few files, generating
   558  	// these maps with slice values is fine. It also might be a bit slow since
   559  	// each insertion would require a linear search to find the insertion
   560  	// location within the array.
   561  	hashes := base.Hashes
   562  	haveShortLastBlock := false
   563  	var lastBlockIndex uint64
   564  	var shortLastBlock *BlockHash
   565  	if base.LastBlockSize != base.BlockSize {
   566  		haveShortLastBlock = true
   567  		lastBlockIndex = uint64(len(hashes) - 1)
   568  		shortLastBlock = hashes[lastBlockIndex]
   569  		hashes = hashes[:lastBlockIndex]
   570  	}
   571  	weakToBlockHashes := make(map[uint32][]uint64, len(hashes))
   572  	for i, h := range hashes {
   573  		weakToBlockHashes[h.Weak] = append(weakToBlockHashes[h.Weak], uint64(i))
   574  	}
   575  
   576  	// Create a buffer that we can use to load data and search for matches. We
   577  	// start by filling it with a block's worth of data and then continuously
   578  	// appending bytes until we either fill the buffer (at which point we
   579  	// transmit data preceding the block and truncate) or find a match (at
   580  	// which point we transmit data preceding the block and then transmit the
   581  	// block match). Once we're unable to append a new byte or refill with a
   582  	// full block, we terminate our search and send the remaining data
   583  	// (potentially searching for one last short block match at the end of the
   584  	// buffer).
   585  	//
   586  	// We choose the buffer size to hold a chunk of data of the maximum allowed
   587  	// transmission size and a block of data. This size choice is somewhat
   588  	// arbitrary since we have a data chunking function and could load more data
   589  	// before doing a truncation/transmission, but this is also a reasonable
   590  	// amount of data to hold in memory at any given time. We could choose a
   591  	// larger preceding data chunk size to have less frequent truncations, but
   592  	// (a) truncations are cheap and (b) we'll probably be doing a lot of
   593  	// sequential block matching cycles where we just continuously match blocks
   594  	// at the beginning of the buffer and then refill, so truncations won't be
   595  	// all that common.
   596  	buffer := e.bufferWithSize(maxDataOpSize + base.BlockSize)
   597  
   598  	// Track the occupancy of the buffer.
   599  	var occupancy uint64
   600  
   601  	// Track the weak hash and its parameters for the block at the end of the
   602  	// buffer.
   603  	var weak, r1, r2 uint32
   604  
   605  	// Loop over the contents of the file and search for matches.
   606  	for {
   607  		// If the buffer is empty, then we need to read in a block's worth of
   608  		// data (if possible) and calculate the weak hash and its parameters. If
   609  		// the buffer is non-empty but less than a block's worth of data, then
   610  		// we've broken an invariant in our code. Otherwise, we need to move the
   611  		// search block one byte forward and roll the hash.
   612  		if occupancy == 0 {
   613  			if n, err := io.ReadFull(bufferedTarget, buffer[:base.BlockSize]); err == io.EOF || err == io.ErrUnexpectedEOF {
   614  				occupancy = uint64(n)
   615  				break
   616  			} else if err != nil {
   617  				return fmt.Errorf("unable to perform initial buffer fill: %w", err)
   618  			} else {
   619  				occupancy = base.BlockSize
   620  				weak, r1, r2 = e.weakHash(buffer[:occupancy], base.BlockSize)
   621  			}
   622  		} else if occupancy < base.BlockSize {
   623  			panic("buffer contains less than a block worth of data")
   624  		} else {
   625  			if b, err := bufferedTarget.ReadByte(); err == io.EOF {
   626  				break
   627  			} else if err != nil {
   628  				return fmt.Errorf("unable to read target byte: %w", err)
   629  			} else {
   630  				weak, r1, r2 = e.rollWeakHash(r1, r2, buffer[occupancy-base.BlockSize], b, base.BlockSize)
   631  				buffer[occupancy] = b
   632  				occupancy++
   633  			}
   634  		}
   635  
   636  		// Look for a block match for the block at the end of the buffer.
   637  		potentials := weakToBlockHashes[weak]
   638  		match := false
   639  		var matchIndex uint64
   640  		if len(potentials) > 0 {
   641  			strong := e.strongHash(buffer[occupancy-base.BlockSize:occupancy], false)
   642  			for _, p := range potentials {
   643  				if bytes.Equal(base.Hashes[p].Strong, strong) {
   644  					match = true
   645  					matchIndex = p
   646  					break
   647  				}
   648  			}
   649  		}
   650  
   651  		// If there's a match, send any data preceding the match and then send
   652  		// the match. Otherwise, if we've reached buffer capacity, send the data
   653  		// preceding the search block.
   654  		if match {
   655  			if err := sendData(buffer[:occupancy-base.BlockSize]); err != nil {
   656  				return fmt.Errorf("unable to transmit data preceding match: %w", err)
   657  			} else if err = sendBlock(matchIndex); err != nil {
   658  				return fmt.Errorf("unable to transmit match: %w", err)
   659  			}
   660  			occupancy = 0
   661  		} else if occupancy == uint64(len(buffer)) {
   662  			if err := sendData(buffer[:occupancy-base.BlockSize]); err != nil {
   663  				return fmt.Errorf("unable to transmit data before truncation: %w", err)
   664  			}
   665  			copy(buffer[:base.BlockSize], buffer[occupancy-base.BlockSize:occupancy])
   666  			occupancy = base.BlockSize
   667  		}
   668  	}
   669  
   670  	// If we have a short last block and the occupancy of the buffer is large
   671  	// enough that it could match, then check for a match.
   672  	if haveShortLastBlock && occupancy >= base.LastBlockSize {
   673  		potentialLastBlockMatch := buffer[occupancy-base.LastBlockSize : occupancy]
   674  		// For short blocks, we still use the full block size when computing the
   675  		// weak hash. We could alternatively use the short block length, but it
   676  		// doesn't matter - all that matters is that we keep consistency when we
   677  		// compute the short block weak hash in Signature.
   678  		if w, _, _ := e.weakHash(potentialLastBlockMatch, base.BlockSize); w == shortLastBlock.Weak {
   679  			if bytes.Equal(e.strongHash(potentialLastBlockMatch, false), shortLastBlock.Strong) {
   680  				if err := sendData(buffer[:occupancy-base.LastBlockSize]); err != nil {
   681  					return fmt.Errorf("unable to transmit data: %w", err)
   682  				} else if err = sendBlock(lastBlockIndex); err != nil {
   683  					return fmt.Errorf("unable to transmit operation: %w", err)
   684  				}
   685  				occupancy = 0
   686  			}
   687  		}
   688  	}
   689  
   690  	// Send any data remaining in the buffer.
   691  	if err := sendData(buffer[:occupancy]); err != nil {
   692  		return fmt.Errorf("unable to send final data operation: %w", err)
   693  	}
   694  
   695  	// Send any final pending coalesced operation. This can't be done as a defer
   696  	// because we need to watch for errors.
   697  	if coalescedCount > 0 {
   698  		if err := e.transmitBlock(coalescedStart, coalescedCount, transmit); err != nil {
   699  			return fmt.Errorf("unable to send final block operation: %w", err)
   700  		}
   701  	}
   702  
   703  	// Success.
   704  	return nil
   705  }
   706  
   707  // DeltifyBytes computes delta operations for a byte slice. Unlike the streaming
   708  // Deltify method, it returns a slice of operations, which should be reasonable
   709  // since the target data can already fit into memory. The internal engine buffer
   710  // will be resized to the sum of the maximum data operation size plus the block
   711  // size, and retained for the lifetime of the engine, so a reasonable value
   712  // for the maximum data operation size should be provided. For performance
   713  // reasons, this method does not validate that the provided signature satisfies
   714  // expected invariants. It is the responsibility of the caller to verify that
   715  // the signature is valid by calling its EnsureValid method. This is not
   716  // necessary for signatures generated in the same process, but should be done
   717  // for signatures received from untrusted locations (e.g. over the network). An
   718  // invalid signature can result in undefined behavior.
   719  func (e *Engine) DeltifyBytes(target []byte, base *Signature, maxDataOpSize uint64) []*Operation {
   720  	// Create an empty result.
   721  	var delta []*Operation
   722  
   723  	// Create an operation transmitter to populate the result.
   724  	transmit := func(o *Operation) error {
   725  		delta = append(delta, proto.Clone(o).(*Operation))
   726  		return nil
   727  	}
   728  
   729  	// Wrap up the bytes in a reader.
   730  	reader := bytes.NewReader(target)
   731  
   732  	// Compute the delta and watch for errors (which shouldn't occur for
   733  	// in-memory data).
   734  	if err := e.Deltify(reader, base, maxDataOpSize, transmit); err != nil {
   735  		panic(fmt.Errorf("in-memory deltification failure: %w", err))
   736  	}
   737  
   738  	// Success.
   739  	return delta
   740  }
   741  
   742  // Patch applies a single operation against a base stream to reconstitute the
   743  // target into the destination stream. For performance reasons, this method does
   744  // not validate that the provided signature and operation satisfy expected
   745  // invariants. It is the responsibility of the caller to verify that the
   746  // signature and operation are valid by calling their respective EnsureValid
   747  // methods. This is not necessary for signatures and operations generated in the
   748  // same process, but should be done for signatures and operations received from
   749  // untrusted locations (e.g. over the network). An invalid signature or
   750  // operation can result in undefined behavior.
   751  func (e *Engine) Patch(destination io.Writer, base io.ReadSeeker, signature *Signature, operation *Operation) error {
   752  	// Handle the operation based on type.
   753  	if len(operation.Data) > 0 {
   754  		// Write data operations directly to the destination.
   755  		if _, err := destination.Write(operation.Data); err != nil {
   756  			return fmt.Errorf("unable to write data: %w", err)
   757  		}
   758  	} else {
   759  		// Seek to the start of the requested block in base.
   760  		// TODO: We should technically validate that operation.Start
   761  		// multiplied by the block size can't overflow an int64. Worst case
   762  		// at the moment it will cause the seek operation to fail.
   763  		if _, err := base.Seek(int64(operation.Start)*int64(signature.BlockSize), io.SeekStart); err != nil {
   764  			return fmt.Errorf("unable to seek to base location: %w", err)
   765  		}
   766  
   767  		// Copy the requested number of blocks.
   768  		for c := uint64(0); c < operation.Count; c++ {
   769  			// Compute the size to copy.
   770  			copyLength := signature.BlockSize
   771  			if operation.Start+c == uint64(len(signature.Hashes)-1) {
   772  				copyLength = signature.LastBlockSize
   773  			}
   774  
   775  			// Create a buffer of the required size.
   776  			buffer := e.bufferWithSize(copyLength)
   777  
   778  			// Copy the block.
   779  			if _, err := io.ReadFull(base, buffer); err != nil {
   780  				return fmt.Errorf("unable to read block data: %w", err)
   781  			} else if _, err = destination.Write(buffer); err != nil {
   782  				return fmt.Errorf("unable to write block data: %w", err)
   783  			}
   784  		}
   785  	}
   786  
   787  	// Success.
   788  	return nil
   789  }
   790  
   791  // PatchBytes applies a series of operations against a base byte slice to
   792  // reconstitute the target byte slice. For performance reasons, this method does
   793  // not validate that the provided signature and operation satisfy expected
   794  // invariants. It is the responsibility of the caller to verify that the
   795  // signature and operation are valid by calling their respective EnsureValid
   796  // methods. This is not necessary for signatures and operations generated in the
   797  // same process, but should be done for signatures and operations received from
   798  // untrusted locations (e.g. over the network). An invalid signature or
   799  // operation can result in undefined behavior.
   800  func (e *Engine) PatchBytes(base []byte, signature *Signature, delta []*Operation) ([]byte, error) {
   801  	// Wrap up the base bytes in a reader.
   802  	baseReader := bytes.NewReader(base)
   803  
   804  	// Create an output buffer.
   805  	output := bytes.NewBuffer(nil)
   806  
   807  	// Perform application.
   808  	for _, o := range delta {
   809  		if err := e.Patch(output, baseReader, signature, o); err != nil {
   810  			return nil, err
   811  		}
   812  	}
   813  
   814  	// Success.
   815  	return output.Bytes(), nil
   816  }
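
// exampleRoundTrip is an illustrative sketch added for exposition and is not part
// of the upstream engine. It wires the in-memory helpers together: the receiver
// computes a signature of its base, the sender deltifies its target against that
// signature, and the receiver patches the base with the resulting operations to
// reconstruct the target. The buffers and sizes used here are arbitrary.
func exampleRoundTrip() {
	e := NewEngine()

	// The receiver's base and the sender's (slightly different) target.
	base := bytes.Repeat([]byte("0123456789abcdef"), 1024)
	target := append(append([]byte("prefix-"), base...), "-suffix"...)

	// Receiver: compute a signature of the base, letting the engine pick the
	// block size. Validation matters when a signature crosses a trust boundary.
	signature := e.BytesSignature(base, 0)
	if err := signature.EnsureValid(); err != nil {
		panic(err)
	}

	// Sender: compute a delta of the target against the signature.
	delta := e.DeltifyBytes(target, signature, 0)

	// Receiver: apply the delta to the base to reconstruct the target.
	patched, err := e.PatchBytes(base, signature, delta)
	if err != nil {
		panic(err)
	}
	fmt.Println(bytes.Equal(patched, target)) // true
}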