github.com/creachadair/ffs@v0.17.3/block/splitter.go (about)

     1  // Copyright 2019 Michael J. Fromberger. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package block implements content-sensitive partitioning of a stream of byte
    16  // data into blocks, using a rolling hash function.
    17  //
    18  // The algorithm used to split data into blocks is based on the one from LBFS:
    19  //
    20  //	http://pdos.csail.mit.edu/lbfs/
    21  //
    22  // As described in the SOSP 2001 paper "A Low-Bandwidth Network File System":
    23  //
    24  //	https://pdos.csail.mit.edu/papers/lbfs:sosp01/lbfs.pdf
    25  //
    26  // This package provides an implementation of the Rabin-Karp modular rolling
    27  // hash algorithm; other algorithms can be plugged in by implementing the
    28  // Hasher and Hash interfaces.
    29  package block
    30  
    31  // TODO(Sep 2021): The LBFS paper seems to be inaccessible from MIT.
    32  // There's a presentation about it here: http://www.scs.stanford.edu/nyu/02fa/notes/l15.pdf
    33  
    34  import (
    35  	"bufio"
    36  	"io"
    37  )
    38  
// These values are the defaults used if none are specified in the config.
// They are exported so callers can reference the defaults explicitly.
const (
	// DefaultMin is the default minimum block size, in bytes.
	// No cut point is accepted before this many bytes have accumulated.
	DefaultMin = 2048

	// DefaultSize is the default target block size, in bytes.
	// Blocks average approximately this size over a long input.
	DefaultSize = 16384

	// DefaultMax is the default maximum block size, in bytes.
	// A block is forcibly split at this size even without a hash cut.
	DefaultMax = 65536
)
    50  
// DefaultHasher is used by a Splitter if no hasher is set in its config.
// NOTE(review): the arguments are presumably the Rabin-Karp base, modulus,
// and window size (in bytes) — confirm against the RabinKarpHasher docs.
var DefaultHasher = RabinKarpHasher(1031, 2147483659, 48)
    53  
// A SplitConfig contains the settings to construct a splitter.
// The zero value (and a nil pointer) is valid and uses the package defaults.
type SplitConfig struct {
	// The rolling hash to use. If nil, uses DefaultHasher.
	Hasher

	// Minimum block size, in bytes. The splitter will not split a block until
	// it is at least this size. If zero or negative, DefaultMin is used.
	Min int

	// Desired block size, in bytes. The splitter will attempt to generate
	// blocks of approximately this average size. If zero or negative,
	// DefaultSize is used.
	Size int

	// Maximum block size, in bytes. The splitter will split any block that
	// exceeds this size, even if the rolling hash does not find a break.
	// If zero or negative, DefaultMax is used.
	Max int
}
    71  
    72  // Hash implements the Hasher interface for a SplitConfig.
    73  func (c *SplitConfig) Hash() Hash {
    74  	if c == nil || c.Hasher == nil {
    75  		return DefaultHasher.Hash()
    76  	}
    77  	return c.Hasher.Hash()
    78  }
    79  
    80  func (c *SplitConfig) min() int {
    81  	if c == nil || c.Min <= 0 {
    82  		return DefaultMin
    83  	}
    84  	return c.Min
    85  }
    86  
    87  func (c *SplitConfig) size() int {
    88  	if c == nil || c.Size <= 0 {
    89  		return DefaultSize
    90  	}
    91  	return c.Size
    92  }
    93  
    94  func (c *SplitConfig) max() int {
    95  	if c == nil || c.Max <= 0 {
    96  		return DefaultMax
    97  	}
    98  	return c.Max
    99  }
   100  
   101  // NewSplitter constructs a Splitter that reads its data from r and partitions
   102  // it into blocks using the rolling hash from c. A nil *SplitConfig is ready
   103  // for use with default sizes and hash settings.
   104  func NewSplitter(r io.Reader, c *SplitConfig) *Splitter {
   105  	var buf *bufio.Reader
   106  	if v, ok := r.(*bufio.Reader); ok {
   107  		buf = v
   108  	} else {
   109  		buf = bufio.NewReaderSize(r, c.max())
   110  	}
   111  	return &Splitter{
   112  		reader: buf,
   113  		config: c,
   114  
   115  		hash: c.Hash(),
   116  		min:  c.min(),
   117  		exp:  c.size(),
   118  		buf:  make([]byte, c.max()),
   119  	}
   120  }
   121  
// A Splitter wraps an underlying io.Reader to split the data from the reader
// into blocks using a rolling hash. Use NewSplitter to construct one.
//
// Invariant: buf[:end] holds the bytes of the most recently returned block,
// and buf[end:next] holds data read but not yet assigned to a block (see the
// implementation notes at the end of this file).
type Splitter struct {
	reader *bufio.Reader // The underlying source of block data.
	config *SplitConfig  // a saved copy of the config

	hash Hash   // The rolling hash used to find breakpoints.
	min  int    // Minimum block size in bytes.
	exp  int    // Expected block size in bytes.
	next int    // Next unused offset in buf.
	end  int    // End of previous block.
	buf  []byte // Incoming data buffer (len == maximum block size).
}
   135  
   136  // Config returns the SplitConfig used to construct s, which may be nil.
   137  func (s *Splitter) Config() *SplitConfig { return s.config }
   138  
// Next returns the next available block, or an error.  The slice returned is
// only valid until a subsequent call of Next.  Returns nil, io.EOF when no
// further blocks are available.
func (s *Splitter) Next() ([]byte, error) {
	// Shift out the previous block, if any.  This invalidates any previous
	// slice returned by this method, as the data have moved.
	if s.end > 0 {
		copy(s.buf, s.buf[s.end:])
		s.next -= s.end
		s.end = 0
	}

	i := s.end // The position of the next potential block boundary
	for {
		// Try to read more data into the buffer.  An EOF at this point is not
		// an error, since there may be data left in the buffer from earlier.
		nr, err := s.reader.Read(s.buf[s.next:])
		if err != nil && err != io.EOF {
			return nil, err
		}
		s.next += nr

		// Look for a block boundary: A point where the hash value goes to 1
		// modulo the desired block size, or we run out of buffered data.
		// The hash state deliberately carries over from previous calls.
		isCut := false
		for ; i < s.next; i++ {
			u := s.hash.Update(s.buf[i])
			// A cut requires both the hash criterion and that the candidate
			// block has reached the configured minimum size.
			isCut = u%uint64(s.exp) == 1 && i-s.end >= s.min
			if isCut {
				break
			}
		}

		// If we found a block cut, or have reached the maximum block size, or
		// there is no input left, update state and return the block.
		// Since len(s.buf) is the maximum block size, i >= len(s.buf) means a
		// forced maximum-length block; i > s.end with EOF is a non-empty tail.
		if isCut || i >= len(s.buf) || (i > s.end && err == io.EOF) {
			block := s.buf[s.end:i]
			s.end = i
			return block, nil
		}

		// We didn't find a cut, and there's room for more data in the buffer.
		// If there's still something left to read, go back for another chunk.
		if err == io.EOF {
			break
		}
	}
	// No more blocks available, end of input.
	return nil, io.EOF
}
   189  
   190  // Split splits blocks from s and passes each block in sequence to f, until
   191  // there are no further blocks or until f returns an error.  If f returns an
   192  // error, processing stops and that error is returned to the caller of Split.
   193  //
   194  // The slice passed to f is only valid while f is active; if f wishes to store
   195  // a block for later use, it must be copied.
   196  func (s *Splitter) Split(f func(data []byte) error) error {
   197  	for {
   198  		block, err := s.Next()
   199  		if err == io.EOF {
   200  			return nil
   201  		} else if err != nil {
   202  			return err
   203  		} else if err := f(block); err != nil {
   204  			return err
   205  		}
   206  	}
   207  }
   208  
   209  /*
   210   Implementation notes:
   211  
   212   The Splitter maintains a buffer big enough to hold a full maximum-length block
   213   of data.  The buffer is organized as follows:
   214  
   215      0                                                          len(buf)
   216     |abcdefghijklmnopqrs----------------------------------------|
   217              ^end       ^next
   218  
   219   All the bytes in buf[:end] belong to the previous block. If end > 0, the first
   220   step is to shift out those old bytes. Note that in doing so, we invalidate the
   221   previous buffer reported to the caller, if any:
   222  
   223     |ijklmnopqrs------------------------------------------------|
   224      ^end       ^next
   225  
   226   Now, if next < len(buf), try to fill the buffer with new data:
   227  
   228     |ijklmnopqrsAAAAAAAAAAAAAAAAAAAAAAAAAAA---------------------|
   229      ^end                                  ^next
   230  
   231   Now we scan forward from i = end until we reach next or find a block boundary.
   232   For a position to count as a block boundary, it must be on a hash cut at least
   233   minBytes greater than end; or, it must be at the maximum block size.
   234  
   235     |ijklmnopqrsAAAAAAAAAA*AAAAAAAAAAAAAAAA---------------------|
   236      ^end                 ^i               ^next
   237  
 There are now three possibilities to consider:
   239  
   240    (a) If i is at a hash cut at least min greater than end:
   241        This is a normal block, which we must return.
   242    (b) If i == len(buf):
   243        This is a long block, capped by the max block size, which we must return.
   244    (c) If i == next, i > end, and input is at EOF:
   245        This is a non-empty tail block, which we must return.
   246  
   247   If none of (a)-(c) apply, it means we have not seen a block boundary and have
   248   space left in the buffer. If the input is not exhausted, we go back and try to
   249   read another chunk from the input; otherwise we report EOF.
   250  
   251   If we do have a block to return, its data are in buf[0:i]. We update end to i,
   252   to mark the end of the block for the next call.
   253  
   254     [*********************]<< returned block
   255     |ijklmnopqrsAAAAAAAAAA*AAAAAAAAAAAAAAAA---------------------|
   256                           ^end             ^next
   257                           ^i
   258  
   259   At this point, the buffer is in a clean state for the next iteration.
   260  */