// Copyright 2019 Michael J. Fromberger. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package block implements content-sensitive partitioning of a stream of byte
// data into blocks, using a rolling hash function.
//
// The algorithm used to split data into blocks is based on the one from LBFS:
//
//	http://pdos.csail.mit.edu/lbfs/
//
// As described in the SOSP 2001 paper "A Low-Bandwidth Network File System":
//
//	https://pdos.csail.mit.edu/papers/lbfs:sosp01/lbfs.pdf
//
// This package provides an implementation of the Rabin-Karp modular rolling
// hash algorithm; other algorithms can be plugged in by implementing the
// Hasher and Hash interfaces.
package block

// TODO(Sep 2021): The LBFS paper seems to be inaccessible from MIT.
// There's a presentation about it here: http://www.scs.stanford.edu/nyu/02fa/notes/l15.pdf

import (
	"bufio"
	"io"
)

// These values are the defaults used if none are specified in the config.
const (
	// DefaultMin is the default minimum block size, in bytes.
	DefaultMin = 2048

	// DefaultSize is the default target block size, in bytes.
	DefaultSize = 16384

	// DefaultMax is the default maximum block size, in bytes.
	DefaultMax = 65536
)

// DefaultHasher is used by a Splitter if no hasher is set in its config.
52 var DefaultHasher = RabinKarpHasher(1031, 2147483659, 48) 53 54 // A SplitConfig contains the settings to construct a splitter. 55 type SplitConfig struct { 56 // The rolling hash to use. If nil, uses DefaultHasher. 57 Hasher 58 59 // Minimum block size, in bytes. The splitter will not split a block until 60 // it is at least this size. 61 Min int 62 63 // Desired block size, in bytes. The splitter will attempt to generate 64 // blocks of approximately this average size. 65 Size int 66 67 // Maximum block size, in bytes. The splitter will split any block that 68 // exceeds this size, even if the rolling hash does not find a break. 69 Max int 70 } 71 72 // Hash implements the Hasher interface for a SplitConfig. 73 func (c *SplitConfig) Hash() Hash { 74 if c == nil || c.Hasher == nil { 75 return DefaultHasher.Hash() 76 } 77 return c.Hasher.Hash() 78 } 79 80 func (c *SplitConfig) min() int { 81 if c == nil || c.Min <= 0 { 82 return DefaultMin 83 } 84 return c.Min 85 } 86 87 func (c *SplitConfig) size() int { 88 if c == nil || c.Size <= 0 { 89 return DefaultSize 90 } 91 return c.Size 92 } 93 94 func (c *SplitConfig) max() int { 95 if c == nil || c.Max <= 0 { 96 return DefaultMax 97 } 98 return c.Max 99 } 100 101 // NewSplitter constructs a Splitter that reads its data from r and partitions 102 // it into blocks using the rolling hash from c. A nil *SplitConfig is ready 103 // for use with default sizes and hash settings. 104 func NewSplitter(r io.Reader, c *SplitConfig) *Splitter { 105 var buf *bufio.Reader 106 if v, ok := r.(*bufio.Reader); ok { 107 buf = v 108 } else { 109 buf = bufio.NewReaderSize(r, c.max()) 110 } 111 return &Splitter{ 112 reader: buf, 113 config: c, 114 115 hash: c.Hash(), 116 min: c.min(), 117 exp: c.size(), 118 buf: make([]byte, c.max()), 119 } 120 } 121 122 // A Splitter wraps an underlying io.Reader to split the data from the reader 123 // into blocks using a rolling hash. 
type Splitter struct {
	reader *bufio.Reader // The underlying source of block data.
	config *SplitConfig  // a saved copy of the config

	hash Hash   // The rolling hash used to find breakpoints.
	min  int    // Minimum block size in bytes.
	exp  int    // Expected block size in bytes.
	next int    // Next unused offset in buf.
	end  int    // End of previous block. len(buf) is the maximum block size.
	buf  []byte // Incoming data buffer.
}

// Config returns the SplitConfig used to construct s, which may be nil.
func (s *Splitter) Config() *SplitConfig { return s.config }

// Next returns the next available block, or an error. The slice returned is
// only valid until a subsequent call of Next. Returns nil, io.EOF when no
// further blocks are available.
func (s *Splitter) Next() ([]byte, error) {
	// Shift out the previous block, if any. This invalidates any previous
	// slice returned by this method, as the data have moved.
	if s.end > 0 {
		copy(s.buf, s.buf[s.end:])
		s.next -= s.end
		s.end = 0
	}

	i := s.end // The position of the next potential block boundary
	for {
		// Try to read more data into the buffer. An EOF at this point is not
		// an error, since there may be data left in the buffer from earlier.
		//
		// NOTE(review): if Read returns nr > 0 together with a non-EOF error,
		// the early return below discards those nr bytes. The io.Reader
		// contract says to process the bytes before the error — confirm that
		// dropping buffered data on a hard read error is acceptable here.
		nr, err := s.reader.Read(s.buf[s.next:])
		if err != nil && err != io.EOF {
			return nil, err
		}
		s.next += nr

		// Look for a block boundary: A point where the hash value goes to 1
		// modulo the desired block size, or we run out of buffered data. A
		// candidate boundary only counts once the block is at least min bytes
		// long (i-s.end >= s.min).
		isCut := false
		for ; i < s.next; i++ {
			u := s.hash.Update(s.buf[i])
			isCut = u%uint64(s.exp) == 1 && i-s.end >= s.min
			if isCut {
				break
			}
		}

		// If we found a block cut, or have reached the maximum block size, or
		// there is no input left, update state and return the block.
		//
		// The cut byte at position i is excluded from the returned block and
		// becomes the first byte of the next block. NOTE(review): that byte
		// has already been fed to s.hash here, and the next call's scan feeds
		// it to s.hash again from position 0 — presumably harmless for a
		// windowed rolling hash once the window refills, but worth confirming.
		if isCut || i >= len(s.buf) || (i > s.end && err == io.EOF) {
			block := s.buf[s.end:i]
			s.end = i
			return block, nil
		}

		// We didn't find a cut, and there's room for more data in the buffer.
		// If there's still something left to read, go back for another chunk.
		if err == io.EOF {
			break
		}
	}
	// No more blocks available, end of input.
	return nil, io.EOF
}

// Split splits blocks from s and passes each block in sequence to f, until
// there are no further blocks or until f returns an error. If f returns an
// error, processing stops and that error is returned to the caller of Split.
//
// The slice passed to f is only valid while f is active; if f wishes to store
// a block for later use, it must be copied.
func (s *Splitter) Split(f func(data []byte) error) error {
	for {
		block, err := s.Next()
		if err == io.EOF {
			return nil // end of input: success
		} else if err != nil {
			return err
		} else if err := f(block); err != nil {
			return err
		}
	}
}

/*
Implementation notes:

The Splitter maintains a buffer big enough to hold a full maximum-length block
of data. The buffer is organized as follows:

	0                                                            len(buf)
	|abcdefghijklmnopqrs----------------------------------------|
	        ^end       ^next

All the bytes in buf[:end] belong to the previous block. If end > 0, the first
step is to shift out those old bytes. Note that in doing so, we invalidate the
previous buffer reported to the caller, if any:

	|ijklmnopqrs------------------------------------------------|
	^end       ^next

Now, if next < len(buf), try to fill the buffer with new data:

	|ijklmnopqrsAAAAAAAAAAAAAAAAAAAAAAAAAAA---------------------|
	^end                                  ^next

Now we scan forward from i = end until we reach next or find a block boundary.
For a position to count as a block boundary, it must be on a hash cut at least
min bytes greater than end; or, it must be at the maximum block size.

	|ijklmnopqrsAAAAAAAAAA*AAAAAAAAAAAAAAAA---------------------|
	^end                  ^i              ^next

There are now three possibilities to consider:

(a) If i is at a hash cut at least min greater than end:
    This is a normal block, which we must return.
(b) If i == len(buf):
    This is a long block, capped by the max block size, which we must return.
(c) If i == next, i > end, and input is at EOF:
    This is a non-empty tail block, which we must return.

If none of (a)-(c) apply, it means we have not seen a block boundary and have
space left in the buffer. If the input is not exhausted, we go back and try to
read another chunk from the input; otherwise we report EOF.

If we do have a block to return, its data are in buf[0:i]. We update end to i,
to mark the end of the block for the next call.

	 [*********************]<< returned block
	|ijklmnopqrsAAAAAAAAAA*AAAAAAAAAAAAAAAA---------------------|
	                      ^end            ^next
	                      ^i

At this point, the buffer is in a clean state for the next iteration.
*/