github.com/ndau/noms@v1.0.5/go/sloppy/sloppy.go (about) 1 // Copyright 2017 Attic Labs, Inc. All rights reserved. 2 // Licensed under the Apache License, version 2.0: 3 // http://www.apache.org/licenses/LICENSE-2.0 4 5 package sloppy 6 7 import ( 8 "github.com/ndau/noms/go/d" 9 ) 10 11 const ( 12 maxOffsetPOT = uint16(12) 13 maxTableSize = 1 << 14 14 maxLength = 1<<12 - 1 15 tableMask = maxTableSize - 1 16 shift = uint32(20) 17 ) 18 19 // TODO: Make this configurable 20 var maxOffset = int(1<<maxOffsetPOT - 1) 21 22 // Sloppy is a logical variant of Snappy. Its purpose to provide a kind of 23 // estimate of how a given byte sequence *will be* compressed by Snappy. It is 24 // useful when a byte stream is fed into a rolling hash with the goal of 25 // achieving a given average chunk byte length *after compression*. Sloppy is 26 // logically similar to snappy, but prefers "copies" which are closer to the 27 // repeated byte sequence (snappy prefers to refer to the *first* instance of a 28 // repeated byte sequence). This is important for mitigating the likelihood that 29 // altering any byte in an input stream will cause chunk boundaries to be 30 // redrawn downstream. 31 // 32 // The high-level approach is to maintain a logical mapping between four-byte 33 // sequences which have been observed in the stream so-far and the integer 34 // offset of observed sequence (the mapping is done with a "cheap" hash-function 35 // which permits false-positives because they can be trivial filtered out). In 36 // the non-matched state, for each new byte consumed, a uint32 is computed from 37 // the next 4 bytes and then a look-up is performed to check for a matching 4 38 // bytes earlier in the stream. Snappy and sloppy behave roughly identical thus 39 // far. 40 // 41 // When in the "matched state" (attempting to extend the current match), Snappy 42 // does not re-index new 4-byte sequences, but Sloppy does. The reason for this 43 // is that Sloppy would like match the most recent occurence as it moves 44 // forward. 45 // 46 // Lastly, Sloppy adds two novel heuritics, both aimed at further mitigating 47 // the chance of chunk boundaries being redrawn because of byte value changes: 48 // 49 // 1) During the first 2 bytes of match, it *continues* to look for closer 50 // matches (effectively prefering a closer but shorter copy to a further but 51 // longer one). The reason for this is that when sequences repeat frequently in 52 // a byte stream, randomness provides for a good chance that a one or two byte 53 // prefix on a repeated sequence will match "far away". E.g. 54 // 55 // "23hello my friend, 12hello my friend, 01hello my friend, 23hello my friend" 56 // 57 // In the above sequence, sloppy would prefer to copy the final 58 // "hello my friend" 19 bytes backwards rather than "23hello my friend" quite a 59 // bit further. 60 // 61 // 2) Sloppy will only emit copies which are "worth it". I.e. The longer the 62 // reference back, the longer the length of the copy must be. 63 type Sloppy struct { 64 enc encoder 65 idx int 66 matching bool 67 matchOffset, matchLength int 68 table [maxTableSize]uint32 69 } 70 71 // New returns a new sloppy encoder which will encode to |f|. If |f| ever 72 // returns false, then encoding ends immediately. |f| is a callback because 73 // the primary use is that the "encoded" byte stream is fed byte-by-byte 74 // into a rolling hash function. 75 func New(f func(b byte) bool) *Sloppy { 76 return &Sloppy{ 77 binaryEncoder{f}, 78 0, 79 false, 80 0, 0, 81 [maxTableSize]uint32{}, 82 } 83 } 84 85 // Update continues the encoding of a given input stream. The caller is expected 86 // to call update after having (ONLY) appended bytes to |src|. When |Update| 87 // returns, sloppy will have emitted 0 or more literals or copies by calling 88 // the |sf.f|. Note that sloppy will ALWAYS buffer the final three bytes of 89 // input. 90 func (sl *Sloppy) Update(src []byte) { 91 // Only consume up to the point that a "look-ahead" can include 4 bytes. 92 for ; sl.idx < len(src)-3; sl.idx++ { 93 nextHash := fbhash(load32(src, sl.idx)) 94 95 if sl.matching && (sl.matchLength > maxLength || src[sl.idx] != src[sl.matchOffset+sl.matchLength]) { 96 // End Match 97 if sl.maybeCopy(src) { 98 return // terminate if consumer has "closed" 99 } 100 } 101 102 // Look for a match if we are beyond the first byte AND either there is no 103 // match yet, OR we are matching, but fewer than 3 bytes have been 104 // matched. The later condition allows for giving up to 2 bytes of a copy 105 // in order to reference a "closer" sequence. Empirical tests on 106 // structured data, suggests this reduces the average offset by ~2/3. 107 if sl.idx > 0 && (!sl.matching || sl.matchLength < 3) { 108 matchPos := int(sl.table[nextHash&tableMask]) 109 110 if sl.idx > matchPos && 111 src[sl.idx] == src[matchPos] && // filter false positives 112 sl.idx-matchPos <= maxOffset && // don't refer back beyond maxOffset 113 (!sl.matching || matchPos >= sl.matchOffset+4) { // if we are "rematching", ensure the new match is at least 4 bytes closer 114 115 if sl.matching { 116 // We are dropping an existing match for a closer one. Emit the 117 // matched bytes as literals 118 if sl.dontCopy(src, sl.idx-sl.matchLength, sl.idx) { 119 return // terminate if consumer has "closed" 120 } 121 } 122 123 // Begin a new match 124 sl.matching = true 125 sl.matchOffset = matchPos 126 sl.matchLength = 0 127 } 128 } 129 130 // Store new hashed offset 131 sl.table[nextHash&tableMask] = uint32(sl.idx) 132 133 if sl.matching { 134 sl.matchLength++ 135 } else { 136 if sl.enc.emitLiteral(src[sl.idx]) { 137 return // terminate if consumer has "closed" 138 } 139 } 140 } 141 } 142 143 func (sl *Sloppy) Reset() { 144 sl.idx = 0 145 sl.matching = false 146 sl.matchOffset = 0 147 sl.matchLength = 0 148 sl.table = [maxTableSize]uint32{} 149 } 150 151 // len >= 2^(2 + log2(maxOffset) - log2(maxOffset-off)). IOW, for the first 1/2 152 // of the maxOffset, a copy must be >= 4. For 1/2 of what remains, a copy must 153 // be >= 8, etc... 154 func copyLongEnough(off, len uint16) bool { 155 d.PanicIfTrue(off == 0) 156 157 p := uint16(0) 158 x := (1 << maxOffsetPOT) - off 159 for x > 0 { 160 x = x >> 1 161 p++ 162 } 163 164 i := maxOffsetPOT - p 165 min := 4 166 for i > 0 { 167 min = min << 1 168 i-- 169 } 170 171 return int(len) >= min 172 } 173 174 // Emit matches bytes as literals. 175 func (sl *Sloppy) dontCopy(src []byte, from, to int) bool { 176 for ; from < to; from++ { 177 if sl.enc.emitLiteral(src[from]) { 178 return true 179 } 180 } 181 return false 182 } 183 184 // Emit a copy if the length is sufficient for a given offset 185 func (sl *Sloppy) maybeCopy(src []byte) bool { 186 off, len := uint16(sl.idx-(sl.matchOffset+sl.matchLength)), uint16(sl.matchLength) 187 sl.matching = false 188 sl.matchOffset = 0 189 sl.matchLength = 0 190 191 if !copyLongEnough(off, len) { 192 return sl.dontCopy(src, sl.idx-int(len), sl.idx) 193 } 194 195 return sl.enc.emitCopy(off, len) 196 } 197 198 type encoder interface { 199 emitLiteral(b byte) bool 200 emitCopy(offset, length uint16) bool 201 } 202 203 type binaryEncoder struct { 204 f func(b byte) bool 205 } 206 207 func (be binaryEncoder) emitLiteral(b byte) bool { 208 return be.f(b) 209 } 210 211 func (be binaryEncoder) emitCopy(offset, length uint16) bool { 212 // all copies are encoded as 3 bytes. 213 // 12 bits for offset and 12 bits for length 214 215 // 8 MSBits of offset 216 if be.f(byte(offset >> 4)) { 217 return true 218 } 219 220 // 4 LSBits offset | 4 MSBits length 221 if be.f(byte(offset<<4) | byte(length>>4)) { 222 return true 223 } 224 225 // 8 LSBits of length 226 if be.f(byte(length)) { 227 return true 228 } 229 230 return false 231 } 232 233 func fbhash(u uint32) uint32 { 234 return (u * 0x1e35a7bd) >> shift 235 } 236 237 func load32(b []byte, i int) uint32 { 238 b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. 239 return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 240 }