// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
//  Copyright 2017 Attic Labs, Inc. All rights reserved.
//  Licensed under the Apache License, version 2.0:
//  http://www.apache.org/licenses/LICENSE-2.0

package sloppy

import (
	"github.com/dolthub/dolt/go/store/d"
)

const (
	// maxOffsetPOT is the power-of-two bounding how far back a copy may
	// refer; copies are encoded with 12 bits of offset (see emitCopy).
	maxOffsetPOT = uint16(12)
	// maxTableSize is the number of entries in the hash table mapping
	// 4-byte sequences to the offset at which they were most recently seen.
	maxTableSize = 1 << 14
	// maxLength is the largest copy length that fits in the 12 bits the
	// binary encoding reserves for length (see emitCopy).
	maxLength = 1<<12 - 1
	// tableMask folds a hash into a valid table index (maxTableSize is a
	// power of two).
	tableMask = maxTableSize - 1
	// shift reduces the 32-bit multiplicative hash to table-index width;
	// see fbhash.
	shift = uint32(20)
)

// maxOffset is the furthest back (in bytes) a copy may refer.
// TODO: Make this configurable
var maxOffset = int(1<<maxOffsetPOT - 1)

// Sloppy is a logical variant of Snappy. Its purpose is to provide a kind of
// estimate of how a given byte sequence *will be* compressed by Snappy. It is
// useful when a byte stream is fed into a rolling hash with the goal of
// achieving a given average chunk byte length *after compression*. Sloppy is
// logically similar to Snappy, but prefers "copies" which are closer to the
// repeated byte sequence (Snappy prefers to refer to the *first* instance of a
// repeated byte sequence). This is important for mitigating the likelihood that
// altering any byte in an input stream will cause chunk boundaries to be
// redrawn downstream.
//
// The high-level approach is to maintain a logical mapping between four-byte
// sequences which have been observed in the stream so far and the integer
// offset of the observed sequence (the mapping is done with a "cheap"
// hash-function which permits false positives, because they can be trivially
// filtered out). In the non-matched state, for each new byte consumed, a
// uint32 is computed from the next 4 bytes and then a look-up is performed to
// check for a matching 4 bytes earlier in the stream. Snappy and Sloppy
// behave roughly identically thus far.
//
// When in the "matched state" (attempting to extend the current match), Snappy
// does not re-index new 4-byte sequences, but Sloppy does. The reason for this
// is that Sloppy would like to match the most recent occurrence as it moves
// forward.
//
// Lastly, Sloppy adds two novel heuristics, both aimed at further mitigating
// the chance of chunk boundaries being redrawn because of byte value changes:
//
// 1) During the first 2 bytes of a match, it *continues* to look for closer
// matches (effectively preferring a closer but shorter copy to a further but
// longer one). The reason for this is that when sequences repeat frequently in
// a byte stream, randomness provides for a good chance that a one- or two-byte
// prefix on a repeated sequence will match "far away". E.g.
//
// "23hello my friend, 12hello my friend, 01hello my friend, 23hello my friend"
//
// In the above sequence, Sloppy would prefer to copy the final
// "hello my friend" 19 bytes backwards rather than "23hello my friend" quite a
// bit further.
//
// 2) Sloppy will only emit copies which are "worth it". I.e. the longer the
// reference back, the longer the length of the copy must be.
type Sloppy struct {
	// enc receives the emitted literals and copies.
	enc encoder
	// idx is the number of bytes of |src| consumed so far; Update resumes
	// from here on each call.
	idx int
	// matching is true while a copy is being extended.
	matching bool
	// matchOffset is the start of the earlier occurrence being matched;
	// matchLength is how many bytes of it have matched so far.
	matchOffset, matchLength int
	// table maps fbhash(4-byte sequence) -> most recent offset at which
	// that sequence was seen. False positives are filtered by comparing
	// actual bytes before use.
	table [maxTableSize]uint32
}

// New returns a new sloppy encoder which will encode to |f|. If |f| ever
// returns false, then encoding ends immediately. |f| is a callback because
// the primary use is that the "encoded" byte stream is fed byte-by-byte
// into a rolling hash function.
func New(f func(b byte) bool) *Sloppy {
	return &Sloppy{
		binaryEncoder{f},
		0,
		false,
		0, 0,
		[maxTableSize]uint32{},
	}
}

// Update continues the encoding of a given input stream. The caller is expected
// to call update after having (ONLY) appended bytes to |src|. When |Update|
// returns, sloppy will have emitted 0 or more literals or copies by calling
// the |sf.f|. Note that sloppy will ALWAYS buffer the final three bytes of
// input.
func (sl *Sloppy) Update(src []byte) {
	// Only consume up to the point that a "look-ahead" can include 4 bytes.
	for ; sl.idx < len(src)-3; sl.idx++ {
		nextHash := fbhash(load32(src, sl.idx))

		// End the current match when the next byte no longer extends it, or
		// when the copy has exceeded the maximum encodable length.
		//
		// NOTE(review): the length guard fires only once matchLength has
		// already reached maxLength+1 (4096), which no longer fits in the 12
		// bits binaryEncoder.emitCopy reserves for length (it encodes as 0).
		// Confirm whether that off-by-one is intended; changing it would
		// redraw existing chunk boundaries.
		if sl.matching && (sl.matchLength > maxLength || src[sl.idx] != src[sl.matchOffset+sl.matchLength]) {
			// End Match
			if sl.maybeCopy(src) {
				return // terminate if consumer has "closed"
			}
		}

		// Look for a match if we are beyond the first byte AND either there is no
		// match yet, OR we are matching, but fewer than 3 bytes have been
		// matched. The later condition allows for giving up to 2 bytes of a copy
		// in order to reference a "closer" sequence. Empirical tests on
		// structured data, suggests this reduces the average offset by ~2/3.
		if sl.idx > 0 && (!sl.matching || sl.matchLength < 3) {
			matchPos := int(sl.table[nextHash&tableMask])

			if sl.idx > matchPos &&
				src[sl.idx] == src[matchPos] && // filter false positives
				sl.idx-matchPos <= maxOffset && // don't refer back beyond maxOffset
				(!sl.matching || matchPos >= sl.matchOffset+4) { // if we are "rematching", ensure the new match is at least 4 bytes closer

				if sl.matching {
					// We are dropping an existing match for a closer one. Emit the
					// matched bytes as literals
					if sl.dontCopy(src, sl.idx-sl.matchLength, sl.idx) {
						return // terminate if consumer has "closed"
					}
				}

				// Begin a new match
				sl.matching = true
				sl.matchOffset = matchPos
				sl.matchLength = 0
			}
		}

		// Store new hashed offset. Unlike Snappy, this also happens while
		// matching, so later matches prefer the most recent occurrence.
		sl.table[nextHash&tableMask] = uint32(sl.idx)

		if sl.matching {
			sl.matchLength++
		} else {
			if sl.enc.emitLiteral(src[sl.idx]) {
				return // terminate if consumer has "closed"
			}
		}
	}
}

// Reset returns the encoder to its initial state so a new stream can be
// consumed. The output callback held by |enc| is retained.
func (sl *Sloppy) Reset() {
	sl.idx = 0
	sl.matching = false
	sl.matchOffset = 0
	sl.matchLength = 0
	sl.table = [maxTableSize]uint32{}
}

// len >= 2^(2 + log2(maxOffset) - log2(maxOffset-off)). IOW, for the first 1/2
// of the maxOffset, a copy must be >= 4. For 1/2 of what remains, a copy must
// be >= 8, etc...
//
// The first loop computes p = bit-length of (2^maxOffsetPOT - off); the second
// computes min = 4 << (maxOffsetPOT - p).
//
// NOTE(review): this hard-codes 1<<maxOffsetPOT while maxOffset itself is a
// package var slated to become configurable (see TODO above); the two would
// silently diverge if maxOffset were changed at runtime.
func copyLongEnough(off, len uint16) bool {
	d.PanicIfTrue(off == 0)

	p := uint16(0)
	x := (1 << maxOffsetPOT) - off
	for x > 0 {
		x = x >> 1
		p++
	}

	i := maxOffsetPOT - p
	min := 4
	for i > 0 {
		min = min << 1
		i--
	}

	return int(len) >= min
}

// Emits the matched bytes as literals.
192 func (sl *Sloppy) dontCopy(src []byte, from, to int) bool { 193 for ; from < to; from++ { 194 if sl.enc.emitLiteral(src[from]) { 195 return true 196 } 197 } 198 return false 199 } 200 201 // Emit a copy if the length is sufficient for a given offset 202 func (sl *Sloppy) maybeCopy(src []byte) bool { 203 off, len := uint16(sl.idx-(sl.matchOffset+sl.matchLength)), uint16(sl.matchLength) 204 sl.matching = false 205 sl.matchOffset = 0 206 sl.matchLength = 0 207 208 if !copyLongEnough(off, len) { 209 return sl.dontCopy(src, sl.idx-int(len), sl.idx) 210 } 211 212 return sl.enc.emitCopy(off, len) 213 } 214 215 type encoder interface { 216 emitLiteral(b byte) bool 217 emitCopy(offset, length uint16) bool 218 } 219 220 type binaryEncoder struct { 221 f func(b byte) bool 222 } 223 224 func (be binaryEncoder) emitLiteral(b byte) bool { 225 return be.f(b) 226 } 227 228 func (be binaryEncoder) emitCopy(offset, length uint16) bool { 229 // all copies are encoded as 3 bytes. 230 // 12 bits for offset and 12 bits for length 231 232 // 8 MSBits of offset 233 if be.f(byte(offset >> 4)) { 234 return true 235 } 236 237 // 4 LSBits offset | 4 MSBits length 238 if be.f(byte(offset<<4) | byte(length>>4)) { 239 return true 240 } 241 242 // 8 LSBits of length 243 if be.f(byte(length)) { 244 return true 245 } 246 247 return false 248 } 249 250 func fbhash(u uint32) uint32 { 251 return (u * 0x1e35a7bd) >> shift 252 } 253 254 func load32(b []byte, i int) uint32 { 255 b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. 256 return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 257 }