github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/tree/node_splitter.go

// Copyright 2021 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
//     Copyright 2016 Attic Labs, Inc. All rights reserved.
//     Licensed under the Apache License, version 2.0:
//     http://www.apache.org/licenses/LICENSE-2.0

package tree

import (
	"crypto/sha512"
	"encoding/binary"
	"math"

	"github.com/kch42/buzhash"
	"github.com/zeebo/xxh3"
)

const (
	minChunkSize = 1 << 9
	maxChunkSize = 1 << 14
)

var levelSalt = [...]uint64{
	saltFromLevel(1),
	saltFromLevel(2),
	saltFromLevel(3),
	saltFromLevel(4),
	saltFromLevel(5),
	saltFromLevel(6),
	saltFromLevel(7),
	saltFromLevel(8),
	saltFromLevel(9),
	saltFromLevel(10),
	saltFromLevel(11),
	saltFromLevel(12),
	saltFromLevel(13),
	saltFromLevel(14),
	saltFromLevel(15),
}

// splitterFactory makes a nodeSplitter.
type splitterFactory func(level uint8) nodeSplitter

var defaultSplitterFactory splitterFactory = newKeySplitter

// nodeSplitter decides where Item streams should be split into chunks.
type nodeSplitter interface {
	// Append provides more nodeItems to the splitter. Splitters make chunk
	// boundary decisions based on the Item contents. Upon return, callers
	// can use CrossedBoundary() to see if a chunk boundary has been crossed.
	Append(key, values Item) error

	// CrossedBoundary returns true if the provided nodeItems have caused a chunk
	// boundary to be crossed.
	CrossedBoundary() bool

	// Reset resets the state of the splitter.
	Reset()
}
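// The following is an illustrative sketch, not part of the original file: it
// shows how a caller might drive a nodeSplitter, appending key/value Items
// until a boundary is crossed, then flushing a chunk and resetting. The
// chunkItems helper and its writeChunk callback are hypothetical stand-ins;
// the real tree builder in this package is more involved. Assumes
// len(keys) == len(values).
func chunkItems(sp nodeSplitter, keys, values []Item, writeChunk func(n int)) error {
	pending := 0
	for i := range keys {
		if err := sp.Append(keys[i], values[i]); err != nil {
			return err
		}
		pending++
		if sp.CrossedBoundary() {
			// flush the pending Items as one chunk, then start a new one
			writeChunk(pending)
			pending = 0
			sp.Reset()
		}
	}
	if pending > 0 {
		writeChunk(pending) // final partial chunk
	}
	return nil
}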
// rollingHashSplitter is a nodeSplitter that makes chunk boundary decisions using
// a rolling value hasher that processes Item pairs in a byte-wise fashion.
//
// rollingHashSplitter uses a dynamic hash pattern designed to constrain the chunk
// size distribution by reducing the likelihood of forming very large or very small
// chunks. As the size of the current chunk grows, rollingHashSplitter changes the
// target pattern to make it easier to match. The result is a chunk size distribution
// that is closer to a binomial distribution, rather than geometric.
type rollingHashSplitter struct {
	bz     *buzhash.BuzHash
	offset uint32
	window uint32
	salt   byte

	crossedBoundary bool
}

const (
	// The window size to use for computing the rolling hash. This is way more than necessary assuming random data
	// (two bytes would be sufficient with a target chunk size of 4k). The benefit of a larger window is that it allows
	// for better distribution on input with lower entropy. At a target chunk size of 4k, any given byte changing
	// has roughly a 1.5% chance of affecting an existing boundary, which seems like an acceptable trade-off. The
	// choice of a prime number provides better distribution for repeating input.
	rollingHashWindow = uint32(67)
)

var _ nodeSplitter = &rollingHashSplitter{}

func newRollingHashSplitter(salt uint8) nodeSplitter {
	return &rollingHashSplitter{
		bz:     buzhash.NewBuzHash(rollingHashWindow),
		window: rollingHashWindow,
		salt:   byte(salt),
	}
}

var _ splitterFactory = newRollingHashSplitter

// Append implements NodeSplitter
func (sns *rollingHashSplitter) Append(key, value Item) (err error) {
	for _, byt := range key {
		_ = sns.hashByte(byt)
	}
	for _, byt := range value {
		_ = sns.hashByte(byt)
	}
	return nil
}

func (sns *rollingHashSplitter) hashByte(b byte) bool {
	sns.offset++

	if sns.crossedBoundary {
		return true
	}

	sns.bz.HashByte(b ^ sns.salt)

	// enforce minimum and maximum chunk sizes
	if sns.offset < minChunkSize {
		return true
	}
	if sns.offset > maxChunkSize {
		sns.crossedBoundary = true
		return true
	}

	// declare a boundary when the low bits of the rolling hash match the
	// dynamic pattern for the current offset
	hash := sns.bz.Sum32()
	patt := rollingHashPattern(sns.offset)
	sns.crossedBoundary = hash&patt == patt

	return sns.crossedBoundary
}

// CrossedBoundary implements NodeSplitter
func (sns *rollingHashSplitter) CrossedBoundary() bool {
	return sns.crossedBoundary
}

// Reset implements NodeSplitter
func (sns *rollingHashSplitter) Reset() {
	sns.crossedBoundary = false
	sns.offset = 0
	sns.bz = buzhash.NewBuzHash(sns.window)
}

// rollingHashPattern returns the target bit pattern for a chunk of |offset|
// bytes: one low bit is dropped for every 1KB of chunk growth, making the
// pattern progressively easier to match as the chunk gets larger.
func rollingHashPattern(offset uint32) uint32 {
	shift := 15 - (offset >> 10)
	return 1<<shift - 1
}
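// Illustrative sketch, not part of the original file: boundaryProbability
// estimates the per-byte chance that rollingHashSplitter declares a boundary
// at |offset|, assuming the buzhash output behaves like a uniform random
// uint32. A pattern with n low bits set matches with probability 2^-n, so
// the probability doubles with every 1KB of chunk growth until a boundary
// is (effectively) forced at the maximum chunk size.
func boundaryProbability(offset uint32) float64 {
	if offset < minChunkSize {
		return 0 // hashByte never tests the pattern below the minimum size
	}
	if offset >= maxChunkSize {
		return 1 // approximation: a boundary is forced once maxChunkSize is exceeded
	}
	bits := 15 - (offset >> 10) // matches rollingHashPattern
	return 1 / float64(uint32(1)<<bits)
}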
// keySplitter is a nodeSplitter that makes chunk boundary decisions on the hash of
// the key of an Item pair. In contrast to the rollingHashSplitter, keySplitter
// tries to create chunks that have an average number of Item pairs, rather than
// an average number of bytes. However, because the target number of Item pairs
// is computed directly from the chunk size and count, the practical difference in
// the distribution of chunk sizes is minimal.
//
// keySplitter uses a dynamic threshold modeled on a Weibull distribution
// (https://en.wikipedia.org/wiki/Weibull_distribution). As the size of the current
// chunk increases, it becomes easier to pass the threshold, reducing the likelihood
// of forming very large or very small chunks.
type keySplitter struct {
	count, size     uint32
	crossedBoundary bool

	salt uint64
}

func newKeySplitter(level uint8) nodeSplitter {
	return &keySplitter{
		salt: levelSalt[level],
	}
}

var _ splitterFactory = newKeySplitter

func (ks *keySplitter) Append(key, value Item) error {
	thisSize := uint32(len(key) + len(value))
	ks.size += thisSize

	if ks.size < minChunkSize {
		return nil
	}
	if ks.size > maxChunkSize {
		ks.crossedBoundary = true
		return nil
	}

	h := xxHash32(key, ks.salt)
	ks.crossedBoundary = weibullCheck(ks.size, thisSize, h)
	return nil
}

func (ks *keySplitter) CrossedBoundary() bool {
	return ks.crossedBoundary
}

func (ks *keySplitter) Reset() {
	ks.size = 0
	ks.crossedBoundary = false
}

const (
	targetSize float64 = 4096
	maxUint32  float64 = math.MaxUint32

	// weibull params
	K = 4.

	// TODO: seems like this should be targetSize / math.Gamma(1 + 1/K).
	L = targetSize
)

// weibullCheck returns true if we should split
// at |hash| for a given record inserted into a
// chunk of size |size|, where the record's size
// is |thisSize|. |size| is the size of the chunk
// after the record is inserted, so it includes
// |thisSize|.
//
// weibullCheck attempts to form chunks whose
// sizes match the Weibull distribution.
//
// The logic is as follows: given that we haven't
// split on any of the records up to |size - thisSize|,
// the probability that we should split on this record
// is (CDF(end) - CDF(start)) / (1 - CDF(start)), or,
// the percentage of the remaining portion of the CDF
// that this record actually covers. We split if |hash|,
// treated as a uniform random number in [0, 1),
// is less than this percentage.
func weibullCheck(size, thisSize, hash uint32) bool {
	startx := float64(size - thisSize)
	start := -math.Expm1(-math.Pow(startx/L, K))

	endx := float64(size)
	end := -math.Expm1(-math.Pow(endx/L, K))

	p := float64(hash) / maxUint32
	d := 1 - start
	if d <= 0 {
		return true
	}
	target := (end - start) / d
	return p < target
}

func xxHash32(b []byte, salt uint64) uint32 {
	return uint32(xxh3.HashSeed(b, salt))
}

func saltFromLevel(level uint8) (salt uint64) {
	full := sha512.Sum512([]byte{level})
	return binary.LittleEndian.Uint64(full[:8])
}
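// Illustrative sketch, not part of the original file: weibullCheckTarget
// computes the same split probability as weibullCheck, without the final
// hash comparison (and without the d <= 0 guard), to make the numbers
// concrete. With L = targetSize = 4096 and K = 4, a chunk reaching
// size = 4096 via a 64-byte record gives:
//
//	start  = CDF(4032) ≈ 0.609
//	end    = CDF(4096) ≈ 0.632
//	target = (end - start) / (1 - start) ≈ 0.059
//
// so a record landing exactly at the target size has roughly a 6% chance
// of ending the chunk, and the chance keeps rising as the chunk grows.
func weibullCheckTarget(size, thisSize uint32) float64 {
	start := -math.Expm1(-math.Pow(float64(size-thisSize)/L, K))
	end := -math.Expm1(-math.Pow(float64(size)/L, K))
	return (end - start) / (1 - start)
}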