github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/table/sstable/builder.go

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package sstable

import (
	"bytes"
	"encoding/binary"
	"math"
	"os"
	"reflect"
	"unsafe"

	"github.com/coocood/badger/fileutil"
	"github.com/coocood/badger/options"
	"github.com/coocood/badger/surf"
	"github.com/coocood/badger/y"
	"github.com/coocood/bbloom"
	"github.com/dgryski/go-farm"
	"golang.org/x/time/rate"
)

// entrySlice packs variable-length entries into a single flat buffer,
// recording the end offset of each entry so they can be sliced back out.
type entrySlice struct {
	data    []byte
	endOffs []uint32
}

func (es *entrySlice) append(entry []byte) {
	es.data = append(es.data, entry...)
	es.endOffs = append(es.endOffs, uint32(len(es.data)))
}

func (es *entrySlice) appendVal(val *y.ValueStruct) {
	es.data = val.EncodeTo(es.data)
	es.endOffs = append(es.endOffs, uint32(len(es.data)))
}

func (es *entrySlice) getLast() []byte {
	return es.getEntry(es.length() - 1)
}

func (es *entrySlice) getEntry(i int) []byte {
	var startOff uint32
	if i > 0 {
		startOff = es.endOffs[i-1]
	}
	endOff := es.endOffs[i]
	return es.data[startOff:endOff]
}

func (es *entrySlice) length() int {
	return len(es.endOffs)
}

func (es *entrySlice) size() int {
	return len(es.data) + 4*len(es.endOffs)
}

func (es *entrySlice) reset() {
	es.data = es.data[:0]
	es.endOffs = es.endOffs[:0]
}
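// exampleEntrySlice is an illustrative sketch, not part of the original file:
// it shows how entrySlice lays entries out in one backing buffer with a
// running end offset per entry, which is the same flat encoding the builder
// later writes for block entries.
func exampleEntrySlice() {
	var es entrySlice
	es.append([]byte("alpha")) // data = "alpha",     endOffs = [5]
	es.append([]byte("beta"))  // data = "alphabeta", endOffs = [5, 9]
	_ = es.getEntry(1)         // data[5:9] == "beta"
	_ = es.getLast()           // also "beta"
	_ = es.size()              // 9 bytes of data + 2*4 bytes of offsets = 17
}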
const headerSize = 4

// Builder is used in building a table.
type Builder struct {
	counter int // Number of keys written for the current block.

	idxFileName   string
	w             *fileutil.DirectWriter
	buf           []byte
	writtenLen    int
	rawWrittenLen int
	compression   options.CompressionType

	baseKeys entrySlice

	blockEndOffsets []uint32 // End offsets of every block.

	// End offsets of every entry within the current block being built.
	// The offsets are relative to the start of the block.
	entryEndOffsets []uint32

	smallest y.Key
	biggest  y.Key

	hashEntries []hashEntry
	bloomFpr    float64
	useGlobalTS bool
	opt         options.TableBuilderOptions
	useSuRF     bool

	surfKeys [][]byte
	surfVals [][]byte

	tmpKeys    entrySlice
	tmpVals    entrySlice
	tmpOldOffs []uint32

	singleKeyOldVers entrySlice
	oldBlock         []byte
}

// NewTableBuilder makes a new TableBuilder.
// If the limiter is nil, the write speed during table build will not be limited.
func NewTableBuilder(f *os.File, limiter *rate.Limiter, level int, opt options.TableBuilderOptions) *Builder {
	t := float64(opt.LevelSizeMultiplier)
	fprBase := math.Pow(t, 1/(t-1)) * opt.LogicalBloomFPR * (t - 1)
	levelFactor := math.Pow(t, float64(opt.MaxLevels-level))

	return &Builder{
		idxFileName: f.Name() + idxFileSuffix,
		w:           fileutil.NewDirectWriter(f, opt.WriteBufferSize, limiter),
		buf:         make([]byte, 0, 4*1024),
		hashEntries: make([]hashEntry, 0, 4*1024),
		bloomFpr:    fprBase / levelFactor,
		compression: opt.CompressionPerLevel[level],
		opt:         opt,
		useSuRF:     level >= opt.SuRFStartLevel,
		// Start with one byte so that a real offset is never 0; an oldOffset
		// of 0 therefore means the entry has no old versions.
		oldBlock: []byte{0},
	}
}
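// exampleBloomFprPerLevel is an illustrative sketch, not part of the original
// file: it spells out the per-level bloom false-positive budget computed in
// NewTableBuilder above. The FPR tightens by a factor of LevelSizeMultiplier
// for each level above the bottom, so the small, frequently probed upper
// levels get the most accurate filters.
func exampleBloomFprPerLevel(opt options.TableBuilderOptions) []float64 {
	t := float64(opt.LevelSizeMultiplier)
	fprBase := math.Pow(t, 1/(t-1)) * opt.LogicalBloomFPR * (t - 1)
	fprs := make([]float64, opt.MaxLevels+1)
	for level := 1; level <= opt.MaxLevels; level++ {
		// Same expression as NewTableBuilder: fprBase / levelFactor.
		fprs[level] = fprBase / math.Pow(t, float64(opt.MaxLevels-level))
	}
	return fprs
}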
// NewExternalTableBuilder makes a Builder for building external tables.
// The external builder doesn't append a version to keys, so the output SST
// relies on a non-zero global ts instead.
func NewExternalTableBuilder(f *os.File, limiter *rate.Limiter, opt options.TableBuilderOptions, compression options.CompressionType) *Builder {
	return &Builder{
		idxFileName: f.Name() + idxFileSuffix,
		w:           fileutil.NewDirectWriter(f, opt.WriteBufferSize, limiter),
		buf:         make([]byte, 0, 4*1024),
		hashEntries: make([]hashEntry, 0, 4*1024),
		bloomFpr:    opt.LogicalBloomFPR,
		useGlobalTS: true,
		compression: compression,
		opt:         opt,
	}
}

// Reset resets the builder to build a new table into the given file.
func (b *Builder) Reset(f *os.File) {
	b.resetBuffers()
	b.w.Reset(f)
	b.idxFileName = f.Name() + idxFileSuffix
}

// SetIsManaged should be called when ingesting a table into a managed DB.
func (b *Builder) SetIsManaged() {
	b.useGlobalTS = false
}

func (b *Builder) resetBuffers() {
	b.counter = 0
	b.buf = b.buf[:0]
	b.writtenLen = 0
	b.rawWrittenLen = 0
	b.baseKeys.reset()
	b.blockEndOffsets = b.blockEndOffsets[:0]
	b.entryEndOffsets = b.entryEndOffsets[:0]
	b.hashEntries = b.hashEntries[:0]
	b.surfKeys = nil
	b.surfVals = nil
	b.smallest.UserKey = b.smallest.UserKey[:0]
	b.biggest.UserKey = b.biggest.UserKey[:0]
	// Re-add the pad byte (see NewTableBuilder) so that a real old offset is
	// never 0 after a reset.
	b.oldBlock = append(b.oldBlock[:0], 0)
}

// Close closes the TableBuilder.
func (b *Builder) Close() {}

// Empty returns whether it's empty.
func (b *Builder) Empty() bool { return b.writtenLen+len(b.buf)+b.tmpKeys.length() == 0 }

// keyDiffIdx returns the first index at which the two keys differ.
func keyDiffIdx(k1, k2 []byte) int {
	var i int
	for i = 0; i < len(k1) && i < len(k2); i++ {
		if k1[i] != k2[i] {
			break
		}
	}
	return i
}

func (b *Builder) addIndex(key y.Key) {
	if b.smallest.IsEmpty() {
		b.smallest.Copy(key)
	}
	if b.biggest.SameUserKey(key) {
		return
	}
	b.biggest.Copy(key)

	keyHash := farm.Fingerprint64(key.UserKey)
	// It is impossible that a single table contains 16 million keys.
	y.Assert(b.baseKeys.length() < maxBlockCnt)

	pos := entryPosition{uint16(b.baseKeys.length()), uint8(b.counter)}
	if b.useSuRF {
		b.surfKeys = append(b.surfKeys, y.SafeCopy(nil, key.UserKey))
		b.surfVals = append(b.surfVals, pos.encode())
	} else {
		b.hashEntries = append(b.hashEntries, hashEntry{pos, keyHash})
	}
}

func (b *Builder) addHelper(key y.Key, v y.ValueStruct) {
	// Index the key (hash entries or SuRF keys) unless it is empty.
	if len(key.UserKey) > 0 {
		b.addIndex(key)
	}
	b.tmpKeys.append(key.UserKey)
	v.Version = key.Version
	b.tmpVals.appendVal(&v)
	b.tmpOldOffs = append(b.tmpOldOffs, 0)
	b.counter++
}

// oldEntry format:
//
//	numEntries(4) | endOffsets(4 * numEntries) | entries
//
// entry format:
//
//	version(8) | value
func (b *Builder) addOld(key y.Key, v y.ValueStruct) {
	v.Version = key.Version
	keyIdx := b.tmpKeys.length() - 1
	startOff := b.tmpOldOffs[keyIdx]
	if startOff == 0 {
		startOff = uint32(len(b.oldBlock))
		b.tmpOldOffs[keyIdx] = startOff
	}
	b.singleKeyOldVers.appendVal(&v)
}

// entry format:
//
//	no old versions:  diffKeyLen(2) | diffKey | 0 | version(8) | value
//	has old versions: diffKeyLen(2) | diffKey | 1 | oldOffset(4) | version(8) | value
func (b *Builder) finishBlock() error {
	if b.tmpKeys.length() == 0 {
		return nil
	}
	if b.singleKeyOldVers.length() > 0 {
		b.flushSingleKeyOldVers()
	}
	// Since keys are sorted, the common prefix of the first and last key is
	// common to every key in the block.
	firstKey := b.tmpKeys.getEntry(0)
	lastKey := b.tmpKeys.getLast()
	blockCommonLen := keyDiffIdx(firstKey, lastKey)
	for i := 0; i < b.tmpKeys.length(); i++ {
		key := b.tmpKeys.getEntry(i)
		b.buf = appendU16(b.buf, uint16(len(key)-blockCommonLen))
		b.buf = append(b.buf, key[blockCommonLen:]...)
		if b.tmpOldOffs[i] == 0 {
			b.buf = append(b.buf, 0)
		} else {
			b.buf = append(b.buf, 1)
			b.buf = append(b.buf, u32ToBytes(b.tmpOldOffs[i])...)
		}
		b.buf = append(b.buf, b.tmpVals.getEntry(i)...)
		b.entryEndOffsets = append(b.entryEndOffsets, uint32(len(b.buf)))
	}
	// Block tail: entryEndOffsets(4*n) | numEntries(4) | commonPrefixLen(2).
	b.buf = append(b.buf, u32SliceToBytes(b.entryEndOffsets)...)
	b.buf = append(b.buf, u32ToBytes(uint32(len(b.entryEndOffsets)))...)
	b.buf = appendU16(b.buf, uint16(blockCommonLen))

	// Add base key.
	b.baseKeys.append(firstKey)

	before := b.w.Offset()
	if err := b.compression.Compress(b.w, b.buf); err != nil {
		return err
	}
	size := b.w.Offset() - before
	b.blockEndOffsets = append(b.blockEndOffsets, uint32(b.writtenLen+int(size)))
	b.writtenLen += int(size)
	b.rawWrittenLen += len(b.buf)

	// Reset the block for the next build.
	b.entryEndOffsets = b.entryEndOffsets[:0]
	b.counter = 0
	b.buf = b.buf[:0]
	b.tmpKeys.reset()
	b.tmpVals.reset()
	b.tmpOldOffs = b.tmpOldOffs[:0]
	return nil
}
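// exampleParseBlockEntry is an illustrative sketch, not part of the original
// file: it parses back a single entry laid out by finishBlock above. The full
// key is the block's common prefix (stored in the block tail) followed by
// diffKey, and the returned value bytes are the y.ValueStruct encoding, which
// begins with the 8-byte version.
func exampleParseBlockEntry(entry []byte) (diffKey []byte, oldOffset uint32, value []byte) {
	diffKeyLen := int(binary.LittleEndian.Uint16(entry))
	cursor := 2
	diffKey = entry[cursor : cursor+diffKeyLen]
	cursor += diffKeyLen
	if entry[cursor] == 1 { // the key has old versions in the old block
		cursor++
		oldOffset = binary.LittleEndian.Uint32(entry[cursor:])
		cursor += 4
	} else {
		cursor++ // the 0 marker: no old versions
	}
	return diffKey, oldOffset, entry[cursor:]
}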
// Add adds a key-value pair to the current block. If the user key equals the
// previously added key, the value is recorded as an old version of that key.
func (b *Builder) Add(key y.Key, value y.ValueStruct) error {
	var lastUserKey []byte
	if b.tmpKeys.length() > 0 {
		lastUserKey = b.tmpKeys.getLast()
	}
	// Handle old versions before deciding whether to finish the block, so
	// that two blocks never contain the same user key.
	if bytes.Equal(lastUserKey, key.UserKey) {
		b.addOld(key, value)
		return nil
	} else if b.singleKeyOldVers.length() > 0 {
		b.flushSingleKeyOldVers()
	}
	if b.shouldFinishBlock() {
		if err := b.finishBlock(); err != nil {
			return err
		}
	}
	b.addHelper(key, value)
	return nil // Currently, there is no meaningful error.
}

// flushSingleKeyOldVers appends the accumulated old versions of the current
// key to the old block as one oldEntry record (see addOld for the format).
func (b *Builder) flushSingleKeyOldVers() {
	// numEntries
	b.oldBlock = append(b.oldBlock, u32ToBytes(uint32(b.singleKeyOldVers.length()))...)
	// endOffsets
	b.oldBlock = append(b.oldBlock, u32SliceToBytes(b.singleKeyOldVers.endOffs)...)
	// entries
	b.oldBlock = append(b.oldBlock, b.singleKeyOldVers.data...)
	b.singleKeyOldVers.reset()
}
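// exampleParseOldEntry is an illustrative sketch, not part of the original
// file: it splits an oldEntry record written by flushSingleKeyOldVers back
// into the individual old-version entries (each a y.ValueStruct encoding
// starting with the 8-byte version). oldOffset is the value stored in the
// key's block entry; it is never 0, because the old block starts with a pad
// byte.
func exampleParseOldEntry(oldBlock []byte, oldOffset uint32) [][]byte {
	numEntries := bytesToU32(oldBlock[oldOffset:])
	endOffs := bytesToU32Slice(oldBlock[oldOffset+4 : oldOffset+4+numEntries*4])
	entriesStart := oldOffset + 4 + numEntries*4
	entries := make([][]byte, numEntries)
	var start uint32
	for i, end := range endOffs {
		entries[i] = oldBlock[entriesStart+start : entriesStart+end]
		start = end
	}
	return entries
}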
func (b *Builder) shouldFinishBlock() bool {
	// If there is no entry so far, the block must not be finished.
	if b.tmpKeys.length() == 0 {
		return false
	}
	return uint32(b.tmpKeys.size()+b.tmpVals.size()) > uint32(b.opt.BlockSize)
}

// ReachedCapacity reports whether the table being built has roughly reached
// the given capacity. The estimate is based on the raw (uncompressed) data
// written and buffered so far, so it is approximate.
func (b *Builder) ReachedCapacity(capacity int64) bool {
	estimateSz := b.rawWrittenLen + len(b.buf) +
		4*len(b.blockEndOffsets) + b.baseKeys.size() + len(b.oldBlock)
	return int64(estimateSz) > capacity
}

// EstimateSize returns the estimated size of the SST to build.
func (b *Builder) EstimateSize() int {
	size := b.rawWrittenLen + len(b.buf) + 4*len(b.blockEndOffsets) + b.baseKeys.size() + len(b.oldBlock)
	if !b.useSuRF {
		size += 3 * int(float32(len(b.hashEntries))/b.opt.HashUtilRatio)
	}
	return size
}

// IDs of the sections encoded into the index file, in the order Finish
// appends them.
const (
	idSmallest byte = iota
	idBiggest
	idBaseKeysEndOffs
	idBaseKeys
	idBlockEndOffsets
	idBloomFilter
	idHashIndex
	idSuRFIndex
	idOldBlockLen
)

// Finish finishes the table by flushing the last block and the old block to
// the data file, then writing the index file.
func (b *Builder) Finish() error {
	err := b.finishBlock() // This will never start a new block.
	if err != nil {
		return err
	}
	if len(b.oldBlock) > 1 {
		err = b.w.Append(b.oldBlock)
		if err != nil {
			return err
		}
	}
	if err = b.w.Finish(); err != nil {
		return err
	}
	idxFile, err := y.OpenTruncFile(b.idxFileName, false)
	if err != nil {
		return err
	}
	b.w.Reset(idxFile)

	// Don't compress the global ts, because it may be updated during ingest.
	ts := uint64(0)
	if b.useGlobalTS {
		// The external builder doesn't append ts to the keys, so the output
		// SST should have a non-zero global ts.
		ts = 1
	}

	encoder := newMetaEncoder(b.buf, b.compression, ts)
	encoder.append(b.smallest.UserKey, idSmallest)
	encoder.append(b.biggest.UserKey, idBiggest)
	encoder.append(u32SliceToBytes(b.baseKeys.endOffs), idBaseKeysEndOffs)
	encoder.append(b.baseKeys.data, idBaseKeys)
	encoder.append(u32SliceToBytes(b.blockEndOffsets), idBlockEndOffsets)
	if len(b.oldBlock) > 1 {
		encoder.append(u32ToBytes(uint32(len(b.oldBlock))), idOldBlockLen)
	}

	var bloomFilter []byte
	if !b.useSuRF {
		bf := bbloom.New(float64(len(b.hashEntries)), b.bloomFpr)
		for _, he := range b.hashEntries {
			bf.Add(he.hash)
		}
		bloomFilter = bf.BinaryMarshal()
	}
	encoder.append(bloomFilter, idBloomFilter)

	var hashIndex []byte
	if !b.useSuRF {
		hashIndex = buildHashIndex(b.hashEntries, b.opt.HashUtilRatio)
	}
	encoder.append(hashIndex, idHashIndex)

	var surfIndex []byte
	if b.useSuRF && len(b.surfKeys) > 0 {
		hl := uint32(b.opt.SuRFOptions.HashSuffixLen)
		rl := uint32(b.opt.SuRFOptions.RealSuffixLen)
		sb := surf.NewBuilder(3, hl, rl)
		sf := sb.Build(b.surfKeys, b.surfVals, b.opt.SuRFOptions.BitsPerKeyHint)
		surfIndex = sf.Marshal()
	}
	encoder.append(surfIndex, idSuRFIndex)

	if err := encoder.finish(b.w); err != nil {
		return err
	}

	return b.w.Finish()
}
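// exampleU32SliceRoundTrip is an illustrative sketch, not part of the
// original file: it shows that u32SliceToBytes and bytesToU32Slice (defined
// below) reinterpret memory in place rather than copying, so both views alias
// the same backing array. Note that these conversions use the host's native
// byte order while u32ToBytes/bytesToU32 use little-endian explicitly, so the
// format effectively assumes a little-endian host.
func exampleU32SliceRoundTrip() {
	offs := []uint32{5, 9}
	raw := u32SliceToBytes(offs) // len(raw) == 8; no copy is made
	back := bytesToU32Slice(raw) // back aliases offs
	back[1] = 42                 // offs[1] is now 42 as well
	_ = offs
}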
func appendU16(buf []byte, v uint16) []byte {
	return append(buf, byte(v), byte(v>>8))
}

func u32ToBytes(v uint32) []byte {
	var uBuf [4]byte
	binary.LittleEndian.PutUint32(uBuf[:], v)
	return uBuf[:]
}

func u64ToBytes(v uint64) []byte {
	var uBuf [8]byte
	binary.LittleEndian.PutUint64(uBuf[:], v)
	return uBuf[:]
}

// u32SliceToBytes reinterprets the slice's memory as bytes without copying.
func u32SliceToBytes(u32s []uint32) []byte {
	if len(u32s) == 0 {
		return nil
	}
	var b []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	hdr.Len = len(u32s) * 4
	hdr.Cap = hdr.Len
	hdr.Data = uintptr(unsafe.Pointer(&u32s[0]))
	return b
}

// bytesToU32Slice reinterprets the bytes as a []uint32 without copying.
func bytesToU32Slice(b []byte) []uint32 {
	if len(b) == 0 {
		return nil
	}
	var u32s []uint32
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u32s))
	hdr.Len = len(b) / 4
	hdr.Cap = hdr.Len
	hdr.Data = uintptr(unsafe.Pointer(&b[0]))
	return u32s
}

func bytesToU32(b []byte) uint32 {
	return binary.LittleEndian.Uint32(b)
}

func bytesToU64(b []byte) uint64 {
	return binary.LittleEndian.Uint64(b)
}

// metaEncoder encodes the index file: a globalTS(8) | compression(1) header
// followed by id(1) | dataLen(4) | data records, one per index section.
type metaEncoder struct {
	buf         []byte
	compression options.CompressionType
}

func newMetaEncoder(buf []byte, compression options.CompressionType, globalTS uint64) *metaEncoder {
	buf = append(buf, u64ToBytes(globalTS)...)
	buf = append(buf, byte(compression))
	return &metaEncoder{
		buf:         buf,
		compression: compression,
	}
}

func (e *metaEncoder) append(d []byte, id byte) {
	e.buf = append(e.buf, id)
	e.buf = append(e.buf, u32ToBytes(uint32(len(d)))...)
	e.buf = append(e.buf, d...)
}

func (e *metaEncoder) finish(w *fileutil.DirectWriter) error {
	if e.compression == options.None {
		return w.Append(e.buf)
	}

	// The 9-byte header (globalTS + compression type) is written
	// uncompressed so the global ts can be updated in place during ingest;
	// only the records after it are compressed.
	if err := w.Append(e.buf[:9]); err != nil {
		return err
	}
	return e.compression.Compress(w, e.buf[9:])
}

type metaDecoder struct {
	buf         []byte
	globalTS    uint64
	compression options.CompressionType

	cursor int
}

func newMetaDecoder(buf []byte) (*metaDecoder, error) {
	globalTS := bytesToU64(buf[:8])
	compression := options.CompressionType(buf[8])
	buf = buf[9:]
	if compression != options.None {
		buf1, err := compression.Decompress(buf)
		if err != nil {
			return nil, err
		}
		buf = buf1
	}
	return &metaDecoder{
		buf:         buf,
		globalTS:    globalTS,
		compression: compression,
	}, nil
}

func (e *metaDecoder) valid() bool {
	return e.cursor < len(e.buf)
}

func (e *metaDecoder) currentId() byte {
	return e.buf[e.cursor]
}

func (e *metaDecoder) decode() []byte {
	cursor := e.cursor + 1
	l := int(bytesToU32(e.buf[cursor:]))
	cursor += 4
	d := e.buf[cursor : cursor+l]
	return d
}

func (e *metaDecoder) next() {
	l := int(bytesToU32(e.buf[e.cursor+1:]))
	e.cursor += 1 + 4 + l
}
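// exampleWalkIndex is an illustrative sketch, not part of the original file:
// it walks the records of an index file using metaDecoder, in the order
// Finish writes them. Only two section ids are handled here; the rest are
// skipped.
func exampleWalkIndex(idxData []byte) error {
	d, err := newMetaDecoder(idxData)
	if err != nil {
		return err
	}
	for d.valid() {
		switch d.currentId() {
		case idSmallest:
			_ = d.decode() // smallest user key in the table
		case idBlockEndOffsets:
			_ = bytesToU32Slice(d.decode()) // end offset of every block
		}
		d.next()
	}
	return nil
}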