/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package sstable

import (
	"bytes"
	"encoding/binary"
	"math"
	"os"
	"reflect"
	"unsafe"

	"github.com/coocood/bbloom"
	"github.com/dgryski/go-farm"
	"github.com/pingcap/badger/fileutil"
	"github.com/pingcap/badger/options"
	"github.com/pingcap/badger/surf"
	"github.com/pingcap/badger/y"
	"golang.org/x/time/rate"
)

// entrySlice stores variable-length entries back-to-back in a single byte
// slice, recording the end offset of each entry.
type entrySlice struct {
	data    []byte
	endOffs []uint32
}

func (es *entrySlice) append(entry []byte) {
	es.data = append(es.data, entry...)
	es.endOffs = append(es.endOffs, uint32(len(es.data)))
}

func (es *entrySlice) appendVal(val *y.ValueStruct) {
	es.data = val.EncodeTo(es.data)
	es.endOffs = append(es.endOffs, uint32(len(es.data)))
}

func (es *entrySlice) getLast() []byte {
	return es.getEntry(es.length() - 1)
}

func (es *entrySlice) getEntry(i int) []byte {
	var startOff uint32
	if i > 0 {
		startOff = es.endOffs[i-1]
	}
	endOff := es.endOffs[i]
	return es.data[startOff:endOff]
}

func (es *entrySlice) length() int {
	return len(es.endOffs)
}

func (es *entrySlice) size() int {
	return len(es.data) + 4*len(es.endOffs)
}

func (es *entrySlice) reset() {
	es.data = es.data[:0]
	es.endOffs = es.endOffs[:0]
}

const headerSize = 4

// Builder is used in building a table.
type Builder struct {
	counter int // Number of keys written for the current block.

	file          *os.File
	w             tableWriter
	buf           []byte
	writtenLen    int
	rawWrittenLen int
	compression   options.CompressionType

	baseKeys entrySlice

	blockEndOffsets []uint32 // End offsets of every block.

	// End offsets of every entry within the current block being built.
	// The offsets are relative to the start of the block.
	entryEndOffsets []uint32

	smallest y.Key
	biggest  y.Key

	hashEntries []hashEntry
	bloomFpr    float64
	useGlobalTS bool
	opt         options.TableBuilderOptions
	useSuRF     bool

	surfKeys [][]byte
	surfVals [][]byte

	tmpKeys    entrySlice
	tmpVals    entrySlice
	tmpOldOffs []uint32

	singleKeyOldVers entrySlice
	oldBlock         []byte
}

type tableWriter interface {
	Reset(f *os.File)
	Write(b []byte) (int, error)
	Offset() int64
	Finish() error
}

type inMemWriter struct {
	*bytes.Buffer
}

func (w *inMemWriter) Reset(_ *os.File) {
	w.Buffer.Reset()
}

func (w *inMemWriter) Offset() int64 {
	return int64(w.Len())
}

func (w *inMemWriter) Finish() error {
	return nil
}
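// Illustrative sketch (not used by the build path; the function name and
// values are hypothetical): how entrySlice packs entries back-to-back and
// recovers them from the recorded end offsets.
func exampleEntrySliceUsage() {
	var es entrySlice
	es.append([]byte("alpha"))
	es.append([]byte("beta"))
	// es.data is now "alphabeta" and es.endOffs is [5, 9].
	_ = es.getEntry(0) // "alpha" == es.data[0:5]
	_ = es.getLast()   // "beta"  == es.data[5:9]
	_ = es.size()      // 9 bytes of data + 2*4 bytes of offsets = 17
}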
// NewTableBuilder makes a new TableBuilder.
// If f is nil, the builder builds an in-memory result.
// If limiter is nil, the write speed during table build will not be limited.
func NewTableBuilder(f *os.File, limiter *rate.Limiter, level int, opt options.TableBuilderOptions) *Builder {
	t := float64(opt.LevelSizeMultiplier)
	fprBase := math.Pow(t, 1/(t-1)) * opt.LogicalBloomFPR * (t - 1)
	levelFactor := math.Pow(t, float64(opt.MaxLevels-level))
	b := &Builder{
		file:        f,
		buf:         make([]byte, 0, 4*1024),
		hashEntries: make([]hashEntry, 0, 4*1024),
		bloomFpr:    fprBase / levelFactor,
		compression: opt.CompressionPerLevel[level],
		opt:         opt,
		useSuRF:     level >= opt.SuRFStartLevel,
		// Add one byte of padding so no old-version offset is ever 0;
		// an oldOffset of 0 therefore means "no old version".
		oldBlock: []byte{0},
	}
	if f != nil {
		b.w = fileutil.NewDirectWriter(f, opt.WriteBufferSize, limiter)
	} else {
		b.w = &inMemWriter{Buffer: bytes.NewBuffer(make([]byte, 0, opt.MaxTableSize))}
	}
	return b
}

// NewExternalTableBuilder makes a Builder for building external tables that
// will later be ingested; keys carry no ts, so a global ts is used instead.
func NewExternalTableBuilder(f *os.File, limiter *rate.Limiter, opt options.TableBuilderOptions, compression options.CompressionType) *Builder {
	return &Builder{
		file:        f,
		w:           fileutil.NewDirectWriter(f, opt.WriteBufferSize, limiter),
		buf:         make([]byte, 0, 4*1024),
		hashEntries: make([]hashEntry, 0, 4*1024),
		bloomFpr:    opt.LogicalBloomFPR,
		useGlobalTS: true,
		compression: compression,
		opt:         opt,
	}
}

// Reset resets this builder with a new file.
func (b *Builder) Reset(f *os.File) {
	b.file = f
	b.resetBuffers()
	b.w.Reset(f)
}

// SetIsManaged should be called when ingesting a table into a managed DB.
func (b *Builder) SetIsManaged() {
	b.useGlobalTS = false
}

func (b *Builder) resetBuffers() {
	b.counter = 0
	b.buf = b.buf[:0]
	b.writtenLen = 0
	b.rawWrittenLen = 0
	b.baseKeys.reset()
	b.blockEndOffsets = b.blockEndOffsets[:0]
	b.entryEndOffsets = b.entryEndOffsets[:0]
	b.hashEntries = b.hashEntries[:0]
	b.surfKeys = nil
	b.surfVals = nil
	b.smallest.UserKey = b.smallest.UserKey[:0]
	b.biggest.UserKey = b.biggest.UserKey[:0]
	if len(b.oldBlock) > 0 {
		// Keep the leading padding byte so an oldOffset of 0 still means
		// "no old version" after a reset.
		b.oldBlock = b.oldBlock[:1]
	}
}

// Close closes the TableBuilder.
func (b *Builder) Close() {}

// Empty returns whether it's empty.
func (b *Builder) Empty() bool { return b.writtenLen+len(b.buf)+b.tmpKeys.length() == 0 }

// keyDiffIdx returns the first index at which the two keys differ.
func keyDiffIdx(k1, k2 []byte) int {
	var i int
	for i = 0; i < len(k1) && i < len(k2); i++ {
		if k1[i] != k2[i] {
			break
		}
	}
	return i
}

func (b *Builder) addIndex(key y.Key) {
	if b.smallest.IsEmpty() {
		b.smallest.Copy(key)
	}
	if b.biggest.SameUserKey(key) {
		return
	}
	b.biggest.Copy(key)

	keyHash := farm.Fingerprint64(key.UserKey)
	// It is impossible that a single table contains 16 million keys.
	y.Assert(b.baseKeys.length() < maxBlockCnt)

	pos := entryPosition{uint16(b.baseKeys.length()), uint8(b.counter)}
	if b.useSuRF {
		b.surfKeys = append(b.surfKeys, y.SafeCopy(nil, key.UserKey))
		b.surfVals = append(b.surfVals, pos.encode())
	} else {
		b.hashEntries = append(b.hashEntries, hashEntry{pos, keyHash})
	}
}
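// Worked example of the per-level bloom FPR computed in NewTableBuilder
// (illustrative, hypothetical option values): with LevelSizeMultiplier
// t = 10, LogicalBloomFPR = 0.01, MaxLevels = 7 and level = 6:
//
//	fprBase     = 10^(1/9) * 0.01 * (10-1) ≈ 0.116
//	levelFactor = 10^(7-6)                 = 10
//	bloomFpr    = fprBase / levelFactor    ≈ 0.0116
//
// Smaller level numbers get a geometrically larger levelFactor (level 1
// gets 10^6) and hence a much tighter per-table false-positive rate.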
func (b *Builder) addHelper(key y.Key, v y.ValueStruct) {
	// Add the key to the table index; empty keys are not indexed.
	if len(key.UserKey) > 0 {
		b.addIndex(key)
	}
	b.tmpKeys.append(key.UserKey)
	v.Version = key.Version
	b.tmpVals.appendVal(&v)
	b.tmpOldOffs = append(b.tmpOldOffs, 0)
	b.counter++
}

// oldEntry format:
// numEntries(4) | endOffsets(4 * numEntries) | entries
//
// entry format:
// version(8) | value
func (b *Builder) addOld(key y.Key, v y.ValueStruct) {
	v.Version = key.Version
	keyIdx := b.tmpKeys.length() - 1
	startOff := b.tmpOldOffs[keyIdx]
	if startOff == 0 {
		startOff = uint32(len(b.oldBlock))
		b.tmpOldOffs[keyIdx] = startOff
	}
	b.singleKeyOldVers.appendVal(&v)
}

// entryFormat
// no old entry:
// diffKeyLen(2) | diffKey | 0 | version(8) | value
// has old entry:
// diffKeyLen(2) | diffKey | 1 | oldOffset(4) | version(8) | value
func (b *Builder) finishBlock() error {
	if b.tmpKeys.length() == 0 {
		return nil
	}
	if b.singleKeyOldVers.length() > 0 {
		b.flushSingleKeyOldVers()
	}
	firstKey := b.tmpKeys.getEntry(0)
	lastKey := b.tmpKeys.getLast()
	blockCommonLen := keyDiffIdx(firstKey, lastKey)
	for i := 0; i < b.tmpKeys.length(); i++ {
		key := b.tmpKeys.getEntry(i)
		b.buf = appendU16(b.buf, uint16(len(key)-blockCommonLen))
		b.buf = append(b.buf, key[blockCommonLen:]...)
		if b.tmpOldOffs[i] == 0 {
			b.buf = append(b.buf, 0)
		} else {
			b.buf = append(b.buf, 1)
			b.buf = append(b.buf, u32ToBytes(b.tmpOldOffs[i])...)
		}
		b.buf = append(b.buf, b.tmpVals.getEntry(i)...)
		b.entryEndOffsets = append(b.entryEndOffsets, uint32(len(b.buf)))
	}
	b.buf = append(b.buf, u32SliceToBytes(b.entryEndOffsets)...)
	b.buf = append(b.buf, u32ToBytes(uint32(len(b.entryEndOffsets)))...)
	b.buf = appendU16(b.buf, uint16(blockCommonLen))

	// Add base key.
	b.baseKeys.append(firstKey)

	before := b.w.Offset()
	if err := b.compression.Compress(b.w, b.buf); err != nil {
		return err
	}
	size := b.w.Offset() - before
	b.blockEndOffsets = append(b.blockEndOffsets, uint32(b.writtenLen+int(size)))
	b.writtenLen += int(size)
	b.rawWrittenLen += len(b.buf)

	// Reset the block for the next build.
	b.entryEndOffsets = b.entryEndOffsets[:0]
	b.counter = 0
	b.buf = b.buf[:0]
	b.tmpKeys.reset()
	b.tmpVals.reset()
	b.tmpOldOffs = b.tmpOldOffs[:0]
	return nil
}

// Add adds a key-value pair to the table. Entries for the same user key must
// be added consecutively; after the first entry, further versions of that
// key are stored in the old block.
func (b *Builder) Add(key y.Key, value y.ValueStruct) error {
	var lastUserKey []byte
	if b.tmpKeys.length() > 0 {
		lastUserKey = b.tmpKeys.getLast()
	}
	// Check for an old version before deciding whether to finish the block,
	// so that two blocks never contain the same user key.
	if bytes.Equal(lastUserKey, key.UserKey) {
		b.addOld(key, value)
		return nil
	} else if b.singleKeyOldVers.length() > 0 {
		b.flushSingleKeyOldVers()
	}
	if b.shouldFinishBlock() {
		if err := b.finishBlock(); err != nil {
			return err
		}
	}
	b.addHelper(key, value)
	return nil // Currently, there is no meaningful error.
}
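// Encoding sketch for finishBlock (illustrative, hypothetical values):
// suppose the block's first key is "user001", the current key is "user042"
// with version 7, no old versions, and the encoded value is "abc". With
// blockCommonLen = 4 ("user"), the entry is laid out as
//
//	diffKeyLen(2)=3 | diffKey="042" | hasOld(1)=0 | version(8)=7 | value="abc"
//
// After all entries, the block is closed with the little-endian entry end
// offsets, the entry count (4 bytes), and blockCommonLen (2 bytes).
//
// Old-versions sketch (illustrative): adding key "k" at versions 9, 8, 7 in
// order calls Add three times; version 9 lands in the main block, while 8
// and 7 go through addOld into singleKeyOldVers. flushSingleKeyOldVers below
// then appends to b.oldBlock
//
//	numEntries(4)=2 | endOffsets(4*2) | entry(v=8) | entry(v=7)
//
// and tmpOldOffs for "k" records the offset where that record starts.
// Because oldBlock begins with one padding byte, offset 0 always means
// "no old versions".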
func (b *Builder) flushSingleKeyOldVers() {
	// numEntries
	b.oldBlock = append(b.oldBlock, u32ToBytes(uint32(b.singleKeyOldVers.length()))...)
	// endOffsets
	b.oldBlock = append(b.oldBlock, u32SliceToBytes(b.singleKeyOldVers.endOffs)...)
	// entries
	b.oldBlock = append(b.oldBlock, b.singleKeyOldVers.data...)
	b.singleKeyOldVers.reset()
}

func (b *Builder) shouldFinishBlock() bool {
	// If the current block has no entries yet, don't finish it.
	if b.tmpKeys.length() == 0 {
		return false
	}
	return uint32(b.tmpKeys.size()+b.tmpVals.size()) > uint32(b.opt.BlockSize)
}

// ReachedCapacity reports whether the table being built has roughly reached
// the given capacity, based on the estimated size of the data written so far.
func (b *Builder) ReachedCapacity(capacity int64) bool {
	estimateSz := b.rawWrittenLen + len(b.buf) +
		4*len(b.blockEndOffsets) + b.baseKeys.size() + len(b.oldBlock)
	return int64(estimateSz) > capacity
}

// EstimateSize returns the estimated size of the SST to build.
func (b *Builder) EstimateSize() int {
	size := b.rawWrittenLen + len(b.buf) + 4*len(b.blockEndOffsets) + b.baseKeys.size() + len(b.oldBlock)
	if !b.useSuRF {
		size += 3 * int(float32(len(b.hashEntries))/b.opt.HashUtilRatio)
	}
	return size
}

const (
	idSmallest byte = iota
	idBiggest
	idBaseKeysEndOffs
	idBaseKeys
	idBlockEndOffsets
	idBloomFilter
	idHashIndex
	idSuRFIndex
	idOldBlockLen
)

// BuildResult contains the build result info. For file-based compaction,
// FileName should be used to open the table; for in-memory compaction,
// FileData and IndexData contain the data.
type BuildResult struct {
	FileName  string
	FileData  []byte
	IndexData []byte
}
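// Overall layout sketch (illustrative): the data stream produced by Finish
// below is
//
//	block 0 | block 1 | ... | old block (if any old versions were added)
//
// while the index (meta) stream goes to a separate index file (see
// IndexFilename), or into IndexData for in-memory builds.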
// Finish finishes the table by appending the index.
func (b *Builder) Finish() (*BuildResult, error) {
	err := b.finishBlock() // This will never start a new block.
	if err != nil {
		return nil, err
	}
	if len(b.oldBlock) > 1 {
		_, err = b.w.Write(b.oldBlock)
		if err != nil {
			return nil, err
		}
	}
	if err = b.w.Finish(); err != nil {
		return nil, err
	}
	result := new(BuildResult)
	if b.file != nil {
		idxFile, err := y.OpenTruncFile(IndexFilename(b.file.Name()), false)
		if err != nil {
			return nil, err
		}
		result.FileName = b.file.Name()
		b.w.Reset(idxFile)
	} else {
		result.FileData = y.Copy(b.w.(*inMemWriter).Bytes())
		b.w.Reset(nil)
	}

	// Don't compress the global ts, because it may be updated during ingest.
	ts := uint64(0)
	if b.useGlobalTS {
		// The external builder doesn't append a ts to the keys, so the
		// output SST should have a non-zero global ts.
		ts = 1
	}

	encoder := newMetaEncoder(b.buf, b.compression, ts)
	encoder.append(b.smallest.UserKey, idSmallest)
	encoder.append(b.biggest.UserKey, idBiggest)
	encoder.append(u32SliceToBytes(b.baseKeys.endOffs), idBaseKeysEndOffs)
	encoder.append(b.baseKeys.data, idBaseKeys)
	encoder.append(u32SliceToBytes(b.blockEndOffsets), idBlockEndOffsets)
	if len(b.oldBlock) > 1 {
		encoder.append(u32ToBytes(uint32(len(b.oldBlock))), idOldBlockLen)
	}

	var bloomFilter []byte
	if !b.useSuRF {
		bf := bbloom.New(float64(len(b.hashEntries)), b.bloomFpr)
		for _, he := range b.hashEntries {
			bf.Add(he.hash)
		}
		bloomFilter = bf.BinaryMarshal()
	}
	encoder.append(bloomFilter, idBloomFilter)

	var hashIndex []byte
	if !b.useSuRF {
		hashIndex = buildHashIndex(b.hashEntries, b.opt.HashUtilRatio)
	}
	encoder.append(hashIndex, idHashIndex)

	var surfIndex []byte
	if b.useSuRF && len(b.surfKeys) > 0 {
		hl := uint32(b.opt.SuRFOptions.HashSuffixLen)
		rl := uint32(b.opt.SuRFOptions.RealSuffixLen)
		sb := surf.NewBuilder(3, hl, rl)
		sf := sb.Build(b.surfKeys, b.surfVals, b.opt.SuRFOptions.BitsPerKeyHint)
		surfIndex = sf.Marshal()
	}
	encoder.append(surfIndex, idSuRFIndex)

	if err = encoder.finish(b.w); err != nil {
		return nil, err
	}

	if err = b.w.Finish(); err != nil {
		return nil, err
	}
	if b.file == nil {
		result.IndexData = y.Copy(b.w.(*inMemWriter).Bytes())
	}
	return result, nil
}

func appendU16(buf []byte, v uint16) []byte {
	return append(buf, byte(v), byte(v>>8))
}

func u32ToBytes(v uint32) []byte {
	var uBuf [4]byte
	binary.LittleEndian.PutUint32(uBuf[:], v)
	return uBuf[:]
}

func u64ToBytes(v uint64) []byte {
	var uBuf [8]byte
	binary.LittleEndian.PutUint64(uBuf[:], v)
	return uBuf[:]
}

// u32SliceToBytes reinterprets u32s as raw bytes without copying; the result
// shares memory with the input.
func u32SliceToBytes(u32s []uint32) []byte {
	if len(u32s) == 0 {
		return nil
	}
	var b []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	hdr.Len = len(u32s) * 4
	hdr.Cap = hdr.Len
	hdr.Data = uintptr(unsafe.Pointer(&u32s[0]))
	return b
}

// bytesToU32Slice reinterprets b as a []uint32 without copying; the result
// shares memory with the input.
func bytesToU32Slice(b []byte) []uint32 {
	if len(b) == 0 {
		return nil
	}
	var u32s []uint32
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u32s))
	hdr.Len = len(b) / 4
	hdr.Cap = hdr.Len
	hdr.Data = uintptr(unsafe.Pointer(&b[0]))
	return u32s
}

func bytesToU32(b []byte) uint32 {
	return binary.LittleEndian.Uint32(b)
}

func bytesToU64(b []byte) uint64 {
	return binary.LittleEndian.Uint64(b)
}

// metaEncoder serializes the table index (meta) entries, optionally
// compressing everything after the global ts and compression byte.
type metaEncoder struct {
	buf         []byte
	compression options.CompressionType
}

func newMetaEncoder(buf []byte, compression options.CompressionType, globalTS uint64) *metaEncoder {
	buf = append(buf, u64ToBytes(globalTS)...)
	buf = append(buf, byte(compression))
	return &metaEncoder{
		buf:         buf,
		compression: compression,
	}
}
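// Meta stream sketch (illustrative): the index written by metaEncoder is
//
//	globalTS(8) | compression(1) | repeated: id(1) | len(4) | data
//
// For example, appending a 5-byte smallest key emits id=idSmallest, a
// little-endian length of 5, then the 5 key bytes. metaDecoder below walks
// the same id/len/data records via currentId, decode and next.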
func (e *metaEncoder) append(d []byte, id byte) {
	e.buf = append(e.buf, id)
	e.buf = append(e.buf, u32ToBytes(uint32(len(d)))...)
	e.buf = append(e.buf, d...)
}

func (e *metaEncoder) finish(w tableWriter) error {
	if e.compression == options.None {
		_, err := w.Write(e.buf)
		return err
	}

	// The first 9 bytes (globalTS(8) + compression(1)) stay uncompressed so
	// the ts can be read and updated without decompressing the rest.
	if _, err := w.Write(e.buf[:9]); err != nil {
		return err
	}
	return e.compression.Compress(w, e.buf[9:])
}

type metaDecoder struct {
	buf         []byte
	globalTS    uint64
	compression options.CompressionType

	cursor int
}

func newMetaDecoder(buf []byte) (*metaDecoder, error) {
	globalTS := bytesToU64(buf[:8])
	compression := options.CompressionType(buf[8])
	buf = buf[9:]
	if compression != options.None {
		buf1, err := compression.Decompress(buf)
		if err != nil {
			return nil, err
		}
		buf = buf1
	}
	return &metaDecoder{
		buf:         buf,
		globalTS:    globalTS,
		compression: compression,
	}, nil
}

func (e *metaDecoder) valid() bool {
	return e.cursor < len(e.buf)
}

func (e *metaDecoder) currentId() byte {
	return e.buf[e.cursor]
}

func (e *metaDecoder) decode() []byte {
	cursor := e.cursor + 1
	l := int(bytesToU32(e.buf[cursor:]))
	cursor += 4
	d := e.buf[cursor : cursor+l]
	return d
}

func (e *metaDecoder) next() {
	l := int(bytesToU32(e.buf[e.cursor+1:]))
	e.cursor += 1 + 4 + l
}
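// Usage sketch (illustrative; the function name, keys and option values are
// hypothetical): building a small in-memory table. A nil file selects the
// inMemWriter and a nil limiter disables rate limiting.
func exampleBuildInMemory(opt options.TableBuilderOptions) (*BuildResult, error) {
	// opt is assumed to be fully populated (CompressionPerLevel, MaxLevels,
	// LevelSizeMultiplier, BlockSize, ...).
	b := NewTableBuilder(nil, nil, 0, opt)
	// Keys must be added in sorted order.
	for _, k := range []string{"key-a", "key-b"} {
		key := y.Key{UserKey: []byte(k), Version: 1}
		if err := b.Add(key, y.ValueStruct{Value: []byte("value")}); err != nil {
			return nil, err
		}
	}
	// For in-memory builds, FileData and IndexData hold the table and index.
	return b.Finish()
}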