github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/table/sstable/table.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package sstable 18 19 import ( 20 "fmt" 21 "io" 22 "math" 23 "os" 24 "path" 25 "path/filepath" 26 "strconv" 27 "strings" 28 "sync" 29 "sync/atomic" 30 "unsafe" 31 32 "github.com/coocood/bbloom" 33 "github.com/pingcap/badger/buffer" 34 "github.com/pingcap/badger/cache" 35 "github.com/pingcap/badger/fileutil" 36 "github.com/pingcap/badger/options" 37 "github.com/pingcap/badger/surf" 38 "github.com/pingcap/badger/y" 39 "github.com/pingcap/errors" 40 ) 41 42 const ( 43 fileSuffix = ".sst" 44 idxFileSuffix = ".idx" 45 46 intSize = int(unsafe.Sizeof(int(0))) 47 ) 48 49 func IndexFilename(tableFilename string) string { return tableFilename + idxFileSuffix } 50 51 type tableIndex struct { 52 blockEndOffsets []uint32 53 baseKeys entrySlice 54 bf *bbloom.Bloom 55 hIdx *hashIndex 56 surf *surf.SuRF 57 } 58 59 // Table represents a loaded table file with the info we have about it 60 type Table struct { 61 sync.Mutex 62 63 fd *os.File // Own fd. 64 indexFd *os.File 65 66 globalTs uint64 67 tableSize int64 68 numBlocks int 69 smallest, biggest y.Key 70 id uint64 71 72 blockCache *cache.Cache 73 blocksData []byte 74 75 indexCache *cache.Cache 76 index *tableIndex 77 indexOnce sync.Once 78 indexData []byte 79 80 compacting int32 81 82 compression options.CompressionType 83 84 oldBlockLen int64 85 oldBlock []byte 86 } 87 88 // CompressionType returns the compression algorithm used for block compression. 89 func (t *Table) CompressionType() options.CompressionType { 90 return t.compression 91 } 92 93 // Delete delete table's file from disk. 94 func (t *Table) Delete() error { 95 if t.fd == nil { 96 t.blocksData = nil 97 t.indexData = nil 98 return nil 99 } 100 if t.blockCache != nil { 101 for blk := 0; blk < t.numBlocks; blk++ { 102 key := t.blockCacheKey(blk) 103 if v, ok := t.blockCache.Get(key); ok { 104 if b, ok := v.(*block); ok { 105 b.done() 106 } 107 t.blockCache.Del(key) 108 } 109 } 110 } 111 if t.indexCache != nil { 112 t.indexCache.Del(t.id) 113 } 114 if len(t.blocksData) != 0 { 115 y.Munmap(t.blocksData) 116 } 117 t.index = nil 118 if len(t.indexData) != 0 { 119 y.Munmap(t.indexData) 120 } 121 if err := t.fd.Truncate(0); err != nil { 122 // This is very important to let the FS know that the file is deleted. 123 return err 124 } 125 filename := t.fd.Name() 126 if err := t.fd.Close(); err != nil { 127 return err 128 } 129 if err := os.Remove(filename); err != nil { 130 return err 131 } 132 return os.Remove(filename + idxFileSuffix) 133 } 134 135 // OpenTable assumes file has only one table and opens it. Takes ownership of fd upon function 136 // entry. Returns a table with one reference count on it (decrementing which may delete the file! 137 // -- consider t.Close() instead). The fd has to writeable because we call Truncate on it before 138 // deleting. 139 func OpenTable(filename string, blockCache *cache.Cache, indexCache *cache.Cache) (*Table, error) { 140 id, ok := ParseFileID(filename) 141 if !ok { 142 return nil, errors.Errorf("Invalid filename: %s", filename) 143 } 144 145 // TODO: after we support cache of L2 storage, we will open block data file in cache manager. 146 fd, err := y.OpenExistingFile(filename, 0) 147 if err != nil { 148 return nil, err 149 } 150 151 indexFd, err := y.OpenExistingFile(filename+idxFileSuffix, 0) 152 if err != nil { 153 return nil, err 154 } 155 156 t := &Table{ 157 fd: fd, 158 indexFd: indexFd, 159 id: id, 160 blockCache: blockCache, 161 indexCache: indexCache, 162 } 163 164 if err := t.initTableInfo(); err != nil { 165 t.Close() 166 return nil, err 167 } 168 if blockCache == nil || t.oldBlockLen > 0 { 169 t.blocksData, err = y.Mmap(fd, false, t.Size()) 170 if err != nil { 171 t.Close() 172 return nil, y.Wrapf(err, "Unable to map file") 173 } 174 t.setOldBlock() 175 } 176 return t, nil 177 } 178 179 func (t *Table) setOldBlock() { 180 t.oldBlock = t.blocksData[t.tableSize-t.oldBlockLen : t.tableSize] 181 } 182 183 // OpenInMemoryTable opens a table that has data in memory. 184 func OpenInMemoryTable(blockData, indexData []byte) (*Table, error) { 185 t := &Table{ 186 blocksData: blockData, 187 indexData: indexData, 188 } 189 if err := t.initTableInfo(); err != nil { 190 return nil, err 191 } 192 t.setOldBlock() 193 return t, nil 194 } 195 196 // Close closes the open table. (Releases resources back to the OS.) 197 func (t *Table) Close() error { 198 if t.fd != nil { 199 t.fd.Close() 200 } 201 if t.indexFd != nil { 202 if len(t.indexData) != 0 { 203 y.Munmap(t.indexData) 204 } 205 t.indexFd.Close() 206 } 207 return nil 208 } 209 210 func (t *Table) NewIterator(reversed bool) y.Iterator { 211 return t.newIterator(reversed) 212 } 213 214 func (t *Table) Get(key y.Key, keyHash uint64) (y.ValueStruct, error) { 215 resultKey, resultVs, ok, err := t.pointGet(key, keyHash) 216 if err != nil { 217 return y.ValueStruct{}, err 218 } 219 if !ok { 220 it := t.NewIterator(false) 221 defer it.Close() 222 it.Seek(key.UserKey) 223 if !it.Valid() { 224 return y.ValueStruct{}, nil 225 } 226 if !key.SameUserKey(it.Key()) { 227 return y.ValueStruct{}, nil 228 } 229 resultKey, resultVs = it.Key(), it.Value() 230 } else if resultKey.IsEmpty() { 231 return y.ValueStruct{}, nil 232 } 233 result := resultVs 234 result.Version = resultKey.Version 235 return result, nil 236 } 237 238 // pointGet try to lookup a key and its value by table's hash index. 239 // If it find an hash collision the last return value will be false, 240 // which means caller should fallback to seek search. Otherwise it value will be true. 241 // If the hash index does not contain such an element the returned key will be nil. 242 func (t *Table) pointGet(key y.Key, keyHash uint64) (y.Key, y.ValueStruct, bool, error) { 243 idx, err := t.getIndex() 244 if err != nil { 245 return y.Key{}, y.ValueStruct{}, false, err 246 } 247 if idx.bf != nil && !idx.bf.Has(keyHash) { 248 return y.Key{}, y.ValueStruct{}, true, err 249 } 250 251 blkIdx, offset := uint32(resultFallback), uint8(0) 252 if idx.hIdx != nil { 253 blkIdx, offset = idx.hIdx.lookup(keyHash) 254 } else if idx.surf != nil { 255 v, ok := idx.surf.Get(key.UserKey) 256 if !ok { 257 blkIdx = resultNoEntry 258 } else { 259 var pos entryPosition 260 pos.decode(v) 261 blkIdx, offset = uint32(pos.blockIdx), pos.offset 262 } 263 } 264 if blkIdx == resultFallback { 265 return y.Key{}, y.ValueStruct{}, false, nil 266 } 267 if blkIdx == resultNoEntry { 268 return y.Key{}, y.ValueStruct{}, true, nil 269 } 270 271 it := t.newIterator(false) 272 defer it.Close() 273 it.seekFromOffset(int(blkIdx), int(offset), key.UserKey) 274 275 if !it.Valid() || !key.SameUserKey(it.Key()) { 276 return y.Key{}, y.ValueStruct{}, true, it.Error() 277 } 278 if !y.SeekToVersion(it, key.Version) { 279 return y.Key{}, y.ValueStruct{}, true, it.Error() 280 } 281 return it.Key(), it.Value(), true, nil 282 } 283 284 func (t *Table) read(off int, sz int) ([]byte, error) { 285 if len(t.blocksData) > 0 { 286 if len(t.blocksData[off:]) < sz { 287 return nil, y.ErrEOF 288 } 289 return t.blocksData[off : off+sz], nil 290 } 291 res := buffer.GetBuffer(sz) 292 _, err := t.fd.ReadAt(res, int64(off)) 293 return res, err 294 } 295 296 func (t *Table) initTableInfo() error { 297 d, err := t.loadIndexData(false) 298 if err != nil { 299 return err 300 } 301 302 t.compression = d.compression 303 t.globalTs = d.globalTS 304 305 for ; d.valid(); d.next() { 306 switch d.currentId() { 307 case idSmallest: 308 if k := d.decode(); len(k) != 0 { 309 t.smallest = y.KeyWithTs(y.Copy(k), math.MaxUint64) 310 } 311 case idBiggest: 312 if k := d.decode(); len(k) != 0 { 313 t.biggest = y.KeyWithTs(y.Copy(k), 0) 314 } 315 case idBlockEndOffsets: 316 offsets := bytesToU32Slice(d.decode()) 317 t.tableSize = int64(offsets[len(offsets)-1]) 318 t.numBlocks = len(offsets) 319 case idOldBlockLen: 320 t.oldBlockLen = int64(bytesToU32(d.decode())) 321 t.tableSize += t.oldBlockLen 322 } 323 } 324 return nil 325 } 326 327 func (t *Table) readTableIndex(d *metaDecoder) *tableIndex { 328 idx := new(tableIndex) 329 for ; d.valid(); d.next() { 330 switch d.currentId() { 331 case idBaseKeysEndOffs: 332 idx.baseKeys.endOffs = bytesToU32Slice(d.decode()) 333 case idBaseKeys: 334 idx.baseKeys.data = d.decode() 335 case idBlockEndOffsets: 336 idx.blockEndOffsets = bytesToU32Slice(d.decode()) 337 case idBloomFilter: 338 if d := d.decode(); len(d) != 0 { 339 idx.bf = new(bbloom.Bloom) 340 idx.bf.BinaryUnmarshal(d) 341 } 342 case idHashIndex: 343 if d := d.decode(); len(d) != 0 { 344 idx.hIdx = new(hashIndex) 345 idx.hIdx.readIndex(d) 346 } 347 case idSuRFIndex: 348 if d := d.decode(); len(d) != 0 { 349 idx.surf = new(surf.SuRF) 350 idx.surf.Unmarshal(d) 351 } 352 } 353 } 354 return idx 355 } 356 357 func (t *Table) getIndex() (*tableIndex, error) { 358 if t.indexCache == nil { 359 var err error 360 t.indexOnce.Do(func() { 361 var d *metaDecoder 362 d, err = t.loadIndexData(true) 363 if err != nil { 364 return 365 } 366 t.index = t.readTableIndex(d) 367 }) 368 return t.index, nil 369 } 370 371 index, err := t.indexCache.GetOrCompute(t.id, func() (interface{}, int64, error) { 372 d, err := t.loadIndexData(false) 373 if err != nil { 374 return nil, 0, err 375 } 376 return t.readTableIndex(d), int64(len(d.buf)), nil 377 }) 378 if err != nil { 379 return nil, err 380 } 381 return index.(*tableIndex), nil 382 } 383 384 func (t *Table) loadIndexData(useMmap bool) (*metaDecoder, error) { 385 if t.indexFd == nil { 386 return newMetaDecoder(t.indexData) 387 } 388 fstat, err := t.indexFd.Stat() 389 if err != nil { 390 return nil, err 391 } 392 var idxData []byte 393 394 if useMmap { 395 idxData, err = y.Mmap(t.indexFd, false, fstat.Size()) 396 if err != nil { 397 return nil, err 398 } 399 t.indexData = idxData 400 } else { 401 idxData = buffer.GetBuffer(int(fstat.Size())) 402 if _, err = t.indexFd.ReadAt(idxData, 0); err != nil { 403 return nil, err 404 } 405 } 406 407 decoder, err := newMetaDecoder(idxData) 408 if err != nil { 409 return nil, err 410 } 411 if decoder.compression != options.None && useMmap { 412 y.Munmap(idxData) 413 t.indexData = nil 414 } 415 return decoder, nil 416 } 417 418 type block struct { 419 offset int 420 data []byte 421 baseKey []byte 422 423 reference int32 424 } 425 426 func OnEvict(key uint64, value interface{}) { 427 if b, ok := value.(*block); ok { 428 b.done() 429 } 430 } 431 432 func (b *block) add() (ok bool) { 433 for { 434 old := atomic.LoadInt32(&b.reference) 435 if old == 0 { 436 return false 437 } 438 new := old + 1 439 if atomic.CompareAndSwapInt32(&b.reference, old, new) { 440 return true 441 } 442 } 443 } 444 445 func (b *block) done() { 446 if b != nil && atomic.AddInt32(&b.reference, -1) == 0 { 447 buffer.PutBuffer(b.data) 448 b.data = nil 449 } 450 } 451 452 func (b *block) size() int64 { 453 return int64(intSize + len(b.data)) 454 } 455 456 func (t *Table) block(idx int, index *tableIndex) (*block, error) { 457 y.Assert(idx >= 0) 458 459 if idx >= len(index.blockEndOffsets) { 460 return &block{}, io.EOF 461 } 462 463 if t.blockCache == nil { 464 return t.loadBlock(idx, index) 465 } 466 467 key := t.blockCacheKey(idx) 468 blk, err := t.blockCache.GetOrCompute(key, func() (interface{}, int64, error) { 469 b, e := t.loadBlock(idx, index) 470 if e != nil { 471 return nil, 0, e 472 } 473 b.reference = 1 474 return b, int64(len(b.data)), nil 475 }) 476 if err != nil { 477 return &block{}, err 478 } 479 b := blk.(*block) 480 if ok := b.add(); !ok { 481 return &block{}, errors.Errorf("block is evicted") 482 } 483 return b, nil 484 } 485 486 func (t *Table) loadBlock(idx int, index *tableIndex) (*block, error) { 487 var startOffset int 488 if idx > 0 { 489 startOffset = int(index.blockEndOffsets[idx-1]) 490 } 491 blk := &block{ 492 offset: startOffset, 493 } 494 endOffset := int(index.blockEndOffsets[idx]) 495 dataLen := endOffset - startOffset 496 var err error 497 if blk.data, err = t.read(blk.offset, dataLen); err != nil { 498 return &block{}, errors.Wrapf(err, 499 "failed to read from file: %s at offset: %d, len: %d", t.fd.Name(), blk.offset, dataLen) 500 } 501 502 blk.data, err = t.compression.Decompress(blk.data) 503 if err != nil { 504 return &block{}, errors.Wrapf(err, 505 "failed to decode compressed data in file: %s at offset: %d, len: %d", 506 t.fd.Name(), blk.offset, dataLen) 507 } 508 blk.baseKey = index.baseKeys.getEntry(idx) 509 return blk, nil 510 } 511 512 // HasGlobalTs returns table does set global ts. 513 func (t *Table) HasGlobalTs() bool { 514 return t.globalTs != 0 515 } 516 517 // SetGlobalTs update the global ts of external ingested tables. 518 func (t *Table) SetGlobalTs(ts uint64) error { 519 if _, err := t.indexFd.WriteAt(u64ToBytes(ts), 0); err != nil { 520 return err 521 } 522 if err := fileutil.Fsync(t.indexFd); err != nil { 523 return err 524 } 525 t.globalTs = ts 526 return nil 527 } 528 529 func (t *Table) MarkCompacting(flag bool) { 530 if flag { 531 atomic.StoreInt32(&t.compacting, 1) 532 } 533 atomic.StoreInt32(&t.compacting, 0) 534 } 535 536 func (t *Table) IsCompacting() bool { 537 return atomic.LoadInt32(&t.compacting) == 1 538 } 539 540 func (t *Table) blockCacheKey(idx int) uint64 { 541 y.Assert(t.ID() < math.MaxUint32) 542 y.Assert(idx < math.MaxUint32) 543 return (t.ID() << 32) | uint64(idx) 544 } 545 546 // Size is its file size in bytes 547 func (t *Table) Size() int64 { return t.tableSize } 548 549 // Smallest is its smallest key, or nil if there are none 550 func (t *Table) Smallest() y.Key { return t.smallest } 551 552 // Biggest is its biggest key, or nil if there are none 553 func (t *Table) Biggest() y.Key { return t.biggest } 554 555 // Filename is NOT the file name. Just kidding, it is. 556 func (t *Table) Filename() string { return t.fd.Name() } 557 558 // ID is the table's ID number (used to make the file name). 559 func (t *Table) ID() uint64 { return t.id } 560 561 func (t *Table) HasOverlap(start, end y.Key, includeEnd bool) bool { 562 if start.Compare(t.Biggest()) > 0 { 563 return false 564 } 565 566 if cmp := end.Compare(t.Smallest()); cmp < 0 { 567 return false 568 } else if cmp == 0 { 569 return includeEnd 570 } 571 572 idx, err := t.getIndex() 573 if err != nil { 574 return true 575 } 576 577 if idx.surf != nil { 578 return idx.surf.HasOverlap(start.UserKey, end.UserKey, includeEnd) 579 } 580 581 // If there are errors occurred during seeking, 582 // we assume the table has overlapped with the range to prevent data loss. 583 it := t.newIteratorWithIdx(false, idx) 584 defer it.Close() 585 it.Seek(start.UserKey) 586 if !it.Valid() { 587 return it.Error() != nil 588 } 589 if cmp := it.Key().Compare(end); cmp > 0 { 590 return false 591 } else if cmp == 0 { 592 return includeEnd 593 } 594 return true 595 } 596 597 // ParseFileID reads the file id out of a filename. 598 func ParseFileID(name string) (uint64, bool) { 599 name = path.Base(name) 600 if !strings.HasSuffix(name, fileSuffix) { 601 return 0, false 602 } 603 // suffix := name[len(fileSuffix):] 604 name = strings.TrimSuffix(name, fileSuffix) 605 id, err := strconv.ParseUint(name, 16, 64) 606 if err != nil { 607 return 0, false 608 } 609 return id, true 610 } 611 612 // IDToFilename does the inverse of ParseFileID 613 func IDToFilename(id uint64) string { 614 return fmt.Sprintf("%08x", id) + fileSuffix 615 } 616 617 // NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table 618 // filepath. 619 func NewFilename(id uint64, dir string) string { 620 return filepath.Join(dir, IDToFilename(id)) 621 }