github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/reader.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package sstable 6 7 import ( 8 "bytes" 9 "cmp" 10 "context" 11 "encoding/binary" 12 "io" 13 "os" 14 "slices" 15 "time" 16 17 "github.com/cespare/xxhash/v2" 18 "github.com/cockroachdb/errors" 19 "github.com/cockroachdb/pebble/internal/base" 20 "github.com/cockroachdb/pebble/internal/bytealloc" 21 "github.com/cockroachdb/pebble/internal/cache" 22 "github.com/cockroachdb/pebble/internal/crc" 23 "github.com/cockroachdb/pebble/internal/invariants" 24 "github.com/cockroachdb/pebble/internal/keyspan" 25 "github.com/cockroachdb/pebble/internal/private" 26 "github.com/cockroachdb/pebble/objstorage" 27 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" 28 ) 29 30 var errCorruptIndexEntry = base.CorruptionErrorf("pebble/table: corrupt index entry") 31 var errReaderClosed = errors.New("pebble/table: reader is closed") 32 33 // decodeBlockHandle returns the block handle encoded at the start of src, as 34 // well as the number of bytes it occupies. It returns zero if given invalid 35 // input. A block handle for a data block or a first/lower level index block 36 // should not be decoded using decodeBlockHandle since the caller may validate 37 // that the number of bytes decoded is equal to the length of src, which will 38 // be false if the properties are not decoded. In those cases the caller 39 // should use decodeBlockHandleWithProperties. 40 func decodeBlockHandle(src []byte) (BlockHandle, int) { 41 offset, n := binary.Uvarint(src) 42 length, m := binary.Uvarint(src[n:]) 43 if n == 0 || m == 0 { 44 return BlockHandle{}, 0 45 } 46 return BlockHandle{offset, length}, n + m 47 } 48 49 // decodeBlockHandleWithProperties returns the block handle and properties 50 // encoded in src. 
src needs to be exactly the length that was encoded. This 51 // method must be used for data block and first/lower level index blocks. The 52 // properties in the block handle point to the bytes in src. 53 func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) { 54 bh, n := decodeBlockHandle(src) 55 if n == 0 { 56 return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle") 57 } 58 return BlockHandleWithProperties{ 59 BlockHandle: bh, 60 Props: src[n:], 61 }, nil 62 } 63 64 func encodeBlockHandle(dst []byte, b BlockHandle) int { 65 n := binary.PutUvarint(dst, b.Offset) 66 m := binary.PutUvarint(dst[n:], b.Length) 67 return n + m 68 } 69 70 func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte { 71 n := encodeBlockHandle(dst, b.BlockHandle) 72 dst = append(dst[:n], b.Props...) 73 return dst 74 } 75 76 // block is a []byte that holds a sequence of key/value pairs plus an index 77 // over those pairs. 78 type block []byte 79 80 type loadBlockResult int8 81 82 const ( 83 loadBlockOK loadBlockResult = iota 84 // Could be due to error or because no block left to load. 85 loadBlockFailed 86 loadBlockIrrelevant 87 ) 88 89 type blockTransform func([]byte) ([]byte, error) 90 91 // ReaderOption provide an interface to do work on Reader while it is being 92 // opened. 93 type ReaderOption interface { 94 // readerApply is called on the reader during opening in order to set internal 95 // parameters. 96 readerApply(*Reader) 97 } 98 99 // Comparers is a map from comparer name to comparer. It is used for debugging 100 // tools which may be used on multiple databases configured with different 101 // comparers. Comparers implements the OpenOption interface and can be passed 102 // as a parameter to NewReader. 
type Comparers map[string]*Comparer

// readerApply installs the comparer matching the table's recorded
// ComparerName, unless a comparer has already been configured.
func (c Comparers) readerApply(r *Reader) {
	// Respect an already-configured comparer, and do nothing for tables that
	// recorded no comparer name.
	if r.Compare != nil || r.Properties.ComparerName == "" {
		return
	}
	if comparer, ok := c[r.Properties.ComparerName]; ok {
		r.Compare = comparer.Compare
		r.FormatKey = comparer.FormatKey
		r.Split = comparer.Split
	}
}

// Mergers is a map from merger name to merger. It is used for debugging tools
// which may be used on multiple databases configured with different
// mergers. Mergers implements the OpenOption interface and can be passed as
// a parameter to NewReader.
type Mergers map[string]*Merger

// readerApply marks the reader's merger as recognized when the table's
// recorded MergerName is present in the map. Only the flag is recorded; the
// merger itself is not stored on the Reader.
func (m Mergers) readerApply(r *Reader) {
	if r.mergerOK || r.Properties.MergerName == "" {
		return
	}
	_, r.mergerOK = m[r.Properties.MergerName]
}

// cacheOpts is a Reader open option for specifying the cache ID and sstable file
// number. If not specified, a unique cache ID will be used.
type cacheOpts struct {
	cacheID uint64
	fileNum base.DiskFileNum
}

// Marker function to indicate the option should be applied before reading the
// sstable properties and, in the write path, before writing the default
// sstable properties.
func (c *cacheOpts) preApply() {}

// readerApply sets the cache ID and file number on the Reader, without
// overwriting values that are already non-zero.
func (c *cacheOpts) readerApply(r *Reader) {
	if r.cacheID == 0 {
		r.cacheID = c.cacheID
	}
	if r.fileNum.FileNum() == 0 {
		r.fileNum = c.fileNum
	}
}

// writerApply mirrors readerApply for the write path.
func (c *cacheOpts) writerApply(w *Writer) {
	if w.cacheID == 0 {
		w.cacheID = c.cacheID
	}
	if w.fileNum.FileNum() == 0 {
		w.fileNum = c.fileNum
	}
}

// rawTombstonesOpt is a Reader open option for specifying that range
// tombstones returned by Reader.NewRangeDelIter() should not be
// fragmented. Used by debug tools to get a raw view of the tombstones
// contained in an sstable.
type rawTombstonesOpt struct{}

// preApply marks this option as applied before the sstable properties are
// read (see cacheOpts.preApply).
func (rawTombstonesOpt) preApply() {}

// readerApply disables the v1 range-del fragmenting transform (see
// readMetaindex, which only installs the transform when rawTombstones is
// unset).
func (rawTombstonesOpt) readerApply(r *Reader) {
	r.rawTombstones = true
}

// init exposes the cache and raw-tombstone options through the private
// package so that other pebble packages can construct them without importing
// sstable internals.
func init() {
	private.SSTableCacheOpts = func(cacheID uint64, fileNum base.DiskFileNum) interface{} {
		return &cacheOpts{cacheID, fileNum}
	}
	private.SSTableRawTombstonesOpt = rawTombstonesOpt{}
}

// CommonReader abstracts functionality over a Reader or a VirtualReader. This
// can be used by code which doesn't care to distinguish between a reader and a
// virtual reader.
type CommonReader interface {
	NewRawRangeKeyIter() (keyspan.FragmentIterator, error)
	NewRawRangeDelIter() (keyspan.FragmentIterator, error)
	NewIterWithBlockPropertyFiltersAndContextEtc(
		ctx context.Context, lower, upper []byte,
		filterer *BlockPropertiesFilterer,
		hideObsoletePoints, useFilterBlock bool,
		stats *base.InternalIteratorStats,
		categoryAndQoS CategoryAndQoS,
		statsCollector *CategoryStatsCollector,
		rp ReaderProvider,
	) (Iterator, error)
	NewCompactionIter(
		bytesIterated *uint64,
		categoryAndQoS CategoryAndQoS,
		statsCollector *CategoryStatsCollector,
		rp ReaderProvider,
		bufferPool *BufferPool,
	) (Iterator, error)
	EstimateDiskUsage(start, end []byte) (uint64, error)
	CommonProperties() *CommonProperties
}

// Reader is a table reader.
type Reader struct {
	// readable is the underlying object storage handle; set to nil on Close.
	readable objstorage.Readable
	cacheID  uint64
	fileNum  base.DiskFileNum
	// err records the first error encountered; once set, public entry points
	// that check it return the error.
	err error
	// Block handles for the table's metadata blocks, populated when the table
	// is opened.
	indexBH           BlockHandle
	filterBH          BlockHandle
	rangeDelBH        BlockHandle
	rangeKeyBH        BlockHandle
	rangeDelTransform blockTransform
	valueBIH          valueBlocksIndexHandle
	propertiesBH      BlockHandle
	metaIndexBH       BlockHandle
	footerBH          BlockHandle
	opts              ReaderOptions
	Compare           Compare
	FormatKey         base.FormatKey
	Split             Split
	tableFilter       *tableFilterReader
	// Keep types that are not multiples of 8 bytes at the end and with
	// decreasing size.
	Properties    Properties
	tableFormat   TableFormat
	rawTombstones bool
	mergerOK      bool
	checksumType  ChecksumType
	// metaBufferPool is a buffer pool used exclusively when opening a table and
	// loading its meta blocks. metaBufferPoolAlloc is used to batch-allocate
	// the BufferPool.pool slice as a part of the Reader allocation. It's
	// capacity 3 to accommodate the meta block (1), and both the compressed
	// properties block (1) and decompressed properties block (1)
	// simultaneously.
	metaBufferPool      BufferPool
	metaBufferPoolAlloc [3]allocedBuffer
}

// Close implements DB.Close, as documented in the pebble package.
func (r *Reader) Close() error {
	r.opts.Cache.Unref()

	if r.readable != nil {
		r.err = firstError(r.err, r.readable.Close())
		r.readable = nil
	}

	if r.err != nil {
		return r.err
	}
	// Make any future calls to Get, NewIter or Close return an error.
	r.err = errReaderClosed
	return nil
}

// NewIterWithBlockPropertyFilters returns an iterator for the contents of the
// table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after
// itself and returns a nil iterator.
func (r *Reader) NewIterWithBlockPropertyFilters(
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	useFilterBlock bool,
	stats *base.InternalIteratorStats,
	categoryAndQoS CategoryAndQoS,
	statsCollector *CategoryStatsCollector,
	rp ReaderProvider,
) (Iterator, error) {
	// Delegates with a background context, hideObsoletePoints=false, and no
	// virtual state.
	return r.newIterWithBlockPropertyFiltersAndContext(
		context.Background(), lower, upper, filterer, false, useFilterBlock, stats,
		categoryAndQoS, statsCollector, rp, nil)
}

// NewIterWithBlockPropertyFiltersAndContextEtc is similar to
// NewIterWithBlockPropertyFilters and additionally accepts a context for
// tracing.
//
// If hideObsoletePoints, the callee assumes that filterer already includes
// obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by
// first calling TryAddBlockPropertyFilterForHideObsoletePoints.
func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc(
	ctx context.Context,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	hideObsoletePoints, useFilterBlock bool,
	stats *base.InternalIteratorStats,
	categoryAndQoS CategoryAndQoS,
	statsCollector *CategoryStatsCollector,
	rp ReaderProvider,
) (Iterator, error) {
	return r.newIterWithBlockPropertyFiltersAndContext(
		ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, categoryAndQoS,
		statsCollector, rp, nil)
}

// TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called
// before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the
// value of hideObsoletePoints and potentially add a block property filter.
func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints(
	snapshotForHideObsoletePoints uint64,
	fileLargestSeqNum uint64,
	pointKeyFilters []BlockPropertyFilter,
) (hideObsoletePoints bool, filters []BlockPropertyFilter) {
	// Obsolete-point hiding requires the v4 table format and a snapshot that
	// is newer than every sequence number in the file.
	hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 &&
		snapshotForHideObsoletePoints > fileLargestSeqNum
	if hideObsoletePoints {
		pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{})
	}
	return hideObsoletePoints, pointKeyFilters
}

// newIterWithBlockPropertyFiltersAndContext constructs either a two-level or
// single-level iterator (depending on the table's index type) from the
// corresponding sync.Pool.
func (r *Reader) newIterWithBlockPropertyFiltersAndContext(
	ctx context.Context,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	hideObsoletePoints bool,
	useFilterBlock bool,
	stats *base.InternalIteratorStats,
	categoryAndQoS CategoryAndQoS,
	statsCollector *CategoryStatsCollector,
	rp ReaderProvider,
	v *virtualState,
) (Iterator, error) {
	// NB: pebble.tableCache wraps the returned iterator with one which performs
	// reference counting on the Reader, preventing the Reader from being closed
	// until the final iterator closes.
	if r.Properties.IndexType == twoLevelIndex {
		i := twoLevelIterPool.Get().(*twoLevelIterator)
		err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats,
			categoryAndQoS, statsCollector, rp, nil /* bufferPool */)
		if err != nil {
			return nil, err
		}
		return i, nil
	}

	i := singleLevelIterPool.Get().(*singleLevelIterator)
	err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats,
		categoryAndQoS, statsCollector, rp, nil /* bufferPool */)
	if err != nil {
		return nil, err
	}
	return i, nil
}

// NewIter returns an iterator for the contents of the table. If an error
// occurs, NewIter cleans up after itself and returns a nil iterator.
// NewIter must only be used when the Reader is guaranteed to outlive any
// LazyValues returned from the iter.
func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) {
	return r.NewIterWithBlockPropertyFilters(
		lower, upper, nil, true /* useFilterBlock */, nil, /* stats */
		CategoryAndQoS{}, nil /*statsCollector */, TrivialReaderProvider{Reader: r})
}

// NewCompactionIter returns an iterator similar to NewIter but it also increments
// the number of bytes iterated. If an error occurs, NewCompactionIter cleans up
// after itself and returns a nil iterator.
func (r *Reader) NewCompactionIter(
	bytesIterated *uint64,
	categoryAndQoS CategoryAndQoS,
	statsCollector *CategoryStatsCollector,
	rp ReaderProvider,
	bufferPool *BufferPool,
) (Iterator, error) {
	return r.newCompactionIter(bytesIterated, categoryAndQoS, statsCollector, rp, nil, bufferPool)
}

// newCompactionIter constructs an unbounded, compaction-oriented iterator:
// the filter block is disabled, obsolete points are hidden only for foreign
// virtual sstables, and the caller-supplied bufferPool is used for block
// loads instead of the block cache.
func (r *Reader) newCompactionIter(
	bytesIterated *uint64,
	categoryAndQoS CategoryAndQoS,
	statsCollector *CategoryStatsCollector,
	rp ReaderProvider,
	v *virtualState,
	bufferPool *BufferPool,
) (Iterator, error) {
	if r.Properties.IndexType == twoLevelIndex {
		i := twoLevelIterPool.Get().(*twoLevelIterator)
		err := i.init(
			context.Background(),
			r, v, nil /* lower */, nil /* upper */, nil,
			false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
			nil /* stats */, categoryAndQoS, statsCollector, rp, bufferPool,
		)
		if err != nil {
			return nil, err
		}
		i.setupForCompaction()
		return &twoLevelCompactionIterator{
			twoLevelIterator: i,
			bytesIterated:    bytesIterated,
		}, nil
	}
	i := singleLevelIterPool.Get().(*singleLevelIterator)
	err := i.init(
		context.Background(), r, v, nil /* lower */, nil, /* upper */
		nil, false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
		nil /* stats */, categoryAndQoS, statsCollector, rp, bufferPool,
	)
	if err != nil {
		return nil, err
	}
	i.setupForCompaction()
	return &compactionIterator{
		singleLevelIterator: i,
		bytesIterated:       bytesIterated,
	}, nil
}

// NewRawRangeDelIter returns an internal iterator for the contents of the
// range-del block for the table. Returns nil if the table does not contain
// any range deletions.
//
// TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
// iterator. Add WithContext methods since the existing ones are public.
func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) {
	if r.rangeDelBH.Length == 0 {
		// A zero-length handle means the table has no range-del block.
		return nil, nil
	}
	h, err := r.readRangeDel(nil /* stats */, nil /* iterStats */)
	if err != nil {
		return nil, err
	}
	i := &fragmentBlockIter{elideSameSeqnum: true}
	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
		return nil, err
	}
	return i, nil
}

// NewRawRangeKeyIter returns an internal iterator for the contents of the
// range-key block for the table. Returns nil if the table does not contain any
// range keys.
//
// TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
// iterator. Add WithContext methods since the existing ones are public.
func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) {
	if r.rangeKeyBH.Length == 0 {
		// A zero-length handle means the table has no range-key block.
		return nil, nil
	}
	h, err := r.readRangeKey(nil /* stats */, nil /* iterStats */)
	if err != nil {
		return nil, err
	}
	i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter)
	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
		return nil, err
	}
	return i, nil
}

// rangeKeyFragmentBlockIter is a pooled fragmentBlockIter; Close returns it
// to rangeKeyFragmentBlockIterPool.
type rangeKeyFragmentBlockIter struct {
	fragmentBlockIter
}

// Close releases the iterator's resources and returns it to the pool for
// reuse.
func (i *rangeKeyFragmentBlockIter) Close() error {
	err := i.fragmentBlockIter.Close()
	i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse()
	rangeKeyFragmentBlockIterPool.Put(i)
	return err
}

// readIndex reads the (top-level) index block.
func (r *Reader) readIndex(
	ctx context.Context, stats *base.InternalIteratorStats, iterStats *iterStatsAccumulator,
) (bufferHandle, error) {
	ctx = objiotracing.WithBlockType(ctx, objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.indexBH, nil, nil, stats, iterStats, nil /* buffer pool */)
}

// readFilter reads the filter block.
func (r *Reader) readFilter(
	ctx context.Context, stats *base.InternalIteratorStats, iterStats *iterStatsAccumulator,
) (bufferHandle, error) {
	ctx = objiotracing.WithBlockType(ctx, objiotracing.FilterBlock)
	return r.readBlock(ctx, r.filterBH, nil /* transform */, nil /* readHandle */, stats, iterStats, nil /* buffer pool */)
}

// readRangeDel reads the range-del block, applying the v1->v2 transform when
// one was installed by readMetaindex.
func (r *Reader) readRangeDel(
	stats *base.InternalIteratorStats, iterStats *iterStatsAccumulator,
) (bufferHandle, error) {
	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.rangeDelBH, r.rangeDelTransform, nil /* readHandle */, stats, iterStats, nil /* buffer pool */)
}

// readRangeKey reads the range-key block.
func (r *Reader) readRangeKey(
	stats *base.InternalIteratorStats, iterStats *iterStatsAccumulator,
) (bufferHandle, error) {
	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.rangeKeyBH, nil /* transform */, nil /* readHandle */, stats, iterStats, nil /* buffer pool */)
}

// checkChecksum verifies the trailer checksum of a raw block. b must contain
// the block contents plus trailer; the stored checksum covers the block data
// and the one-byte block type that follows it.
func checkChecksum(
	checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum,
) error {
	expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:])
	var computedChecksum uint32
	switch checksumType {
	case ChecksumTypeCRC32c:
		computedChecksum = crc.New(b[:bh.Length+1]).Value()
	case ChecksumTypeXXHash64:
		computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1]))
	default:
		return errors.Errorf("unsupported checksum type: %d", checksumType)
	}

	if expectedChecksum != computedChecksum {
		return base.CorruptionErrorf(
			"pebble/table: invalid table %s (checksum mismatch at %d/%d)",
			errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length))
	}
	return nil
}

// cacheValueOrBuf holds block memory backed either by a BufferPool or by the
// block cache; exactly one of the two is populated.
type cacheValueOrBuf struct {
	// buf.Valid() returns true if backed by a BufferPool.
	buf Buf
	// v is non-nil if backed by the block cache.
	v *cache.Value
}

// get returns the underlying byte slice regardless of backing.
func (b cacheValueOrBuf) get() []byte {
	if b.buf.Valid() {
		return b.buf.p.pool[b.buf.i].b
	}
	return b.v.Buf()
}

// release returns the memory to whichever allocator owns it.
func (b cacheValueOrBuf) release() {
	if b.buf.Valid() {
		b.buf.Release()
	} else {
		cache.Free(b.v)
	}
}

// truncate shortens the buffer to n bytes in place.
func (b cacheValueOrBuf) truncate(n int) {
	if b.buf.Valid() {
		b.buf.p.pool[b.buf.i].b = b.buf.p.pool[b.buf.i].b[:n]
	} else {
		b.v.Truncate(n)
	}
}

// readBlock reads the block at bh, consulting the block cache first. On a
// miss it reads from readHandle (or r.readable), verifies the checksum,
// decompresses if needed, applies transform if non-nil, and — unless a
// bufferPool was supplied — inserts the result into the block cache.
func (r *Reader) readBlock(
	ctx context.Context,
	bh BlockHandle,
	transform blockTransform,
	readHandle objstorage.ReadHandle,
	stats *base.InternalIteratorStats,
	iterStats *iterStatsAccumulator,
	bufferPool *BufferPool,
) (handle bufferHandle, _ error) {
	if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil {
		// Cache hit.
		if readHandle != nil {
			readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+blockTrailerLen))
		}
		if stats != nil {
			stats.BlockBytes += bh.Length
			stats.BlockBytesInCache += bh.Length
		}
		if iterStats != nil {
			iterStats.reportStats(bh.Length, bh.Length)
		}
		// This block is already in the cache; return a handle to existing value
		// in the cache.
		return bufferHandle{h: h}, nil
	}

	// Cache miss.
	var compressed cacheValueOrBuf
	if bufferPool != nil {
		compressed = cacheValueOrBuf{
			buf: bufferPool.Alloc(int(bh.Length + blockTrailerLen)),
		}
	} else {
		compressed = cacheValueOrBuf{
			v: cache.Alloc(int(bh.Length + blockTrailerLen)),
		}
	}

	readStartTime := time.Now()
	var err error
	if readHandle != nil {
		err = readHandle.ReadAt(ctx, compressed.get(), int64(bh.Offset))
	} else {
		err = r.readable.ReadAt(ctx, compressed.get(), int64(bh.Offset))
	}
	readDuration := time.Since(readStartTime)
	// TODO(sumeer): should the threshold be configurable.
	const slowReadTracingThreshold = 5 * time.Millisecond
	// The invariants.Enabled path is for deterministic testing.
	if invariants.Enabled {
		readDuration = slowReadTracingThreshold
	}
	// Call IsTracingEnabled to avoid the allocations of boxing integers into an
	// interface{}, unless necessary.
	if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
		r.opts.LoggerAndTracer.Eventf(ctx, "reading %d bytes took %s",
			int(bh.Length+blockTrailerLen), readDuration.String())
	}
	if stats != nil {
		stats.BlockReadDuration += readDuration
	}
	if err != nil {
		compressed.release()
		return bufferHandle{}, err
	}
	if err := checkChecksum(r.checksumType, compressed.get(), bh, r.fileNum.FileNum()); err != nil {
		compressed.release()
		return bufferHandle{}, err
	}

	// The byte immediately after the block payload encodes the compression
	// type; drop the trailer before decompression.
	typ := blockType(compressed.get()[bh.Length])
	compressed.truncate(int(bh.Length))

	var decompressed cacheValueOrBuf
	if typ == noCompressionBlockType {
		decompressed = compressed
	} else {
		// Decode the length of the decompressed value.
		decodedLen, prefixLen, err := decompressedLen(typ, compressed.get())
		if err != nil {
			compressed.release()
			return bufferHandle{}, err
		}

		if bufferPool != nil {
			decompressed = cacheValueOrBuf{buf: bufferPool.Alloc(decodedLen)}
		} else {
			decompressed = cacheValueOrBuf{v: cache.Alloc(decodedLen)}
		}
		if _, err := decompressInto(typ, compressed.get()[prefixLen:], decompressed.get()); err != nil {
			compressed.release()
			return bufferHandle{}, err
		}
		compressed.release()
	}

	if transform != nil {
		// Transforming blocks is very rare, so the extra copy of the
		// transformed data is not problematic.
		tmpTransformed, err := transform(decompressed.get())
		if err != nil {
			decompressed.release()
			return bufferHandle{}, err
		}

		var transformed cacheValueOrBuf
		if bufferPool != nil {
			transformed = cacheValueOrBuf{buf: bufferPool.Alloc(len(tmpTransformed))}
		} else {
			transformed = cacheValueOrBuf{v: cache.Alloc(len(tmpTransformed))}
		}
		copy(transformed.get(), tmpTransformed)
		decompressed.release()
		decompressed = transformed
	}

	if stats != nil {
		stats.BlockBytes += bh.Length
	}
	if iterStats != nil {
		iterStats.reportStats(bh.Length, 0)
	}
	if decompressed.buf.Valid() {
		// BufferPool-backed blocks bypass the block cache entirely.
		return bufferHandle{b: decompressed.buf}, nil
	}
	h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, decompressed.v)
	return bufferHandle{h: h}, nil
}

// transformRangeDelV1 rewrites a v1 range-del block into the v2 format.
func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) {
	// Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The
	// v1 format range-del blocks have unfragmented and unsorted range
	// tombstones. We need properly fragmented and sorted range tombstones in
	// order to serve from them directly.
	iter := &blockIter{}
	if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil {
		return nil, err
	}
	var tombstones []keyspan.Span
	for key, value := iter.First(); key != nil; key, value = iter.Next() {
		t := keyspan.Span{
			Start: key.UserKey,
			End:   value.InPlaceValue(),
			Keys:  []keyspan.Key{{Trailer: key.Trailer}},
		}
		tombstones = append(tombstones, t)
	}
	keyspan.Sort(r.Compare, tombstones)

	// Fragment the tombstones, outputting them directly to a block writer.
	rangeDelBlock := blockWriter{
		restartInterval: 1,
	}
	frag := keyspan.Fragmenter{
		Cmp:    r.Compare,
		Format: r.FormatKey,
		Emit: func(s keyspan.Span) {
			for _, k := range s.Keys {
				startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer}
				rangeDelBlock.add(startIK, s.End)
			}
		},
	}
	for i := range tombstones {
		frag.Add(tombstones[i])
	}
	frag.Finish()

	// Return the contents of the constructed v2 format range-del block.
	return rangeDelBlock.finish(), nil
}

// readMetaindex reads the metaindex block and populates the Reader's block
// handles (properties, range-del, range-key, value index, filter).
func (r *Reader) readMetaindex(metaindexBH BlockHandle) error {
	// We use a BufferPool when reading metaindex blocks in order to avoid
	// populating the block cache with these blocks. In heavy-write workloads,
	// especially with high compaction concurrency, new tables may be created
	// frequently. Populating the block cache with these metaindex blocks adds
	// additional contention on the block cache mutexes (see #1997).
	// Additionally, these blocks are exceedingly unlikely to be read again
	// while they're still in the block cache except in misconfigurations with
	// excessive sstables counts or a table cache that's far too small.
	r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0])
	// When we're finished, release the buffers we've allocated back to memory
	// allocator. We don't expect to use metaBufferPool again.
	defer r.metaBufferPool.Release()

	b, err := r.readBlock(
		context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil, /* stats */
		nil /* iterStats */, &r.metaBufferPool)
	if err != nil {
		return err
	}
	data := b.Get()
	defer b.Release()

	if uint64(len(data)) != metaindexBH.Length {
		return base.CorruptionErrorf("pebble/table: unexpected metaindex block size: %d vs %d",
			errors.Safe(len(data)), errors.Safe(metaindexBH.Length))
	}

	i, err := newRawBlockIter(bytes.Compare, data)
	if err != nil {
		return err
	}

	// Collect name -> handle for every metaindex entry; the value blocks
	// index entry has its own wider encoding and is handled separately.
	meta := map[string]BlockHandle{}
	for valid := i.First(); valid; valid = i.Next() {
		value := i.Value()
		if bytes.Equal(i.Key().UserKey, []byte(metaValueIndexName)) {
			vbih, n, err := decodeValueBlocksIndexHandle(i.Value())
			if err != nil {
				return err
			}
			if n == 0 || n != len(value) {
				return base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)")
			}
			r.valueBIH = vbih
		} else {
			bh, n := decodeBlockHandle(value)
			if n == 0 || n != len(value) {
				return base.CorruptionErrorf("pebble/table: invalid table (bad block handle)")
			}
			meta[string(i.Key().UserKey)] = bh
		}
	}
	if err := i.Close(); err != nil {
		return err
	}

	if bh, ok := meta[metaPropertiesName]; ok {
		b, err = r.readBlock(
			context.Background(), bh, nil /* transform */, nil /* readHandle */, nil, /* stats */
			nil /* iterStats */, nil /* buffer pool */)
		if err != nil {
			return err
		}
		r.propertiesBH = bh
		err := r.Properties.load(b.Get(), bh.Offset, r.opts.DeniedUserProperties)
		b.Release()
		if err != nil {
			return err
		}
	}

	// Prefer the v2 range-del block; fall back to v1 with the fragmenting
	// transform unless raw tombstones were requested.
	if bh, ok := meta[metaRangeDelV2Name]; ok {
		r.rangeDelBH = bh
	} else if bh, ok := meta[metaRangeDelName]; ok {
		r.rangeDelBH = bh
		if !r.rawTombstones {
			r.rangeDelTransform = r.transformRangeDelV1
		}
	}

	if bh, ok := meta[metaRangeKeyName]; ok {
		r.rangeKeyBH = bh
	}

	// Find the first configured filter policy that has a matching
	// "fullfilter.<name>" entry in the metaindex.
	for name, fp := range r.opts.Filters {
		types := []struct {
			ftype  FilterType
			prefix string
		}{
			{TableFilter, "fullfilter."},
		}
		var done bool
		for _, t := range types {
			if bh, ok := meta[t.prefix+name]; ok {
				r.filterBH = bh

				switch t.ftype {
				case TableFilter:
					r.tableFilter = newTableFilterReader(fp)
				default:
					return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype))
				}

				done = true
				break
			}
		}
		if done {
			break
		}
	}
	return nil
}

// Layout returns the layout (block organization) for an sstable.
func (r *Reader) Layout() (*Layout, error) {
	if r.err != nil {
		return nil, r.err
	}

	l := &Layout{
		Data:       make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks),
		Filter:     r.filterBH,
		RangeDel:   r.rangeDelBH,
		RangeKey:   r.rangeKeyBH,
		ValueIndex: r.valueBIH.h,
		Properties: r.propertiesBH,
		MetaIndex:  r.metaIndexBH,
		Footer:     r.footerBH,
		Format:     r.tableFormat,
	}

	indexH, err := r.readIndex(context.Background(), nil, nil)
	if err != nil {
		return nil, err
	}
	defer indexH.Release()

	var alloc bytealloc.A

	if r.Properties.IndexPartitions == 0 {
		// Single-level index: every index entry points at a data block.
		l.Index = append(l.Index, r.indexBH)
		iter, _ := newBlockIter(r.Compare, indexH.Get())
		for key, value := iter.First(); key != nil; key, value = iter.Next() {
			dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
			if err != nil {
				return nil, errCorruptIndexEntry
			}
			if len(dataBH.Props) > 0 {
				alloc, dataBH.Props = alloc.Copy(dataBH.Props)
			}
			l.Data = append(l.Data, dataBH)
		}
	} else {
		// Two-level index: walk the top index, then each lower index block.
		l.TopIndex = r.indexBH
		topIter, _ := newBlockIter(r.Compare, indexH.Get())
		iter := &blockIter{}
		for key, value := topIter.First(); key != nil; key, value = topIter.Next() {
			indexBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
			if err != nil {
				return nil, errCorruptIndexEntry
			}
			l.Index = append(l.Index, indexBH.BlockHandle)

			subIndex, err := r.readBlock(context.Background(), indexBH.BlockHandle,
				nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */)
			if err != nil {
				return nil, err
			}
			if err := iter.init(r.Compare, subIndex.Get(), 0, /* globalSeqNum */
				false /* hideObsoletePoints */); err != nil {
				return nil, err
			}
			for key, value := iter.First(); key != nil; key, value = iter.Next() {
				dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
				// NOTE(review): dataBH.Props is inspected before err is
				// checked. This is harmless because dataBH is the zero value
				// on error (Props is empty), but checking err first would
				// match the single-level branch above — confirm and tidy.
				if len(dataBH.Props) > 0 {
					alloc, dataBH.Props = alloc.Copy(dataBH.Props)
				}
				if err != nil {
					return nil, errCorruptIndexEntry
				}
				l.Data = append(l.Data, dataBH)
			}
			subIndex.Release()
			*iter = iter.resetForReuse()
		}
	}
	if r.valueBIH.h.Length != 0 {
		// Decode the fixed-width entries of the value blocks index into
		// individual value-block handles.
		vbiH, err := r.readBlock(context.Background(), r.valueBIH.h, nil, nil, nil, nil, nil /* buffer pool */)
		if err != nil {
			return nil, err
		}
		defer vbiH.Release()
		vbiBlock := vbiH.Get()
		indexEntryLen := int(r.valueBIH.blockNumByteLength + r.valueBIH.blockOffsetByteLength +
			r.valueBIH.blockLengthByteLength)
		i := 0
		for len(vbiBlock) != 0 {
			if len(vbiBlock) < indexEntryLen {
				return nil, errors.Errorf(
					"remaining value index block %d does not contain a full entry of length %d",
					len(vbiBlock), indexEntryLen)
			}
			n := int(r.valueBIH.blockNumByteLength)
			bn := int(littleEndianGet(vbiBlock, n))
			if bn != i {
				return nil, errors.Errorf("unexpected block num %d, expected %d",
					bn, i)
			}
			i++
			vbiBlock = vbiBlock[n:]
			n = int(r.valueBIH.blockOffsetByteLength)
			blockOffset := littleEndianGet(vbiBlock, n)
			vbiBlock = vbiBlock[n:]
			n = int(r.valueBIH.blockLengthByteLength)
			blockLen := littleEndianGet(vbiBlock, n)
			vbiBlock = vbiBlock[n:]
			l.ValueBlock = append(l.ValueBlock, BlockHandle{Offset: blockOffset, Length: blockLen})
		}
	}

	return l, nil
}

// ValidateBlockChecksums validates the checksums for each block in the SSTable.
func (r *Reader) ValidateBlockChecksums() error {
	// Pre-compute the BlockHandles for the underlying file.
	l, err := r.Layout()
	if err != nil {
		return err
	}

	// Construct the set of blocks to check. Note that the footer is not checked
	// as it is not a block with a checksum.
	blocks := make([]BlockHandle, len(l.Data))
	for i := range l.Data {
		blocks[i] = l.Data[i].BlockHandle
	}
	blocks = append(blocks, l.Index...)
	blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex)

	// Sorting by offset ensures we are performing a sequential scan of the
	// file.
	slices.SortFunc(blocks, func(a, b BlockHandle) int {
		return cmp.Compare(a.Offset, b.Offset)
	})

	// Check all blocks sequentially. Make use of read-ahead, given we are
	// scanning the entire file from start to end.
	rh := r.readable.NewReadHandle(context.TODO())
	defer rh.Close()

	for _, bh := range blocks {
		// Certain blocks may not be present, in which case we skip them.
		if bh.Length == 0 {
			continue
		}

		// Read the block, which validates the checksum.
		h, err := r.readBlock(context.Background(), bh, nil, rh, nil, nil /* iterStats */, nil /* buffer pool */)
		if err != nil {
			return err
		}
		h.Release()
	}

	return nil
}

// CommonProperties implements the CommonReader interface.
func (r *Reader) CommonProperties() *CommonProperties {
	return &r.Properties.CommonProperties
}

// EstimateDiskUsage returns the total size of data blocks overlapping the range
// `[start, end]`.
// Even if a data block partially overlaps, or we cannot
// determine overlap due to abbreviated index keys, the full data block size is
// included in the estimation.
//
// This function does not account for any metablock space usage. Assumes there
// is at least partial overlap, i.e., `[start, end]` falls neither completely
// before nor completely after the file's range.
//
// Only blocks containing point keys are considered. Range deletion and range
// key blocks are not considered.
//
// TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of
// data blocks overlapped and add that same fraction of the metadata blocks to the
// estimate.
func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) {
	if r.err != nil {
		return 0, r.err
	}

	indexH, err := r.readIndex(context.Background(), nil, nil)
	if err != nil {
		return 0, err
	}
	defer indexH.Release()

	// Iterators over the bottom-level index blocks containing start and end.
	// These may be different in case of partitioned index but will both point
	// to the same blockIter over the single index in the unpartitioned case.
	var startIdxIter, endIdxIter *blockIter
	if r.Properties.IndexPartitions == 0 {
		// Single-level index: one iterator serves both endpoints.
		iter, err := newBlockIter(r.Compare, indexH.Get())
		if err != nil {
			return 0, err
		}
		startIdxIter = iter
		endIdxIter = iter
	} else {
		// Two-level index: seek the top-level index to find the lower-level
		// index blocks containing start and end respectively.
		topIter, err := newBlockIter(r.Compare, indexH.Get())
		if err != nil {
			return 0, err
		}

		key, val := topIter.SeekGE(start, base.SeekGEFlagsNone)
		if key == nil {
			// The range falls completely after this file, or an error occurred.
			return 0, topIter.Error()
		}
		startIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
		if err != nil {
			return 0, errCorruptIndexEntry
		}
		startIdxBlock, err := r.readBlock(context.Background(), startIdxBH.BlockHandle,
			nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */)
		if err != nil {
			return 0, err
		}
		// Released when EstimateDiskUsage returns; the iterator below aliases
		// the block's memory, so the handle must outlive it.
		defer startIdxBlock.Release()
		startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get())
		if err != nil {
			return 0, err
		}

		key, val = topIter.SeekGE(end, base.SeekGEFlagsNone)
		if key == nil {
			// end is past the last index entry: leave endIdxIter nil, which is
			// handled below by extending the estimate to the end of the file.
			if err := topIter.Error(); err != nil {
				return 0, err
			}
		} else {
			endIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
			if err != nil {
				return 0, errCorruptIndexEntry
			}
			endIdxBlock, err := r.readBlock(context.Background(),
				endIdxBH.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */)
			if err != nil {
				return 0, err
			}
			defer endIdxBlock.Release()
			endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get())
			if err != nil {
				return 0, err
			}
		}
	}
	// startIdxIter should not be nil at this point, while endIdxIter can be if the
	// range spans past the end of the file.

	key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone)
	if key == nil {
		// The range falls completely after this file, or an error occurred.
		return 0, startIdxIter.Error()
	}
	startBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
	if err != nil {
		return 0, errCorruptIndexEntry
	}

	includeInterpolatedValueBlocksSize := func(dataBlockSize uint64) uint64 {
		// INVARIANT: r.Properties.DataSize > 0 since startIdxIter is not nil.
		// Linearly interpolate what is stored in value blocks.
		//
		// TODO(sumeer): if we need more accuracy, without loading any data blocks
		// (which contain the value handles, and which may also be insufficient if
		// the values are in separate files), we will need to accumulate the
		// logical size of the key-value pairs and store the cumulative value for
		// each data block in the index block entry. This increases the size of
		// the BlockHandle, so wait until this becomes necessary.
		return dataBlockSize +
			uint64((float64(dataBlockSize)/float64(r.Properties.DataSize))*
				float64(r.Properties.ValueBlocksSize))
	}
	if endIdxIter == nil {
		// The range spans beyond this file. Include data blocks through the last.
		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
	}
	key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone)
	if key == nil {
		if err := endIdxIter.Error(); err != nil {
			return 0, err
		}
		// The range spans beyond this file. Include data blocks through the last.
		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
	}
	endBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
	if err != nil {
		return 0, errCorruptIndexEntry
	}
	// Include the end block's trailer so the full on-disk footprint of the
	// overlapped span is counted.
	return includeInterpolatedValueBlocksSize(
		endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset), nil
}

// TableFormat returns the format version for the table.
func (r *Reader) TableFormat() (TableFormat, error) {
	if r.err != nil {
		return TableFormatUnspecified, r.err
	}
	return r.tableFormat, nil
}

// NewReader returns a new table reader for the file. Closing the reader will
// close the file.
func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) {
	o = o.ensureDefaults()
	r := &Reader{
		readable: f,
		opts:     o,
	}
	// Either adopt the caller's cache (taking a ref released by Close) or
	// create a private one.
	if r.opts.Cache == nil {
		r.opts.Cache = cache.New(0)
	} else {
		r.opts.Cache.Ref()
	}

	if f == nil {
		// r.Close releases the cache ref taken above and returns r.err.
		r.err = errors.New("pebble/table: nil file")
		return nil, r.Close()
	}

	// Note that the extra options are applied twice. First here for pre-apply
	// options, and then below for post-apply options. Pre and post refer to
	// before and after reading the metaindex and properties.
	type preApply interface{ preApply() }
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); ok {
			opt.readerApply(r)
		}
	}
	// A pre-apply option may have set cacheID; otherwise allocate a fresh one.
	if r.cacheID == 0 {
		r.cacheID = r.opts.Cache.NewID()
	}

	footer, err := readFooter(f)
	if err != nil {
		r.err = err
		return nil, r.Close()
	}
	r.checksumType = footer.checksum
	r.tableFormat = footer.format
	// Read the metaindex.
	if err := r.readMetaindex(footer.metaindexBH); err != nil {
		r.err = err
		return nil, r.Close()
	}
	r.indexBH = footer.indexBH
	r.metaIndexBH = footer.metaindexBH
	r.footerBH = footer.footerBH

	// Only install the comparer if it matches the one the table was written
	// with (or the table recorded none). Otherwise r.Compare stays nil and is
	// reported as an "unknown comparer" error below, unless a post-apply
	// option supplies it.
	if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName {
		r.Compare = o.Comparer.Compare
		r.FormatKey = o.Comparer.FormatKey
		r.Split = o.Comparer.Split
	}

	if o.MergerName == r.Properties.MergerName {
		r.mergerOK = true
	}

	// Apply the extra options again now that the comparer and merger names are
	// known.
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); !ok {
			opt.readerApply(r)
		}
	}

	if r.Compare == nil {
		r.err = errors.Errorf("pebble/table: %d: unknown comparer %s",
			errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName))
	}
	if !r.mergerOK {
		// "nullptr" is tolerated for compatibility; presumably written by
		// tables with no merger configured — TODO confirm.
		if name := r.Properties.MergerName; name != "" && name != "nullptr" {
			r.err = errors.Errorf("pebble/table: %d: unknown merger %s",
				errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName))
		}
	}
	if r.err != nil {
		return nil, r.Close()
	}

	return r, nil
}

// ReadableFile describes the smallest subset of vfs.File that is required for
// reading SSTs.
type ReadableFile interface {
	io.ReaderAt
	io.Closer
	Stat() (os.FileInfo, error)
}

// NewSimpleReadable wraps a ReadableFile in a objstorage.Readable
// implementation (which does not support read-ahead)
func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error) {
	// The size is captured once via Stat; the file is assumed not to grow
	// while being read.
	info, err := r.Stat()
	if err != nil {
		return nil, err
	}
	res := &simpleReadable{
		f:    r,
		size: info.Size(),
	}
	res.rh = objstorage.MakeNoopReadHandle(res)
	return res, nil
}

// simpleReadable wraps a ReadableFile to implement objstorage.Readable.
type simpleReadable struct {
	f    ReadableFile
	size int64 // file size captured at construction time
	rh   objstorage.NoopReadHandle
}

// Compile-time interface satisfaction check.
var _ objstorage.Readable = (*simpleReadable)(nil)

// ReadAt is part of the objstorage.Readable interface.
func (s *simpleReadable) ReadAt(_ context.Context, p []byte, off int64) error {
	n, err := s.f.ReadAt(p, off)
	// objstorage.Readable requires full reads; a short read with a nil error
	// is a contract violation, so panic under invariants builds.
	if invariants.Enabled && err == nil && n != len(p) {
		panic("short read")
	}
	return err
}

// Close is part of the objstorage.Readable interface.
func (s *simpleReadable) Close() error {
	// Closing the readable closes the underlying file.
	return s.f.Close()
}

// Size is part of the objstorage.Readable interface.
func (s *simpleReadable) Size() int64 {
	// Returns the size captured by Stat at construction time.
	return s.size
}

// NewReadHandle is part of the objstorage.Readable interface.
func (s *simpleReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle {
	// A no-op handle: simpleReadable does not support read-ahead, so every
	// caller shares the same stateless handle.
	return &s.rh
}