github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/reader.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package sstable 6 7 import ( 8 "bytes" 9 "context" 10 "encoding/binary" 11 "io" 12 "os" 13 "sort" 14 "time" 15 16 "github.com/cespare/xxhash/v2" 17 "github.com/cockroachdb/errors" 18 "github.com/cockroachdb/pebble/internal/base" 19 "github.com/cockroachdb/pebble/internal/bytealloc" 20 "github.com/cockroachdb/pebble/internal/cache" 21 "github.com/cockroachdb/pebble/internal/crc" 22 "github.com/cockroachdb/pebble/internal/invariants" 23 "github.com/cockroachdb/pebble/internal/keyspan" 24 "github.com/cockroachdb/pebble/internal/private" 25 "github.com/cockroachdb/pebble/objstorage" 26 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" 27 ) 28 29 var errCorruptIndexEntry = base.CorruptionErrorf("pebble/table: corrupt index entry") 30 var errReaderClosed = errors.New("pebble/table: reader is closed") 31 32 // decodeBlockHandle returns the block handle encoded at the start of src, as 33 // well as the number of bytes it occupies. It returns zero if given invalid 34 // input. A block handle for a data block or a first/lower level index block 35 // should not be decoded using decodeBlockHandle since the caller may validate 36 // that the number of bytes decoded is equal to the length of src, which will 37 // be false if the properties are not decoded. In those cases the caller 38 // should use decodeBlockHandleWithProperties. 39 func decodeBlockHandle(src []byte) (BlockHandle, int) { 40 offset, n := binary.Uvarint(src) 41 length, m := binary.Uvarint(src[n:]) 42 if n == 0 || m == 0 { 43 return BlockHandle{}, 0 44 } 45 return BlockHandle{offset, length}, n + m 46 } 47 48 // decodeBlockHandleWithProperties returns the block handle and properties 49 // encoded in src. 
src needs to be exactly the length that was encoded. This 50 // method must be used for data block and first/lower level index blocks. The 51 // properties in the block handle point to the bytes in src. 52 func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) { 53 bh, n := decodeBlockHandle(src) 54 if n == 0 { 55 return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle") 56 } 57 return BlockHandleWithProperties{ 58 BlockHandle: bh, 59 Props: src[n:], 60 }, nil 61 } 62 63 func encodeBlockHandle(dst []byte, b BlockHandle) int { 64 n := binary.PutUvarint(dst, b.Offset) 65 m := binary.PutUvarint(dst[n:], b.Length) 66 return n + m 67 } 68 69 func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte { 70 n := encodeBlockHandle(dst, b.BlockHandle) 71 dst = append(dst[:n], b.Props...) 72 return dst 73 } 74 75 // block is a []byte that holds a sequence of key/value pairs plus an index 76 // over those pairs. 77 type block []byte 78 79 type loadBlockResult int8 80 81 const ( 82 loadBlockOK loadBlockResult = iota 83 // Could be due to error or because no block left to load. 84 loadBlockFailed 85 loadBlockIrrelevant 86 ) 87 88 type blockTransform func([]byte) ([]byte, error) 89 90 // ReaderOption provide an interface to do work on Reader while it is being 91 // opened. 92 type ReaderOption interface { 93 // readerApply is called on the reader during opening in order to set internal 94 // parameters. 95 readerApply(*Reader) 96 } 97 98 // Comparers is a map from comparer name to comparer. It is used for debugging 99 // tools which may be used on multiple databases configured with different 100 // comparers. Comparers implements the OpenOption interface and can be passed 101 // as a parameter to NewReader. 
102 type Comparers map[string]*Comparer 103 104 func (c Comparers) readerApply(r *Reader) { 105 if r.Compare != nil || r.Properties.ComparerName == "" { 106 return 107 } 108 if comparer, ok := c[r.Properties.ComparerName]; ok { 109 r.Compare = comparer.Compare 110 r.FormatKey = comparer.FormatKey 111 r.Split = comparer.Split 112 } 113 } 114 115 // Mergers is a map from merger name to merger. It is used for debugging tools 116 // which may be used on multiple databases configured with different 117 // mergers. Mergers implements the OpenOption interface and can be passed as 118 // a parameter to NewReader. 119 type Mergers map[string]*Merger 120 121 func (m Mergers) readerApply(r *Reader) { 122 if r.mergerOK || r.Properties.MergerName == "" { 123 return 124 } 125 _, r.mergerOK = m[r.Properties.MergerName] 126 } 127 128 // cacheOpts is a Reader open option for specifying the cache ID and sstable file 129 // number. If not specified, a unique cache ID will be used. 130 type cacheOpts struct { 131 cacheID uint64 132 fileNum base.DiskFileNum 133 } 134 135 // Marker function to indicate the option should be applied before reading the 136 // sstable properties and, in the write path, before writing the default 137 // sstable properties. 138 func (c *cacheOpts) preApply() {} 139 140 func (c *cacheOpts) readerApply(r *Reader) { 141 if r.cacheID == 0 { 142 r.cacheID = c.cacheID 143 } 144 if r.fileNum.FileNum() == 0 { 145 r.fileNum = c.fileNum 146 } 147 } 148 149 func (c *cacheOpts) writerApply(w *Writer) { 150 if w.cacheID == 0 { 151 w.cacheID = c.cacheID 152 } 153 if w.fileNum.FileNum() == 0 { 154 w.fileNum = c.fileNum 155 } 156 } 157 158 // rawTombstonesOpt is a Reader open option for specifying that range 159 // tombstones returned by Reader.NewRangeDelIter() should not be 160 // fragmented. Used by debug tools to get a raw view of the tombstones 161 // contained in an sstable. 
162 type rawTombstonesOpt struct{} 163 164 func (rawTombstonesOpt) preApply() {} 165 166 func (rawTombstonesOpt) readerApply(r *Reader) { 167 r.rawTombstones = true 168 } 169 170 func init() { 171 private.SSTableCacheOpts = func(cacheID uint64, fileNum base.DiskFileNum) interface{} { 172 return &cacheOpts{cacheID, fileNum} 173 } 174 private.SSTableRawTombstonesOpt = rawTombstonesOpt{} 175 } 176 177 // CommonReader abstracts functionality over a Reader or a VirtualReader. This 178 // can be used by code which doesn't care to distinguish between a reader and a 179 // virtual reader. 180 type CommonReader interface { 181 NewRawRangeKeyIter() (keyspan.FragmentIterator, error) 182 NewRawRangeDelIter() (keyspan.FragmentIterator, error) 183 NewIterWithBlockPropertyFiltersAndContextEtc( 184 ctx context.Context, lower, upper []byte, 185 filterer *BlockPropertiesFilterer, 186 hideObsoletePoints, useFilterBlock bool, 187 stats *base.InternalIteratorStats, 188 rp ReaderProvider, 189 ) (Iterator, error) 190 NewCompactionIter( 191 bytesIterated *uint64, 192 rp ReaderProvider, 193 bufferPool *BufferPool, 194 ) (Iterator, error) 195 EstimateDiskUsage(start, end []byte) (uint64, error) 196 CommonProperties() *CommonProperties 197 } 198 199 // Reader is a table reader. 200 type Reader struct { 201 readable objstorage.Readable 202 cacheID uint64 203 fileNum base.DiskFileNum 204 err error 205 indexBH BlockHandle 206 filterBH BlockHandle 207 rangeDelBH BlockHandle 208 rangeKeyBH BlockHandle 209 rangeDelTransform blockTransform 210 valueBIH valueBlocksIndexHandle 211 propertiesBH BlockHandle 212 metaIndexBH BlockHandle 213 footerBH BlockHandle 214 opts ReaderOptions 215 Compare Compare 216 FormatKey base.FormatKey 217 Split Split 218 tableFilter *tableFilterReader 219 // Keep types that are not multiples of 8 bytes at the end and with 220 // decreasing size. 
221 Properties Properties 222 tableFormat TableFormat 223 rawTombstones bool 224 mergerOK bool 225 checksumType ChecksumType 226 // metaBufferPool is a buffer pool used exclusively when opening a table and 227 // loading its meta blocks. metaBufferPoolAlloc is used to batch-allocate 228 // the BufferPool.pool slice as a part of the Reader allocation. It's 229 // capacity 3 to accommodate the meta block (1), and both the compressed 230 // properties block (1) and decompressed properties block (1) 231 // simultaneously. 232 metaBufferPool BufferPool 233 metaBufferPoolAlloc [3]allocedBuffer 234 } 235 236 // Close implements DB.Close, as documented in the pebble package. 237 func (r *Reader) Close() error { 238 r.opts.Cache.Unref() 239 240 if r.readable != nil { 241 r.err = firstError(r.err, r.readable.Close()) 242 r.readable = nil 243 } 244 245 if r.err != nil { 246 return r.err 247 } 248 // Make any future calls to Get, NewIter or Close return an error. 249 r.err = errReaderClosed 250 return nil 251 } 252 253 // NewIterWithBlockPropertyFilters returns an iterator for the contents of the 254 // table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after 255 // itself and returns a nil iterator. 256 func (r *Reader) NewIterWithBlockPropertyFilters( 257 lower, upper []byte, 258 filterer *BlockPropertiesFilterer, 259 useFilterBlock bool, 260 stats *base.InternalIteratorStats, 261 rp ReaderProvider, 262 ) (Iterator, error) { 263 return r.newIterWithBlockPropertyFiltersAndContext( 264 context.Background(), 265 lower, upper, filterer, false, useFilterBlock, stats, rp, nil, 266 ) 267 } 268 269 // NewIterWithBlockPropertyFiltersAndContextEtc is similar to 270 // NewIterWithBlockPropertyFilters and additionally accepts a context for 271 // tracing. 272 // 273 // If hideObsoletePoints, the callee assumes that filterer already includes 274 // obsoleteKeyBlockPropertyFilter. 
The caller can satisfy this contract by 275 // first calling TryAddBlockPropertyFilterForHideObsoletePoints. 276 func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc( 277 ctx context.Context, 278 lower, upper []byte, 279 filterer *BlockPropertiesFilterer, 280 hideObsoletePoints, useFilterBlock bool, 281 stats *base.InternalIteratorStats, 282 rp ReaderProvider, 283 ) (Iterator, error) { 284 return r.newIterWithBlockPropertyFiltersAndContext( 285 ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, nil, 286 ) 287 } 288 289 // TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called 290 // before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the 291 // value of hideObsoletePoints and potentially add a block property filter. 292 func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints( 293 snapshotForHideObsoletePoints uint64, 294 fileLargestSeqNum uint64, 295 pointKeyFilters []BlockPropertyFilter, 296 ) (hideObsoletePoints bool, filters []BlockPropertyFilter) { 297 hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 && 298 snapshotForHideObsoletePoints > fileLargestSeqNum 299 if hideObsoletePoints { 300 pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{}) 301 } 302 return hideObsoletePoints, pointKeyFilters 303 } 304 305 func (r *Reader) newIterWithBlockPropertyFiltersAndContext( 306 ctx context.Context, 307 lower, upper []byte, 308 filterer *BlockPropertiesFilterer, 309 hideObsoletePoints bool, 310 useFilterBlock bool, 311 stats *base.InternalIteratorStats, 312 rp ReaderProvider, 313 v *virtualState, 314 ) (Iterator, error) { 315 // NB: pebble.tableCache wraps the returned iterator with one which performs 316 // reference counting on the Reader, preventing the Reader from being closed 317 // until the final iterator closes. 
318 if r.Properties.IndexType == twoLevelIndex { 319 i := twoLevelIterPool.Get().(*twoLevelIterator) 320 err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */) 321 if err != nil { 322 return nil, err 323 } 324 return i, nil 325 } 326 327 i := singleLevelIterPool.Get().(*singleLevelIterator) 328 err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */) 329 if err != nil { 330 return nil, err 331 } 332 return i, nil 333 } 334 335 // NewIter returns an iterator for the contents of the table. If an error 336 // occurs, NewIter cleans up after itself and returns a nil iterator. NewIter 337 // must only be used when the Reader is guaranteed to outlive any LazyValues 338 // returned from the iter. 339 func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) { 340 return r.NewIterWithBlockPropertyFilters( 341 lower, upper, nil, true /* useFilterBlock */, nil, /* stats */ 342 TrivialReaderProvider{Reader: r}) 343 } 344 345 // NewCompactionIter returns an iterator similar to NewIter but it also increments 346 // the number of bytes iterated. If an error occurs, NewCompactionIter cleans up 347 // after itself and returns a nil iterator. 
348 func (r *Reader) NewCompactionIter( 349 bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool, 350 ) (Iterator, error) { 351 return r.newCompactionIter(bytesIterated, rp, nil, bufferPool) 352 } 353 354 func (r *Reader) newCompactionIter( 355 bytesIterated *uint64, rp ReaderProvider, v *virtualState, bufferPool *BufferPool, 356 ) (Iterator, error) { 357 if r.Properties.IndexType == twoLevelIndex { 358 i := twoLevelIterPool.Get().(*twoLevelIterator) 359 err := i.init( 360 context.Background(), 361 r, v, nil /* lower */, nil /* upper */, nil, 362 false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */ 363 nil /* stats */, rp, bufferPool, 364 ) 365 if err != nil { 366 return nil, err 367 } 368 i.setupForCompaction() 369 return &twoLevelCompactionIterator{ 370 twoLevelIterator: i, 371 bytesIterated: bytesIterated, 372 }, nil 373 } 374 i := singleLevelIterPool.Get().(*singleLevelIterator) 375 err := i.init( 376 context.Background(), r, v, nil /* lower */, nil, /* upper */ 377 nil, false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */ 378 nil /* stats */, rp, bufferPool, 379 ) 380 if err != nil { 381 return nil, err 382 } 383 i.setupForCompaction() 384 return &compactionIterator{ 385 singleLevelIterator: i, 386 bytesIterated: bytesIterated, 387 }, nil 388 } 389 390 // NewRawRangeDelIter returns an internal iterator for the contents of the 391 // range-del block for the table. Returns nil if the table does not contain 392 // any range deletions. 393 // 394 // TODO(sumeer): plumb context.Context since this path is relevant in the user-facing 395 // iterator. Add WithContext methods since the existing ones are public. 
396 func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { 397 if r.rangeDelBH.Length == 0 { 398 return nil, nil 399 } 400 h, err := r.readRangeDel(nil /* stats */) 401 if err != nil { 402 return nil, err 403 } 404 i := &fragmentBlockIter{elideSameSeqnum: true} 405 if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil { 406 return nil, err 407 } 408 return i, nil 409 } 410 411 // NewRawRangeKeyIter returns an internal iterator for the contents of the 412 // range-key block for the table. Returns nil if the table does not contain any 413 // range keys. 414 // 415 // TODO(sumeer): plumb context.Context since this path is relevant in the user-facing 416 // iterator. Add WithContext methods since the existing ones are public. 417 func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { 418 if r.rangeKeyBH.Length == 0 { 419 return nil, nil 420 } 421 h, err := r.readRangeKey(nil /* stats */) 422 if err != nil { 423 return nil, err 424 } 425 i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter) 426 if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil { 427 return nil, err 428 } 429 return i, nil 430 } 431 432 type rangeKeyFragmentBlockIter struct { 433 fragmentBlockIter 434 } 435 436 func (i *rangeKeyFragmentBlockIter) Close() error { 437 err := i.fragmentBlockIter.Close() 438 i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse() 439 rangeKeyFragmentBlockIterPool.Put(i) 440 return err 441 } 442 443 func (r *Reader) readIndex( 444 ctx context.Context, stats *base.InternalIteratorStats, 445 ) (bufferHandle, error) { 446 ctx = objiotracing.WithBlockType(ctx, objiotracing.MetadataBlock) 447 return r.readBlock(ctx, r.indexBH, nil, nil, stats, nil /* buffer pool */) 448 } 449 450 func (r *Reader) readFilter( 451 ctx context.Context, stats *base.InternalIteratorStats, 452 ) (bufferHandle, error) { 453 ctx = objiotracing.WithBlockType(ctx, 
objiotracing.FilterBlock) 454 return r.readBlock(ctx, r.filterBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */) 455 } 456 457 func (r *Reader) readRangeDel(stats *base.InternalIteratorStats) (bufferHandle, error) { 458 ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock) 459 return r.readBlock(ctx, r.rangeDelBH, r.rangeDelTransform, nil /* readHandle */, stats, nil /* buffer pool */) 460 } 461 462 func (r *Reader) readRangeKey(stats *base.InternalIteratorStats) (bufferHandle, error) { 463 ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock) 464 return r.readBlock(ctx, r.rangeKeyBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */) 465 } 466 467 func checkChecksum( 468 checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum, 469 ) error { 470 expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:]) 471 var computedChecksum uint32 472 switch checksumType { 473 case ChecksumTypeCRC32c: 474 computedChecksum = crc.New(b[:bh.Length+1]).Value() 475 case ChecksumTypeXXHash64: 476 computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1])) 477 default: 478 return errors.Errorf("unsupported checksum type: %d", checksumType) 479 } 480 481 if expectedChecksum != computedChecksum { 482 return base.CorruptionErrorf( 483 "pebble/table: invalid table %s (checksum mismatch at %d/%d)", 484 errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length)) 485 } 486 return nil 487 } 488 489 type cacheValueOrBuf struct { 490 // buf.Valid() returns true if backed by a BufferPool. 491 buf Buf 492 // v is non-nil if backed by the block cache. 
493 v *cache.Value 494 } 495 496 func (b cacheValueOrBuf) get() []byte { 497 if b.buf.Valid() { 498 return b.buf.p.pool[b.buf.i].b 499 } 500 return b.v.Buf() 501 } 502 503 func (b cacheValueOrBuf) release() { 504 if b.buf.Valid() { 505 b.buf.Release() 506 } else { 507 cache.Free(b.v) 508 } 509 } 510 511 func (b cacheValueOrBuf) truncate(n int) { 512 if b.buf.Valid() { 513 b.buf.p.pool[b.buf.i].b = b.buf.p.pool[b.buf.i].b[:n] 514 } else { 515 b.v.Truncate(n) 516 } 517 } 518 519 func (r *Reader) readBlock( 520 ctx context.Context, 521 bh BlockHandle, 522 transform blockTransform, 523 readHandle objstorage.ReadHandle, 524 stats *base.InternalIteratorStats, 525 bufferPool *BufferPool, 526 ) (handle bufferHandle, _ error) { 527 if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil { 528 // Cache hit. 529 if readHandle != nil { 530 readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+blockTrailerLen)) 531 } 532 if stats != nil { 533 stats.BlockBytes += bh.Length 534 stats.BlockBytesInCache += bh.Length 535 } 536 // This block is already in the cache; return a handle to existing vlaue 537 // in the cache. 538 return bufferHandle{h: h}, nil 539 } 540 541 // Cache miss. 542 var compressed cacheValueOrBuf 543 if bufferPool != nil { 544 compressed = cacheValueOrBuf{ 545 buf: bufferPool.Alloc(int(bh.Length + blockTrailerLen)), 546 } 547 } else { 548 compressed = cacheValueOrBuf{ 549 v: cache.Alloc(int(bh.Length + blockTrailerLen)), 550 } 551 } 552 553 readStartTime := time.Now() 554 var err error 555 if readHandle != nil { 556 err = readHandle.ReadAt(ctx, compressed.get(), int64(bh.Offset)) 557 } else { 558 err = r.readable.ReadAt(ctx, compressed.get(), int64(bh.Offset)) 559 } 560 readDuration := time.Since(readStartTime) 561 // TODO(sumeer): should the threshold be configurable. 562 const slowReadTracingThreshold = 5 * time.Millisecond 563 // The invariants.Enabled path is for deterministic testing. 
564 if invariants.Enabled { 565 readDuration = slowReadTracingThreshold 566 } 567 // Call IsTracingEnabled to avoid the allocations of boxing integers into an 568 // interface{}, unless necessary. 569 if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) { 570 r.opts.LoggerAndTracer.Eventf(ctx, "reading %d bytes took %s", 571 int(bh.Length+blockTrailerLen), readDuration.String()) 572 } 573 if stats != nil { 574 stats.BlockReadDuration += readDuration 575 } 576 if err != nil { 577 compressed.release() 578 return bufferHandle{}, err 579 } 580 if err := checkChecksum(r.checksumType, compressed.get(), bh, r.fileNum.FileNum()); err != nil { 581 compressed.release() 582 return bufferHandle{}, err 583 } 584 585 typ := blockType(compressed.get()[bh.Length]) 586 compressed.truncate(int(bh.Length)) 587 588 var decompressed cacheValueOrBuf 589 if typ == noCompressionBlockType { 590 decompressed = compressed 591 } else { 592 // Decode the length of the decompressed value. 593 decodedLen, prefixLen, err := decompressedLen(typ, compressed.get()) 594 if err != nil { 595 compressed.release() 596 return bufferHandle{}, err 597 } 598 599 if bufferPool != nil { 600 decompressed = cacheValueOrBuf{buf: bufferPool.Alloc(decodedLen)} 601 } else { 602 decompressed = cacheValueOrBuf{v: cache.Alloc(decodedLen)} 603 } 604 if _, err := decompressInto(typ, compressed.get()[prefixLen:], decompressed.get()); err != nil { 605 compressed.release() 606 return bufferHandle{}, err 607 } 608 compressed.release() 609 } 610 611 if transform != nil { 612 // Transforming blocks is very rare, so the extra copy of the 613 // transformed data is not problematic. 
614 tmpTransformed, err := transform(decompressed.get()) 615 if err != nil { 616 decompressed.release() 617 return bufferHandle{}, err 618 } 619 620 var transformed cacheValueOrBuf 621 if bufferPool != nil { 622 transformed = cacheValueOrBuf{buf: bufferPool.Alloc(len(tmpTransformed))} 623 } else { 624 transformed = cacheValueOrBuf{v: cache.Alloc(len(tmpTransformed))} 625 } 626 copy(transformed.get(), tmpTransformed) 627 decompressed.release() 628 decompressed = transformed 629 } 630 631 if stats != nil { 632 stats.BlockBytes += bh.Length 633 } 634 if decompressed.buf.Valid() { 635 return bufferHandle{b: decompressed.buf}, nil 636 } 637 h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, decompressed.v) 638 return bufferHandle{h: h}, nil 639 } 640 641 func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { 642 // Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The 643 // v1 format range-del blocks have unfragmented and unsorted range 644 // tombstones. We need properly fragmented and sorted range tombstones in 645 // order to serve from them directly. 646 iter := &blockIter{} 647 if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil { 648 return nil, err 649 } 650 var tombstones []keyspan.Span 651 for key, value := iter.First(); key != nil; key, value = iter.Next() { 652 t := keyspan.Span{ 653 Start: key.UserKey, 654 End: value.InPlaceValue(), 655 Keys: []keyspan.Key{{Trailer: key.Trailer}}, 656 } 657 tombstones = append(tombstones, t) 658 } 659 keyspan.Sort(r.Compare, tombstones) 660 661 // Fragment the tombstones, outputting them directly to a block writer. 
662 rangeDelBlock := blockWriter{ 663 restartInterval: 1, 664 } 665 frag := keyspan.Fragmenter{ 666 Cmp: r.Compare, 667 Format: r.FormatKey, 668 Emit: func(s keyspan.Span) { 669 for _, k := range s.Keys { 670 startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer} 671 rangeDelBlock.add(startIK, s.End) 672 } 673 }, 674 } 675 for i := range tombstones { 676 frag.Add(tombstones[i]) 677 } 678 frag.Finish() 679 680 // Return the contents of the constructed v2 format range-del block. 681 return rangeDelBlock.finish(), nil 682 } 683 684 func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { 685 // We use a BufferPool when reading metaindex blocks in order to avoid 686 // populating the block cache with these blocks. In heavy-write workloads, 687 // especially with high compaction concurrency, new tables may be created 688 // frequently. Populating the block cache with these metaindex blocks adds 689 // additional contention on the block cache mutexes (see #1997). 690 // Additionally, these blocks are exceedingly unlikely to be read again 691 // while they're still in the block cache except in misconfigurations with 692 // excessive sstables counts or a table cache that's far too small. 693 r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0]) 694 // When we're finished, release the buffers we've allocated back to memory 695 // allocator. We don't expect to use metaBufferPool again. 
696 defer r.metaBufferPool.Release() 697 698 b, err := r.readBlock( 699 context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, &r.metaBufferPool) 700 if err != nil { 701 return err 702 } 703 data := b.Get() 704 defer b.Release() 705 706 if uint64(len(data)) != metaindexBH.Length { 707 return base.CorruptionErrorf("pebble/table: unexpected metaindex block size: %d vs %d", 708 errors.Safe(len(data)), errors.Safe(metaindexBH.Length)) 709 } 710 711 i, err := newRawBlockIter(bytes.Compare, data) 712 if err != nil { 713 return err 714 } 715 716 meta := map[string]BlockHandle{} 717 for valid := i.First(); valid; valid = i.Next() { 718 value := i.Value() 719 if bytes.Equal(i.Key().UserKey, []byte(metaValueIndexName)) { 720 vbih, n, err := decodeValueBlocksIndexHandle(i.Value()) 721 if err != nil { 722 return err 723 } 724 if n == 0 || n != len(value) { 725 return base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)") 726 } 727 r.valueBIH = vbih 728 } else { 729 bh, n := decodeBlockHandle(value) 730 if n == 0 || n != len(value) { 731 return base.CorruptionErrorf("pebble/table: invalid table (bad block handle)") 732 } 733 meta[string(i.Key().UserKey)] = bh 734 } 735 } 736 if err := i.Close(); err != nil { 737 return err 738 } 739 740 if bh, ok := meta[metaPropertiesName]; ok { 741 b, err = r.readBlock( 742 context.Background(), bh, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) 743 if err != nil { 744 return err 745 } 746 r.propertiesBH = bh 747 err := r.Properties.load(b.Get(), bh.Offset, r.opts.DeniedUserProperties) 748 b.Release() 749 if err != nil { 750 return err 751 } 752 } 753 754 if bh, ok := meta[metaRangeDelV2Name]; ok { 755 r.rangeDelBH = bh 756 } else if bh, ok := meta[metaRangeDelName]; ok { 757 r.rangeDelBH = bh 758 if !r.rawTombstones { 759 r.rangeDelTransform = r.transformRangeDelV1 760 } 761 } 762 763 if bh, ok := meta[metaRangeKeyName]; ok { 764 
r.rangeKeyBH = bh 765 } 766 767 for name, fp := range r.opts.Filters { 768 types := []struct { 769 ftype FilterType 770 prefix string 771 }{ 772 {TableFilter, "fullfilter."}, 773 } 774 var done bool 775 for _, t := range types { 776 if bh, ok := meta[t.prefix+name]; ok { 777 r.filterBH = bh 778 779 switch t.ftype { 780 case TableFilter: 781 r.tableFilter = newTableFilterReader(fp) 782 default: 783 return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype)) 784 } 785 786 done = true 787 break 788 } 789 } 790 if done { 791 break 792 } 793 } 794 return nil 795 } 796 797 // Layout returns the layout (block organization) for an sstable. 798 func (r *Reader) Layout() (*Layout, error) { 799 if r.err != nil { 800 return nil, r.err 801 } 802 803 l := &Layout{ 804 Data: make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks), 805 Filter: r.filterBH, 806 RangeDel: r.rangeDelBH, 807 RangeKey: r.rangeKeyBH, 808 ValueIndex: r.valueBIH.h, 809 Properties: r.propertiesBH, 810 MetaIndex: r.metaIndexBH, 811 Footer: r.footerBH, 812 Format: r.tableFormat, 813 } 814 815 indexH, err := r.readIndex(context.Background(), nil) 816 if err != nil { 817 return nil, err 818 } 819 defer indexH.Release() 820 821 var alloc bytealloc.A 822 823 if r.Properties.IndexPartitions == 0 { 824 l.Index = append(l.Index, r.indexBH) 825 iter, _ := newBlockIter(r.Compare, indexH.Get()) 826 for key, value := iter.First(); key != nil; key, value = iter.Next() { 827 dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue()) 828 if err != nil { 829 return nil, errCorruptIndexEntry 830 } 831 if len(dataBH.Props) > 0 { 832 alloc, dataBH.Props = alloc.Copy(dataBH.Props) 833 } 834 l.Data = append(l.Data, dataBH) 835 } 836 } else { 837 l.TopIndex = r.indexBH 838 topIter, _ := newBlockIter(r.Compare, indexH.Get()) 839 iter := &blockIter{} 840 for key, value := topIter.First(); key != nil; key, value = topIter.Next() { 841 indexBH, err := 
decodeBlockHandleWithProperties(value.InPlaceValue()) 842 if err != nil { 843 return nil, errCorruptIndexEntry 844 } 845 l.Index = append(l.Index, indexBH.BlockHandle) 846 847 subIndex, err := r.readBlock(context.Background(), indexBH.BlockHandle, 848 nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) 849 if err != nil { 850 return nil, err 851 } 852 if err := iter.init(r.Compare, subIndex.Get(), 0, /* globalSeqNum */ 853 false /* hideObsoletePoints */); err != nil { 854 return nil, err 855 } 856 for key, value := iter.First(); key != nil; key, value = iter.Next() { 857 dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue()) 858 if len(dataBH.Props) > 0 { 859 alloc, dataBH.Props = alloc.Copy(dataBH.Props) 860 } 861 if err != nil { 862 return nil, errCorruptIndexEntry 863 } 864 l.Data = append(l.Data, dataBH) 865 } 866 subIndex.Release() 867 *iter = iter.resetForReuse() 868 } 869 } 870 if r.valueBIH.h.Length != 0 { 871 vbiH, err := r.readBlock(context.Background(), r.valueBIH.h, nil, nil, nil, nil /* buffer pool */) 872 if err != nil { 873 return nil, err 874 } 875 defer vbiH.Release() 876 vbiBlock := vbiH.Get() 877 indexEntryLen := int(r.valueBIH.blockNumByteLength + r.valueBIH.blockOffsetByteLength + 878 r.valueBIH.blockLengthByteLength) 879 i := 0 880 for len(vbiBlock) != 0 { 881 if len(vbiBlock) < indexEntryLen { 882 return nil, errors.Errorf( 883 "remaining value index block %d does not contain a full entry of length %d", 884 len(vbiBlock), indexEntryLen) 885 } 886 n := int(r.valueBIH.blockNumByteLength) 887 bn := int(littleEndianGet(vbiBlock, n)) 888 if bn != i { 889 return nil, errors.Errorf("unexpected block num %d, expected %d", 890 bn, i) 891 } 892 i++ 893 vbiBlock = vbiBlock[n:] 894 n = int(r.valueBIH.blockOffsetByteLength) 895 blockOffset := littleEndianGet(vbiBlock, n) 896 vbiBlock = vbiBlock[n:] 897 n = int(r.valueBIH.blockLengthByteLength) 898 blockLen := littleEndianGet(vbiBlock, n) 899 vbiBlock = 
vbiBlock[n:] 900 l.ValueBlock = append(l.ValueBlock, BlockHandle{Offset: blockOffset, Length: blockLen}) 901 } 902 } 903 904 return l, nil 905 } 906 907 // ValidateBlockChecksums validates the checksums for each block in the SSTable. 908 func (r *Reader) ValidateBlockChecksums() error { 909 // Pre-compute the BlockHandles for the underlying file. 910 l, err := r.Layout() 911 if err != nil { 912 return err 913 } 914 915 // Construct the set of blocks to check. Note that the footer is not checked 916 // as it is not a block with a checksum. 917 blocks := make([]BlockHandle, len(l.Data)) 918 for i := range l.Data { 919 blocks[i] = l.Data[i].BlockHandle 920 } 921 blocks = append(blocks, l.Index...) 922 blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex) 923 924 // Sorting by offset ensures we are performing a sequential scan of the 925 // file. 926 sort.Slice(blocks, func(i, j int) bool { 927 return blocks[i].Offset < blocks[j].Offset 928 }) 929 930 // Check all blocks sequentially. Make use of read-ahead, given we are 931 // scanning the entire file from start to end. 932 rh := r.readable.NewReadHandle(context.TODO()) 933 defer rh.Close() 934 935 for _, bh := range blocks { 936 // Certain blocks may not be present, in which case we skip them. 937 if bh.Length == 0 { 938 continue 939 } 940 941 // Read the block, which validates the checksum. 942 h, err := r.readBlock(context.Background(), bh, nil, rh, nil, nil /* buffer pool */) 943 if err != nil { 944 return err 945 } 946 h.Release() 947 } 948 949 return nil 950 } 951 952 // CommonProperties implemented the CommonReader interface. 953 func (r *Reader) CommonProperties() *CommonProperties { 954 return &r.Properties.CommonProperties 955 } 956 957 // EstimateDiskUsage returns the total size of data blocks overlapping the range 958 // `[start, end]`. 
Even if a data block partially overlaps, or we cannot 959 // determine overlap due to abbreviated index keys, the full data block size is 960 // included in the estimation. 961 // 962 // This function does not account for any metablock space usage. Assumes there 963 // is at least partial overlap, i.e., `[start, end]` falls neither completely 964 // before nor completely after the file's range. 965 // 966 // Only blocks containing point keys are considered. Range deletion and range 967 // key blocks are not considered. 968 // 969 // TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of 970 // data blocks overlapped and add that same fraction of the metadata blocks to the 971 // estimate. 972 func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) { 973 if r.err != nil { 974 return 0, r.err 975 } 976 977 indexH, err := r.readIndex(context.Background(), nil) 978 if err != nil { 979 return 0, err 980 } 981 defer indexH.Release() 982 983 // Iterators over the bottom-level index blocks containing start and end. 984 // These may be different in case of partitioned index but will both point 985 // to the same blockIter over the single index in the unpartitioned case. 986 var startIdxIter, endIdxIter *blockIter 987 if r.Properties.IndexPartitions == 0 { 988 iter, err := newBlockIter(r.Compare, indexH.Get()) 989 if err != nil { 990 return 0, err 991 } 992 startIdxIter = iter 993 endIdxIter = iter 994 } else { 995 topIter, err := newBlockIter(r.Compare, indexH.Get()) 996 if err != nil { 997 return 0, err 998 } 999 1000 key, val := topIter.SeekGE(start, base.SeekGEFlagsNone) 1001 if key == nil { 1002 // The range falls completely after this file, or an error occurred. 
1003 return 0, topIter.Error() 1004 } 1005 startIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue()) 1006 if err != nil { 1007 return 0, errCorruptIndexEntry 1008 } 1009 startIdxBlock, err := r.readBlock(context.Background(), startIdxBH.BlockHandle, 1010 nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) 1011 if err != nil { 1012 return 0, err 1013 } 1014 defer startIdxBlock.Release() 1015 startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get()) 1016 if err != nil { 1017 return 0, err 1018 } 1019 1020 key, val = topIter.SeekGE(end, base.SeekGEFlagsNone) 1021 if key == nil { 1022 if err := topIter.Error(); err != nil { 1023 return 0, err 1024 } 1025 } else { 1026 endIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue()) 1027 if err != nil { 1028 return 0, errCorruptIndexEntry 1029 } 1030 endIdxBlock, err := r.readBlock(context.Background(), 1031 endIdxBH.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) 1032 if err != nil { 1033 return 0, err 1034 } 1035 defer endIdxBlock.Release() 1036 endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get()) 1037 if err != nil { 1038 return 0, err 1039 } 1040 } 1041 } 1042 // startIdxIter should not be nil at this point, while endIdxIter can be if the 1043 // range spans past the end of the file. 1044 1045 key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone) 1046 if key == nil { 1047 // The range falls completely after this file, or an error occurred. 1048 return 0, startIdxIter.Error() 1049 } 1050 startBH, err := decodeBlockHandleWithProperties(val.InPlaceValue()) 1051 if err != nil { 1052 return 0, errCorruptIndexEntry 1053 } 1054 1055 includeInterpolatedValueBlocksSize := func(dataBlockSize uint64) uint64 { 1056 // INVARIANT: r.Properties.DataSize > 0 since startIdxIter is not nil. 1057 // Linearly interpolate what is stored in value blocks. 
1058 // 1059 // TODO(sumeer): if we need more accuracy, without loading any data blocks 1060 // (which contain the value handles, and which may also be insufficient if 1061 // the values are in separate files), we will need to accumulate the 1062 // logical size of the key-value pairs and store the cumulative value for 1063 // each data block in the index block entry. This increases the size of 1064 // the BlockHandle, so wait until this becomes necessary. 1065 return dataBlockSize + 1066 uint64((float64(dataBlockSize)/float64(r.Properties.DataSize))* 1067 float64(r.Properties.ValueBlocksSize)) 1068 } 1069 if endIdxIter == nil { 1070 // The range spans beyond this file. Include data blocks through the last. 1071 return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil 1072 } 1073 key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone) 1074 if key == nil { 1075 if err := endIdxIter.Error(); err != nil { 1076 return 0, err 1077 } 1078 // The range spans beyond this file. Include data blocks through the last. 1079 return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil 1080 } 1081 endBH, err := decodeBlockHandleWithProperties(val.InPlaceValue()) 1082 if err != nil { 1083 return 0, errCorruptIndexEntry 1084 } 1085 return includeInterpolatedValueBlocksSize( 1086 endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset), nil 1087 } 1088 1089 // TableFormat returns the format version for the table. 1090 func (r *Reader) TableFormat() (TableFormat, error) { 1091 if r.err != nil { 1092 return TableFormatUnspecified, r.err 1093 } 1094 return r.tableFormat, nil 1095 } 1096 1097 // NewReader returns a new table reader for the file. Closing the reader will 1098 // close the file. 
1099 func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) { 1100 o = o.ensureDefaults() 1101 r := &Reader{ 1102 readable: f, 1103 opts: o, 1104 } 1105 if r.opts.Cache == nil { 1106 r.opts.Cache = cache.New(0) 1107 } else { 1108 r.opts.Cache.Ref() 1109 } 1110 1111 if f == nil { 1112 r.err = errors.New("pebble/table: nil file") 1113 return nil, r.Close() 1114 } 1115 1116 // Note that the extra options are applied twice. First here for pre-apply 1117 // options, and then below for post-apply options. Pre and post refer to 1118 // before and after reading the metaindex and properties. 1119 type preApply interface{ preApply() } 1120 for _, opt := range extraOpts { 1121 if _, ok := opt.(preApply); ok { 1122 opt.readerApply(r) 1123 } 1124 } 1125 if r.cacheID == 0 { 1126 r.cacheID = r.opts.Cache.NewID() 1127 } 1128 1129 footer, err := readFooter(f) 1130 if err != nil { 1131 r.err = err 1132 return nil, r.Close() 1133 } 1134 r.checksumType = footer.checksum 1135 r.tableFormat = footer.format 1136 // Read the metaindex. 1137 if err := r.readMetaindex(footer.metaindexBH); err != nil { 1138 r.err = err 1139 return nil, r.Close() 1140 } 1141 r.indexBH = footer.indexBH 1142 r.metaIndexBH = footer.metaindexBH 1143 r.footerBH = footer.footerBH 1144 1145 if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName { 1146 r.Compare = o.Comparer.Compare 1147 r.FormatKey = o.Comparer.FormatKey 1148 r.Split = o.Comparer.Split 1149 } 1150 1151 if o.MergerName == r.Properties.MergerName { 1152 r.mergerOK = true 1153 } 1154 1155 // Apply the extra options again now that the comparer and merger names are 1156 // known. 
1157 for _, opt := range extraOpts { 1158 if _, ok := opt.(preApply); !ok { 1159 opt.readerApply(r) 1160 } 1161 } 1162 1163 if r.Compare == nil { 1164 r.err = errors.Errorf("pebble/table: %d: unknown comparer %s", 1165 errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName)) 1166 } 1167 if !r.mergerOK { 1168 if name := r.Properties.MergerName; name != "" && name != "nullptr" { 1169 r.err = errors.Errorf("pebble/table: %d: unknown merger %s", 1170 errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName)) 1171 } 1172 } 1173 if r.err != nil { 1174 return nil, r.Close() 1175 } 1176 1177 return r, nil 1178 } 1179 1180 // ReadableFile describes the smallest subset of vfs.File that is required for 1181 // reading SSTs. 1182 type ReadableFile interface { 1183 io.ReaderAt 1184 io.Closer 1185 Stat() (os.FileInfo, error) 1186 } 1187 1188 // NewSimpleReadable wraps a ReadableFile in a objstorage.Readable 1189 // implementation (which does not support read-ahead) 1190 func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error) { 1191 info, err := r.Stat() 1192 if err != nil { 1193 return nil, err 1194 } 1195 res := &simpleReadable{ 1196 f: r, 1197 size: info.Size(), 1198 } 1199 res.rh = objstorage.MakeNoopReadHandle(res) 1200 return res, nil 1201 } 1202 1203 // simpleReadable wraps a ReadableFile to implement objstorage.Readable. 1204 type simpleReadable struct { 1205 f ReadableFile 1206 size int64 1207 rh objstorage.NoopReadHandle 1208 } 1209 1210 var _ objstorage.Readable = (*simpleReadable)(nil) 1211 1212 // ReadAt is part of the objstorage.Readable interface. 1213 func (s *simpleReadable) ReadAt(_ context.Context, p []byte, off int64) error { 1214 n, err := s.f.ReadAt(p, off) 1215 if invariants.Enabled && err == nil && n != len(p) { 1216 panic("short read") 1217 } 1218 return err 1219 } 1220 1221 // Close is part of the objstorage.Readable interface. 
func (s *simpleReadable) Close() error {
	// Closing the readable closes the wrapped file.
	return s.f.Close()
}

// Size is part of the objstorage.Readable interface. It returns the value
// stored in the size field.
func (s *simpleReadable) Size() int64 {
	return s.size
}

// NewReadHandle is part of the objstorage.Readable interface. Every call
// returns a pointer to the same rh field; the context argument is ignored.
func (s *simpleReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle {
	return &s.rh
}