github.com/cockroachdb/pebble@v1.1.2/sstable/reader.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"bytes"
	"context"
	"encoding/binary"
	"io"
	"os"
	"sort"
	"time"

	"github.com/cespare/xxhash/v2"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/bytealloc"
	"github.com/cockroachdb/pebble/internal/cache"
	"github.com/cockroachdb/pebble/internal/crc"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/private"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
)

var errCorruptIndexEntry = base.CorruptionErrorf("pebble/table: corrupt index entry")
var errReaderClosed = errors.New("pebble/table: reader is closed")

// decodeBlockHandle returns the block handle encoded at the start of src, as
// well as the number of bytes it occupies. It returns zero if given invalid
// input. A block handle for a data block or a first/lower level index block
// should not be decoded using decodeBlockHandle since the caller may validate
// that the number of bytes decoded is equal to the length of src, which will
// be false if the properties are not decoded. In those cases the caller
// should use decodeBlockHandleWithProperties.
func decodeBlockHandle(src []byte) (BlockHandle, int) {
	offset, n := binary.Uvarint(src)
	length, m := binary.Uvarint(src[n:])
	if n == 0 || m == 0 {
		return BlockHandle{}, 0
	}
	return BlockHandle{offset, length}, n + m
}

// decodeBlockHandleWithProperties returns the block handle and properties
// encoded in src. src needs to be exactly the length that was encoded. This
// method must be used for data block and first/lower level index blocks. The
// properties in the block handle point to the bytes in src.
func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) {
	bh, n := decodeBlockHandle(src)
	if n == 0 {
		return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle")
	}
	return BlockHandleWithProperties{
		BlockHandle: bh,
		Props:       src[n:],
	}, nil
}

func encodeBlockHandle(dst []byte, b BlockHandle) int {
	n := binary.PutUvarint(dst, b.Offset)
	m := binary.PutUvarint(dst[n:], b.Length)
	return n + m
}

func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte {
	n := encodeBlockHandle(dst, b.BlockHandle)
	dst = append(dst[:n], b.Props...)
	return dst
}

// block is a []byte that holds a sequence of key/value pairs plus an index
// over those pairs.
type block []byte

type loadBlockResult int8

const (
	loadBlockOK loadBlockResult = iota
	// Could be due to error or because no block left to load.
	loadBlockFailed
	loadBlockIrrelevant
)

type blockTransform func([]byte) ([]byte, error)

// ReaderOption provides an interface to do work on Reader while it is being
// opened.
type ReaderOption interface {
	// readerApply is called on the reader during opening in order to set internal
	// parameters.
	readerApply(*Reader)
}
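
// exampleBlockHandleRoundTrip is an illustrative sketch, not part of the
// original file: it shows how a BlockHandle round-trips through
// encodeBlockHandle/decodeBlockHandle as two uvarints (offset, then length).
// Two binary.MaxVarintLen64 fields bound the destination buffer size.
func exampleBlockHandleRoundTrip() {
	var buf [2 * binary.MaxVarintLen64]byte
	n := encodeBlockHandle(buf[:], BlockHandle{Offset: 7, Length: 4096})
	bh, m := decodeBlockHandle(buf[:n])
	if bh.Offset != 7 || bh.Length != 4096 || m != n {
		panic("block handle did not round-trip")
	}
	// A data-block handle with trailing properties would instead be decoded
	// with decodeBlockHandleWithProperties, which returns the remaining bytes
	// of src as Props.
}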
// Comparers is a map from comparer name to comparer. It is used for debugging
// tools which may be used on multiple databases configured with different
// comparers. Comparers implements the ReaderOption interface and can be
// passed as a parameter to NewReader.
type Comparers map[string]*Comparer

func (c Comparers) readerApply(r *Reader) {
	if r.Compare != nil || r.Properties.ComparerName == "" {
		return
	}
	if comparer, ok := c[r.Properties.ComparerName]; ok {
		r.Compare = comparer.Compare
		r.FormatKey = comparer.FormatKey
		r.Split = comparer.Split
	}
}

// Mergers is a map from merger name to merger. It is used for debugging tools
// which may be used on multiple databases configured with different
// mergers. Mergers implements the ReaderOption interface and can be passed as
// a parameter to NewReader.
type Mergers map[string]*Merger

func (m Mergers) readerApply(r *Reader) {
	if r.mergerOK || r.Properties.MergerName == "" {
		return
	}
	_, r.mergerOK = m[r.Properties.MergerName]
}

// cacheOpts is a Reader open option for specifying the cache ID and sstable file
// number. If not specified, a unique cache ID will be used.
type cacheOpts struct {
	cacheID uint64
	fileNum base.DiskFileNum
}

// Marker function to indicate the option should be applied before reading the
// sstable properties and, in the write path, before writing the default
// sstable properties.
func (c *cacheOpts) preApply() {}

func (c *cacheOpts) readerApply(r *Reader) {
	if r.cacheID == 0 {
		r.cacheID = c.cacheID
	}
	if r.fileNum.FileNum() == 0 {
		r.fileNum = c.fileNum
	}
}

func (c *cacheOpts) writerApply(w *Writer) {
	if w.cacheID == 0 {
		w.cacheID = c.cacheID
	}
	if w.fileNum.FileNum() == 0 {
		w.fileNum = c.fileNum
	}
}

// rawTombstonesOpt is a Reader open option for specifying that range
// tombstones returned by Reader.NewRawRangeDelIter() should not be
// fragmented. Used by debug tools to get a raw view of the tombstones
// contained in an sstable.
type rawTombstonesOpt struct{}

func (rawTombstonesOpt) preApply() {}

func (rawTombstonesOpt) readerApply(r *Reader) {
	r.rawTombstones = true
}

func init() {
	private.SSTableCacheOpts = func(cacheID uint64, fileNum base.DiskFileNum) interface{} {
		return &cacheOpts{cacheID, fileNum}
	}
	private.SSTableRawTombstonesOpt = rawTombstonesOpt{}
}

// CommonReader abstracts functionality over a Reader or a VirtualReader. This
// can be used by code which doesn't care to distinguish between a reader and a
// virtual reader.
type CommonReader interface {
	NewRawRangeKeyIter() (keyspan.FragmentIterator, error)
	NewRawRangeDelIter() (keyspan.FragmentIterator, error)
	NewIterWithBlockPropertyFiltersAndContextEtc(
		ctx context.Context, lower, upper []byte,
		filterer *BlockPropertiesFilterer,
		hideObsoletePoints, useFilterBlock bool,
		stats *base.InternalIteratorStats,
		rp ReaderProvider,
	) (Iterator, error)
	NewCompactionIter(
		bytesIterated *uint64,
		rp ReaderProvider,
		bufferPool *BufferPool,
	) (Iterator, error)
	EstimateDiskUsage(start, end []byte) (uint64, error)
	CommonProperties() *CommonProperties
}
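
// exampleOpenForDebugging is an illustrative sketch, not part of the original
// file: a debugging tool that opens tables from databases with unknown
// configurations can pass Comparers and Mergers maps so that whatever
// comparer/merger names are recorded in the table's properties can be
// resolved. The use of the defaults and a zero ReaderOptions here is an
// assumption for illustration.
func exampleOpenForDebugging(readable objstorage.Readable) (*Reader, error) {
	comparers := Comparers{base.DefaultComparer.Name: base.DefaultComparer}
	mergers := Mergers{base.DefaultMerger.Name: base.DefaultMerger}
	return NewReader(readable, ReaderOptions{}, comparers, mergers)
}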
// Reader is a table reader.
type Reader struct {
	readable          objstorage.Readable
	cacheID           uint64
	fileNum           base.DiskFileNum
	err               error
	indexBH           BlockHandle
	filterBH          BlockHandle
	rangeDelBH        BlockHandle
	rangeKeyBH        BlockHandle
	rangeDelTransform blockTransform
	valueBIH          valueBlocksIndexHandle
	propertiesBH      BlockHandle
	metaIndexBH       BlockHandle
	footerBH          BlockHandle
	opts              ReaderOptions
	Compare           Compare
	FormatKey         base.FormatKey
	Split             Split
	tableFilter       *tableFilterReader
	// Keep types that are not multiples of 8 bytes at the end and with
	// decreasing size.
	Properties    Properties
	tableFormat   TableFormat
	rawTombstones bool
	mergerOK      bool
	checksumType  ChecksumType
	// metaBufferPool is a buffer pool used exclusively when opening a table and
	// loading its meta blocks. metaBufferPoolAlloc is used to batch-allocate
	// the BufferPool.pool slice as a part of the Reader allocation. Its
	// capacity is 3 to accommodate the meta block (1), and both the compressed
	// properties block (1) and decompressed properties block (1)
	// simultaneously.
	metaBufferPool      BufferPool
	metaBufferPoolAlloc [3]allocedBuffer
}

// Close implements DB.Close, as documented in the pebble package.
func (r *Reader) Close() error {
	r.opts.Cache.Unref()

	if r.readable != nil {
		r.err = firstError(r.err, r.readable.Close())
		r.readable = nil
	}

	if r.err != nil {
		return r.err
	}
	// Make any future calls to Get, NewIter or Close return an error.
	r.err = errReaderClosed
	return nil
}

// NewIterWithBlockPropertyFilters returns an iterator for the contents of the
// table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after
// itself and returns a nil iterator.
func (r *Reader) NewIterWithBlockPropertyFilters(
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
) (Iterator, error) {
	return r.newIterWithBlockPropertyFiltersAndContext(
		context.Background(),
		lower, upper, filterer, false, useFilterBlock, stats, rp, nil,
	)
}

// NewIterWithBlockPropertyFiltersAndContextEtc is similar to
// NewIterWithBlockPropertyFilters and additionally accepts a context for
// tracing.
//
// If hideObsoletePoints, the callee assumes that filterer already includes
// obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by
// first calling TryAddBlockPropertyFilterForHideObsoletePoints.
func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc(
	ctx context.Context,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	hideObsoletePoints, useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
) (Iterator, error) {
	return r.newIterWithBlockPropertyFiltersAndContext(
		ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, nil,
	)
}
// TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called
// before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the
// value of hideObsoletePoints and potentially add a block property filter.
func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints(
	snapshotForHideObsoletePoints uint64,
	fileLargestSeqNum uint64,
	pointKeyFilters []BlockPropertyFilter,
) (hideObsoletePoints bool, filters []BlockPropertyFilter) {
	hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 &&
		snapshotForHideObsoletePoints > fileLargestSeqNum
	if hideObsoletePoints {
		pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{})
	}
	return hideObsoletePoints, pointKeyFilters
}

func (r *Reader) newIterWithBlockPropertyFiltersAndContext(
	ctx context.Context,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	hideObsoletePoints bool,
	useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
	v *virtualState,
) (Iterator, error) {
	// NB: pebble.tableCache wraps the returned iterator with one which performs
	// reference counting on the Reader, preventing the Reader from being closed
	// until the final iterator closes.
	if r.Properties.IndexType == twoLevelIndex {
		i := twoLevelIterPool.Get().(*twoLevelIterator)
		err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */)
		if err != nil {
			return nil, err
		}
		return i, nil
	}

	i := singleLevelIterPool.Get().(*singleLevelIterator)
	err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */)
	if err != nil {
		return nil, err
	}
	return i, nil
}

// NewIter returns an iterator for the contents of the table. If an error
// occurs, NewIter cleans up after itself and returns a nil iterator. NewIter
// must only be used when the Reader is guaranteed to outlive any LazyValues
// returned from the iter.
func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) {
	return r.NewIterWithBlockPropertyFilters(
		lower, upper, nil, true /* useFilterBlock */, nil, /* stats */
		TrivialReaderProvider{Reader: r})
}
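
// exampleScan is an illustrative sketch, not part of the original file: it
// scans the point keys in [lower, upper) using NewIter. Per the contract
// above, the Reader must outlive any LazyValues returned by the iterator, so
// values are materialized inside the loop.
func exampleScan(r *Reader, lower, upper []byte) error {
	iter, err := r.NewIter(lower, upper)
	if err != nil {
		return err
	}
	defer iter.Close()
	for key, lazyValue := iter.First(); key != nil; key, lazyValue = iter.Next() {
		value, _, err := lazyValue.Value(nil)
		if err != nil {
			return err
		}
		_ = value // process key/value here
	}
	return iter.Error()
}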
// NewCompactionIter returns an iterator similar to NewIter but it also increments
// the number of bytes iterated. If an error occurs, NewCompactionIter cleans up
// after itself and returns a nil iterator.
func (r *Reader) NewCompactionIter(
	bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool,
) (Iterator, error) {
	return r.newCompactionIter(bytesIterated, rp, nil, bufferPool)
}

func (r *Reader) newCompactionIter(
	bytesIterated *uint64, rp ReaderProvider, v *virtualState, bufferPool *BufferPool,
) (Iterator, error) {
	if r.Properties.IndexType == twoLevelIndex {
		i := twoLevelIterPool.Get().(*twoLevelIterator)
		err := i.init(
			context.Background(),
			r, v, nil /* lower */, nil /* upper */, nil,
			false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
			nil /* stats */, rp, bufferPool,
		)
		if err != nil {
			return nil, err
		}
		i.setupForCompaction()
		return &twoLevelCompactionIterator{
			twoLevelIterator: i,
			bytesIterated:    bytesIterated,
		}, nil
	}
	i := singleLevelIterPool.Get().(*singleLevelIterator)
	err := i.init(
		context.Background(), r, v, nil /* lower */, nil /* upper */,
		nil, false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
		nil /* stats */, rp, bufferPool,
	)
	if err != nil {
		return nil, err
	}
	i.setupForCompaction()
	return &compactionIterator{
		singleLevelIterator: i,
		bytesIterated:       bytesIterated,
	}, nil
}

// NewRawRangeDelIter returns an internal iterator for the contents of the
// range-del block for the table. Returns nil if the table does not contain
// any range deletions.
//
// TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
// iterator. Add WithContext methods since the existing ones are public.
func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) {
	if r.rangeDelBH.Length == 0 {
		return nil, nil
	}
	h, err := r.readRangeDel(nil /* stats */)
	if err != nil {
		return nil, err
	}
	i := &fragmentBlockIter{elideSameSeqnum: true}
	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
		return nil, err
	}
	return i, nil
}
// NewRawRangeKeyIter returns an internal iterator for the contents of the
// range-key block for the table. Returns nil if the table does not contain any
// range keys.
//
// TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
// iterator. Add WithContext methods since the existing ones are public.
func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) {
	if r.rangeKeyBH.Length == 0 {
		return nil, nil
	}
	h, err := r.readRangeKey(nil /* stats */)
	if err != nil {
		return nil, err
	}
	i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter)
	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
		return nil, err
	}
	return i, nil
}

type rangeKeyFragmentBlockIter struct {
	fragmentBlockIter
}

func (i *rangeKeyFragmentBlockIter) Close() error {
	err := i.fragmentBlockIter.Close()
	i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse()
	rangeKeyFragmentBlockIterPool.Put(i)
	return err
}

func (r *Reader) readIndex(
	ctx context.Context, stats *base.InternalIteratorStats,
) (bufferHandle, error) {
	ctx = objiotracing.WithBlockType(ctx, objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.indexBH, nil, nil, stats, nil /* buffer pool */)
}

func (r *Reader) readFilter(
	ctx context.Context, stats *base.InternalIteratorStats,
) (bufferHandle, error) {
	ctx = objiotracing.WithBlockType(ctx, objiotracing.FilterBlock)
	return r.readBlock(ctx, r.filterBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */)
}

func (r *Reader) readRangeDel(stats *base.InternalIteratorStats) (bufferHandle, error) {
	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.rangeDelBH, r.rangeDelTransform, nil /* readHandle */, stats, nil /* buffer pool */)
}

func (r *Reader) readRangeKey(stats *base.InternalIteratorStats) (bufferHandle, error) {
	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.rangeKeyBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */)
}

func checkChecksum(
	checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum,
) error {
	expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:])
	var computedChecksum uint32
	switch checksumType {
	case ChecksumTypeCRC32c:
		computedChecksum = crc.New(b[:bh.Length+1]).Value()
	case ChecksumTypeXXHash64:
		computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1]))
	default:
		return errors.Errorf("unsupported checksum type: %d", checksumType)
	}

	if expectedChecksum != computedChecksum {
		return base.CorruptionErrorf(
			"pebble/table: invalid table %s (checksum mismatch at %d/%d)",
			errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length))
	}
	return nil
}
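
// exampleAppendBlockTrailer is an illustrative sketch, not part of the
// original file: it shows the physical layout that checkChecksum assumes. A
// block of bh.Length bytes is followed by a 1-byte type/compression indicator
// and a 4-byte little-endian checksum covering both the contents and the type
// byte, i.e. blockTrailerLen (5) trailer bytes in total. CRC32c is shown
// here; the xxhash64 case is analogous.
func exampleAppendBlockTrailer(blockContents []byte, typ byte) []byte {
	// Contents followed by the type byte; the checksum covers both.
	b := append(append([]byte(nil), blockContents...), typ)
	var trailer [4]byte
	binary.LittleEndian.PutUint32(trailer[:], crc.New(b).Value())
	return append(b, trailer[:]...)
}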
type cacheValueOrBuf struct {
	// buf.Valid() returns true if backed by a BufferPool.
	buf Buf
	// v is non-nil if backed by the block cache.
	v *cache.Value
}

func (b cacheValueOrBuf) get() []byte {
	if b.buf.Valid() {
		return b.buf.p.pool[b.buf.i].b
	}
	return b.v.Buf()
}

func (b cacheValueOrBuf) release() {
	if b.buf.Valid() {
		b.buf.Release()
	} else {
		cache.Free(b.v)
	}
}

func (b cacheValueOrBuf) truncate(n int) {
	if b.buf.Valid() {
		b.buf.p.pool[b.buf.i].b = b.buf.p.pool[b.buf.i].b[:n]
	} else {
		b.v.Truncate(n)
	}
}
func (r *Reader) readBlock(
	ctx context.Context,
	bh BlockHandle,
	transform blockTransform,
	readHandle objstorage.ReadHandle,
	stats *base.InternalIteratorStats,
	bufferPool *BufferPool,
) (handle bufferHandle, _ error) {
	if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil {
		// Cache hit.
		if readHandle != nil {
			readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+blockTrailerLen))
		}
		if stats != nil {
			stats.BlockBytes += bh.Length
			stats.BlockBytesInCache += bh.Length
		}
		// This block is already in the cache; return a handle to the existing
		// value in the cache.
		return bufferHandle{h: h}, nil
	}

	// Cache miss.

	if sema := r.opts.LoadBlockSema; sema != nil {
		if err := sema.Acquire(ctx, 1); err != nil {
			// An error here can only come from the context.
			return bufferHandle{}, err
		}
		defer sema.Release(1)
	}

	var compressed cacheValueOrBuf
	if bufferPool != nil {
		compressed = cacheValueOrBuf{
			buf: bufferPool.Alloc(int(bh.Length + blockTrailerLen)),
		}
	} else {
		compressed = cacheValueOrBuf{
			v: cache.Alloc(int(bh.Length + blockTrailerLen)),
		}
	}

	readStartTime := time.Now()
	var err error
	if readHandle != nil {
		err = readHandle.ReadAt(ctx, compressed.get(), int64(bh.Offset))
	} else {
		err = r.readable.ReadAt(ctx, compressed.get(), int64(bh.Offset))
	}
	readDuration := time.Since(readStartTime)
	// TODO(sumeer): should the threshold be configurable?
	const slowReadTracingThreshold = 5 * time.Millisecond
	// The invariants.Enabled path is for deterministic testing.
	if invariants.Enabled {
		readDuration = slowReadTracingThreshold
	}
	// Call IsTracingEnabled to avoid the allocations of boxing integers into an
	// interface{}, unless necessary.
	if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
		r.opts.LoggerAndTracer.Eventf(ctx, "reading %d bytes took %s",
			int(bh.Length+blockTrailerLen), readDuration.String())
	}
	if stats != nil {
		stats.BlockBytes += bh.Length
		stats.BlockReadDuration += readDuration
	}
	if err != nil {
		compressed.release()
		return bufferHandle{}, err
	}
	if err := checkChecksum(r.checksumType, compressed.get(), bh, r.fileNum.FileNum()); err != nil {
		compressed.release()
		return bufferHandle{}, err
	}

	typ := blockType(compressed.get()[bh.Length])
	compressed.truncate(int(bh.Length))

	var decompressed cacheValueOrBuf
	if typ == noCompressionBlockType {
		decompressed = compressed
	} else {
		// Decode the length of the decompressed value.
		decodedLen, prefixLen, err := decompressedLen(typ, compressed.get())
		if err != nil {
			compressed.release()
			return bufferHandle{}, err
		}

		if bufferPool != nil {
			decompressed = cacheValueOrBuf{buf: bufferPool.Alloc(decodedLen)}
		} else {
			decompressed = cacheValueOrBuf{v: cache.Alloc(decodedLen)}
		}
		if _, err := decompressInto(typ, compressed.get()[prefixLen:], decompressed.get()); err != nil {
			compressed.release()
			return bufferHandle{}, err
		}
		compressed.release()
	}

	if transform != nil {
		// Transforming blocks is very rare, so the extra copy of the
		// transformed data is not problematic.
		tmpTransformed, err := transform(decompressed.get())
		if err != nil {
			decompressed.release()
			return bufferHandle{}, err
		}

		var transformed cacheValueOrBuf
		if bufferPool != nil {
			transformed = cacheValueOrBuf{buf: bufferPool.Alloc(len(tmpTransformed))}
		} else {
			transformed = cacheValueOrBuf{v: cache.Alloc(len(tmpTransformed))}
		}
		copy(transformed.get(), tmpTransformed)
		decompressed.release()
		decompressed = transformed
	}

	if decompressed.buf.Valid() {
		return bufferHandle{b: decompressed.buf}, nil
	}
	h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, decompressed.v)
	return bufferHandle{h: h}, nil
}

func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) {
	// Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The
	// v1 format range-del blocks have unfragmented and unsorted range
	// tombstones. We need properly fragmented and sorted range tombstones in
	// order to serve from them directly.
	iter := &blockIter{}
	if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil {
		return nil, err
	}
	var tombstones []keyspan.Span
	for key, value := iter.First(); key != nil; key, value = iter.Next() {
		t := keyspan.Span{
			Start: key.UserKey,
			End:   value.InPlaceValue(),
			Keys:  []keyspan.Key{{Trailer: key.Trailer}},
		}
		tombstones = append(tombstones, t)
	}
	keyspan.Sort(r.Compare, tombstones)

	// Fragment the tombstones, outputting them directly to a block writer.
	rangeDelBlock := blockWriter{
		restartInterval: 1,
	}
	frag := keyspan.Fragmenter{
		Cmp:    r.Compare,
		Format: r.FormatKey,
		Emit: func(s keyspan.Span) {
			for _, k := range s.Keys {
				startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer}
				rangeDelBlock.add(startIK, s.End)
			}
		},
	}
	for i := range tombstones {
		frag.Add(tombstones[i])
	}
	frag.Finish()

	// Return the contents of the constructed v2 format range-del block.
	return rangeDelBlock.finish(), nil
}
func (r *Reader) readMetaindex(metaindexBH BlockHandle) error {
	// We use a BufferPool when reading metaindex blocks in order to avoid
	// populating the block cache with these blocks. In heavy-write workloads,
	// especially with high compaction concurrency, new tables may be created
	// frequently. Populating the block cache with these metaindex blocks adds
	// additional contention on the block cache mutexes (see #1997).
	// Additionally, these blocks are exceedingly unlikely to be read again
	// while they're still in the block cache except in misconfigurations with
	// excessive sstable counts or a table cache that's far too small.
	r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0])
	// When we're finished, release the buffers we've allocated back to the
	// memory allocator. We don't expect to use metaBufferPool again.
	defer r.metaBufferPool.Release()

	b, err := r.readBlock(
		context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, &r.metaBufferPool)
	if err != nil {
		return err
	}
	data := b.Get()
	defer b.Release()

	if uint64(len(data)) != metaindexBH.Length {
		return base.CorruptionErrorf("pebble/table: unexpected metaindex block size: %d vs %d",
			errors.Safe(len(data)), errors.Safe(metaindexBH.Length))
	}

	i, err := newRawBlockIter(bytes.Compare, data)
	if err != nil {
		return err
	}

	meta := map[string]BlockHandle{}
	for valid := i.First(); valid; valid = i.Next() {
		value := i.Value()
		if bytes.Equal(i.Key().UserKey, []byte(metaValueIndexName)) {
			vbih, n, err := decodeValueBlocksIndexHandle(i.Value())
			if err != nil {
				return err
			}
			if n == 0 || n != len(value) {
				return base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)")
			}
			r.valueBIH = vbih
		} else {
			bh, n := decodeBlockHandle(value)
			if n == 0 || n != len(value) {
				return base.CorruptionErrorf("pebble/table: invalid table (bad block handle)")
			}
			meta[string(i.Key().UserKey)] = bh
		}
	}
	if err := i.Close(); err != nil {
		return err
	}

	if bh, ok := meta[metaPropertiesName]; ok {
		b, err = r.readBlock(
			context.Background(), bh, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
		if err != nil {
			return err
		}
		r.propertiesBH = bh
		err := r.Properties.load(b.Get(), bh.Offset, r.opts.DeniedUserProperties)
		b.Release()
		if err != nil {
			return err
		}
	}

	if bh, ok := meta[metaRangeDelV2Name]; ok {
		r.rangeDelBH = bh
	} else if bh, ok := meta[metaRangeDelName]; ok {
		r.rangeDelBH = bh
		if !r.rawTombstones {
			r.rangeDelTransform = r.transformRangeDelV1
		}
	}

	if bh, ok := meta[metaRangeKeyName]; ok {
		r.rangeKeyBH = bh
	}

	for name, fp := range r.opts.Filters {
		types := []struct {
			ftype  FilterType
			prefix string
		}{
			{TableFilter, "fullfilter."},
		}
		var done bool
		for _, t := range types {
			if bh, ok := meta[t.prefix+name]; ok {
				r.filterBH = bh

				switch t.ftype {
				case TableFilter:
					r.tableFilter = newTableFilterReader(fp)
				default:
					return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype))
				}

				done = true
				break
			}
		}
		if done {
			break
		}
	}
	return nil
}
// Layout returns the layout (block organization) for an sstable.
func (r *Reader) Layout() (*Layout, error) {
	if r.err != nil {
		return nil, r.err
	}

	l := &Layout{
		Data:       make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks),
		Filter:     r.filterBH,
		RangeDel:   r.rangeDelBH,
		RangeKey:   r.rangeKeyBH,
		ValueIndex: r.valueBIH.h,
		Properties: r.propertiesBH,
		MetaIndex:  r.metaIndexBH,
		Footer:     r.footerBH,
		Format:     r.tableFormat,
	}

	indexH, err := r.readIndex(context.Background(), nil)
	if err != nil {
		return nil, err
	}
	defer indexH.Release()

	var alloc bytealloc.A

	if r.Properties.IndexPartitions == 0 {
		l.Index = append(l.Index, r.indexBH)
		iter, _ := newBlockIter(r.Compare, indexH.Get())
		for key, value := iter.First(); key != nil; key, value = iter.Next() {
			dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
			if err != nil {
				return nil, errCorruptIndexEntry
			}
			if len(dataBH.Props) > 0 {
				alloc, dataBH.Props = alloc.Copy(dataBH.Props)
			}
			l.Data = append(l.Data, dataBH)
		}
	} else {
		l.TopIndex = r.indexBH
		topIter, _ := newBlockIter(r.Compare, indexH.Get())
		iter := &blockIter{}
		for key, value := topIter.First(); key != nil; key, value = topIter.Next() {
			indexBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
			if err != nil {
				return nil, errCorruptIndexEntry
			}
			l.Index = append(l.Index, indexBH.BlockHandle)

			subIndex, err := r.readBlock(context.Background(), indexBH.BlockHandle,
				nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
			if err != nil {
				return nil, err
			}
			if err := iter.init(r.Compare, subIndex.Get(), 0, /* globalSeqNum */
				false /* hideObsoletePoints */); err != nil {
				return nil, err
			}
			for key, value := iter.First(); key != nil; key, value = iter.Next() {
				dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
				// Check the decode error before touching dataBH.Props,
				// mirroring the unpartitioned case above.
				if err != nil {
					return nil, errCorruptIndexEntry
				}
				if len(dataBH.Props) > 0 {
					alloc, dataBH.Props = alloc.Copy(dataBH.Props)
				}
				l.Data = append(l.Data, dataBH)
			}
			subIndex.Release()
			*iter = iter.resetForReuse()
		}
	}
	if r.valueBIH.h.Length != 0 {
		vbiH, err := r.readBlock(context.Background(), r.valueBIH.h, nil, nil, nil, nil /* buffer pool */)
		if err != nil {
			return nil, err
		}
		defer vbiH.Release()
		vbiBlock := vbiH.Get()
		indexEntryLen := int(r.valueBIH.blockNumByteLength + r.valueBIH.blockOffsetByteLength +
			r.valueBIH.blockLengthByteLength)
		i := 0
		for len(vbiBlock) != 0 {
			if len(vbiBlock) < indexEntryLen {
				return nil, errors.Errorf(
					"remaining value index block %d does not contain a full entry of length %d",
					len(vbiBlock), indexEntryLen)
			}
			n := int(r.valueBIH.blockNumByteLength)
			bn := int(littleEndianGet(vbiBlock, n))
			if bn != i {
				return nil, errors.Errorf("unexpected block num %d, expected %d",
					bn, i)
			}
			i++
			vbiBlock = vbiBlock[n:]
			n = int(r.valueBIH.blockOffsetByteLength)
			blockOffset := littleEndianGet(vbiBlock, n)
			vbiBlock = vbiBlock[n:]
			n = int(r.valueBIH.blockLengthByteLength)
			blockLen := littleEndianGet(vbiBlock, n)
			vbiBlock = vbiBlock[n:]
			l.ValueBlock = append(l.ValueBlock, BlockHandle{Offset: blockOffset, Length: blockLen})
		}
	}

	return l, nil
}
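
// exampleLayoutCounts is an illustrative sketch, not part of the original
// file: it uses Layout to summarize a table's block organization, e.g. for a
// debugging tool that wants to know how many data, index, and value blocks a
// table holds.
func exampleLayoutCounts(r *Reader) (dataBlocks, indexBlocks, valueBlocks int, err error) {
	l, err := r.Layout()
	if err != nil {
		return 0, 0, 0, err
	}
	return len(l.Data), len(l.Index), len(l.ValueBlock), nil
}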
// ValidateBlockChecksums validates the checksums for each block in the SSTable.
func (r *Reader) ValidateBlockChecksums() error {
	// Pre-compute the BlockHandles for the underlying file.
	l, err := r.Layout()
	if err != nil {
		return err
	}

	// Construct the set of blocks to check. Note that the footer is not checked
	// as it is not a block with a checksum.
	blocks := make([]BlockHandle, len(l.Data))
	for i := range l.Data {
		blocks[i] = l.Data[i].BlockHandle
	}
	blocks = append(blocks, l.Index...)
	blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex)

	// Sorting by offset ensures we are performing a sequential scan of the
	// file.
	sort.Slice(blocks, func(i, j int) bool {
		return blocks[i].Offset < blocks[j].Offset
	})

	// Check all blocks sequentially. Make use of read-ahead, given we are
	// scanning the entire file from start to end.
	rh := r.readable.NewReadHandle(context.TODO())
	defer rh.Close()

	for _, bh := range blocks {
		// Certain blocks may not be present, in which case we skip them.
		if bh.Length == 0 {
			continue
		}

		// Read the block, which validates the checksum.
		h, err := r.readBlock(context.Background(), bh, nil, rh, nil, nil /* buffer pool */)
		if err != nil {
			return err
		}
		h.Release()
	}

	return nil
}

// CommonProperties implements the CommonReader interface.
func (r *Reader) CommonProperties() *CommonProperties {
	return &r.Properties.CommonProperties
}
// EstimateDiskUsage returns the total size of data blocks overlapping the range
// `[start, end]`. Even if a data block partially overlaps, or we cannot
// determine overlap due to abbreviated index keys, the full data block size is
// included in the estimation.
//
// This function does not account for any metablock space usage. Assumes there
// is at least partial overlap, i.e., `[start, end]` falls neither completely
// before nor completely after the file's range.
//
// Only blocks containing point keys are considered. Range deletion and range
// key blocks are not considered.
//
// TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of
// data blocks overlapped and add that same fraction of the metadata blocks to the
// estimate.
func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) {
	if r.err != nil {
		return 0, r.err
	}

	indexH, err := r.readIndex(context.Background(), nil)
	if err != nil {
		return 0, err
	}
	defer indexH.Release()

	// Iterators over the bottom-level index blocks containing start and end.
	// These may be different in case of partitioned index but will both point
	// to the same blockIter over the single index in the unpartitioned case.
	var startIdxIter, endIdxIter *blockIter
	if r.Properties.IndexPartitions == 0 {
		iter, err := newBlockIter(r.Compare, indexH.Get())
		if err != nil {
			return 0, err
		}
		startIdxIter = iter
		endIdxIter = iter
	} else {
		topIter, err := newBlockIter(r.Compare, indexH.Get())
		if err != nil {
			return 0, err
		}

		key, val := topIter.SeekGE(start, base.SeekGEFlagsNone)
		if key == nil {
			// The range falls completely after this file, or an error occurred.
			return 0, topIter.Error()
		}
		startIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
		if err != nil {
			return 0, errCorruptIndexEntry
		}
		startIdxBlock, err := r.readBlock(context.Background(), startIdxBH.BlockHandle,
			nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
		if err != nil {
			return 0, err
		}
		defer startIdxBlock.Release()
		startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get())
		if err != nil {
			return 0, err
		}

		key, val = topIter.SeekGE(end, base.SeekGEFlagsNone)
		if key == nil {
			if err := topIter.Error(); err != nil {
				return 0, err
			}
		} else {
			endIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
			if err != nil {
				return 0, errCorruptIndexEntry
			}
			endIdxBlock, err := r.readBlock(context.Background(),
				endIdxBH.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
			if err != nil {
				return 0, err
			}
			defer endIdxBlock.Release()
			endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get())
			if err != nil {
				return 0, err
			}
		}
	}
	// startIdxIter should not be nil at this point, while endIdxIter can be if the
	// range spans past the end of the file.

	key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone)
	if key == nil {
		// The range falls completely after this file, or an error occurred.
		return 0, startIdxIter.Error()
	}
	startBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
	if err != nil {
		return 0, errCorruptIndexEntry
	}

	includeInterpolatedValueBlocksSize := func(dataBlockSize uint64) uint64 {
		// INVARIANT: r.Properties.DataSize > 0 since startIdxIter is not nil.
		// Linearly interpolate what is stored in value blocks.
		//
		// TODO(sumeer): if we need more accuracy, without loading any data blocks
		// (which contain the value handles, and which may also be insufficient if
		// the values are in separate files), we will need to accumulate the
		// logical size of the key-value pairs and store the cumulative value for
		// each data block in the index block entry. This increases the size of
		// the BlockHandle, so wait until this becomes necessary.
		return dataBlockSize +
			uint64((float64(dataBlockSize)/float64(r.Properties.DataSize))*
				float64(r.Properties.ValueBlocksSize))
	}
	if endIdxIter == nil {
		// The range spans beyond this file. Include data blocks through the last.
		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
	}
	key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone)
	if key == nil {
		if err := endIdxIter.Error(); err != nil {
			return 0, err
		}
		// The range spans beyond this file. Include data blocks through the last.
		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
	}
	endBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
	if err != nil {
		return 0, errCorruptIndexEntry
	}
	return includeInterpolatedValueBlocksSize(
		endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset), nil
}
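
// exampleInterpolatedEstimate is an illustrative sketch, not part of the
// original file: it restates the interpolation in
// includeInterpolatedValueBlocksSize with concrete numbers. With DataSize =
// 100 MB, ValueBlocksSize = 20 MB, and 10 MB of overlapping data blocks, the
// estimate is 10 + (10/100)*20 = 12 MB: the overlapped fraction of the data
// is charged the same fraction of the value blocks.
func exampleInterpolatedEstimate() uint64 {
	const dataSize, valueBlocksSize, overlap = 100 << 20, 20 << 20, 10 << 20
	return overlap + uint64((float64(overlap)/float64(dataSize))*float64(valueBlocksSize))
}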
// TableFormat returns the format version for the table.
func (r *Reader) TableFormat() (TableFormat, error) {
	if r.err != nil {
		return TableFormatUnspecified, r.err
	}
	return r.tableFormat, nil
}

// NewReader returns a new table reader for the file. Closing the reader will
// close the file.
func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) {
	o = o.ensureDefaults()
	r := &Reader{
		readable: f,
		opts:     o,
	}
	if r.opts.Cache == nil {
		r.opts.Cache = cache.New(0)
	} else {
		r.opts.Cache.Ref()
	}

	if f == nil {
		r.err = errors.New("pebble/table: nil file")
		return nil, r.Close()
	}

	// Note that the extra options are applied twice. First here for pre-apply
	// options, and then below for post-apply options. Pre and post refer to
	// before and after reading the metaindex and properties.
	type preApply interface{ preApply() }
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); ok {
			opt.readerApply(r)
		}
	}
	if r.cacheID == 0 {
		r.cacheID = r.opts.Cache.NewID()
	}

	footer, err := readFooter(f)
	if err != nil {
		r.err = err
		return nil, r.Close()
	}
	r.checksumType = footer.checksum
	r.tableFormat = footer.format
	// Read the metaindex.
	if err := r.readMetaindex(footer.metaindexBH); err != nil {
		r.err = err
		return nil, r.Close()
	}
	r.indexBH = footer.indexBH
	r.metaIndexBH = footer.metaindexBH
	r.footerBH = footer.footerBH

	if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName {
		r.Compare = o.Comparer.Compare
		r.FormatKey = o.Comparer.FormatKey
		r.Split = o.Comparer.Split
	}

	if o.MergerName == r.Properties.MergerName {
		r.mergerOK = true
	}

	// Apply the extra options again now that the comparer and merger names are
	// known.
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); !ok {
			opt.readerApply(r)
		}
	}

	if r.Compare == nil {
		r.err = errors.Errorf("pebble/table: %d: unknown comparer %s",
			errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName))
	}
	if !r.mergerOK {
		if name := r.Properties.MergerName; name != "" && name != "nullptr" {
			r.err = errors.Errorf("pebble/table: %d: unknown merger %s",
				errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName))
		}
	}
	if r.err != nil {
		return nil, r.Close()
	}

	return r, nil
}

// ReadableFile describes the smallest subset of vfs.File that is required for
// reading SSTs.
type ReadableFile interface {
	io.ReaderAt
	io.Closer
	Stat() (os.FileInfo, error)
}

// NewSimpleReadable wraps a ReadableFile in an objstorage.Readable
// implementation (which does not support read-ahead).
func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error) {
	info, err := r.Stat()
	if err != nil {
		return nil, err
	}
	res := &simpleReadable{
		f:    r,
		size: info.Size(),
	}
	res.rh = objstorage.MakeNoopReadHandle(res)
	return res, nil
}
// simpleReadable wraps a ReadableFile to implement objstorage.Readable.
type simpleReadable struct {
	f    ReadableFile
	size int64
	rh   objstorage.NoopReadHandle
}

var _ objstorage.Readable = (*simpleReadable)(nil)

// ReadAt is part of the objstorage.Readable interface.
func (s *simpleReadable) ReadAt(_ context.Context, p []byte, off int64) error {
	n, err := s.f.ReadAt(p, off)
	if invariants.Enabled && err == nil && n != len(p) {
		panic("short read")
	}
	return err
}

// Close is part of the objstorage.Readable interface.
func (s *simpleReadable) Close() error {
	return s.f.Close()
}

// Size is part of the objstorage.Readable interface.
func (s *simpleReadable) Size() int64 {
	return s.size
}

// NewReadHandle is part of the objstorage.Readable interface.
func (s *simpleReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle {
	return &s.rh
}
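
// exampleReadTable is an illustrative sketch, not part of the original file:
// the typical way to read a standalone sstable from the filesystem is to wrap
// an os.File in NewSimpleReadable, open it with NewReader, and scan it with
// NewIter. A zero ReaderOptions is assumed here, which implies the default
// comparer; a table written with a different comparer would need matching
// options (or the Comparers option above).
func exampleReadTable(path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	readable, err := NewSimpleReadable(f)
	if err != nil {
		f.Close()
		return err
	}
	r, err := NewReader(readable, ReaderOptions{})
	if err != nil {
		// NewReader closes the readable (and thus the file) on error.
		return err
	}
	defer r.Close()
	iter, err := r.NewIter(nil /* lower */, nil /* upper */)
	if err != nil {
		return err
	}
	defer iter.Close()
	for key, _ := iter.First(); key != nil; key, _ = iter.Next() {
		// Process key here.
	}
	return iter.Error()
}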