github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/reader.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"sort"
	"sync"
	"unsafe"

	"github.com/cespare/xxhash/v2"
	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/cache"
	"github.com/zuoyebang/bitalostable/internal/crc"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/private"
	"github.com/zuoyebang/bitalostable/vfs"
)

var errCorruptIndexEntry = base.CorruptionErrorf("bitalostable/table: corrupt index entry")
var errReaderClosed = errors.New("bitalostable/table: reader is closed")

const (
	// Constants for dynamic readahead of data blocks. Note that the size values
	// make sense as some multiple of the default block size; and they should
	// both be larger than the default block size.
	minFileReadsForReadahead = 2
	// TODO(bilal): Have the initial size value be a factor of the block size,
	// as opposed to a hardcoded value.
	initialReadaheadSize = 64 << 10  /* 64KB */
	maxReadaheadSize     = 256 << 10 /* 256KB */
)

// decodeBlockHandle returns the block handle encoded at the start of src, as
// well as the number of bytes it occupies. It returns zero if given invalid
// input. A block handle for a data block or a first/lower level index block
// should not be decoded using decodeBlockHandle since the caller may validate
// that the number of bytes decoded is equal to the length of src, which will
// be false if the properties are not decoded. In those cases the caller
// should use decodeBlockHandleWithProperties.
func decodeBlockHandle(src []byte) (BlockHandle, int) {
	offset, n := binary.Uvarint(src)
	length, m := binary.Uvarint(src[n:])
	if n == 0 || m == 0 {
		return BlockHandle{}, 0
	}
	return BlockHandle{offset, length}, n + m
}

// decodeBlockHandleWithProperties returns the block handle and properties
// encoded in src. src needs to be exactly the length that was encoded. This
// method must be used for data block and first/lower level index blocks. The
// properties in the block handle point to the bytes in src.
func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) {
	bh, n := decodeBlockHandle(src)
	if n == 0 {
		return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle")
	}
	return BlockHandleWithProperties{
		BlockHandle: bh,
		Props:       src[n:],
	}, nil
}

func encodeBlockHandle(dst []byte, b BlockHandle) int {
	n := binary.PutUvarint(dst, b.Offset)
	m := binary.PutUvarint(dst[n:], b.Length)
	return n + m
}

func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte {
	n := encodeBlockHandle(dst, b.BlockHandle)
	dst = append(dst[:n], b.Props...)
	return dst
}

// block is a []byte that holds a sequence of key/value pairs plus an index
// over those pairs.
type block []byte

// Iterator iterates over an entire table of data.
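//
// An illustrative usage sketch (hedged; a concrete implementation is normally
// obtained via Reader.NewIter, and error handling is elided):
//
//	var it Iterator // e.g. returned by Reader.NewIter
//	for k, v := it.SeekGE(searchKey, base.SeekGEFlagsNone); k != nil; k, v = it.Next() {
//		_ = v // process the entry
//	}
//	if it.MaybeFilteredKeys() {
//		// The last positioning op may have skipped keys due to
//		// block-property filters; see MaybeFilteredKeys below.
//	}
//	_ = it.Close()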
90 type Iterator interface { 91 base.InternalIterator 92 93 // MaybeFilteredKeys may be called when an iterator is exhausted to indicate 94 // whether or not the last positioning method may have skipped any keys due 95 // to block-property filters. This is used by the Pebble levelIter to 96 // control when an iterator steps to the next sstable. 97 // 98 // MaybeFilteredKeys may always return false positives, that is it may 99 // return true when no keys were filtered. It should only be called when the 100 // iterator is exhausted. It must never return false negatives when the 101 // iterator is exhausted. 102 MaybeFilteredKeys() bool 103 104 SetCloseHook(fn func(i Iterator) error) 105 } 106 107 // singleLevelIterator iterates over an entire table of data. To seek for a given 108 // key, it first looks in the index for the block that contains that key, and then 109 // looks inside that block. 110 type singleLevelIterator struct { 111 cmp Compare 112 // Global lower/upper bound for the iterator. 113 lower []byte 114 upper []byte 115 bpfs *BlockPropertiesFilterer 116 // Per-block lower/upper bound. Nil if the bound does not apply to the block 117 // because we determined the block lies completely within the bound. 118 blockLower []byte 119 blockUpper []byte 120 reader *Reader 121 index blockIter 122 data blockIter 123 dataRS readaheadState 124 // dataBH refers to the last data block that the iterator considered 125 // loading. It may not actually have loaded the block, due to an error or 126 // because it was considered irrelevant. 127 dataBH BlockHandle 128 err error 129 closeHook func(i Iterator) error 130 stats *base.InternalIteratorStats 131 132 // boundsCmp and positionedUsingLatestBounds are for optimizing iteration 133 // that uses multiple adjacent bounds. The seek after setting a new bound 134 // can use the fact that the iterator is either within the previous bounds 135 // or exactly one key before or after the bounds. If the new bounds is 136 // after/before the previous bounds, and we are already positioned at a 137 // block that is relevant for the new bounds, we can try to first position 138 // using Next/Prev (repeatedly) instead of doing a more expensive seek. 139 // 140 // When there are wide files at higher levels that match the bounds 141 // but don't have any data for the bound, we will already be 142 // positioned at the key beyond the bounds and won't need to do much 143 // work -- given that most data is in L6, such files are likely to 144 // dominate the performance of the mergingIter, and may be the main 145 // benefit of this performance optimization (of course it also helps 146 // when the file that has the data has successive seeks that stay in 147 // the same block). 148 // 149 // Specifically, boundsCmp captures the relationship between the previous 150 // and current bounds, if the iterator had been positioned after setting 151 // the previous bounds. If it was not positioned, i.e., Seek/First/Last 152 // were not called, we don't know where it is positioned and cannot 153 // optimize. 154 // 155 // Example: Bounds moving forward, and iterator exhausted in forward direction. 156 // bounds = [f, h), ^ shows block iterator position 157 // file contents [ a b c d e f g h i j k ] 158 // ^ 159 // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is 160 // set to +1. 
SeekGE(j) can use next (the optimization also requires that j 161 // is within the block, but that is not for correctness, but to limit the 162 // optimization to when it will actually be an optimization). 163 // 164 // Example: Bounds moving forward. 165 // bounds = [f, h), ^ shows block iterator position 166 // file contents [ a b c d e f g h i j k ] 167 // ^ 168 // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is 169 // set to +1. SeekGE(j) can use next. 170 // 171 // Example: Bounds moving forward, but iterator not positioned using previous 172 // bounds. 173 // bounds = [f, h), ^ shows block iterator position 174 // file contents [ a b c d e f g h i j k ] 175 // ^ 176 // new bounds = [i, j). Iterator is at j since it was never positioned using 177 // [f, h). So positionedUsingLatestBounds=false, and boundsCmp is set to 0. 178 // SeekGE(i) will not use next. 179 // 180 // Example: Bounds moving forward and sparse file 181 // bounds = [f, h), ^ shows block iterator position 182 // file contents [ a z ] 183 // ^ 184 // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is 185 // set to +1. SeekGE(j) notices that the iterator is already past j and does 186 // not need to do anything. 187 // 188 // Similar examples can be constructed for backward iteration. 189 // 190 // This notion of exactly one key before or after the bounds is not quite 191 // true when block properties are used to ignore blocks. In that case we 192 // can't stop precisely at the first block that is past the bounds since 193 // we are using the index entries to enforce the bounds. 194 // 195 // e.g. 3 blocks with keys [b, c] [f, g], [i, j, k] with index entries d, 196 // h, l. And let the lower bound be k, and we are reverse iterating. If 197 // the block [i, j, k] is ignored due to the block interval annotations we 198 // do need to move the index to block [f, g] since the index entry for the 199 // [i, j, k] block is l which is not less than the lower bound of k. So we 200 // have passed the entries i, j. 201 // 202 // This behavior is harmless since the block property filters are fixed 203 // for the lifetime of the iterator so i, j are irrelevant. In addition, 204 // the current code will not load the [f, g] block, so the seek 205 // optimization that attempts to use Next/Prev do not apply anyway. 206 boundsCmp int 207 positionedUsingLatestBounds bool 208 209 // exhaustedBounds represents whether the iterator is exhausted for 210 // iteration by reaching the upper or lower bound. +1 when exhausted 211 // the upper bound, -1 when exhausted the lower bound, and 0 when 212 // neither. It is used for invariant checking. 213 exhaustedBounds int8 214 215 // maybeFilteredKeysSingleLevel indicates whether the last iterator 216 // positioning operation may have skipped any data blocks due to 217 // block-property filters when positioning the index. 218 maybeFilteredKeysSingleLevel bool 219 220 // useFilter specifies whether the filter block in this sstable, if present, 221 // should be used for prefix seeks or not. In some cases it is beneficial 222 // to skip a filter block even if it exists (eg. if probability of a match 223 // is high). 224 useFilter bool 225 lastBloomFilterMatched bool 226 } 227 228 // singleLevelIterator implements the base.InternalIterator interface. 
var _ base.InternalIterator = (*singleLevelIterator)(nil)

var singleLevelIterPool = sync.Pool{
	New: func() interface{} {
		i := &singleLevelIterator{}
		// Note: this is a no-op if invariants are disabled or race is enabled.
		invariants.SetFinalizer(i, checkSingleLevelIterator)
		return i
	},
}

var twoLevelIterPool = sync.Pool{
	New: func() interface{} {
		i := &twoLevelIterator{}
		// Note: this is a no-op if invariants are disabled or race is enabled.
		invariants.SetFinalizer(i, checkTwoLevelIterator)
		return i
	},
}

// TODO(jackson): rangedel fragmentBlockIters can't be pooled because of some
// code paths that double Close the iters. Fix the double close and pool the
// *fragmentBlockIter type directly.

var rangeKeyFragmentBlockIterPool = sync.Pool{
	New: func() interface{} {
		i := &rangeKeyFragmentBlockIter{}
		// Note: this is a no-op if invariants are disabled or race is enabled.
		invariants.SetFinalizer(i, checkRangeKeyFragmentBlockIterator)
		return i
	},
}

func checkSingleLevelIterator(obj interface{}) {
	i := obj.(*singleLevelIterator)
	if p := i.data.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "singleLevelIterator.data.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
	if p := i.index.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "singleLevelIterator.index.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
}

func checkTwoLevelIterator(obj interface{}) {
	i := obj.(*twoLevelIterator)
	if p := i.data.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "twoLevelIterator.data.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
	if p := i.index.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "twoLevelIterator.index.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
}

func checkRangeKeyFragmentBlockIterator(obj interface{}) {
	i := obj.(*rangeKeyFragmentBlockIter)
	if p := i.blockIter.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "fragmentBlockIter.blockIter.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
}

// init initializes a singleLevelIterator for reading from the table. It is
// synonymous with Reader.NewIter, but allows for reuse of the iterator
// between different Readers.
func (i *singleLevelIterator) init(
	r *Reader,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	useFilter bool,
	stats *base.InternalIteratorStats,
) error {
	if r.err != nil {
		return r.err
	}
	indexH, err := r.readIndex()
	if err != nil {
		return err
	}

	i.lower = lower
	i.upper = upper
	i.bpfs = filterer
	i.useFilter = useFilter
	i.reader = r
	i.cmp = r.Compare
	i.stats = stats
	err = i.index.initHandle(i.cmp, indexH, r.Properties.GlobalSeqNum)
	if err != nil {
		// blockIter.Close releases indexH and always returns a nil error
		_ = i.index.Close()
		return err
	}
	i.dataRS.size = initialReadaheadSize
	return nil
}

// setupForCompaction sets up the singleLevelIterator for use with compactionIter.
// Currently, it skips readahead ramp-up. It should be called after init is called.
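//
// A hedged, illustrative call-order sketch (construction is normally driven by
// the Reader rather than written out like this):
//
//	it := singleLevelIterPool.Get().(*singleLevelIterator)
//	if err := it.init(r, nil /* lower */, nil /* upper */, nil /* filterer */, false /* useFilter */, nil /* stats */); err != nil {
//		return err
//	}
//	it.setupForCompaction() // enables sequential-read mode; call only after a successful init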
331 func (i *singleLevelIterator) setupForCompaction() { 332 if i.reader.fs != nil { 333 f, err := i.reader.fs.Open(i.reader.filename, vfs.SequentialReadsOption) 334 if err == nil { 335 // Given that this iterator is for a compaction, we can assume that it 336 // will be read sequentially and we can skip the readahead ramp-up. 337 i.dataRS.sequentialFile = f 338 } 339 } 340 } 341 342 func (i *singleLevelIterator) resetForReuse() singleLevelIterator { 343 return singleLevelIterator{ 344 index: i.index.resetForReuse(), 345 data: i.data.resetForReuse(), 346 } 347 } 348 349 func (i *singleLevelIterator) initBounds() { 350 // Trim the iteration bounds for the current block. We don't have to check 351 // the bounds on each iteration if the block is entirely contained within the 352 // iteration bounds. 353 i.blockLower = i.lower 354 if i.blockLower != nil { 355 key, _ := i.data.First() 356 if key != nil && i.cmp(i.blockLower, key.UserKey) < 0 { 357 // The lower-bound is less than the first key in the block. No need 358 // to check the lower-bound again for this block. 359 i.blockLower = nil 360 } 361 } 362 i.blockUpper = i.upper 363 if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 { 364 // The upper-bound is greater than the index key which itself is greater 365 // than or equal to every key in the block. No need to check the 366 // upper-bound again for this block. 367 i.blockUpper = nil 368 } 369 } 370 371 type loadBlockResult int8 372 373 const ( 374 loadBlockOK loadBlockResult = iota 375 // Could be due to error or because no block left to load. 376 loadBlockFailed 377 loadBlockIrrelevant 378 ) 379 380 // loadBlock loads the block at the current index position and leaves i.data 381 // unpositioned. If unsuccessful, it sets i.err to any error encountered, which 382 // may be nil if we have simply exhausted the entire table. 383 func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult { 384 if !i.index.valid() { 385 // Ensure the data block iterator is invalidated even if loading of the block 386 // fails. 387 i.data.invalidate() 388 return loadBlockFailed 389 } 390 // Load the next block. 391 v := i.index.Value() 392 bhp, err := decodeBlockHandleWithProperties(v) 393 if i.dataBH == bhp.BlockHandle && i.data.valid() { 394 // We're already at the data block we want to load. Reset bounds in case 395 // they changed since the last seek, but don't reload the block from cache 396 // or disk. 397 // 398 // It's safe to leave i.data in its original state here, as all callers to 399 // loadBlock make an absolute positioning call (i.e. a seek, first, or last) 400 // to `i.data` right after loadBlock returns loadBlockOK. 401 i.initBounds() 402 return loadBlockOK 403 } 404 // Ensure the data block iterator is invalidated even if loading of the block 405 // fails. 
	i.data.invalidate()
	i.dataBH = bhp.BlockHandle
	if err != nil {
		i.err = errCorruptIndexEntry
		return loadBlockFailed
	}
	if i.bpfs != nil {
		intersects, err := i.bpfs.intersects(bhp.Props)
		if err != nil {
			i.err = errCorruptIndexEntry
			return loadBlockFailed
		}
		if intersects == blockMaybeExcluded {
			intersects = i.resolveMaybeExcluded(dir)
		}
		if intersects == blockExcluded {
			i.maybeFilteredKeysSingleLevel = true
			return loadBlockIrrelevant
		}
		// blockIntersects
	}
	block, err := i.readBlockWithStats(i.dataBH, &i.dataRS)
	if err != nil {
		i.err = err
		return loadBlockFailed
	}
	i.err = i.data.initHandle(i.cmp, block, i.reader.Properties.GlobalSeqNum)
	if i.err != nil {
		// The block is partially loaded, and we don't want it to appear valid.
		i.data.invalidate()
		return loadBlockFailed
	}
	i.initBounds()
	return loadBlockOK
}

// resolveMaybeExcluded is invoked when the block-property filterer has found
// that a block is excluded according to its properties but only if its bounds
// fall within the filter's current bounds. This function consults the
// appropriate bound, depending on the iteration direction, and returns either
// `blockIntersects` or `blockExcluded`.
func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
	// TODO(jackson): We could first try comparing to top-level index block's
	// key, and if within bounds avoid per-data block key comparisons.

	// This iterator is configured with a bound-limited block property
	// filter. The bpf determined this block could be excluded from
	// iteration based on the property encoded in the block handle.
	// However, we still need to determine if the block is wholly
	// contained within the filter's key bounds.
	//
	// External guarantees ensure all the block's keys are ≥ the
	// filter's lower bound during forward iteration, and that all the
	// block's keys are < the filter's upper bound during backward
	// iteration. We only need to determine if the opposite bound is
	// also met.
	//
	// The index separator in index.Key() provides an inclusive
	// upper-bound for the data block's keys, guaranteeing that all its
	// keys are ≤ index.Key(). For forward iteration, this is all we
	// need.
	if dir > 0 {
		// Forward iteration.
		if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.index.Key()) {
			return blockExcluded
		}
		return blockIntersects
	}

	// Reverse iteration.
	//
	// Because we're iterating in the reverse direction, we don't yet have
	// enough context available to determine if the block is wholly contained
	// within its bounds. This case arises only during backward iteration,
	// because of the way the index is structured.
	//
	// Consider a bound-limited bpf limited to the bounds [b,d), loading the
	// block with separator `c`. During reverse iteration, the guarantee that
	// all the block's keys are < `d` is externally provided, but no guarantee
	// is made on the bpf's lower bound. The separator `c` only provides an
	// inclusive upper bound on the block's keys, indicating that the
	// corresponding block handle points to a block containing only keys ≤ `c`.
488 // 489 // To establish a lower bound, we step the index backwards to read the 490 // previous block's separator, which provides an inclusive lower bound on 491 // the original block's keys. Afterwards, we step forward to restore our 492 // index position. 493 if peekKey, _ := i.index.Prev(); peekKey == nil { 494 // The original block points to the first block of this index block. If 495 // there's a two-level index, it could potentially provide a lower 496 // bound, but the code refactoring necessary to read it doesn't seem 497 // worth the payoff. We fall through to loading the block. 498 } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey) { 499 // The lower-bound on the original block falls within the filter's 500 // bounds, and we can skip the block (after restoring our current index 501 // position). 502 _, _ = i.index.Next() 503 return blockExcluded 504 } 505 _, _ = i.index.Next() 506 return blockIntersects 507 } 508 509 func (i *singleLevelIterator) readBlockWithStats( 510 bh BlockHandle, raState *readaheadState, 511 ) (cache.Handle, error) { 512 block, cacheHit, err := i.reader.readBlock(bh, nil /* transform */, raState) 513 if err == nil && i.stats != nil { 514 n := bh.Length 515 i.stats.BlockBytes += n 516 if cacheHit { 517 i.stats.BlockBytesInCache += n 518 } 519 } 520 return block, err 521 } 522 523 func (i *singleLevelIterator) initBoundsForAlreadyLoadedBlock() { 524 if i.data.firstKey.UserKey == nil { 525 panic("initBoundsForAlreadyLoadedBlock must not be called on empty or corrupted block") 526 } 527 i.blockLower = i.lower 528 if i.blockLower != nil { 529 if i.data.firstKey.UserKey != nil && i.cmp(i.blockLower, i.data.firstKey.UserKey) < 0 { 530 // The lower-bound is less than the first key in the block. No need 531 // to check the lower-bound again for this block. 532 i.blockLower = nil 533 } 534 } 535 i.blockUpper = i.upper 536 if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 { 537 // The upper-bound is greater than the index key which itself is greater 538 // than or equal to every key in the block. No need to check the 539 // upper-bound again for this block. 540 i.blockUpper = nil 541 } 542 } 543 544 // The number of times to call Next/Prev in a block before giving up and seeking. 545 // The value of 4 is arbitrary. 546 // TODO(sumeer): experiment with dynamic adjustment based on the history of 547 // seeks for a particular iterator. 
548 const numStepsBeforeSeek = 4 549 550 func (i *singleLevelIterator) trySeekGEUsingNextWithinBlock( 551 key []byte, 552 ) (k *InternalKey, v []byte, done bool) { 553 k, v = i.data.Key(), i.data.Value() 554 for j := 0; j < numStepsBeforeSeek; j++ { 555 curKeyCmp := i.cmp(k.UserKey, key) 556 if curKeyCmp >= 0 { 557 if i.blockUpper != nil && i.cmp(k.UserKey, i.blockUpper) >= 0 { 558 i.exhaustedBounds = +1 559 return nil, nil, true 560 } 561 return k, v, true 562 } 563 k, v = i.data.Next() 564 if k == nil { 565 break 566 } 567 } 568 return k, v, false 569 } 570 571 func (i *singleLevelIterator) trySeekLTUsingPrevWithinBlock( 572 key []byte, 573 ) (k *InternalKey, v []byte, done bool) { 574 k, v = i.data.Key(), i.data.Value() 575 for j := 0; j < numStepsBeforeSeek; j++ { 576 curKeyCmp := i.cmp(k.UserKey, key) 577 if curKeyCmp < 0 { 578 if i.blockLower != nil && i.cmp(k.UserKey, i.blockLower) < 0 { 579 i.exhaustedBounds = -1 580 return nil, nil, true 581 } 582 return k, v, true 583 } 584 k, v = i.data.Prev() 585 if k == nil { 586 break 587 } 588 } 589 return k, v, false 590 } 591 592 func (i *singleLevelIterator) recordOffset() uint64 { 593 offset := i.dataBH.Offset 594 if i.data.valid() { 595 // - i.dataBH.Length/len(i.data.data) is the compression ratio. If 596 // uncompressed, this is 1. 597 // - i.data.nextOffset is the uncompressed position of the current record 598 // in the block. 599 // - i.dataBH.Offset is the offset of the block in the sstable before 600 // decompression. 601 offset += (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data)) 602 } else { 603 // Last entry in the block must increment bytes iterated by the size of the block trailer 604 // and restart points. 605 offset += i.dataBH.Length + blockTrailerLen 606 } 607 return offset 608 } 609 610 // SeekGE implements internalIterator.SeekGE, as documented in the bitalostable 611 // package. Note that SeekGE only checks the upper bound. It is up to the 612 // caller to ensure that key is greater than or equal to the lower bound. 613 func (i *singleLevelIterator) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) { 614 // The i.exhaustedBounds comparison indicates that the upper bound was 615 // reached. The i.data.isDataInvalidated() indicates that the sstable was 616 // exhausted. 617 if flags.TrySeekUsingNext() && (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) { 618 // Already exhausted, so return nil. 619 return nil, nil 620 } 621 622 i.exhaustedBounds = 0 623 i.err = nil 624 boundsCmp := i.boundsCmp 625 // Seek optimization only applies until iterator is first positioned after SetBounds. 626 i.boundsCmp = 0 627 i.positionedUsingLatestBounds = true 628 return i.seekGEHelper(key, boundsCmp, flags) 629 } 630 631 // seekGEHelper contains the common functionality for SeekGE and SeekPrefixGE. 632 func (i *singleLevelIterator) seekGEHelper( 633 key []byte, boundsCmp int, flags base.SeekGEFlags, 634 ) (*InternalKey, []byte) { 635 // Invariant: trySeekUsingNext => !i.data.isDataInvalidated() && i.exhaustedBounds != +1 636 637 // SeekGE performs various step-instead-of-seeking optimizations: eg enabled 638 // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp). 639 // Care must be taken to ensure that when performing these optimizations and 640 // the iterator becomes exhausted, i.maybeFilteredKeys is set appropriately. 641 // Consider a previous SeekGE that filtered keys from k until the current 642 // iterator position. 
643 // 644 // If the previous SeekGE exhausted the iterator, it's possible keys greater 645 // than or equal to the current search key were filtered. We must not reuse 646 // the current iterator position without remembering the previous value of 647 // maybeFilteredKeys. 648 649 var dontSeekWithinBlock bool 650 if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() && 651 boundsCmp > 0 && i.cmp(key, i.index.Key().UserKey) <= 0 { 652 // Fast-path: The bounds have moved forward and this SeekGE is 653 // respecting the lower bound (guaranteed by Iterator). We know that 654 // the iterator must already be positioned within or just outside the 655 // previous bounds. Therefore it cannot be positioned at a block (or 656 // the position within that block) that is ahead of the seek position. 657 // However it can be positioned at an earlier block. This fast-path to 658 // use Next() on the block is only applied when we are already at the 659 // block that the slow-path (the else-clause) would load -- this is 660 // the motivation for the i.cmp(key, i.index.Key().UserKey) <= 0 661 // predicate. 662 i.initBoundsForAlreadyLoadedBlock() 663 ikey, val, done := i.trySeekGEUsingNextWithinBlock(key) 664 if done { 665 return ikey, val 666 } 667 if ikey == nil { 668 // Done with this block. 669 dontSeekWithinBlock = true 670 } 671 } else { 672 // Cannot use bounds monotonicity. But may be able to optimize if 673 // caller claimed externally known invariant represented by 674 // flags.TrySeekUsingNext(). 675 if flags.TrySeekUsingNext() { 676 // seekPrefixGE or SeekGE has already ensured 677 // !i.data.isDataInvalidated() && i.exhaustedBounds != +1 678 currKey := i.data.Key() 679 value := i.data.Value() 680 less := i.cmp(currKey.UserKey, key) < 0 681 // We could be more sophisticated and confirm that the seek 682 // position is within the current block before applying this 683 // optimization. But there may be some benefit even if it is in 684 // the next block, since we can avoid seeking i.index. 685 for j := 0; less && j < numStepsBeforeSeek; j++ { 686 currKey, value = i.Next() 687 if currKey == nil { 688 return nil, nil 689 } 690 less = i.cmp(currKey.UserKey, key) < 0 691 } 692 if !less { 693 if i.blockUpper != nil && i.cmp(currKey.UserKey, i.blockUpper) >= 0 { 694 i.exhaustedBounds = +1 695 return nil, nil 696 } 697 return currKey, value 698 } 699 } 700 701 // Slow-path. 702 703 // Since we're re-seeking the iterator, the previous value of 704 // maybeFilteredKeysSingleLevel is irrelevant. If we filter out blocks 705 // during seeking, loadBlock will set it to true. 706 i.maybeFilteredKeysSingleLevel = false 707 708 var ikey *InternalKey 709 if ikey, _ = i.index.SeekGE(key, flags.DisableTrySeekUsingNext()); ikey == nil { 710 // The target key is greater than any key in the index block. 711 // Invalidate the block iterator so that a subsequent call to Prev() 712 // will return the last key in the table. 713 i.data.invalidate() 714 return nil, nil 715 } 716 result := i.loadBlock(+1) 717 if result == loadBlockFailed { 718 return nil, nil 719 } 720 if result == loadBlockIrrelevant { 721 // Enforce the upper bound here since don't want to bother moving 722 // to the next block if upper bound is already exceeded. Note that 723 // the next block starts with keys >= ikey.UserKey since even 724 // though this is the block separator, the same user key can span 725 // multiple blocks. Since upper is exclusive we use >= below. 
726 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 727 i.exhaustedBounds = +1 728 return nil, nil 729 } 730 // Want to skip to the next block. 731 dontSeekWithinBlock = true 732 } 733 } 734 if !dontSeekWithinBlock { 735 if ikey, val := i.data.SeekGE(key, flags.DisableTrySeekUsingNext()); ikey != nil { 736 if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 { 737 i.exhaustedBounds = +1 738 return nil, nil 739 } 740 return ikey, val 741 } 742 } 743 return i.skipForward() 744 } 745 746 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 747 // bitalostable package. Note that SeekPrefixGE only checks the upper bound. It is up 748 // to the caller to ensure that key is greater than or equal to the lower bound. 749 func (i *singleLevelIterator) SeekPrefixGE( 750 prefix, key []byte, flags base.SeekGEFlags, 751 ) (*base.InternalKey, []byte) { 752 k, v := i.seekPrefixGE(prefix, key, flags, i.useFilter) 753 return k, v 754 } 755 756 func (i *singleLevelIterator) seekPrefixGE( 757 prefix, key []byte, flags base.SeekGEFlags, checkFilter bool, 758 ) (k *InternalKey, value []byte) { 759 i.err = nil 760 if checkFilter && i.reader.tableFilter != nil { 761 if !i.lastBloomFilterMatched { 762 // Iterator is not positioned based on last seek. 763 flags = flags.DisableTrySeekUsingNext() 764 } 765 i.lastBloomFilterMatched = false 766 // Check prefix bloom filter. 767 var dataH cache.Handle 768 dataH, i.err = i.reader.readFilter() 769 if i.err != nil { 770 i.data.invalidate() 771 return nil, nil 772 } 773 mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix) 774 dataH.Release() 775 if !mayContain { 776 // This invalidation may not be necessary for correctness, and may 777 // be a place to optimize later by reusing the already loaded 778 // block. It was necessary in earlier versions of the code since 779 // the caller was allowed to call Next when SeekPrefixGE returned 780 // nil. This is no longer allowed. 781 i.data.invalidate() 782 return nil, nil 783 } 784 i.lastBloomFilterMatched = true 785 } 786 // The i.exhaustedBounds comparison indicates that the upper bound was 787 // reached. The i.data.isDataInvalidated() indicates that the sstable was 788 // exhausted. 789 if flags.TrySeekUsingNext() && (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) { 790 // Already exhausted, so return nil. 791 return nil, nil 792 } 793 // Bloom filter matches, or skipped, so this method will position the 794 // iterator. 795 i.exhaustedBounds = 0 796 boundsCmp := i.boundsCmp 797 // Seek optimization only applies until iterator is first positioned after SetBounds. 798 i.boundsCmp = 0 799 i.positionedUsingLatestBounds = true 800 k, value = i.seekGEHelper(key, boundsCmp, flags) 801 return k, value 802 } 803 804 // SeekLT implements internalIterator.SeekLT, as documented in the bitalostable 805 // package. Note that SeekLT only checks the lower bound. It is up to the 806 // caller to ensure that key is less than the upper bound. 807 func (i *singleLevelIterator) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) { 808 i.exhaustedBounds = 0 809 i.err = nil 810 boundsCmp := i.boundsCmp 811 // Seek optimization only applies until iterator is first positioned after SetBounds. 812 i.boundsCmp = 0 813 814 // Seeking operations perform various step-instead-of-seeking optimizations: 815 // eg by considering monotonically increasing bounds (i.boundsCmp). 
	// Care must be taken to ensure that when performing these optimizations and
	// the iterator becomes exhausted, i.maybeFilteredKeysSingleLevel is set
	// appropriately. Consider a previous SeekLT that filtered keys from k
	// until the current iterator position.
	//
	// If the previous SeekLT exhausted the iterator, it's possible keys
	// less than the current search key were filtered. We must not reuse the
	// current iterator position without remembering the previous value of
	// maybeFilteredKeysSingleLevel.

	i.positionedUsingLatestBounds = true

	var dontSeekWithinBlock bool
	if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() &&
		boundsCmp < 0 && i.cmp(i.data.firstKey.UserKey, key) < 0 {
		// Fast-path: The bounds have moved backward, and this SeekLT is
		// respecting the upper bound (guaranteed by Iterator). We know that
		// the iterator must already be positioned within or just outside the
		// previous bounds. Therefore it cannot be positioned at a block (or
		// the position within that block) that is behind the seek position.
		// However it can be positioned at a later block. This fast-path to
		// use Prev() on the block is only applied when we are already at the
		// block that can satisfy this seek -- this is the motivation for
		// the i.cmp(i.data.firstKey.UserKey, key) < 0 predicate.
		i.initBoundsForAlreadyLoadedBlock()
		ikey, val, done := i.trySeekLTUsingPrevWithinBlock(key)
		if done {
			return ikey, val
		}
		if ikey == nil {
			// Done with this block.
			dontSeekWithinBlock = true
		}
	} else {
		// Slow-path.
		i.maybeFilteredKeysSingleLevel = false
		var ikey *InternalKey

		// NB: If a bound-limited block property filter is configured, it's
		// externally ensured that the filter is disabled (through returning
		// Intersects=false irrespective of the block props provided) during
		// seeks.
		if ikey, _ = i.index.SeekGE(key, base.SeekGEFlagsNone); ikey == nil {
			ikey, _ = i.index.Last()
			if ikey == nil {
				return nil, nil
			}
		}
		// INVARIANT: ikey != nil.
		result := i.loadBlock(-1)
		if result == loadBlockFailed {
			return nil, nil
		}
		if result == loadBlockIrrelevant {
			// Enforce the lower bound here since we don't want to bother moving
			// to the previous block if lower bound is already exceeded. Note
			// that the previous block starts with keys <= ikey.UserKey since
			// even though this is the current block's separator, the same
			// user key can span multiple blocks.
			if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 {
				i.exhaustedBounds = -1
				return nil, nil
			}
			// Want to skip to the previous block.
			dontSeekWithinBlock = true
		}
	}
	if !dontSeekWithinBlock {
		if ikey, val := i.data.SeekLT(key, flags); ikey != nil {
			if i.blockLower != nil && i.cmp(ikey.UserKey, i.blockLower) < 0 {
				i.exhaustedBounds = -1
				return nil, nil
			}
			return ikey, val
		}
	}
	// The index contains separator keys which may lie between
	// user-keys. Consider the user-keys:
	//
	//   complete
	// ---- new block ---
	//   complexion
	//
	// If these two keys end one block and start the next, the index key may
	// be chosen as "compleu". The SeekGE in the index block will then point
	// us to the block containing "complexion". If this happens, we want the
	// last key from the previous data block.
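	// As a concrete, illustrative instance of the above: for SeekLT("complex"),
	// index.SeekGE("complex") skips the separator "compleu" (which sorts before
	// "complex") and lands on the entry for the block starting at "complexion".
	// That block holds no key < "complex", so the SeekLT within it finds
	// nothing, and skipBackward steps the index back one entry to load the
	// previous block and return its last key, "complete".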
903 return i.skipBackward() 904 } 905 906 // First implements internalIterator.First, as documented in the bitalostable 907 // package. Note that First only checks the upper bound. It is up to the caller 908 // to ensure that key is greater than or equal to the lower bound (e.g. via a 909 // call to SeekGE(lower)). 910 func (i *singleLevelIterator) First() (*InternalKey, []byte) { 911 if i.lower != nil { 912 panic("singleLevelIterator.First() used despite lower bound") 913 } 914 i.positionedUsingLatestBounds = true 915 i.maybeFilteredKeysSingleLevel = false 916 return i.firstInternal() 917 } 918 919 // firstInternal is a helper used for absolute positioning in a single-level 920 // index file, or for positioning in the second-level index in a two-level 921 // index file. For the latter, one cannot make any claims about absolute 922 // positioning. 923 func (i *singleLevelIterator) firstInternal() (*InternalKey, []byte) { 924 i.exhaustedBounds = 0 925 i.err = nil 926 // Seek optimization only applies until iterator is first positioned after SetBounds. 927 i.boundsCmp = 0 928 929 var ikey *InternalKey 930 if ikey, _ = i.index.First(); ikey == nil { 931 i.data.invalidate() 932 return nil, nil 933 } 934 result := i.loadBlock(+1) 935 if result == loadBlockFailed { 936 return nil, nil 937 } 938 if result == loadBlockOK { 939 if ikey, val := i.data.First(); ikey != nil { 940 if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 { 941 i.exhaustedBounds = +1 942 return nil, nil 943 } 944 return ikey, val 945 } 946 // Else fall through to skipForward. 947 } else { 948 // result == loadBlockIrrelevant. Enforce the upper bound here since 949 // don't want to bother moving to the next block if upper bound is 950 // already exceeded. Note that the next block starts with keys >= 951 // ikey.UserKey since even though this is the block separator, the 952 // same user key can span multiple blocks. Since upper is exclusive we 953 // use >= below. 954 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 955 i.exhaustedBounds = +1 956 return nil, nil 957 } 958 // Else fall through to skipForward. 959 } 960 961 return i.skipForward() 962 } 963 964 // Last implements internalIterator.Last, as documented in the bitalostable 965 // package. Note that Last only checks the lower bound. It is up to the caller 966 // to ensure that key is less than the upper bound (e.g. via a call to 967 // SeekLT(upper)) 968 func (i *singleLevelIterator) Last() (*InternalKey, []byte) { 969 if i.upper != nil { 970 panic("singleLevelIterator.Last() used despite upper bound") 971 } 972 i.positionedUsingLatestBounds = true 973 i.maybeFilteredKeysSingleLevel = false 974 return i.lastInternal() 975 } 976 977 // lastInternal is a helper used for absolute positioning in a single-level 978 // index file, or for positioning in the second-level index in a two-level 979 // index file. For the latter, one cannot make any claims about absolute 980 // positioning. 981 func (i *singleLevelIterator) lastInternal() (*InternalKey, []byte) { 982 i.exhaustedBounds = 0 983 i.err = nil 984 // Seek optimization only applies until iterator is first positioned after SetBounds. 
985 i.boundsCmp = 0 986 987 var ikey *InternalKey 988 if ikey, _ = i.index.Last(); ikey == nil { 989 i.data.invalidate() 990 return nil, nil 991 } 992 result := i.loadBlock(-1) 993 if result == loadBlockFailed { 994 return nil, nil 995 } 996 if result == loadBlockOK { 997 if ikey, val := i.data.Last(); ikey != nil { 998 if i.blockLower != nil && i.cmp(ikey.UserKey, i.blockLower) < 0 { 999 i.exhaustedBounds = -1 1000 return nil, nil 1001 } 1002 return ikey, val 1003 } 1004 // Else fall through to skipBackward. 1005 } else { 1006 // result == loadBlockIrrelevant. Enforce the lower bound here since 1007 // don't want to bother moving to the previous block if lower bound is 1008 // already exceeded. Note that the previous block starts with keys <= 1009 // key.UserKey since even though this is the current block's 1010 // separator, the same user key can span multiple blocks. 1011 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 1012 i.exhaustedBounds = -1 1013 return nil, nil 1014 } 1015 } 1016 1017 return i.skipBackward() 1018 } 1019 1020 // Next implements internalIterator.Next, as documented in the bitalostable 1021 // package. 1022 // Note: compactionIterator.Next mirrors the implementation of Iterator.Next 1023 // due to performance. Keep the two in sync. 1024 func (i *singleLevelIterator) Next() (*InternalKey, []byte) { 1025 if i.exhaustedBounds == +1 { 1026 panic("Next called even though exhausted upper bound") 1027 } 1028 i.exhaustedBounds = 0 1029 i.maybeFilteredKeysSingleLevel = false 1030 // Seek optimization only applies until iterator is first positioned after SetBounds. 1031 i.boundsCmp = 0 1032 1033 if i.err != nil { 1034 return nil, nil 1035 } 1036 if key, val := i.data.Next(); key != nil { 1037 if i.blockUpper != nil && i.cmp(key.UserKey, i.blockUpper) >= 0 { 1038 i.exhaustedBounds = +1 1039 return nil, nil 1040 } 1041 return key, val 1042 } 1043 return i.skipForward() 1044 } 1045 1046 // Prev implements internalIterator.Prev, as documented in the bitalostable 1047 // package. 1048 func (i *singleLevelIterator) Prev() (*InternalKey, []byte) { 1049 if i.exhaustedBounds == -1 { 1050 panic("Prev called even though exhausted lower bound") 1051 } 1052 i.exhaustedBounds = 0 1053 i.maybeFilteredKeysSingleLevel = false 1054 // Seek optimization only applies until iterator is first positioned after SetBounds. 1055 i.boundsCmp = 0 1056 1057 if i.err != nil { 1058 return nil, nil 1059 } 1060 if key, val := i.data.Prev(); key != nil { 1061 if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 { 1062 i.exhaustedBounds = -1 1063 return nil, nil 1064 } 1065 return key, val 1066 } 1067 return i.skipBackward() 1068 } 1069 1070 func (i *singleLevelIterator) skipForward() (*InternalKey, []byte) { 1071 for { 1072 var key *InternalKey 1073 if key, _ = i.index.Next(); key == nil { 1074 i.data.invalidate() 1075 break 1076 } 1077 result := i.loadBlock(+1) 1078 if result != loadBlockOK { 1079 if i.err != nil { 1080 break 1081 } 1082 if result == loadBlockFailed { 1083 // We checked that i.index was at a valid entry, so 1084 // loadBlockFailed could not have happened due to to i.index 1085 // being exhausted, and must be due to an error. 1086 panic("loadBlock should not have failed with no error") 1087 } 1088 // result == loadBlockIrrelevant. Enforce the upper bound here 1089 // since don't want to bother moving to the next block if upper 1090 // bound is already exceeded. 
Note that the next block starts with 1091 // keys >= key.UserKey since even though this is the block 1092 // separator, the same user key can span multiple blocks. Since 1093 // upper is exclusive we use >= below. 1094 if i.upper != nil && i.cmp(key.UserKey, i.upper) >= 0 { 1095 i.exhaustedBounds = +1 1096 return nil, nil 1097 } 1098 continue 1099 } 1100 if key, val := i.data.First(); key != nil { 1101 if i.blockUpper != nil && i.cmp(key.UserKey, i.blockUpper) >= 0 { 1102 i.exhaustedBounds = +1 1103 return nil, nil 1104 } 1105 return key, val 1106 } 1107 } 1108 return nil, nil 1109 } 1110 1111 func (i *singleLevelIterator) skipBackward() (*InternalKey, []byte) { 1112 for { 1113 var key *InternalKey 1114 if key, _ = i.index.Prev(); key == nil { 1115 i.data.invalidate() 1116 break 1117 } 1118 result := i.loadBlock(-1) 1119 if result != loadBlockOK { 1120 if i.err != nil { 1121 break 1122 } 1123 if result == loadBlockFailed { 1124 // We checked that i.index was at a valid entry, so 1125 // loadBlockFailed could not have happened due to to i.index 1126 // being exhausted, and must be due to an error. 1127 panic("loadBlock should not have failed with no error") 1128 } 1129 // result == loadBlockIrrelevant. Enforce the lower bound here 1130 // since don't want to bother moving to the previous block if lower 1131 // bound is already exceeded. Note that the previous block starts with 1132 // keys <= key.UserKey since even though this is the current block's 1133 // separator, the same user key can span multiple blocks. 1134 if i.lower != nil && i.cmp(key.UserKey, i.lower) < 0 { 1135 i.exhaustedBounds = -1 1136 return nil, nil 1137 } 1138 continue 1139 } 1140 key, val := i.data.Last() 1141 if key == nil { 1142 return nil, nil 1143 } 1144 if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 { 1145 i.exhaustedBounds = -1 1146 return nil, nil 1147 } 1148 return key, val 1149 } 1150 return nil, nil 1151 } 1152 1153 // Error implements internalIterator.Error, as documented in the bitalostable 1154 // package. 1155 func (i *singleLevelIterator) Error() error { 1156 if err := i.data.Error(); err != nil { 1157 return err 1158 } 1159 return i.err 1160 } 1161 1162 // MaybeFilteredKeys may be called when an iterator is exhausted to indicate 1163 // whether or not the last positioning method may have skipped any keys due to 1164 // block-property filters. 1165 func (i *singleLevelIterator) MaybeFilteredKeys() bool { 1166 return i.maybeFilteredKeysSingleLevel 1167 } 1168 1169 // SetCloseHook sets a function that will be called when the iterator is 1170 // closed. 1171 func (i *singleLevelIterator) SetCloseHook(fn func(i Iterator) error) { 1172 i.closeHook = fn 1173 } 1174 1175 func firstError(err0, err1 error) error { 1176 if err0 != nil { 1177 return err0 1178 } 1179 return err1 1180 } 1181 1182 // Close implements internalIterator.Close, as documented in the bitalostable 1183 // package. 
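//
// Close also returns the iterator to its sync.Pool (see the resetForReuse and
// singleLevelIterPool.Put calls below), so the iterator must not be used after
// Close returns. A minimal usage sketch, assuming a Reader.NewIter(lower,
// upper) constructor as in upstream Pebble:
//
//	iter, err := r.NewIter(lower, upper)
//	if err != nil {
//		return err
//	}
//	defer iter.Close()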
1184 func (i *singleLevelIterator) Close() error { 1185 var err error 1186 if i.closeHook != nil { 1187 err = firstError(err, i.closeHook(i)) 1188 } 1189 err = firstError(err, i.data.Close()) 1190 err = firstError(err, i.index.Close()) 1191 if i.dataRS.sequentialFile != nil { 1192 err = firstError(err, i.dataRS.sequentialFile.Close()) 1193 i.dataRS.sequentialFile = nil 1194 } 1195 err = firstError(err, i.err) 1196 if i.bpfs != nil { 1197 releaseBlockPropertiesFilterer(i.bpfs) 1198 } 1199 *i = i.resetForReuse() 1200 singleLevelIterPool.Put(i) 1201 return err 1202 } 1203 1204 func (i *singleLevelIterator) String() string { 1205 return i.reader.fileNum.String() 1206 } 1207 1208 // Deterministic disabling of the bounds-based optimization that avoids seeking. 1209 // Uses the iterator pointer, since we want diversity in iterator behavior for 1210 // the same SetBounds call. Used for tests. 1211 func disableBoundsOpt(bound []byte, ptr uintptr) bool { 1212 // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ 1213 simpleHash := (11400714819323198485 * uint64(ptr)) >> 63 1214 return bound[len(bound)-1]&byte(1) == 0 && simpleHash == 0 1215 } 1216 1217 // SetBounds implements internalIterator.SetBounds, as documented in the bitalostable 1218 // package. 1219 func (i *singleLevelIterator) SetBounds(lower, upper []byte) { 1220 i.boundsCmp = 0 1221 if i.positionedUsingLatestBounds { 1222 if i.upper != nil && lower != nil && i.cmp(i.upper, lower) <= 0 { 1223 i.boundsCmp = +1 1224 if invariants.Enabled && disableBoundsOpt(lower, uintptr(unsafe.Pointer(i))) { 1225 i.boundsCmp = 0 1226 } 1227 } else if i.lower != nil && upper != nil && i.cmp(upper, i.lower) <= 0 { 1228 i.boundsCmp = -1 1229 if invariants.Enabled && disableBoundsOpt(upper, uintptr(unsafe.Pointer(i))) { 1230 i.boundsCmp = 0 1231 } 1232 } 1233 i.positionedUsingLatestBounds = false 1234 } 1235 i.lower = lower 1236 i.upper = upper 1237 i.blockLower = nil 1238 i.blockUpper = nil 1239 } 1240 1241 var _ base.InternalIterator = &singleLevelIterator{} 1242 var _ base.InternalIterator = &twoLevelIterator{} 1243 1244 // compactionIterator is similar to Iterator but it increments the number of 1245 // bytes that have been iterated through. 1246 type compactionIterator struct { 1247 *singleLevelIterator 1248 bytesIterated *uint64 1249 prevOffset uint64 1250 } 1251 1252 // compactionIterator implements the base.InternalIterator interface. 
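//
// An illustrative sketch of how the byte accounting is consumed. Construction
// is normally performed by the Reader (e.g. a NewCompactionIter-style helper,
// which is an assumption here); the literal struct initialization below is
// shown only for illustration:
//
//	var bytesIterated uint64
//	ci := &compactionIterator{singleLevelIterator: it, bytesIterated: &bytesIterated}
//	for k, _ := ci.First(); k != nil; k, _ = ci.Next() {
//		// bytesIterated now approximates the compressed table bytes consumed so far.
//	}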
var _ base.InternalIterator = (*compactionIterator)(nil)

func (i *compactionIterator) String() string {
	return i.reader.fileNum.String()
}

func (i *compactionIterator) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
	panic("bitalostable: SeekGE unimplemented")
}

func (i *compactionIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, []byte) {
	panic("bitalostable: SeekPrefixGE unimplemented")
}

func (i *compactionIterator) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
	panic("bitalostable: SeekLT unimplemented")
}

func (i *compactionIterator) First() (*InternalKey, []byte) {
	i.err = nil
	return i.skipForward(i.singleLevelIterator.First())
}

func (i *compactionIterator) Last() (*InternalKey, []byte) {
	panic("bitalostable: Last unimplemented")
}

// Note: compactionIterator.Next mirrors the implementation of Iterator.Next
// due to performance. Keep the two in sync.
func (i *compactionIterator) Next() (*InternalKey, []byte) {
	if i.err != nil {
		return nil, nil
	}
	return i.skipForward(i.data.Next())
}

func (i *compactionIterator) Prev() (*InternalKey, []byte) {
	panic("bitalostable: Prev unimplemented")
}

func (i *compactionIterator) skipForward(key *InternalKey, val []byte) (*InternalKey, []byte) {
	if key == nil {
		for {
			if key, _ := i.index.Next(); key == nil {
				break
			}
			result := i.loadBlock(+1)
			if result != loadBlockOK {
				if i.err != nil {
					break
				}
				switch result {
				case loadBlockFailed:
					// We checked that i.index was at a valid entry, so
					// loadBlockFailed could not have happened due to i.index
					// being exhausted, and must be due to an error.
					panic("loadBlock should not have failed with no error")
				case loadBlockIrrelevant:
					panic("compactionIter should not be using block intervals for skipping")
				default:
					panic(fmt.Sprintf("unexpected case %d", result))
				}
			}
			// result == loadBlockOK
			if key, val = i.data.First(); key != nil {
				break
			}
		}
	}

	curOffset := i.recordOffset()
	*i.bytesIterated += uint64(curOffset - i.prevOffset)
	i.prevOffset = curOffset
	return key, val
}

type twoLevelIterator struct {
	singleLevelIterator
	// maybeFilteredKeysTwoLevel indicates whether the last iterator
	// positioning operation may have skipped any index blocks due to
	// block-property filters when positioning the top-level-index.
	maybeFilteredKeysTwoLevel bool
	topLevelIndex             blockIter
}

// twoLevelIterator implements the base.InternalIterator interface.
var _ base.InternalIterator = (*twoLevelIterator)(nil)

// loadIndex loads the index block at the current top level index position and
// leaves i.index unpositioned. If unsuccessful, it sets i.err to any error
// encountered, which may be nil if we have simply exhausted the entire table.
// This is used for two level indexes.
func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult {
	// Ensure the data block iterator is invalidated even if loading of the
	// index fails.
	i.data.invalidate()
	if !i.topLevelIndex.valid() {
		i.index.offset = 0
		i.index.restarts = 0
		return loadBlockFailed
	}
	bhp, err := decodeBlockHandleWithProperties(i.topLevelIndex.Value())
	if err != nil {
		i.err = base.CorruptionErrorf("bitalostable/table: corrupt top level index entry")
		return loadBlockFailed
	}
	if i.bpfs != nil {
		intersects, err := i.bpfs.intersects(bhp.Props)
		if err != nil {
			i.err = errCorruptIndexEntry
			return loadBlockFailed
		}
		if intersects == blockMaybeExcluded {
			intersects = i.resolveMaybeExcluded(dir)
		}
		if intersects == blockExcluded {
			i.maybeFilteredKeysTwoLevel = true
			return loadBlockIrrelevant
		}
		// blockIntersects
	}
	indexBlock, err := i.readBlockWithStats(bhp.BlockHandle, nil /* readaheadState */)
	if err != nil {
		i.err = err
		return loadBlockFailed
	}
	if i.err = i.index.initHandle(
		i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum); i.err == nil {
		return loadBlockOK
	}
	return loadBlockFailed
}

// resolveMaybeExcluded is invoked when the block-property filterer has found
// that an index block is excluded according to its properties but only if its
// bounds fall within the filter's current bounds. This function consults the
// appropriate bound, depending on the iteration direction, and returns either
// `blockIntersects` or `blockExcluded`.
func (i *twoLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
	// This iterator is configured with a bound-limited block property filter.
	// The bpf determined this entire index block could be excluded from
	// iteration based on the property encoded in the block handle. However, we
	// still need to determine if the index block is wholly contained within the
	// filter's key bounds.
	//
	// External guarantees ensure all its data blocks' keys are ≥ the filter's
	// lower bound during forward iteration, and that all its data blocks' keys
	// are < the filter's upper bound during backward iteration. We only need to
	// determine if the opposite bound is also met.
	//
	// The index separator in topLevelIndex.Key() provides an inclusive
	// upper-bound for the index block's keys, guaranteeing that all its keys
	// are ≤ topLevelIndex.Key(). For forward iteration, this is all we need.
	if dir > 0 {
		// Forward iteration.
		if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.topLevelIndex.Key()) {
			return blockExcluded
		}
		return blockIntersects
	}

	// Reverse iteration.
	//
	// Because we're iterating in the reverse direction, we don't yet have
	// enough context available to determine if the block is wholly contained
	// within its bounds. This case arises only during backward iteration,
	// because of the way the index is structured.
	//
	// Consider a bound-limited bpf limited to the bounds [b,d), loading the
	// block with separator `c`. During reverse iteration, the guarantee that
	// all the block's keys are < `d` is externally provided, but no guarantee
	// is made on the bpf's lower bound. The separator `c` only provides an
	// inclusive upper bound on the block's keys, indicating that the
	// corresponding block handle points to a block containing only keys ≤ `c`.
1430 // 1431 // To establish a lower bound, we step the top-level index backwards to read 1432 // the previous block's separator, which provides an inclusive lower bound 1433 // on the original index block's keys. Afterwards, we step forward to 1434 // restore our top-level index position. 1435 if peekKey, _ := i.topLevelIndex.Prev(); peekKey == nil { 1436 // The original block points to the first index block of this table. If 1437 // we knew the lower bound for the entire table, it could provide a 1438 // lower bound, but the code refactoring necessary to read it doesn't 1439 // seem worth the payoff. We fall through to loading the block. 1440 } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey) { 1441 // The lower-bound on the original index block falls within the filter's 1442 // bounds, and we can skip the block (after restoring our current 1443 // top-level index position). 1444 _, _ = i.topLevelIndex.Next() 1445 return blockExcluded 1446 } 1447 _, _ = i.topLevelIndex.Next() 1448 return blockIntersects 1449 } 1450 1451 func (i *twoLevelIterator) init( 1452 r *Reader, 1453 lower, upper []byte, 1454 filterer *BlockPropertiesFilterer, 1455 useFilter bool, 1456 stats *base.InternalIteratorStats, 1457 ) error { 1458 if r.err != nil { 1459 return r.err 1460 } 1461 topLevelIndexH, err := r.readIndex() 1462 if err != nil { 1463 return err 1464 } 1465 1466 i.lower = lower 1467 i.upper = upper 1468 i.bpfs = filterer 1469 i.useFilter = useFilter 1470 i.reader = r 1471 i.cmp = r.Compare 1472 i.stats = stats 1473 err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum) 1474 if err != nil { 1475 // blockIter.Close releases topLevelIndexH and always returns a nil error 1476 _ = i.topLevelIndex.Close() 1477 return err 1478 } 1479 return nil 1480 } 1481 1482 func (i *twoLevelIterator) String() string { 1483 return i.reader.fileNum.String() 1484 } 1485 1486 // MaybeFilteredKeys may be called when an iterator is exhausted to indicate 1487 // whether or not the last positioning method may have skipped any keys due to 1488 // block-property filters. 1489 func (i *twoLevelIterator) MaybeFilteredKeys() bool { 1490 // While reading sstables with two-level indexes, knowledge of whether we've 1491 // filtered keys is tracked separately for each index level. The 1492 // seek-using-next optimizations have different criteria. We can only reset 1493 // maybeFilteredKeys back to false during a seek when NOT using the 1494 // fast-path that uses the current iterator position. 1495 // 1496 // If either level might have filtered keys to arrive at the current 1497 // iterator position, return MaybeFilteredKeys=true. 1498 return i.maybeFilteredKeysTwoLevel || i.maybeFilteredKeysSingleLevel 1499 } 1500 1501 // SeekGE implements internalIterator.SeekGE, as documented in the bitalostable 1502 // package. Note that SeekGE only checks the upper bound. It is up to the 1503 // caller to ensure that key is greater than or equal to the lower bound. 1504 func (i *twoLevelIterator) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) { 1505 i.exhaustedBounds = 0 1506 i.err = nil 1507 1508 // SeekGE performs various step-instead-of-seeking optimizations: eg enabled 1509 // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp). 1510 // Care must be taken to ensure that when performing these optimizations and 1511 // the iterator becomes exhausted, i.maybeFilteredKeys is set appropriately. 
1512 // Consider a previous SeekGE that filtered keys from k until the current 1513 // iterator position. 1514 // 1515 // If the previous SeekGE exhausted the iterator while seeking within the 1516 // two-level index, it's possible keys greater than or equal to the current 1517 // search key were filtered through skipped index blocks. We must not reuse 1518 // the position of the two-level index iterator without remembering the 1519 // previous value of maybeFilteredKeys. 1520 1521 var dontSeekWithinSingleLevelIter bool 1522 if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || 1523 (i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 { 1524 // Slow-path: need to position the topLevelIndex. 1525 i.maybeFilteredKeysTwoLevel = false 1526 flags = flags.DisableTrySeekUsingNext() 1527 var ikey *InternalKey 1528 if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil { 1529 i.data.invalidate() 1530 i.index.invalidate() 1531 return nil, nil 1532 } 1533 1534 result := i.loadIndex(+1) 1535 if result == loadBlockFailed { 1536 return nil, nil 1537 } 1538 if result == loadBlockIrrelevant { 1539 // Enforce the upper bound here since don't want to bother moving 1540 // to the next entry in the top level index if upper bound is 1541 // already exceeded. Note that the next entry starts with keys >= 1542 // ikey.UserKey since even though this is the block separator, the 1543 // same user key can span multiple index blocks. Since upper is 1544 // exclusive we use >= below. 1545 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 1546 i.exhaustedBounds = +1 1547 } 1548 // Fall through to skipForward. 1549 dontSeekWithinSingleLevelIter = true 1550 } 1551 } 1552 // Else fast-path: There are two possible cases, from 1553 // (i.boundsCmp > 0 || flags.TrySeekUsingNext()): 1554 // 1555 // 1) The bounds have moved forward (i.boundsCmp > 0) and this SeekGE is 1556 // respecting the lower bound (guaranteed by Iterator). We know that 1557 // the iterator must already be positioned within or just outside the 1558 // previous bounds. Therefore the topLevelIndex iter cannot be 1559 // positioned at an entry ahead of the seek position (though it can be 1560 // positioned behind). The !i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 1561 // confirms that it is not behind. Since it is not ahead and not behind 1562 // it must be at the right position. 1563 // 1564 // 2) This SeekGE will land on a key that is greater than the key we are 1565 // currently at (guaranteed by trySeekUsingNext), but since 1566 // i.cmp(key, i.topLevelIndex.Key().UserKey) <= 0, we are at the correct 1567 // lower level index block. No need to reset the state of singleLevelIterator. 1568 1569 if !dontSeekWithinSingleLevelIter { 1570 // Note that while trySeekUsingNext could be false here, singleLevelIterator 1571 // could do its own boundsCmp-based optimization to seek using next. 1572 if ikey, val := i.singleLevelIterator.SeekGE(key, flags); ikey != nil { 1573 return ikey, val 1574 } 1575 } 1576 return i.skipForward() 1577 } 1578 1579 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 1580 // bitalostable package. Note that SeekPrefixGE only checks the upper bound. It is up 1581 // to the caller to ensure that key is greater than or equal to the lower bound. 1582 func (i *twoLevelIterator) SeekPrefixGE( 1583 prefix, key []byte, flags base.SeekGEFlags, 1584 ) (*base.InternalKey, []byte) { 1585 i.err = nil 1586 1587 // Check prefix bloom filter. 
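// The filter block, when present, stores a bloom filter built over key
// prefixes. A negative mayContain answer below lets SeekPrefixGE return nil
// without consulting the index or data blocks, and lastBloomFilterMatched
// gates the TrySeekUsingNext optimization so that it is only honored when the
// iterator's current position was itself established by a positive filter
// match.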
1588 if i.reader.tableFilter != nil && i.useFilter { 1589 if !i.lastBloomFilterMatched { 1590 // Iterator is not positioned based on last seek. 1591 flags = flags.DisableTrySeekUsingNext() 1592 } 1593 i.lastBloomFilterMatched = false 1594 var dataH cache.Handle 1595 dataH, i.err = i.reader.readFilter() 1596 if i.err != nil { 1597 i.data.invalidate() 1598 return nil, nil 1599 } 1600 mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix) 1601 dataH.Release() 1602 if !mayContain { 1603 // This invalidation may not be necessary for correctness, and may 1604 // be a place to optimize later by reusing the already loaded 1605 // block. It was necessary in earlier versions of the code since 1606 // the caller was allowed to call Next when SeekPrefixGE returned 1607 // nil. This is no longer allowed. 1608 i.data.invalidate() 1609 return nil, nil 1610 } 1611 i.lastBloomFilterMatched = true 1612 } 1613 1614 // Bloom filter matches. 1615 i.exhaustedBounds = 0 1616 1617 // SeekPrefixGE performs various step-instead-of-seeking optimizations: eg 1618 // enabled by trySeekUsingNext, or by monotonically increasing bounds 1619 // (i.boundsCmp). Care must be taken to ensure that when performing these 1620 // optimizations and the iterator becomes exhausted, 1621 // i.maybeFilteredKeysTwoLevel is set appropriately. Consider a previous 1622 // SeekPrefixGE that filtered keys from k until the current iterator 1623 // position. 1624 // 1625 // If the previous SeekPrefixGE exhausted the iterator while seeking within 1626 // the two-level index, it's possible keys greater than or equal to the 1627 // current search key were filtered through skipped index blocks. We must 1628 // not reuse the position of the two-level index iterator without 1629 // remembering the previous value of maybeFilteredKeysTwoLevel. 1630 1631 var dontSeekWithinSingleLevelIter bool 1632 if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || 1633 i.boundsCmp <= 0 || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 { 1634 // Slow-path: need to position the topLevelIndex. 1635 // 1636 // TODO(sumeer): improve this slow-path to be able to use Next, when 1637 // flags.TrySeekUsingNext() is true, since the fast path never applies 1638 // for practical uses of SeekPrefixGE in CockroachDB (they never set 1639 // monotonic bounds). To apply it here, we would need to confirm that 1640 // the topLevelIndex can continue using the same second level index 1641 // block, and in that case we don't need to invalidate and reload the 1642 // singleLevelIterator state. 1643 i.maybeFilteredKeysTwoLevel = false 1644 flags = flags.DisableTrySeekUsingNext() 1645 var ikey *InternalKey 1646 if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil { 1647 i.data.invalidate() 1648 i.index.invalidate() 1649 return nil, nil 1650 } 1651 1652 result := i.loadIndex(+1) 1653 if result == loadBlockFailed { 1654 return nil, nil 1655 } 1656 if result == loadBlockIrrelevant { 1657 // Enforce the upper bound here since don't want to bother moving 1658 // to the next entry in the top level index if upper bound is 1659 // already exceeded. Note that the next entry starts with keys >= 1660 // ikey.UserKey since even though this is the block separator, the 1661 // same user key can span multiple index blocks. Since upper is 1662 // exclusive we use >= below. 1663 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 1664 i.exhaustedBounds = +1 1665 } 1666 // Fall through to skipForward. 
1667 dontSeekWithinSingleLevelIter = true 1668 } 1669 } 1670 // Else fast-path: The bounds have moved forward and this SeekGE is 1671 // respecting the lower bound (guaranteed by Iterator). We know that 1672 // the iterator must already be positioned within or just outside the 1673 // previous bounds. Therefore the topLevelIndex iter cannot be 1674 // positioned at an entry ahead of the seek position (though it can be 1675 // positioned behind). The !i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 1676 // confirms that it is not behind. Since it is not ahead and not behind 1677 // it must be at the right position. 1678 1679 if !dontSeekWithinSingleLevelIter { 1680 if ikey, val := i.singleLevelIterator.seekPrefixGE( 1681 prefix, key, flags, false /* checkFilter */); ikey != nil { 1682 return ikey, val 1683 } 1684 } 1685 // NB: skipForward checks whether exhaustedBounds is already +1. 1686 return i.skipForward() 1687 } 1688 1689 // SeekLT implements internalIterator.SeekLT, as documented in the bitalostable 1690 // package. Note that SeekLT only checks the lower bound. It is up to the 1691 // caller to ensure that key is less than the upper bound. 1692 func (i *twoLevelIterator) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) { 1693 i.exhaustedBounds = 0 1694 i.err = nil 1695 // Seek optimization only applies until iterator is first positioned after SetBounds. 1696 i.boundsCmp = 0 1697 1698 var result loadBlockResult 1699 var ikey *InternalKey 1700 // NB: Unlike SeekGE, we don't have a fast-path here since we don't know 1701 // whether the topLevelIndex is positioned after the position that would 1702 // be returned by doing i.topLevelIndex.SeekGE(). To know this we would 1703 // need to know the index key preceding the current one. 1704 // NB: If a bound-limited block property filter is configured, it's 1705 // externally ensured that the filter is disabled (through returning 1706 // Intersects=false irrespective of the block props provided) during seeks. 1707 i.maybeFilteredKeysTwoLevel = false 1708 if ikey, _ = i.topLevelIndex.SeekGE(key, base.SeekGEFlagsNone); ikey == nil { 1709 if ikey, _ = i.topLevelIndex.Last(); ikey == nil { 1710 i.data.invalidate() 1711 i.index.invalidate() 1712 return nil, nil 1713 } 1714 1715 result = i.loadIndex(-1) 1716 if result == loadBlockFailed { 1717 return nil, nil 1718 } 1719 if result == loadBlockOK { 1720 if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil { 1721 return ikey, val 1722 } 1723 // Fall through to skipBackward since the singleLevelIterator did 1724 // not have any blocks that satisfy the block interval 1725 // constraints, or the lower bound was reached. 1726 } 1727 // Else loadBlockIrrelevant, so fall through. 1728 } else { 1729 result = i.loadIndex(-1) 1730 if result == loadBlockFailed { 1731 return nil, nil 1732 } 1733 if result == loadBlockOK { 1734 if ikey, val := i.singleLevelIterator.SeekLT(key, flags); ikey != nil { 1735 return ikey, val 1736 } 1737 // Fall through to skipBackward since the singleLevelIterator did 1738 // not have any blocks that satisfy the block interval 1739 // constraint, or the lower bound was reached. 1740 } 1741 // Else loadBlockIrrelevant, so fall through. 1742 } 1743 if result == loadBlockIrrelevant { 1744 // Enforce the lower bound here since don't want to bother moving to 1745 // the previous entry in the top level index if lower bound is already 1746 // exceeded. 
Note that the previous entry starts with keys <= 1747 // ikey.UserKey since even though this is the current block's 1748 // separator, the same user key can span multiple index blocks. 1749 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 1750 i.exhaustedBounds = -1 1751 } 1752 } 1753 // NB: skipBackward checks whether exhaustedBounds is already -1. 1754 return i.skipBackward() 1755 } 1756 1757 // First implements internalIterator.First, as documented in the bitalostable 1758 // package. Note that First only checks the upper bound. It is up to the caller 1759 // to ensure that key is greater than or equal to the lower bound (e.g. via a 1760 // call to SeekGE(lower)). 1761 func (i *twoLevelIterator) First() (*InternalKey, []byte) { 1762 if i.lower != nil { 1763 panic("twoLevelIterator.First() used despite lower bound") 1764 } 1765 i.exhaustedBounds = 0 1766 i.maybeFilteredKeysTwoLevel = false 1767 i.err = nil 1768 // Seek optimization only applies until iterator is first positioned after SetBounds. 1769 i.boundsCmp = 0 1770 1771 var ikey *InternalKey 1772 if ikey, _ = i.topLevelIndex.First(); ikey == nil { 1773 return nil, nil 1774 } 1775 1776 result := i.loadIndex(+1) 1777 if result == loadBlockFailed { 1778 return nil, nil 1779 } 1780 if result == loadBlockOK { 1781 if ikey, val := i.singleLevelIterator.First(); ikey != nil { 1782 return ikey, val 1783 } 1784 // Else fall through to skipForward. 1785 } else { 1786 // result == loadBlockIrrelevant. Enforce the upper bound here since 1787 // don't want to bother moving to the next entry in the top level 1788 // index if upper bound is already exceeded. Note that the next entry 1789 // starts with keys >= ikey.UserKey since even though this is the 1790 // block separator, the same user key can span multiple index blocks. 1791 // Since upper is exclusive we use >= below. 1792 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 1793 i.exhaustedBounds = +1 1794 } 1795 } 1796 // NB: skipForward checks whether exhaustedBounds is already +1. 1797 return i.skipForward() 1798 } 1799 1800 // Last implements internalIterator.Last, as documented in the bitalostable 1801 // package. Note that Last only checks the lower bound. It is up to the caller 1802 // to ensure that key is less than the upper bound (e.g. via a call to 1803 // SeekLT(upper)) 1804 func (i *twoLevelIterator) Last() (*InternalKey, []byte) { 1805 if i.upper != nil { 1806 panic("twoLevelIterator.Last() used despite upper bound") 1807 } 1808 i.exhaustedBounds = 0 1809 i.maybeFilteredKeysTwoLevel = false 1810 i.err = nil 1811 // Seek optimization only applies until iterator is first positioned after SetBounds. 1812 i.boundsCmp = 0 1813 1814 var ikey *InternalKey 1815 if ikey, _ = i.topLevelIndex.Last(); ikey == nil { 1816 return nil, nil 1817 } 1818 1819 result := i.loadIndex(-1) 1820 if result == loadBlockFailed { 1821 return nil, nil 1822 } 1823 if result == loadBlockOK { 1824 if ikey, val := i.singleLevelIterator.Last(); ikey != nil { 1825 return ikey, val 1826 } 1827 // Else fall through to skipBackward. 1828 } else { 1829 // result == loadBlockIrrelevant. Enforce the lower bound here 1830 // since don't want to bother moving to the previous entry in the 1831 // top level index if lower bound is already exceeded. Note that 1832 // the previous entry starts with keys <= ikey.UserKey since even 1833 // though this is the current block's separator, the same user key 1834 // can span multiple index blocks. 
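// Since lower is inclusive we use < below.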
1835 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 1836 i.exhaustedBounds = -1 1837 } 1838 } 1839 // NB: skipBackward checks whether exhaustedBounds is already -1. 1840 return i.skipBackward() 1841 } 1842 1843 // Next implements internalIterator.Next, as documented in the bitalostable 1844 // package. 1845 // Note: twoLevelCompactionIterator.Next mirrors the implementation of 1846 // twoLevelIterator.Next due to performance. Keep the two in sync. 1847 func (i *twoLevelIterator) Next() (*InternalKey, []byte) { 1848 // Seek optimization only applies until iterator is first positioned after SetBounds. 1849 i.boundsCmp = 0 1850 i.maybeFilteredKeysTwoLevel = false 1851 if i.err != nil { 1852 return nil, nil 1853 } 1854 if key, val := i.singleLevelIterator.Next(); key != nil { 1855 return key, val 1856 } 1857 return i.skipForward() 1858 } 1859 1860 // Prev implements internalIterator.Prev, as documented in the bitalostable 1861 // package. 1862 func (i *twoLevelIterator) Prev() (*InternalKey, []byte) { 1863 // Seek optimization only applies until iterator is first positioned after SetBounds. 1864 i.boundsCmp = 0 1865 i.maybeFilteredKeysTwoLevel = false 1866 if i.err != nil { 1867 return nil, nil 1868 } 1869 if key, val := i.singleLevelIterator.Prev(); key != nil { 1870 return key, val 1871 } 1872 return i.skipBackward() 1873 } 1874 1875 func (i *twoLevelIterator) skipForward() (*InternalKey, []byte) { 1876 for { 1877 if i.err != nil || i.exhaustedBounds > 0 { 1878 return nil, nil 1879 } 1880 i.exhaustedBounds = 0 1881 var ikey *InternalKey 1882 if ikey, _ = i.topLevelIndex.Next(); ikey == nil { 1883 i.data.invalidate() 1884 i.index.invalidate() 1885 return nil, nil 1886 } 1887 result := i.loadIndex(+1) 1888 if result == loadBlockFailed { 1889 return nil, nil 1890 } 1891 if result == loadBlockOK { 1892 if ikey, val := i.singleLevelIterator.firstInternal(); ikey != nil { 1893 return ikey, val 1894 } 1895 // Next iteration will return if singleLevelIterator set 1896 // exhaustedBounds = +1. 1897 } else { 1898 // result == loadBlockIrrelevant. Enforce the upper bound here 1899 // since don't want to bother moving to the next entry in the top 1900 // level index if upper bound is already exceeded. Note that the 1901 // next entry starts with keys >= ikey.UserKey since even though 1902 // this is the block separator, the same user key can span 1903 // multiple index blocks. Since upper is exclusive we use >= 1904 // below. 1905 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 1906 i.exhaustedBounds = +1 1907 // Next iteration will return. 1908 } 1909 } 1910 } 1911 } 1912 1913 func (i *twoLevelIterator) skipBackward() (*InternalKey, []byte) { 1914 for { 1915 if i.err != nil || i.exhaustedBounds < 0 { 1916 return nil, nil 1917 } 1918 i.exhaustedBounds = 0 1919 var ikey *InternalKey 1920 if ikey, _ = i.topLevelIndex.Prev(); ikey == nil { 1921 i.data.invalidate() 1922 i.index.invalidate() 1923 return nil, nil 1924 } 1925 result := i.loadIndex(-1) 1926 if result == loadBlockFailed { 1927 return nil, nil 1928 } 1929 if result == loadBlockOK { 1930 if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil { 1931 return ikey, val 1932 } 1933 // Next iteration will return if singleLevelIterator set 1934 // exhaustedBounds = -1. 1935 } else { 1936 // result == loadBlockIrrelevant. Enforce the lower bound here 1937 // since don't want to bother moving to the previous entry in the 1938 // top level index if lower bound is already exceeded. 
Note that 1939 // the previous entry starts with keys <= ikey.UserKey since even 1940 // though this is the current block's separator, the same user key 1941 // can span multiple index blocks. 1942 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 1943 i.exhaustedBounds = -1 1944 // Next iteration will return. 1945 } 1946 } 1947 } 1948 } 1949 1950 // Close implements internalIterator.Close, as documented in the bitalostable 1951 // package. 1952 func (i *twoLevelIterator) Close() error { 1953 var err error 1954 if i.closeHook != nil { 1955 err = firstError(err, i.closeHook(i)) 1956 } 1957 err = firstError(err, i.data.Close()) 1958 err = firstError(err, i.index.Close()) 1959 err = firstError(err, i.topLevelIndex.Close()) 1960 if i.dataRS.sequentialFile != nil { 1961 err = firstError(err, i.dataRS.sequentialFile.Close()) 1962 i.dataRS.sequentialFile = nil 1963 } 1964 err = firstError(err, i.err) 1965 if i.bpfs != nil { 1966 releaseBlockPropertiesFilterer(i.bpfs) 1967 } 1968 *i = twoLevelIterator{ 1969 singleLevelIterator: i.singleLevelIterator.resetForReuse(), 1970 topLevelIndex: i.topLevelIndex.resetForReuse(), 1971 } 1972 twoLevelIterPool.Put(i) 1973 return err 1974 } 1975 1976 // Note: twoLevelCompactionIterator and compactionIterator are very similar but 1977 // were separated due to performance. 1978 type twoLevelCompactionIterator struct { 1979 *twoLevelIterator 1980 bytesIterated *uint64 1981 prevOffset uint64 1982 } 1983 1984 // twoLevelCompactionIterator implements the base.InternalIterator interface. 1985 var _ base.InternalIterator = (*twoLevelCompactionIterator)(nil) 1986 1987 func (i *twoLevelCompactionIterator) Close() error { 1988 return i.twoLevelIterator.Close() 1989 } 1990 1991 func (i *twoLevelCompactionIterator) SeekGE( 1992 key []byte, flags base.SeekGEFlags, 1993 ) (*InternalKey, []byte) { 1994 panic("bitalostable: SeekGE unimplemented") 1995 } 1996 1997 func (i *twoLevelCompactionIterator) SeekPrefixGE( 1998 prefix, key []byte, flags base.SeekGEFlags, 1999 ) (*base.InternalKey, []byte) { 2000 panic("bitalostable: SeekPrefixGE unimplemented") 2001 } 2002 2003 func (i *twoLevelCompactionIterator) SeekLT( 2004 key []byte, flags base.SeekLTFlags, 2005 ) (*InternalKey, []byte) { 2006 panic("bitalostable: SeekLT unimplemented") 2007 } 2008 2009 func (i *twoLevelCompactionIterator) First() (*InternalKey, []byte) { 2010 i.err = nil 2011 return i.skipForward(i.twoLevelIterator.First()) 2012 } 2013 2014 func (i *twoLevelCompactionIterator) Last() (*InternalKey, []byte) { 2015 panic("bitalostable: Last unimplemented") 2016 } 2017 2018 // Note: twoLevelCompactionIterator.Next mirrors the implementation of 2019 // twoLevelIterator.Next due to performance. Keep the two in sync. 
2020 func (i *twoLevelCompactionIterator) Next() (*InternalKey, []byte) { 2021 if i.err != nil { 2022 return nil, nil 2023 } 2024 return i.skipForward(i.singleLevelIterator.Next()) 2025 } 2026 2027 func (i *twoLevelCompactionIterator) Prev() (*InternalKey, []byte) { 2028 panic("bitalostable: Prev unimplemented") 2029 } 2030 2031 func (i *twoLevelCompactionIterator) String() string { 2032 return i.reader.fileNum.String() 2033 } 2034 2035 func (i *twoLevelCompactionIterator) skipForward( 2036 key *InternalKey, val []byte, 2037 ) (*InternalKey, []byte) { 2038 if key == nil { 2039 for { 2040 if key, _ := i.topLevelIndex.Next(); key == nil { 2041 break 2042 } 2043 result := i.loadIndex(+1) 2044 if result != loadBlockOK { 2045 if i.err != nil { 2046 break 2047 } 2048 switch result { 2049 case loadBlockFailed: 2050 // We checked that i.index was at a valid entry, so 2051 // loadBlockFailed could not have happened due to i.index 2052 // being exhausted, and must be due to an error. 2053 panic("loadBlock should not have failed with no error") 2054 case loadBlockIrrelevant: 2055 panic("compactionIter should not be using block intervals for skipping") 2056 default: 2057 panic(fmt.Sprintf("unexpected case %d", result)) 2058 } 2059 } 2060 // result == loadBlockOK 2061 if key, val = i.singleLevelIterator.First(); key != nil { 2062 break 2063 } 2064 } 2065 } 2066 2067 curOffset := i.recordOffset() 2068 *i.bytesIterated += uint64(curOffset - i.prevOffset) 2069 i.prevOffset = curOffset 2070 return key, val 2071 } 2072 2073 type blockTransform func([]byte) ([]byte, error) 2074 2075 // readaheadState contains state variables related to readahead. Updated on 2076 // file reads. 2077 type readaheadState struct { 2078 // Number of sequential reads. 2079 numReads int64 2080 // Size issued to the next call to Prefetch. Starts at or above 2081 // initialReadaheadSize and grows exponentially until maxReadaheadSize. 2082 size int64 2083 // prevSize is the size used in the last Prefetch call. 2084 prevSize int64 2085 // The byte offset up to which the OS has been asked to read ahead / cached. 2086 // When reading ahead, reads up to this limit should not incur an IO 2087 // operation. Reads after this limit can benefit from a new call to 2088 // Prefetch. 2089 limit int64 2090 // sequentialFile holds a file descriptor to the same underlying File, 2091 // except with fadvise(FADV_SEQUENTIAL) called on it to take advantage of 2092 // OS-level readahead. Initialized when the iterator has been consistently 2093 // reading blocks in a sequential access pattern. Once this is non-nil, 2094 // the other variables in readaheadState don't matter much as we defer 2095 // to OS-level readahead. 2096 sequentialFile vfs.File 2097 } 2098 2099 func (rs *readaheadState) recordCacheHit(offset, blockLength int64) { 2100 currentReadEnd := offset + blockLength 2101 if rs.sequentialFile != nil { 2102 // Using OS-level readahead instead, so do nothing. 2103 return 2104 } 2105 if rs.numReads >= minFileReadsForReadahead { 2106 if currentReadEnd >= rs.limit && offset <= rs.limit+maxReadaheadSize { 2107 // This is a read that would have resulted in a readahead, had it 2108 // not been a cache hit. 2109 rs.limit = currentReadEnd 2110 return 2111 } 2112 if currentReadEnd < rs.limit-rs.prevSize || offset > rs.limit+maxReadaheadSize { 2113 // We read too far away from rs.limit to benefit from readahead in 2114 // any scenario. Reset all variables.
2115 rs.numReads = 1 2116 rs.limit = currentReadEnd 2117 rs.size = initialReadaheadSize 2118 rs.prevSize = 0 2119 return 2120 } 2121 // Reads in the range [rs.limit - rs.prevSize, rs.limit] end up 2122 // here. This is a read that is potentially benefitting from a past 2123 // readahead. 2124 return 2125 } 2126 if currentReadEnd >= rs.limit && offset <= rs.limit+maxReadaheadSize { 2127 // Blocks are being read sequentially and would benefit from readahead 2128 // down the line. 2129 rs.numReads++ 2130 return 2131 } 2132 // We read too far ahead of the last read, or before it. This indicates 2133 // a random read, where readahead is not desirable. Reset all variables. 2134 rs.numReads = 1 2135 rs.limit = currentReadEnd 2136 rs.size = initialReadaheadSize 2137 rs.prevSize = 0 2138 } 2139 2140 // maybeReadahead updates state and determines whether to issue a readahead / 2141 // prefetch call for a block read at offset for blockLength bytes. 2142 // Returns a size value (greater than 0) that should be prefetched if readahead 2143 // would be beneficial. 2144 func (rs *readaheadState) maybeReadahead(offset, blockLength int64) int64 { 2145 currentReadEnd := offset + blockLength 2146 if rs.sequentialFile != nil { 2147 // Using OS-level readahead instead, so do nothing. 2148 return 0 2149 } 2150 if rs.numReads >= minFileReadsForReadahead { 2151 // The minimum threshold of sequential reads to justify reading ahead 2152 // has been reached. 2153 // There are two intervals: the interval being read: 2154 // [offset, currentReadEnd] 2155 // as well as the interval where a read would benefit from read ahead: 2156 // [rs.limit, rs.limit + rs.size] 2157 // We increase the latter interval to 2158 // [rs.limit, rs.limit + maxReadaheadSize] to account for cases where 2159 // readahead may not be beneficial with a small readahead size, but over 2160 // time the readahead size would increase exponentially to make it 2161 // beneficial. 2162 if currentReadEnd >= rs.limit && offset <= rs.limit+maxReadaheadSize { 2163 // We are doing a read in the interval ahead of 2164 // the last readahead range. In the diagrams below, ++++ is the last 2165 // readahead range, ==== is the range represented by 2166 // [rs.limit, rs.limit + maxReadaheadSize], and ---- is the range 2167 // being read. 2168 // 2169 // rs.limit rs.limit + maxReadaheadSize 2170 // ++++++++++|===========================| 2171 // 2172 // |-------------| 2173 // offset currentReadEnd 2174 // 2175 // This case is also possible, as are all cases with an overlap 2176 // between [rs.limit, rs.limit + maxReadaheadSize] and [offset, 2177 // currentReadEnd]: 2178 // 2179 // rs.limit rs.limit + maxReadaheadSize 2180 // ++++++++++|===========================| 2181 // 2182 // |-------------| 2183 // offset currentReadEnd 2184 // 2185 // 2186 rs.numReads++ 2187 rs.limit = offset + rs.size 2188 rs.prevSize = rs.size 2189 // Increase rs.size for the next read. 2190 rs.size *= 2 2191 if rs.size > maxReadaheadSize { 2192 rs.size = maxReadaheadSize 2193 } 2194 return rs.prevSize 2195 } 2196 if currentReadEnd < rs.limit-rs.prevSize || offset > rs.limit+maxReadaheadSize { 2197 // The above conditional has rs.limit > rs.prevSize to confirm that 2198 // rs.limit - rs.prevSize would not underflow. 2199 // We read too far away from rs.limit to benefit from readahead in 2200 // any scenario. Reset all variables. 
2201 // The case where we read too far ahead: 2202 // 2203 // (rs.limit - rs.prevSize) (rs.limit) (rs.limit + maxReadaheadSize) 2204 // |+++++++++++++|=============| 2205 // 2206 // |-------------| 2207 // offset currentReadEnd 2208 // 2209 // Or too far behind: 2210 // 2211 // (rs.limit - rs.prevSize) (rs.limit) (rs.limit + maxReadaheadSize) 2212 // |+++++++++++++|=============| 2213 // 2214 // |-------------| 2215 // offset currentReadEnd 2216 // 2217 rs.numReads = 1 2218 rs.limit = currentReadEnd 2219 rs.size = initialReadaheadSize 2220 rs.prevSize = 0 2221 return 0 2222 } 2223 // Reads in the range [rs.limit - rs.prevSize, rs.limit] end up 2224 // here. This is a read that is potentially benefitting from a past 2225 // readahead, but there's no reason to issue a readahead call at the 2226 // moment. 2227 // 2228 // (rs.limit - rs.prevSize) (rs.limit + maxReadaheadSize) 2229 // |+++++++++++++|===============| 2230 // (rs.limit) 2231 // 2232 // |-------| 2233 // offset currentReadEnd 2234 // 2235 rs.numReads++ 2236 return 0 2237 } 2238 if currentReadEnd >= rs.limit && offset <= rs.limit+maxReadaheadSize { 2239 // Blocks are being read sequentially and would benefit from readahead 2240 // down the line. 2241 // 2242 // (rs.limit) (rs.limit + maxReadaheadSize) 2243 // |=============| 2244 // 2245 // |-------| 2246 // offset currentReadEnd 2247 // 2248 rs.numReads++ 2249 return 0 2250 } 2251 // We read too far ahead of the last read, or before it. This indicates 2252 // a random read, where readahead is not desirable. Reset all variables. 2253 // 2254 // (rs.limit - maxReadaheadSize) (rs.limit) (rs.limit + maxReadaheadSize) 2255 // |+++++++++++++|=============| 2256 // 2257 // |-------| 2258 // offset currentReadEnd 2259 // 2260 rs.numReads = 1 2261 rs.limit = currentReadEnd 2262 rs.size = initialReadaheadSize 2263 rs.prevSize = 0 2264 return 0 2265 } 2266 2267 // ReaderOption provides an interface to do work on Reader while it is being 2268 // opened. 2269 type ReaderOption interface { 2270 // readerApply is called on the reader during opening in order to set internal 2271 // parameters. 2272 readerApply(*Reader) 2273 } 2274 2275 // Comparers is a map from comparer name to comparer. It is used for debugging 2276 // tools which may be used on multiple databases configured with different 2277 // comparers. Comparers implements the ReaderOption interface and can be passed 2278 // as a parameter to NewReader. 2279 type Comparers map[string]*Comparer 2280 2281 func (c Comparers) readerApply(r *Reader) { 2282 if r.Compare != nil || r.Properties.ComparerName == "" { 2283 return 2284 } 2285 if comparer, ok := c[r.Properties.ComparerName]; ok { 2286 r.Compare = comparer.Compare 2287 r.FormatKey = comparer.FormatKey 2288 r.Split = comparer.Split 2289 } 2290 } 2291 2292 // Mergers is a map from merger name to merger. It is used for debugging tools 2293 // which may be used on multiple databases configured with different 2294 // mergers. Mergers implements the ReaderOption interface and can be passed as 2295 // a parameter to NewReader. 2296 type Mergers map[string]*Merger 2297 2298 func (m Mergers) readerApply(r *Reader) { 2299 if r.mergerOK || r.Properties.MergerName == "" { 2300 return 2301 } 2302 _, r.mergerOK = m[r.Properties.MergerName] 2303 } 2304 2305 // cacheOpts is a Reader open option for specifying the cache ID and sstable file 2306 // number. If not specified, a unique cache ID will be used.
2307 type cacheOpts struct { 2308 cacheID uint64 2309 fileNum base.FileNum 2310 } 2311 2312 // Marker function to indicate the option should be applied before reading the 2313 // sstable properties and, in the write path, before writing the default 2314 // sstable properties. 2315 func (c *cacheOpts) preApply() {} 2316 2317 func (c *cacheOpts) readerApply(r *Reader) { 2318 if r.cacheID == 0 { 2319 r.cacheID = c.cacheID 2320 } 2321 if r.fileNum == 0 { 2322 r.fileNum = c.fileNum 2323 } 2324 } 2325 2326 func (c *cacheOpts) writerApply(w *Writer) { 2327 if w.cacheID == 0 { 2328 w.cacheID = c.cacheID 2329 } 2330 if w.fileNum == 0 { 2331 w.fileNum = c.fileNum 2332 } 2333 } 2334 2335 // FileReopenOpt is specified if this reader is allowed to reopen additional 2336 // file descriptors for this file. Used to take advantage of OS-level readahead. 2337 type FileReopenOpt struct { 2338 FS vfs.FS 2339 Filename string 2340 } 2341 2342 func (f FileReopenOpt) readerApply(r *Reader) { 2343 if r.fs == nil { 2344 r.fs = f.FS 2345 r.filename = f.Filename 2346 } 2347 } 2348 2349 // rawTombstonesOpt is a Reader open option for specifying that range 2350 // tombstones returned by Reader.NewRangeDelIter() should not be 2351 // fragmented. Used by debug tools to get a raw view of the tombstones 2352 // contained in an sstable. 2353 type rawTombstonesOpt struct{} 2354 2355 func (rawTombstonesOpt) preApply() {} 2356 2357 func (rawTombstonesOpt) readerApply(r *Reader) { 2358 r.rawTombstones = true 2359 } 2360 2361 func init() { 2362 private.SSTableCacheOpts = func(cacheID uint64, fileNum base.FileNum) interface{} { 2363 return &cacheOpts{cacheID, fileNum} 2364 } 2365 private.SSTableRawTombstonesOpt = rawTombstonesOpt{} 2366 } 2367 2368 // Reader is a table reader. 2369 type Reader struct { 2370 file ReadableFile 2371 fs vfs.FS 2372 filename string 2373 cacheID uint64 2374 fileNum base.FileNum 2375 rawTombstones bool 2376 err error 2377 indexBH BlockHandle 2378 filterBH BlockHandle 2379 rangeDelBH BlockHandle 2380 rangeKeyBH BlockHandle 2381 rangeDelTransform blockTransform 2382 propertiesBH BlockHandle 2383 metaIndexBH BlockHandle 2384 footerBH BlockHandle 2385 opts ReaderOptions 2386 Compare Compare 2387 FormatKey base.FormatKey 2388 Split Split 2389 mergerOK bool 2390 checksumType ChecksumType 2391 tableFilter *tableFilterReader 2392 tableFormat TableFormat 2393 Properties Properties 2394 } 2395 2396 // Close implements DB.Close, as documented in the bitalostable package. 2397 func (r *Reader) Close() error { 2398 r.opts.Cache.Unref() 2399 2400 if r.err != nil { 2401 if r.file != nil { 2402 r.file.Close() 2403 r.file = nil 2404 } 2405 return r.err 2406 } 2407 if r.file != nil { 2408 r.err = r.file.Close() 2409 r.file = nil 2410 if r.err != nil { 2411 return r.err 2412 } 2413 } 2414 // Make any future calls to Get, NewIter or Close return an error. 2415 r.err = errReaderClosed 2416 return nil 2417 } 2418 2419 // NewIterWithBlockPropertyFilters returns an iterator for the contents of the 2420 // table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after 2421 // itself and returns a nil iterator. 
2422 func (r *Reader) NewIterWithBlockPropertyFilters( 2423 lower, upper []byte, 2424 filterer *BlockPropertiesFilterer, 2425 useFilterBlock bool, 2426 stats *base.InternalIteratorStats, 2427 ) (Iterator, error) { 2428 // NB: bitalostable.tableCache wraps the returned iterator with one which performs 2429 // reference counting on the Reader, preventing the Reader from being closed 2430 // until the final iterator closes. 2431 if r.Properties.IndexType == twoLevelIndex { 2432 i := twoLevelIterPool.Get().(*twoLevelIterator) 2433 err := i.init(r, lower, upper, filterer, useFilterBlock, stats) 2434 if err != nil { 2435 return nil, err 2436 } 2437 return i, nil 2438 } 2439 2440 i := singleLevelIterPool.Get().(*singleLevelIterator) 2441 err := i.init(r, lower, upper, filterer, useFilterBlock, stats) 2442 if err != nil { 2443 return nil, err 2444 } 2445 return i, nil 2446 } 2447 2448 // NewIter returns an iterator for the contents of the table. If an error 2449 // occurs, NewIter cleans up after itself and returns a nil iterator. 2450 func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) { 2451 return r.NewIterWithBlockPropertyFilters(lower, upper, nil, true /* useFilterBlock */, nil /* stats */) 2452 } 2453 2454 // NewCompactionIter returns an iterator similar to NewIter but it also increments 2455 // the number of bytes iterated. If an error occurs, NewCompactionIter cleans up 2456 // after itself and returns a nil iterator. 2457 func (r *Reader) NewCompactionIter(bytesIterated *uint64) (Iterator, error) { 2458 if r.Properties.IndexType == twoLevelIndex { 2459 i := twoLevelIterPool.Get().(*twoLevelIterator) 2460 err := i.init(r, nil /* lower */, nil /* upper */, nil, false /* useFilter */, nil /* stats */) 2461 if err != nil { 2462 return nil, err 2463 } 2464 i.setupForCompaction() 2465 return &twoLevelCompactionIterator{ 2466 twoLevelIterator: i, 2467 bytesIterated: bytesIterated, 2468 }, nil 2469 } 2470 i := singleLevelIterPool.Get().(*singleLevelIterator) 2471 err := i.init(r, nil /* lower */, nil /* upper */, nil, false /* useFilter */, nil /* stats */) 2472 if err != nil { 2473 return nil, err 2474 } 2475 i.setupForCompaction() 2476 return &compactionIterator{ 2477 singleLevelIterator: i, 2478 bytesIterated: bytesIterated, 2479 }, nil 2480 } 2481 2482 // NewRawRangeDelIter returns an internal iterator for the contents of the 2483 // range-del block for the table. Returns nil if the table does not contain 2484 // any range deletions. 2485 func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { 2486 if r.rangeDelBH.Length == 0 { 2487 return nil, nil 2488 } 2489 h, err := r.readRangeDel() 2490 if err != nil { 2491 return nil, err 2492 } 2493 i := &fragmentBlockIter{} 2494 if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { 2495 return nil, err 2496 } 2497 return i, nil 2498 } 2499 2500 // NewRawRangeKeyIter returns an internal iterator for the contents of the 2501 // range-key block for the table. Returns nil if the table does not contain any 2502 // range keys. 
2503 func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { 2504 if r.rangeKeyBH.Length == 0 { 2505 return nil, nil 2506 } 2507 h, err := r.readRangeKey() 2508 if err != nil { 2509 return nil, err 2510 } 2511 i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter) 2512 if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { 2513 return nil, err 2514 } 2515 return i, nil 2516 } 2517 2518 type rangeKeyFragmentBlockIter struct { 2519 fragmentBlockIter 2520 } 2521 2522 func (i *rangeKeyFragmentBlockIter) Close() error { 2523 err := i.fragmentBlockIter.Close() 2524 i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse() 2525 rangeKeyFragmentBlockIterPool.Put(i) 2526 return err 2527 } 2528 2529 func (r *Reader) readIndex() (cache.Handle, error) { 2530 h, _, err := 2531 r.readBlock(r.indexBH, nil /* transform */, nil /* readaheadState */) 2532 return h, err 2533 } 2534 2535 func (r *Reader) readFilter() (cache.Handle, error) { 2536 h, _, err := 2537 r.readBlock(r.filterBH, nil /* transform */, nil /* readaheadState */) 2538 return h, err 2539 } 2540 2541 func (r *Reader) readRangeDel() (cache.Handle, error) { 2542 h, _, err := 2543 r.readBlock(r.rangeDelBH, r.rangeDelTransform, nil /* readaheadState */) 2544 return h, err 2545 } 2546 2547 func (r *Reader) readRangeKey() (cache.Handle, error) { 2548 h, _, err := 2549 r.readBlock(r.rangeKeyBH, nil /* transform */, nil /* readaheadState */) 2550 return h, err 2551 } 2552 2553 func checkChecksum( 2554 checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum, 2555 ) error { 2556 expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:]) 2557 var computedChecksum uint32 2558 switch checksumType { 2559 case ChecksumTypeCRC32c: 2560 computedChecksum = crc.New(b[:bh.Length+1]).Value() 2561 case ChecksumTypeXXHash64: 2562 computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1])) 2563 default: 2564 return errors.Errorf("unsupported checksum type: %d", checksumType) 2565 } 2566 2567 if expectedChecksum != computedChecksum { 2568 return base.CorruptionErrorf( 2569 "bitalostable/table: invalid table %s (checksum mismatch at %d/%d)", 2570 errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length)) 2571 } 2572 return nil 2573 } 2574 2575 // readBlock reads and decompresses a block from disk into memory. 2576 func (r *Reader) readBlock( 2577 bh BlockHandle, transform blockTransform, raState *readaheadState, 2578 ) (_ cache.Handle, cacheHit bool, _ error) { 2579 if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil { 2580 if raState != nil { 2581 raState.recordCacheHit(int64(bh.Offset), int64(bh.Length+blockTrailerLen)) 2582 } 2583 return h, true, nil 2584 } 2585 file := r.file 2586 2587 if raState != nil { 2588 if raState.sequentialFile != nil { 2589 file = raState.sequentialFile 2590 } else if readaheadSize := raState.maybeReadahead(int64(bh.Offset), int64(bh.Length+blockTrailerLen)); readaheadSize > 0 { 2591 if readaheadSize >= maxReadaheadSize { 2592 // We've reached the maximum readahead size. Beyond this 2593 // point, rely on OS-level readahead. Note that we can only 2594 // reopen a new file handle with this optimization if 2595 // r.fs != nil. This reader must have been created with the 2596 // FileReopenOpt for this field to be set. 2597 if r.fs != nil { 2598 f, err := r.fs.Open(r.filename, vfs.SequentialReadsOption) 2599 if err == nil { 2600 // Use this new file handle for all sequential reads by 2601 // this iterator going forward. 
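// Once sequentialFile is non-nil, recordCacheHit and maybeReadahead become
// no-ops and all further readahead is delegated to the OS.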
2602 raState.sequentialFile = f 2603 file = f 2604 } 2605 2606 // If we tried to load a table that doesn't exist, panic 2607 // immediately. Something is seriously wrong if a table 2608 // doesn't exist. 2609 // See cockroachdb/cockroach#56490. 2610 base.MustExist(r.fs, r.filename, panicFataler{}, err) 2611 } 2612 } 2613 if raState.sequentialFile == nil { 2614 type fd interface { 2615 Fd() uintptr 2616 } 2617 if f, ok := r.file.(fd); ok { 2618 _ = vfs.Prefetch(f.Fd(), bh.Offset, uint64(readaheadSize)) 2619 } 2620 } 2621 } 2622 } 2623 2624 v := r.opts.Cache.Alloc(int(bh.Length + blockTrailerLen)) 2625 b := v.Buf() 2626 if _, err := file.ReadAt(b, int64(bh.Offset)); err != nil { 2627 r.opts.Cache.Free(v) 2628 return cache.Handle{}, false, err 2629 } 2630 2631 if err := checkChecksum(r.checksumType, b, bh, r.fileNum); err != nil { 2632 r.opts.Cache.Free(v) 2633 return cache.Handle{}, false, err 2634 } 2635 2636 typ := blockType(b[bh.Length]) 2637 b = b[:bh.Length] 2638 v.Truncate(len(b)) 2639 2640 decoded, err := decompressBlock(r.opts.Cache, typ, b) 2641 if decoded != nil { 2642 r.opts.Cache.Free(v) 2643 v = decoded 2644 b = v.Buf() 2645 } else if err != nil { 2646 r.opts.Cache.Free(v) 2647 return cache.Handle{}, false, err 2648 } 2649 2650 if transform != nil { 2651 // Transforming blocks is rare, so the extra copy of the transformed data 2652 // is not problematic. 2653 var err error 2654 b, err = transform(b) 2655 if err != nil { 2656 r.opts.Cache.Free(v) 2657 return cache.Handle{}, false, err 2658 } 2659 newV := r.opts.Cache.Alloc(len(b)) 2660 copy(newV.Buf(), b) 2661 r.opts.Cache.Free(v) 2662 v = newV 2663 } 2664 2665 h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, v) 2666 return h, false, nil 2667 } 2668 2669 func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { 2670 // Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The 2671 // v1 format range-del blocks have unfragmented and unsorted range 2672 // tombstones. We need properly fragmented and sorted range tombstones in 2673 // order to serve from them directly. 2674 iter := &blockIter{} 2675 if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum); err != nil { 2676 return nil, err 2677 } 2678 var tombstones []keyspan.Span 2679 for key, value := iter.First(); key != nil; key, value = iter.Next() { 2680 t := keyspan.Span{ 2681 Start: key.UserKey, 2682 End: value, 2683 Keys: []keyspan.Key{{Trailer: key.Trailer}}, 2684 } 2685 tombstones = append(tombstones, t) 2686 } 2687 keyspan.Sort(r.Compare, tombstones) 2688 2689 // Fragment the tombstones, outputting them directly to a block writer. 2690 rangeDelBlock := blockWriter{ 2691 restartInterval: 1, 2692 } 2693 frag := keyspan.Fragmenter{ 2694 Cmp: r.Compare, 2695 Format: r.FormatKey, 2696 Emit: func(s keyspan.Span) { 2697 for _, k := range s.Keys { 2698 startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer} 2699 rangeDelBlock.add(startIK, s.End) 2700 } 2701 }, 2702 } 2703 for i := range tombstones { 2704 frag.Add(tombstones[i]) 2705 } 2706 frag.Finish() 2707 2708 // Return the contents of the constructed v2 format range-del block. 
2709 return rangeDelBlock.finish(), nil 2710 } 2711 2712 func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { 2713 b, _, err := r.readBlock(metaindexBH, nil /* transform */, nil /* readaheadState */) 2714 if err != nil { 2715 return err 2716 } 2717 data := b.Get() 2718 defer b.Release() 2719 2720 if uint64(len(data)) != metaindexBH.Length { 2721 return base.CorruptionErrorf("bitalostable/table: unexpected metaindex block size: %d vs %d", 2722 errors.Safe(len(data)), errors.Safe(metaindexBH.Length)) 2723 } 2724 2725 i, err := newRawBlockIter(bytes.Compare, data) 2726 if err != nil { 2727 return err 2728 } 2729 2730 meta := map[string]BlockHandle{} 2731 for valid := i.First(); valid; valid = i.Next() { 2732 bh, n := decodeBlockHandle(i.Value()) 2733 if n == 0 { 2734 return base.CorruptionErrorf("bitalostable/table: invalid table (bad filter block handle)") 2735 } 2736 meta[string(i.Key().UserKey)] = bh 2737 } 2738 if err := i.Close(); err != nil { 2739 return err 2740 } 2741 2742 if bh, ok := meta[metaPropertiesName]; ok { 2743 b, _, err = r.readBlock(bh, nil /* transform */, nil /* readaheadState */) 2744 if err != nil { 2745 return err 2746 } 2747 r.propertiesBH = bh 2748 err := r.Properties.load(b.Get(), bh.Offset) 2749 b.Release() 2750 if err != nil { 2751 return err 2752 } 2753 } 2754 2755 if bh, ok := meta[metaRangeDelV2Name]; ok { 2756 r.rangeDelBH = bh 2757 } else if bh, ok := meta[metaRangeDelName]; ok { 2758 r.rangeDelBH = bh 2759 if !r.rawTombstones { 2760 r.rangeDelTransform = r.transformRangeDelV1 2761 } 2762 } 2763 2764 if bh, ok := meta[metaRangeKeyName]; ok { 2765 r.rangeKeyBH = bh 2766 } 2767 2768 for name, fp := range r.opts.Filters { 2769 types := []struct { 2770 ftype FilterType 2771 prefix string 2772 }{ 2773 {TableFilter, "fullfilter."}, 2774 } 2775 var done bool 2776 for _, t := range types { 2777 if bh, ok := meta[t.prefix+name]; ok { 2778 r.filterBH = bh 2779 2780 switch t.ftype { 2781 case TableFilter: 2782 r.tableFilter = newTableFilterReader(fp) 2783 default: 2784 return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype)) 2785 } 2786 2787 done = true 2788 break 2789 } 2790 } 2791 if done { 2792 break 2793 } 2794 } 2795 return nil 2796 } 2797 2798 // Layout returns the layout (block organization) for an sstable. 
2799 func (r *Reader) Layout() (*Layout, error) { 2800 if r.err != nil { 2801 return nil, r.err 2802 } 2803 2804 l := &Layout{ 2805 Data: make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks), 2806 Filter: r.filterBH, 2807 RangeDel: r.rangeDelBH, 2808 RangeKey: r.rangeKeyBH, 2809 Properties: r.propertiesBH, 2810 MetaIndex: r.metaIndexBH, 2811 Footer: r.footerBH, 2812 } 2813 2814 indexH, err := r.readIndex() 2815 if err != nil { 2816 return nil, err 2817 } 2818 defer indexH.Release() 2819 2820 var alloc []byte 2821 2822 if r.Properties.IndexPartitions == 0 { 2823 l.Index = append(l.Index, r.indexBH) 2824 iter, _ := newBlockIter(r.Compare, indexH.Get()) 2825 for key, value := iter.First(); key != nil; key, value = iter.Next() { 2826 dataBH, err := decodeBlockHandleWithProperties(value) 2827 if err != nil { 2828 return nil, errCorruptIndexEntry 2829 } 2830 if len(dataBH.Props) > 0 { 2831 if len(alloc) < len(dataBH.Props) { 2832 alloc = make([]byte, 256<<10) 2833 } 2834 n := copy(alloc, dataBH.Props) 2835 dataBH.Props = alloc[:n:n] 2836 alloc = alloc[n:] 2837 } 2838 l.Data = append(l.Data, dataBH) 2839 } 2840 } else { 2841 l.TopIndex = r.indexBH 2842 topIter, _ := newBlockIter(r.Compare, indexH.Get()) 2843 iter := &blockIter{} 2844 for key, value := topIter.First(); key != nil; key, value = topIter.Next() { 2845 indexBH, err := decodeBlockHandleWithProperties(value) 2846 if err != nil { 2847 return nil, errCorruptIndexEntry 2848 } 2849 l.Index = append(l.Index, indexBH.BlockHandle) 2850 2851 subIndex, _, err := r.readBlock( 2852 indexBH.BlockHandle, nil /* transform */, nil /* readaheadState */) 2853 if err != nil { 2854 return nil, err 2855 } 2856 if err := iter.init(r.Compare, subIndex.Get(), 0 /* globalSeqNum */); err != nil { 2857 return nil, err 2858 } 2859 for key, value := iter.First(); key != nil; key, value = iter.Next() { 2860 dataBH, err := decodeBlockHandleWithProperties(value) 2861 if len(dataBH.Props) > 0 { 2862 if len(alloc) < len(dataBH.Props) { 2863 alloc = make([]byte, 256<<10) 2864 } 2865 n := copy(alloc, dataBH.Props) 2866 dataBH.Props = alloc[:n:n] 2867 alloc = alloc[n:] 2868 } 2869 if err != nil { 2870 return nil, errCorruptIndexEntry 2871 } 2872 l.Data = append(l.Data, dataBH) 2873 } 2874 subIndex.Release() 2875 *iter = iter.resetForReuse() 2876 } 2877 } 2878 2879 return l, nil 2880 } 2881 2882 // ValidateBlockChecksums validates the checksums for each block in the SSTable. 2883 func (r *Reader) ValidateBlockChecksums() error { 2884 // Pre-compute the BlockHandles for the underlying file. 2885 l, err := r.Layout() 2886 if err != nil { 2887 return err 2888 } 2889 2890 // Construct the set of blocks to check. Note that the footer is not checked 2891 // as it is not a block with a checksum. 2892 blocks := make([]BlockHandle, len(l.Data)) 2893 for i := range l.Data { 2894 blocks[i] = l.Data[i].BlockHandle 2895 } 2896 blocks = append(blocks, l.Index...) 2897 blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex) 2898 2899 // Sorting by offset ensures we are performing a sequential scan of the 2900 // file. 2901 sort.Slice(blocks, func(i, j int) bool { 2902 return blocks[i].Offset < blocks[j].Offset 2903 }) 2904 2905 // Check all blocks sequentially. Make use of read-ahead, given we are 2906 // scanning the entire file from start to end. 2907 blockRS := &readaheadState{ 2908 size: initialReadaheadSize, 2909 } 2910 for _, bh := range blocks { 2911 // Certain blocks may not be present, in which case we skip them. 
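// A zero-length BlockHandle indicates an absent block (e.g. a table written
// without a filter, range-del, or range-key block).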
2912 if bh.Length == 0 { 2913 continue 2914 } 2915 2916 // Read the block, which validates the checksum. 2917 h, _, err := r.readBlock(bh, nil /* transform */, blockRS) 2918 if err != nil { 2919 return err 2920 } 2921 h.Release() 2922 } 2923 2924 return nil 2925 } 2926 2927 // EstimateDiskUsage returns the total size of data blocks overlapping the range 2928 // `[start, end]`. Even if a data block partially overlaps, or we cannot 2929 // determine overlap due to abbreviated index keys, the full data block size is 2930 // included in the estimation. 2931 // 2932 // This function does not account for any metablock space usage. Assumes there 2933 // is at least partial overlap, i.e., `[start, end]` falls neither completely 2934 // before nor completely after the file's range. 2935 // 2936 // Only blocks containing point keys are considered. Range deletion and range 2937 // key blocks are not considered. 2938 // 2939 // TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of 2940 // data blocks overlapped and add that same fraction of the metadata blocks to the 2941 // estimate. 2942 func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) { 2943 if r.err != nil { 2944 return 0, r.err 2945 } 2946 2947 indexH, err := r.readIndex() 2948 if err != nil { 2949 return 0, err 2950 } 2951 defer indexH.Release() 2952 2953 // Iterators over the bottom-level index blocks containing start and end. 2954 // These may be different in case of partitioned index but will both point 2955 // to the same blockIter over the single index in the unpartitioned case. 2956 var startIdxIter, endIdxIter *blockIter 2957 if r.Properties.IndexPartitions == 0 { 2958 iter, err := newBlockIter(r.Compare, indexH.Get()) 2959 if err != nil { 2960 return 0, err 2961 } 2962 startIdxIter = iter 2963 endIdxIter = iter 2964 } else { 2965 topIter, err := newBlockIter(r.Compare, indexH.Get()) 2966 if err != nil { 2967 return 0, err 2968 } 2969 2970 key, val := topIter.SeekGE(start, base.SeekGEFlagsNone) 2971 if key == nil { 2972 // The range falls completely after this file, or an error occurred. 2973 return 0, topIter.Error() 2974 } 2975 startIdxBH, err := decodeBlockHandleWithProperties(val) 2976 if err != nil { 2977 return 0, errCorruptIndexEntry 2978 } 2979 startIdxBlock, _, err := r.readBlock( 2980 startIdxBH.BlockHandle, nil /* transform */, nil /* readaheadState */) 2981 if err != nil { 2982 return 0, err 2983 } 2984 defer startIdxBlock.Release() 2985 startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get()) 2986 if err != nil { 2987 return 0, err 2988 } 2989 2990 key, val = topIter.SeekGE(end, base.SeekGEFlagsNone) 2991 if key == nil { 2992 if err := topIter.Error(); err != nil { 2993 return 0, err 2994 } 2995 } else { 2996 endIdxBH, err := decodeBlockHandleWithProperties(val) 2997 if err != nil { 2998 return 0, errCorruptIndexEntry 2999 } 3000 endIdxBlock, _, err := r.readBlock( 3001 endIdxBH.BlockHandle, nil /* transform */, nil /* readaheadState */) 3002 if err != nil { 3003 return 0, err 3004 } 3005 defer endIdxBlock.Release() 3006 endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get()) 3007 if err != nil { 3008 return 0, err 3009 } 3010 } 3011 } 3012 // startIdxIter should not be nil at this point, while endIdxIter can be if the 3013 // range spans past the end of the file. 3014 3015 key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone) 3016 if key == nil { 3017 // The range falls completely after this file, or an error occurred. 
3018 return 0, startIdxIter.Error() 3019 } 3020 startBH, err := decodeBlockHandleWithProperties(val) 3021 if err != nil { 3022 return 0, errCorruptIndexEntry 3023 } 3024 3025 if endIdxIter == nil { 3026 // The range spans beyond this file. Include data blocks through the last. 3027 return r.Properties.DataSize - startBH.Offset, nil 3028 } 3029 key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone) 3030 if key == nil { 3031 if err := endIdxIter.Error(); err != nil { 3032 return 0, err 3033 } 3034 // The range spans beyond this file. Include data blocks through the last. 3035 return r.Properties.DataSize - startBH.Offset, nil 3036 } 3037 endBH, err := decodeBlockHandleWithProperties(val) 3038 if err != nil { 3039 return 0, errCorruptIndexEntry 3040 } 3041 return endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset, nil 3042 } 3043 3044 // TableFormat returns the format version for the table. 3045 func (r *Reader) TableFormat() (TableFormat, error) { 3046 if r.err != nil { 3047 return TableFormatUnspecified, r.err 3048 } 3049 return r.tableFormat, nil 3050 } 3051 3052 // ReadableFile describes subset of vfs.File required for reading SSTs. 3053 type ReadableFile interface { 3054 io.ReaderAt 3055 io.Closer 3056 Stat() (os.FileInfo, error) 3057 } 3058 3059 // NewReader returns a new table reader for the file. Closing the reader will 3060 // close the file. 3061 func NewReader(f ReadableFile, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) { 3062 o = o.ensureDefaults() 3063 r := &Reader{ 3064 file: f, 3065 opts: o, 3066 } 3067 if r.opts.Cache == nil { 3068 r.opts.Cache = cache.New(0) 3069 } else { 3070 r.opts.Cache.Ref() 3071 } 3072 3073 if f == nil { 3074 r.err = errors.New("bitalostable/table: nil file") 3075 return nil, r.Close() 3076 } 3077 3078 // Note that the extra options are applied twice. First here for pre-apply 3079 // options, and then below for post-apply options. Pre and post refer to 3080 // before and after reading the metaindex and properties. 3081 type preApply interface{ preApply() } 3082 for _, opt := range extraOpts { 3083 if _, ok := opt.(preApply); ok { 3084 opt.readerApply(r) 3085 } 3086 } 3087 if r.cacheID == 0 { 3088 r.cacheID = r.opts.Cache.NewID() 3089 } 3090 3091 footer, err := readFooter(f) 3092 if err != nil { 3093 r.err = err 3094 return nil, r.Close() 3095 } 3096 r.checksumType = footer.checksum 3097 r.tableFormat = footer.format 3098 // Read the metaindex. 3099 if err := r.readMetaindex(footer.metaindexBH); err != nil { 3100 r.err = err 3101 return nil, r.Close() 3102 } 3103 r.indexBH = footer.indexBH 3104 r.metaIndexBH = footer.metaindexBH 3105 r.footerBH = footer.footerBH 3106 3107 if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName { 3108 r.Compare = o.Comparer.Compare 3109 r.FormatKey = o.Comparer.FormatKey 3110 r.Split = o.Comparer.Split 3111 } 3112 3113 if o.MergerName == r.Properties.MergerName { 3114 r.mergerOK = true 3115 } 3116 3117 // Apply the extra options again now that the comparer and merger names are 3118 // known. 
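// Post-apply options such as Comparers and Mergers consult
// r.Properties.ComparerName and r.Properties.MergerName, which only become
// available once the properties block has been read above.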
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); !ok {
			opt.readerApply(r)
		}
	}

	if r.Compare == nil {
		r.err = errors.Errorf("bitalostable/table: %d: unknown comparer %s",
			errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName))
	}
	if !r.mergerOK {
		if name := r.Properties.MergerName; name != "" && name != "nullptr" {
			r.err = errors.Errorf("bitalostable/table: %d: unknown merger %s",
				errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName))
		}
	}
	if r.err != nil {
		return nil, r.Close()
	}
	return r, nil
}

// Layout describes the block organization of an sstable.
type Layout struct {
	// NOTE: changes to fields in this struct should also be reflected in
	// ValidateBlockChecksums, which validates a static list of BlockHandles
	// referenced in this struct.

	Data       []BlockHandleWithProperties
	Index      []BlockHandle
	TopIndex   BlockHandle
	Filter     BlockHandle
	RangeDel   BlockHandle
	RangeKey   BlockHandle
	Properties BlockHandle
	MetaIndex  BlockHandle
	Footer     BlockHandle
}

// Describe returns a description of the layout. If the verbose parameter is
// true, details of the structure of each block are returned as well.
func (l *Layout) Describe(
	w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte),
) {
	type block struct {
		BlockHandle
		name string
	}
	var blocks []block

	for i := range l.Data {
		blocks = append(blocks, block{l.Data[i].BlockHandle, "data"})
	}
	for i := range l.Index {
		blocks = append(blocks, block{l.Index[i], "index"})
	}
	if l.TopIndex.Length != 0 {
		blocks = append(blocks, block{l.TopIndex, "top-index"})
	}
	if l.Filter.Length != 0 {
		blocks = append(blocks, block{l.Filter, "filter"})
	}
	if l.RangeDel.Length != 0 {
		blocks = append(blocks, block{l.RangeDel, "range-del"})
	}
	if l.RangeKey.Length != 0 {
		blocks = append(blocks, block{l.RangeKey, "range-key"})
	}
	if l.Properties.Length != 0 {
		blocks = append(blocks, block{l.Properties, "properties"})
	}
	if l.MetaIndex.Length != 0 {
		blocks = append(blocks, block{l.MetaIndex, "meta-index"})
	}
	if l.Footer.Length != 0 {
		if l.Footer.Length == levelDBFooterLen {
			blocks = append(blocks, block{l.Footer, "leveldb-footer"})
		} else {
			blocks = append(blocks, block{l.Footer, "footer"})
		}
	}

	sort.Slice(blocks, func(i, j int) bool {
		return blocks[i].Offset < blocks[j].Offset
	})

	for i := range blocks {
		b := &blocks[i]
		fmt.Fprintf(w, "%10d %s (%d)\n", b.Offset, b.name, b.Length)

		if !verbose {
			continue
		}
		if b.name == "filter" {
			continue
		}

		if b.name == "footer" || b.name == "leveldb-footer" {
			trailer, offset := make([]byte, b.Length), b.Offset
			_, _ = r.file.ReadAt(trailer, int64(offset))

			if b.name == "footer" {
				checksumType := ChecksumType(trailer[0])
				fmt.Fprintf(w, "%10d checksum type: %s\n", offset, checksumType)
				trailer, offset = trailer[1:], offset+1
			}

			metaHandle, n := binary.Uvarint(trailer)
			metaLen, m := binary.Uvarint(trailer[n:])
			fmt.Fprintf(w, "%10d meta: offset=%d, length=%d\n", offset, metaHandle, metaLen)
			trailer, offset = trailer[n+m:], offset+uint64(n+m)

			indexHandle, n := binary.Uvarint(trailer)
			indexLen, m := binary.Uvarint(trailer[n:])
			fmt.Fprintf(w, "%10d index: offset=%d, length=%d\n", offset, indexHandle, indexLen)
			trailer, offset = trailer[n+m:], offset+uint64(n+m)

			fmt.Fprintf(w, "%10d [padding]\n", offset)

			trailing := 12
			if b.name == "leveldb-footer" {
				trailing = 8
			}

			offset += uint64(len(trailer) - trailing)
			trailer = trailer[len(trailer)-trailing:]

			if b.name == "footer" {
				version := trailer[:4]
				fmt.Fprintf(w, "%10d version: %d\n", offset, binary.LittleEndian.Uint32(version))
				trailer, offset = trailer[4:], offset+4
			}

			magicNumber := trailer
			fmt.Fprintf(w, "%10d magic number: 0x%x\n", offset, magicNumber)

			continue
		}

		h, _, err := r.readBlock(b.BlockHandle, nil /* transform */, nil /* readaheadState */)
		if err != nil {
			fmt.Fprintf(w, " [err: %s]\n", err)
			continue
		}

		getRestart := func(data []byte, restarts, i int32) int32 {
			return int32(binary.LittleEndian.Uint32(data[restarts+4*i:]))
		}

		formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) {
			i := sort.Search(int(numRestarts), func(i int) bool {
				return getRestart(data, restarts, int32(i)) >= offset
			})
			if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset {
				fmt.Fprintf(w, " [restart]\n")
			} else {
				fmt.Fprintf(w, "\n")
			}
		}

		formatRestarts := func(data []byte, restarts, numRestarts int32) {
			for i := int32(0); i < numRestarts; i++ {
				offset := getRestart(data, restarts, i)
				fmt.Fprintf(w, "%10d [restart %d]\n",
					b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset))
			}
		}

		formatTrailer := func() {
			trailer := make([]byte, blockTrailerLen)
			offset := int64(b.Offset + b.Length)
			_, _ = r.file.ReadAt(trailer, offset)
			bt := blockType(trailer[0])
			checksum := binary.LittleEndian.Uint32(trailer[1:])
			fmt.Fprintf(w, "%10d [trailer compression=%s checksum=0x%04x]\n", offset, bt, checksum)
		}

		var lastKey InternalKey
		switch b.name {
		case "data", "range-del", "range-key":
			iter, _ := newBlockIter(r.Compare, h.Get())
			for key, value := iter.First(); key != nil; key, value = iter.Next() {
				ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset))
				shared, ptr := decodeVarint(ptr)
				unshared, ptr := decodeVarint(ptr)
				value2, _ := decodeVarint(ptr)

				total := iter.nextOffset - iter.offset
				// The format of the numbers in the record line is:
				//
				//   (<total> = <length> [<shared>] + <unshared> + <value>)
				//
				// <total> is the total number of bytes for the record.
				// <length> is the size of the 3 varint encoded integers for <shared>,
				// <unshared>, and <value>.
				// <shared> is the number of key bytes shared with the previous key.
				// <unshared> is the number of unshared key bytes.
				// <value> is the number of value bytes.
				fmt.Fprintf(w, "%10d record (%d = %d [%d] + %d + %d)",
					b.Offset+uint64(iter.offset), total,
					total-int32(unshared+value2), shared, unshared, value2)
				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
				if fmtRecord != nil {
					fmt.Fprintf(w, " ")
					fmtRecord(key, value)
				}

				if base.InternalCompare(r.Compare, lastKey, *key) >= 0 {
					fmt.Fprintf(w, " WARNING: OUT OF ORDER KEYS!\n")
				}
				lastKey.Trailer = key.Trailer
				lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...)
			}
			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
			formatTrailer()
		case "index", "top-index":
			iter, _ := newBlockIter(r.Compare, h.Get())
			for key, value := iter.First(); key != nil; key, value = iter.Next() {
				bh, err := decodeBlockHandleWithProperties(value)
				if err != nil {
					fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err)
					continue
				}
				fmt.Fprintf(w, "%10d block:%d/%d",
					b.Offset+uint64(iter.offset), bh.Offset, bh.Length)
				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
			}
			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
			formatTrailer()
		case "properties":
			iter, _ := newRawBlockIter(r.Compare, h.Get())
			for valid := iter.First(); valid; valid = iter.Next() {
				fmt.Fprintf(w, "%10d %s (%d)",
					b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset)
				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
			}
			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
			formatTrailer()
		case "meta-index":
			iter, _ := newRawBlockIter(r.Compare, h.Get())
			for valid := iter.First(); valid; valid = iter.Next() {
				value := iter.Value()
				bh, n := decodeBlockHandle(value)
				if n == 0 || n != len(value) {
					// The block handle must consume the entire value; otherwise
					// this meta-index entry is corrupt.
					fmt.Fprintf(w, "%10d [err: invalid block handle]\n", b.Offset+uint64(iter.offset))
					continue
				}

				fmt.Fprintf(w, "%10d %s block:%d/%d",
					b.Offset+uint64(iter.offset), iter.Key().UserKey,
					bh.Offset, bh.Length)
				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
			}
			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
			formatTrailer()
		}

		h.Release()
	}

	last := blocks[len(blocks)-1]
	fmt.Fprintf(w, "%10d EOF\n", last.Offset+last.Length)
}

type panicFataler struct{}

func (panicFataler) Fatalf(format string, args ...interface{}) {
	panic(errors.Errorf(format, args...))
}
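
// The helper below is a minimal usage sketch, added here only to illustrate
// how Layout.Describe can be driven to dump an sstable's block organization.
// It assumes the Reader exposes a Layout() method returning (*Layout, error),
// as in upstream Pebble; the name describeTable and the choice of os.Stdout
// are illustrative rather than part of this package's API.
func describeTable(r *Reader) error {
	// Assumed: r.Layout() walks the index and meta-index blocks and returns
	// the block handles referenced by this table.
	l, err := r.Layout()
	if err != nil {
		return err
	}
	// verbose=true prints per-record details, restart points, and block
	// trailers; a nil fmtRecord skips per-record key/value formatting.
	l.Describe(os.Stdout, true /* verbose */, r, nil /* fmtRecord */)
	return nil
}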