github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/scan_internal.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"context"
	"fmt"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/remote"
)

const (
	// In skip-shared iteration mode, keys in levels sharedLevelsStart and
	// greater (i.e. lower in the LSM) are skipped.
	sharedLevelsStart = remote.SharedLevelsStart
)

// ErrInvalidSkipSharedIteration is returned by ScanInternal if it was called
// with a shared file visitor function, and a file in a shareable level (i.e.
// level >= sharedLevelsStart) was found to not be in shared storage according
// to objstorage.Provider, or not shareable for another reason such as for
// containing keys newer than the snapshot sequence number.
var ErrInvalidSkipSharedIteration = errors.New("pebble: cannot use skip-shared iteration due to non-shareable files in lower levels")

// SharedSSTMeta represents an sstable on shared storage that can be ingested
// by another pebble instance. This struct must contain all fields that are
// required for a Pebble instance to ingest a foreign sstable on shared storage,
// including constructing any relevant objstorage.Provider / remoteobjcat.Catalog
// data structures, as well as creating virtual FileMetadatas.
//
// Note that the Pebble instance creating and returning a SharedSSTMeta might
// not be the one that created the underlying sstable on shared storage to begin
// with; it's possible for a Pebble instance to reshare an sstable that was
// shared to it.
type SharedSSTMeta struct {
	// Backing is the shared object underlying this SST. Can be attached to an
	// objstorage.Provider.
	Backing objstorage.RemoteObjectBackingHandle

	// Smallest and Largest internal keys for the overall bounds. The kind and
	// SeqNum of these will reflect what is physically present on the source
	// Pebble instance's view of the sstable; it's up to the ingesting instance
	// to set the sequence number in the trailer to match the read-time sequence
	// numbers reserved for the level this SST is being ingested into. The Kind
	// is expected to remain unchanged by the ingesting instance.
	//
	// Note that these bounds could be narrower than the bounds of the underlying
	// sstable; ScanInternal is expected to truncate sstable bounds to the user
	// key bounds passed into that method.
	Smallest, Largest InternalKey

	// SmallestRangeKey and LargestRangeKey are internal keys that denote the
	// range key bounds of this sstable. Must lie within [Smallest, Largest].
	SmallestRangeKey, LargestRangeKey InternalKey

	// SmallestPointKey and LargestPointKey are internal keys that denote the
	// point key bounds of this sstable. Must lie within [Smallest, Largest].
	SmallestPointKey, LargestPointKey InternalKey

	// Level denotes the level at which this file was present at read time.
	// For files visited by ScanInternal, this value will only be 5 or 6.
	Level uint8

	// Size contains an estimate of the size of this sstable.
	Size uint64

	// fileNum at time of creation in the creator instance. Only used for
	// debugging/tests.
	fileNum base.FileNum
}

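// A rough sketch of how SharedSSTMeta values are typically gathered, assuming
// a caller that wires a visitor through ScanInternal's options (the exact
// plumbing is elided here; scanInternalImpl below shows the call site):
//
//	var shared []*SharedSSTMeta
//	visitSharedFile := func(sst *SharedSSTMeta) error {
//		// The ingesting instance attaches sst.Backing to its own
//		// objstorage.Provider and constructs virtual FileMetadatas.
//		shared = append(shared, sst)
//		return nil
//	}
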
func (s *SharedSSTMeta) cloneFromFileMeta(f *fileMetadata) {
	*s = SharedSSTMeta{
		Smallest:         f.Smallest.Clone(),
		Largest:          f.Largest.Clone(),
		SmallestRangeKey: f.SmallestRangeKey.Clone(),
		LargestRangeKey:  f.LargestRangeKey.Clone(),
		SmallestPointKey: f.SmallestPointKey.Clone(),
		LargestPointKey:  f.LargestPointKey.Clone(),
		Size:             f.Size,
		fileNum:          f.FileNum,
	}
}

type sharedByLevel []SharedSSTMeta

func (s sharedByLevel) Len() int           { return len(s) }
func (s sharedByLevel) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s sharedByLevel) Less(i, j int) bool { return s[i].Level < s[j].Level }

type pcIterPos int

const (
	pcIterPosCur pcIterPos = iota
	pcIterPosNext
)

// pointCollapsingIterator is an internalIterator that collapses point keys and
// returns at most one point internal key for each user key. Merges and
// SingleDels are not supported and result in a panic if encountered. Point keys
// deleted by rangedels are considered shadowed and not exposed.
//
// Only used in ScanInternal to return at most one internal key per user key.
type pointCollapsingIterator struct {
	iter     keyspan.InterleavingIter
	pos      pcIterPos
	comparer *base.Comparer
	merge    base.Merge
	err      error
	seqNum   uint64
	// The current position of `iter`. Always owned by the underlying iter.
	iterKey *InternalKey
	// The last saved key. findNextEntry and similar methods are expected to save
	// the current value of iterKey to savedKey if they're iterating away from the
	// current key but still need to retain it. See comments in findNextEntry on
	// how this field is used.
	//
	// At the end of a positioning call:
	// - if pos == pcIterPosNext, iterKey is pointing to the next user key owned
	//   by `iter` while savedKey is holding a copy of our current key.
	// - If pos == pcIterPosCur, iterKey is pointing to an `iter`-owned current
	//   key, and savedKey is either undefined or pointing to a version of the
	//   current key owned by this iterator (i.e. backed by savedKeyBuf).
	savedKey    InternalKey
	savedKeyBuf []byte
	// Value at the current iterator position, at iterKey.
	iterValue base.LazyValue
	// If fixedSeqNum is non-zero, all emitted points are verified to have this
	// fixed sequence number.
	fixedSeqNum uint64
}

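// For example (illustrative, not from the source): given the interleaved
// internal keys
//
//	a#5,SET a#4,SET a#3,DEL b#2,SET
//
// a forward scan through a pointCollapsingIterator yields a#5,SET and then
// b#2,SET; the shadowed a#4,SET and a#3,DEL are collapsed away.
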
func (p *pointCollapsingIterator) Span() *keyspan.Span {
	return p.iter.Span()
}

// SeekPrefixGE implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.SeekPrefixGE(prefix, key, flags)
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// SeekGE implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.SeekGE(key, flags)
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// SeekLT implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

func (p *pointCollapsingIterator) resetKey() {
	p.savedKey.UserKey = p.savedKeyBuf[:0]
	p.savedKey.Trailer = 0
	p.iterKey = nil
	p.pos = pcIterPosCur
}

func (p *pointCollapsingIterator) verifySeqNum(key *base.InternalKey) *base.InternalKey {
	if !invariants.Enabled {
		return key
	}
	if p.fixedSeqNum == 0 || key == nil || key.Kind() == InternalKeyKindRangeDelete {
		return key
	}
	if key.SeqNum() != p.fixedSeqNum {
		panic(fmt.Sprintf("expected foreign point key to have seqnum %d, got %d", p.fixedSeqNum, key.SeqNum()))
	}
	return key
}

// findNextEntry is called to return the next key. p.iter must be positioned at
// the start of the first user key we are interested in.
func (p *pointCollapsingIterator) findNextEntry() (*base.InternalKey, base.LazyValue) {
	p.saveKey()
	// Saves a comparison in the fast path.
	firstIteration := true
	for p.iterKey != nil {
		// NB: p.savedKey is either the current key (iff p.iterKey == firstKey),
		// or the previous key.
		if !firstIteration && !p.comparer.Equal(p.iterKey.UserKey, p.savedKey.UserKey) {
			p.saveKey()
			continue
		}
		firstIteration = false
		if s := p.iter.Span(); s != nil && s.CoversAt(p.seqNum, p.iterKey.SeqNum()) {
			// All future keys for this user key must be deleted.
			if p.savedKey.Kind() == InternalKeyKindSingleDelete {
				panic("cannot process singledel key in point collapsing iterator")
			}
			// Fast forward to the next user key.
			p.saveKey()
			p.iterKey, p.iterValue = p.iter.Next()
			for p.iterKey != nil && p.savedKey.SeqNum() >= p.iterKey.SeqNum() && p.comparer.Equal(p.iterKey.UserKey, p.savedKey.UserKey) {
				p.iterKey, p.iterValue = p.iter.Next()
			}
			continue
		}
		switch p.savedKey.Kind() {
		case InternalKeyKindSet, InternalKeyKindDelete, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized:
			// Note that we return SETs directly, even if they would otherwise get
			// compacted into a Del to turn into a SetWithDelete. This is a fast
			// path optimization that can break SINGLEDEL determinism. For
			// consistent SINGLEDEL behaviour, this iterator should *not* be used
			// for a keyspace where SINGLEDELs could be in use. If this iterator
			// observes a SINGLEDEL as the first internal key for a user key, it
			// will panic.
			//
			// As p.value is a lazy value owned by the child iterator, we can
			// thread it through without loading it into p.valueBuf.
			//
			// TODO(bilal): We can even avoid saving the key in this fast path if
			// we are in a block where setHasSamePrefix = false in a v3 sstable,
			// guaranteeing that there's only one internal key for each user key.
			// Thread this logic through the sstable iterators and/or consider
			// collapsing (ha) this logic into the sstable iterators that are
			// aware of blocks and can determine user key changes without doing
			// key saves or comparisons.
			p.pos = pcIterPosCur
			return p.verifySeqNum(p.iterKey), p.iterValue
		case InternalKeyKindSingleDelete:
			// Panic, as this iterator is not expected to observe single deletes.
			panic("cannot process singledel key in point collapsing iterator")
		case InternalKeyKindMerge:
			// Panic, as this iterator is not expected to observe merges.
			panic("cannot process merge key in point collapsing iterator")
		case InternalKeyKindRangeDelete:
			// These are interleaved by the interleaving iterator ahead of all
			// points. We should pass them as-is, but also account for any points
			// ahead of them.
			p.pos = pcIterPosCur
			return p.verifySeqNum(p.iterKey), p.iterValue
		default:
			panic(fmt.Sprintf("unexpected kind: %d", p.iterKey.Kind()))
		}
	}
	p.resetKey()
	return nil, base.LazyValue{}
}

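// NB: only forward positioning is implemented (SeekGE, SeekPrefixGE, First,
// Next). SeekLT, Last, Prev and NextPrefix panic: ScanInternal only ever
// iterates forward, so reverse positioning is intentionally unimplemented.
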
// First implements the InternalIterator interface.
func (p *pointCollapsingIterator) First() (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.First()
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// Last implements the InternalIterator interface.
func (p *pointCollapsingIterator) Last() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

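// saveKey copies the current iterKey, if any, into savedKey, backed by
// savedKeyBuf, so that the key remains valid after the underlying iterator
// advances. If iterKey is nil, savedKey is reset to an empty key.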
func (p *pointCollapsingIterator) saveKey() {
	if p.iterKey == nil {
		p.savedKey = InternalKey{UserKey: p.savedKeyBuf[:0]}
		return
	}
	p.savedKeyBuf = append(p.savedKeyBuf[:0], p.iterKey.UserKey...)
	p.savedKey = InternalKey{UserKey: p.savedKeyBuf, Trailer: p.iterKey.Trailer}
}

// Next implements the InternalIterator interface.
func (p *pointCollapsingIterator) Next() (*base.InternalKey, base.LazyValue) {
	switch p.pos {
	case pcIterPosCur:
		p.saveKey()
		if p.iterKey != nil && p.iterKey.Kind() == InternalKeyKindRangeDelete {
			// Step over the interleaved range delete and process the very next
			// internal key, even if it's at the same user key. This is because a
			// point for that user key has not been returned yet.
			p.iterKey, p.iterValue = p.iter.Next()
			break
		}
		// Fast forward to the next user key.
		key, val := p.iter.Next()
		// p.savedKey.SeqNum() >= key.SeqNum() is an optimization that allows us
		// to use p.savedKey.SeqNum() < key.SeqNum() as a sign that the user key
		// has changed, without needing to do the full key comparison.
		for key != nil && p.savedKey.SeqNum() >= key.SeqNum() &&
			p.comparer.Equal(p.savedKey.UserKey, key.UserKey) {
			key, val = p.iter.Next()
		}
		if key == nil {
			// There are no keys to return.
			p.resetKey()
			return nil, base.LazyValue{}
		}
		p.iterKey, p.iterValue = key, val
	case pcIterPosNext:
		p.pos = pcIterPosCur
	}
	if p.iterKey == nil {
		p.resetKey()
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// NextPrefix implements the InternalIterator interface.
func (p *pointCollapsingIterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

// Prev implements the InternalIterator interface.
func (p *pointCollapsingIterator) Prev() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

// Error implements the InternalIterator interface.
func (p *pointCollapsingIterator) Error() error {
	if p.err != nil {
		return p.err
	}
	return p.iter.Error()
}

// Close implements the InternalIterator interface.
func (p *pointCollapsingIterator) Close() error {
	return p.iter.Close()
}

// SetBounds implements the InternalIterator interface.
func (p *pointCollapsingIterator) SetBounds(lower, upper []byte) {
	p.resetKey()
	p.iter.SetBounds(lower, upper)
}

// String implements the InternalIterator interface.
func (p *pointCollapsingIterator) String() string {
	return p.iter.String()
}

var _ internalIterator = &pointCollapsingIterator{}

// IteratorLevelKind is used to denote whether the current ScanInternal
// iterator is unknown, belongs to a flushable, or belongs to an LSM level.
type IteratorLevelKind int8

const (
	// IteratorLevelUnknown indicates an unknown LSM level.
	IteratorLevelUnknown IteratorLevelKind = iota
	// IteratorLevelLSM indicates an LSM level.
	IteratorLevelLSM
	// IteratorLevelFlushable indicates a flushable (i.e. memtable).
	IteratorLevelFlushable
)

// IteratorLevel is used with scanInternalIterator to surface additional
// iterator-specific info where possible. Note: this struct is only provided
// for point keys.
type IteratorLevel struct {
	Kind IteratorLevelKind
	// FlushableIndex indicates the position within the flushable queue of this
	// level. Only valid if Kind == IteratorLevelFlushable.
	FlushableIndex int
	// The level within the LSM. Only valid if Kind == IteratorLevelLSM.
	Level int
	// Sublevel is only valid if Kind == IteratorLevelLSM and Level == 0.
	Sublevel int
}

// scanInternalIterator is an iterator that returns all internal keys, including
// tombstones. For instance, an InternalKeyKindDelete would be returned as an
// InternalKeyKindDelete instead of the iterator skipping over to the next key.
// Internal keys within a user key are collapsed, e.g. if there are two SETs,
// the one with the higher sequence number is returned. Useful if an external
// user of Pebble needs to observe and rebuild Pebble's history of internal
// keys, such as in node-to-node replication. For use with
// {db,snapshot}.ScanInternal().
//
// scanInternalIterator is expected to ignore point keys deleted by range
// deletions, and range keys shadowed by a range key unset or delete; however,
// it *must* return the range delete as well as the range key unset/delete that
// did the shadowing.
type scanInternalIterator struct {
	db              *DB
	opts            scanInternalOptions
	comparer        *base.Comparer
	merge           Merge
	iter            internalIterator
	readState       *readState
	version         *version
	rangeKey        *iteratorRangeKeyState
	pointKeyIter    internalIterator
	iterKey         *InternalKey
	iterValue       LazyValue
	alloc           *iterAlloc
	newIters        tableNewIters
	newIterRangeKey keyspan.TableNewSpanIter
	seqNum          uint64
	iterLevels      []IteratorLevel
	mergingIter     *mergingIter

	// boundsBuf holds two buffers used to store the lower and upper bounds.
	// Whenever the InternalIterator's bounds change, the new bounds are copied
	// into boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
	// allocations. opts.LowerBound and opts.UpperBound point into this slice.
	boundsBuf    [2][]byte
	boundsBufIdx int
}

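// A scanInternalIterator is assembled by constructPointIter and
// constructRangeKeyIter below, driven by scanInternalImpl, and torn down by
// close, which also returns its pooled allocations.
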
// truncateSharedFile truncates a shared file's [Smallest, Largest] fields to
// [lower, upper), potentially opening iterators on the file to find keys within
// the requested bounds. A SharedSSTMeta is produced that is suitable for
// external consumption by other Pebble instances. If shouldSkip is true, this
// file does not contain any keys in [lower, upper) and can be skipped.
//
// TODO(bilal): If opening iterators and doing reads in this method is too
// inefficient, consider producing non-tight file bounds instead.
func (d *DB) truncateSharedFile(
	ctx context.Context,
	lower, upper []byte,
	level int,
	file *fileMetadata,
	objMeta objstorage.ObjectMetadata,
) (sst *SharedSSTMeta, shouldSkip bool, err error) {
	cmp := d.cmp
	sst = &SharedSSTMeta{}
	sst.cloneFromFileMeta(file)
	sst.Level = uint8(level)
	sst.Backing, err = d.objProvider.RemoteObjectBacking(&objMeta)
	if err != nil {
		return nil, false, err
	}
	needsLowerTruncate := cmp(lower, file.Smallest.UserKey) > 0
	needsUpperTruncate := cmp(upper, file.Largest.UserKey) < 0 || (cmp(upper, file.Largest.UserKey) == 0 && !file.Largest.IsExclusiveSentinel())
	// Fast path: file is entirely within [lower, upper).
	if !needsLowerTruncate && !needsUpperTruncate {
		return sst, false, nil
	}

	// We will need to truncate file bounds in at least one direction. Open all
	// relevant iterators.
	iter, rangeDelIter, err := d.newIters(ctx, file, &IterOptions{
		LowerBound: lower,
		UpperBound: upper,
		level:      manifest.Level(level),
	}, internalIterOpts{})
	if err != nil {
		return nil, false, err
	}
	defer iter.Close()
	if rangeDelIter != nil {
		rangeDelIter = keyspan.Truncate(
			cmp, rangeDelIter, lower, upper, nil, nil,
			false, /* panicOnUpperTruncate */
		)
		defer rangeDelIter.Close()
	}
	rangeKeyIter, err := d.tableNewRangeKeyIter(file, keyspan.SpanIterOptions{})
	if err != nil {
		return nil, false, err
	}
	if rangeKeyIter != nil {
		rangeKeyIter = keyspan.Truncate(
			cmp, rangeKeyIter, lower, upper, nil, nil,
			false, /* panicOnUpperTruncate */
		)
		defer rangeKeyIter.Close()
	}
	// Check if we need to truncate on the left side. This means finding a new
	// SmallestPointKey and SmallestRangeKey that is >= lower.
	if needsLowerTruncate {
		sst.SmallestPointKey.UserKey = sst.SmallestPointKey.UserKey[:0]
		sst.SmallestPointKey.Trailer = 0
		key, _ := iter.SeekGE(lower, base.SeekGEFlagsNone)
		foundPointKey := key != nil
		if key != nil {
			sst.SmallestPointKey.CopyFrom(*key)
		}
		if rangeDelIter != nil {
			span := rangeDelIter.SeekGE(lower)
			if span != nil && (len(sst.SmallestPointKey.UserKey) == 0 || base.InternalCompare(cmp, span.SmallestKey(), sst.SmallestPointKey) < 0) {
				sst.SmallestPointKey.CopyFrom(span.SmallestKey())
				foundPointKey = true
			}
		}
		if !foundPointKey {
			// There are no point keys in the span we're interested in.
			sst.SmallestPointKey = InternalKey{}
			sst.LargestPointKey = InternalKey{}
		}
		sst.SmallestRangeKey.UserKey = sst.SmallestRangeKey.UserKey[:0]
		sst.SmallestRangeKey.Trailer = 0
		if rangeKeyIter != nil {
			span := rangeKeyIter.SeekGE(lower)
			if span != nil {
				sst.SmallestRangeKey.CopyFrom(span.SmallestKey())
			} else {
				// There are no range keys in the span we're interested in.
				sst.SmallestRangeKey = InternalKey{}
				sst.LargestRangeKey = InternalKey{}
			}
		}
	}
	// Check if we need to truncate on the right side. This means finding a new
	// LargestPointKey and LargestRangeKey that is < upper.
	if needsUpperTruncate {
		sst.LargestPointKey.UserKey = sst.LargestPointKey.UserKey[:0]
		sst.LargestPointKey.Trailer = 0
		key, _ := iter.SeekLT(upper, base.SeekLTFlagsNone)
		foundPointKey := key != nil
		if key != nil {
			sst.LargestPointKey.CopyFrom(*key)
		}
		if rangeDelIter != nil {
			span := rangeDelIter.SeekLT(upper)
			if span != nil && (len(sst.LargestPointKey.UserKey) == 0 || base.InternalCompare(cmp, span.LargestKey(), sst.LargestPointKey) > 0) {
				sst.LargestPointKey.CopyFrom(span.LargestKey())
				foundPointKey = true
			}
		}
		if !foundPointKey {
			// There are no point keys in the span we're interested in.
			sst.SmallestPointKey = InternalKey{}
			sst.LargestPointKey = InternalKey{}
		}
		sst.LargestRangeKey.UserKey = sst.LargestRangeKey.UserKey[:0]
		sst.LargestRangeKey.Trailer = 0
		if rangeKeyIter != nil {
			span := rangeKeyIter.SeekLT(upper)
			if span != nil {
				sst.LargestRangeKey.CopyFrom(span.LargestKey())
			} else {
				// There are no range keys in the span we're interested in.
				sst.SmallestRangeKey = InternalKey{}
				sst.LargestRangeKey = InternalKey{}
			}
		}
	}
	// Set overall bounds based on {Smallest,Largest}{Point,Range}Key.
	switch {
	case len(sst.SmallestRangeKey.UserKey) == 0:
		sst.Smallest = sst.SmallestPointKey
	case len(sst.SmallestPointKey.UserKey) == 0:
		sst.Smallest = sst.SmallestRangeKey
	default:
		sst.Smallest = sst.SmallestPointKey
		if base.InternalCompare(cmp, sst.SmallestRangeKey, sst.SmallestPointKey) < 0 {
			sst.Smallest = sst.SmallestRangeKey
		}
	}
	switch {
	case len(sst.LargestRangeKey.UserKey) == 0:
		sst.Largest = sst.LargestPointKey
	case len(sst.LargestPointKey.UserKey) == 0:
		sst.Largest = sst.LargestRangeKey
	default:
		sst.Largest = sst.LargestPointKey
		if base.InternalCompare(cmp, sst.LargestRangeKey, sst.LargestPointKey) > 0 {
			sst.Largest = sst.LargestRangeKey
		}
	}
	// On rare occasion, a file might overlap with [lower, upper) but not
	// actually have any keys within those bounds. Skip such files.
	if len(sst.Smallest.UserKey) == 0 {
		return nil, true, nil
	}
	sst.Size, err = d.tableCache.estimateSize(file, sst.Smallest.UserKey, sst.Largest.UserKey)
	if err != nil {
		return nil, false, err
	}
	// On occasion, estimateSize gives us a low estimate, i.e. a 0 file size.
	// This can cause panics in places where we divide by file sizes. Correct
	// for it here.
	if sst.Size == 0 {
		sst.Size = 1
	}
	return sst, false, nil
}

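// As an illustrative example (not from the source): for a file whose keys
// span [a, z] and scan bounds [lower, upper) = [c, m), both
// needsLowerTruncate and needsUpperTruncate hold. The function then seeks the
// file's iterators to find the smallest point/range keys >= c and the largest
// ones < m, and rewrites sst.Smallest/sst.Largest to those tighter bounds. If
// the file contained no keys within [c, m) at all, shouldSkip is returned as
// true.
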
func scanInternalImpl(
	ctx context.Context, lower, upper []byte, iter *scanInternalIterator, opts *scanInternalOptions,
) error {
	if opts.visitSharedFile != nil && (lower == nil || upper == nil) {
		panic("lower and upper bounds must be specified in skip-shared iteration mode")
	}
	// Before starting iteration, check if any files in levels sharedLevelsStart
	// and below are *not* shared. Error out if that is the case, as skip-shared
	// iteration will not produce a consistent point-in-time view of this range
	// of keys. For files that are shared, call visitSharedFile with a truncated
	// version of that file.
	cmp := iter.comparer.Compare
	provider := iter.db.ObjProvider()
	seqNum := iter.seqNum
	current := iter.version
	if current == nil {
		current = iter.readState.current
	}
	if opts.visitSharedFile != nil {
		if provider == nil {
			panic("expected non-nil Provider in skip-shared iteration mode")
		}
		for level := sharedLevelsStart; level < numLevels; level++ {
			files := current.Levels[level].Iter()
			for f := files.SeekGE(cmp, lower); f != nil && cmp(f.Smallest.UserKey, upper) < 0; f = files.Next() {
				var objMeta objstorage.ObjectMetadata
				var err error
				objMeta, err = provider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
				if err != nil {
					return err
				}
				if !objMeta.IsShared() {
					return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s is not shared", objMeta.DiskFileNum)
				}
				if !base.Visible(f.LargestSeqNum, seqNum, base.InternalKeySeqNumMax) {
					return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s contains keys newer than snapshot", objMeta.DiskFileNum)
				}
				var sst *SharedSSTMeta
				var skip bool
				sst, skip, err = iter.db.truncateSharedFile(ctx, lower, upper, level, f, objMeta)
				if err != nil {
					return err
				}
				if skip {
					continue
				}
				if err = opts.visitSharedFile(sst); err != nil {
					return err
				}
			}
		}
	}

	for valid := iter.seekGE(lower); valid && iter.error() == nil; valid = iter.next() {
		key := iter.unsafeKey()

		if opts.rateLimitFunc != nil {
			if err := opts.rateLimitFunc(key, iter.lazyValue()); err != nil {
				return err
			}
		}

		switch key.Kind() {
		case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet:
			if opts.visitRangeKey != nil {
				span := iter.unsafeSpan()
				// NB: The caller isn't interested in the sequence numbers of
				// these range keys. Rather, the caller wants them to be in
				// trailer order _after_ zeroing of sequence numbers. Copy
				// span.Keys, sort it, and then call visitRangeKey.
				keysCopy := make([]keyspan.Key, len(span.Keys))
				for i := range span.Keys {
					keysCopy[i] = span.Keys[i]
					keysCopy[i].Trailer = base.MakeTrailer(0, span.Keys[i].Kind())
				}
				keyspan.SortKeysByTrailer(&keysCopy)
				if err := opts.visitRangeKey(span.Start, span.End, keysCopy); err != nil {
					return err
				}
			}
		case InternalKeyKindRangeDelete:
			if opts.visitRangeDel != nil {
				rangeDel := iter.unsafeRangeDel()
				if err := opts.visitRangeDel(rangeDel.Start, rangeDel.End, rangeDel.LargestSeqNum()); err != nil {
					return err
				}
			}
		default:
			if opts.visitPointKey != nil {
				var info IteratorLevel
				if len(iter.mergingIter.heap.items) > 0 {
					mergingIterIdx := iter.mergingIter.heap.items[0].index
					info = iter.iterLevels[mergingIterIdx]
				} else {
					info = IteratorLevel{Kind: IteratorLevelUnknown}
				}
				val := iter.lazyValue()
				if err := opts.visitPointKey(key, val, info); err != nil {
					return err
				}
			}
		}
	}

	return nil
}

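// The visitor signatures implied by the call sites above (a sketch; the
// actual scanInternalOptions definition lives elsewhere in this package):
//
//	visitPointKey   func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error
//	visitRangeDel   func(start, end []byte, seqNum uint64) error
//	visitRangeKey   func(start, end []byte, keys []keyspan.Key) error
//	visitSharedFile func(sst *SharedSSTMeta) error
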
// constructPointIter constructs a merging iterator and sets i.iter to it.
func (i *scanInternalIterator) constructPointIter(memtables flushableList, buf *iterAlloc) {
	// Merging levels and levels from iterAlloc.
	mlevels := buf.mlevels[:0]
	levels := buf.levels[:0]

	// We compute the number of levels needed ahead of time and reallocate a
	// slice if the array from the iterAlloc isn't large enough. Doing this
	// allocation once should improve the performance.
	numMergingLevels := len(memtables)
	numLevelIters := 0

	current := i.version
	if current == nil {
		current = i.readState.current
	}
	numMergingLevels += len(current.L0SublevelFiles)
	numLevelIters += len(current.L0SublevelFiles)

	for level := 1; level < len(current.Levels); level++ {
		if current.Levels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		numMergingLevels++
		numLevelIters++
	}

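	// For example (hypothetical numbers): with 2 memtables, 3 L0 sublevels,
	// non-empty L3 and L4, and skipSharedLevels skipping a non-empty L5,
	// numMergingLevels = 2 + 3 + 2 = 7 and numLevelIters = 3 + 2 = 5.
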
	if numMergingLevels > cap(mlevels) {
		mlevels = make([]mergingIterLevel, 0, numMergingLevels)
	}
	if numLevelIters > cap(levels) {
		levels = make([]levelIter, 0, numLevelIters)
	}
	// TODO(bilal): Push these into the iterAlloc buf.
	var rangeDelMiter keyspan.MergingIter
	rangeDelIters := make([]keyspan.FragmentIterator, 0, numMergingLevels)
	rangeDelLevels := make([]keyspan.LevelIter, 0, numLevelIters)

	i.iterLevels = make([]IteratorLevel, numMergingLevels)
	mlevelsIndex := 0

	// First are the memtables, added newest first.
	for j := len(memtables) - 1; j >= 0; j-- {
		mem := memtables[j]
		mlevels = append(mlevels, mergingIterLevel{
			iter: mem.newIter(&i.opts.IterOptions),
		})
		i.iterLevels[mlevelsIndex] = IteratorLevel{
			Kind:           IteratorLevelFlushable,
			FlushableIndex: j,
		}
		mlevelsIndex++
		if rdi := mem.newRangeDelIter(&i.opts.IterOptions); rdi != nil {
			rangeDelIters = append(rangeDelIters, rdi)
		}
	}

	// Next are the file levels: L0 sub-levels followed by lower levels.
	levelsIndex := len(levels)
	mlevels = mlevels[:numMergingLevels]
	levels = levels[:numLevelIters]
	rangeDelLevels = rangeDelLevels[:numLevelIters]
	i.opts.IterOptions.snapshotForHideObsoletePoints = i.seqNum
	addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) {
		li := &levels[levelsIndex]
		rli := &rangeDelLevels[levelsIndex]

		li.init(
			context.Background(), i.opts.IterOptions, i.comparer, i.newIters, files, level,
			internalIterOpts{})
		li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext)
		mlevels[mlevelsIndex].iter = li
		rli.Init(keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters},
			i.comparer.Compare, tableNewRangeDelIter(context.Background(), i.newIters), files, level,
			manifest.KeyTypePoint)
		rangeDelIters = append(rangeDelIters, rli)

		levelsIndex++
		mlevelsIndex++
	}

	for j := len(current.L0SublevelFiles) - 1; j >= 0; j-- {
		i.iterLevels[mlevelsIndex] = IteratorLevel{
			Kind:     IteratorLevelLSM,
			Level:    0,
			Sublevel: j,
		}
		addLevelIterForFiles(current.L0SublevelFiles[j].Iter(), manifest.L0Sublevel(j))
	}
	// Add level iterators for the non-empty non-L0 levels.
	for level := 1; level < numLevels; level++ {
		if current.Levels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		i.iterLevels[mlevelsIndex] = IteratorLevel{Kind: IteratorLevelLSM, Level: level}
		addLevelIterForFiles(current.Levels[level].Iter(), manifest.Level(level))
	}

	buf.merging.init(&i.opts.IterOptions, &InternalIteratorStats{}, i.comparer.Compare, i.comparer.Split, mlevels...)
	buf.merging.snapshot = i.seqNum
	rangeDelMiter.Init(i.comparer.Compare, keyspan.VisibleTransform(i.seqNum), new(keyspan.MergingBuffers), rangeDelIters...)

	if i.opts.includeObsoleteKeys {
		iiter := &keyspan.InterleavingIter{}
		iiter.Init(i.comparer, &buf.merging, &rangeDelMiter,
			keyspan.InterleavingIterOpts{
				LowerBound: i.opts.LowerBound,
				UpperBound: i.opts.UpperBound,
			})
		i.pointKeyIter = iiter
	} else {
		pcIter := &pointCollapsingIterator{
			comparer: i.comparer,
			merge:    i.merge,
			seqNum:   i.seqNum,
		}
		pcIter.iter.Init(i.comparer, &buf.merging, &rangeDelMiter, keyspan.InterleavingIterOpts{
			LowerBound: i.opts.LowerBound,
			UpperBound: i.opts.UpperBound,
		})
		i.pointKeyIter = pcIter
	}
	i.iter = i.pointKeyIter
}

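// The resulting merging levels are ordered newest-to-oldest: flushables
// (memtables) first, then L0 sublevels from highest to lowest, then L1
// through the last level. i.iterLevels mirrors this ordering, which is how
// scanInternalImpl maps the merging iterator's top-of-heap index back to an
// IteratorLevel.
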
// constructRangeKeyIter constructs the range-key iterator stack, populating
// i.rangeKey.rangeKeyIter with the resulting iterator. This is similar to
// Iterator.constructRangeKeyIter, except it doesn't handle batches and ensures
// iterConfig does *not* elide unsets/deletes.
func (i *scanInternalIterator) constructRangeKeyIter() {
	// We want the bounded iter from iterConfig, but not the collapsing of
	// RangeKeyUnsets and RangeKeyDels.
	i.rangeKey.rangeKeyIter = i.rangeKey.iterConfig.Init(
		i.comparer, i.seqNum, i.opts.LowerBound, i.opts.UpperBound,
		nil /* hasPrefix */, nil /* prefix */, true, /* internalKeys */
		&i.rangeKey.rangeKeyBuffers.internal)

	// Next are the flushables: memtables and large batches.
	if i.readState != nil {
		for j := len(i.readState.memtables) - 1; j >= 0; j-- {
			mem := i.readState.memtables[j]
			// We only need to read from memtables which contain sequence numbers
			// older than seqNum.
			if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum {
				continue
			}
			if rki := mem.newRangeKeyIter(&i.opts.IterOptions); rki != nil {
				i.rangeKey.iterConfig.AddLevel(rki)
			}
		}
	}

	current := i.version
	if current == nil {
		current = i.readState.current
	}
	// Next are the file levels: L0 sub-levels followed by lower levels.
	//
	// Add file-specific iterators for L0 files containing range keys. This is
	// less efficient than using levelIters for sublevels of L0 files containing
	// range keys, but range keys are expected to be sparse anyway, reducing the
	// cost benefit of maintaining a separate L0Sublevels instance for range key
	// files and then using it here.
	//
	// NB: We iterate L0's files in reverse order. They're sorted by
	// LargestSeqNum ascending, and we need to add them to the merging iterator
	// in LargestSeqNum descending to preserve the merging iterator's invariants
	// around Key Trailer order.
	iter := current.RangeKeyLevels[0].Iter()
	for f := iter.Last(); f != nil; f = iter.Prev() {
		spanIter, err := i.newIterRangeKey(f, i.opts.SpanIterOptions())
		if err != nil {
			i.rangeKey.iterConfig.AddLevel(&errorKeyspanIter{err: err})
			continue
		}
		i.rangeKey.iterConfig.AddLevel(spanIter)
	}

	// Add level iterators for the non-empty non-L0 levels.
	for level := 1; level < len(current.RangeKeyLevels); level++ {
		if current.RangeKeyLevels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		li := i.rangeKey.iterConfig.NewLevelIter()
		spanIterOpts := i.opts.SpanIterOptions()
		li.Init(spanIterOpts, i.comparer.Compare, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(),
			manifest.Level(level), manifest.KeyTypeRange)
		i.rangeKey.iterConfig.AddLevel(li)
	}
}

// seekGE seeks this iterator to the first key that's greater than or equal
// to the specified user key.
func (i *scanInternalIterator) seekGE(key []byte) bool {
	i.iterKey, i.iterValue = i.iter.SeekGE(key, base.SeekGEFlagsNone)
	return i.iterKey != nil
}

// unsafeKey returns the unsafe InternalKey at the current position. The
// returned key is nil if the iterator is invalid or exhausted.
func (i *scanInternalIterator) unsafeKey() *InternalKey {
	return i.iterKey
}

// lazyValue returns a value pointer to the value at the current iterator
// position. Behaviour undefined if unsafeKey() returns a Range key or Rangedel
// kind key.
func (i *scanInternalIterator) lazyValue() LazyValue {
	return i.iterValue
}

// unsafeRangeDel returns a rangedel span. Behaviour undefined if unsafeKey()
// returns a non-rangedel kind key.
func (i *scanInternalIterator) unsafeRangeDel() *keyspan.Span {
	type spanInternalIterator interface {
		Span() *keyspan.Span
	}
	return i.pointKeyIter.(spanInternalIterator).Span()
}

// unsafeSpan returns a range key span. Behaviour undefined if unsafeKey()
// returns a non-rangekey kind key.
func (i *scanInternalIterator) unsafeSpan() *keyspan.Span {
	return i.rangeKey.iiter.Span()
}

// next advances the iterator in the forward direction, and returns the
// iterator's new validity state.
func (i *scanInternalIterator) next() bool {
	i.iterKey, i.iterValue = i.iter.Next()
	return i.iterKey != nil
}

// error returns an error from the internal iterator, if there's any.
func (i *scanInternalIterator) error() error {
	return i.iter.Error()
}

// close closes this iterator, and releases any pooled objects.
func (i *scanInternalIterator) close() error {
	if err := i.iter.Close(); err != nil {
		return err
	}
	if i.readState != nil {
		i.readState.unref()
	}
	if i.version != nil {
		i.version.Unref()
	}
	if i.rangeKey != nil {
		i.rangeKey.PrepareForReuse()
		*i.rangeKey = iteratorRangeKeyState{
			rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
		}
		iterRangeKeyStateAllocPool.Put(i.rangeKey)
		i.rangeKey = nil
	}
	if alloc := i.alloc; alloc != nil {
		for j := range i.boundsBuf {
			if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize {
				alloc.boundsBuf[j] = nil
			} else {
				alloc.boundsBuf[j] = i.boundsBuf[j]
			}
		}
		*alloc = iterAlloc{
			keyBuf:              alloc.keyBuf[:0],
			boundsBuf:           alloc.boundsBuf,
			prefixOrFullSeekKey: alloc.prefixOrFullSeekKey[:0],
		}
		iterAllocPool.Put(alloc)
		i.alloc = nil
	}
	return nil
}

func (i *scanInternalIterator) initializeBoundBufs(lower, upper []byte) {
	buf := i.boundsBuf[i.boundsBufIdx][:0]
	if lower != nil {
		buf = append(buf, lower...)
		i.opts.LowerBound = buf
	} else {
		i.opts.LowerBound = nil
	}
	if upper != nil {
		buf = append(buf, upper...)
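		// Both bounds share buf to reduce allocations (see the boundsBuf field
		// comment); re-slice so UpperBound aliases only the bytes appended
		// above, leaving the LowerBound prefix untouched.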
		i.opts.UpperBound = buf[len(buf)-len(upper):]
	} else {
		i.opts.UpperBound = nil
	}
	i.boundsBuf[i.boundsBufIdx] = buf
	i.boundsBufIdx = 1 - i.boundsBufIdx
}