github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/scan_internal.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"context"
	"fmt"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/sstable"
)

const (
	// In skip-shared iteration mode, keys in levels sharedLevelsStart and greater
	// (i.e. lower in the LSM) are skipped.
	sharedLevelsStart = remote.SharedLevelsStart
)

// ErrInvalidSkipSharedIteration is returned by ScanInternal if it was called
// with a shared file visitor function, and a file in a shareable level (i.e.
// level >= sharedLevelsStart) was found to not be in shared storage according
// to objstorage.Provider, or not shareable for another reason such as
// containing keys newer than the snapshot sequence number.
var ErrInvalidSkipSharedIteration = errors.New("pebble: cannot use skip-shared iteration due to non-shareable files in lower levels")
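
// Illustrative sketch (an editorial addition, not part of the original file):
// callers that request skip-shared iteration should be prepared to handle this
// error, e.g. by retrying without a shared-file visitor. The call shape below
// is abbreviated and hypothetical:
//
//	err := db.ScanInternal(ctx, ...)
//	if errors.Is(err, ErrInvalidSkipSharedIteration) {
//		// Fall back: rerun the scan without visitSharedFile so shared
//		// levels are read key-by-key instead of being skipped.
//	}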

// SharedSSTMeta represents an sstable on shared storage that can be ingested
// by another pebble instance. This struct must contain all fields that are
// required for a Pebble instance to ingest a foreign sstable on shared storage,
// including constructing any relevant objstorage.Provider / remoteobjcat.Catalog
// data structures, as well as creating virtual FileMetadatas.
//
// Note that the Pebble instance creating and returning a SharedSSTMeta might
// not be the one that created the underlying sstable on shared storage to begin
// with; it's possible for a Pebble instance to reshare an sstable that was
// shared to it.
type SharedSSTMeta struct {
	// Backing is the shared object underlying this SST. Can be attached to an
	// objstorage.Provider.
	Backing objstorage.RemoteObjectBackingHandle

	// Smallest and Largest internal keys for the overall bounds. The kind and
	// SeqNum of these will reflect what is physically present on the source Pebble
	// instance's view of the sstable; it's up to the ingesting instance to set the
	// sequence number in the trailer to match the read-time sequence numbers
	// reserved for the level this SST is being ingested into. The Kind is expected
	// to remain unchanged by the ingesting instance.
	//
	// Note that these bounds could be narrower than the bounds of the underlying
	// sstable; ScanInternal is expected to truncate sstable bounds to the user key
	// bounds passed into that method.
	Smallest, Largest InternalKey

	// SmallestRangeKey and LargestRangeKey are internal keys that denote the
	// range key bounds of this sstable. Must lie within [Smallest, Largest].
	SmallestRangeKey, LargestRangeKey InternalKey

	// SmallestPointKey and LargestPointKey are internal keys that denote the
	// point key bounds of this sstable. Must lie within [Smallest, Largest].
	SmallestPointKey, LargestPointKey InternalKey

	// Level denotes the level at which this file was present at read time.
	// For files visited by ScanInternal, this value will only be 5 or 6.
	Level uint8

	// Size contains an estimate of the size of this sstable.
	Size uint64

	// fileNum at time of creation in the creator instance. Only used for
	// debugging/tests.
	fileNum base.FileNum
}

func (s *SharedSSTMeta) cloneFromFileMeta(f *fileMetadata) {
	*s = SharedSSTMeta{
		Smallest:         f.Smallest.Clone(),
		Largest:          f.Largest.Clone(),
		SmallestRangeKey: f.SmallestRangeKey.Clone(),
		LargestRangeKey:  f.LargestRangeKey.Clone(),
		SmallestPointKey: f.SmallestPointKey.Clone(),
		LargestPointKey:  f.LargestPointKey.Clone(),
		Size:             f.Size,
		fileNum:          f.FileNum,
	}
}

type sharedByLevel []SharedSSTMeta

func (s sharedByLevel) Len() int           { return len(s) }
func (s sharedByLevel) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s sharedByLevel) Less(i, j int) bool { return s[i].Level < s[j].Level }
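
// Illustrative sketch (an editorial addition, not part of the original file):
// sharedByLevel implements sort.Interface, so a caller collecting the
// SharedSSTMetas handed to its shared-file visitor can order them with L5
// files before L6 files:
//
//	sort.Sort(sharedByLevel(metas)) // metas is a []SharedSSTMeta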

type pcIterPos int

const (
	pcIterPosCur pcIterPos = iota
	pcIterPosNext
)

// pointCollapsingIterator is an internalIterator that collapses point keys and
// returns at most one point internal key for each user key. Merges and
// SingleDels are not supported and result in a panic if encountered. Point keys
// deleted by rangedels are considered shadowed and not exposed.
//
// Only used in ScanInternal to return at most one internal key per user key.
type pointCollapsingIterator struct {
	iter     keyspan.InterleavingIter
	pos      pcIterPos
	comparer *base.Comparer
	merge    base.Merge
	err      error
	seqNum   uint64
	// The current position of `iter`. Always owned by the underlying iter.
	iterKey *InternalKey
	// The last saved key. findNextEntry and similar methods are expected to save
	// the current value of iterKey to savedKey if they're iterating away from the
	// current key but still need to retain it. See comments in findNextEntry on
	// how this field is used.
	//
	// At the end of a positioning call:
	//  - if pos == pcIterPosNext, iterKey is pointing to the next user key owned
	//    by `iter` while savedKey is holding a copy of our current key.
	//  - If pos == pcIterPosCur, iterKey is pointing to an `iter`-owned current
	//    key, and savedKey is either undefined or pointing to a version of the
	//    current key owned by this iterator (i.e. backed by savedKeyBuf).
	savedKey    InternalKey
	savedKeyBuf []byte
	// Value at the current iterator position, at iterKey.
	iterValue base.LazyValue
	// If fixedSeqNum is non-zero, all emitted points are verified to have this
	// fixed sequence number.
	fixedSeqNum uint64
}

func (p *pointCollapsingIterator) Span() *keyspan.Span {
	return p.iter.Span()
}

// SeekPrefixGE implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.SeekPrefixGE(prefix, key, flags)
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// SeekGE implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.SeekGE(key, flags)
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// SeekLT implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

func (p *pointCollapsingIterator) resetKey() {
	p.savedKey.UserKey = p.savedKeyBuf[:0]
	p.savedKey.Trailer = 0
	p.iterKey = nil
	p.pos = pcIterPosCur
}

func (p *pointCollapsingIterator) verifySeqNum(key *base.InternalKey) *base.InternalKey {
	if !invariants.Enabled {
		return key
	}
	if p.fixedSeqNum == 0 || key == nil || key.Kind() == InternalKeyKindRangeDelete {
		return key
	}
	if key.SeqNum() != p.fixedSeqNum {
		panic(fmt.Sprintf("expected foreign point key to have seqnum %d, got %d", p.fixedSeqNum, key.SeqNum()))
	}
	return key
}
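
// Worked example (an editorial addition): given these internal keys surfaced
// by the underlying interleaving iterator,
//
//	a#5,SET  a#3,SET  a#2,DEL  b#4,DEL  b#1,SET
//
// findNextEntry below surfaces only a#5,SET and b#4,DEL: the newest internal
// key for each user key wins, older versions are skipped, and tombstones are
// returned rather than elided.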

// findNextEntry is called to return the next key. p.iter must be positioned at the
// start of the first user key we are interested in.
func (p *pointCollapsingIterator) findNextEntry() (*base.InternalKey, base.LazyValue) {
	p.saveKey()
	// Saves a comparison in the fast path.
	firstIteration := true
	for p.iterKey != nil {
		// NB: p.savedKey is either the current key (iff p.iterKey == firstKey),
		// or the previous key.
		if !firstIteration && !p.comparer.Equal(p.iterKey.UserKey, p.savedKey.UserKey) {
			p.saveKey()
			continue
		}
		firstIteration = false
		if s := p.iter.Span(); s != nil && s.CoversAt(p.seqNum, p.iterKey.SeqNum()) {
			// All future keys for this user key must be deleted.
			if p.savedKey.Kind() == InternalKeyKindSingleDelete {
				panic("cannot process singledel key in point collapsing iterator")
			}
			// Fast forward to the next user key.
			p.saveKey()
			p.iterKey, p.iterValue = p.iter.Next()
			for p.iterKey != nil && p.savedKey.SeqNum() >= p.iterKey.SeqNum() && p.comparer.Equal(p.iterKey.UserKey, p.savedKey.UserKey) {
				p.iterKey, p.iterValue = p.iter.Next()
			}
			continue
		}
		switch p.savedKey.Kind() {
		case InternalKeyKindSet, InternalKeyKindDelete, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized:
			// Note that we return SETs directly, even if they would otherwise get
			// compacted into a Del to turn into a SetWithDelete. This is a fast
			// path optimization that can break SINGLEDEL determinism. To lead to
			// consistent SINGLEDEL behaviour, this iterator should *not* be used for
			// a keyspace where SINGLEDELs could be in use. If this iterator observes
			// a SINGLEDEL as the first internal key for a user key, it will panic.
			//
			// As p.value is a lazy value owned by the child iterator, we can thread
			// it through without loading it into p.valueBuf.
			//
			// TODO(bilal): We can even avoid saving the key in this fast path if
			// we are in a block where setHasSamePrefix = false in a v3 sstable,
			// guaranteeing that there's only one internal key for each user key.
			// Thread this logic through the sstable iterators and/or consider
			// collapsing (ha) this logic into the sstable iterators that are aware
			// of blocks and can determine user key changes without doing key saves
			// or comparisons.
			p.pos = pcIterPosCur
			return p.verifySeqNum(p.iterKey), p.iterValue
		case InternalKeyKindSingleDelete:
			// Panic, as this iterator is not expected to observe single deletes.
			panic("cannot process singledel key in point collapsing iterator")
		case InternalKeyKindMerge:
			// Panic, as this iterator is not expected to observe merges.
			panic("cannot process merge key in point collapsing iterator")
		case InternalKeyKindRangeDelete:
			// These are interleaved by the interleaving iterator ahead of all points.
			// We should pass them as-is, but also account for any points ahead of
			// them.
			p.pos = pcIterPosCur
			return p.verifySeqNum(p.iterKey), p.iterValue
		default:
			panic(fmt.Sprintf("unexpected kind: %d", p.iterKey.Kind()))
		}
	}
	p.resetKey()
	return nil, base.LazyValue{}
}

// First implements the InternalIterator interface.
func (p *pointCollapsingIterator) First() (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.First()
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// Last implements the InternalIterator interface.
func (p *pointCollapsingIterator) Last() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

func (p *pointCollapsingIterator) saveKey() {
	if p.iterKey == nil {
		p.savedKey = InternalKey{UserKey: p.savedKeyBuf[:0]}
		return
	}
	p.savedKeyBuf = append(p.savedKeyBuf[:0], p.iterKey.UserKey...)
	p.savedKey = InternalKey{UserKey: p.savedKeyBuf, Trailer: p.iterKey.Trailer}
}

// Next implements the InternalIterator interface.
func (p *pointCollapsingIterator) Next() (*base.InternalKey, base.LazyValue) {
	switch p.pos {
	case pcIterPosCur:
		p.saveKey()
		if p.iterKey != nil && p.iterKey.Kind() == InternalKeyKindRangeDelete {
			// Step over the interleaved range delete and process the very next
			// internal key, even if it's at the same user key. This is because a
			// point for that user key has not been returned yet.
			p.iterKey, p.iterValue = p.iter.Next()
			break
		}
		// Fast forward to the next user key.
		key, val := p.iter.Next()
		// p.savedKey.SeqNum() >= key.SeqNum() is an optimization that allows us to
		// use p.savedKey.SeqNum() < key.SeqNum() as a sign that the user key has
		// changed, without needing to do the full key comparison.
		for key != nil && p.savedKey.SeqNum() >= key.SeqNum() &&
			p.comparer.Equal(p.savedKey.UserKey, key.UserKey) {
			key, val = p.iter.Next()
		}
		if key == nil {
			// There are no keys to return.
			p.resetKey()
			return nil, base.LazyValue{}
		}
		p.iterKey, p.iterValue = key, val
	case pcIterPosNext:
		p.pos = pcIterPosCur
	}
	if p.iterKey == nil {
		p.resetKey()
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}
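
// Note on the fast path in Next above (an editorial addition): within a single
// user key, internal keys are ordered by descending sequence number, so a
// sequence number that rises between two adjacent keys implies the user key
// changed. For example, in
//
//	a#5 → a#3 → a#2 → b#4
//
// the only step where the sequence number increases (2 → 4) is the user-key
// boundary, which is why p.savedKey.SeqNum() < key.SeqNum() can stand in for
// the full key comparison.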

// NextPrefix implements the InternalIterator interface.
func (p *pointCollapsingIterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

// Prev implements the InternalIterator interface.
func (p *pointCollapsingIterator) Prev() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

// Error implements the InternalIterator interface.
func (p *pointCollapsingIterator) Error() error {
	if p.err != nil {
		return p.err
	}
	return p.iter.Error()
}

// Close implements the InternalIterator interface.
func (p *pointCollapsingIterator) Close() error {
	return p.iter.Close()
}

// SetBounds implements the InternalIterator interface.
func (p *pointCollapsingIterator) SetBounds(lower, upper []byte) {
	p.resetKey()
	p.iter.SetBounds(lower, upper)
}

func (p *pointCollapsingIterator) SetContext(ctx context.Context) {
	p.iter.SetContext(ctx)
}

// String implements the InternalIterator interface.
func (p *pointCollapsingIterator) String() string {
	return p.iter.String()
}

var _ internalIterator = &pointCollapsingIterator{}

// IteratorLevelKind is used to denote whether the current ScanInternal iterator
// is unknown, belongs to a flushable, or belongs to an LSM level type.
type IteratorLevelKind int8

const (
	// IteratorLevelUnknown indicates an unknown LSM level.
	IteratorLevelUnknown IteratorLevelKind = iota
	// IteratorLevelLSM indicates an LSM level.
	IteratorLevelLSM
	// IteratorLevelFlushable indicates a flushable (i.e. memtable).
	IteratorLevelFlushable
)

// IteratorLevel is used with scanInternalIterator to surface additional
// iterator-specific info where possible. Note: this struct is only provided
// for point keys.
type IteratorLevel struct {
	Kind IteratorLevelKind
	// FlushableIndex indicates the position within the flushable queue of this
	// level. Only valid if Kind == IteratorLevelFlushable.
	FlushableIndex int
	// The level within the LSM. Only valid if Kind == IteratorLevelLSM.
	Level int
	// Sublevel is only valid if Kind == IteratorLevelLSM and Level == 0.
	Sublevel int
}
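
// Illustrative sketch (an editorial addition, not part of the original file):
// a visitPointKey callback can use the IteratorLevel argument to attribute
// each point key to its source:
//
//	visitPointKey := func(key *InternalKey, value LazyValue, info IteratorLevel) error {
//		switch info.Kind {
//		case IteratorLevelFlushable:
//			// key was read from the memtable at queue position info.FlushableIndex.
//		case IteratorLevelLSM:
//			// key was read from LSM level info.Level (sublevel info.Sublevel in L0).
//		}
//		return nil
//	}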

// scanInternalIterator is an iterator that returns all internal keys, including
// tombstones. For instance, an InternalKeyKindDelete would be returned as an
// InternalKeyKindDelete instead of the iterator skipping over to the next key.
// Internal keys within a user key are collapsed, e.g. if there are two SETs, the
// one with the higher sequence number is returned. Useful if an external user of
// Pebble needs to observe and rebuild Pebble's history of internal keys, such as
// in node-to-node replication. For use with {db,snapshot}.ScanInternal().
//
// scanInternalIterator is expected to ignore point keys deleted by range
// deletions, and range keys shadowed by a range key unset or delete; however it
// *must* return the range delete as well as the range key unset/delete that did
// the shadowing.
type scanInternalIterator struct {
	ctx             context.Context
	db              *DB
	opts            scanInternalOptions
	comparer        *base.Comparer
	merge           Merge
	iter            internalIterator
	readState       *readState
	version         *version
	rangeKey        *iteratorRangeKeyState
	pointKeyIter    internalIterator
	iterKey         *InternalKey
	iterValue       LazyValue
	alloc           *iterAlloc
	newIters        tableNewIters
	newIterRangeKey keyspan.TableNewSpanIter
	seqNum          uint64
	iterLevels      []IteratorLevel
	mergingIter     *mergingIter

	// boundsBuf holds two buffers used to store the lower and upper bounds.
	// Whenever the InternalIterator's bounds change, the new bounds are copied
	// into boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
	// allocations. opts.LowerBound and opts.UpperBound point into this slice.
	boundsBuf    [2][]byte
	boundsBufIdx int
}
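
// Note on boundsBuf (an editorial addition): initializeBoundBufs at the bottom
// of this file copies lower and upper into boundsBuf[boundsBufIdx] back to
// back, points opts.LowerBound/opts.UpperBound into that buffer, and then
// flips boundsBufIdx, so the next bounds change writes into the other buffer
// while the previous bounds remain untouched.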

// truncateSharedFile truncates a shared file's [Smallest, Largest] fields to
// [lower, upper), potentially opening iterators on the file to find keys within
// the requested bounds. A SharedSSTMeta is produced that is suitable for
// external consumption by other Pebble instances. If shouldSkip is true, this
// file does not contain any keys in [lower, upper) and can be skipped.
//
// TODO(bilal): If opening iterators and doing reads in this method is too
// inefficient, consider producing non-tight file bounds instead.
func (d *DB) truncateSharedFile(
	ctx context.Context,
	lower, upper []byte,
	level int,
	file *fileMetadata,
	objMeta objstorage.ObjectMetadata,
) (sst *SharedSSTMeta, shouldSkip bool, err error) {
	cmp := d.cmp
	sst = &SharedSSTMeta{}
	sst.cloneFromFileMeta(file)
	sst.Level = uint8(level)
	sst.Backing, err = d.objProvider.RemoteObjectBacking(&objMeta)
	if err != nil {
		return nil, false, err
	}
	needsLowerTruncate := cmp(lower, file.Smallest.UserKey) > 0
	needsUpperTruncate := cmp(upper, file.Largest.UserKey) < 0 ||
		(cmp(upper, file.Largest.UserKey) == 0 && !file.Largest.IsExclusiveSentinel())
	// Fast path: file is entirely within [lower, upper).
	if !needsLowerTruncate && !needsUpperTruncate {
		return sst, false, nil
	}

	// We will need to truncate file bounds in at least one direction. Open all
	// relevant iterators.
	iter, rangeDelIter, err := d.newIters(ctx, file, &IterOptions{
		LowerBound: lower,
		UpperBound: upper,
		level:      manifest.Level(level),
	}, internalIterOpts{})
	if err != nil {
		return nil, false, err
	}
	defer iter.Close()
	if rangeDelIter != nil {
		rangeDelIter = keyspan.Truncate(
			cmp, rangeDelIter, lower, upper, nil, nil,
			false, /* panicOnUpperTruncate */
		)
		defer rangeDelIter.Close()
	}
	rangeKeyIter, err := d.tableNewRangeKeyIter(file, keyspan.SpanIterOptions{})
	if err != nil {
		return nil, false, err
	}
	if rangeKeyIter != nil {
		rangeKeyIter = keyspan.Truncate(
			cmp, rangeKeyIter, lower, upper, nil, nil,
			false, /* panicOnUpperTruncate */
		)
		defer rangeKeyIter.Close()
	}
	// Check if we need to truncate on the left side. This means finding a new
	// SmallestPointKey and SmallestRangeKey that are >= lower.
	if needsLowerTruncate {
		sst.SmallestPointKey.UserKey = sst.SmallestPointKey.UserKey[:0]
		sst.SmallestPointKey.Trailer = 0
		key, _ := iter.SeekGE(lower, base.SeekGEFlagsNone)
		foundPointKey := key != nil
		if key != nil {
			sst.SmallestPointKey.CopyFrom(*key)
		}
		if rangeDelIter != nil {
			span := rangeDelIter.SeekGE(lower)
			if span != nil && (len(sst.SmallestPointKey.UserKey) == 0 ||
				base.InternalCompare(cmp, span.SmallestKey(), sst.SmallestPointKey) < 0) {
				sst.SmallestPointKey.CopyFrom(span.SmallestKey())
				foundPointKey = true
			}
		}
		if !foundPointKey {
			// There are no point keys in the span we're interested in.
			sst.SmallestPointKey = InternalKey{}
			sst.LargestPointKey = InternalKey{}
		}
		sst.SmallestRangeKey.UserKey = sst.SmallestRangeKey.UserKey[:0]
		sst.SmallestRangeKey.Trailer = 0
		if rangeKeyIter != nil {
			span := rangeKeyIter.SeekGE(lower)
			if span != nil {
				sst.SmallestRangeKey.CopyFrom(span.SmallestKey())
			} else {
				// There are no range keys in the span we're interested in.
				sst.SmallestRangeKey = InternalKey{}
				sst.LargestRangeKey = InternalKey{}
			}
		}
	}
	// Check if we need to truncate on the right side. This means finding a new
	// LargestPointKey and LargestRangeKey that are < upper.
	if needsUpperTruncate {
		sst.LargestPointKey.UserKey = sst.LargestPointKey.UserKey[:0]
		sst.LargestPointKey.Trailer = 0
		key, _ := iter.SeekLT(upper, base.SeekLTFlagsNone)
		foundPointKey := key != nil
		if key != nil {
			sst.LargestPointKey.CopyFrom(*key)
		}
		if rangeDelIter != nil {
			span := rangeDelIter.SeekLT(upper)
			if span != nil && (len(sst.LargestPointKey.UserKey) == 0 ||
				base.InternalCompare(cmp, span.LargestKey(), sst.LargestPointKey) > 0) {
				sst.LargestPointKey.CopyFrom(span.LargestKey())
				foundPointKey = true
			}
		}
		if !foundPointKey {
			// There are no point keys in the span we're interested in.
			sst.SmallestPointKey = InternalKey{}
			sst.LargestPointKey = InternalKey{}
		}
		sst.LargestRangeKey.UserKey = sst.LargestRangeKey.UserKey[:0]
		sst.LargestRangeKey.Trailer = 0
		if rangeKeyIter != nil {
			span := rangeKeyIter.SeekLT(upper)
			if span != nil {
				sst.LargestRangeKey.CopyFrom(span.LargestKey())
			} else {
				// There are no range keys in the span we're interested in.
				sst.SmallestRangeKey = InternalKey{}
				sst.LargestRangeKey = InternalKey{}
			}
		}
	}
	// Set overall bounds based on {Smallest,Largest}{Point,Range}Key.
	switch {
	case len(sst.SmallestRangeKey.UserKey) == 0:
		sst.Smallest = sst.SmallestPointKey
	case len(sst.SmallestPointKey.UserKey) == 0:
		sst.Smallest = sst.SmallestRangeKey
	default:
		sst.Smallest = sst.SmallestPointKey
		if base.InternalCompare(cmp, sst.SmallestRangeKey, sst.SmallestPointKey) < 0 {
			sst.Smallest = sst.SmallestRangeKey
		}
	}
	switch {
	case len(sst.LargestRangeKey.UserKey) == 0:
		sst.Largest = sst.LargestPointKey
	case len(sst.LargestPointKey.UserKey) == 0:
		sst.Largest = sst.LargestRangeKey
	default:
		sst.Largest = sst.LargestPointKey
		if base.InternalCompare(cmp, sst.LargestRangeKey, sst.LargestPointKey) > 0 {
			sst.Largest = sst.LargestRangeKey
		}
	}
	// On rare occasion, a file might overlap with [lower, upper) but not actually
	// have any keys within those bounds. Skip such files.
	if len(sst.Smallest.UserKey) == 0 {
		return nil, true, nil
	}
	sst.Size, err = d.tableCache.estimateSize(file, sst.Smallest.UserKey, sst.Largest.UserKey)
	if err != nil {
		return nil, false, err
	}
	// On occasion, estimateSize gives us a low estimate, i.e. a 0 file size. This
	// can cause panics in places where we divide by file sizes. Correct for it
	// here.
	if sst.Size == 0 {
		sst.Size = 1
	}
	return sst, false, nil
}
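
// Worked example (an editorial addition): for a shared file with bounds
// [a, z] and a scan over [c, m), truncateSharedFile seeks the file's point,
// rangedel, and range-key iterators to find the smallest entry >= c and the
// largest entry < m, then narrows the per-kind bounds and the overall
// [Smallest, Largest] accordingly. If nothing falls inside [c, m), it returns
// shouldSkip=true and the file is dropped from the shared-file visit.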

func scanInternalImpl(
	ctx context.Context, lower, upper []byte, iter *scanInternalIterator, opts *scanInternalOptions,
) error {
	if opts.visitSharedFile != nil && (lower == nil || upper == nil) {
		panic("lower and upper bounds must be specified in skip-shared iteration mode")
	}
	// Before starting iteration, check if any files in levels sharedLevelsStart
	// and below are *not* shared. Error out if that is the case, as skip-shared
	// iteration will not produce a consistent point-in-time view of this range
	// of keys. For files that are shared, call visitSharedFile with a truncated
	// version of that file.
	cmp := iter.comparer.Compare
	provider := iter.db.ObjProvider()
	seqNum := iter.seqNum
	current := iter.version
	if current == nil {
		current = iter.readState.current
	}
	if opts.visitSharedFile != nil {
		if provider == nil {
			panic("expected non-nil Provider in skip-shared iteration mode")
		}
		for level := sharedLevelsStart; level < numLevels; level++ {
			files := current.Levels[level].Iter()
			for f := files.SeekGE(cmp, lower); f != nil && cmp(f.Smallest.UserKey, upper) < 0; f = files.Next() {
				var objMeta objstorage.ObjectMetadata
				var err error
				objMeta, err = provider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
				if err != nil {
					return err
				}
				if !objMeta.IsShared() {
					return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s is not shared", objMeta.DiskFileNum)
				}
				if !base.Visible(f.LargestSeqNum, seqNum, base.InternalKeySeqNumMax) {
					return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s contains keys newer than snapshot", objMeta.DiskFileNum)
				}
				var sst *SharedSSTMeta
				var skip bool
				sst, skip, err = iter.db.truncateSharedFile(ctx, lower, upper, level, f, objMeta)
				if err != nil {
					return err
				}
				if skip {
					continue
				}
				if err = opts.visitSharedFile(sst); err != nil {
					return err
				}
			}
		}
	}
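	// NB (an editorial addition): the shared-file loop above visits files in
	// level order (L5, then L6) and in ascending key order within each level,
	// so the visitSharedFile callback observes them in a deterministic order.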

	for valid := iter.seekGE(lower); valid && iter.error() == nil; valid = iter.next() {
		key := iter.unsafeKey()

		if opts.rateLimitFunc != nil {
			if err := opts.rateLimitFunc(key, iter.lazyValue()); err != nil {
				return err
			}
		}

		switch key.Kind() {
		case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet:
			if opts.visitRangeKey != nil {
				span := iter.unsafeSpan()
				// NB: The caller isn't interested in the sequence numbers of these
				// range keys. Rather, the caller wants them to be in trailer order
				// _after_ zeroing of sequence numbers. Copy span.Keys, sort it, and
				// then call visitRangeKey.
				keysCopy := make([]keyspan.Key, len(span.Keys))
				for i := range span.Keys {
					keysCopy[i] = span.Keys[i]
					keysCopy[i].Trailer = base.MakeTrailer(0, span.Keys[i].Kind())
				}
				keyspan.SortKeysByTrailer(&keysCopy)
				if err := opts.visitRangeKey(span.Start, span.End, keysCopy); err != nil {
					return err
				}
			}
		case InternalKeyKindRangeDelete:
			if opts.visitRangeDel != nil {
				rangeDel := iter.unsafeRangeDel()
				if err := opts.visitRangeDel(rangeDel.Start, rangeDel.End, rangeDel.LargestSeqNum()); err != nil {
					return err
				}
			}
		default:
			if opts.visitPointKey != nil {
				var info IteratorLevel
				if len(iter.mergingIter.heap.items) > 0 {
					mergingIterIdx := iter.mergingIter.heap.items[0].index
					info = iter.iterLevels[mergingIterIdx]
				} else {
					info = IteratorLevel{Kind: IteratorLevelUnknown}
				}
				val := iter.lazyValue()
				if err := opts.visitPointKey(key, val, info); err != nil {
					return err
				}
			}
		}
	}

	return nil
}
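
// Worked example of the range-key handling above (an editorial addition): a
// span [c, f) carrying keys {#9,RANGEKEYSET, #7,RANGEKEYUNSET} is copied and
// surfaced to visitRangeKey as {#0,RANGEKEYSET, #0,RANGEKEYUNSET}; with all
// sequence numbers zeroed, SortKeysByTrailer leaves the keys ordered by kind
// alone, which is the trailer order the caller expects.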

// constructPointIter constructs a merging iterator and sets i.iter to it.
func (i *scanInternalIterator) constructPointIter(
	categoryAndQoS sstable.CategoryAndQoS, memtables flushableList, buf *iterAlloc,
) {
	// Merging levels and levels from iterAlloc.
	mlevels := buf.mlevels[:0]
	levels := buf.levels[:0]

	// We compute the number of levels needed ahead of time and reallocate a slice if
	// the array from the iterAlloc isn't large enough. Doing this allocation once
	// should improve performance.
	numMergingLevels := len(memtables)
	numLevelIters := 0

	current := i.version
	if current == nil {
		current = i.readState.current
	}
	numMergingLevels += len(current.L0SublevelFiles)
	numLevelIters += len(current.L0SublevelFiles)

	for level := 1; level < len(current.Levels); level++ {
		if current.Levels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		numMergingLevels++
		numLevelIters++
	}

	if numMergingLevels > cap(mlevels) {
		mlevels = make([]mergingIterLevel, 0, numMergingLevels)
	}
	if numLevelIters > cap(levels) {
		levels = make([]levelIter, 0, numLevelIters)
	}
	// TODO(bilal): Push these into the iterAlloc buf.
	var rangeDelMiter keyspan.MergingIter
	rangeDelIters := make([]keyspan.FragmentIterator, 0, numMergingLevels)
	rangeDelLevels := make([]keyspan.LevelIter, 0, numLevelIters)

	i.iterLevels = make([]IteratorLevel, numMergingLevels)
	mlevelsIndex := 0

	// Next are the memtables.
	for j := len(memtables) - 1; j >= 0; j-- {
		mem := memtables[j]
		mlevels = append(mlevels, mergingIterLevel{
			iter: mem.newIter(&i.opts.IterOptions),
		})
		i.iterLevels[mlevelsIndex] = IteratorLevel{
			Kind:           IteratorLevelFlushable,
			FlushableIndex: j,
		}
		mlevelsIndex++
		if rdi := mem.newRangeDelIter(&i.opts.IterOptions); rdi != nil {
			rangeDelIters = append(rangeDelIters, rdi)
		}
	}

	// Next are the file levels: L0 sub-levels followed by lower levels.
	levelsIndex := len(levels)
	mlevels = mlevels[:numMergingLevels]
	levels = levels[:numLevelIters]
	rangeDelLevels = rangeDelLevels[:numLevelIters]
	i.opts.IterOptions.snapshotForHideObsoletePoints = i.seqNum
	i.opts.IterOptions.CategoryAndQoS = categoryAndQoS
	addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) {
		li := &levels[levelsIndex]
		rli := &rangeDelLevels[levelsIndex]

		li.init(
			i.ctx, i.opts.IterOptions, i.comparer, i.newIters, files, level,
			internalIterOpts{})
		li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext)
		mlevels[mlevelsIndex].iter = li
		rli.Init(keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters},
			i.comparer.Compare, tableNewRangeDelIter(i.ctx, i.newIters), files, level,
			manifest.KeyTypePoint)
		rangeDelIters = append(rangeDelIters, rli)

		levelsIndex++
		mlevelsIndex++
	}

	for j := len(current.L0SublevelFiles) - 1; j >= 0; j-- {
		i.iterLevels[mlevelsIndex] = IteratorLevel{
			Kind:     IteratorLevelLSM,
			Level:    0,
			Sublevel: j,
		}
		addLevelIterForFiles(current.L0SublevelFiles[j].Iter(), manifest.L0Sublevel(j))
	}
	// Add level iterators for the non-empty non-L0 levels.
	for level := 1; level < numLevels; level++ {
		if current.Levels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		i.iterLevels[mlevelsIndex] = IteratorLevel{Kind: IteratorLevelLSM, Level: level}
		addLevelIterForFiles(current.Levels[level].Iter(), manifest.Level(level))
	}

	buf.merging.init(&i.opts.IterOptions, &InternalIteratorStats{}, i.comparer.Compare, i.comparer.Split, mlevels...)
	buf.merging.snapshot = i.seqNum
	rangeDelMiter.Init(i.comparer.Compare, keyspan.VisibleTransform(i.seqNum), new(keyspan.MergingBuffers), rangeDelIters...)

	if i.opts.includeObsoleteKeys {
		iiter := &keyspan.InterleavingIter{}
		iiter.Init(i.comparer, &buf.merging, &rangeDelMiter,
			keyspan.InterleavingIterOpts{
				LowerBound: i.opts.LowerBound,
				UpperBound: i.opts.UpperBound,
			})
		i.pointKeyIter = iiter
	} else {
		pcIter := &pointCollapsingIterator{
			comparer: i.comparer,
			merge:    i.merge,
			seqNum:   i.seqNum,
		}
		pcIter.iter.Init(i.comparer, &buf.merging, &rangeDelMiter, keyspan.InterleavingIterOpts{
			LowerBound: i.opts.LowerBound,
			UpperBound: i.opts.UpperBound,
		})
		i.pointKeyIter = pcIter
	}
	i.iter = i.pointKeyIter
}
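
// Note (an editorial addition): the branch at the end of constructPointIter
// chooses the point iterator stack. With includeObsoleteKeys set, a bare
// keyspan.InterleavingIter interleaves range deletes with the merged point
// keys without collapsing anything; otherwise the same stack is wrapped in a
// pointCollapsingIterator, which keeps only the newest internal key per user
// key.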

// constructRangeKeyIter constructs the range-key iterator stack, populating
// i.rangeKey.rangeKeyIter with the resulting iterator. This is similar to
// Iterator.constructRangeKeyIter, except it doesn't handle batches and ensures
// iterConfig does *not* elide unsets/deletes.
func (i *scanInternalIterator) constructRangeKeyIter() error {
	// We want the bounded iter from iterConfig, but not the collapsing of
	// RangeKeyUnsets and RangeKeyDels.
	i.rangeKey.rangeKeyIter = i.rangeKey.iterConfig.Init(
		i.comparer, i.seqNum, i.opts.LowerBound, i.opts.UpperBound,
		nil /* hasPrefix */, nil /* prefix */, true, /* internalKeys */
		&i.rangeKey.rangeKeyBuffers.internal)

	// Next are the flushables: memtables and large batches.
	if i.readState != nil {
		for j := len(i.readState.memtables) - 1; j >= 0; j-- {
			mem := i.readState.memtables[j]
			// We only need to read from memtables which contain sequence numbers older
			// than seqNum.
			if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum {
				continue
			}
			if rki := mem.newRangeKeyIter(&i.opts.IterOptions); rki != nil {
				i.rangeKey.iterConfig.AddLevel(rki)
			}
		}
	}

	current := i.version
	if current == nil {
		current = i.readState.current
	}
	// Next are the file levels: L0 sub-levels followed by lower levels.
	//
	// Add file-specific iterators for L0 files containing range keys. This is less
	// efficient than using levelIters for sublevels of L0 files containing
	// range keys, but range keys are expected to be sparse anyway, reducing the
	// cost benefit of maintaining a separate L0Sublevels instance for range key
	// files and then using it here.
	//
	// NB: We iterate L0's files in reverse order. They're sorted by
	// LargestSeqNum ascending, and we need to add them to the merging iterator
	// in LargestSeqNum descending to preserve the merging iterator's invariants
	// around Key Trailer order.
	iter := current.RangeKeyLevels[0].Iter()
	for f := iter.Last(); f != nil; f = iter.Prev() {
		spanIter, err := i.newIterRangeKey(f, i.opts.SpanIterOptions())
		if err != nil {
			return err
		}
		i.rangeKey.iterConfig.AddLevel(spanIter)
	}

	// Add level iterators for the non-empty non-L0 levels.
	for level := 1; level < len(current.RangeKeyLevels); level++ {
		if current.RangeKeyLevels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		li := i.rangeKey.iterConfig.NewLevelIter()
		spanIterOpts := i.opts.SpanIterOptions()
		li.Init(spanIterOpts, i.comparer.Compare, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(),
			manifest.Level(level), manifest.KeyTypeRange)
		i.rangeKey.iterConfig.AddLevel(li)
	}
	return nil
}

// seekGE seeks this iterator to the first key that's greater than or equal
// to the specified user key.
func (i *scanInternalIterator) seekGE(key []byte) bool {
	i.iterKey, i.iterValue = i.iter.SeekGE(key, base.SeekGEFlagsNone)
	return i.iterKey != nil
}

// unsafeKey returns the unsafe InternalKey at the current position. The value
// is nil if the iterator is invalid or exhausted.
func (i *scanInternalIterator) unsafeKey() *InternalKey {
	return i.iterKey
}

// lazyValue returns a value pointer to the value at the current iterator
// position. Behaviour undefined if unsafeKey() returns a Range key or Rangedel
// kind key.
func (i *scanInternalIterator) lazyValue() LazyValue {
	return i.iterValue
}

// unsafeRangeDel returns a range del span. Behaviour undefined if unsafeKey()
// returns a non-rangedel kind key.
func (i *scanInternalIterator) unsafeRangeDel() *keyspan.Span {
	type spanInternalIterator interface {
		Span() *keyspan.Span
	}
	return i.pointKeyIter.(spanInternalIterator).Span()
}

// unsafeSpan returns a range key span. Behaviour undefined if unsafeKey()
// returns a non-rangekey kind key.
func (i *scanInternalIterator) unsafeSpan() *keyspan.Span {
	return i.rangeKey.iiter.Span()
}

// next advances the iterator in the forward direction, and returns the
// iterator's new validity state.
func (i *scanInternalIterator) next() bool {
	i.iterKey, i.iterValue = i.iter.Next()
	return i.iterKey != nil
}
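
// Illustrative sketch (an editorial addition, not part of the original file):
// these unexported accessors are driven by scanInternalImpl in a loop of the
// following shape, with unsafeKey only valid until the next positioning call:
//
//	for valid := it.seekGE(lower); valid && it.error() == nil; valid = it.next() {
//		key := it.unsafeKey()
//		_ = key // dispatch on key.Kind()
//	}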

// error returns an error from the internal iterator, if there's any.
func (i *scanInternalIterator) error() error {
	return i.iter.Error()
}

// close closes this iterator, and releases any pooled objects.
func (i *scanInternalIterator) close() error {
	if err := i.iter.Close(); err != nil {
		return err
	}
	if i.readState != nil {
		i.readState.unref()
	}
	if i.version != nil {
		i.version.Unref()
	}
	if i.rangeKey != nil {
		i.rangeKey.PrepareForReuse()
		*i.rangeKey = iteratorRangeKeyState{
			rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
		}
		iterRangeKeyStateAllocPool.Put(i.rangeKey)
		i.rangeKey = nil
	}
	if alloc := i.alloc; alloc != nil {
		for j := range i.boundsBuf {
			if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize {
				alloc.boundsBuf[j] = nil
			} else {
				alloc.boundsBuf[j] = i.boundsBuf[j]
			}
		}
		*alloc = iterAlloc{
			keyBuf:              alloc.keyBuf[:0],
			boundsBuf:           alloc.boundsBuf,
			prefixOrFullSeekKey: alloc.prefixOrFullSeekKey[:0],
		}
		iterAllocPool.Put(alloc)
		i.alloc = nil
	}
	return nil
}

func (i *scanInternalIterator) initializeBoundBufs(lower, upper []byte) {
	buf := i.boundsBuf[i.boundsBufIdx][:0]
	if lower != nil {
		buf = append(buf, lower...)
		i.opts.LowerBound = buf
	} else {
		i.opts.LowerBound = nil
	}
	if upper != nil {
		buf = append(buf, upper...)
		i.opts.UpperBound = buf[len(buf)-len(upper):]
	} else {
		i.opts.UpperBound = nil
	}
	i.boundsBuf[i.boundsBufIdx] = buf
	i.boundsBufIdx = 1 - i.boundsBufIdx
}
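
// Worked example (an editorial addition): initializeBoundBufs with lower="a"
// and upper="m" appends both bounds into one buffer holding "am", points
// opts.LowerBound at the first byte and opts.UpperBound at the remaining
// bytes, and flips boundsBufIdx so a subsequent call builds its bounds in the
// other buffer.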