github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/external_iterator.go (about) 1 // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "context" 9 "fmt" 10 "sort" 11 12 "github.com/cockroachdb/errors" 13 "github.com/cockroachdb/pebble/internal/base" 14 "github.com/cockroachdb/pebble/internal/keyspan" 15 "github.com/cockroachdb/pebble/internal/manifest" 16 "github.com/cockroachdb/pebble/sstable" 17 ) 18 19 // ExternalIterOption provide an interface to specify open-time options to 20 // NewExternalIter. 21 type ExternalIterOption interface { 22 // iterApply is called on the iterator during opening in order to set internal 23 // parameters. 24 iterApply(*Iterator) 25 // readerOptions returns any reader options added by this iter option. 26 readerOptions() []sstable.ReaderOption 27 } 28 29 type externalIterReaderOptions struct { 30 opts []sstable.ReaderOption 31 } 32 33 func (e *externalIterReaderOptions) iterApply(iterator *Iterator) { 34 // Do nothing. 35 } 36 37 func (e *externalIterReaderOptions) readerOptions() []sstable.ReaderOption { 38 return e.opts 39 } 40 41 // ExternalIterReaderOptions returns an ExternalIterOption that specifies 42 // sstable.ReaderOptions to be applied on sstable readers in NewExternalIter. 43 func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption { 44 return &externalIterReaderOptions{opts: opts} 45 } 46 47 // ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator 48 // will only be used for forward positioning operations (First, SeekGE, Next). 49 // This could enable optimizations that take advantage of this invariant. 50 // Behaviour when a reverse positioning operation is done on an iterator 51 // opened with this option is unpredictable, though in most cases it should. 52 type ExternalIterForwardOnly struct{} 53 54 func (e ExternalIterForwardOnly) iterApply(iter *Iterator) { 55 iter.forwardOnly = true 56 } 57 58 func (e ExternalIterForwardOnly) readerOptions() []sstable.ReaderOption { 59 return nil 60 } 61 62 // NewExternalIter takes an input 2d array of sstable files which may overlap 63 // across subarrays but not within a subarray (at least as far as points are 64 // concerned; range keys are allowed to overlap arbitrarily even within a 65 // subarray), and returns an Iterator over the merged contents of the sstables. 66 // Input sstables may contain point keys, range keys, range deletions, etc. The 67 // input files slice must be sorted in reverse chronological ordering. A key in a 68 // file at a lower index subarray will shadow a key with an identical user key 69 // contained within a file at a higher index subarray. Each subarray must be 70 // sorted in internal key order, where lower index files contain keys that sort 71 // left of files with higher indexes. 72 // 73 // Input sstables must only contain keys with the zero sequence number. 74 // 75 // Iterators constructed through NewExternalIter do not support all iterator 76 // options, including block-property and table filters. NewExternalIter errors 77 // if an incompatible option is set. 78 func NewExternalIter( 79 o *Options, 80 iterOpts *IterOptions, 81 files [][]sstable.ReadableFile, 82 extraOpts ...ExternalIterOption, 83 ) (it *Iterator, err error) { 84 return NewExternalIterWithContext(context.Background(), o, iterOpts, files, extraOpts...) 85 } 86 87 // NewExternalIterWithContext is like NewExternalIter, and additionally 88 // accepts a context for tracing. 89 func NewExternalIterWithContext( 90 ctx context.Context, 91 o *Options, 92 iterOpts *IterOptions, 93 files [][]sstable.ReadableFile, 94 extraOpts ...ExternalIterOption, 95 ) (it *Iterator, err error) { 96 if iterOpts != nil { 97 if err := validateExternalIterOpts(iterOpts); err != nil { 98 return nil, err 99 } 100 } 101 102 var readers [][]*sstable.Reader 103 104 // Ensure we close all the opened readers if we error out. 105 defer func() { 106 if err != nil { 107 for i := range readers { 108 for j := range readers[i] { 109 _ = readers[i][j].Close() 110 } 111 } 112 } 113 }() 114 seqNumOffset := 0 115 var extraReaderOpts []sstable.ReaderOption 116 for i := range extraOpts { 117 extraReaderOpts = append(extraReaderOpts, extraOpts[i].readerOptions()...) 118 } 119 for _, levelFiles := range files { 120 seqNumOffset += len(levelFiles) 121 } 122 for _, levelFiles := range files { 123 var subReaders []*sstable.Reader 124 seqNumOffset -= len(levelFiles) 125 subReaders, err = openExternalTables(o, levelFiles, seqNumOffset, o.MakeReaderOptions(), extraReaderOpts...) 126 readers = append(readers, subReaders) 127 } 128 if err != nil { 129 return nil, err 130 } 131 132 buf := iterAllocPool.Get().(*iterAlloc) 133 dbi := &buf.dbi 134 *dbi = Iterator{ 135 ctx: ctx, 136 alloc: buf, 137 merge: o.Merger.Merge, 138 comparer: *o.Comparer, 139 readState: nil, 140 keyBuf: buf.keyBuf, 141 prefixOrFullSeekKey: buf.prefixOrFullSeekKey, 142 boundsBuf: buf.boundsBuf, 143 batch: nil, 144 // Add the readers to the Iterator so that Close closes them, and 145 // SetOptions can re-construct iterators from them. 146 externalReaders: readers, 147 newIters: func( 148 ctx context.Context, f *manifest.FileMetadata, opts *IterOptions, 149 internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) { 150 // NB: External iterators are currently constructed without any 151 // `levelIters`. newIters should never be called. When we support 152 // organizing multiple non-overlapping files into a single level 153 // (see TODO below), we'll need to adjust this tableNewIters 154 // implementation to open iterators by looking up f in a map 155 // of readers indexed by *fileMetadata. 156 panic("unreachable") 157 }, 158 seqNum: base.InternalKeySeqNumMax, 159 } 160 if iterOpts != nil { 161 dbi.opts = *iterOpts 162 dbi.processBounds(iterOpts.LowerBound, iterOpts.UpperBound) 163 } 164 for i := range extraOpts { 165 extraOpts[i].iterApply(dbi) 166 } 167 finishInitializingExternal(ctx, dbi) 168 return dbi, nil 169 } 170 171 func validateExternalIterOpts(iterOpts *IterOptions) error { 172 switch { 173 case iterOpts.TableFilter != nil: 174 return errors.Errorf("pebble: external iterator: TableFilter unsupported") 175 case iterOpts.PointKeyFilters != nil: 176 return errors.Errorf("pebble: external iterator: PointKeyFilters unsupported") 177 case iterOpts.RangeKeyFilters != nil: 178 return errors.Errorf("pebble: external iterator: RangeKeyFilters unsupported") 179 case iterOpts.OnlyReadGuaranteedDurable: 180 return errors.Errorf("pebble: external iterator: OnlyReadGuaranteedDurable unsupported") 181 case iterOpts.UseL6Filters: 182 return errors.Errorf("pebble: external iterator: UseL6Filters unsupported") 183 } 184 return nil 185 } 186 187 func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterator, error) { 188 // TODO(jackson): In some instances we could generate fewer levels by using 189 // L0Sublevels code to organize nonoverlapping files into the same level. 190 // This would allow us to use levelIters and keep a smaller set of data and 191 // files in-memory. However, it would also require us to identify the bounds 192 // of all the files upfront. 193 194 if !it.opts.pointKeys() { 195 return emptyIter, nil 196 } else if it.pointIter != nil { 197 return it.pointIter, nil 198 } 199 mlevels := it.alloc.mlevels[:0] 200 201 if len(it.externalReaders) > cap(mlevels) { 202 mlevels = make([]mergingIterLevel, 0, len(it.externalReaders)) 203 } 204 for _, readers := range it.externalReaders { 205 var combinedIters []internalIterator 206 for _, r := range readers { 207 var ( 208 rangeDelIter keyspan.FragmentIterator 209 pointIter internalIterator 210 err error 211 ) 212 // We could set hideObsoletePoints=true, since we are reading at 213 // InternalKeySeqNumMax, but we don't bother since these sstables should 214 // not have obsolete points (so the performance optimization is 215 // unnecessary), and we don't want to bother constructing a 216 // BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter. 217 pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc( 218 ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */ 219 false /* hideObsoletePoints */, false, /* useFilterBlock */ 220 &it.stats.InternalStats, sstable.TrivialReaderProvider{Reader: r}) 221 if err != nil { 222 return nil, err 223 } 224 rangeDelIter, err = r.NewRawRangeDelIter() 225 if err != nil { 226 return nil, err 227 } 228 if rangeDelIter == nil && pointIter != nil && it.forwardOnly { 229 // TODO(bilal): Consider implementing range key pausing in 230 // simpleLevelIter so we can reduce mergingIterLevels even more by 231 // sending all sstable iterators to combinedIters, not just those 232 // corresponding to sstables without range deletes. 233 combinedIters = append(combinedIters, pointIter) 234 continue 235 } 236 mlevels = append(mlevels, mergingIterLevel{ 237 iter: pointIter, 238 rangeDelIter: rangeDelIter, 239 }) 240 } 241 if len(combinedIters) == 1 { 242 mlevels = append(mlevels, mergingIterLevel{ 243 iter: combinedIters[0], 244 }) 245 } else if len(combinedIters) > 1 { 246 sli := &simpleLevelIter{ 247 cmp: it.cmp, 248 iters: combinedIters, 249 } 250 sli.init(it.opts) 251 mlevels = append(mlevels, mergingIterLevel{ 252 iter: sli, 253 rangeDelIter: nil, 254 }) 255 } 256 } 257 if len(mlevels) == 1 && mlevels[0].rangeDelIter == nil { 258 // Set closePointIterOnce to true. This is because we're bypassing the 259 // merging iter, which turns Close()s on it idempotent for any child 260 // iterators. The outer Iterator could call Close() on a point iter twice, 261 // which sstable iterators do not support (as they release themselves to 262 // a pool). 263 it.closePointIterOnce = true 264 return mlevels[0].iter, nil 265 } 266 267 it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...) 268 it.alloc.merging.snapshot = base.InternalKeySeqNumMax 269 if len(mlevels) <= cap(it.alloc.levelsPositioned) { 270 it.alloc.merging.levelsPositioned = it.alloc.levelsPositioned[:len(mlevels)] 271 } 272 return &it.alloc.merging, nil 273 } 274 275 func finishInitializingExternal(ctx context.Context, it *Iterator) { 276 pointIter, err := createExternalPointIter(ctx, it) 277 if err != nil { 278 it.pointIter = &errorIter{err: err} 279 } else { 280 it.pointIter = pointIter 281 } 282 it.iter = it.pointIter 283 284 if it.opts.rangeKeys() { 285 it.rangeKeyMasking.init(it, it.comparer.Compare, it.comparer.Split) 286 var rangeKeyIters []keyspan.FragmentIterator 287 if it.rangeKey == nil { 288 // We could take advantage of the lack of overlaps in range keys within 289 // each slice in it.externalReaders, and generate keyspan.LevelIters 290 // out of those. However, since range keys are expected to be sparse to 291 // begin with, the performance gain might not be significant enough to 292 // warrant it. 293 // 294 // TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not 295 // operate on FileMetadatas (similar to simpleLevelIter), and implements 296 // this optimization. 297 for _, readers := range it.externalReaders { 298 for _, r := range readers { 299 if rki, err := r.NewRawRangeKeyIter(); err != nil { 300 rangeKeyIters = append(rangeKeyIters, &errorKeyspanIter{err: err}) 301 } else if rki != nil { 302 rangeKeyIters = append(rangeKeyIters, rki) 303 } 304 } 305 } 306 if len(rangeKeyIters) > 0 { 307 it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) 308 it.rangeKey.init(it.comparer.Compare, it.comparer.Split, &it.opts) 309 it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init( 310 &it.comparer, 311 base.InternalKeySeqNumMax, 312 it.opts.LowerBound, it.opts.UpperBound, 313 &it.hasPrefix, &it.prefixOrFullSeekKey, 314 false /* internalKeys */, &it.rangeKey.internal, 315 ) 316 for i := range rangeKeyIters { 317 it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i]) 318 } 319 } 320 } 321 if it.rangeKey != nil { 322 it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter, 323 keyspan.InterleavingIterOpts{ 324 Mask: &it.rangeKeyMasking, 325 LowerBound: it.opts.LowerBound, 326 UpperBound: it.opts.UpperBound, 327 }) 328 it.iter = &it.rangeKey.iiter 329 } 330 } 331 } 332 333 func openExternalTables( 334 o *Options, 335 files []sstable.ReadableFile, 336 seqNumOffset int, 337 readerOpts sstable.ReaderOptions, 338 extraReaderOpts ...sstable.ReaderOption, 339 ) (readers []*sstable.Reader, err error) { 340 readers = make([]*sstable.Reader, 0, len(files)) 341 for i := range files { 342 readable, err := sstable.NewSimpleReadable(files[i]) 343 if err != nil { 344 return readers, err 345 } 346 r, err := sstable.NewReader(readable, readerOpts, extraReaderOpts...) 347 if err != nil { 348 return readers, err 349 } 350 // Use the index of the file in files as the sequence number for all of 351 // its keys. 352 r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset) 353 readers = append(readers, r) 354 } 355 return readers, err 356 } 357 358 // simpleLevelIter is similar to a levelIter in that it merges the points 359 // from multiple point iterators that are non-overlapping in the key ranges 360 // they return. It is only expected to support forward iteration and forward 361 // regular seeking; reverse iteration and prefix seeking is not supported. 362 // Intended to be a low-overhead, non-FileMetadata dependent option for 363 // NewExternalIter. To optimize seeking and forward iteration, it maintains 364 // two slices of child iterators; one of all iterators, and a subset of it that 365 // contains just the iterators that contain point keys within the current 366 // bounds. 367 // 368 // Note that this levelIter does not support pausing at file boundaries 369 // in case of range tombstones in this file that could apply to points outside 370 // of this file (and outside of this level). This is sufficient for optimizing 371 // the main use cases of NewExternalIter, however for completeness it would make 372 // sense to build this pausing functionality in. 373 type simpleLevelIter struct { 374 cmp Compare 375 err error 376 lowerBound []byte 377 iters []internalIterator 378 filtered []internalIterator 379 firstKeys [][]byte 380 firstKeysBuf []byte 381 currentIdx int 382 } 383 384 var _ internalIterator = &simpleLevelIter{} 385 386 // init initializes this simpleLevelIter. 387 func (s *simpleLevelIter) init(opts IterOptions) { 388 s.currentIdx = 0 389 s.lowerBound = opts.LowerBound 390 s.resetFilteredIters() 391 } 392 393 func (s *simpleLevelIter) resetFilteredIters() { 394 s.filtered = s.filtered[:0] 395 s.firstKeys = s.firstKeys[:0] 396 s.firstKeysBuf = s.firstKeysBuf[:0] 397 s.err = nil 398 for i := range s.iters { 399 var iterKey *base.InternalKey 400 if s.lowerBound != nil { 401 iterKey, _ = s.iters[i].SeekGE(s.lowerBound, base.SeekGEFlagsNone) 402 } else { 403 iterKey, _ = s.iters[i].First() 404 } 405 if iterKey != nil { 406 s.filtered = append(s.filtered, s.iters[i]) 407 bufStart := len(s.firstKeysBuf) 408 s.firstKeysBuf = append(s.firstKeysBuf, iterKey.UserKey...) 409 s.firstKeys = append(s.firstKeys, s.firstKeysBuf[bufStart:bufStart+len(iterKey.UserKey)]) 410 } else if err := s.iters[i].Error(); err != nil { 411 s.err = err 412 } 413 } 414 } 415 416 func (s *simpleLevelIter) SeekGE( 417 key []byte, flags base.SeekGEFlags, 418 ) (*base.InternalKey, base.LazyValue) { 419 if s.err != nil { 420 return nil, base.LazyValue{} 421 } 422 // Find the first file that is entirely >= key. The file before that could 423 // contain the key we're looking for. 424 n := sort.Search(len(s.firstKeys), func(i int) bool { 425 return s.cmp(key, s.firstKeys[i]) <= 0 426 }) 427 if n > 0 { 428 s.currentIdx = n - 1 429 } else { 430 s.currentIdx = n 431 } 432 if s.currentIdx < len(s.filtered) { 433 if iterKey, val := s.filtered[s.currentIdx].SeekGE(key, flags); iterKey != nil { 434 return iterKey, val 435 } 436 if err := s.filtered[s.currentIdx].Error(); err != nil { 437 s.err = err 438 } 439 s.currentIdx++ 440 } 441 return s.skipEmptyFileForward(key, flags) 442 } 443 444 func (s *simpleLevelIter) skipEmptyFileForward( 445 seekKey []byte, flags base.SeekGEFlags, 446 ) (*base.InternalKey, base.LazyValue) { 447 var iterKey *base.InternalKey 448 var val base.LazyValue 449 for s.currentIdx >= 0 && s.currentIdx < len(s.filtered) && s.err == nil { 450 if seekKey != nil { 451 iterKey, val = s.filtered[s.currentIdx].SeekGE(seekKey, flags) 452 } else if s.lowerBound != nil { 453 iterKey, val = s.filtered[s.currentIdx].SeekGE(s.lowerBound, flags) 454 } else { 455 iterKey, val = s.filtered[s.currentIdx].First() 456 } 457 if iterKey != nil { 458 return iterKey, val 459 } 460 if err := s.filtered[s.currentIdx].Error(); err != nil { 461 s.err = err 462 } 463 s.currentIdx++ 464 } 465 return nil, base.LazyValue{} 466 } 467 468 func (s *simpleLevelIter) SeekPrefixGE( 469 prefix, key []byte, flags base.SeekGEFlags, 470 ) (*base.InternalKey, base.LazyValue) { 471 panic("unimplemented") 472 } 473 474 func (s *simpleLevelIter) SeekLT( 475 key []byte, flags base.SeekLTFlags, 476 ) (*base.InternalKey, base.LazyValue) { 477 panic("unimplemented") 478 } 479 480 func (s *simpleLevelIter) First() (*base.InternalKey, base.LazyValue) { 481 if s.err != nil { 482 return nil, base.LazyValue{} 483 } 484 s.currentIdx = 0 485 return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) 486 } 487 488 func (s *simpleLevelIter) Last() (*base.InternalKey, base.LazyValue) { 489 panic("unimplemented") 490 } 491 492 func (s *simpleLevelIter) Next() (*base.InternalKey, base.LazyValue) { 493 if s.err != nil { 494 return nil, base.LazyValue{} 495 } 496 if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { 497 return nil, base.LazyValue{} 498 } 499 if iterKey, val := s.filtered[s.currentIdx].Next(); iterKey != nil { 500 return iterKey, val 501 } 502 s.currentIdx++ 503 return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) 504 } 505 506 func (s *simpleLevelIter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { 507 if s.err != nil { 508 return nil, base.LazyValue{} 509 } 510 if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { 511 return nil, base.LazyValue{} 512 } 513 if iterKey, val := s.filtered[s.currentIdx].NextPrefix(succKey); iterKey != nil { 514 return iterKey, val 515 } 516 s.currentIdx++ 517 return s.skipEmptyFileForward(succKey /* seekKey */, base.SeekGEFlagsNone) 518 } 519 520 func (s *simpleLevelIter) Prev() (*base.InternalKey, base.LazyValue) { 521 panic("unimplemented") 522 } 523 524 func (s *simpleLevelIter) Error() error { 525 if s.currentIdx >= 0 && s.currentIdx < len(s.filtered) { 526 s.err = firstError(s.err, s.filtered[s.currentIdx].Error()) 527 } 528 return s.err 529 } 530 531 func (s *simpleLevelIter) Close() error { 532 var err error 533 for i := range s.iters { 534 err = firstError(err, s.iters[i].Close()) 535 } 536 return err 537 } 538 539 func (s *simpleLevelIter) SetBounds(lower, upper []byte) { 540 s.currentIdx = -1 541 s.lowerBound = lower 542 for i := range s.iters { 543 s.iters[i].SetBounds(lower, upper) 544 } 545 s.resetFilteredIters() 546 } 547 548 func (s *simpleLevelIter) String() string { 549 if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { 550 return "simpleLevelIter: current=<nil>" 551 } 552 return fmt.Sprintf("simpleLevelIter: current=%s", s.filtered[s.currentIdx]) 553 } 554 555 var _ internalIterator = &simpleLevelIter{}