github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/external_iterator.go (about) 1 // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "context" 9 "fmt" 10 "sort" 11 12 "github.com/cockroachdb/errors" 13 "github.com/cockroachdb/pebble/internal/base" 14 "github.com/cockroachdb/pebble/internal/keyspan" 15 "github.com/cockroachdb/pebble/internal/manifest" 16 "github.com/cockroachdb/pebble/sstable" 17 ) 18 19 // ExternalIterOption provide an interface to specify open-time options to 20 // NewExternalIter. 21 type ExternalIterOption interface { 22 // iterApply is called on the iterator during opening in order to set internal 23 // parameters. 24 iterApply(*Iterator) 25 // readerOptions returns any reader options added by this iter option. 26 readerOptions() []sstable.ReaderOption 27 } 28 29 type externalIterReaderOptions struct { 30 opts []sstable.ReaderOption 31 } 32 33 func (e *externalIterReaderOptions) iterApply(iterator *Iterator) { 34 // Do nothing. 35 } 36 37 func (e *externalIterReaderOptions) readerOptions() []sstable.ReaderOption { 38 return e.opts 39 } 40 41 // ExternalIterReaderOptions returns an ExternalIterOption that specifies 42 // sstable.ReaderOptions to be applied on sstable readers in NewExternalIter. 43 func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption { 44 return &externalIterReaderOptions{opts: opts} 45 } 46 47 // ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator 48 // will only be used for forward positioning operations (First, SeekGE, Next). 49 // This could enable optimizations that take advantage of this invariant. 50 // Behaviour when a reverse positioning operation is done on an iterator 51 // opened with this option is unpredictable, though in most cases it should. 52 type ExternalIterForwardOnly struct{} 53 54 func (e ExternalIterForwardOnly) iterApply(iter *Iterator) { 55 iter.forwardOnly = true 56 } 57 58 func (e ExternalIterForwardOnly) readerOptions() []sstable.ReaderOption { 59 return nil 60 } 61 62 // NewExternalIter takes an input 2d array of sstable files which may overlap 63 // across subarrays but not within a subarray (at least as far as points are 64 // concerned; range keys are allowed to overlap arbitrarily even within a 65 // subarray), and returns an Iterator over the merged contents of the sstables. 66 // Input sstables may contain point keys, range keys, range deletions, etc. The 67 // input files slice must be sorted in reverse chronological ordering. A key in a 68 // file at a lower index subarray will shadow a key with an identical user key 69 // contained within a file at a higher index subarray. Each subarray must be 70 // sorted in internal key order, where lower index files contain keys that sort 71 // left of files with higher indexes. 72 // 73 // Input sstables must only contain keys with the zero sequence number. 74 // 75 // Iterators constructed through NewExternalIter do not support all iterator 76 // options, including block-property and table filters. NewExternalIter errors 77 // if an incompatible option is set. 78 func NewExternalIter( 79 o *Options, 80 iterOpts *IterOptions, 81 files [][]sstable.ReadableFile, 82 extraOpts ...ExternalIterOption, 83 ) (it *Iterator, err error) { 84 return NewExternalIterWithContext(context.Background(), o, iterOpts, files, extraOpts...) 85 } 86 87 // NewExternalIterWithContext is like NewExternalIter, and additionally 88 // accepts a context for tracing. 89 func NewExternalIterWithContext( 90 ctx context.Context, 91 o *Options, 92 iterOpts *IterOptions, 93 files [][]sstable.ReadableFile, 94 extraOpts ...ExternalIterOption, 95 ) (it *Iterator, err error) { 96 if iterOpts != nil { 97 if err := validateExternalIterOpts(iterOpts); err != nil { 98 return nil, err 99 } 100 } 101 102 var readers [][]*sstable.Reader 103 104 // Ensure we close all the opened readers if we error out. 105 defer func() { 106 if err != nil { 107 for i := range readers { 108 for j := range readers[i] { 109 _ = readers[i][j].Close() 110 } 111 } 112 } 113 }() 114 seqNumOffset := 0 115 var extraReaderOpts []sstable.ReaderOption 116 for i := range extraOpts { 117 extraReaderOpts = append(extraReaderOpts, extraOpts[i].readerOptions()...) 118 } 119 for _, levelFiles := range files { 120 seqNumOffset += len(levelFiles) 121 } 122 for _, levelFiles := range files { 123 var subReaders []*sstable.Reader 124 seqNumOffset -= len(levelFiles) 125 subReaders, err = openExternalTables(o, levelFiles, seqNumOffset, o.MakeReaderOptions(), extraReaderOpts...) 126 readers = append(readers, subReaders) 127 } 128 if err != nil { 129 return nil, err 130 } 131 132 buf := iterAllocPool.Get().(*iterAlloc) 133 dbi := &buf.dbi 134 *dbi = Iterator{ 135 ctx: ctx, 136 alloc: buf, 137 merge: o.Merger.Merge, 138 comparer: *o.Comparer, 139 readState: nil, 140 keyBuf: buf.keyBuf, 141 prefixOrFullSeekKey: buf.prefixOrFullSeekKey, 142 boundsBuf: buf.boundsBuf, 143 batch: nil, 144 // Add the readers to the Iterator so that Close closes them, and 145 // SetOptions can re-construct iterators from them. 146 externalReaders: readers, 147 newIters: func( 148 ctx context.Context, f *manifest.FileMetadata, opts *IterOptions, 149 internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) { 150 // NB: External iterators are currently constructed without any 151 // `levelIters`. newIters should never be called. When we support 152 // organizing multiple non-overlapping files into a single level 153 // (see TODO below), we'll need to adjust this tableNewIters 154 // implementation to open iterators by looking up f in a map 155 // of readers indexed by *fileMetadata. 156 panic("unreachable") 157 }, 158 seqNum: base.InternalKeySeqNumMax, 159 } 160 if iterOpts != nil { 161 dbi.opts = *iterOpts 162 dbi.processBounds(iterOpts.LowerBound, iterOpts.UpperBound) 163 } 164 for i := range extraOpts { 165 extraOpts[i].iterApply(dbi) 166 } 167 if err := finishInitializingExternal(ctx, dbi); err != nil { 168 dbi.Close() 169 return nil, err 170 } 171 return dbi, nil 172 } 173 174 func validateExternalIterOpts(iterOpts *IterOptions) error { 175 switch { 176 case iterOpts.TableFilter != nil: 177 return errors.Errorf("pebble: external iterator: TableFilter unsupported") 178 case iterOpts.PointKeyFilters != nil: 179 return errors.Errorf("pebble: external iterator: PointKeyFilters unsupported") 180 case iterOpts.RangeKeyFilters != nil: 181 return errors.Errorf("pebble: external iterator: RangeKeyFilters unsupported") 182 case iterOpts.OnlyReadGuaranteedDurable: 183 return errors.Errorf("pebble: external iterator: OnlyReadGuaranteedDurable unsupported") 184 case iterOpts.UseL6Filters: 185 return errors.Errorf("pebble: external iterator: UseL6Filters unsupported") 186 } 187 return nil 188 } 189 190 func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterator, error) { 191 // TODO(jackson): In some instances we could generate fewer levels by using 192 // L0Sublevels code to organize nonoverlapping files into the same level. 193 // This would allow us to use levelIters and keep a smaller set of data and 194 // files in-memory. However, it would also require us to identify the bounds 195 // of all the files upfront. 196 197 if !it.opts.pointKeys() { 198 return emptyIter, nil 199 } else if it.pointIter != nil { 200 return it.pointIter, nil 201 } 202 mlevels := it.alloc.mlevels[:0] 203 204 if len(it.externalReaders) > cap(mlevels) { 205 mlevels = make([]mergingIterLevel, 0, len(it.externalReaders)) 206 } 207 for _, readers := range it.externalReaders { 208 var combinedIters []internalIterator 209 for _, r := range readers { 210 var ( 211 rangeDelIter keyspan.FragmentIterator 212 pointIter internalIterator 213 err error 214 ) 215 // We could set hideObsoletePoints=true, since we are reading at 216 // InternalKeySeqNumMax, but we don't bother since these sstables should 217 // not have obsolete points (so the performance optimization is 218 // unnecessary), and we don't want to bother constructing a 219 // BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter. 220 pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc( 221 ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */ 222 false /* hideObsoletePoints */, false, /* useFilterBlock */ 223 &it.stats.InternalStats, it.opts.CategoryAndQoS, nil, 224 sstable.TrivialReaderProvider{Reader: r}) 225 if err != nil { 226 return nil, err 227 } 228 rangeDelIter, err = r.NewRawRangeDelIter() 229 if err != nil { 230 return nil, err 231 } 232 if rangeDelIter == nil && pointIter != nil && it.forwardOnly { 233 // TODO(bilal): Consider implementing range key pausing in 234 // simpleLevelIter so we can reduce mergingIterLevels even more by 235 // sending all sstable iterators to combinedIters, not just those 236 // corresponding to sstables without range deletes. 237 combinedIters = append(combinedIters, pointIter) 238 continue 239 } 240 mlevels = append(mlevels, mergingIterLevel{ 241 iter: pointIter, 242 rangeDelIter: rangeDelIter, 243 }) 244 } 245 if len(combinedIters) == 1 { 246 mlevels = append(mlevels, mergingIterLevel{ 247 iter: combinedIters[0], 248 }) 249 } else if len(combinedIters) > 1 { 250 sli := &simpleLevelIter{ 251 cmp: it.cmp, 252 iters: combinedIters, 253 } 254 sli.init(it.opts) 255 mlevels = append(mlevels, mergingIterLevel{ 256 iter: sli, 257 rangeDelIter: nil, 258 }) 259 } 260 } 261 if len(mlevels) == 1 && mlevels[0].rangeDelIter == nil { 262 // Set closePointIterOnce to true. This is because we're bypassing the 263 // merging iter, which turns Close()s on it idempotent for any child 264 // iterators. The outer Iterator could call Close() on a point iter twice, 265 // which sstable iterators do not support (as they release themselves to 266 // a pool). 267 it.closePointIterOnce = true 268 return mlevels[0].iter, nil 269 } 270 271 it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...) 272 it.alloc.merging.snapshot = base.InternalKeySeqNumMax 273 if len(mlevels) <= cap(it.alloc.levelsPositioned) { 274 it.alloc.merging.levelsPositioned = it.alloc.levelsPositioned[:len(mlevels)] 275 } 276 return &it.alloc.merging, nil 277 } 278 279 func finishInitializingExternal(ctx context.Context, it *Iterator) error { 280 pointIter, err := createExternalPointIter(ctx, it) 281 if err != nil { 282 return err 283 } 284 it.pointIter = pointIter 285 it.iter = it.pointIter 286 287 if it.opts.rangeKeys() { 288 it.rangeKeyMasking.init(it, it.comparer.Compare, it.comparer.Split) 289 var rangeKeyIters []keyspan.FragmentIterator 290 if it.rangeKey == nil { 291 // We could take advantage of the lack of overlaps in range keys within 292 // each slice in it.externalReaders, and generate keyspan.LevelIters 293 // out of those. However, since range keys are expected to be sparse to 294 // begin with, the performance gain might not be significant enough to 295 // warrant it. 296 // 297 // TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not 298 // operate on FileMetadatas (similar to simpleLevelIter), and implements 299 // this optimization. 300 for _, readers := range it.externalReaders { 301 for _, r := range readers { 302 if rki, err := r.NewRawRangeKeyIter(); err != nil { 303 return err 304 } else if rki != nil { 305 rangeKeyIters = append(rangeKeyIters, rki) 306 } 307 } 308 } 309 if len(rangeKeyIters) > 0 { 310 it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) 311 it.rangeKey.init(it.comparer.Compare, it.comparer.Split, &it.opts) 312 it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init( 313 &it.comparer, 314 base.InternalKeySeqNumMax, 315 it.opts.LowerBound, it.opts.UpperBound, 316 &it.hasPrefix, &it.prefixOrFullSeekKey, 317 false /* internalKeys */, &it.rangeKey.internal, 318 ) 319 for i := range rangeKeyIters { 320 it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i]) 321 } 322 } 323 } 324 if it.rangeKey != nil { 325 it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter, 326 keyspan.InterleavingIterOpts{ 327 Mask: &it.rangeKeyMasking, 328 LowerBound: it.opts.LowerBound, 329 UpperBound: it.opts.UpperBound, 330 }) 331 it.iter = &it.rangeKey.iiter 332 } 333 } 334 return nil 335 } 336 337 func openExternalTables( 338 o *Options, 339 files []sstable.ReadableFile, 340 seqNumOffset int, 341 readerOpts sstable.ReaderOptions, 342 extraReaderOpts ...sstable.ReaderOption, 343 ) (readers []*sstable.Reader, err error) { 344 readers = make([]*sstable.Reader, 0, len(files)) 345 for i := range files { 346 readable, err := sstable.NewSimpleReadable(files[i]) 347 if err != nil { 348 return readers, err 349 } 350 r, err := sstable.NewReader(readable, readerOpts, extraReaderOpts...) 351 if err != nil { 352 return readers, err 353 } 354 // Use the index of the file in files as the sequence number for all of 355 // its keys. 356 r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset) 357 readers = append(readers, r) 358 } 359 return readers, err 360 } 361 362 // simpleLevelIter is similar to a levelIter in that it merges the points 363 // from multiple point iterators that are non-overlapping in the key ranges 364 // they return. It is only expected to support forward iteration and forward 365 // regular seeking; reverse iteration and prefix seeking is not supported. 366 // Intended to be a low-overhead, non-FileMetadata dependent option for 367 // NewExternalIter. To optimize seeking and forward iteration, it maintains 368 // two slices of child iterators; one of all iterators, and a subset of it that 369 // contains just the iterators that contain point keys within the current 370 // bounds. 371 // 372 // Note that this levelIter does not support pausing at file boundaries 373 // in case of range tombstones in this file that could apply to points outside 374 // of this file (and outside of this level). This is sufficient for optimizing 375 // the main use cases of NewExternalIter, however for completeness it would make 376 // sense to build this pausing functionality in. 377 type simpleLevelIter struct { 378 cmp Compare 379 err error 380 lowerBound []byte 381 iters []internalIterator 382 filtered []internalIterator 383 firstKeys [][]byte 384 firstKeysBuf []byte 385 currentIdx int 386 } 387 388 var _ internalIterator = &simpleLevelIter{} 389 390 // init initializes this simpleLevelIter. 391 func (s *simpleLevelIter) init(opts IterOptions) { 392 s.currentIdx = 0 393 s.lowerBound = opts.LowerBound 394 s.resetFilteredIters() 395 } 396 397 func (s *simpleLevelIter) resetFilteredIters() { 398 s.filtered = s.filtered[:0] 399 s.firstKeys = s.firstKeys[:0] 400 s.firstKeysBuf = s.firstKeysBuf[:0] 401 s.err = nil 402 for i := range s.iters { 403 var iterKey *base.InternalKey 404 if s.lowerBound != nil { 405 iterKey, _ = s.iters[i].SeekGE(s.lowerBound, base.SeekGEFlagsNone) 406 } else { 407 iterKey, _ = s.iters[i].First() 408 } 409 if iterKey != nil { 410 s.filtered = append(s.filtered, s.iters[i]) 411 bufStart := len(s.firstKeysBuf) 412 s.firstKeysBuf = append(s.firstKeysBuf, iterKey.UserKey...) 413 s.firstKeys = append(s.firstKeys, s.firstKeysBuf[bufStart:bufStart+len(iterKey.UserKey)]) 414 } else if err := s.iters[i].Error(); err != nil { 415 s.err = err 416 } 417 } 418 } 419 420 func (s *simpleLevelIter) SeekGE( 421 key []byte, flags base.SeekGEFlags, 422 ) (*base.InternalKey, base.LazyValue) { 423 if s.err != nil { 424 return nil, base.LazyValue{} 425 } 426 // Find the first file that is entirely >= key. The file before that could 427 // contain the key we're looking for. 428 n := sort.Search(len(s.firstKeys), func(i int) bool { 429 return s.cmp(key, s.firstKeys[i]) <= 0 430 }) 431 if n > 0 { 432 s.currentIdx = n - 1 433 } else { 434 s.currentIdx = n 435 } 436 if s.currentIdx < len(s.filtered) { 437 if iterKey, val := s.filtered[s.currentIdx].SeekGE(key, flags); iterKey != nil { 438 return iterKey, val 439 } 440 if err := s.filtered[s.currentIdx].Error(); err != nil { 441 s.err = err 442 } 443 s.currentIdx++ 444 } 445 return s.skipEmptyFileForward(key, flags) 446 } 447 448 func (s *simpleLevelIter) skipEmptyFileForward( 449 seekKey []byte, flags base.SeekGEFlags, 450 ) (*base.InternalKey, base.LazyValue) { 451 var iterKey *base.InternalKey 452 var val base.LazyValue 453 for s.currentIdx >= 0 && s.currentIdx < len(s.filtered) && s.err == nil { 454 if seekKey != nil { 455 iterKey, val = s.filtered[s.currentIdx].SeekGE(seekKey, flags) 456 } else if s.lowerBound != nil { 457 iterKey, val = s.filtered[s.currentIdx].SeekGE(s.lowerBound, flags) 458 } else { 459 iterKey, val = s.filtered[s.currentIdx].First() 460 } 461 if iterKey != nil { 462 return iterKey, val 463 } 464 if err := s.filtered[s.currentIdx].Error(); err != nil { 465 s.err = err 466 } 467 s.currentIdx++ 468 } 469 return nil, base.LazyValue{} 470 } 471 472 func (s *simpleLevelIter) SeekPrefixGE( 473 prefix, key []byte, flags base.SeekGEFlags, 474 ) (*base.InternalKey, base.LazyValue) { 475 panic("unimplemented") 476 } 477 478 func (s *simpleLevelIter) SeekLT( 479 key []byte, flags base.SeekLTFlags, 480 ) (*base.InternalKey, base.LazyValue) { 481 panic("unimplemented") 482 } 483 484 func (s *simpleLevelIter) First() (*base.InternalKey, base.LazyValue) { 485 if s.err != nil { 486 return nil, base.LazyValue{} 487 } 488 s.currentIdx = 0 489 return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) 490 } 491 492 func (s *simpleLevelIter) Last() (*base.InternalKey, base.LazyValue) { 493 panic("unimplemented") 494 } 495 496 func (s *simpleLevelIter) Next() (*base.InternalKey, base.LazyValue) { 497 if s.err != nil { 498 return nil, base.LazyValue{} 499 } 500 if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { 501 return nil, base.LazyValue{} 502 } 503 if iterKey, val := s.filtered[s.currentIdx].Next(); iterKey != nil { 504 return iterKey, val 505 } 506 s.currentIdx++ 507 return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) 508 } 509 510 func (s *simpleLevelIter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { 511 if s.err != nil { 512 return nil, base.LazyValue{} 513 } 514 if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { 515 return nil, base.LazyValue{} 516 } 517 if iterKey, val := s.filtered[s.currentIdx].NextPrefix(succKey); iterKey != nil { 518 return iterKey, val 519 } 520 s.currentIdx++ 521 return s.skipEmptyFileForward(succKey /* seekKey */, base.SeekGEFlagsNone) 522 } 523 524 func (s *simpleLevelIter) Prev() (*base.InternalKey, base.LazyValue) { 525 panic("unimplemented") 526 } 527 528 func (s *simpleLevelIter) Error() error { 529 if s.currentIdx >= 0 && s.currentIdx < len(s.filtered) { 530 s.err = firstError(s.err, s.filtered[s.currentIdx].Error()) 531 } 532 return s.err 533 } 534 535 func (s *simpleLevelIter) Close() error { 536 var err error 537 for i := range s.iters { 538 err = firstError(err, s.iters[i].Close()) 539 } 540 return err 541 } 542 543 func (s *simpleLevelIter) SetBounds(lower, upper []byte) { 544 s.currentIdx = -1 545 s.lowerBound = lower 546 for i := range s.iters { 547 s.iters[i].SetBounds(lower, upper) 548 } 549 s.resetFilteredIters() 550 } 551 552 func (s *simpleLevelIter) SetContext(_ context.Context) {} 553 554 func (s *simpleLevelIter) String() string { 555 if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { 556 return "simpleLevelIter: current=<nil>" 557 } 558 return fmt.Sprintf("simpleLevelIter: current=%s", s.filtered[s.currentIdx]) 559 } 560 561 var _ internalIterator = &simpleLevelIter{}