github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/external_iterator.go (about) 1 // Copyright 2022 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package bitalostable 6 7 import ( 8 "fmt" 9 "sort" 10 11 "github.com/cockroachdb/errors" 12 "github.com/zuoyebang/bitalostable/internal/base" 13 "github.com/zuoyebang/bitalostable/internal/keyspan" 14 "github.com/zuoyebang/bitalostable/internal/manifest" 15 "github.com/zuoyebang/bitalostable/sstable" 16 ) 17 18 // ExternalIterOption provide an interface to specify open-time options to 19 // NewExternalIter. 20 type ExternalIterOption interface { 21 // iterApply is called on the iterator during opening in order to set internal 22 // parameters. 23 iterApply(*Iterator) 24 // readerOptions returns any reader options added by this iter option. 25 readerOptions() []sstable.ReaderOption 26 } 27 28 type externalIterReaderOptions struct { 29 opts []sstable.ReaderOption 30 } 31 32 func (e *externalIterReaderOptions) iterApply(iterator *Iterator) { 33 // Do nothing. 34 } 35 36 func (e *externalIterReaderOptions) readerOptions() []sstable.ReaderOption { 37 return e.opts 38 } 39 40 // ExternalIterReaderOptions returns an ExternalIterOption that specifies 41 // sstable.ReaderOptions to be applied on sstable readers in NewExternalIter. 42 func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption { 43 return &externalIterReaderOptions{opts: opts} 44 } 45 46 // ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator 47 // will only be used for forward positioning operations (First, SeekGE, Next). 48 // This could enable optimizations that take advantage of this invariant. 49 // Behaviour when a reverse positioning operation is done on an iterator 50 // opened with this option is unpredictable, though in most cases it should. 51 type ExternalIterForwardOnly struct{} 52 53 func (e ExternalIterForwardOnly) iterApply(iter *Iterator) { 54 iter.forwardOnly = true 55 } 56 57 func (e ExternalIterForwardOnly) readerOptions() []sstable.ReaderOption { 58 return nil 59 } 60 61 // NewExternalIter takes an input 2d array of sstable files which may overlap 62 // across subarrays but not within a subarray (at least as far as points are 63 // concerned; range keys are allowed to overlap arbitrarily even within a 64 // subarray), and returns an Iterator over the merged contents of the sstables. 65 // Input sstables may contain point keys, range keys, range deletions, etc. The 66 // input files slice must be sorted in reverse chronological ordering. A key in a 67 // file at a lower index subarray will shadow a key with an identical user key 68 // contained within a file at a higher index subarray. Each subarray must be 69 // sorted in internal key order, where lower index files contain keys that sort 70 // left of files with higher indexes. 71 // 72 // Input sstables must only contain keys with the zero sequence number. 73 // 74 // Iterators constructed through NewExternalIter do not support all iterator 75 // options, including block-property and table filters. NewExternalIter errors 76 // if an incompatible option is set. 77 func NewExternalIter( 78 o *Options, 79 iterOpts *IterOptions, 80 files [][]sstable.ReadableFile, 81 extraOpts ...ExternalIterOption, 82 ) (it *Iterator, err error) { 83 if iterOpts != nil { 84 if err := validateExternalIterOpts(iterOpts); err != nil { 85 return nil, err 86 } 87 } 88 89 var readers [][]*sstable.Reader 90 91 // Ensure we close all the opened readers if we error out. 92 defer func() { 93 if err != nil { 94 for i := range readers { 95 for j := range readers[i] { 96 _ = readers[i][j].Close() 97 } 98 } 99 } 100 }() 101 seqNumOffset := 0 102 var extraReaderOpts []sstable.ReaderOption 103 for i := range extraOpts { 104 extraReaderOpts = append(extraReaderOpts, extraOpts[i].readerOptions()...) 105 } 106 for _, levelFiles := range files { 107 seqNumOffset += len(levelFiles) 108 } 109 for _, levelFiles := range files { 110 var subReaders []*sstable.Reader 111 seqNumOffset -= len(levelFiles) 112 subReaders, err = openExternalTables(o, levelFiles, seqNumOffset, o.MakeReaderOptions(), extraReaderOpts...) 113 readers = append(readers, subReaders) 114 } 115 if err != nil { 116 return nil, err 117 } 118 119 buf := iterAllocPool.Get().(*iterAlloc) 120 dbi := &buf.dbi 121 *dbi = Iterator{ 122 alloc: buf, 123 merge: o.Merger.Merge, 124 comparer: *o.Comparer, 125 readState: nil, 126 keyBuf: buf.keyBuf, 127 prefixOrFullSeekKey: buf.prefixOrFullSeekKey, 128 boundsBuf: buf.boundsBuf, 129 batch: nil, 130 // Add the readers to the Iterator so that Close closes them, and 131 // SetOptions can re-construct iterators from them. 132 externalReaders: readers, 133 newIters: func(f *manifest.FileMetadata, opts *IterOptions, internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) { 134 // NB: External iterators are currently constructed without any 135 // `levelIters`. newIters should never be called. When we support 136 // organizing multiple non-overlapping files into a single level 137 // (see TODO below), we'll need to adjust this tableNewIters 138 // implementation to open iterators by looking up f in a map 139 // of readers indexed by *fileMetadata. 140 panic("unreachable") 141 }, 142 seqNum: base.InternalKeySeqNumMax, 143 } 144 if iterOpts != nil { 145 dbi.opts = *iterOpts 146 dbi.saveBounds(iterOpts.LowerBound, iterOpts.UpperBound) 147 } 148 for i := range extraOpts { 149 extraOpts[i].iterApply(dbi) 150 } 151 finishInitializingExternal(dbi) 152 return dbi, nil 153 } 154 155 func validateExternalIterOpts(iterOpts *IterOptions) error { 156 switch { 157 case iterOpts.TableFilter != nil: 158 return errors.Errorf("bitalostable: external iterator: TableFilter unsupported") 159 case iterOpts.PointKeyFilters != nil: 160 return errors.Errorf("bitalostable: external iterator: PointKeyFilters unsupported") 161 case iterOpts.RangeKeyFilters != nil: 162 return errors.Errorf("bitalostable: external iterator: RangeKeyFilters unsupported") 163 case iterOpts.OnlyReadGuaranteedDurable: 164 return errors.Errorf("bitalostable: external iterator: OnlyReadGuaranteedDurable unsupported") 165 case iterOpts.UseL6Filters: 166 return errors.Errorf("bitalostable: external iterator: UseL6Filters unsupported") 167 } 168 return nil 169 } 170 171 func createExternalPointIter(it *Iterator) (internalIterator, error) { 172 // TODO(jackson): In some instances we could generate fewer levels by using 173 // L0Sublevels code to organize nonoverlapping files into the same level. 174 // This would allow us to use levelIters and keep a smaller set of data and 175 // files in-memory. However, it would also require us to identify the bounds 176 // of all the files upfront. 177 178 if !it.opts.pointKeys() { 179 return emptyIter, nil 180 } else if it.pointIter != nil { 181 return it.pointIter, nil 182 } 183 mlevels := it.alloc.mlevels[:0] 184 185 if len(it.externalReaders) > cap(mlevels) { 186 mlevels = make([]mergingIterLevel, 0, len(it.externalReaders)) 187 } 188 for _, readers := range it.externalReaders { 189 var combinedIters []internalIterator 190 for _, r := range readers { 191 var ( 192 rangeDelIter keyspan.FragmentIterator 193 pointIter internalIterator 194 err error 195 ) 196 pointIter, err = r.NewIter(it.opts.LowerBound, it.opts.UpperBound) 197 if err != nil { 198 return nil, err 199 } 200 rangeDelIter, err = r.NewRawRangeDelIter() 201 if err != nil { 202 return nil, err 203 } 204 if rangeDelIter == nil && pointIter != nil && it.forwardOnly { 205 // TODO(bilal): Consider implementing range key pausing in 206 // simpleLevelIter so we can reduce mergingIterLevels even more by 207 // sending all sstable iterators to combinedIters, not just those 208 // corresponding to sstables without range deletes. 209 combinedIters = append(combinedIters, pointIter) 210 continue 211 } 212 mlevels = append(mlevels, mergingIterLevel{ 213 iter: pointIter, 214 rangeDelIter: rangeDelIter, 215 }) 216 } 217 if len(combinedIters) == 1 { 218 mlevels = append(mlevels, mergingIterLevel{ 219 iter: combinedIters[0], 220 }) 221 } else if len(combinedIters) > 1 { 222 sli := &simpleLevelIter{ 223 cmp: it.cmp, 224 iters: combinedIters, 225 } 226 sli.init(it.opts) 227 mlevels = append(mlevels, mergingIterLevel{ 228 iter: sli, 229 rangeDelIter: nil, 230 }) 231 } 232 } 233 if len(mlevels) == 1 && mlevels[0].rangeDelIter == nil { 234 // Set closePointIterOnce to true. This is because we're bypassing the 235 // merging iter, which turns Close()s on it idempotent for any child 236 // iterators. The outer Iterator could call Close() on a point iter twice, 237 // which sstable iterators do not support (as they release themselves to 238 // a pool). 239 it.closePointIterOnce = true 240 return mlevels[0].iter, nil 241 } 242 243 it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...) 244 it.alloc.merging.snapshot = base.InternalKeySeqNumMax 245 it.alloc.merging.elideRangeTombstones = true 246 return &it.alloc.merging, nil 247 } 248 249 func finishInitializingExternal(it *Iterator) { 250 pointIter, err := createExternalPointIter(it) 251 if err != nil { 252 it.pointIter = &errorIter{err: err} 253 } else { 254 it.pointIter = pointIter 255 } 256 it.iter = it.pointIter 257 258 if it.opts.rangeKeys() { 259 it.rangeKeyMasking.init(it, it.comparer.Compare, it.comparer.Split) 260 var rangeKeyIters []keyspan.FragmentIterator 261 if it.rangeKey == nil { 262 // We could take advantage of the lack of overlaps in range keys within 263 // each slice in it.externalReaders, and generate keyspan.LevelIters 264 // out of those. However, since range keys are expected to be sparse to 265 // begin with, the performance gain might not be significant enough to 266 // warrant it. 267 // 268 // TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not 269 // operate on FileMetadatas (similar to simpleLevelIter), and implements 270 // this optimization. 271 for _, readers := range it.externalReaders { 272 for _, r := range readers { 273 if rki, err := r.NewRawRangeKeyIter(); err != nil { 274 rangeKeyIters = append(rangeKeyIters, &errorKeyspanIter{err: err}) 275 } else if rki != nil { 276 rangeKeyIters = append(rangeKeyIters, rki) 277 } 278 } 279 } 280 if len(rangeKeyIters) > 0 { 281 it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) 282 it.rangeKey.init(it.comparer.Compare, it.comparer.Split, &it.opts) 283 it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init( 284 &it.comparer, 285 base.InternalKeySeqNumMax, 286 it.opts.LowerBound, it.opts.UpperBound, 287 &it.hasPrefix, &it.prefixOrFullSeekKey, 288 ) 289 for i := range rangeKeyIters { 290 it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i]) 291 } 292 } 293 } 294 if it.rangeKey != nil { 295 it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter, &it.rangeKeyMasking, 296 it.opts.LowerBound, it.opts.UpperBound) 297 it.iter = &it.rangeKey.iiter 298 } 299 } 300 } 301 302 func openExternalTables( 303 o *Options, 304 files []sstable.ReadableFile, 305 seqNumOffset int, 306 readerOpts sstable.ReaderOptions, 307 extraReaderOpts ...sstable.ReaderOption, 308 ) (readers []*sstable.Reader, err error) { 309 readers = make([]*sstable.Reader, 0, len(files)) 310 for i := range files { 311 r, err := sstable.NewReader(files[i], readerOpts, extraReaderOpts...) 312 if err != nil { 313 return readers, err 314 } 315 // Use the index of the file in files as the sequence number for all of 316 // its keys. 317 r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset) 318 readers = append(readers, r) 319 } 320 return readers, err 321 } 322 323 // simpleLevelIter is similar to a levelIter in that it merges the points 324 // from multiple point iterators that are non-overlapping in the key ranges 325 // they return. It is only expected to support forward iteration and forward 326 // regular seeking; reverse iteration and prefix seeking is not supported. 327 // Intended to be a low-overhead, non-FileMetadata dependent option for 328 // NewExternalIter. To optimize seeking and forward iteration, it maintains 329 // two slices of child iterators; one of all iterators, and a subset of it that 330 // contains just the iterators that contain point keys within the current 331 // bounds. 332 // 333 // Note that this levelIter does not support pausing at file boundaries 334 // in case of range tombstones in this file that could apply to points outside 335 // of this file (and outside of this level). This is sufficient for optimizing 336 // the main use cases of NewExternalIter, however for completeness it would make 337 // sense to build this pausing functionality in. 338 type simpleLevelIter struct { 339 cmp Compare 340 err error 341 lowerBound []byte 342 iters []internalIterator 343 filtered []internalIterator 344 firstKeys [][]byte 345 firstKeysBuf []byte 346 currentIdx int 347 } 348 349 // init initializes this simpleLevelIter. 350 func (s *simpleLevelIter) init(opts IterOptions) { 351 s.currentIdx = 0 352 s.lowerBound = opts.LowerBound 353 s.resetFilteredIters() 354 } 355 356 func (s *simpleLevelIter) resetFilteredIters() { 357 s.filtered = s.filtered[:0] 358 s.firstKeys = s.firstKeys[:0] 359 s.firstKeysBuf = s.firstKeysBuf[:0] 360 s.err = nil 361 for i := range s.iters { 362 var iterKey *base.InternalKey 363 if s.lowerBound != nil { 364 iterKey, _ = s.iters[i].SeekGE(s.lowerBound, base.SeekGEFlagsNone) 365 } else { 366 iterKey, _ = s.iters[i].First() 367 } 368 if iterKey != nil { 369 s.filtered = append(s.filtered, s.iters[i]) 370 bufStart := len(s.firstKeysBuf) 371 s.firstKeysBuf = append(s.firstKeysBuf, iterKey.UserKey...) 372 s.firstKeys = append(s.firstKeys, s.firstKeysBuf[bufStart:bufStart+len(iterKey.UserKey)]) 373 } else if err := s.iters[i].Error(); err != nil { 374 s.err = err 375 } 376 } 377 } 378 379 func (s *simpleLevelIter) SeekGE(key []byte, flags base.SeekGEFlags) (*base.InternalKey, []byte) { 380 if s.err != nil { 381 return nil, nil 382 } 383 // Find the first file that is entirely >= key. The file before that could 384 // contain the key we're looking for. 385 n := sort.Search(len(s.firstKeys), func(i int) bool { 386 return s.cmp(key, s.firstKeys[i]) <= 0 387 }) 388 if n > 0 { 389 s.currentIdx = n - 1 390 } else { 391 s.currentIdx = n 392 } 393 if s.currentIdx < len(s.filtered) { 394 if iterKey, val := s.filtered[s.currentIdx].SeekGE(key, flags); iterKey != nil { 395 return iterKey, val 396 } 397 if err := s.filtered[s.currentIdx].Error(); err != nil { 398 s.err = err 399 } 400 s.currentIdx++ 401 } 402 return s.skipEmptyFileForward(key, flags) 403 } 404 405 func (s *simpleLevelIter) skipEmptyFileForward( 406 seekKey []byte, flags base.SeekGEFlags, 407 ) (*base.InternalKey, []byte) { 408 var iterKey *base.InternalKey 409 var val []byte 410 for s.currentIdx >= 0 && s.currentIdx < len(s.filtered) && s.err == nil { 411 if seekKey != nil { 412 iterKey, val = s.filtered[s.currentIdx].SeekGE(seekKey, flags) 413 } else if s.lowerBound != nil { 414 iterKey, val = s.filtered[s.currentIdx].SeekGE(s.lowerBound, flags) 415 } else { 416 iterKey, val = s.filtered[s.currentIdx].First() 417 } 418 if iterKey != nil { 419 return iterKey, val 420 } 421 if err := s.filtered[s.currentIdx].Error(); err != nil { 422 s.err = err 423 } 424 s.currentIdx++ 425 } 426 return nil, nil 427 } 428 429 func (s *simpleLevelIter) SeekPrefixGE( 430 prefix, key []byte, flags base.SeekGEFlags, 431 ) (*base.InternalKey, []byte) { 432 panic("unimplemented") 433 } 434 435 func (s *simpleLevelIter) SeekLT(key []byte, flags base.SeekLTFlags) (*base.InternalKey, []byte) { 436 panic("unimplemented") 437 } 438 439 func (s *simpleLevelIter) First() (*base.InternalKey, []byte) { 440 if s.err != nil { 441 return nil, nil 442 } 443 s.currentIdx = 0 444 return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) 445 } 446 447 func (s *simpleLevelIter) Last() (*base.InternalKey, []byte) { 448 panic("unimplemented") 449 } 450 451 func (s *simpleLevelIter) Next() (*base.InternalKey, []byte) { 452 if s.err != nil { 453 return nil, nil 454 } 455 if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { 456 return nil, nil 457 } 458 if iterKey, val := s.filtered[s.currentIdx].Next(); iterKey != nil { 459 return iterKey, val 460 } 461 s.currentIdx++ 462 return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) 463 } 464 465 func (s *simpleLevelIter) Prev() (*base.InternalKey, []byte) { 466 panic("unimplemented") 467 } 468 469 func (s *simpleLevelIter) Error() error { 470 if s.currentIdx >= 0 && s.currentIdx < len(s.filtered) { 471 s.err = firstError(s.err, s.filtered[s.currentIdx].Error()) 472 } 473 return s.err 474 } 475 476 func (s *simpleLevelIter) Close() error { 477 var err error 478 for i := range s.iters { 479 err = firstError(err, s.iters[i].Close()) 480 } 481 return err 482 } 483 484 func (s *simpleLevelIter) SetBounds(lower, upper []byte) { 485 s.currentIdx = -1 486 s.lowerBound = lower 487 for i := range s.iters { 488 s.iters[i].SetBounds(lower, upper) 489 } 490 s.resetFilteredIters() 491 } 492 493 func (s *simpleLevelIter) String() string { 494 if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { 495 return "simpleLevelIter: current=<nil>" 496 } 497 return fmt.Sprintf("simpleLevelIter: current=%s", s.filtered[s.currentIdx]) 498 } 499 500 var _ internalIterator = &simpleLevelIter{}