github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/base/iterator.go (about) 1 // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package base 6 7 import ( 8 "context" 9 "fmt" 10 "time" 11 ) 12 13 // InternalIterator iterates over a DB's key/value pairs in key order. Unlike 14 // the Iterator interface, the returned keys are InternalKeys composed of the 15 // user-key, a sequence number and a key kind. In forward iteration, key/value 16 // pairs for identical user-keys are returned in descending sequence order. In 17 // reverse iteration, key/value pairs for identical user-keys are returned in 18 // ascending sequence order. 19 // 20 // InternalIterators provide 5 absolute positioning methods and 2 relative 21 // positioning methods. The absolute positioning methods are: 22 // 23 // - SeekGE 24 // - SeekPrefixGE 25 // - SeekLT 26 // - First 27 // - Last 28 // 29 // The relative positioning methods are: 30 // 31 // - Next 32 // - Prev 33 // 34 // The relative positioning methods can be used in conjunction with any of the 35 // absolute positioning methods with one exception: SeekPrefixGE does not 36 // support reverse iteration via Prev. It is undefined to call relative 37 // positioning methods without ever calling an absolute positioning method. 38 // 39 // InternalIterators can optionally implement a prefix iteration mode. This 40 // mode is entered by calling SeekPrefixGE and exited by any other absolute 41 // positioning method (SeekGE, SeekLT, First, Last). When in prefix iteration 42 // mode, a call to Next will advance to the next key which has the same 43 // "prefix" as the one supplied to SeekPrefixGE. Note that "prefix" in this 44 // context is not a strict byte prefix, but defined by byte equality for the 45 // result of the Comparer.Split method. An InternalIterator is not required to 46 // support prefix iteration mode, and can implement SeekPrefixGE by forwarding 47 // to SeekGE. When the iteration prefix is exhausted, it is not valid to call 48 // Next on an internal iterator that's already returned (nil,nilv) or a key 49 // beyond the prefix. 50 // 51 // Bounds, [lower, upper), can be set on iterators, either using the SetBounds() 52 // function in the interface, or in implementation specific ways during iterator 53 // creation. The forward positioning routines (SeekGE, First, and Next) only 54 // check the upper bound. The reverse positioning routines (SeekLT, Last, and 55 // Prev) only check the lower bound. It is up to the caller to ensure that the 56 // forward positioning routines respect the lower bound and the reverse 57 // positioning routines respect the upper bound (i.e. calling SeekGE instead of 58 // First if there is a lower bound, and SeekLT instead of Last if there is an 59 // upper bound). This imposition is done in order to elevate that enforcement to 60 // the caller (generally pebble.Iterator or pebble.mergingIter) rather than 61 // having it duplicated in every InternalIterator implementation. 62 // 63 // Additionally, the caller needs to ensure that SeekGE/SeekPrefixGE are not 64 // called with a key > the upper bound, and SeekLT is not called with a key < 65 // the lower bound. InternalIterator implementations are required to respect 66 // the iterator bounds, never returning records outside of the bounds with one 67 // exception: an iterator may generate synthetic RANGEDEL marker records. See 68 // levelIter.syntheticBoundary for the sole existing example of this behavior. 69 // Specifically, levelIter can return synthetic keys whose user key is equal to 70 // the lower/upper bound. 71 // 72 // The bounds provided to an internal iterator must remain valid until a 73 // subsequent call to SetBounds has returned. This requirement exists so that 74 // iterator implementations may compare old and new bounds to apply low-level 75 // optimizations. The pebble.Iterator satisfies this requirement by maintaining 76 // two bound buffers and switching between them. 77 // 78 // An iterator must be closed after use, but it is not necessary to read an 79 // iterator until exhaustion. 80 // 81 // An iterator is not goroutine-safe, but it is safe to use multiple iterators 82 // concurrently, either in separate goroutines or switching between the 83 // iterators in a single goroutine. 84 // 85 // It is also safe to use an iterator concurrently with modifying its 86 // underlying DB, if that DB permits modification. However, the resultant 87 // key/value pairs are not guaranteed to be a consistent snapshot of that DB 88 // at a particular point in time. 89 // 90 // InternalIterators accumulate errors encountered during operation, exposing 91 // them through the Error method. All of the absolute positioning methods 92 // reset any accumulated error before positioning. Relative positioning 93 // methods return without advancing if the iterator has accumulated an error. 94 // 95 // nilv == shorthand for LazyValue{}, which represents a nil value. 96 type InternalIterator interface { 97 // SeekGE moves the iterator to the first key/value pair whose key is greater 98 // than or equal to the given key. Returns the key and value if the iterator 99 // is pointing at a valid entry, and (nil, nilv) otherwise. Note that SeekGE 100 // only checks the upper bound. It is up to the caller to ensure that key 101 // is greater than or equal to the lower bound. 102 SeekGE(key []byte, flags SeekGEFlags) (*InternalKey, LazyValue) 103 104 // SeekPrefixGE moves the iterator to the first key/value pair whose key is 105 // greater than or equal to the given key. Returns the key and value if the 106 // iterator is pointing at a valid entry, and (nil, nilv) otherwise. Note that 107 // SeekPrefixGE only checks the upper bound. It is up to the caller to ensure 108 // that key is greater than or equal to the lower bound. 109 // 110 // The prefix argument is used by some InternalIterator implementations (e.g. 111 // sstable.Reader) to avoid expensive operations. A user-defined Split 112 // function must be supplied to the Comparer for the DB. The supplied prefix 113 // will be the prefix of the given key returned by that Split function. If 114 // the iterator is able to determine that no key with the prefix exists, it 115 // can return (nil,nilv). Unlike SeekGE, this is not an indication that 116 // iteration is exhausted. 117 // 118 // Note that the iterator may return keys not matching the prefix. It is up 119 // to the caller to check if the prefix matches. 120 // 121 // Calling SeekPrefixGE places the receiver into prefix iteration mode. Once 122 // in this mode, reverse iteration may not be supported and will return an 123 // error. Note that pebble/Iterator.SeekPrefixGE has this same restriction on 124 // not supporting reverse iteration in prefix iteration mode until a 125 // different positioning routine (SeekGE, SeekLT, First or Last) switches the 126 // iterator out of prefix iteration. 127 SeekPrefixGE(prefix, key []byte, flags SeekGEFlags) (*InternalKey, LazyValue) 128 129 // SeekLT moves the iterator to the last key/value pair whose key is less 130 // than the given key. Returns the key and value if the iterator is pointing 131 // at a valid entry, and (nil, nilv) otherwise. Note that SeekLT only checks 132 // the lower bound. It is up to the caller to ensure that key is less than 133 // the upper bound. 134 SeekLT(key []byte, flags SeekLTFlags) (*InternalKey, LazyValue) 135 136 // First moves the iterator the the first key/value pair. Returns the key and 137 // value if the iterator is pointing at a valid entry, and (nil, nilv) 138 // otherwise. Note that First only checks the upper bound. It is up to the 139 // caller to ensure that First() is not called when there is a lower bound, 140 // and instead call SeekGE(lower). 141 First() (*InternalKey, LazyValue) 142 143 // Last moves the iterator the the last key/value pair. Returns the key and 144 // value if the iterator is pointing at a valid entry, and (nil, nilv) 145 // otherwise. Note that Last only checks the lower bound. It is up to the 146 // caller to ensure that Last() is not called when there is an upper bound, 147 // and instead call SeekLT(upper). 148 Last() (*InternalKey, LazyValue) 149 150 // Next moves the iterator to the next key/value pair. Returns the key and 151 // value if the iterator is pointing at a valid entry, and (nil, nilv) 152 // otherwise. Note that Next only checks the upper bound. It is up to the 153 // caller to ensure that key is greater than or equal to the lower bound. 154 // 155 // It is valid to call Next when the iterator is positioned before the first 156 // key/value pair due to either a prior call to SeekLT or Prev which returned 157 // (nil, nilv). It is not allowed to call Next when the previous call to SeekGE, 158 // SeekPrefixGE or Next returned (nil, nilv). 159 Next() (*InternalKey, LazyValue) 160 161 // NextPrefix moves the iterator to the next key/value pair with a different 162 // prefix than the key at the current iterator position. Returns the key and 163 // value if the iterator is pointing at a valid entry, and (nil, nil) 164 // otherwise. Note that NextPrefix only checks the upper bound. It is up to 165 // the caller to ensure that key is greater than or equal to the lower 166 // bound. 167 // 168 // NextPrefix is passed the immediate successor to the current prefix key. A 169 // valid implementation of NextPrefix is to call SeekGE with succKey. 170 // 171 // It is not allowed to call NextPrefix when the previous call was a reverse 172 // positioning operation or a call to a forward positioning method that 173 // returned (nil, nilv). It is also not allowed to call NextPrefix when the 174 // iterator is in prefix iteration mode. 175 NextPrefix(succKey []byte) (*InternalKey, LazyValue) 176 177 // Prev moves the iterator to the previous key/value pair. Returns the key 178 // and value if the iterator is pointing at a valid entry, and (nil, nilv) 179 // otherwise. Note that Prev only checks the lower bound. It is up to the 180 // caller to ensure that key is less than the upper bound. 181 // 182 // It is valid to call Prev when the iterator is positioned after the last 183 // key/value pair due to either a prior call to SeekGE or Next which returned 184 // (nil, nilv). It is not allowed to call Prev when the previous call to SeekLT 185 // or Prev returned (nil, nilv). 186 Prev() (*InternalKey, LazyValue) 187 188 // Error returns any accumulated error. It may not include errors returned 189 // to the client when calling LazyValue.Value(). 190 Error() error 191 192 // Close closes the iterator and returns any accumulated error. Exhausting 193 // all the key/value pairs in a table is not considered to be an error. 194 // It is valid to call Close multiple times. Other methods should not be 195 // called after the iterator has been closed. 196 Close() error 197 198 // SetBounds sets the lower and upper bounds for the iterator. Note that the 199 // result of Next and Prev will be undefined until the iterator has been 200 // repositioned with SeekGE, SeekPrefixGE, SeekLT, First, or Last. 201 // 202 // The bounds provided must remain valid until a subsequent call to 203 // SetBounds has returned. This requirement exists so that iterator 204 // implementations may compare old and new bounds to apply low-level 205 // optimizations. 206 SetBounds(lower, upper []byte) 207 208 // SetContext replaces the context provided at iterator creation, or the 209 // last one provided by SetContext. 210 SetContext(ctx context.Context) 211 212 fmt.Stringer 213 } 214 215 // SeekGEFlags holds flags that may configure the behavior of a forward seek. 216 // Not all flags are relevant to all iterators. 217 type SeekGEFlags uint8 218 219 const ( 220 seekGEFlagTrySeekUsingNext uint8 = iota 221 seekGEFlagRelativeSeek 222 seekGEFlagBatchJustRefreshed 223 ) 224 225 // SeekGEFlagsNone is the default value of SeekGEFlags, with all flags disabled. 226 const SeekGEFlagsNone = SeekGEFlags(0) 227 228 // TrySeekUsingNext indicates whether a performance optimization was enabled 229 // by a caller, indicating the caller has not done any action to move this 230 // iterator beyond the first key that would be found if this iterator were to 231 // honestly do the intended seek. For example, say the caller did a 232 // SeekGE(k1...), followed by SeekGE(k2...) where k1 <= k2, without any 233 // intermediate positioning calls. The caller can safely specify true for this 234 // parameter in the second call. As another example, say the caller did do one 235 // call to Next between the two Seek calls, and k1 < k2. Again, the caller can 236 // safely specify a true value for this parameter. Note that a false value is 237 // always safe. The callee is free to ignore the true value if its 238 // implementation does not permit this optimization. 239 // 240 // We make the caller do this determination since a string comparison of k1, k2 241 // is not necessarily cheap, and there may be many iterators in the iterator 242 // stack. Doing it once at the root of the iterator stack is cheaper. 243 // 244 // This optimization could also be applied to SeekLT (where it would be 245 // trySeekUsingPrev). We currently only do it for SeekPrefixGE and SeekGE 246 // because this is where this optimization helps the performance of CockroachDB. 247 // The SeekLT cases in CockroachDB are typically accompanied with bounds that 248 // change between seek calls, and is optimized inside certain iterator 249 // implementations, like singleLevelIterator, without any extra parameter 250 // passing (though the same amortization of string comparisons could be done to 251 // improve that optimization, by making the root of the iterator stack do it). 252 func (s SeekGEFlags) TrySeekUsingNext() bool { return (s & (1 << seekGEFlagTrySeekUsingNext)) != 0 } 253 254 // RelativeSeek is set when in the course of a forward positioning operation, a 255 // higher-level iterator seeks a lower-level iterator to a larger key than the 256 // one at the current iterator position. 257 // 258 // Concretely, this occurs when the merging iterator observes a range deletion 259 // covering the key at a level's current position, and the merging iterator 260 // seeks the level to the range deletion's end key. During lazy-combined 261 // iteration, this flag signals to the level iterator that the seek is NOT an 262 // absolute-positioning operation from the perspective of the pebble.Iterator, 263 // and the level iterator must look for range keys in tables between the current 264 // iterator position and the new seeked position. 265 func (s SeekGEFlags) RelativeSeek() bool { return (s & (1 << seekGEFlagRelativeSeek)) != 0 } 266 267 // BatchJustRefreshed is set by Seek[Prefix]GE when an iterator's view of an 268 // indexed batch was just refreshed. It serves as a signal to the batch iterator 269 // to ignore the TrySeekUsingNext optimization, because the external knowledge 270 // imparted by the TrySeekUsingNext flag does not apply to the batch iterator's 271 // position. See (pebble.Iterator).batchJustRefreshed. 272 func (s SeekGEFlags) BatchJustRefreshed() bool { return (s & (1 << seekGEFlagBatchJustRefreshed)) != 0 } 273 274 // EnableTrySeekUsingNext returns the provided flags with the 275 // try-seek-using-next optimization enabled. See TrySeekUsingNext for an 276 // explanation of this optimization. 277 func (s SeekGEFlags) EnableTrySeekUsingNext() SeekGEFlags { 278 return s | (1 << seekGEFlagTrySeekUsingNext) 279 } 280 281 // DisableTrySeekUsingNext returns the provided flags with the 282 // try-seek-using-next optimization disabled. 283 func (s SeekGEFlags) DisableTrySeekUsingNext() SeekGEFlags { 284 return s &^ (1 << seekGEFlagTrySeekUsingNext) 285 } 286 287 // EnableRelativeSeek returns the provided flags with the relative-seek flag 288 // enabled. See RelativeSeek for an explanation of this flag's use. 289 func (s SeekGEFlags) EnableRelativeSeek() SeekGEFlags { 290 return s | (1 << seekGEFlagRelativeSeek) 291 } 292 293 // DisableRelativeSeek returns the provided flags with the relative-seek flag 294 // disabled. 295 func (s SeekGEFlags) DisableRelativeSeek() SeekGEFlags { 296 return s &^ (1 << seekGEFlagRelativeSeek) 297 } 298 299 // EnableBatchJustRefreshed returns the provided flags with the 300 // batch-just-refreshed bit set. See BatchJustRefreshed for an explanation of 301 // this flag. 302 func (s SeekGEFlags) EnableBatchJustRefreshed() SeekGEFlags { 303 return s | (1 << seekGEFlagBatchJustRefreshed) 304 } 305 306 // DisableBatchJustRefreshed returns the provided flags with the 307 // batch-just-refreshed bit unset. 308 func (s SeekGEFlags) DisableBatchJustRefreshed() SeekGEFlags { 309 return s &^ (1 << seekGEFlagBatchJustRefreshed) 310 } 311 312 // SeekLTFlags holds flags that may configure the behavior of a reverse seek. 313 // Not all flags are relevant to all iterators. 314 type SeekLTFlags uint8 315 316 const ( 317 seekLTFlagRelativeSeek uint8 = iota 318 ) 319 320 // SeekLTFlagsNone is the default value of SeekLTFlags, with all flags disabled. 321 const SeekLTFlagsNone = SeekLTFlags(0) 322 323 // RelativeSeek is set when in the course of a reverse positioning operation, a 324 // higher-level iterator seeks a lower-level iterator to a smaller key than the 325 // one at the current iterator position. 326 // 327 // Concretely, this occurs when the merging iterator observes a range deletion 328 // covering the key at a level's current position, and the merging iterator 329 // seeks the level to the range deletion's start key. During lazy-combined 330 // iteration, this flag signals to the level iterator that the seek is NOT an 331 // absolute-positioning operation from the perspective of the pebble.Iterator, 332 // and the level iterator must look for range keys in tables between the current 333 // iterator position and the new seeked position. 334 func (s SeekLTFlags) RelativeSeek() bool { return s&(1<<seekLTFlagRelativeSeek) != 0 } 335 336 // EnableRelativeSeek returns the provided flags with the relative-seek flag 337 // enabled. See RelativeSeek for an explanation of this flag's use. 338 func (s SeekLTFlags) EnableRelativeSeek() SeekLTFlags { 339 return s | (1 << seekLTFlagRelativeSeek) 340 } 341 342 // DisableRelativeSeek returns the provided flags with the relative-seek flag 343 // disabled. 344 func (s SeekLTFlags) DisableRelativeSeek() SeekLTFlags { 345 return s &^ (1 << seekLTFlagRelativeSeek) 346 } 347 348 // InternalIteratorStats contains miscellaneous stats produced by 349 // InternalIterators that are part of the InternalIterator tree. Not every 350 // field is relevant for an InternalIterator implementation. The field values 351 // are aggregated as one goes up the InternalIterator tree. 352 type InternalIteratorStats struct { 353 // Bytes in the loaded blocks. If the block was compressed, this is the 354 // compressed bytes. Currently, only the index blocks, data blocks 355 // containing points, and filter blocks are included. 356 BlockBytes uint64 357 // Subset of BlockBytes that were in the block cache. 358 BlockBytesInCache uint64 359 // BlockReadDuration accumulates the duration spent fetching blocks 360 // due to block cache misses. 361 // TODO(sumeer): this currently excludes the time spent in Reader creation, 362 // and in reading the rangedel and rangekey blocks. Fix that. 363 BlockReadDuration time.Duration 364 // The following can repeatedly count the same points if they are iterated 365 // over multiple times. Additionally, they may count a point twice when 366 // switching directions. The latter could be improved if needed. 367 368 // Bytes in keys that were iterated over. Currently, only point keys are 369 // included. 370 KeyBytes uint64 371 // Bytes in values that were iterated over. Currently, only point values are 372 // included. For separated values, this is the size of the handle. 373 ValueBytes uint64 374 // The count of points iterated over. 375 PointCount uint64 376 // Points that were iterated over that were covered by range tombstones. It 377 // can be useful for discovering instances of 378 // https://github.com/cockroachdb/pebble/issues/1070. 379 PointsCoveredByRangeTombstones uint64 380 381 // Stats related to points in value blocks encountered during iteration. 382 // These are useful to understand outliers, since typical user facing 383 // iteration should tend to only look at the latest point, and hence have 384 // the following stats close to 0. 385 SeparatedPointValue struct { 386 // Count is a count of points that were in value blocks. This is not a 387 // subset of PointCount: PointCount is produced by mergingIter and if 388 // positioned once, and successful in returning a point, will have a 389 // PointCount of 1, regardless of how many sstables (and memtables etc.) 390 // in the heap got positioned. The count here includes every sstable 391 // iterator that got positioned in the heap. 392 Count uint64 393 // ValueBytes represent the total byte length of the values (in value 394 // blocks) of the points corresponding to Count. 395 ValueBytes uint64 396 // ValueBytesFetched is the total byte length of the values (in value 397 // blocks) that were retrieved. 398 ValueBytesFetched uint64 399 } 400 } 401 402 // Merge merges the stats in from into the given stats. 403 func (s *InternalIteratorStats) Merge(from InternalIteratorStats) { 404 s.BlockBytes += from.BlockBytes 405 s.BlockBytesInCache += from.BlockBytesInCache 406 s.BlockReadDuration += from.BlockReadDuration 407 s.KeyBytes += from.KeyBytes 408 s.ValueBytes += from.ValueBytes 409 s.PointCount += from.PointCount 410 s.PointsCoveredByRangeTombstones += from.PointsCoveredByRangeTombstones 411 s.SeparatedPointValue.Count += from.SeparatedPointValue.Count 412 s.SeparatedPointValue.ValueBytes += from.SeparatedPointValue.ValueBytes 413 s.SeparatedPointValue.ValueBytesFetched += from.SeparatedPointValue.ValueBytesFetched 414 }