// github.com/cockroachdb/pebble@v1.1.2/internal/base/iterator.go

// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package base

import (
	"fmt"
	"time"

	"github.com/cockroachdb/pebble/internal/humanize"
	"github.com/cockroachdb/redact"
)

// InternalIterator iterates over a DB's key/value pairs in key order. Unlike
// the Iterator interface, the returned keys are InternalKeys composed of the
// user-key, a sequence number and a key kind. In forward iteration, key/value
// pairs for identical user-keys are returned in descending sequence order. In
// reverse iteration, key/value pairs for identical user-keys are returned in
// ascending sequence order.
//
// InternalIterators provide 5 absolute positioning methods and 2 relative
// positioning methods. The absolute positioning methods are:
//
//   - SeekGE
//   - SeekPrefixGE
//   - SeekLT
//   - First
//   - Last
//
// The relative positioning methods are:
//
//   - Next
//   - Prev
//
// The relative positioning methods can be used in conjunction with any of the
// absolute positioning methods with one exception: SeekPrefixGE does not
// support reverse iteration via Prev. It is undefined to call relative
// positioning methods without ever calling an absolute positioning method.
//
// InternalIterators can optionally implement a prefix iteration mode. This
// mode is entered by calling SeekPrefixGE and exited by any other absolute
// positioning method (SeekGE, SeekLT, First, Last). When in prefix iteration
// mode, a call to Next will advance to the next key which has the same
// "prefix" as the one supplied to SeekPrefixGE. Note that "prefix" in this
// context is not a strict byte prefix, but defined by byte equality for the
// result of the Comparer.Split method. An InternalIterator is not required to
// support prefix iteration mode, and can implement SeekPrefixGE by forwarding
// to SeekGE. When the iteration prefix is exhausted, it is not valid to call
// Next on an internal iterator that's already returned (nil,nilv) or a key
// beyond the prefix.
//
// Bounds, [lower, upper), can be set on iterators, either using the SetBounds()
// function in the interface, or in implementation specific ways during iterator
// creation. The forward positioning routines (SeekGE, First, and Next) only
// check the upper bound. The reverse positioning routines (SeekLT, Last, and
// Prev) only check the lower bound. It is up to the caller to ensure that the
// forward positioning routines respect the lower bound and the reverse
// positioning routines respect the upper bound (i.e. calling SeekGE instead of
// First if there is a lower bound, and SeekLT instead of Last if there is an
// upper bound). This imposition is done in order to elevate that enforcement to
// the caller (generally pebble.Iterator or pebble.mergingIter) rather than
// having it duplicated in every InternalIterator implementation.
//
// Additionally, the caller needs to ensure that SeekGE/SeekPrefixGE are not
// called with a key > the upper bound, and SeekLT is not called with a key <
// the lower bound. InternalIterator implementations are required to respect
// the iterator bounds, never returning records outside of the bounds with one
// exception: an iterator may generate synthetic RANGEDEL marker records. See
// levelIter.syntheticBoundary for the sole existing example of this behavior.
// Specifically, levelIter can return synthetic keys whose user key is equal to
// the lower/upper bound.
//
// The bounds provided to an internal iterator must remain valid until a
// subsequent call to SetBounds has returned. This requirement exists so that
// iterator implementations may compare old and new bounds to apply low-level
// optimizations. The pebble.Iterator satisfies this requirement by maintaining
// two bound buffers and switching between them.
//
// An iterator must be closed after use, but it is not necessary to read an
// iterator until exhaustion.
//
// An iterator is not goroutine-safe, but it is safe to use multiple iterators
// concurrently, either in separate goroutines or switching between the
// iterators in a single goroutine.
//
// It is also safe to use an iterator concurrently with modifying its
// underlying DB, if that DB permits modification. However, the resultant
// key/value pairs are not guaranteed to be a consistent snapshot of that DB
// at a particular point in time.
//
// InternalIterators accumulate errors encountered during operation, exposing
// them through the Error method. All of the absolute positioning methods
// reset any accumulated error before positioning. Relative positioning
// methods return without advancing if the iterator has accumulated an error.
//
// nilv == shorthand for LazyValue{}, which represents a nil value.
type InternalIterator interface {
	// SeekGE moves the iterator to the first key/value pair whose key is greater
	// than or equal to the given key. Returns the key and value if the iterator
	// is pointing at a valid entry, and (nil, nilv) otherwise. Note that SeekGE
	// only checks the upper bound. It is up to the caller to ensure that key
	// is greater than or equal to the lower bound.
	SeekGE(key []byte, flags SeekGEFlags) (*InternalKey, LazyValue)

	// SeekPrefixGE moves the iterator to the first key/value pair whose key is
	// greater than or equal to the given key. Returns the key and value if the
	// iterator is pointing at a valid entry, and (nil, nilv) otherwise. Note that
	// SeekPrefixGE only checks the upper bound. It is up to the caller to ensure
	// that key is greater than or equal to the lower bound.
	//
	// The prefix argument is used by some InternalIterator implementations (e.g.
	// sstable.Reader) to avoid expensive operations. A user-defined Split
	// function must be supplied to the Comparer for the DB. The supplied prefix
	// will be the prefix of the given key returned by that Split function. If
	// the iterator is able to determine that no key with the prefix exists, it
	// can return (nil,nilv). Unlike SeekGE, this is not an indication that
	// iteration is exhausted.
	//
	// Note that the iterator may return keys not matching the prefix. It is up
	// to the caller to check if the prefix matches.
	//
	// Calling SeekPrefixGE places the receiver into prefix iteration mode. Once
	// in this mode, reverse iteration may not be supported and will return an
	// error. Note that pebble/Iterator.SeekPrefixGE has this same restriction on
	// not supporting reverse iteration in prefix iteration mode until a
	// different positioning routine (SeekGE, SeekLT, First or Last) switches the
	// iterator out of prefix iteration.
	SeekPrefixGE(prefix, key []byte, flags SeekGEFlags) (*InternalKey, LazyValue)

	// SeekLT moves the iterator to the last key/value pair whose key is less
	// than the given key. Returns the key and value if the iterator is pointing
	// at a valid entry, and (nil, nilv) otherwise. Note that SeekLT only checks
	// the lower bound. It is up to the caller to ensure that key is less than
	// the upper bound.
	SeekLT(key []byte, flags SeekLTFlags) (*InternalKey, LazyValue)

	// First moves the iterator to the first key/value pair. Returns the key and
	// value if the iterator is pointing at a valid entry, and (nil, nilv)
	// otherwise. Note that First only checks the upper bound. It is up to the
	// caller to ensure that First() is not called when there is a lower bound,
	// and instead call SeekGE(lower).
	First() (*InternalKey, LazyValue)

	// Last moves the iterator to the last key/value pair. Returns the key and
	// value if the iterator is pointing at a valid entry, and (nil, nilv)
	// otherwise. Note that Last only checks the lower bound. It is up to the
	// caller to ensure that Last() is not called when there is an upper bound,
	// and instead call SeekLT(upper).
	Last() (*InternalKey, LazyValue)

	// Next moves the iterator to the next key/value pair. Returns the key and
	// value if the iterator is pointing at a valid entry, and (nil, nilv)
	// otherwise. Note that Next only checks the upper bound. It is up to the
	// caller to ensure that key is greater than or equal to the lower bound.
	//
	// It is valid to call Next when the iterator is positioned before the first
	// key/value pair due to either a prior call to SeekLT or Prev which returned
	// (nil, nilv). It is not allowed to call Next when the previous call to SeekGE,
	// SeekPrefixGE or Next returned (nil, nilv).
	Next() (*InternalKey, LazyValue)

	// NextPrefix moves the iterator to the next key/value pair with a different
	// prefix than the key at the current iterator position. Returns the key and
	// value if the iterator is pointing at a valid entry, and (nil, nilv)
	// otherwise. Note that NextPrefix only checks the upper bound. It is up to
	// the caller to ensure that key is greater than or equal to the lower
	// bound.
	//
	// NextPrefix is passed the immediate successor to the current prefix key. A
	// valid implementation of NextPrefix is to call SeekGE with succKey.
	//
	// It is not allowed to call NextPrefix when the previous call was a reverse
	// positioning operation or a call to a forward positioning method that
	// returned (nil, nilv). It is also not allowed to call NextPrefix when the
	// iterator is in prefix iteration mode.
	NextPrefix(succKey []byte) (*InternalKey, LazyValue)

	// Prev moves the iterator to the previous key/value pair. Returns the key
	// and value if the iterator is pointing at a valid entry, and (nil, nilv)
	// otherwise. Note that Prev only checks the lower bound. It is up to the
	// caller to ensure that key is less than the upper bound.
	//
	// It is valid to call Prev when the iterator is positioned after the last
	// key/value pair due to either a prior call to SeekGE or Next which returned
	// (nil, nilv). It is not allowed to call Prev when the previous call to SeekLT
	// or Prev returned (nil, nilv).
	Prev() (*InternalKey, LazyValue)

	// Error returns any accumulated error. It may not include errors returned
	// to the client when calling LazyValue.Value().
	Error() error

	// Close closes the iterator and returns any accumulated error. Exhausting
	// all the key/value pairs in a table is not considered to be an error.
	// It is valid to call Close multiple times. Other methods should not be
	// called after the iterator has been closed.
	Close() error

	// SetBounds sets the lower and upper bounds for the iterator. Note that the
	// result of Next and Prev will be undefined until the iterator has been
	// repositioned with SeekGE, SeekPrefixGE, SeekLT, First, or Last.
	//
	// The bounds provided must remain valid until a subsequent call to
	// SetBounds has returned. This requirement exists so that iterator
	// implementations may compare old and new bounds to apply low-level
	// optimizations.
	SetBounds(lower, upper []byte)

	// Every InternalIterator must also provide a String method.
	fmt.Stringer
}

// SeekGEFlags holds flags that may configure the behavior of a forward seek.
// Not all flags are relevant to all iterators.
type SeekGEFlags uint8

// Bit positions (not masks) of the individual SeekGEFlags flags. A flag's
// mask is obtained by shifting 1 left by its position.
const (
	seekGEFlagTrySeekUsingNext uint8 = iota
	seekGEFlagRelativeSeek
	seekGEFlagBatchJustRefreshed
)

// SeekGEFlagsNone is the default value of SeekGEFlags, with all flags disabled.
const SeekGEFlagsNone = SeekGEFlags(0)

// TrySeekUsingNext reports whether the caller enabled a performance
// optimization by promising that it has not moved this iterator beyond the
// first key that an honest execution of the intended seek would find. For
// example, suppose the caller did a SeekGE(k1...) followed by SeekGE(k2...)
// where k1 <= k2, with no positioning calls in between: the second call may
// safely set this flag. Likewise, if the caller made exactly one call to Next
// between the two seeks and k1 < k2, the flag may safely be set. Leaving the
// flag unset is always safe, and the callee is free to ignore a set flag if
// its implementation does not permit this optimization.
//
// The determination is pushed to the caller because a string comparison of
// k1 and k2 is not necessarily cheap and there may be many iterators in the
// iterator stack; doing it once at the root of the stack is cheaper.
//
// This optimization could also be applied to SeekLT (where it would be
// trySeekUsingPrev). It is currently only done for SeekPrefixGE and SeekGE
// because this is where this optimization helps the performance of
// CockroachDB. The SeekLT cases in CockroachDB are typically accompanied with
// bounds that change between seek calls, and that case is optimized inside
// certain iterator implementations, like singleLevelIterator, without any
// extra parameter passing (though the same amortization of string comparisons
// could be done to improve that optimization, by making the root of the
// iterator stack do it).
func (s SeekGEFlags) TrySeekUsingNext() bool {
	const mask SeekGEFlags = 1 << seekGEFlagTrySeekUsingNext
	return s&mask != 0
}

// RelativeSeek reports whether the relative-seek flag is set. The flag is set
// when, in the course of a forward positioning operation, a higher-level
// iterator seeks a lower-level iterator to a larger key than the one at the
// current iterator position.
//
// Concretely, this occurs when the merging iterator observes a range deletion
// covering the key at a level's current position, and the merging iterator
// seeks the level to the range deletion's end key. During lazy-combined
// iteration, this flag signals to the level iterator that the seek is NOT an
// absolute-positioning operation from the perspective of the pebble.Iterator,
// and the level iterator must look for range keys in tables between the current
// iterator position and the new seeked position.
func (s SeekGEFlags) RelativeSeek() bool {
	const mask SeekGEFlags = 1 << seekGEFlagRelativeSeek
	return s&mask != 0
}

// BatchJustRefreshed reports whether the batch-just-refreshed flag is set. It
// is set by Seek[Prefix]GE when an iterator's view of an indexed batch was
// just refreshed, and tells the batch iterator to ignore the TrySeekUsingNext
// optimization, because the external knowledge imparted by the
// TrySeekUsingNext flag does not apply to the batch iterator's position. See
// (pebble.Iterator).batchJustRefreshed.
func (s SeekGEFlags) BatchJustRefreshed() bool {
	const mask SeekGEFlags = 1 << seekGEFlagBatchJustRefreshed
	return s&mask != 0
}

// EnableTrySeekUsingNext returns a copy of the flags in which the
// try-seek-using-next optimization is enabled. See TrySeekUsingNext for an
// explanation of this optimization.
func (s SeekGEFlags) EnableTrySeekUsingNext() SeekGEFlags {
	const mask SeekGEFlags = 1 << seekGEFlagTrySeekUsingNext
	return s | mask
}

// DisableTrySeekUsingNext returns a copy of the flags in which the
// try-seek-using-next optimization is disabled.
func (s SeekGEFlags) DisableTrySeekUsingNext() SeekGEFlags {
	const mask SeekGEFlags = 1 << seekGEFlagTrySeekUsingNext
	return s &^ mask
}

// EnableRelativeSeek returns a copy of the flags in which the relative-seek
// flag is enabled. See RelativeSeek for an explanation of this flag's use.
func (s SeekGEFlags) EnableRelativeSeek() SeekGEFlags {
	const mask SeekGEFlags = 1 << seekGEFlagRelativeSeek
	return s | mask
}

// DisableRelativeSeek returns a copy of the flags in which the relative-seek
// flag is disabled.
func (s SeekGEFlags) DisableRelativeSeek() SeekGEFlags {
	const mask SeekGEFlags = 1 << seekGEFlagRelativeSeek
	return s &^ mask
}

// EnableBatchJustRefreshed returns a copy of the flags in which the
// batch-just-refreshed bit is set. See BatchJustRefreshed for an explanation
// of this flag.
func (s SeekGEFlags) EnableBatchJustRefreshed() SeekGEFlags {
	const mask SeekGEFlags = 1 << seekGEFlagBatchJustRefreshed
	return s | mask
}

// DisableBatchJustRefreshed returns a copy of the flags in which the
// batch-just-refreshed bit is unset.
func (s SeekGEFlags) DisableBatchJustRefreshed() SeekGEFlags {
	const mask SeekGEFlags = 1 << seekGEFlagBatchJustRefreshed
	return s &^ mask
}

// SeekLTFlags holds flags that may configure the behavior of a reverse seek.
// Not all flags are relevant to all iterators.
type SeekLTFlags uint8

// Bit positions (not masks) of the individual SeekLTFlags flags.
const (
	seekLTFlagRelativeSeek uint8 = iota
)

// SeekLTFlagsNone is the default value of SeekLTFlags, with all flags disabled.
319 const SeekLTFlagsNone = SeekLTFlags(0) 320 321 // RelativeSeek is set when in the course of a reverse positioning operation, a 322 // higher-level iterator seeks a lower-level iterator to a smaller key than the 323 // one at the current iterator position. 324 // 325 // Concretely, this occurs when the merging iterator observes a range deletion 326 // covering the key at a level's current position, and the merging iterator 327 // seeks the level to the range deletion's start key. During lazy-combined 328 // iteration, this flag signals to the level iterator that the seek is NOT an 329 // absolute-positioning operation from the perspective of the pebble.Iterator, 330 // and the level iterator must look for range keys in tables between the current 331 // iterator position and the new seeked position. 332 func (s SeekLTFlags) RelativeSeek() bool { return s&(1<<seekLTFlagRelativeSeek) != 0 } 333 334 // EnableRelativeSeek returns the provided flags with the relative-seek flag 335 // enabled. See RelativeSeek for an explanation of this flag's use. 336 func (s SeekLTFlags) EnableRelativeSeek() SeekLTFlags { 337 return s | (1 << seekLTFlagRelativeSeek) 338 } 339 340 // DisableRelativeSeek returns the provided flags with the relative-seek flag 341 // disabled. 342 func (s SeekLTFlags) DisableRelativeSeek() SeekLTFlags { 343 return s &^ (1 << seekLTFlagRelativeSeek) 344 } 345 346 // InternalIteratorStats contains miscellaneous stats produced by 347 // InternalIterators that are part of the InternalIterator tree. Not every 348 // field is relevant for an InternalIterator implementation. The field values 349 // are aggregated as one goes up the InternalIterator tree. 350 type InternalIteratorStats struct { 351 // Bytes in the loaded blocks. If the block was compressed, this is the 352 // compressed bytes. Currently, only the index blocks, data blocks 353 // containing points, and filter blocks are included. 
354 BlockBytes uint64 355 // Subset of BlockBytes that were in the block cache. 356 BlockBytesInCache uint64 357 // BlockReadDuration accumulates the duration spent fetching blocks 358 // due to block cache misses. 359 // TODO(sumeer): this currently excludes the time spent in Reader creation, 360 // and in reading the rangedel and rangekey blocks. Fix that. 361 BlockReadDuration time.Duration 362 // The following can repeatedly count the same points if they are iterated 363 // over multiple times. Additionally, they may count a point twice when 364 // switching directions. The latter could be improved if needed. 365 366 // Bytes in keys that were iterated over. Currently, only point keys are 367 // included. 368 KeyBytes uint64 369 // Bytes in values that were iterated over. Currently, only point values are 370 // included. For separated values, this is the size of the handle. 371 ValueBytes uint64 372 // The count of points iterated over. 373 PointCount uint64 374 // Points that were iterated over that were covered by range tombstones. It 375 // can be useful for discovering instances of 376 // https://github.com/cockroachdb/pebble/issues/1070. 377 PointsCoveredByRangeTombstones uint64 378 379 // Stats related to points in value blocks encountered during iteration. 380 // These are useful to understand outliers, since typical user facing 381 // iteration should tend to only look at the latest point, and hence have 382 // the following stats close to 0. 383 SeparatedPointValue struct { 384 // Count is a count of points that were in value blocks. This is not a 385 // subset of PointCount: PointCount is produced by mergingIter and if 386 // positioned once, and successful in returning a point, will have a 387 // PointCount of 1, regardless of how many sstables (and memtables etc.) 388 // in the heap got positioned. The count here includes every sstable 389 // iterator that got positioned in the heap. 
390 Count uint64 391 // ValueBytes represent the total byte length of the values (in value 392 // blocks) of the points corresponding to Count. 393 ValueBytes uint64 394 // ValueBytesFetched is the total byte length of the values (in value 395 // blocks) that were retrieved. 396 ValueBytesFetched uint64 397 } 398 } 399 400 // Merge merges the stats in from into the given stats. 401 func (s *InternalIteratorStats) Merge(from InternalIteratorStats) { 402 s.BlockBytes += from.BlockBytes 403 s.BlockBytesInCache += from.BlockBytesInCache 404 s.BlockReadDuration += from.BlockReadDuration 405 s.KeyBytes += from.KeyBytes 406 s.ValueBytes += from.ValueBytes 407 s.PointCount += from.PointCount 408 s.PointsCoveredByRangeTombstones += from.PointsCoveredByRangeTombstones 409 s.SeparatedPointValue.Count += from.SeparatedPointValue.Count 410 s.SeparatedPointValue.ValueBytes += from.SeparatedPointValue.ValueBytes 411 s.SeparatedPointValue.ValueBytesFetched += from.SeparatedPointValue.ValueBytesFetched 412 } 413 414 func (s *InternalIteratorStats) String() string { 415 return redact.StringWithoutMarkers(s) 416 } 417 418 // SafeFormat implements the redact.SafeFormatter interface. 
419 func (s *InternalIteratorStats) SafeFormat(p redact.SafePrinter, verb rune) { 420 p.Printf("blocks: %s cached", 421 humanize.Bytes.Uint64(s.BlockBytesInCache), 422 ) 423 if s.BlockBytes != s.BlockBytesInCache || s.BlockReadDuration != 0 { 424 p.Printf(", %s not cached (read time: %s)", 425 humanize.Bytes.Uint64(s.BlockBytes-s.BlockBytesInCache), 426 humanize.FormattedString(s.BlockReadDuration.String()), 427 ) 428 } 429 p.Printf("; points: %s", humanize.Count.Uint64(s.PointCount)) 430 431 if s.PointsCoveredByRangeTombstones != 0 { 432 p.Printf("(%s tombstoned)", humanize.Count.Uint64(s.PointsCoveredByRangeTombstones)) 433 } 434 p.Printf(" (%s keys, %s values)", 435 humanize.Bytes.Uint64(s.KeyBytes), 436 humanize.Bytes.Uint64(s.ValueBytes), 437 ) 438 if s.SeparatedPointValue.Count != 0 { 439 p.Printf("; separated: %s (%s, %s fetched)", 440 humanize.Count.Uint64(s.SeparatedPointValue.Count), 441 humanize.Bytes.Uint64(s.SeparatedPointValue.ValueBytes), 442 humanize.Bytes.Uint64(s.SeparatedPointValue.ValueBytesFetched)) 443 } 444 }