github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/pebble_iterator.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package storage 12 13 import ( 14 "bytes" 15 "math" 16 "sync" 17 18 "github.com/cockroachdb/cockroach/pkg/keys" 19 "github.com/cockroachdb/cockroach/pkg/roachpb" 20 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 21 "github.com/cockroachdb/cockroach/pkg/util/hlc" 22 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 23 "github.com/cockroachdb/pebble" 24 ) 25 26 // pebbleIterator is a wrapper around a pebble.Iterator that implements the 27 // Iterator interface. 28 type pebbleIterator struct { 29 // Underlying iterator for the DB. 30 iter *pebble.Iterator 31 options pebble.IterOptions 32 // Reusable buffer for MVCC key encoding. 33 keyBuf []byte 34 // Buffers for copying iterator bounds to. Note that the underlying memory 35 // is not GCed upon Close(), to reduce the number of overall allocations. 36 lowerBoundBuf []byte 37 upperBoundBuf []byte 38 // Set to true to govern whether to call SeekPrefixGE or SeekGE. Skips 39 // SSTables based on MVCC key when true. 40 prefix bool 41 // If reusable is true, Close() does not actually close the underlying 42 // iterator, but simply marks it as not inuse. Used by pebbleReadOnly. 43 reusable bool 44 inuse bool 45 // Stat tracking the number of sstables encountered during time-bound 46 // iteration. 47 timeBoundNumSSTables int 48 } 49 50 var _ Iterator = &pebbleIterator{} 51 52 var pebbleIterPool = sync.Pool{ 53 New: func() interface{} { 54 return &pebbleIterator{} 55 }, 56 } 57 58 // Instantiates a new Pebble iterator, or gets one from the pool. 59 func newPebbleIterator(handle pebble.Reader, opts IterOptions) Iterator { 60 iter := pebbleIterPool.Get().(*pebbleIterator) 61 iter.init(handle, opts) 62 return iter 63 } 64 65 // init resets this pebbleIterator for use with the specified arguments. The 66 // current instance could either be a cached iterator (eg. in pebbleBatch), or 67 // a newly-instantiated one through newPebbleIterator. 68 func (p *pebbleIterator) init(handle pebble.Reader, opts IterOptions) { 69 *p = pebbleIterator{ 70 keyBuf: p.keyBuf, 71 lowerBoundBuf: p.lowerBoundBuf, 72 upperBoundBuf: p.upperBoundBuf, 73 prefix: opts.Prefix, 74 reusable: p.reusable, 75 } 76 77 if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 { 78 panic("iterator must set prefix or upper bound or lower bound") 79 } 80 81 if opts.LowerBound != nil { 82 // This is the same as 83 // p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[:0], MVCCKey{Key: opts.LowerBound}) . 84 // Since we are encoding zero-timestamp MVCC Keys anyway, we can just append 85 // the NUL byte instead of calling EncodeKey which will do the same thing. 86 p.lowerBoundBuf = append(p.lowerBoundBuf[:0], opts.LowerBound...) 87 p.lowerBoundBuf = append(p.lowerBoundBuf, 0x00) 88 p.options.LowerBound = p.lowerBoundBuf 89 } 90 if opts.UpperBound != nil { 91 // Same as above. 92 p.upperBoundBuf = append(p.upperBoundBuf[:0], opts.UpperBound...) 93 p.upperBoundBuf = append(p.upperBoundBuf, 0x00) 94 p.options.UpperBound = p.upperBoundBuf 95 } 96 97 if opts.MaxTimestampHint != (hlc.Timestamp{}) { 98 encodedMinTS := string(encodeTimestamp(opts.MinTimestampHint)) 99 encodedMaxTS := string(encodeTimestamp(opts.MaxTimestampHint)) 100 p.options.TableFilter = func(userProps map[string]string) bool { 101 tableMinTS := userProps["crdb.ts.min"] 102 if len(tableMinTS) == 0 { 103 if opts.WithStats { 104 p.timeBoundNumSSTables++ 105 } 106 return true 107 } 108 tableMaxTS := userProps["crdb.ts.max"] 109 if len(tableMaxTS) == 0 { 110 if opts.WithStats { 111 p.timeBoundNumSSTables++ 112 } 113 return true 114 } 115 used := encodedMaxTS >= tableMinTS && encodedMinTS <= tableMaxTS 116 if used && opts.WithStats { 117 p.timeBoundNumSSTables++ 118 } 119 return used 120 } 121 } else if opts.MinTimestampHint != (hlc.Timestamp{}) { 122 panic("min timestamp hint set without max timestamp hint") 123 } 124 125 p.iter = handle.NewIter(&p.options) 126 if p.iter == nil { 127 panic("unable to create iterator") 128 } 129 130 p.inuse = true 131 } 132 133 func (p *pebbleIterator) setOptions(opts IterOptions) { 134 // Overwrite any stale options from last time. 135 p.options = pebble.IterOptions{} 136 137 if opts.MinTimestampHint != (hlc.Timestamp{}) || opts.MaxTimestampHint != (hlc.Timestamp{}) { 138 panic("iterator with timestamp hints cannot be reused") 139 } 140 if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 { 141 panic("iterator must set prefix or upper bound or lower bound") 142 } 143 144 p.prefix = opts.Prefix 145 if opts.LowerBound != nil { 146 // This is the same as 147 // p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[:0], MVCCKey{Key: opts.LowerBound}) . 148 // Since we are encoding zero-timestamp MVCC Keys anyway, we can just append 149 // the NUL byte instead of calling EncodeKey which will do the same thing. 150 p.lowerBoundBuf = append(p.lowerBoundBuf[:0], opts.LowerBound...) 151 p.lowerBoundBuf = append(p.lowerBoundBuf, 0x00) 152 p.options.LowerBound = p.lowerBoundBuf 153 } 154 if opts.UpperBound != nil { 155 // Same as above. 156 p.upperBoundBuf = append(p.upperBoundBuf[:0], opts.UpperBound...) 157 p.upperBoundBuf = append(p.upperBoundBuf, 0x00) 158 p.options.UpperBound = p.upperBoundBuf 159 } 160 p.iter.SetBounds(p.options.LowerBound, p.options.UpperBound) 161 } 162 163 // Close implements the Iterator interface. 164 func (p *pebbleIterator) Close() { 165 if !p.inuse { 166 panic("closing idle iterator") 167 } 168 p.inuse = false 169 170 if p.reusable { 171 return 172 } 173 174 p.destroy() 175 176 pebbleIterPool.Put(p) 177 } 178 179 // SeekGE implements the Iterator interface. 180 func (p *pebbleIterator) SeekGE(key MVCCKey) { 181 p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], key) 182 if p.prefix { 183 p.iter.SeekPrefixGE(p.keyBuf) 184 } else { 185 p.iter.SeekGE(p.keyBuf) 186 } 187 } 188 189 // Valid implements the Iterator interface. 190 func (p *pebbleIterator) Valid() (bool, error) { 191 // NB: A Pebble Iterator always returns Valid()==false when an error is 192 // present. If Valid() is true, there is no error. 193 if ok := p.iter.Valid(); ok { 194 return ok, nil 195 } 196 return false, p.iter.Error() 197 } 198 199 // Next implements the Iterator interface. 200 func (p *pebbleIterator) Next() { 201 p.iter.Next() 202 } 203 204 // NextKey implements the Iterator interface. 205 func (p *pebbleIterator) NextKey() { 206 if valid, err := p.Valid(); err != nil || !valid { 207 return 208 } 209 p.keyBuf = append(p.keyBuf[:0], p.UnsafeKey().Key...) 210 if !p.iter.Next() { 211 return 212 } 213 if bytes.Equal(p.keyBuf, p.UnsafeKey().Key) { 214 // This is equivalent to: 215 // p.iter.SeekGE(EncodeKey(MVCCKey{p.UnsafeKey().Key.Next(), hlc.Timestamp{}})) 216 p.iter.SeekGE(append(p.keyBuf, 0, 0)) 217 } 218 } 219 220 // UnsafeKey implements the Iterator interface. 221 func (p *pebbleIterator) UnsafeKey() MVCCKey { 222 if valid, err := p.Valid(); err != nil || !valid { 223 return MVCCKey{} 224 } 225 226 mvccKey, err := DecodeMVCCKey(p.iter.Key()) 227 if err != nil { 228 return MVCCKey{} 229 } 230 231 return mvccKey 232 } 233 234 // unsafeRawKey returns the raw key from the underlying pebble.Iterator. 235 func (p *pebbleIterator) unsafeRawKey() []byte { 236 return p.iter.Key() 237 } 238 239 // UnsafeValue implements the Iterator interface. 240 func (p *pebbleIterator) UnsafeValue() []byte { 241 if valid, err := p.Valid(); err != nil || !valid { 242 return nil 243 } 244 return p.iter.Value() 245 } 246 247 // SeekLT implements the Iterator interface. 248 func (p *pebbleIterator) SeekLT(key MVCCKey) { 249 p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], key) 250 p.iter.SeekLT(p.keyBuf) 251 } 252 253 // Prev implements the Iterator interface. 254 func (p *pebbleIterator) Prev() { 255 p.iter.Prev() 256 } 257 258 // Key implements the Iterator interface. 259 func (p *pebbleIterator) Key() MVCCKey { 260 key := p.UnsafeKey() 261 keyCopy := make([]byte, len(key.Key)) 262 copy(keyCopy, key.Key) 263 key.Key = keyCopy 264 return key 265 } 266 267 // Value implements the Iterator interface. 268 func (p *pebbleIterator) Value() []byte { 269 value := p.UnsafeValue() 270 valueCopy := make([]byte, len(value)) 271 copy(valueCopy, value) 272 return valueCopy 273 } 274 275 // ValueProto implements the Iterator interface. 276 func (p *pebbleIterator) ValueProto(msg protoutil.Message) error { 277 value := p.UnsafeValue() 278 279 return protoutil.Unmarshal(value, msg) 280 } 281 282 // ComputeStats implements the Iterator interface. 283 func (p *pebbleIterator) ComputeStats( 284 start, end roachpb.Key, nowNanos int64, 285 ) (enginepb.MVCCStats, error) { 286 return ComputeStatsGo(p, start, end, nowNanos) 287 } 288 289 // Go-only version of IsValidSplitKey. Checks if the specified key is in 290 // NoSplitSpans. 291 func isValidSplitKey(key roachpb.Key, noSplitSpans []roachpb.Span) bool { 292 for i := range noSplitSpans { 293 if noSplitSpans[i].ContainsKey(key) { 294 return false 295 } 296 } 297 return true 298 } 299 300 // FindSplitKey implements the Iterator interface. 301 func (p *pebbleIterator) FindSplitKey( 302 start, end, minSplitKey roachpb.Key, targetSize int64, 303 ) (MVCCKey, error) { 304 const timestampLen = 12 305 306 sizeSoFar := int64(0) 307 bestDiff := int64(math.MaxInt64) 308 bestSplitKey := MVCCKey{} 309 // found indicates that we have found a valid split key that is the best 310 // known so far. If bestSplitKey is empty => that split key 311 // is in prevKey, else it is in bestSplitKey. 312 found := false 313 prevKey := MVCCKey{} 314 315 // We only have to consider no-split spans if our minimum split key possibly 316 // lies before them. Note that the no-split spans are ordered by end-key. 317 noSplitSpans := keys.NoSplitSpans 318 for i := range noSplitSpans { 319 if minSplitKey.Compare(noSplitSpans[i].EndKey) <= 0 { 320 noSplitSpans = noSplitSpans[i:] 321 break 322 } 323 } 324 325 // Note that it is unnecessary to compare against "end" to decide to 326 // terminate iteration because the iterator's upper bound has already been 327 // set to end. 328 mvccMinSplitKey := MakeMVCCMetadataKey(minSplitKey) 329 p.SeekGE(MakeMVCCMetadataKey(start)) 330 for ; p.iter.Valid(); p.iter.Next() { 331 mvccKey, err := DecodeMVCCKey(p.iter.Key()) 332 if err != nil { 333 return MVCCKey{}, err 334 } 335 336 diff := targetSize - sizeSoFar 337 if diff < 0 { 338 diff = -diff 339 } 340 if diff > bestDiff { 341 // diff will keep increasing past this point. And we must have had a valid 342 // candidate in the past since we can't be worse than MaxInt64. 343 break 344 } 345 346 if mvccMinSplitKey.Key != nil && !mvccKey.Less(mvccMinSplitKey) { 347 // mvccKey is >= mvccMinSplitKey. Set the minSplitKey to nil so we do 348 // not have to make any more checks going forward. 349 mvccMinSplitKey.Key = nil 350 } 351 352 if mvccMinSplitKey.Key == nil && diff < bestDiff && 353 (len(noSplitSpans) == 0 || isValidSplitKey(mvccKey.Key, noSplitSpans)) { 354 // This is a valid candidate for a split key. 355 // 356 // Instead of copying bestSplitKey just yet, flip the found flag. In the 357 // most common case where the actual best split key is followed by a key 358 // that has diff > bestDiff (see the if statement with that predicate 359 // above), this lets us save a copy by reusing prevCandidateKey as the 360 // best split key. 361 bestDiff = diff 362 found = true 363 // Set length of bestSplitKey to 0, which the rest of this method relies 364 // on to check if the last key encountered was the best split key. 365 bestSplitKey.Key = bestSplitKey.Key[:0] 366 } else if found && len(bestSplitKey.Key) == 0 { 367 // We were just at a valid split key candidate, but then we came across 368 // a key that cannot be a split key (i.e. is in noSplitSpans), or was not 369 // an improvement over bestDiff. Copy the previous key as the 370 // bestSplitKey. 371 bestSplitKey.Timestamp = prevKey.Timestamp 372 bestSplitKey.Key = append(bestSplitKey.Key[:0], prevKey.Key...) 373 } 374 375 sizeSoFar += int64(len(p.iter.Value())) 376 if mvccKey.IsValue() && bytes.Equal(prevKey.Key, mvccKey.Key) { 377 // We only advanced timestamps, but not new mvcc keys. 378 sizeSoFar += timestampLen 379 } else { 380 sizeSoFar += int64(len(mvccKey.Key) + 1) 381 if mvccKey.IsValue() { 382 sizeSoFar += timestampLen 383 } 384 } 385 386 prevKey.Key = append(prevKey.Key[:0], mvccKey.Key...) 387 prevKey.Timestamp = mvccKey.Timestamp 388 } 389 390 // There are three distinct types of cases possible here: 391 // 392 // 1. No valid split key was found (found == false), in which case we return 393 // bestSplitKey (which should be MVCCKey{}). 394 // 2. The best candidate seen for a split key so far was encountered in the 395 // last iteration of the above loop. We broke out of the loop either due 396 // to iterator exhaustion (!p.iter.Valid()), or an increasing diff. Return 397 // prevKey as the best split key. 398 // 3. The best split key was seen multiple iterations ago, and was copied into 399 // bestSplitKey at some point (found == true, len(bestSplitKey.Key) > 0). 400 // Keys encountered after that point were invalid for being in noSplitSpans 401 // so return the bestSplitKey that had been copied. 402 // 403 // This if statement checks for case 2. 404 if found && len(bestSplitKey.Key) == 0 { 405 // Use the last key found as the best split key, since we broke out of the 406 // loop (due to iterator exhaustion or increasing diff) right after we saw 407 // the best split key. prevKey has to be a valid split key since the only 408 // way we'd have both found && len(bestSplitKey.Key) == 0 is when we've 409 // already checked prevKey for validity. 410 return prevKey, nil 411 } 412 return bestSplitKey, nil 413 } 414 415 // SetUpperBound implements the Iterator interface. 416 func (p *pebbleIterator) SetUpperBound(upperBound roachpb.Key) { 417 p.upperBoundBuf = append(p.upperBoundBuf[:0], upperBound...) 418 p.upperBoundBuf = append(p.upperBoundBuf, 0x00) 419 p.options.UpperBound = p.upperBoundBuf 420 p.iter.SetBounds(p.options.LowerBound, p.options.UpperBound) 421 } 422 423 // Stats implements the Iterator interface. 424 func (p *pebbleIterator) Stats() IteratorStats { 425 return IteratorStats{ 426 TimeBoundNumSSTs: p.timeBoundNumSSTables, 427 } 428 } 429 430 // CheckForKeyCollisions indicates if the provided SST data collides with this 431 // iterator in the specified range. 432 func (p *pebbleIterator) CheckForKeyCollisions( 433 sstData []byte, start, end roachpb.Key, 434 ) (enginepb.MVCCStats, error) { 435 return checkForKeyCollisionsGo(p, sstData, start, end) 436 } 437 438 func (p *pebbleIterator) destroy() { 439 if p.inuse { 440 panic("iterator still in use") 441 } 442 if p.iter != nil { 443 err := p.iter.Close() 444 if err != nil { 445 panic(err) 446 } 447 p.iter = nil 448 } 449 // Reset all fields except for the key and lower/upper bound buffers. Holding 450 // onto their underlying memory is more efficient to prevent extra 451 // allocations down the line. 452 *p = pebbleIterator{ 453 keyBuf: p.keyBuf, 454 lowerBoundBuf: p.lowerBoundBuf, 455 upperBoundBuf: p.upperBoundBuf, 456 reusable: p.reusable, 457 } 458 }