// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"context"
	"encoding/binary"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manual"
	"github.com/cockroachdb/pebble/internal/rangedel"
	"github.com/cockroachdb/pebble/internal/rangekey"
)

func uvarintLen(v uint32) int {
	i := 0
	for v >= 0x80 {
		v >>= 7
		i++
	}
	return i + 1
}

type blockWriter struct {
	restartInterval int
	nEntries        int
	nextRestart     int
	buf             []byte
	// For datablocks in TableFormatPebblev3, we steal the most significant bit
	// in restarts for encoding setHasSameKeyPrefixSinceLastRestart. This leaves
	// us with 31 bits, which is more than enough (no one needs > 2GB blocks).
	// Typically, restarts occur every 16 keys, and by storing this bit with the
	// restart, we can optimize for the case where a user wants to skip to the
	// next prefix which happens to be in the same data block, but is > 16 keys
	// away. We have seen production situations with 100+ versions per MVCC key
	// (which share the same prefix). Additionally, for such writers, the prefix
	// compression of a key (the part shared with the preceding key) is limited
	// to the prefix part of the preceding key -- this ensures that when doing
	// NextPrefix (see blockIter) we don't need to assemble the full key for
	// each step, since by limiting the length of the shared key we are ensuring
	// that any of the keys with the same prefix can be used to assemble the
	// full key when the prefix does change.
	restarts []uint32
	// Do not read curKey directly from outside blockWriter since it can have
	// the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or
	// getCurUserKey() instead.
	curKey []byte
	// curValue excludes the optional prefix provided to
	// storeWithOptionalValuePrefix.
	curValue []byte
	prevKey  []byte
	tmp      [4]byte
	// We don't know the state of the sets that were at the end of the previous
	// block, so this is initially false. It may be true for the second and
	// later restarts in a block. Not having inter-block information is fine
	// since we will optimize by stepping through restarts only within the same
	// block. Note that the first restart is the first key in the block.
	setHasSameKeyPrefixSinceLastRestart bool
}

func (w *blockWriter) clear() {
	*w = blockWriter{
		buf:      w.buf[:0],
		restarts: w.restarts[:0],
		curKey:   w.curKey[:0],
		curValue: w.curValue[:0],
		prevKey:  w.prevKey[:0],
	}
}

// MaximumBlockSize is an extremely generous maximum block size of 256MiB. We
// explicitly place this limit to reserve a few bits in the restart for
// internal use.
const MaximumBlockSize = 1 << 28
const setHasSameKeyPrefixRestartMask uint32 = 1 << 31
const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000

func (w *blockWriter) getCurKey() InternalKey {
	k := base.DecodeInternalKey(w.curKey)
	k.Trailer = k.Trailer & trailerObsoleteMask
	return k
}

func (w *blockWriter) getCurUserKey() []byte {
	n := len(w.curKey) - base.InternalTrailerLen
	if n < 0 {
		panic(errors.AssertionFailedf("corrupt key in blockWriter buffer"))
	}
	return w.curKey[:n:n]
}

// If !addValuePrefix, the valuePrefix is ignored.
func (w *blockWriter) storeWithOptionalValuePrefix(
	keySize int,
	value []byte,
	maxSharedKeyLen int,
	addValuePrefix bool,
	valuePrefix valuePrefix,
	setHasSameKeyPrefix bool,
) {
	shared := 0
	if !setHasSameKeyPrefix {
		w.setHasSameKeyPrefixSinceLastRestart = false
	}
	if w.nEntries == w.nextRestart {
		w.nextRestart = w.nEntries + w.restartInterval
		restart := uint32(len(w.buf))
		if w.setHasSameKeyPrefixSinceLastRestart {
			restart = restart | setHasSameKeyPrefixRestartMask
		}
		w.setHasSameKeyPrefixSinceLastRestart = true
		w.restarts = append(w.restarts, restart)
	} else {
		// TODO(peter): Manually inlined version of base.SharedPrefixLen(). This
		// is 3% faster on BenchmarkWriter on go1.16. Remove if future versions
		// show this to not be a performance win. For now, functions that use
		// unsafe cannot be inlined.
		n := maxSharedKeyLen
		if n > len(w.prevKey) {
			n = len(w.prevKey)
		}
		asUint64 := func(b []byte, i int) uint64 {
			return binary.LittleEndian.Uint64(b[i:])
		}
		for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) {
			shared += 8
		}
		for shared < n && w.curKey[shared] == w.prevKey[shared] {
			shared++
		}
	}

	lenValuePlusOptionalPrefix := len(value)
	if addValuePrefix {
		lenValuePlusOptionalPrefix++
	}
	needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + lenValuePlusOptionalPrefix
	n := len(w.buf)
	if cap(w.buf) < n+needed {
		newCap := 2 * cap(w.buf)
		if newCap == 0 {
			newCap = 1024
		}
		for newCap < n+needed {
			newCap *= 2
		}
		newBuf := make([]byte, n, newCap)
		copy(newBuf, w.buf)
		w.buf = newBuf
	}
	w.buf = w.buf[:n+needed]

	// TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15%
	// faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions
	// show this to not be a performance win.
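	// For reference, the entry written below has the layout (this restates the
	// code that follows; the concrete number is only an assumed example):
	//
	//	varint(shared) varint(unshared) varint(valueLen) unsharedKey [valuePrefix] value
	//
	// where valueLen includes the optional 1-byte valuePrefix, and each length
	// is a uvarint: the low 7 bits of every byte carry payload and the high bit
	// marks continuation. E.g. the length 300 (0b1_0010_1100) encodes as the
	// two bytes 0xAC 0x02.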
	{
		x := uint32(shared)
		for x >= 0x80 {
			w.buf[n] = byte(x) | 0x80
			x >>= 7
			n++
		}
		w.buf[n] = byte(x)
		n++
	}

	{
		x := uint32(keySize - shared)
		for x >= 0x80 {
			w.buf[n] = byte(x) | 0x80
			x >>= 7
			n++
		}
		w.buf[n] = byte(x)
		n++
	}

	{
		x := uint32(lenValuePlusOptionalPrefix)
		for x >= 0x80 {
			w.buf[n] = byte(x) | 0x80
			x >>= 7
			n++
		}
		w.buf[n] = byte(x)
		n++
	}

	n += copy(w.buf[n:], w.curKey[shared:])
	if addValuePrefix {
		w.buf[n : n+1][0] = byte(valuePrefix)
		n++
	}
	n += copy(w.buf[n:], value)
	w.buf = w.buf[:n]

	w.curValue = w.buf[n-len(value):]

	w.nEntries++
}

func (w *blockWriter) add(key InternalKey, value []byte) {
	w.addWithOptionalValuePrefix(
		key, false, value, len(key.UserKey), false, 0, false)
}

// Callers that always set addValuePrefix to false should use add() instead.
//
// isObsolete indicates whether this key-value pair is obsolete in this
// sstable (only applicable when writing data blocks) -- see the comment in
// table.go and the longer one in format.go. addValuePrefix adds a 1 byte
// prefix to the value, specified in valuePrefix -- this is used for data
// blocks in TableFormatPebblev3 onwards for SETs (see the comment in
// format.go, with more details in value_block.go). setHasSameKeyPrefix is
// also used in TableFormatPebblev3 onwards for SETs.
func (w *blockWriter) addWithOptionalValuePrefix(
	key InternalKey,
	isObsolete bool,
	value []byte,
	maxSharedKeyLen int,
	addValuePrefix bool,
	valuePrefix valuePrefix,
	setHasSameKeyPrefix bool,
) {
	w.curKey, w.prevKey = w.prevKey, w.curKey

	size := key.Size()
	if cap(w.curKey) < size {
		w.curKey = make([]byte, 0, size*2)
	}
	w.curKey = w.curKey[:size]
	if isObsolete {
		key.Trailer = key.Trailer | trailerObsoleteBit
	}
	key.Encode(w.curKey)

	w.storeWithOptionalValuePrefix(
		size, value, maxSharedKeyLen, addValuePrefix, valuePrefix, setHasSameKeyPrefix)
}

func (w *blockWriter) finish() []byte {
	// Write the restart points to the buffer.
	if w.nEntries == 0 {
		// Every block must have at least one restart point.
		if cap(w.restarts) > 0 {
			w.restarts = w.restarts[:1]
			w.restarts[0] = 0
		} else {
			w.restarts = append(w.restarts, 0)
		}
	}
	tmp4 := w.tmp[:4]
	for _, x := range w.restarts {
		binary.LittleEndian.PutUint32(tmp4, x)
		w.buf = append(w.buf, tmp4...)
	}
	binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts)))
	w.buf = append(w.buf, tmp4...)
	result := w.buf

	// Reset the block state.
	w.nEntries = 0
	w.nextRestart = 0
	w.buf = w.buf[:0]
	w.restarts = w.restarts[:0]
	return result
}

// emptyBlockSize holds the size of an empty block. Every block ends
// in a uint32 trailer encoding the number of restart points within the
// block.
const emptyBlockSize = 4

func (w *blockWriter) estimatedSize() int {
	return len(w.buf) + 4*len(w.restarts) + emptyBlockSize
}

type blockEntry struct {
	offset   int32
	keyStart int32
	keyEnd   int32
	valStart int32
	valSize  int32
}

// blockIter is an iterator over a single block of data.
//
// A blockIter provides an additional guarantee around key stability when a
// block has a restart interval of 1 (i.e. when there is no prefix
// compression). Key stability refers to whether the InternalKey.UserKey bytes
// returned by a positioning call will remain stable after a subsequent
// positioning call. The normal case is that a positioning call will invalidate
// any previously returned InternalKey.UserKey. If a block has a restart
// interval of 1 (no prefix compression), blockIter guarantees that
// InternalKey.UserKey will point to the key as stored in the block itself
// which will remain valid until the blockIter is closed. The key stability
// guarantee is used by the range tombstone and range key code, which knows that
// the respective blocks are always encoded with a restart interval of 1. This
// per-block key stability guarantee is sufficient for range tombstones and
// range keys as they are always encoded in a single block.
//
// A blockIter also provides a value stability guarantee for range deletions and
// range keys since there is only a single range deletion and range key block
// per sstable and the blockIter will not release the bytes for the block until
// it is closed.
//
// Note on why blockIter knows about lazyValueHandling:
//
// blockIter's positioning functions (that return a LazyValue) are too
// complex to inline even prior to lazyValueHandling. blockIter.Next and
// blockIter.First were by far the cheapest and had costs 195 and 180
// respectively, which exceed the inlining budget of 80. We initially tried to
// keep the lazyValueHandling logic out of blockIter by wrapping it with a
// lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
// wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
// they called the corresponding blockIter func and then decided whether the
// value was in fact in-place (so return immediately) or needed further
// handling. But these also turned out too costly for mid-stack inlining since
// simple calls like the following have a high cost that is barely under the
// budget of 80
//
//	k, v := i.data.SeekGE(key, flags) // cost 74
//	k, v := i.data.Next()             // cost 72
//
// We have 2 options for minimizing performance regressions:
//   - Include the lazyValueHandling logic in the already non-inlineable
//     blockIter functions: Since most of the time is spent in data block iters,
//     it is acceptable to take the small hit of unnecessary branching (which
//     hopefully branch prediction will predict correctly) for other kinds of
//     blocks.
//   - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
//     v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
//     v3 sstable. We would want to manage these copies via code generation.
//
// We have picked the first option here.
type blockIter struct {
	cmp Compare
	// offset is the byte index that marks where the current key/value is
	// encoded in the block.
	offset int32
	// nextOffset is the byte index where the next key/value is encoded in the
	// block.
	nextOffset int32
	// A "restart point" in a block is a point where the full key is encoded,
	// instead of just having a suffix of the key encoded. See readEntry() for
	// how prefix compression of keys works. Keys in between two restart points
	// only have a suffix encoded in the block. When restart interval is 1, no
	// prefix compression of keys happens. This is the case with range tombstone
	// blocks.
	//
	// All restart offsets are listed in increasing order in
	// i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
	// 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
	// therefore be seen as the point where data in the block ends, and a list
	// of offsets of all restart points begins.
	restarts int32
	// Number of restart points in this block. Encoded at the end of the block
	// as a uint32.
	numRestarts  int32
	globalSeqNum uint64
	ptr          unsafe.Pointer
	data         []byte
	// key contains the raw key the iterator is currently pointed at. This may
	// point directly to data stored in the block (for a key which has no prefix
	// compression), to fullKey (for a prefix compressed key), or to a slice of
	// data stored in cachedBuf (during reverse iteration).
	key []byte
	// fullKey is a buffer used for key prefix decompression.
	fullKey []byte
	// val contains the value the iterator is currently pointed at. If non-nil,
	// this points to a slice of the block data.
	val []byte
	// lazyValue is val turned into a LazyValue, whenever a positioning method
	// returns a non-nil key-value pair.
	lazyValue base.LazyValue
	// ikey contains the decoded InternalKey the iterator is currently pointed
	// at. Note that the memory backing ikey.UserKey is either data stored
	// directly in the block, fullKey, or cachedBuf. The key stability guarantee
	// for blocks built with a restart interval of 1 is achieved by having
	// ikey.UserKey always point to data stored directly in the block.
	ikey InternalKey
	// cached and cachedBuf are used during reverse iteration. They are needed
	// because we can't perform prefix decoding in reverse, only in the forward
	// direction. In order to iterate in reverse, we decode and cache the entries
	// between two restart points.
	//
	// Note that cached[len(cached)-1] contains the previous entry to the one the
	// blockIter is currently pointed at. As usual, nextOffset will contain the
	// offset of the next entry. During reverse iteration, nextOffset will be
	// updated to point to offset, and we'll set the blockIter to point at the
	// entry cached[len(cached)-1]. See Prev() for more details.
	//
	// For a block encoded with a restart interval of 1, cached and cachedBuf
	// will not be used as there are no prefix compressed entries between the
	// restart points.
	cached    []blockEntry
	cachedBuf []byte
	handle    bufferHandle
	// for block iteration for already loaded blocks.
	firstUserKey      []byte
	lazyValueHandling struct {
		vbr            *valueBlockReader
		hasValuePrefix bool
	}
	hideObsoletePoints bool
}

// blockIter implements the base.InternalIterator interface.
var _ base.InternalIterator = (*blockIter)(nil)

func newBlockIter(cmp Compare, block block) (*blockIter, error) {
	i := &blockIter{}
	return i, i.init(cmp, block, 0, false)
}

func (i *blockIter) String() string {
	return "block"
}

func (i *blockIter) init(
	cmp Compare, block block, globalSeqNum uint64, hideObsoletePoints bool,
) error {
	numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
	if numRestarts == 0 {
		return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
	}
	i.cmp = cmp
	i.restarts = int32(len(block)) - 4*(1+numRestarts)
	i.numRestarts = numRestarts
	i.globalSeqNum = globalSeqNum
	i.ptr = unsafe.Pointer(&block[0])
	i.data = block
	i.fullKey = i.fullKey[:0]
	i.val = nil
	i.hideObsoletePoints = hideObsoletePoints
	i.clearCache()
	if i.restarts > 0 {
		if err := i.readFirstKey(); err != nil {
			return err
		}
	} else {
		// Block is empty.
		i.firstUserKey = nil
	}
	return nil
}

// NB: two cases of hideObsoletePoints:
//   - Local sstable iteration: globalSeqNum will be set iff the sstable was
//     ingested.
//   - Foreign sstable iteration: globalSeqNum is always set.
func (i *blockIter) initHandle(
	cmp Compare, block bufferHandle, globalSeqNum uint64, hideObsoletePoints bool,
) error {
	i.handle.Release()
	i.handle = block
	return i.init(cmp, block.Get(), globalSeqNum, hideObsoletePoints)
}

func (i *blockIter) invalidate() {
	i.clearCache()
	i.offset = 0
	i.nextOffset = 0
	i.restarts = 0
	i.numRestarts = 0
	i.data = nil
}

// isDataInvalidated returns true when the blockIter has been invalidated
// using an invalidate call. NB: this is different from blockIter.Valid
// which is part of the InternalIterator implementation.
func (i *blockIter) isDataInvalidated() bool {
	return i.data == nil
}

func (i *blockIter) resetForReuse() blockIter {
	return blockIter{
		fullKey:   i.fullKey[:0],
		cached:    i.cached[:0],
		cachedBuf: i.cachedBuf[:0],
		data:      nil,
	}
}

func (i *blockIter) readEntry() {
	ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))

	// This is an ugly performance hack. Reading entries from blocks is one of
	// the inner-most routines and decoding the 3 varints per-entry takes
	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
	// us, so we do it manually. This provides a 10-15% performance improvement
	// on blockIter benchmarks on both go1.11 and go1.12.
	//
	// TODO(peter): remove this hack if go:inline is ever supported.
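	// The three manually inlined decodes below are equivalent to this sketch
	// (bounds checks omitted); binary.Uvarint is the stdlib form of the same
	// 7-bits-per-byte, high-bit-continuation encoding:
	//
	//	shared, n1 := binary.Uvarint(i.data[i.offset:])
	//	unshared, n2 := binary.Uvarint(i.data[int(i.offset)+n1:])
	//	value, n3 := binary.Uvarint(i.data[int(i.offset)+n1+n2:])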

	var shared uint32
	if a := *((*uint8)(ptr)); a < 128 {
		shared = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		shared = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	var unshared uint32
	if a := *((*uint8)(ptr)); a < 128 {
		unshared = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		unshared = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	var value uint32
	if a := *((*uint8)(ptr)); a < 128 {
		value = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		value = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	unsharedKey := getBytes(ptr, int(unshared))
	// TODO(sumeer): move this into the else block below.
	i.fullKey = append(i.fullKey[:shared], unsharedKey...)
	if shared == 0 {
		// Provide stability for the key across positioning calls if the key
		// doesn't share a prefix with the previous key. This removes requiring the
		// key to be copied if the caller knows the block has a restart interval of
		// 1. An important example of this is range-del blocks.
		i.key = unsharedKey
	} else {
		i.key = i.fullKey
	}
	ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
	i.val = getBytes(ptr, int(value))
	i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
}

func (i *blockIter) readFirstKey() error {
	ptr := i.ptr

	// This is an ugly performance hack. Reading entries from blocks is one of
	// the inner-most routines and decoding the 3 varints per-entry takes
	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
	// us, so we do it manually. This provides a 10-15% performance improvement
	// on blockIter benchmarks on both go1.11 and go1.12.
	//
	// TODO(peter): remove this hack if go:inline is ever supported.

	if shared := *((*uint8)(ptr)); shared == 0 {
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else {
		// The shared length is != 0, which is invalid.
		panic("first key in block must have zero shared length")
	}

	var unshared uint32
	if a := *((*uint8)(ptr)); a < 128 {
		unshared = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		unshared = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	// Skip the value length.
	if a := *((*uint8)(ptr)); a < 128 {
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	firstKey := getBytes(ptr, int(unshared))
	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
	// BlockIter benchmarks.
	if n := len(firstKey) - 8; n >= 0 {
		i.firstUserKey = firstKey[:n:n]
	} else {
		i.firstUserKey = nil
		return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
	}
	return nil
}

// The sstable internal obsolete bit is set when writing a block and unset by
// blockIter, so no code outside block writing/reading code ever sees it.
const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)

func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
	// BlockIter benchmarks.
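	// The last 8 bytes of an encoded key are the trailer, a little-endian
	// uint64 equal to (seqNum << 8) | kind. The obsolete bit stolen by the
	// block writer lives in that low kind byte, which is why masking with
	// trailerObsoleteMask below strips it while preserving the sequence number
	// and the remaining kind bits. (Sketch of the layout; e.g. a SET at seqnum
	// 7 carries the trailer (7<<8)|uint64(InternalKeyKindSet).)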
	if n := len(key) - 8; n >= 0 {
		trailer := binary.LittleEndian.Uint64(key[n:])
		hiddenPoint = i.hideObsoletePoints &&
			(trailer&trailerObsoleteBit != 0)
		i.ikey.Trailer = trailer & trailerObsoleteMask
		i.ikey.UserKey = key[:n:n]
		if i.globalSeqNum != 0 {
			i.ikey.SetSeqNum(i.globalSeqNum)
		}
	} else {
		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
		i.ikey.UserKey = nil
	}
	return hiddenPoint
}

func (i *blockIter) clearCache() {
	i.cached = i.cached[:0]
	i.cachedBuf = i.cachedBuf[:0]
}

func (i *blockIter) cacheEntry() {
	var valStart int32
	valSize := int32(len(i.val))
	if valSize > 0 {
		valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
	}

	i.cached = append(i.cached, blockEntry{
		offset:   i.offset,
		keyStart: int32(len(i.cachedBuf)),
		keyEnd:   int32(len(i.cachedBuf) + len(i.key)),
		valStart: valStart,
		valSize:  valSize,
	})
	i.cachedBuf = append(i.cachedBuf, i.key...)
}

func (i *blockIter) getFirstUserKey() []byte {
	return i.firstUserKey
}

// SeekGE implements internalIterator.SeekGE, as documented in the pebble
// package.
func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
	if invariants.Enabled && i.isDataInvalidated() {
		panic(errors.AssertionFailedf("invalidated blockIter used"))
	}

	i.clearCache()
	// Find the index of the smallest restart point whose key is > the key
	// sought; index will be numRestarts if there is no such restart point.
	i.offset = 0
	var index int32

	{
		// NB: manually inlined sort.Search is ~5% faster.
		//
		// Define f(-1) == false and f(n) == true.
		// Invariant: f(index-1) == false, f(upper) == true.
		upper := i.numRestarts
		for index < upper {
			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
			// index ≤ h < upper
			offset := decodeRestart(i.data[i.restarts+4*h:])
			// For a restart point, there are 0 bytes shared with the previous key.
			// The varint encoding of 0 occupies 1 byte.
			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))

			// Decode the key at that restart point, and compare it to the key
			// sought. See the comment in readEntry for why we manually inline the
			// varint decoding.
			var v1 uint32
			if a := *((*uint8)(ptr)); a < 128 {
				v1 = uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 1)
			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
				v1 = uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 2)
			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 3)
			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 4)
			} else {
				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 5)
			}

			if *((*uint8)(ptr)) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 1)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 2)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 3)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 4)
			} else {
				ptr = unsafe.Pointer(uintptr(ptr) + 5)
			}

			// Manually inlining part of base.DecodeInternalKey provides a 5-10%
			// speedup on BlockIter benchmarks.
			s := getBytes(ptr, int(v1))
			var k []byte
			if n := len(s) - 8; n >= 0 {
				k = s[:n:n]
			}
			// Else k is invalid, and left as nil

			if i.cmp(key, k) > 0 {
				// The search key is greater than the user key at this restart point.
				// Search beyond this restart point, since we are trying to find the
				// first restart point with a user key >= the search key.
				index = h + 1 // preserves f(i-1) == false
			} else {
				// k >= search key, so prune everything after index (since index
				// satisfies the property we are looking for).
				upper = h // preserves f(j) == true
			}
		}
		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
		// => answer is index.
	}

	// index is the first restart point with key >= search key. Define the keys
	// between a restart point and the next restart point as belonging to that
	// restart point.
	//
	// Since keys are strictly increasing, if index > 0 then the restart point
	// at index-1 will be the first one that has some keys belonging to it that
	// could be equal to the search key. If index == 0, then all keys in this
	// block are larger than the key sought, and offset remains at zero.
	if index > 0 {
		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
	}
	i.readEntry()
	hiddenPoint := i.decodeInternalKey(i.key)

	// Iterate from that restart point to somewhere >= the key sought.
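	// Flow from here (a restatement of the code below, not a change in
	// behavior): if the entry at the chosen restart point is not hidden and
	// already satisfies the bound, return it directly after initializing
	// i.lazyValue from the value prefix; otherwise step forward with Next(),
	// which performs the same lazy-value initialization, until a key >= the
	// search key is found or the block is exhausted.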
	if !i.valid() {
		return nil, base.LazyValue{}
	}
	if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 {
		// Initialize i.lazyValue
		if !i.lazyValueHandling.hasValuePrefix ||
			base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
			i.lazyValue = base.MakeInPlaceValue(i.val)
		} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
			i.lazyValue = base.MakeInPlaceValue(i.val[1:])
		} else {
			i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
		}
		return &i.ikey, i.lazyValue
	}
	for i.Next(); i.valid(); i.Next() {
		if i.cmp(i.ikey.UserKey, key) >= 0 {
			// i.Next() has already initialized i.lazyValue.
			return &i.ikey, i.lazyValue
		}
	}
	return nil, base.LazyValue{}
}

// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
// pebble package.
func (i *blockIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	// This should never be called as prefix iteration is handled by sstable.Iterator.
	panic("pebble: SeekPrefixGE unimplemented")
}

// SeekLT implements internalIterator.SeekLT, as documented in the pebble
// package.
func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
	if invariants.Enabled && i.isDataInvalidated() {
		panic(errors.AssertionFailedf("invalidated blockIter used"))
	}

	i.clearCache()
	// Find the index of the smallest restart point whose key is >= the key
	// sought; index will be numRestarts if there is no such restart point.
	i.offset = 0
	var index int32

	{
		// NB: manually inlined sort.Search is ~5% faster.
		//
		// Define f(-1) == false and f(n) == true.
		// Invariant: f(index-1) == false, f(upper) == true.
		upper := i.numRestarts
		for index < upper {
			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
			// index ≤ h < upper
			offset := decodeRestart(i.data[i.restarts+4*h:])
			// For a restart point, there are 0 bytes shared with the previous key.
			// The varint encoding of 0 occupies 1 byte.
			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))

			// Decode the key at that restart point, and compare it to the key
			// sought. See the comment in readEntry for why we manually inline the
			// varint decoding.
			var v1 uint32
			if a := *((*uint8)(ptr)); a < 128 {
				v1 = uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 1)
			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
				v1 = uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 2)
			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 3)
			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 4)
			} else {
				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 5)
			}

			if *((*uint8)(ptr)) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 1)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 2)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 3)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 4)
			} else {
				ptr = unsafe.Pointer(uintptr(ptr) + 5)
			}

			// Manually inlining part of base.DecodeInternalKey provides a 5-10%
			// speedup on BlockIter benchmarks.
			s := getBytes(ptr, int(v1))
			var k []byte
			if n := len(s) - 8; n >= 0 {
				k = s[:n:n]
			}
			// Else k is invalid, and left as nil

			if i.cmp(key, k) > 0 {
				// The search key is greater than the user key at this restart point.
				// Search beyond this restart point, since we are trying to find the
				// first restart point with a user key >= the search key.
				index = h + 1 // preserves f(i-1) == false
			} else {
				// k >= search key, so prune everything after index (since index
				// satisfies the property we are looking for).
				upper = h // preserves f(j) == true
			}
		}
		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
		// => answer is index.
	}

	// index is the first restart point with key >= search key. Define the keys
	// between a restart point and the next restart point as belonging to that
	// restart point. Note that index could be equal to i.numRestarts, i.e., we
	// are past the last restart.
	//
	// Since keys are strictly increasing, if index > 0 then the restart point
	// at index-1 will be the first one that has some keys belonging to it that
	// are less than the search key. If index == 0, then all keys in this block
	// are larger than the search key, so there is no match.
	targetOffset := i.restarts
	if index > 0 {
		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
		if index < i.numRestarts {
			targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
		}
	} else if index == 0 {
		// If index == 0 then all keys in this block are larger than the key
		// sought.
		i.offset = -1
		i.nextOffset = 0
		return nil, base.LazyValue{}
	}

	// Iterate from that restart point to somewhere >= the key sought, then back
	// up to the previous entry. The expectation is that we'll be performing
	// reverse iteration, so we cache the entries as we advance forward.
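	// For example (an assumed block layout, purely for illustration): if the
	// chosen restart block holds k0 < k1 < k2 and SeekLT(k2) is called, the
	// loop below reads and caches k0 and k1, stops upon reaching k2 (the first
	// key >= the search key), and returns k1 via Prev() from the cache.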
	i.nextOffset = i.offset

	for {
		i.offset = i.nextOffset
		i.readEntry()
		// When hidden keys are common, there is additional optimization possible
		// by not caching entries that are hidden (note that some calls to
		// cacheEntry don't decode the internal key before caching, but checking
		// whether a key is hidden does not require full decoding). However, we do
		// need to use the blockEntry.offset in the cache for the first entry at
		// the restart point to do the binary search when the cache is empty -- so
		// we would need to cache that first entry (though not the key) even if it
		// was hidden. Our current assumption is that if there are large numbers
		// of hidden keys we will be able to skip whole blocks (using block
		// property filters) so we don't bother optimizing.
		hiddenPoint := i.decodeInternalKey(i.key)

		// NB: we don't use the hiddenPoint return value of decodeInternalKey
		// since we want to stop as soon as we reach a key >= ikey.UserKey, so
		// that we can reverse.
		if i.cmp(i.ikey.UserKey, key) >= 0 {
			// The current key is greater than or equal to our search key. Back up to
			// the previous key which was less than our search key. Note that this for
			// loop will execute at least once with this if-block not being true, so
			// the key we are backing up to is the last one this loop cached.
			return i.Prev()
		}

		if i.nextOffset >= targetOffset {
			// We've reached the end of the current restart block. Return the
			// current key if not hidden, else call Prev().
			//
			// When the restart interval is 1, the first iteration of the for loop
			// will bring us here. In that case ikey is backed by the block so we
			// get the desired key stability guarantee for the lifetime of the
			// blockIter. That is, we never cache anything and therefore never
			// return a key backed by cachedBuf.
			if hiddenPoint {
				return i.Prev()
			}
			break
		}

		i.cacheEntry()
	}

	if !i.valid() {
		return nil, base.LazyValue{}
	}
	if !i.lazyValueHandling.hasValuePrefix ||
		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
		i.lazyValue = base.MakeInPlaceValue(i.val)
	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
	} else {
		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
	}
	return &i.ikey, i.lazyValue
}

// First implements internalIterator.First, as documented in the pebble
// package.
func (i *blockIter) First() (*InternalKey, base.LazyValue) {
	if invariants.Enabled && i.isDataInvalidated() {
		panic(errors.AssertionFailedf("invalidated blockIter used"))
	}

	i.offset = 0
	if !i.valid() {
		return nil, base.LazyValue{}
	}
	i.clearCache()
	i.readEntry()
	hiddenPoint := i.decodeInternalKey(i.key)
	if hiddenPoint {
		return i.Next()
	}
	if !i.lazyValueHandling.hasValuePrefix ||
		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
		i.lazyValue = base.MakeInPlaceValue(i.val)
	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
	} else {
		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
	}
	return &i.ikey, i.lazyValue
}

func decodeRestart(b []byte) int32 {
	_ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
	return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
		uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
}

// Last implements internalIterator.Last, as documented in the pebble package.
func (i *blockIter) Last() (*InternalKey, base.LazyValue) {
	if invariants.Enabled && i.isDataInvalidated() {
		panic(errors.AssertionFailedf("invalidated blockIter used"))
	}

	// Seek forward from the last restart point.
	i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
	if !i.valid() {
		return nil, base.LazyValue{}
	}

	i.readEntry()
	i.clearCache()

	for i.nextOffset < i.restarts {
		i.cacheEntry()
		i.offset = i.nextOffset
		i.readEntry()
	}

	hiddenPoint := i.decodeInternalKey(i.key)
	if hiddenPoint {
		return i.Prev()
	}
	if !i.lazyValueHandling.hasValuePrefix ||
		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
		i.lazyValue = base.MakeInPlaceValue(i.val)
	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
	} else {
		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
	}
	return &i.ikey, i.lazyValue
}

// Next implements internalIterator.Next, as documented in the pebble
// package.
func (i *blockIter) Next() (*InternalKey, base.LazyValue) {
	if len(i.cachedBuf) > 0 {
		// We're switching from reverse iteration to forward iteration. We need to
		// populate i.fullKey with the current key we're positioned at so that
		// readEntry() can use i.fullKey for key prefix decompression. Note that we
		// don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
		// SeekLT was the previous call, i.key may be backed by i.fullKey), but
		// copying into i.fullKey works for both cases.
		//
		// TODO(peter): Rather than clearing the cache, we could instead use the
		// cache until it is exhausted. This would likely be faster than falling
		// through to the normal forward iteration code below.
		i.fullKey = append(i.fullKey[:0], i.key...)
		i.clearCache()
	}

start:
	i.offset = i.nextOffset
	if !i.valid() {
		return nil, base.LazyValue{}
	}
	i.readEntry()
	// Manually inlined version of i.decodeInternalKey(i.key).
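	// Descriptive note: when hideObsoletePoints is set, an entry whose trailer
	// has the obsolete bit is skipped here by jumping back to start, so Next
	// never surfaces such a key to the caller.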
	if n := len(i.key) - 8; n >= 0 {
		trailer := binary.LittleEndian.Uint64(i.key[n:])
		hiddenPoint := i.hideObsoletePoints &&
			(trailer&trailerObsoleteBit != 0)
		i.ikey.Trailer = trailer & trailerObsoleteMask
		i.ikey.UserKey = i.key[:n:n]
		if i.globalSeqNum != 0 {
			i.ikey.SetSeqNum(i.globalSeqNum)
		}
		if hiddenPoint {
			goto start
		}
	} else {
		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
		i.ikey.UserKey = nil
	}
	if !i.lazyValueHandling.hasValuePrefix ||
		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
		i.lazyValue = base.MakeInPlaceValue(i.val)
	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
	} else {
		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
	}
	return &i.ikey, i.lazyValue
}

// NextPrefix implements (base.InternalIterator).NextPrefix.
func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
	if i.lazyValueHandling.hasValuePrefix {
		return i.nextPrefixV3(succKey)
	}
	const nextsBeforeSeek = 3
	k, v := i.Next()
	for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ {
		if j >= nextsBeforeSeek {
			return i.SeekGE(succKey, base.SeekGEFlagsNone)
		}
		k, v = i.Next()
	}
	return k, v
}

func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) {
	// Doing nexts that involve a key comparison can be expensive (and the cost
	// depends on the key length), so we use the same threshold of 3 that we use
	// for TableFormatPebblev2 in blockIter.NextPrefix above. The next fast path
	// that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
	// than doing a SeekGE within the block, so we do this 16 times
	// (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
	// search for the restart consumes > 100ns. If the number of versions is >
	// 17, we will increment nextFastCount to 17, then do a binary search, and
	// on average need to find a key between two restarts, so another 8 steps
	// corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
	// steps.
	//
	// TODO(sumeer): use the configured restartInterval for the sstable when it
	// was written (which we don't currently store) instead of the default value
	// of 16.
	const nextCmpThresholdBeforeSeek = 3
	const nextFastThresholdBeforeRestarts = 16
	nextCmpCount := 0
	nextFastCount := 0
	usedRestarts := false
	// INVARIANT: blockIter is valid.
	if invariants.Enabled && !i.valid() {
		panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
	}
	prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet
	for {
		i.offset = i.nextOffset
		if !i.valid() {
			return nil, base.LazyValue{}
		}
		// Need to decode the length integers, so we can compute nextOffset.
		ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
		// This is an ugly performance hack. Reading entries from blocks is one of
		// the inner-most routines and decoding the 3 varints per-entry takes
		// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
		// us, so we do it manually. This provides a 10-15% performance improvement
		// on blockIter benchmarks on both go1.11 and go1.12.
		//
		// TODO(peter): remove this hack if go:inline is ever supported.

		// Decode the shared key length integer.
		var shared uint32
		if a := *((*uint8)(ptr)); a < 128 {
			shared = uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 1)
		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
			shared = uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 2)
		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
			shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 3)
		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
			shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 4)
		} else {
			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
			shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 5)
		}
		// Decode the unshared key length integer.
		var unshared uint32
		if a := *((*uint8)(ptr)); a < 128 {
			unshared = uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 1)
		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
			unshared = uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 2)
		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
			unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 3)
		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
			unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 4)
		} else {
			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
			unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 5)
		}
		// Decode the value length integer.
		var value uint32
		if a := *((*uint8)(ptr)); a < 128 {
			value = uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 1)
		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
			value = uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 2)
		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
			value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 3)
		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
			value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 4)
		} else {
			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
			value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
			ptr = unsafe.Pointer(uintptr(ptr) + 5)
		}
		// The starting position of the value.
		valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
		i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
		if invariants.Enabled && unshared < 8 {
			// This should not happen since only the key prefix is shared, so even
			// if the prefix length is the same as the user key length, the unshared
			// will include the trailer.
			panic(errors.AssertionFailedf("unshared %d is too small", unshared))
		}
		// The trailer is written in little endian, so the key kind is the first
		// byte in the trailer that is encoded in the slice [unshared-8:unshared].
		keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
		keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
		prefixChanged := false
		if keyKind == InternalKeyKindSet {
			if invariants.Enabled && value == 0 {
				panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
			}
			valPrefix := *((*valuePrefix)(valuePtr))
			if setHasSamePrefix(valPrefix) {
				// Fast-path. No need to assemble i.fullKey, or update i.key. We know
				// that subsequent keys will not have a shared length that is greater
				// than the prefix of the current key, which is also the prefix of
				// i.key. Since we are continuing to iterate, we don't need to
				// initialize i.ikey and i.lazyValue (these are initialized before
				// returning).
				nextFastCount++
				if nextFastCount > nextFastThresholdBeforeRestarts {
					if usedRestarts {
						// Exhausted iteration budget. This will never happen unless
						// someone is using a restart interval > 16. It is just to guard
						// against long restart intervals causing too much iteration.
						break
					}
					// Haven't used restarts yet, so find the first restart at or beyond
					// the current offset.
					targetOffset := i.offset
					var index int32
					{
						// NB: manually inlined sort.Search is ~5% faster.
						//
						// f defined for a restart point is true iff the offset >=
						// targetOffset.
						// Define f(-1) == false and f(i.numRestarts) == true.
						// Invariant: f(index-1) == false, f(upper) == true.
						upper := i.numRestarts
						for index < upper {
							h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
							// index ≤ h < upper
							offset := decodeRestart(i.data[i.restarts+4*h:])
							if offset < targetOffset {
								index = h + 1 // preserves f(index-1) == false
							} else {
								upper = h // preserves f(upper) == true
							}
						}
						// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
						// => answer is index.
					}
					usedRestarts = true
					nextFastCount = 0
					if index == i.numRestarts {
						// Already past the last real restart, so iterate a bit more until
						// we are done with the block.
						continue
					}
					// Have some real restarts after index. NB: index is the first
					// restart at or beyond the current offset.
					startingIndex := index
					for index != i.numRestarts &&
						// The restart at index is 4 bytes written in little endian format
						// starting at i.restarts+4*index. The 0th byte is the least
						// significant and the 3rd byte is the most significant. Since the
						// most significant bit of the 3rd byte is what we use for
						// encoding the set-has-same-prefix information, the indexing
						// below has +3.
						i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
						// We still have the same prefix, so move to the next restart.
						index++
					}
					// index is the first restart that did not have the same prefix.
					if index != startingIndex {
						// Managed to skip past at least one restart. Resume iteration
						// from index-1. Since nextFastCount has been reset to 0, we
						// should be able to iterate to the next prefix.
						i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
						i.readEntry()
					}
					// Else, unable to skip past any restart. Resume iteration. Since
					// nextFastCount has been reset to 0, we should be able to iterate
					// to the next prefix.
					continue
				}
				continue
			} else if prevKeyIsSet {
				prefixChanged = true
			}
		} else {
			prevKeyIsSet = false
		}
		// Slow-path cases:
		// - (Likely) The prefix has changed.
		// - (Unlikely) The prefix has not changed.
		// We assemble the key etc. under the assumption that it is the likely
		// case.
		unsharedKey := getBytes(ptr, int(unshared))
		// TODO(sumeer): move this into the else block below. This is a bit tricky
		// since the current logic assumes we have always copied the latest key
		// into fullKey, which is why when we get to the next key we can (a)
		// access i.fullKey[:shared], (b) append only the unsharedKey to
		// i.fullKey. For (a), we can access i.key[:shared] since that memory is
		// valid (even if unshared). For (b), we will need to remember whether
		// i.key refers to i.fullKey or not, and can append the unsharedKey only
		// in the former case and for the latter case need to copy the shared part
		// too. This same comment applies to the other place where we can do this
		// optimization, in readEntry().
		i.fullKey = append(i.fullKey[:shared], unsharedKey...)
		i.val = getBytes(valuePtr, int(value))
		if shared == 0 {
			// Provide stability for the key across positioning calls if the key
			// doesn't share a prefix with the previous key. This removes requiring the
			// key to be copied if the caller knows the block has a restart interval of
			// 1. An important example of this is range-del blocks.
			i.key = unsharedKey
		} else {
			i.key = i.fullKey
		}
		// Manually inlined version of i.decodeInternalKey(i.key).
		hiddenPoint := false
		if n := len(i.key) - 8; n >= 0 {
			trailer := binary.LittleEndian.Uint64(i.key[n:])
			hiddenPoint = i.hideObsoletePoints &&
				(trailer&trailerObsoleteBit != 0)
			i.ikey.Trailer = trailer & trailerObsoleteMask
			i.ikey.UserKey = i.key[:n:n]
			if i.globalSeqNum != 0 {
				i.ikey.SetSeqNum(i.globalSeqNum)
			}
		} else {
			i.ikey.Trailer = uint64(InternalKeyKindInvalid)
			i.ikey.UserKey = nil
		}
		nextCmpCount++
		if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 {
			panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
				i.ikey.UserKey, succKey))
		}
		if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 {
			// Prefix has changed.
			if hiddenPoint {
				return i.Next()
			}
			if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
				panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
			}
			if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
				i.lazyValue = base.MakeInPlaceValue(i.val)
			} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
				i.lazyValue = base.MakeInPlaceValue(i.val[1:])
			} else {
				i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
			}
			return &i.ikey, i.lazyValue
		}
		// Else prefix has not changed.
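		// We tolerate only a small number of these full key comparisons
		// (nextCmpThresholdBeforeSeek) before giving up on linear stepping and
		// falling back to the SeekGE(succKey) below.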

		if nextCmpCount >= nextCmpThresholdBeforeSeek {
			break
		}
	}
	return i.SeekGE(succKey, base.SeekGEFlagsNone)
}

// Prev implements internalIterator.Prev, as documented in the pebble
// package.
func (i *blockIter) Prev() (*InternalKey, base.LazyValue) {
start:
	for n := len(i.cached) - 1; n >= 0; n-- {
		i.nextOffset = i.offset
		e := &i.cached[n]
		i.offset = e.offset
		i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
		// Manually inlined version of i.decodeInternalKey(i.key).
		i.key = i.cachedBuf[e.keyStart:e.keyEnd]
		if n := len(i.key) - 8; n >= 0 {
			trailer := binary.LittleEndian.Uint64(i.key[n:])
			hiddenPoint := i.hideObsoletePoints &&
				(trailer&trailerObsoleteBit != 0)
			if hiddenPoint {
				continue
			}
			i.ikey.Trailer = trailer & trailerObsoleteMask
			i.ikey.UserKey = i.key[:n:n]
			if i.globalSeqNum != 0 {
				i.ikey.SetSeqNum(i.globalSeqNum)
			}
		} else {
			i.ikey.Trailer = uint64(InternalKeyKindInvalid)
			i.ikey.UserKey = nil
		}
		i.cached = i.cached[:n]
		if !i.lazyValueHandling.hasValuePrefix ||
			base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
			i.lazyValue = base.MakeInPlaceValue(i.val)
		} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
			i.lazyValue = base.MakeInPlaceValue(i.val[1:])
		} else {
			i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
		}
		return &i.ikey, i.lazyValue
	}

	i.clearCache()
	if i.offset <= 0 {
		i.offset = -1
		i.nextOffset = 0
		return nil, base.LazyValue{}
	}

	targetOffset := i.offset
	var index int32

	{
		// NB: manually inlined sort.Search is ~5% faster.
		//
		// Define f(-1) == false and f(n) == true.
		// Invariant: f(index-1) == false, f(upper) == true.
		upper := i.numRestarts
		for index < upper {
			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
			// index ≤ h < upper
			offset := decodeRestart(i.data[i.restarts+4*h:])
			if offset < targetOffset {
				// Looking for the first restart that has offset >= targetOffset, so
				// ignore h and earlier.
				index = h + 1 // preserves f(i-1) == false
			} else {
				upper = h // preserves f(j) == true
			}
		}
		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
		// => answer is index.
	}

	// index is first restart with offset >= targetOffset. Note that
	// targetOffset may not be at a restart point since one can call Prev()
	// after Next() (so the cache was not populated) and targetOffset refers to
	// the current entry. index-1 must have an offset < targetOffset (it can't
	// be equal to targetOffset since the binary search would have selected that
	// as the index).
	i.offset = 0
	if index > 0 {
		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
	}
	// TODO(sumeer): why is the else case not an error given targetOffset is a
	// valid offset.

	i.readEntry()

	// We stop when i.nextOffset == targetOffset since the targetOffset is the
	// entry we are stepping back from, and we don't need to cache the entry
	// before it, since it is the candidate to return.
1491 for i.nextOffset < targetOffset { 1492 i.cacheEntry() 1493 i.offset = i.nextOffset 1494 i.readEntry() 1495 } 1496 1497 hiddenPoint := i.decodeInternalKey(i.key) 1498 if hiddenPoint { 1499 // Use the cache. 1500 goto start 1501 } 1502 if !i.lazyValueHandling.hasValuePrefix || 1503 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { 1504 i.lazyValue = base.MakeInPlaceValue(i.val) 1505 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { 1506 i.lazyValue = base.MakeInPlaceValue(i.val[1:]) 1507 } else { 1508 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) 1509 } 1510 return &i.ikey, i.lazyValue 1511 } 1512 1513 // Key implements internalIterator.Key, as documented in the pebble package. 1514 func (i *blockIter) Key() *InternalKey { 1515 return &i.ikey 1516 } 1517 1518 func (i *blockIter) value() base.LazyValue { 1519 return i.lazyValue 1520 } 1521 1522 // Error implements internalIterator.Error, as documented in the pebble 1523 // package. 1524 func (i *blockIter) Error() error { 1525 return nil // infallible 1526 } 1527 1528 // Close implements internalIterator.Close, as documented in the pebble 1529 // package. 1530 func (i *blockIter) Close() error { 1531 i.handle.Release() 1532 i.handle = bufferHandle{} 1533 i.val = nil 1534 i.lazyValue = base.LazyValue{} 1535 i.lazyValueHandling.vbr = nil 1536 return nil 1537 } 1538 1539 func (i *blockIter) SetBounds(lower, upper []byte) { 1540 // This should never be called as bounds are handled by sstable.Iterator. 1541 panic("pebble: SetBounds unimplemented") 1542 } 1543 1544 func (i *blockIter) SetContext(_ context.Context) {} 1545 1546 func (i *blockIter) valid() bool { 1547 return i.offset >= 0 && i.offset < i.restarts 1548 } 1549 1550 // fragmentBlockIter wraps a blockIter, implementing the 1551 // keyspan.FragmentIterator interface. It's used for reading range deletion and 1552 // range key blocks. 1553 // 1554 // Range deletions and range keys are fragmented before they're persisted to the 1555 // block. Overlapping fragments have identical bounds. The fragmentBlockIter 1556 // gathers all the fragments with identical bounds within a block and returns a 1557 // single keyspan.Span describing all the keys defined over the span. 1558 // 1559 // # Memory lifetime 1560 // 1561 // A Span returned by fragmentBlockIter is only guaranteed to be stable until 1562 // the next fragmentBlockIter iteration positioning method. A Span's Keys slice 1563 // may be reused, so the user must not assume it's stable. 1564 // 1565 // Blocks holding range deletions and range keys are configured to use a restart 1566 // interval of 1. This provides key stability. The caller may treat the various 1567 // byte slices (start, end, suffix, value) as stable for the lifetime of the 1568 // iterator. 1569 type fragmentBlockIter struct { 1570 blockIter blockIter 1571 keyBuf [2]keyspan.Key 1572 span keyspan.Span 1573 err error 1574 dir int8 1575 closeHook func(i keyspan.FragmentIterator) error 1576 1577 // elideSameSeqnum, if true, returns only the first-occurring (in forward 1578 // order) Key for each sequence number. 
1579 elideSameSeqnum bool 1580 } 1581  1582 func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter { 1583 return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()} 1584 } 1585  1586 func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) { 1587 // TODO(jackson): The use of i.span.Keys to accumulate keys across multiple 1588 // calls to Decode is too confusing and subtle. Refactor to make it 1589 // explicit. 1590  1591 // Decode the contents of the fragment's value. This always includes at 1592 // least the end key: RANGEDELs store the end key directly as the value, 1593 // whereas the various range key kinds use a more complicated encoding. The 1594 // details of the range key internal value format are documented within the 1595 // internal/rangekey package. 1596 switch k.Kind() { 1597 case base.InternalKeyKindRangeDelete: 1598 i.span = rangedel.Decode(*k, internalValue, i.span.Keys) 1599 i.err = nil 1600 case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete: 1601 i.span, i.err = rangekey.Decode(*k, internalValue, i.span.Keys) 1602 default: 1603 i.span = keyspan.Span{} 1604 i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragment of kind %d", k.Kind()) 1605 } 1606 } 1607  1608 func (i *fragmentBlockIter) elideKeysOfSameSeqNum() { 1609 if invariants.Enabled { 1610 if !i.elideSameSeqnum || len(i.span.Keys) == 0 { 1611 panic("elideKeysOfSameSeqNum called when it should not be") 1612 } 1613 } 1614 lastSeqNum := i.span.Keys[0].SeqNum() 1615 k := 1 1616 for j := 1; j < len(i.span.Keys); j++ { 1617 if lastSeqNum != i.span.Keys[j].SeqNum() { 1618 lastSeqNum = i.span.Keys[j].SeqNum() 1619 i.span.Keys[k] = i.span.Keys[j] 1620 k++ 1621 } 1622 } 1623 i.span.Keys = i.span.Keys[:k] 1624 } 1625  1626 // gatherForward gathers internal keys with identical bounds. Keys defined over 1627 // spans of the keyspace are fragmented such that any overlapping key spans have 1628 // identical bounds. When these spans are persisted to a range deletion or range 1629 // key block, they may be persisted as multiple internal keys in order to encode 1630 // multiple sequence numbers or key kinds. 1631 // 1632 // gatherForward iterates forward, re-combining the fragmented internal keys to 1633 // reconstruct a keyspan.Span that holds all the keys defined over the span. 1634 func (i *fragmentBlockIter) gatherForward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span { 1635 i.span = keyspan.Span{} 1636 if k == nil || !i.blockIter.valid() { 1637 return nil 1638 } 1639 i.err = nil 1640 // Use the i.keyBuf array to back the Keys slice to prevent an allocation 1641 // when a span contains few keys. 1642 i.span.Keys = i.keyBuf[:0] 1643  1644 // Decode the span's end key and individual keys from the value. 1645 internalValue := lazyValue.InPlaceValue() 1646 i.decodeSpanKeys(k, internalValue) 1647 if i.err != nil { 1648 return nil 1649 } 1650 prevEnd := i.span.End 1651  1652 // There might exist additional internal keys with identical bounds encoded 1653 // within the block. Iterate forward, accumulating all the keys with 1654 // identical bounds to s. 1655 k, lazyValue = i.blockIter.Next() 1656 internalValue = lazyValue.InPlaceValue() 1657 for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 { 1658 i.decodeSpanKeys(k, internalValue) 1659 if i.err != nil { 1660 return nil 1661 } 1662  1663 // Since k indicates an equal start key, the encoded end key must 1664 // exactly equal the original end key from the first internal key.
1665 // Overlapping fragments are required to have exactly equal start and 1666 // end bounds. 1667 if i.blockIter.cmp(prevEnd, i.span.End) != 0 { 1668 i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation") 1669 i.span = keyspan.Span{} 1670 return nil 1671 } 1672 k, lazyValue = i.blockIter.Next() 1673 internalValue = lazyValue.InPlaceValue() 1674 } 1675 if i.elideSameSeqnum && len(i.span.Keys) > 0 { 1676 i.elideKeysOfSameSeqNum() 1677 } 1678 // i.blockIter is positioned over the first internal key for the next span. 1679 return &i.span 1680 } 1681 1682 // gatherBackward gathers internal keys with identical bounds. Keys defined over 1683 // spans of the keyspace are fragmented such that any overlapping key spans have 1684 // identical bounds. When these spans are persisted to a range deletion or range 1685 // key block, they may be persisted as multiple internal keys in order to encode 1686 // multiple sequence numbers or key kinds. 1687 // 1688 // gatherBackward iterates backwards, re-combining the fragmented internal keys 1689 // to reconstruct a keyspan.Span that holds all the keys defined over the span. 1690 func (i *fragmentBlockIter) gatherBackward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span { 1691 i.span = keyspan.Span{} 1692 if k == nil || !i.blockIter.valid() { 1693 return nil 1694 } 1695 i.err = nil 1696 // Use the i.keyBuf array to back the Keys slice to prevent an allocation 1697 // when a span contains few keys. 1698 i.span.Keys = i.keyBuf[:0] 1699 1700 // Decode the span's end key and individual keys from the value. 1701 internalValue := lazyValue.InPlaceValue() 1702 i.decodeSpanKeys(k, internalValue) 1703 if i.err != nil { 1704 return nil 1705 } 1706 prevEnd := i.span.End 1707 1708 // There might exist additional internal keys with identical bounds encoded 1709 // within the block. Iterate backward, accumulating all the keys with 1710 // identical bounds to s. 1711 k, lazyValue = i.blockIter.Prev() 1712 internalValue = lazyValue.InPlaceValue() 1713 for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 { 1714 i.decodeSpanKeys(k, internalValue) 1715 if i.err != nil { 1716 return nil 1717 } 1718 1719 // Since k indicates an equal start key, the encoded end key must 1720 // exactly equal the original end key from the first internal key. 1721 // Overlapping fragments are required to have exactly equal start and 1722 // end bounds. 1723 if i.blockIter.cmp(prevEnd, i.span.End) != 0 { 1724 i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation") 1725 i.span = keyspan.Span{} 1726 return nil 1727 } 1728 k, lazyValue = i.blockIter.Prev() 1729 internalValue = lazyValue.InPlaceValue() 1730 } 1731 // i.blockIter is positioned over the last internal key for the previous 1732 // span. 1733 1734 // Backwards iteration encounters internal keys in the wrong order. 1735 keyspan.SortKeysByTrailer(&i.span.Keys) 1736 1737 if i.elideSameSeqnum && len(i.span.Keys) > 0 { 1738 i.elideKeysOfSameSeqNum() 1739 } 1740 return &i.span 1741 } 1742 1743 // Error implements (keyspan.FragmentIterator).Error. 1744 func (i *fragmentBlockIter) Error() error { 1745 return i.err 1746 } 1747 1748 // Close implements (keyspan.FragmentIterator).Close. 
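// Close invokes the close hook registered via SetCloseHook, if any, and then
// releases the underlying block through blockIter.Close, returning the first
// non-nil error from the two steps.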
1749 func (i *fragmentBlockIter) Close() error { 1750 var err error 1751 if i.closeHook != nil { 1752 err = i.closeHook(i) 1753 } 1754 err = firstError(err, i.blockIter.Close()) 1755 return err 1756 } 1757  1758 // First implements (keyspan.FragmentIterator).First. 1759 func (i *fragmentBlockIter) First() *keyspan.Span { 1760 i.dir = +1 1761 return i.gatherForward(i.blockIter.First()) 1762 } 1763  1764 // Last implements (keyspan.FragmentIterator).Last. 1765 func (i *fragmentBlockIter) Last() *keyspan.Span { 1766 i.dir = -1 1767 return i.gatherBackward(i.blockIter.Last()) 1768 } 1769  1770 // Next implements (keyspan.FragmentIterator).Next. 1771 func (i *fragmentBlockIter) Next() *keyspan.Span { 1772 switch { 1773 case i.dir == -1 && !i.span.Valid(): 1774 // Switching directions. 1775 // 1776 // i.blockIter is exhausted, before the first key. Move onto the first. 1777 i.blockIter.First() 1778 i.dir = +1 1779 case i.dir == -1 && i.span.Valid(): 1780 // Switching directions. 1781 // 1782 // i.blockIter is currently positioned over the last internal key for 1783 // the previous span. Next it once to move to the first internal key 1784 // that makes up the current span, and gatherForward to land on the 1785 // first internal key making up the next span. 1786 // 1787 // In the diagram below, if the last span returned to the user during 1788 // reverse iteration was [b,c), i.blockIter is currently positioned at 1789 // [a,b). The block iter must be positioned over [d,e) to gather the 1790 // next span's fragments. 1791 // 1792 // ... [a,b) [b,c) [b,c) [b,c) [d,e) ... 1793 // ^ ^ 1794 // i.blockIter want 1795 if x := i.gatherForward(i.blockIter.Next()); invariants.Enabled && !x.Valid() { 1796 panic("pebble: invariant violation: next entry unexpectedly invalid") 1797 } 1798 i.dir = +1 1799 } 1800 // We know that this blockIter has in-place values. 1801 return i.gatherForward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val)) 1802 } 1803  1804 // Prev implements (keyspan.FragmentIterator).Prev. 1805 func (i *fragmentBlockIter) Prev() *keyspan.Span { 1806 switch { 1807 case i.dir == +1 && !i.span.Valid(): 1808 // Switching directions. 1809 // 1810 // i.blockIter is exhausted, after the last key. Move onto the last. 1811 i.blockIter.Last() 1812 i.dir = -1 1813 case i.dir == +1 && i.span.Valid(): 1814 // Switching directions. 1815 // 1816 // i.blockIter is currently positioned over the first internal key for 1817 // the next span. Prev it once to move to the last internal key that 1818 // makes up the current span, and gatherBackward to land on the last 1819 // internal key making up the previous span. 1820 // 1821 // In the diagram below, if the last span returned to the user during 1822 // forward iteration was [b,c), i.blockIter is currently positioned at 1823 // [d,e). The block iter must be positioned over [a,b) to gather the 1824 // previous span's fragments. 1825 // 1826 // ... [a,b) [b,c) [b,c) [b,c) [d,e) ... 1827 // ^ ^ 1828 // want i.blockIter 1829 if x := i.gatherBackward(i.blockIter.Prev()); invariants.Enabled && !x.Valid() { 1830 panic("pebble: invariant violation: previous entry unexpectedly invalid") 1831 } 1832 i.dir = -1 1833 } 1834 // We know that this blockIter has in-place values. 1835 return i.gatherBackward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val)) 1836 } 1837  1838 // SeekGE implements (keyspan.FragmentIterator).SeekGE.
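// SeekGE is implemented in terms of SeekLT: if the span found by SeekLT(k)
// extends past k (i.e. k < s.End), that span contains k and is returned;
// otherwise the result is the following span, obtained via Next.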
1839 func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span { 1840 if s := i.SeekLT(k); s != nil && i.blockIter.cmp(k, s.End) < 0 { 1841 return s 1842 } 1843 // TODO(jackson): If the above i.SeekLT(k) discovers a span but the span 1844 // doesn't meet the k < s.End comparison, then there's no need for the 1845 // SeekLT to gatherBackward. 1846 return i.Next() 1847 } 1848 1849 // SeekLT implements (keyspan.FragmentIterator).SeekLT. 1850 func (i *fragmentBlockIter) SeekLT(k []byte) *keyspan.Span { 1851 i.dir = -1 1852 return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone)) 1853 } 1854 1855 // String implements fmt.Stringer. 1856 func (i *fragmentBlockIter) String() string { 1857 return "fragment-block-iter" 1858 } 1859 1860 // SetCloseHook implements sstable.FragmentIterator. 1861 func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) { 1862 i.closeHook = fn 1863 }
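// A minimal usage sketch (not part of the package API), assuming the
// fragmentBlockIter has already been initialized over a range deletion or
// range key block; the helper name drainSpanCounts is hypothetical. It drains
// the iterator in the forward direction and counts the spans and keys it
// yields.
//
//	func drainSpanCounts(it *fragmentBlockIter) (spans, keys int, err error) {
//		// Each Span returned by First/Next aggregates all fragments in the
//		// block that share identical bounds.
//		for s := it.First(); s != nil; s = it.Next() {
//			spans++
//			keys += len(s.Keys)
//		}
//		// Surface any decode error alongside the error from releasing the
//		// underlying block.
//		return spans, keys, firstError(it.Error(), it.Close())
//	}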