github.com/cockroachdb/pebble@v1.1.2/sstable/block.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"encoding/binary"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manual"
	"github.com/cockroachdb/pebble/internal/rangedel"
	"github.com/cockroachdb/pebble/internal/rangekey"
)

func uvarintLen(v uint32) int {
	i := 0
	for v >= 0x80 {
		v >>= 7
		i++
	}
	return i + 1
}

type blockWriter struct {
	restartInterval int
	nEntries        int
	nextRestart     int
	buf             []byte
	// For data blocks in TableFormatPebblev3, we steal the most significant bit
	// in restarts for encoding setHasSameKeyPrefixSinceLastRestart. This leaves
	// us with 31 bits, which is more than enough (no one needs > 2GB blocks).
	// Typically, restarts occur every 16 keys, and by storing this bit with the
	// restart, we can optimize for the case where a user wants to skip to the
	// next prefix which happens to be in the same data block, but is > 16 keys
	// away. We have seen production situations with 100+ versions per MVCC key
	// (which share the same prefix). Additionally, for such writers, the part of
	// a key that is shared with the preceding key is limited to the prefix part
	// of the preceding key. This ensures that when doing NextPrefix (see
	// blockIter) we don't need to assemble the full key at each step: by
	// limiting the length of the shared key, any of the keys with the same
	// prefix can be used to assemble the full key when the prefix does change.
	restarts []uint32
	// Do not read curKey directly from outside blockWriter since it can have
	// the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or
	// getCurUserKey() instead.
	curKey []byte
	// curValue excludes the optional prefix provided to
	// storeWithOptionalValuePrefix.
	curValue []byte
	prevKey  []byte
	tmp      [4]byte
	// We don't know the state of the SETs that were at the end of the previous
	// block, so this is initially false. It may be true for the second and later
	// restarts in a block. Not having inter-block information is fine since we
	// will optimize by stepping through restarts only within the same block.
	// Note that the first restart is the first key in the block.
	setHasSameKeyPrefixSinceLastRestart bool
}

func (w *blockWriter) clear() {
	*w = blockWriter{
		buf:      w.buf[:0],
		restarts: w.restarts[:0],
		curKey:   w.curKey[:0],
		curValue: w.curValue[:0],
		prevKey:  w.prevKey[:0],
	}
}

// MaximumBlockSize is an extremely generous maximum block size of 256MiB. We
// explicitly place this limit to reserve a few bits in the restart for
// internal use.
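// For data blocks in TableFormatPebblev3 the stolen bit is the most
// significant bit of each restart entry. A sketch of the packing and
// unpacking, using the constants declared below (the real encoding lives in
// storeWithOptionalValuePrefix and decodeRestart):
//
//	restart := uint32(offset) // offset < MaximumBlockSize, so bit 31 is free
//	if w.setHasSameKeyPrefixSinceLastRestart {
//		restart |= setHasSameKeyPrefixRestartMask
//	}
//	// ...and to recover the offset:
//	offset = restart &^ setHasSameKeyPrefixRestartMask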
79 const MaximumBlockSize = 1 << 28 80 const setHasSameKeyPrefixRestartMask uint32 = 1 << 31 81 const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111 82 const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000 83 84 func (w *blockWriter) getCurKey() InternalKey { 85 k := base.DecodeInternalKey(w.curKey) 86 k.Trailer = k.Trailer & trailerObsoleteMask 87 return k 88 } 89 90 func (w *blockWriter) getCurUserKey() []byte { 91 n := len(w.curKey) - base.InternalTrailerLen 92 if n < 0 { 93 panic(errors.AssertionFailedf("corrupt key in blockWriter buffer")) 94 } 95 return w.curKey[:n:n] 96 } 97 98 // If !addValuePrefix, the valuePrefix is ignored. 99 func (w *blockWriter) storeWithOptionalValuePrefix( 100 keySize int, 101 value []byte, 102 maxSharedKeyLen int, 103 addValuePrefix bool, 104 valuePrefix valuePrefix, 105 setHasSameKeyPrefix bool, 106 ) { 107 shared := 0 108 if !setHasSameKeyPrefix { 109 w.setHasSameKeyPrefixSinceLastRestart = false 110 } 111 if w.nEntries == w.nextRestart { 112 w.nextRestart = w.nEntries + w.restartInterval 113 restart := uint32(len(w.buf)) 114 if w.setHasSameKeyPrefixSinceLastRestart { 115 restart = restart | setHasSameKeyPrefixRestartMask 116 } 117 w.setHasSameKeyPrefixSinceLastRestart = true 118 w.restarts = append(w.restarts, restart) 119 } else { 120 // TODO(peter): Manually inlined version of base.SharedPrefixLen(). This 121 // is 3% faster on BenchmarkWriter on go1.16. Remove if future versions 122 // show this to not be a performance win. For now, functions that use of 123 // unsafe cannot be inlined. 124 n := maxSharedKeyLen 125 if n > len(w.prevKey) { 126 n = len(w.prevKey) 127 } 128 asUint64 := func(b []byte, i int) uint64 { 129 return binary.LittleEndian.Uint64(b[i:]) 130 } 131 for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) { 132 shared += 8 133 } 134 for shared < n && w.curKey[shared] == w.prevKey[shared] { 135 shared++ 136 } 137 } 138 139 lenValuePlusOptionalPrefix := len(value) 140 if addValuePrefix { 141 lenValuePlusOptionalPrefix++ 142 } 143 needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + lenValuePlusOptionalPrefix 144 n := len(w.buf) 145 if cap(w.buf) < n+needed { 146 newCap := 2 * cap(w.buf) 147 if newCap == 0 { 148 newCap = 1024 149 } 150 for newCap < n+needed { 151 newCap *= 2 152 } 153 newBuf := make([]byte, n, newCap) 154 copy(newBuf, w.buf) 155 w.buf = newBuf 156 } 157 w.buf = w.buf[:n+needed] 158 159 // TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15% 160 // faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions 161 // show this to not be a performance win. 
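	// As a sketch, the three inlined loops below are equivalent to:
	//
	//	n += binary.PutUvarint(w.buf[n:], uint64(shared))
	//	n += binary.PutUvarint(w.buf[n:], uint64(keySize-shared))
	//	n += binary.PutUvarint(w.buf[n:], uint64(lenValuePlusOptionalPrefix))
	//
	// but without the per-call overhead.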
162 { 163 x := uint32(shared) 164 for x >= 0x80 { 165 w.buf[n] = byte(x) | 0x80 166 x >>= 7 167 n++ 168 } 169 w.buf[n] = byte(x) 170 n++ 171 } 172 173 { 174 x := uint32(keySize - shared) 175 for x >= 0x80 { 176 w.buf[n] = byte(x) | 0x80 177 x >>= 7 178 n++ 179 } 180 w.buf[n] = byte(x) 181 n++ 182 } 183 184 { 185 x := uint32(lenValuePlusOptionalPrefix) 186 for x >= 0x80 { 187 w.buf[n] = byte(x) | 0x80 188 x >>= 7 189 n++ 190 } 191 w.buf[n] = byte(x) 192 n++ 193 } 194 195 n += copy(w.buf[n:], w.curKey[shared:]) 196 if addValuePrefix { 197 w.buf[n : n+1][0] = byte(valuePrefix) 198 n++ 199 } 200 n += copy(w.buf[n:], value) 201 w.buf = w.buf[:n] 202 203 w.curValue = w.buf[n-len(value):] 204 205 w.nEntries++ 206 } 207 208 func (w *blockWriter) add(key InternalKey, value []byte) { 209 w.addWithOptionalValuePrefix( 210 key, false, value, len(key.UserKey), false, 0, false) 211 } 212 213 // Callers that always set addValuePrefix to false should use add() instead. 214 // 215 // isObsolete indicates whether this key-value pair is obsolete in this 216 // sstable (only applicable when writing data blocks) -- see the comment in 217 // table.go and the longer one in format.go. addValuePrefix adds a 1 byte 218 // prefix to the value, specified in valuePrefix -- this is used for data 219 // blocks in TableFormatPebblev3 onwards for SETs (see the comment in 220 // format.go, with more details in value_block.go). setHasSameKeyPrefix is 221 // also used in TableFormatPebblev3 onwards for SETs. 222 func (w *blockWriter) addWithOptionalValuePrefix( 223 key InternalKey, 224 isObsolete bool, 225 value []byte, 226 maxSharedKeyLen int, 227 addValuePrefix bool, 228 valuePrefix valuePrefix, 229 setHasSameKeyPrefix bool, 230 ) { 231 w.curKey, w.prevKey = w.prevKey, w.curKey 232 233 size := key.Size() 234 if cap(w.curKey) < size { 235 w.curKey = make([]byte, 0, size*2) 236 } 237 w.curKey = w.curKey[:size] 238 if isObsolete { 239 key.Trailer = key.Trailer | trailerObsoleteBit 240 } 241 key.Encode(w.curKey) 242 243 w.storeWithOptionalValuePrefix( 244 size, value, maxSharedKeyLen, addValuePrefix, valuePrefix, setHasSameKeyPrefix) 245 } 246 247 func (w *blockWriter) finish() []byte { 248 // Write the restart points to the buffer. 249 if w.nEntries == 0 { 250 // Every block must have at least one restart point. 251 if cap(w.restarts) > 0 { 252 w.restarts = w.restarts[:1] 253 w.restarts[0] = 0 254 } else { 255 w.restarts = append(w.restarts, 0) 256 } 257 } 258 tmp4 := w.tmp[:4] 259 for _, x := range w.restarts { 260 binary.LittleEndian.PutUint32(tmp4, x) 261 w.buf = append(w.buf, tmp4...) 262 } 263 binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts))) 264 w.buf = append(w.buf, tmp4...) 265 result := w.buf 266 267 // Reset the block state. 268 w.nEntries = 0 269 w.nextRestart = 0 270 w.buf = w.buf[:0] 271 w.restarts = w.restarts[:0] 272 return result 273 } 274 275 // emptyBlockSize holds the size of an empty block. Every block ends 276 // in a uint32 trailer encoding the number of restart points within the 277 // block. 278 const emptyBlockSize = 4 279 280 func (w *blockWriter) estimatedSize() int { 281 return len(w.buf) + 4*len(w.restarts) + emptyBlockSize 282 } 283 284 type blockEntry struct { 285 offset int32 286 keyStart int32 287 keyEnd int32 288 valStart int32 289 valSize int32 290 } 291 292 // blockIter is an iterator over a single block of data. 293 // 294 // A blockIter provides an additional guarantee around key stability when a 295 // block has a restart interval of 1 (i.e. 
when there is no prefix 296 // compression). Key stability refers to whether the InternalKey.UserKey bytes 297 // returned by a positioning call will remain stable after a subsequent 298 // positioning call. The normal case is that a positioning call will invalidate 299 // any previously returned InternalKey.UserKey. If a block has a restart 300 // interval of 1 (no prefix compression), blockIter guarantees that 301 // InternalKey.UserKey will point to the key as stored in the block itself 302 // which will remain valid until the blockIter is closed. The key stability 303 // guarantee is used by the range tombstone and range key code, which knows that 304 // the respective blocks are always encoded with a restart interval of 1. This 305 // per-block key stability guarantee is sufficient for range tombstones and 306 // range deletes as they are always encoded in a single block. 307 // 308 // A blockIter also provides a value stability guarantee for range deletions and 309 // range keys since there is only a single range deletion and range key block 310 // per sstable and the blockIter will not release the bytes for the block until 311 // it is closed. 312 // 313 // Note on why blockIter knows about lazyValueHandling: 314 // 315 // blockIter's positioning functions (that return a LazyValue), are too 316 // complex to inline even prior to lazyValueHandling. blockIter.Next and 317 // blockIter.First were by far the cheapest and had costs 195 and 180 318 // respectively, which exceeds the budget of 80. We initially tried to keep 319 // the lazyValueHandling logic out of blockIter by wrapping it with a 320 // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this 321 // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that 322 // they called the corresponding blockIter func and then decided whether the 323 // value was in fact in-place (so return immediately) or needed further 324 // handling. But these also turned out too costly for mid-stack inlining since 325 // simple calls like the following have a high cost that is barely under the 326 // budget of 80 327 // 328 // k, v := i.data.SeekGE(key, flags) // cost 74 329 // k, v := i.data.Next() // cost 72 330 // 331 // We have 2 options for minimizing performance regressions: 332 // - Include the lazyValueHandling logic in the already non-inlineable 333 // blockIter functions: Since most of the time is spent in data block iters, 334 // it is acceptable to take the small hit of unnecessary branching (which 335 // hopefully branch prediction will predict correctly) for other kinds of 336 // blocks. 337 // - Duplicate the logic of singleLevelIterator and twoLevelIterator for the 338 // v3 sstable and only use the aforementioned lazyValueDataBlockIter for a 339 // v3 sstable. We would want to manage these copies via code generation. 340 // 341 // We have picked the first option here. 342 type blockIter struct { 343 cmp Compare 344 // offset is the byte index that marks where the current key/value is 345 // encoded in the block. 346 offset int32 347 // nextOffset is the byte index where the next key/value is encoded in the 348 // block. 349 nextOffset int32 350 // A "restart point" in a block is a point where the full key is encoded, 351 // instead of just having a suffix of the key encoded. See readEntry() for 352 // how prefix compression of keys works. Keys in between two restart points 353 // only have a suffix encoded in the block. When restart interval is 1, no 354 // prefix compression of keys happens. 
This is the case with range tombstone 355 // blocks. 356 // 357 // All restart offsets are listed in increasing order in 358 // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last 359 // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can 360 // therefore be seen as the point where data in the block ends, and a list 361 // of offsets of all restart points begins. 362 restarts int32 363 // Number of restart points in this block. Encoded at the end of the block 364 // as a uint32. 365 numRestarts int32 366 globalSeqNum uint64 367 ptr unsafe.Pointer 368 data []byte 369 // key contains the raw key the iterator is currently pointed at. This may 370 // point directly to data stored in the block (for a key which has no prefix 371 // compression), to fullKey (for a prefix compressed key), or to a slice of 372 // data stored in cachedBuf (during reverse iteration). 373 key []byte 374 // fullKey is a buffer used for key prefix decompression. 375 fullKey []byte 376 // val contains the value the iterator is currently pointed at. If non-nil, 377 // this points to a slice of the block data. 378 val []byte 379 // lazyValue is val turned into a LazyValue, whenever a positioning method 380 // returns a non-nil key-value pair. 381 lazyValue base.LazyValue 382 // ikey contains the decoded InternalKey the iterator is currently pointed 383 // at. Note that the memory backing ikey.UserKey is either data stored 384 // directly in the block, fullKey, or cachedBuf. The key stability guarantee 385 // for blocks built with a restart interval of 1 is achieved by having 386 // ikey.UserKey always point to data stored directly in the block. 387 ikey InternalKey 388 // cached and cachedBuf are used during reverse iteration. They are needed 389 // because we can't perform prefix decoding in reverse, only in the forward 390 // direction. In order to iterate in reverse, we decode and cache the entries 391 // between two restart points. 392 // 393 // Note that cached[len(cached)-1] contains the previous entry to the one the 394 // blockIter is currently pointed at. As usual, nextOffset will contain the 395 // offset of the next entry. During reverse iteration, nextOffset will be 396 // updated to point to offset, and we'll set the blockIter to point at the 397 // entry cached[len(cached)-1]. See Prev() for more details. 398 // 399 // For a block encoded with a restart interval of 1, cached and cachedBuf 400 // will not be used as there are no prefix compressed entries between the 401 // restart points. 402 cached []blockEntry 403 cachedBuf []byte 404 handle bufferHandle 405 // for block iteration for already loaded blocks. 406 firstUserKey []byte 407 lazyValueHandling struct { 408 vbr *valueBlockReader 409 hasValuePrefix bool 410 } 411 hideObsoletePoints bool 412 } 413 414 // blockIter implements the base.InternalIterator interface. 
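//
// A typical forward scan over a block looks roughly like the following
// sketch (error handling elided; bytes.Compare stands in for the sstable's
// configured Compare, and blockData is a placeholder name for the raw block
// contents):
//
//	it, _ := newBlockIter(bytes.Compare, blockData)
//	for k, v := it.First(); k != nil; k, v = it.Next() {
//		_ = v.InPlaceValue() // in-place values; see base.LazyValue for handles
//	}
//	_ = it.Close()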
415 var _ base.InternalIterator = (*blockIter)(nil) 416 417 func newBlockIter(cmp Compare, block block) (*blockIter, error) { 418 i := &blockIter{} 419 return i, i.init(cmp, block, 0, false) 420 } 421 422 func (i *blockIter) String() string { 423 return "block" 424 } 425 426 func (i *blockIter) init( 427 cmp Compare, block block, globalSeqNum uint64, hideObsoletePoints bool, 428 ) error { 429 numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:])) 430 if numRestarts == 0 { 431 return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)") 432 } 433 i.cmp = cmp 434 i.restarts = int32(len(block)) - 4*(1+numRestarts) 435 i.numRestarts = numRestarts 436 i.globalSeqNum = globalSeqNum 437 i.ptr = unsafe.Pointer(&block[0]) 438 i.data = block 439 i.fullKey = i.fullKey[:0] 440 i.val = nil 441 i.hideObsoletePoints = hideObsoletePoints 442 i.clearCache() 443 if i.restarts > 0 { 444 if err := i.readFirstKey(); err != nil { 445 return err 446 } 447 } else { 448 // Block is empty. 449 i.firstUserKey = nil 450 } 451 return nil 452 } 453 454 // NB: two cases of hideObsoletePoints: 455 // - Local sstable iteration: globalSeqNum will be set iff the sstable was 456 // ingested. 457 // - Foreign sstable iteration: globalSeqNum is always set. 458 func (i *blockIter) initHandle( 459 cmp Compare, block bufferHandle, globalSeqNum uint64, hideObsoletePoints bool, 460 ) error { 461 i.handle.Release() 462 i.handle = block 463 return i.init(cmp, block.Get(), globalSeqNum, hideObsoletePoints) 464 } 465 466 func (i *blockIter) invalidate() { 467 i.clearCache() 468 i.offset = 0 469 i.nextOffset = 0 470 i.restarts = 0 471 i.numRestarts = 0 472 i.data = nil 473 } 474 475 // isDataInvalidated returns true when the blockIter has been invalidated 476 // using an invalidate call. NB: this is different from blockIter.Valid 477 // which is part of the InternalIterator implementation. 478 func (i *blockIter) isDataInvalidated() bool { 479 return i.data == nil 480 } 481 482 func (i *blockIter) resetForReuse() blockIter { 483 return blockIter{ 484 fullKey: i.fullKey[:0], 485 cached: i.cached[:0], 486 cachedBuf: i.cachedBuf[:0], 487 data: nil, 488 } 489 } 490 491 func (i *blockIter) readEntry() { 492 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset)) 493 494 // This is an ugly performance hack. Reading entries from blocks is one of 495 // the inner-most routines and decoding the 3 varints per-entry takes 496 // significant time. Neither go1.11 or go1.12 will inline decodeVarint for 497 // us, so we do it manually. This provides a 10-15% performance improvement 498 // on blockIter benchmarks on both go1.11 and go1.12. 499 // 500 // TODO(peter): remove this hack if go:inline is ever supported. 
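	//
	// The hand-rolled decoding below is a sketch-equivalent of:
	//
	//	shared, n := binary.Uvarint(i.data[i.offset:])
	//	unshared, m := binary.Uvarint(i.data[int(i.offset)+n:])
	//	value, _ := binary.Uvarint(i.data[int(i.offset)+n+m:])
	//
	// i.e. each entry is laid out as: varint shared key length, varint
	// unshared key length, varint value length, unshared key bytes, value
	// bytes.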
501 502 var shared uint32 503 if a := *((*uint8)(ptr)); a < 128 { 504 shared = uint32(a) 505 ptr = unsafe.Pointer(uintptr(ptr) + 1) 506 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 507 shared = uint32(b)<<7 | uint32(a) 508 ptr = unsafe.Pointer(uintptr(ptr) + 2) 509 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 510 shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 511 ptr = unsafe.Pointer(uintptr(ptr) + 3) 512 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 513 shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 514 ptr = unsafe.Pointer(uintptr(ptr) + 4) 515 } else { 516 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 517 shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 518 ptr = unsafe.Pointer(uintptr(ptr) + 5) 519 } 520 521 var unshared uint32 522 if a := *((*uint8)(ptr)); a < 128 { 523 unshared = uint32(a) 524 ptr = unsafe.Pointer(uintptr(ptr) + 1) 525 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 526 unshared = uint32(b)<<7 | uint32(a) 527 ptr = unsafe.Pointer(uintptr(ptr) + 2) 528 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 529 unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 530 ptr = unsafe.Pointer(uintptr(ptr) + 3) 531 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 532 unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 533 ptr = unsafe.Pointer(uintptr(ptr) + 4) 534 } else { 535 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 536 unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 537 ptr = unsafe.Pointer(uintptr(ptr) + 5) 538 } 539 540 var value uint32 541 if a := *((*uint8)(ptr)); a < 128 { 542 value = uint32(a) 543 ptr = unsafe.Pointer(uintptr(ptr) + 1) 544 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 545 value = uint32(b)<<7 | uint32(a) 546 ptr = unsafe.Pointer(uintptr(ptr) + 2) 547 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 548 value = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 549 ptr = unsafe.Pointer(uintptr(ptr) + 3) 550 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 551 value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 552 ptr = unsafe.Pointer(uintptr(ptr) + 4) 553 } else { 554 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 555 value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 556 ptr = unsafe.Pointer(uintptr(ptr) + 5) 557 } 558 559 unsharedKey := getBytes(ptr, int(unshared)) 560 // TODO(sumeer): move this into the else block below. 561 i.fullKey = append(i.fullKey[:shared], unsharedKey...) 562 if shared == 0 { 563 // Provide stability for the key across positioning calls if the key 564 // doesn't share a prefix with the previous key. This removes requiring the 565 // key to be copied if the caller knows the block has a restart interval of 566 // 1. An important example of this is range-del blocks. 567 i.key = unsharedKey 568 } else { 569 i.key = i.fullKey 570 } 571 ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared)) 572 i.val = getBytes(ptr, int(value)) 573 i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value) 574 } 575 576 func (i *blockIter) readFirstKey() error { 577 ptr := i.ptr 578 579 // This is an ugly performance hack. 
Reading entries from blocks is one of 580 // the inner-most routines and decoding the 3 varints per-entry takes 581 // significant time. Neither go1.11 or go1.12 will inline decodeVarint for 582 // us, so we do it manually. This provides a 10-15% performance improvement 583 // on blockIter benchmarks on both go1.11 and go1.12. 584 // 585 // TODO(peter): remove this hack if go:inline is ever supported. 586 587 if shared := *((*uint8)(ptr)); shared == 0 { 588 ptr = unsafe.Pointer(uintptr(ptr) + 1) 589 } else { 590 // The shared length is != 0, which is invalid. 591 panic("first key in block must have zero shared length") 592 } 593 594 var unshared uint32 595 if a := *((*uint8)(ptr)); a < 128 { 596 unshared = uint32(a) 597 ptr = unsafe.Pointer(uintptr(ptr) + 1) 598 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 599 unshared = uint32(b)<<7 | uint32(a) 600 ptr = unsafe.Pointer(uintptr(ptr) + 2) 601 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 602 unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 603 ptr = unsafe.Pointer(uintptr(ptr) + 3) 604 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 605 unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 606 ptr = unsafe.Pointer(uintptr(ptr) + 4) 607 } else { 608 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 609 unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 610 ptr = unsafe.Pointer(uintptr(ptr) + 5) 611 } 612 613 // Skip the value length. 614 if a := *((*uint8)(ptr)); a < 128 { 615 ptr = unsafe.Pointer(uintptr(ptr) + 1) 616 } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 { 617 ptr = unsafe.Pointer(uintptr(ptr) + 2) 618 } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 { 619 ptr = unsafe.Pointer(uintptr(ptr) + 3) 620 } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 { 621 ptr = unsafe.Pointer(uintptr(ptr) + 4) 622 } else { 623 ptr = unsafe.Pointer(uintptr(ptr) + 5) 624 } 625 626 firstKey := getBytes(ptr, int(unshared)) 627 // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on 628 // BlockIter benchmarks. 629 if n := len(firstKey) - 8; n >= 0 { 630 i.firstUserKey = firstKey[:n:n] 631 } else { 632 i.firstUserKey = nil 633 return base.CorruptionErrorf("pebble/table: invalid firstKey in block") 634 } 635 return nil 636 } 637 638 // The sstable internal obsolete bit is set when writing a block and unset by 639 // blockIter, so no code outside block writing/reading code ever sees it. 640 const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit) 641 const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask) 642 643 func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) { 644 // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on 645 // BlockIter benchmarks. 
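	// The inlined code below is roughly equivalent to:
	//
	//	i.ikey = base.DecodeInternalKey(key)
	//	i.ikey.Trailer &= trailerObsoleteMask
	//
	// plus the hideObsoletePoints and globalSeqNum handling. The last 8 bytes
	// of an encoded internal key hold the trailer (seqnum<<8 | kind), encoded
	// in little endian.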
646 if n := len(key) - 8; n >= 0 { 647 trailer := binary.LittleEndian.Uint64(key[n:]) 648 hiddenPoint = i.hideObsoletePoints && 649 (trailer&trailerObsoleteBit != 0) 650 i.ikey.Trailer = trailer & trailerObsoleteMask 651 i.ikey.UserKey = key[:n:n] 652 if i.globalSeqNum != 0 { 653 i.ikey.SetSeqNum(i.globalSeqNum) 654 } 655 } else { 656 i.ikey.Trailer = uint64(InternalKeyKindInvalid) 657 i.ikey.UserKey = nil 658 } 659 return hiddenPoint 660 } 661 662 func (i *blockIter) clearCache() { 663 i.cached = i.cached[:0] 664 i.cachedBuf = i.cachedBuf[:0] 665 } 666 667 func (i *blockIter) cacheEntry() { 668 var valStart int32 669 valSize := int32(len(i.val)) 670 if valSize > 0 { 671 valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr)) 672 } 673 674 i.cached = append(i.cached, blockEntry{ 675 offset: i.offset, 676 keyStart: int32(len(i.cachedBuf)), 677 keyEnd: int32(len(i.cachedBuf) + len(i.key)), 678 valStart: valStart, 679 valSize: valSize, 680 }) 681 i.cachedBuf = append(i.cachedBuf, i.key...) 682 } 683 684 func (i *blockIter) getFirstUserKey() []byte { 685 return i.firstUserKey 686 } 687 688 // SeekGE implements internalIterator.SeekGE, as documented in the pebble 689 // package. 690 func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { 691 if invariants.Enabled && i.isDataInvalidated() { 692 panic(errors.AssertionFailedf("invalidated blockIter used")) 693 } 694 695 i.clearCache() 696 // Find the index of the smallest restart point whose key is > the key 697 // sought; index will be numRestarts if there is no such restart point. 698 i.offset = 0 699 var index int32 700 701 { 702 // NB: manually inlined sort.Seach is ~5% faster. 703 // 704 // Define f(-1) == false and f(n) == true. 705 // Invariant: f(index-1) == false, f(upper) == true. 706 upper := i.numRestarts 707 for index < upper { 708 h := int32(uint(index+upper) >> 1) // avoid overflow when computing h 709 // index ≤ h < upper 710 offset := decodeRestart(i.data[i.restarts+4*h:]) 711 // For a restart point, there are 0 bytes shared with the previous key. 712 // The varint encoding of 0 occupies 1 byte. 713 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1)) 714 715 // Decode the key at that restart point, and compare it to the key 716 // sought. See the comment in readEntry for why we manually inline the 717 // varint decoding. 
718 var v1 uint32 719 if a := *((*uint8)(ptr)); a < 128 { 720 v1 = uint32(a) 721 ptr = unsafe.Pointer(uintptr(ptr) + 1) 722 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 723 v1 = uint32(b)<<7 | uint32(a) 724 ptr = unsafe.Pointer(uintptr(ptr) + 2) 725 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 726 v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 727 ptr = unsafe.Pointer(uintptr(ptr) + 3) 728 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 729 v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 730 ptr = unsafe.Pointer(uintptr(ptr) + 4) 731 } else { 732 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 733 v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 734 ptr = unsafe.Pointer(uintptr(ptr) + 5) 735 } 736 737 if *((*uint8)(ptr)) < 128 { 738 ptr = unsafe.Pointer(uintptr(ptr) + 1) 739 } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 { 740 ptr = unsafe.Pointer(uintptr(ptr) + 2) 741 } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 { 742 ptr = unsafe.Pointer(uintptr(ptr) + 3) 743 } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 { 744 ptr = unsafe.Pointer(uintptr(ptr) + 4) 745 } else { 746 ptr = unsafe.Pointer(uintptr(ptr) + 5) 747 } 748 749 // Manually inlining part of base.DecodeInternalKey provides a 5-10% 750 // speedup on BlockIter benchmarks. 751 s := getBytes(ptr, int(v1)) 752 var k []byte 753 if n := len(s) - 8; n >= 0 { 754 k = s[:n:n] 755 } 756 // Else k is invalid, and left as nil 757 758 if i.cmp(key, k) > 0 { 759 // The search key is greater than the user key at this restart point. 760 // Search beyond this restart point, since we are trying to find the 761 // first restart point with a user key >= the search key. 762 index = h + 1 // preserves f(i-1) == false 763 } else { 764 // k >= search key, so prune everything after index (since index 765 // satisfies the property we are looking for). 766 upper = h // preserves f(j) == true 767 } 768 } 769 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true 770 // => answer is index. 771 } 772 773 // index is the first restart point with key >= search key. Define the keys 774 // between a restart point and the next restart point as belonging to that 775 // restart point. 776 // 777 // Since keys are strictly increasing, if index > 0 then the restart point 778 // at index-1 will be the first one that has some keys belonging to it that 779 // could be equal to the search key. If index == 0, then all keys in this 780 // block are larger than the key sought, and offset remains at zero. 781 if index > 0 { 782 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) 783 } 784 i.readEntry() 785 hiddenPoint := i.decodeInternalKey(i.key) 786 787 // Iterate from that restart point to somewhere >= the key sought. 
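	//
	// Once positioned, i.lazyValue is initialized from i.val using the same
	// pattern every positioning method follows: non-SET keys and blocks
	// without value prefixes return the value in place; for SETs in v3 data
	// blocks the leading valuePrefix byte is either stripped (in-place value)
	// or handed to i.lazyValueHandling.vbr to resolve a value-block handle.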
788 if !i.valid() { 789 return nil, base.LazyValue{} 790 } 791 if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 { 792 // Initialize i.lazyValue 793 if !i.lazyValueHandling.hasValuePrefix || 794 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { 795 i.lazyValue = base.MakeInPlaceValue(i.val) 796 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { 797 i.lazyValue = base.MakeInPlaceValue(i.val[1:]) 798 } else { 799 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) 800 } 801 return &i.ikey, i.lazyValue 802 } 803 for i.Next(); i.valid(); i.Next() { 804 if i.cmp(i.ikey.UserKey, key) >= 0 { 805 // i.Next() has already initialized i.lazyValue. 806 return &i.ikey, i.lazyValue 807 } 808 } 809 return nil, base.LazyValue{} 810 } 811 812 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 813 // pebble package. 814 func (i *blockIter) SeekPrefixGE( 815 prefix, key []byte, flags base.SeekGEFlags, 816 ) (*base.InternalKey, base.LazyValue) { 817 // This should never be called as prefix iteration is handled by sstable.Iterator. 818 panic("pebble: SeekPrefixGE unimplemented") 819 } 820 821 // SeekLT implements internalIterator.SeekLT, as documented in the pebble 822 // package. 823 func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { 824 if invariants.Enabled && i.isDataInvalidated() { 825 panic(errors.AssertionFailedf("invalidated blockIter used")) 826 } 827 828 i.clearCache() 829 // Find the index of the smallest restart point whose key is >= the key 830 // sought; index will be numRestarts if there is no such restart point. 831 i.offset = 0 832 var index int32 833 834 { 835 // NB: manually inlined sort.Search is ~5% faster. 836 // 837 // Define f(-1) == false and f(n) == true. 838 // Invariant: f(index-1) == false, f(upper) == true. 839 upper := i.numRestarts 840 for index < upper { 841 h := int32(uint(index+upper) >> 1) // avoid overflow when computing h 842 // index ≤ h < upper 843 offset := decodeRestart(i.data[i.restarts+4*h:]) 844 // For a restart point, there are 0 bytes shared with the previous key. 845 // The varint encoding of 0 occupies 1 byte. 846 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1)) 847 848 // Decode the key at that restart point, and compare it to the key 849 // sought. See the comment in readEntry for why we manually inline the 850 // varint decoding. 
851 var v1 uint32 852 if a := *((*uint8)(ptr)); a < 128 { 853 v1 = uint32(a) 854 ptr = unsafe.Pointer(uintptr(ptr) + 1) 855 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 856 v1 = uint32(b)<<7 | uint32(a) 857 ptr = unsafe.Pointer(uintptr(ptr) + 2) 858 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 859 v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 860 ptr = unsafe.Pointer(uintptr(ptr) + 3) 861 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 862 v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 863 ptr = unsafe.Pointer(uintptr(ptr) + 4) 864 } else { 865 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 866 v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 867 ptr = unsafe.Pointer(uintptr(ptr) + 5) 868 } 869 870 if *((*uint8)(ptr)) < 128 { 871 ptr = unsafe.Pointer(uintptr(ptr) + 1) 872 } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 { 873 ptr = unsafe.Pointer(uintptr(ptr) + 2) 874 } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 { 875 ptr = unsafe.Pointer(uintptr(ptr) + 3) 876 } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 { 877 ptr = unsafe.Pointer(uintptr(ptr) + 4) 878 } else { 879 ptr = unsafe.Pointer(uintptr(ptr) + 5) 880 } 881 882 // Manually inlining part of base.DecodeInternalKey provides a 5-10% 883 // speedup on BlockIter benchmarks. 884 s := getBytes(ptr, int(v1)) 885 var k []byte 886 if n := len(s) - 8; n >= 0 { 887 k = s[:n:n] 888 } 889 // Else k is invalid, and left as nil 890 891 if i.cmp(key, k) > 0 { 892 // The search key is greater than the user key at this restart point. 893 // Search beyond this restart point, since we are trying to find the 894 // first restart point with a user key >= the search key. 895 index = h + 1 // preserves f(i-1) == false 896 } else { 897 // k >= search key, so prune everything after index (since index 898 // satisfies the property we are looking for). 899 upper = h // preserves f(j) == true 900 } 901 } 902 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true 903 // => answer is index. 904 } 905 906 // index is the first restart point with key >= search key. Define the keys 907 // between a restart point and the next restart point as belonging to that 908 // restart point. Note that index could be equal to i.numRestarts, i.e., we 909 // are past the last restart. 910 // 911 // Since keys are strictly increasing, if index > 0 then the restart point 912 // at index-1 will be the first one that has some keys belonging to it that 913 // are less than the search key. If index == 0, then all keys in this block 914 // are larger than the search key, so there is no match. 915 targetOffset := i.restarts 916 if index > 0 { 917 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) 918 if index < i.numRestarts { 919 targetOffset = decodeRestart(i.data[i.restarts+4*(index):]) 920 } 921 } else if index == 0 { 922 // If index == 0 then all keys in this block are larger than the key 923 // sought. 924 i.offset = -1 925 i.nextOffset = 0 926 return nil, base.LazyValue{} 927 } 928 929 // Iterate from that restart point to somewhere >= the key sought, then back 930 // up to the previous entry. The expectation is that we'll be performing 931 // reverse iteration, so we cache the entries as we advance forward. 
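	//
	// For example (a sketch): with a restart interval of 4 and user keys
	// a,b,c,d,e,f in the block (restart points at a and e), SeekLT(d)
	// binary-searches to the restart at a, scans forward caching a, b and c,
	// stops at d, and returns Prev(), which serves c (the largest key less
	// than d) from the cache.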
	i.nextOffset = i.offset

	for {
		i.offset = i.nextOffset
		i.readEntry()
		// When hidden keys are common, there is additional optimization possible
		// by not caching entries that are hidden (note that some calls to
		// cacheEntry don't decode the internal key before caching, but checking
		// whether a key is hidden does not require full decoding). However, we do
		// need to use the blockEntry.offset in the cache for the first entry at
		// the restart point to do the binary search when the cache is empty -- so
		// we would need to cache that first entry (though not the key) even if it
		// was hidden. Our current assumption is that if there are large numbers
		// of hidden keys we will be able to skip whole blocks (using block
		// property filters) so we don't bother optimizing.
		hiddenPoint := i.decodeInternalKey(i.key)

		// NB: we don't use the hiddenPoint return value of decodeInternalKey
		// since we want to stop as soon as we reach a key >= the key sought, so
		// that we can reverse.
		if i.cmp(i.ikey.UserKey, key) >= 0 {
			// The current key is greater than or equal to our search key. Back up to
			// the previous key which was less than our search key. Note that this for
			// loop will execute at least once with this if-block not being true, so
			// the key we are backing up to is the last one this loop cached.
			return i.Prev()
		}

		if i.nextOffset >= targetOffset {
			// We've reached the end of the current restart block. Return the
			// current key if not hidden, else call Prev().
			//
			// When the restart interval is 1, the first iteration of the for loop
			// will bring us here. In that case ikey is backed by the block so we
			// get the desired key stability guarantee for the lifetime of the
			// blockIter. That is, we never cache anything and therefore never
			// return a key backed by cachedBuf.
			if hiddenPoint {
				return i.Prev()
			}
			break
		}

		i.cacheEntry()
	}

	if !i.valid() {
		return nil, base.LazyValue{}
	}
	if !i.lazyValueHandling.hasValuePrefix ||
		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
		i.lazyValue = base.MakeInPlaceValue(i.val)
	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
	} else {
		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
	}
	return &i.ikey, i.lazyValue
}

// First implements internalIterator.First, as documented in the pebble
// package.
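//
// Note that when hideObsoletePoints is set and the first entry carries the
// obsolete bit, First falls through to Next, so the key returned is the first
// visible entry rather than the first physical entry in the block.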
994 func (i *blockIter) First() (*InternalKey, base.LazyValue) { 995 if invariants.Enabled && i.isDataInvalidated() { 996 panic(errors.AssertionFailedf("invalidated blockIter used")) 997 } 998 999 i.offset = 0 1000 if !i.valid() { 1001 return nil, base.LazyValue{} 1002 } 1003 i.clearCache() 1004 i.readEntry() 1005 hiddenPoint := i.decodeInternalKey(i.key) 1006 if hiddenPoint { 1007 return i.Next() 1008 } 1009 if !i.lazyValueHandling.hasValuePrefix || 1010 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { 1011 i.lazyValue = base.MakeInPlaceValue(i.val) 1012 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { 1013 i.lazyValue = base.MakeInPlaceValue(i.val[1:]) 1014 } else { 1015 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) 1016 } 1017 return &i.ikey, i.lazyValue 1018 } 1019 1020 func decodeRestart(b []byte) int32 { 1021 _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808 1022 return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | 1023 uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24) 1024 } 1025 1026 // Last implements internalIterator.Last, as documented in the pebble package. 1027 func (i *blockIter) Last() (*InternalKey, base.LazyValue) { 1028 if invariants.Enabled && i.isDataInvalidated() { 1029 panic(errors.AssertionFailedf("invalidated blockIter used")) 1030 } 1031 1032 // Seek forward from the last restart point. 1033 i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):]) 1034 if !i.valid() { 1035 return nil, base.LazyValue{} 1036 } 1037 1038 i.readEntry() 1039 i.clearCache() 1040 1041 for i.nextOffset < i.restarts { 1042 i.cacheEntry() 1043 i.offset = i.nextOffset 1044 i.readEntry() 1045 } 1046 1047 hiddenPoint := i.decodeInternalKey(i.key) 1048 if hiddenPoint { 1049 return i.Prev() 1050 } 1051 if !i.lazyValueHandling.hasValuePrefix || 1052 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { 1053 i.lazyValue = base.MakeInPlaceValue(i.val) 1054 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { 1055 i.lazyValue = base.MakeInPlaceValue(i.val[1:]) 1056 } else { 1057 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) 1058 } 1059 return &i.ikey, i.lazyValue 1060 } 1061 1062 // Next implements internalIterator.Next, as documented in the pebble 1063 // package. 1064 func (i *blockIter) Next() (*InternalKey, base.LazyValue) { 1065 if len(i.cachedBuf) > 0 { 1066 // We're switching from reverse iteration to forward iteration. We need to 1067 // populate i.fullKey with the current key we're positioned at so that 1068 // readEntry() can use i.fullKey for key prefix decompression. Note that we 1069 // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if 1070 // SeekLT was the previous call, i.key may be backed by i.fullKey), but 1071 // copying into i.fullKey works for both cases. 1072 // 1073 // TODO(peter): Rather than clearing the cache, we could instead use the 1074 // cache until it is exhausted. This would likely be faster than falling 1075 // through to the normal forward iteration code below. 1076 i.fullKey = append(i.fullKey[:0], i.key...) 1077 i.clearCache() 1078 } 1079 1080 start: 1081 i.offset = i.nextOffset 1082 if !i.valid() { 1083 return nil, base.LazyValue{} 1084 } 1085 i.readEntry() 1086 // Manually inlined version of i.decodeInternalKey(i.key). 
1087 if n := len(i.key) - 8; n >= 0 { 1088 trailer := binary.LittleEndian.Uint64(i.key[n:]) 1089 hiddenPoint := i.hideObsoletePoints && 1090 (trailer&trailerObsoleteBit != 0) 1091 i.ikey.Trailer = trailer & trailerObsoleteMask 1092 i.ikey.UserKey = i.key[:n:n] 1093 if i.globalSeqNum != 0 { 1094 i.ikey.SetSeqNum(i.globalSeqNum) 1095 } 1096 if hiddenPoint { 1097 goto start 1098 } 1099 } else { 1100 i.ikey.Trailer = uint64(InternalKeyKindInvalid) 1101 i.ikey.UserKey = nil 1102 } 1103 if !i.lazyValueHandling.hasValuePrefix || 1104 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { 1105 i.lazyValue = base.MakeInPlaceValue(i.val) 1106 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { 1107 i.lazyValue = base.MakeInPlaceValue(i.val[1:]) 1108 } else { 1109 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) 1110 } 1111 return &i.ikey, i.lazyValue 1112 } 1113 1114 // NextPrefix implements (base.InternalIterator).NextPrefix. 1115 func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { 1116 if i.lazyValueHandling.hasValuePrefix { 1117 return i.nextPrefixV3(succKey) 1118 } 1119 const nextsBeforeSeek = 3 1120 k, v := i.Next() 1121 for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ { 1122 if j >= nextsBeforeSeek { 1123 return i.SeekGE(succKey, base.SeekGEFlagsNone) 1124 } 1125 k, v = i.Next() 1126 } 1127 return k, v 1128 } 1129 1130 func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) { 1131 // Doing nexts that involve a key comparison can be expensive (and the cost 1132 // depends on the key length), so we use the same threshold of 3 that we use 1133 // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path 1134 // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster 1135 // than doing a SeekGE within the block, so we do this 16 times 1136 // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary 1137 // search for the restart consumes > 100ns. If the number of versions is > 1138 // 17, we will increment nextFastCount to 17, then do a binary search, and 1139 // on average need to find a key between two restarts, so another 8 steps 1140 // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such 1141 // steps. 1142 // 1143 // TODO(sumeer): use the configured restartInterval for the sstable when it 1144 // was written (which we don't currently store) instead of the default value 1145 // of 16. 1146 const nextCmpThresholdBeforeSeek = 3 1147 const nextFastThresholdBeforeRestarts = 16 1148 nextCmpCount := 0 1149 nextFastCount := 0 1150 usedRestarts := false 1151 // INVARIANT: blockIter is valid. 1152 if invariants.Enabled && !i.valid() { 1153 panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter")) 1154 } 1155 prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet 1156 for { 1157 i.offset = i.nextOffset 1158 if !i.valid() { 1159 return nil, base.LazyValue{} 1160 } 1161 // Need to decode the length integers, so we can compute nextOffset. 1162 ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset)) 1163 // This is an ugly performance hack. Reading entries from blocks is one of 1164 // the inner-most routines and decoding the 3 varints per-entry takes 1165 // significant time. Neither go1.11 or go1.12 will inline decodeVarint for 1166 // us, so we do it manually. This provides a 10-15% performance improvement 1167 // on blockIter benchmarks on both go1.11 and go1.12. 
1168 // 1169 // TODO(peter): remove this hack if go:inline is ever supported. 1170 1171 // Decode the shared key length integer. 1172 var shared uint32 1173 if a := *((*uint8)(ptr)); a < 128 { 1174 shared = uint32(a) 1175 ptr = unsafe.Pointer(uintptr(ptr) + 1) 1176 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 1177 shared = uint32(b)<<7 | uint32(a) 1178 ptr = unsafe.Pointer(uintptr(ptr) + 2) 1179 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 1180 shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1181 ptr = unsafe.Pointer(uintptr(ptr) + 3) 1182 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 1183 shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1184 ptr = unsafe.Pointer(uintptr(ptr) + 4) 1185 } else { 1186 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 1187 shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1188 ptr = unsafe.Pointer(uintptr(ptr) + 5) 1189 } 1190 // Decode the unshared key length integer. 1191 var unshared uint32 1192 if a := *((*uint8)(ptr)); a < 128 { 1193 unshared = uint32(a) 1194 ptr = unsafe.Pointer(uintptr(ptr) + 1) 1195 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 1196 unshared = uint32(b)<<7 | uint32(a) 1197 ptr = unsafe.Pointer(uintptr(ptr) + 2) 1198 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 1199 unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1200 ptr = unsafe.Pointer(uintptr(ptr) + 3) 1201 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 1202 unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1203 ptr = unsafe.Pointer(uintptr(ptr) + 4) 1204 } else { 1205 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 1206 unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1207 ptr = unsafe.Pointer(uintptr(ptr) + 5) 1208 } 1209 // Decode the value length integer. 1210 var value uint32 1211 if a := *((*uint8)(ptr)); a < 128 { 1212 value = uint32(a) 1213 ptr = unsafe.Pointer(uintptr(ptr) + 1) 1214 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 1215 value = uint32(b)<<7 | uint32(a) 1216 ptr = unsafe.Pointer(uintptr(ptr) + 2) 1217 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 1218 value = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1219 ptr = unsafe.Pointer(uintptr(ptr) + 3) 1220 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 1221 value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1222 ptr = unsafe.Pointer(uintptr(ptr) + 4) 1223 } else { 1224 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 1225 value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1226 ptr = unsafe.Pointer(uintptr(ptr) + 5) 1227 } 1228 // The starting position of the value. 1229 valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared)) 1230 i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value) 1231 if invariants.Enabled && unshared < 8 { 1232 // This should not happen since only the key prefix is shared, so even 1233 // if the prefix length is the same as the user key length, the unshared 1234 // will include the trailer. 
1235 panic(errors.AssertionFailedf("unshared %d is too small", unshared)) 1236 } 1237 // The trailer is written in little endian, so the key kind is the first 1238 // byte in the trailer that is encoded in the slice [unshared-8:unshared]. 1239 keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8]) 1240 keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask 1241 prefixChanged := false 1242 if keyKind == InternalKeyKindSet { 1243 if invariants.Enabled && value == 0 { 1244 panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix")) 1245 } 1246 valPrefix := *((*valuePrefix)(valuePtr)) 1247 if setHasSamePrefix(valPrefix) { 1248 // Fast-path. No need to assemble i.fullKey, or update i.key. We know 1249 // that subsequent keys will not have a shared length that is greater 1250 // than the prefix of the current key, which is also the prefix of 1251 // i.key. Since we are continuing to iterate, we don't need to 1252 // initialize i.ikey and i.lazyValue (these are initialized before 1253 // returning). 1254 nextFastCount++ 1255 if nextFastCount > nextFastThresholdBeforeRestarts { 1256 if usedRestarts { 1257 // Exhausted iteration budget. This will never happen unless 1258 // someone is using a restart interval > 16. It is just to guard 1259 // against long restart intervals causing too much iteration. 1260 break 1261 } 1262 // Haven't used restarts yet, so find the first restart at or beyond 1263 // the current offset. 1264 targetOffset := i.offset 1265 var index int32 1266 { 1267 // NB: manually inlined sort.Sort is ~5% faster. 1268 // 1269 // f defined for a restart point is true iff the offset >= 1270 // targetOffset. 1271 // Define f(-1) == false and f(i.numRestarts) == true. 1272 // Invariant: f(index-1) == false, f(upper) == true. 1273 upper := i.numRestarts 1274 for index < upper { 1275 h := int32(uint(index+upper) >> 1) // avoid overflow when computing h 1276 // index ≤ h < upper 1277 offset := decodeRestart(i.data[i.restarts+4*h:]) 1278 if offset < targetOffset { 1279 index = h + 1 // preserves f(index-1) == false 1280 } else { 1281 upper = h // preserves f(upper) == true 1282 } 1283 } 1284 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true 1285 // => answer is index. 1286 } 1287 usedRestarts = true 1288 nextFastCount = 0 1289 if index == i.numRestarts { 1290 // Already past the last real restart, so iterate a bit more until 1291 // we are done with the block. 1292 continue 1293 } 1294 // Have some real restarts after index. NB: index is the first 1295 // restart at or beyond the current offset. 1296 startingIndex := index 1297 for index != i.numRestarts && 1298 // The restart at index is 4 bytes written in little endian format 1299 // starting at i.restart+4*index. The 0th byte is the least 1300 // significant and the 3rd byte is the most significant. Since the 1301 // most significant bit of the 3rd byte is what we use for 1302 // encoding the set-has-same-prefix information, the indexing 1303 // below has +3. 1304 i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 { 1305 // We still have the same prefix, so move to the next restart. 1306 index++ 1307 } 1308 // index is the first restart that did not have the same prefix. 1309 if index != startingIndex { 1310 // Managed to skip past at least one restart. Resume iteration 1311 // from index-1. Since nextFastCount has been reset to 0, we 1312 // should be able to iterate to the next prefix. 
1313 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) 1314 i.readEntry() 1315 } 1316 // Else, unable to skip past any restart. Resume iteration. Since 1317 // nextFastCount has been reset to 0, we should be able to iterate 1318 // to the next prefix. 1319 continue 1320 } 1321 continue 1322 } else if prevKeyIsSet { 1323 prefixChanged = true 1324 } 1325 } else { 1326 prevKeyIsSet = false 1327 } 1328 // Slow-path cases: 1329 // - (Likely) The prefix has changed. 1330 // - (Unlikely) The prefix has not changed. 1331 // We assemble the key etc. under the assumption that it is the likely 1332 // case. 1333 unsharedKey := getBytes(ptr, int(unshared)) 1334 // TODO(sumeer): move this into the else block below. This is a bit tricky 1335 // since the current logic assumes we have always copied the latest key 1336 // into fullKey, which is why when we get to the next key we can (a) 1337 // access i.fullKey[:shared], (b) append only the unsharedKey to 1338 // i.fullKey. For (a), we can access i.key[:shared] since that memory is 1339 // valid (even if unshared). For (b), we will need to remember whether 1340 // i.key refers to i.fullKey or not, and can append the unsharedKey only 1341 // in the former case and for the latter case need to copy the shared part 1342 // too. This same comment applies to the other place where we can do this 1343 // optimization, in readEntry(). 1344 i.fullKey = append(i.fullKey[:shared], unsharedKey...) 1345 i.val = getBytes(valuePtr, int(value)) 1346 if shared == 0 { 1347 // Provide stability for the key across positioning calls if the key 1348 // doesn't share a prefix with the previous key. This removes requiring the 1349 // key to be copied if the caller knows the block has a restart interval of 1350 // 1. An important example of this is range-del blocks. 1351 i.key = unsharedKey 1352 } else { 1353 i.key = i.fullKey 1354 } 1355 // Manually inlined version of i.decodeInternalKey(i.key). 1356 hiddenPoint := false 1357 if n := len(i.key) - 8; n >= 0 { 1358 trailer := binary.LittleEndian.Uint64(i.key[n:]) 1359 hiddenPoint = i.hideObsoletePoints && 1360 (trailer&trailerObsoleteBit != 0) 1361 i.ikey.Trailer = trailer & trailerObsoleteMask 1362 i.ikey.UserKey = i.key[:n:n] 1363 if i.globalSeqNum != 0 { 1364 i.ikey.SetSeqNum(i.globalSeqNum) 1365 } 1366 } else { 1367 i.ikey.Trailer = uint64(InternalKeyKindInvalid) 1368 i.ikey.UserKey = nil 1369 } 1370 nextCmpCount++ 1371 if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 { 1372 panic(errors.AssertionFailedf("prefix should have changed but %x < %x", 1373 i.ikey.UserKey, succKey)) 1374 } 1375 if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 { 1376 // Prefix has changed. 1377 if hiddenPoint { 1378 return i.Next() 1379 } 1380 if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix { 1381 panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable")) 1382 } 1383 if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { 1384 i.lazyValue = base.MakeInPlaceValue(i.val) 1385 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { 1386 i.lazyValue = base.MakeInPlaceValue(i.val[1:]) 1387 } else { 1388 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) 1389 } 1390 return &i.ikey, i.lazyValue 1391 } 1392 // Else prefix has not changed. 
1393 1394 if nextCmpCount >= nextCmpThresholdBeforeSeek { 1395 break 1396 } 1397 } 1398 return i.SeekGE(succKey, base.SeekGEFlagsNone) 1399 } 1400 1401 // Prev implements internalIterator.Prev, as documented in the pebble 1402 // package. 1403 func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { 1404 start: 1405 for n := len(i.cached) - 1; n >= 0; n-- { 1406 i.nextOffset = i.offset 1407 e := &i.cached[n] 1408 i.offset = e.offset 1409 i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize)) 1410 // Manually inlined version of i.decodeInternalKey(i.key). 1411 i.key = i.cachedBuf[e.keyStart:e.keyEnd] 1412 if n := len(i.key) - 8; n >= 0 { 1413 trailer := binary.LittleEndian.Uint64(i.key[n:]) 1414 hiddenPoint := i.hideObsoletePoints && 1415 (trailer&trailerObsoleteBit != 0) 1416 if hiddenPoint { 1417 continue 1418 } 1419 i.ikey.Trailer = trailer & trailerObsoleteMask 1420 i.ikey.UserKey = i.key[:n:n] 1421 if i.globalSeqNum != 0 { 1422 i.ikey.SetSeqNum(i.globalSeqNum) 1423 } 1424 } else { 1425 i.ikey.Trailer = uint64(InternalKeyKindInvalid) 1426 i.ikey.UserKey = nil 1427 } 1428 i.cached = i.cached[:n] 1429 if !i.lazyValueHandling.hasValuePrefix || 1430 base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { 1431 i.lazyValue = base.MakeInPlaceValue(i.val) 1432 } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { 1433 i.lazyValue = base.MakeInPlaceValue(i.val[1:]) 1434 } else { 1435 i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) 1436 } 1437 return &i.ikey, i.lazyValue 1438 } 1439 1440 i.clearCache() 1441 if i.offset <= 0 { 1442 i.offset = -1 1443 i.nextOffset = 0 1444 return nil, base.LazyValue{} 1445 } 1446 1447 targetOffset := i.offset 1448 var index int32 1449 1450 { 1451 // NB: manually inlined sort.Sort is ~5% faster. 1452 // 1453 // Define f(-1) == false and f(n) == true. 1454 // Invariant: f(index-1) == false, f(upper) == true. 1455 upper := i.numRestarts 1456 for index < upper { 1457 h := int32(uint(index+upper) >> 1) // avoid overflow when computing h 1458 // index ≤ h < upper 1459 offset := decodeRestart(i.data[i.restarts+4*h:]) 1460 if offset < targetOffset { 1461 // Looking for the first restart that has offset >= targetOffset, so 1462 // ignore h and earlier. 1463 index = h + 1 // preserves f(i-1) == false 1464 } else { 1465 upper = h // preserves f(j) == true 1466 } 1467 } 1468 // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true 1469 // => answer is index. 1470 } 1471 1472 // index is first restart with offset >= targetOffset. Note that 1473 // targetOffset may not be at a restart point since one can call Prev() 1474 // after Next() (so the cache was not populated) and targetOffset refers to 1475 // the current entry. index-1 must have an offset < targetOffset (it can't 1476 // be equal to targetOffset since the binary search would have selected that 1477 // as the index). 1478 i.offset = 0 1479 if index > 0 { 1480 i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) 1481 } 1482 // TODO(sumeer): why is the else case not an error given targetOffset is a 1483 // valid offset. 1484 1485 i.readEntry() 1486 1487 // We stop when i.nextOffset == targetOffset since the targetOffset is the 1488 // entry we are stepping back from, and we don't need to cache the entry 1489 // before it, since it is the candidate to return. 
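	//
	// For example (a sketch): if the entries between two restarts are a,b,c,d
	// and the iterator is positioned at d (so targetOffset is d's offset), the
	// loop below caches a and b, stops with the iterator on c, and c is
	// returned; a subsequent Prev serves b directly from the cache.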
// Key implements internalIterator.Key, as documented in the pebble package.
func (i *blockIter) Key() *InternalKey {
	return &i.ikey
}

func (i *blockIter) value() base.LazyValue {
	return i.lazyValue
}

// Error implements internalIterator.Error, as documented in the pebble
// package.
func (i *blockIter) Error() error {
	return nil // infallible
}

// Close implements internalIterator.Close, as documented in the pebble
// package.
func (i *blockIter) Close() error {
	i.handle.Release()
	i.handle = bufferHandle{}
	i.val = nil
	i.lazyValue = base.LazyValue{}
	i.lazyValueHandling.vbr = nil
	return nil
}

func (i *blockIter) SetBounds(lower, upper []byte) {
	// This should never be called as bounds are handled by sstable.Iterator.
	panic("pebble: SetBounds unimplemented")
}

func (i *blockIter) valid() bool {
	return i.offset >= 0 && i.offset < i.restarts
}

// fragmentBlockIter wraps a blockIter, implementing the
// keyspan.FragmentIterator interface. It's used for reading range deletion and
// range key blocks.
//
// Range deletions and range keys are fragmented before they're persisted to the
// block. Overlapping fragments have identical bounds. The fragmentBlockIter
// gathers all the fragments with identical bounds within a block and returns a
// single keyspan.Span describing all the keys defined over the span.
//
// # Memory lifetime
//
// A Span returned by fragmentBlockIter is only guaranteed to be stable until
// the next fragmentBlockIter iteration positioning method. A Span's Keys slice
// may be reused, so the user must not assume it's stable.
//
// Blocks holding range deletions and range keys are configured to use a restart
// interval of 1. This provides key stability. The caller may treat the various
// byte slices (start, end, suffix, value) as stable for the lifetime of the
// iterator.
type fragmentBlockIter struct {
	blockIter blockIter
	keyBuf    [2]keyspan.Key
	span      keyspan.Span
	err       error
	dir       int8
	closeHook func(i keyspan.FragmentIterator) error

	// elideSameSeqnum, if true, returns only the first-occurring (in forward
	// order) Key for each sequence number.
	elideSameSeqnum bool
}

func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter {
	return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()}
}
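
// The fragmentBlockIter doc comment above describes how consecutive block
// entries with identical bounds are coalesced into a single span. The following
// is a minimal, hedged sketch of that gathering step over plain strings and
// sequence numbers, independent of blockIter and keyspan.Span; toyFragment and
// gatherIdenticalBoundsSketch are illustrative, hypothetical names.
type toyFragment struct {
	start, end string
	seqNum     uint64
}

// gatherIdenticalBoundsSketch consumes fragments from frags, starting at pos,
// that share the bounds of frags[pos], returning the coalesced bounds, the
// gathered sequence numbers, and the index of the first fragment of the next
// span.
func gatherIdenticalBoundsSketch(frags []toyFragment, pos int) (start, end string, seqNums []uint64, next int) {
	if pos >= len(frags) {
		return "", "", nil, pos
	}
	start, end = frags[pos].start, frags[pos].end
	for next = pos; next < len(frags) && frags[next].start == start; next++ {
		// In the real iterator, a mismatched end key at this point is reported
		// as corruption, since overlapping fragments must have identical bounds.
		seqNums = append(seqNums, frags[next].seqNum)
	}
	return start, end, seqNums, next
}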
func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) {
	// TODO(jackson): The use of i.span.Keys to accumulate keys across multiple
	// calls to Decode is too confusing and subtle. Refactor to make it
	// explicit.

	// Decode the contents of the fragment's value. This always includes at
	// least the end key: RANGEDELs store the end key directly as the value,
	// whereas the various range key kinds store more complicated values. The
	// details of the range key internal value format are documented within the
	// internal/rangekey package.
	switch k.Kind() {
	case base.InternalKeyKindRangeDelete:
		i.span = rangedel.Decode(*k, internalValue, i.span.Keys)
		i.err = nil
	case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
		i.span, i.err = rangekey.Decode(*k, internalValue, i.span.Keys)
	default:
		i.span = keyspan.Span{}
		i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragment of kind %d", k.Kind())
	}
}

func (i *fragmentBlockIter) elideKeysOfSameSeqNum() {
	if invariants.Enabled {
		if !i.elideSameSeqnum || len(i.span.Keys) == 0 {
			panic("elideKeysOfSameSeqNum called when it should not be")
		}
	}
	lastSeqNum := i.span.Keys[0].SeqNum()
	k := 1
	for j := 1; j < len(i.span.Keys); j++ {
		if lastSeqNum != i.span.Keys[j].SeqNum() {
			lastSeqNum = i.span.Keys[j].SeqNum()
			i.span.Keys[k] = i.span.Keys[j]
			k++
		}
	}
	i.span.Keys = i.span.Keys[:k]
}
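
// elideKeysOfSameSeqNum above compacts i.span.Keys in place, keeping only the
// first key seen for each sequence number. The helper below is a minimal,
// hedged sketch of the same in-place compaction over a plain slice of sequence
// numbers; elideAdjacentDuplicatesSketch is an illustrative, hypothetical name.
func elideAdjacentDuplicatesSketch(seqNums []uint64) []uint64 {
	if len(seqNums) == 0 {
		return seqNums
	}
	last := seqNums[0]
	k := 1
	for j := 1; j < len(seqNums); j++ {
		if seqNums[j] != last {
			last = seqNums[j]
			seqNums[k] = seqNums[j]
			k++
		}
	}
	// Like the real method, this only collapses adjacent runs of equal sequence
	// numbers, matching how elideKeysOfSameSeqNum compares each key against only
	// the previously retained one.
	return seqNums[:k]
}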
// gatherForward gathers internal keys with identical bounds. Keys defined over
// spans of the keyspace are fragmented such that any overlapping key spans have
// identical bounds. When these spans are persisted to a range deletion or range
// key block, they may be persisted as multiple internal keys in order to encode
// multiple sequence numbers or key kinds.
//
// gatherForward iterates forward, re-combining the fragmented internal keys to
// reconstruct a keyspan.Span that holds all the keys defined over the span.
func (i *fragmentBlockIter) gatherForward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span {
	i.span = keyspan.Span{}
	if k == nil || !i.blockIter.valid() {
		return nil
	}
	i.err = nil
	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
	// when a span contains few keys.
	i.span.Keys = i.keyBuf[:0]

	// Decode the span's end key and individual keys from the value.
	internalValue := lazyValue.InPlaceValue()
	i.decodeSpanKeys(k, internalValue)
	if i.err != nil {
		return nil
	}
	prevEnd := i.span.End

	// There might exist additional internal keys with identical bounds encoded
	// within the block. Iterate forward, accumulating all the keys with
	// identical bounds in i.span.
	k, lazyValue = i.blockIter.Next()
	internalValue = lazyValue.InPlaceValue()
	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
		i.decodeSpanKeys(k, internalValue)
		if i.err != nil {
			return nil
		}

		// Since k indicates an equal start key, the encoded end key must
		// exactly equal the original end key from the first internal key.
		// Overlapping fragments are required to have exactly equal start and
		// end bounds.
		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
			i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
			i.span = keyspan.Span{}
			return nil
		}
		k, lazyValue = i.blockIter.Next()
		internalValue = lazyValue.InPlaceValue()
	}
	if i.elideSameSeqnum && len(i.span.Keys) > 0 {
		i.elideKeysOfSameSeqNum()
	}
	// i.blockIter is positioned over the first internal key for the next span.
	return &i.span
}

// gatherBackward gathers internal keys with identical bounds. Keys defined over
// spans of the keyspace are fragmented such that any overlapping key spans have
// identical bounds. When these spans are persisted to a range deletion or range
// key block, they may be persisted as multiple internal keys in order to encode
// multiple sequence numbers or key kinds.
//
// gatherBackward iterates backwards, re-combining the fragmented internal keys
// to reconstruct a keyspan.Span that holds all the keys defined over the span.
func (i *fragmentBlockIter) gatherBackward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span {
	i.span = keyspan.Span{}
	if k == nil || !i.blockIter.valid() {
		return nil
	}
	i.err = nil
	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
	// when a span contains few keys.
	i.span.Keys = i.keyBuf[:0]

	// Decode the span's end key and individual keys from the value.
	internalValue := lazyValue.InPlaceValue()
	i.decodeSpanKeys(k, internalValue)
	if i.err != nil {
		return nil
	}
	prevEnd := i.span.End

	// There might exist additional internal keys with identical bounds encoded
	// within the block. Iterate backward, accumulating all the keys with
	// identical bounds in i.span.
	k, lazyValue = i.blockIter.Prev()
	internalValue = lazyValue.InPlaceValue()
	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
		i.decodeSpanKeys(k, internalValue)
		if i.err != nil {
			return nil
		}

		// Since k indicates an equal start key, the encoded end key must
		// exactly equal the original end key from the first internal key.
		// Overlapping fragments are required to have exactly equal start and
		// end bounds.
		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
			i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
			i.span = keyspan.Span{}
			return nil
		}
		k, lazyValue = i.blockIter.Prev()
		internalValue = lazyValue.InPlaceValue()
	}
	// i.blockIter is positioned over the last internal key for the previous
	// span.

	// Backwards iteration encounters internal keys in the wrong order.
	keyspan.SortKeysByTrailer(&i.span.Keys)

	if i.elideSameSeqnum && len(i.span.Keys) > 0 {
		i.elideKeysOfSameSeqNum()
	}
	return &i.span
}

// Error implements (keyspan.FragmentIterator).Error.
func (i *fragmentBlockIter) Error() error {
	return i.err
}
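
// gatherBackward above relies on keyspan.SortKeysByTrailer to restore the
// ordering that forward gathering would have produced (keys ordered by
// descending trailer, i.e. newest sequence number first) after collecting the
// keys in reverse. The helper below is a minimal, hedged sketch of that
// reordering on a plain slice of trailers, using an insertion sort to avoid
// extra imports; sortTrailersDescendingSketch is an illustrative, hypothetical
// name and is not the algorithm used by keyspan.SortKeysByTrailer.
func sortTrailersDescendingSketch(trailers []uint64) {
	for j := 1; j < len(trailers); j++ {
		for p := j; p > 0 && trailers[p-1] < trailers[p]; p-- {
			trailers[p-1], trailers[p] = trailers[p], trailers[p-1]
		}
	}
}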
// Close implements (keyspan.FragmentIterator).Close.
func (i *fragmentBlockIter) Close() error {
	var err error
	if i.closeHook != nil {
		err = i.closeHook(i)
	}
	err = firstError(err, i.blockIter.Close())
	return err
}

// First implements (keyspan.FragmentIterator).First.
func (i *fragmentBlockIter) First() *keyspan.Span {
	i.dir = +1
	return i.gatherForward(i.blockIter.First())
}

// Last implements (keyspan.FragmentIterator).Last.
func (i *fragmentBlockIter) Last() *keyspan.Span {
	i.dir = -1
	return i.gatherBackward(i.blockIter.Last())
}

// Next implements (keyspan.FragmentIterator).Next.
func (i *fragmentBlockIter) Next() *keyspan.Span {
	switch {
	case i.dir == -1 && !i.span.Valid():
		// Switching directions.
		//
		// i.blockIter is exhausted, before the first key. Move onto the first.
		i.blockIter.First()
		i.dir = +1
	case i.dir == -1 && i.span.Valid():
		// Switching directions.
		//
		// i.blockIter is currently positioned over the last internal key for
		// the previous span. Next it once to move to the first internal key
		// that makes up the current span, and gatherForward to land on the
		// first internal key making up the next span.
		//
		// In the diagram below, if the last span returned to the user during
		// reverse iteration was [b,c), i.blockIter is currently positioned at
		// [a,b). The block iter must be positioned over [d,e) to gather the
		// next span's fragments.
		//
		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
		//          ^                       ^
		//     i.blockIter                 want
		if x := i.gatherForward(i.blockIter.Next()); invariants.Enabled && !x.Valid() {
			panic("pebble: invariant violation: next entry unexpectedly invalid")
		}
		i.dir = +1
	}
	// We know that this blockIter has in-place values.
	return i.gatherForward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
}

// Prev implements (keyspan.FragmentIterator).Prev.
func (i *fragmentBlockIter) Prev() *keyspan.Span {
	switch {
	case i.dir == +1 && !i.span.Valid():
		// Switching directions.
		//
		// i.blockIter is exhausted, after the last key. Move onto the last.
		i.blockIter.Last()
		i.dir = -1
	case i.dir == +1 && i.span.Valid():
		// Switching directions.
		//
		// i.blockIter is currently positioned over the first internal key for
		// the next span. Prev it once to move to the last internal key that
		// makes up the current span, and gatherBackward to land on the last
		// internal key making up the previous span.
		//
		// In the diagram below, if the last span returned to the user during
		// forward iteration was [b,c), i.blockIter is currently positioned at
		// [d,e). The block iter must be positioned over [a,b) to gather the
		// previous span's fragments.
		//
		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
		//          ^                       ^
		//         want                i.blockIter
		if x := i.gatherBackward(i.blockIter.Prev()); invariants.Enabled && !x.Valid() {
			panic("pebble: invariant violation: previous entry unexpectedly invalid")
		}
		i.dir = -1
	}
	// We know that this blockIter has in-place values.
	return i.gatherBackward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
}

// SeekGE implements (keyspan.FragmentIterator).SeekGE.
func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span {
	if s := i.SeekLT(k); s != nil && i.blockIter.cmp(k, s.End) < 0 {
		return s
	}
	// TODO(jackson): If the above i.SeekLT(k) discovers a span but the span
	// doesn't meet the k < s.End comparison, then there's no need for the
	// SeekLT to gatherBackward.
	return i.Next()
}

// SeekLT implements (keyspan.FragmentIterator).SeekLT.
func (i *fragmentBlockIter) SeekLT(k []byte) *keyspan.Span {
	i.dir = -1
	return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone))
}

// String implements fmt.Stringer.
func (i *fragmentBlockIter) String() string {
	return "fragment-block-iter"
}

// SetCloseHook implements sstable.FragmentIterator.
func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) {
	i.closeHook = fn
}
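
// fragmentBlockIter.SeekGE above first seeks to the span starting before k and
// returns it if it still covers k (k < s.End); otherwise it steps to the next
// span. The helper below is a minimal, hedged sketch of that decision over a
// sorted slice of toy [start, end) spans, using a linear scan in place of the
// real SeekLT; sketchSpan and seekGESpansSketch are illustrative, hypothetical
// names.
type sketchSpan struct {
	start, end string
}

func seekGESpansSketch(spans []sketchSpan, k string) *sketchSpan {
	// Find the last span whose start is < k (the SeekLT analogue).
	lt := -1
	for j := range spans {
		if spans[j].start < k {
			lt = j
		}
	}
	if lt >= 0 && k < spans[lt].end {
		// The preceding span still covers k, so it is the SeekGE result.
		return &spans[lt]
	}
	// Otherwise the answer is the following span, if any (the Next analogue).
	if lt+1 < len(spans) {
		return &spans[lt+1]
	}
	return nil
}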