github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/block.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"encoding/binary"
	"unsafe"

	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/cache"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/rangedel"
	"github.com/zuoyebang/bitalostable/internal/rangekey"
)

func uvarintLen(v uint32) int {
	i := 0
	for v >= 0x80 {
		v >>= 7
		i++
	}
	return i + 1
}

type blockWriter struct {
	restartInterval int
	nEntries        int
	nextRestart     int
	buf             []byte
	restarts        []uint32
	curKey          []byte
	curValue        []byte
	prevKey         []byte
	tmp             [4]byte
}

func (w *blockWriter) clear() {
	*w = blockWriter{
		buf:      w.buf[:0],
		restarts: w.restarts[:0],
		curKey:   w.curKey[:0],
		curValue: w.curValue[:0],
		prevKey:  w.prevKey[:0],
	}
}

func (w *blockWriter) store(keySize int, value []byte) {
	shared := 0
	if w.nEntries == w.nextRestart {
		w.nextRestart = w.nEntries + w.restartInterval
		w.restarts = append(w.restarts, uint32(len(w.buf)))
	} else {
		// TODO(peter): Manually inlined version of base.SharedPrefixLen(). This
		// is 3% faster on BenchmarkWriter on go1.16. Remove if future versions
		// show this to not be a performance win. For now, functions that use
		// unsafe cannot be inlined.
		n := len(w.curKey)
		if n > len(w.prevKey) {
			n = len(w.prevKey)
		}
		asUint64 := func(b []byte, i int) uint64 {
			return binary.LittleEndian.Uint64(b[i:])
		}
		for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) {
			shared += 8
		}
		for shared < n && w.curKey[shared] == w.prevKey[shared] {
			shared++
		}
	}

	needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + len(value)
	n := len(w.buf)
	if cap(w.buf) < n+needed {
		newCap := 2 * cap(w.buf)
		if newCap == 0 {
			newCap = 1024
		}
		for newCap < n+needed {
			newCap *= 2
		}
		newBuf := make([]byte, n, newCap)
		copy(newBuf, w.buf)
		w.buf = newBuf
	}
	w.buf = w.buf[:n+needed]

	// TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15%
	// faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions
	// show this to not be a performance win.
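	//
	// (Illustrative note on the uvarint format these loops emit: each byte
	// carries 7 bits of the value, least-significant group first, with the
	// high bit set on every byte except the last. For example, 127 encodes as
	// the single byte 0x7f, while 300 (0b1_0010_1100) encodes as 0xac 0x02.)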
	{
		x := uint32(shared)
		for x >= 0x80 {
			w.buf[n] = byte(x) | 0x80
			x >>= 7
			n++
		}
		w.buf[n] = byte(x)
		n++
	}

	{
		x := uint32(keySize - shared)
		for x >= 0x80 {
			w.buf[n] = byte(x) | 0x80
			x >>= 7
			n++
		}
		w.buf[n] = byte(x)
		n++
	}

	{
		x := uint32(len(value))
		for x >= 0x80 {
			w.buf[n] = byte(x) | 0x80
			x >>= 7
			n++
		}
		w.buf[n] = byte(x)
		n++
	}

	n += copy(w.buf[n:], w.curKey[shared:])
	n += copy(w.buf[n:], value)
	w.buf = w.buf[:n]

	w.curValue = w.buf[n-len(value):]

	w.nEntries++
}

func (w *blockWriter) add(key InternalKey, value []byte) {
	w.curKey, w.prevKey = w.prevKey, w.curKey

	size := key.Size()
	if cap(w.curKey) < size {
		w.curKey = make([]byte, 0, size*2)
	}
	w.curKey = w.curKey[:size]
	key.Encode(w.curKey)

	w.store(size, value)
}

func (w *blockWriter) finish() []byte {
	// Write the restart points to the buffer.
	if w.nEntries == 0 {
		// Every block must have at least one restart point.
		if cap(w.restarts) > 0 {
			w.restarts = w.restarts[:1]
			w.restarts[0] = 0
		} else {
			w.restarts = append(w.restarts, 0)
		}
	}
	tmp4 := w.tmp[:4]
	for _, x := range w.restarts {
		binary.LittleEndian.PutUint32(tmp4, x)
		w.buf = append(w.buf, tmp4...)
	}
	binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts)))
	w.buf = append(w.buf, tmp4...)
	result := w.buf

	// Reset the block state.
	w.nEntries = 0
	w.nextRestart = 0
	w.buf = w.buf[:0]
	w.restarts = w.restarts[:0]
	return result
}

// emptyBlockSize holds the size of an empty block. Every block ends
// in a uint32 trailer encoding the number of restart points within the
// block.
const emptyBlockSize = 4

func (w *blockWriter) estimatedSize() int {
	return len(w.buf) + 4*len(w.restarts) + emptyBlockSize
}

type blockEntry struct {
	offset   int32
	keyStart int32
	keyEnd   int32
	valStart int32
	valSize  int32
}

// blockIter is an iterator over a single block of data.
//
// A blockIter provides an additional guarantee around key stability when a
// block has a restart interval of 1 (i.e. when there is no prefix
// compression). Key stability refers to whether the InternalKey.UserKey bytes
// returned by a positioning call will remain stable after a subsequent
// positioning call. The normal case is that a positioning call will invalidate
// any previously returned InternalKey.UserKey. If a block has a restart
// interval of 1 (no prefix compression), blockIter guarantees that
// InternalKey.UserKey will point to the key as stored in the block itself
// which will remain valid until the blockIter is closed. The key stability
// guarantee is used by the range tombstone and range key code, which knows that
// the respective blocks are always encoded with a restart interval of 1. This
// per-block key stability guarantee is sufficient for range tombstones and
// range keys as they are always encoded in a single block.
//
// A blockIter also provides a value stability guarantee for range deletions and
// range keys since there is only a single range deletion and range key block
// per sstable and the blockIter will not release the bytes for the block until
// it is closed.
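//
// # Block layout
//
// Summarizing the encoding implemented by blockWriter.store and
// blockWriter.finish above: each entry is three uvarints (shared key prefix
// length, unshared key suffix length, value length), followed by the unshared
// key suffix bytes and the value bytes. The block ends with each restart
// offset as a little-endian uint32 and a final uint32 holding the number of
// restart points:
//
//	entry: varint(shared) varint(unshared) varint(valueLen) keySuffix value
//	block: entry* restartOffset(uint32)* numRestarts(uint32)
//
// For example, with a restart interval of 2, consecutive keys "apple" and
// "apricot" are stored as the full "apple" key (plus its 8-byte trailer) at
// the restart point, then shared=2 with the suffix "ricot" (plus its trailer)
// for the second entry.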
type blockIter struct {
	cmp Compare
	// offset is the byte index that marks where the current key/value is
	// encoded in the block.
	offset int32
	// nextOffset is the byte index where the next key/value is encoded in the
	// block.
	nextOffset int32
	// A "restart point" in a block is a point where the full key is encoded,
	// instead of just having a suffix of the key encoded. See readEntry() for
	// how prefix compression of keys works. Keys in between two restart points
	// only have a suffix encoded in the block. When restart interval is 1, no
	// prefix compression of keys happens. This is the case with range tombstone
	// blocks.
	//
	// All restart offsets are listed in increasing order in
	// i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
	// 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
	// therefore be seen as the point where data in the block ends, and a list
	// of offsets of all restart points begins.
	restarts int32
	// Number of restart points in this block. Encoded at the end of the block
	// as a uint32.
	numRestarts  int32
	globalSeqNum uint64
	ptr          unsafe.Pointer
	data         []byte
	// key contains the raw key the iterator is currently pointed at. This may
	// point directly to data stored in the block (for a key which has no prefix
	// compression), to fullKey (for a prefix compressed key), or to a slice of
	// data stored in cachedBuf (during reverse iteration).
	key []byte
	// fullKey is a buffer used for key prefix decompression.
	fullKey []byte
	// val contains the value the iterator is currently pointed at. If non-nil,
	// this points to a slice of the block data.
	val []byte
	// ikey contains the decoded InternalKey the iterator is currently pointed
	// at. Note that the memory backing ikey.UserKey is either data stored
	// directly in the block, fullKey, or cachedBuf. The key stability guarantee
	// for blocks built with a restart interval of 1 is achieved by having
	// ikey.UserKey always point to data stored directly in the block.
	ikey InternalKey
	// cached and cachedBuf are used during reverse iteration. They are needed
	// because we can't perform prefix decoding in reverse, only in the forward
	// direction. In order to iterate in reverse, we decode and cache the entries
	// between two restart points.
	//
	// Note that cached[len(cached)-1] contains the previous entry to the one the
	// blockIter is currently pointed at. As usual, nextOffset will contain the
	// offset of the next entry. During reverse iteration, nextOffset will be
	// updated to point to offset, and we'll set the blockIter to point at the
	// entry cached[len(cached)-1]. See Prev() for more details.
	//
	// For a block encoded with a restart interval of 1, cached and cachedBuf
	// will not be used as there are no prefix compressed entries between the
	// restart points.
	cached      []blockEntry
	cachedBuf   []byte
	cacheHandle cache.Handle
	// The first key in the block. This is used by the caller to set bounds
	// for block iteration for already loaded blocks.
	firstKey InternalKey
}
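
// A hedged usage sketch (illustrative only, not part of the original source):
// build a block with blockWriter and read it back with a blockIter. It assumes
// base.MakeInternalKey and base.InternalKeyKindSet from internal/base, and uses
// bytes.Compare as the comparator.
//
//	var w blockWriter
//	w.restartInterval = 16
//	w.add(base.MakeInternalKey([]byte("apple"), 1, base.InternalKeyKindSet), []byte("red"))
//	w.add(base.MakeInternalKey([]byte("apricot"), 2, base.InternalKeyKindSet), []byte("orange"))
//	blk := w.finish()
//
//	it, err := newBlockIter(bytes.Compare, blk)
//	if err != nil {
//		// handle the error
//	}
//	for k, v := it.First(); k != nil; k, v = it.Next() {
//		_ = v // process k and v; both may be invalidated by the next positioning call
//	}
//	_ = it.Close()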

// blockIter implements the base.InternalIterator interface.
var _ base.InternalIterator = (*blockIter)(nil)

func newBlockIter(cmp Compare, block block) (*blockIter, error) {
	i := &blockIter{}
	return i, i.init(cmp, block, 0)
}

func (i *blockIter) String() string {
	return "block"
}

func (i *blockIter) init(cmp Compare, block block, globalSeqNum uint64) error {
	numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
	if numRestarts == 0 {
		return base.CorruptionErrorf("bitalostable/table: invalid table (block has no restart points)")
	}
	i.cmp = cmp
	i.restarts = int32(len(block)) - 4*(1+numRestarts)
	i.numRestarts = numRestarts
	i.globalSeqNum = globalSeqNum
	i.ptr = unsafe.Pointer(&block[0])
	i.data = block
	i.fullKey = i.fullKey[:0]
	i.val = nil
	i.clearCache()
	if i.restarts > 0 {
		if err := i.readFirstKey(); err != nil {
			return err
		}
	} else {
		// Block is empty.
		i.firstKey = InternalKey{}
	}
	return nil
}

func (i *blockIter) initHandle(cmp Compare, block cache.Handle, globalSeqNum uint64) error {
	i.cacheHandle.Release()
	i.cacheHandle = block
	return i.init(cmp, block.Get(), globalSeqNum)
}

func (i *blockIter) invalidate() {
	i.clearCache()
	i.offset = 0
	i.nextOffset = 0
	i.restarts = 0
	i.numRestarts = 0
	i.data = nil
}

// isDataInvalidated returns true when the blockIter has been invalidated
// using an invalidate call. NB: this is different from blockIter.Valid
// which is part of the InternalIterator implementation.
func (i *blockIter) isDataInvalidated() bool {
	return i.data == nil
}

func (i *blockIter) resetForReuse() blockIter {
	return blockIter{
		fullKey:   i.fullKey[:0],
		cached:    i.cached[:0],
		cachedBuf: i.cachedBuf[:0],
		data:      nil,
	}
}

func (i *blockIter) readEntry() {
	ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))

	// This is an ugly performance hack. Reading entries from blocks is one of
	// the inner-most routines and decoding the 3 varints per-entry takes
	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
	// us, so we do it manually. This provides a 10-15% performance improvement
	// on blockIter benchmarks on both go1.11 and go1.12.
	//
	// TODO(peter): remove this hack if go:inline is ever supported.
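
	// (Illustrative note: the three headers decoded below are the shared key
	// prefix length, the unshared key suffix length, and the value length. If
	// the previous entry's encoded key began with "apple" and this entry's key
	// begins with "apricot", the writer stored shared=2 and only the bytes
	// after "ap"; readEntry reconstructs the full key in i.fullKey by
	// appending the unshared suffix to the first `shared` bytes carried over
	// from the previous entry.)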

	var shared uint32
	if a := *((*uint8)(ptr)); a < 128 {
		shared = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		shared = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	var unshared uint32
	if a := *((*uint8)(ptr)); a < 128 {
		unshared = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		unshared = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	var value uint32
	if a := *((*uint8)(ptr)); a < 128 {
		value = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		value = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	unsharedKey := getBytes(ptr, int(unshared))
	i.fullKey = append(i.fullKey[:shared], unsharedKey...)
	if shared == 0 {
		// Provide stability for the key across positioning calls if the key
		// doesn't share a prefix with the previous key. This avoids having to
		// copy the key if the caller knows the block has a restart interval of
		// 1. An important example of this is range-del blocks.
		i.key = unsharedKey
	} else {
		i.key = i.fullKey
	}
	ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
	i.val = getBytes(ptr, int(value))
	i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
}

func (i *blockIter) readFirstKey() error {
	ptr := i.ptr

	// This is an ugly performance hack. Reading entries from blocks is one of
	// the inner-most routines and decoding the 3 varints per-entry takes
	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
	// us, so we do it manually. This provides a 10-15% performance improvement
	// on blockIter benchmarks on both go1.11 and go1.12.
	//
	// TODO(peter): remove this hack if go:inline is ever supported.

	if shared := *((*uint8)(ptr)); shared == 0 {
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else {
		// The shared length is != 0, which is invalid.
		panic("first key in block must have zero shared length")
	}

	var unshared uint32
	if a := *((*uint8)(ptr)); a < 128 {
		unshared = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		unshared = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	// Skip the value length.
	if a := *((*uint8)(ptr)); a < 128 {
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}

	firstKey := getBytes(ptr, int(unshared))
	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
	// BlockIter benchmarks.
	if n := len(firstKey) - 8; n >= 0 {
		i.firstKey.Trailer = binary.LittleEndian.Uint64(firstKey[n:])
		i.firstKey.UserKey = firstKey[:n:n]
		if i.globalSeqNum != 0 {
			i.firstKey.SetSeqNum(i.globalSeqNum)
		}
	} else {
		i.firstKey.Trailer = uint64(InternalKeyKindInvalid)
		i.firstKey.UserKey = nil
		return base.CorruptionErrorf("bitalostable/table: invalid firstKey in block")
	}
	return nil
}

func (i *blockIter) decodeInternalKey(key []byte) {
	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
	// BlockIter benchmarks.
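	//
	// (Illustrative note: an encoded internal key is the user key followed by
	// an 8-byte little-endian trailer which, as defined in internal/base,
	// packs the sequence number in the upper 56 bits and the key kind in the
	// low 8 bits; the user key is therefore everything except the final 8
	// bytes.)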
	if n := len(key) - 8; n >= 0 {
		i.ikey.Trailer = binary.LittleEndian.Uint64(key[n:])
		i.ikey.UserKey = key[:n:n]
		if i.globalSeqNum != 0 {
			i.ikey.SetSeqNum(i.globalSeqNum)
		}
	} else {
		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
		i.ikey.UserKey = nil
	}
}

func (i *blockIter) clearCache() {
	i.cached = i.cached[:0]
	i.cachedBuf = i.cachedBuf[:0]
}

func (i *blockIter) cacheEntry() {
	var valStart int32
	valSize := int32(len(i.val))
	if valSize > 0 {
		valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
	}

	i.cached = append(i.cached, blockEntry{
		offset:   i.offset,
		keyStart: int32(len(i.cachedBuf)),
		keyEnd:   int32(len(i.cachedBuf) + len(i.key)),
		valStart: valStart,
		valSize:  valSize,
	})
	i.cachedBuf = append(i.cachedBuf, i.key...)
}

// SeekGE implements internalIterator.SeekGE, as documented in the bitalostable
// package.
func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
	i.clearCache()

	ikey := base.MakeSearchKey(key)

	// Find the index of the smallest restart point whose key is > the key
	// sought; index will be numRestarts if there is no such restart point.
	i.offset = 0
	var index int32

	{
		// NB: manually inlined sort.Search is ~5% faster.
		//
		// Define f(-1) == false and f(n) == true.
		// Invariant: f(index-1) == false, f(upper) == true.
		upper := i.numRestarts
		for index < upper {
			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
			// index ≤ h < upper
			offset := int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*h:]))
			// For a restart point, there are 0 bytes shared with the previous key.
			// The varint encoding of 0 occupies 1 byte.
			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))

			// Decode the key at that restart point, and compare it to the key
			// sought. See the comment in readEntry for why we manually inline the
			// varint decoding.
			var v1 uint32
			if a := *((*uint8)(ptr)); a < 128 {
				v1 = uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 1)
			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
				v1 = uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 2)
			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 3)
			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 4)
			} else {
				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 5)
			}

			if *((*uint8)(ptr)) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 1)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 2)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 3)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 4)
			} else {
				ptr = unsafe.Pointer(uintptr(ptr) + 5)
			}

			// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
			// BlockIter benchmarks.
			s := getBytes(ptr, int(v1))
			var k InternalKey
			if n := len(s) - 8; n >= 0 {
				k.Trailer = binary.LittleEndian.Uint64(s[n:])
				k.UserKey = s[:n:n]
				// NB: We can't have duplicate keys if the globalSeqNum != 0, so we
				// leave the seqnum on this key as 0 as it won't affect our search
				// since ikey has the maximum seqnum.
			} else {
				k.Trailer = uint64(InternalKeyKindInvalid)
			}

			if base.InternalCompare(i.cmp, ikey, k) >= 0 {
				index = h + 1 // preserves f(i-1) == false
			} else {
				upper = h // preserves f(j) == true
			}
		}
		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
		// => answer is index.
	}

	// Since keys are strictly increasing, if index > 0 then the restart point at
	// index-1 will be the largest whose key is <= the key sought. If index ==
	// 0, then all keys in this block are larger than the key sought, and offset
	// remains at zero.
	if index > 0 {
		i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(index-1):]))
	}
	i.readEntry()
	i.decodeInternalKey(i.key)

	// Iterate from that restart point to somewhere >= the key sought.
	for ; i.valid(); i.Next() {
		if base.InternalCompare(i.cmp, i.ikey, ikey) >= 0 {
			return &i.ikey, i.val
		}
	}

	return nil, nil
}

// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
// bitalostable package.
func (i *blockIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, []byte) {
	// This should never be called as prefix iteration is handled by sstable.Iterator.
	panic("bitalostable: SeekPrefixGE unimplemented")
}

// SeekLT implements internalIterator.SeekLT, as documented in the bitalostable
// package.
func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
	i.clearCache()

	ikey := base.MakeSearchKey(key)

	// Find the index of the smallest restart point whose key is >= the key
	// sought; index will be numRestarts if there is no such restart point.
	i.offset = 0
	var index int32

	{
		// NB: manually inlined sort.Search is ~5% faster.
		//
		// Define f(-1) == false and f(n) == true.
		// Invariant: f(index-1) == false, f(upper) == true.
		upper := i.numRestarts
		for index < upper {
			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
			// index ≤ h < upper
			offset := int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*h:]))
			// For a restart point, there are 0 bytes shared with the previous key.
			// The varint encoding of 0 occupies 1 byte.
			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))

			// Decode the key at that restart point, and compare it to the key
			// sought. See the comment in readEntry for why we manually inline the
			// varint decoding.
			var v1 uint32
			if a := *((*uint8)(ptr)); a < 128 {
				v1 = uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 1)
			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
				v1 = uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 2)
			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 3)
			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 4)
			} else {
				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
				ptr = unsafe.Pointer(uintptr(ptr) + 5)
			}

			if *((*uint8)(ptr)) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 1)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 2)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 3)
			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
				ptr = unsafe.Pointer(uintptr(ptr) + 4)
			} else {
				ptr = unsafe.Pointer(uintptr(ptr) + 5)
			}

			// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
			// BlockIter benchmarks.
			s := getBytes(ptr, int(v1))
			var k InternalKey
			if n := len(s) - 8; n >= 0 {
				k.Trailer = binary.LittleEndian.Uint64(s[n:])
				k.UserKey = s[:n:n]
				// NB: We can't have duplicate keys if the globalSeqNum != 0, so we
				// leave the seqnum on this key as 0 as it won't affect our search
				// since ikey has the maximum seqnum.
			} else {
				k.Trailer = uint64(InternalKeyKindInvalid)
			}

			if base.InternalCompare(i.cmp, ikey, k) > 0 {
				index = h + 1 // preserves f(i-1) == false
			} else {
				upper = h // preserves f(j) == true
			}
		}
		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
		// => answer is index.
	}

	// Since keys are strictly increasing, if index > 0 then the restart point at
	// index-1 will be the largest whose key is < the key sought.
	targetOffset := i.restarts
	if index > 0 {
		i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(index-1):]))
		if index < i.numRestarts {
			targetOffset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(index):]))
		}
	} else if index == 0 {
		// If index == 0 then all keys in this block are larger than the key
		// sought.
		i.offset = -1
		i.nextOffset = 0
		return nil, nil
	}

	// Iterate from that restart point to somewhere >= the key sought, then back
	// up to the previous entry. The expectation is that we'll be performing
	// reverse iteration, so we cache the entries as we advance forward.
	i.nextOffset = i.offset

	for {
		i.offset = i.nextOffset
		i.readEntry()
		i.decodeInternalKey(i.key)

		if i.cmp(i.ikey.UserKey, ikey.UserKey) >= 0 {
			// The current key is greater than or equal to our search key. Back up to
			// the previous key which was less than our search key. Note that this for
			// loop will execute at least once with this if-block not being true, so
			// the key we are backing up to is the last one this loop cached.
			i.Prev()
			return &i.ikey, i.val
		}

		if i.nextOffset >= targetOffset {
			// We've reached the end of the current restart block. Return the current
			// key. When the restart interval is 1, the first iteration of the for
			// loop will bring us here. In that case ikey is backed by the block so
			// we get the desired key stability guarantee for the lifetime of the
			// blockIter.
			break
		}

		i.cacheEntry()
	}

	if !i.valid() {
		return nil, nil
	}
	return &i.ikey, i.val
}

// First implements internalIterator.First, as documented in the bitalostable
// package.
func (i *blockIter) First() (*InternalKey, []byte) {
	i.offset = 0
	if !i.valid() {
		return nil, nil
	}
	i.clearCache()
	i.readEntry()
	i.decodeInternalKey(i.key)
	return &i.ikey, i.val
}

// Last implements internalIterator.Last, as documented in the bitalostable package.
func (i *blockIter) Last() (*InternalKey, []byte) {
	// Seek forward from the last restart point.
	i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(i.numRestarts-1):]))
	if !i.valid() {
		return nil, nil
	}

	i.readEntry()
	i.clearCache()

	for i.nextOffset < i.restarts {
		i.cacheEntry()
		i.offset = i.nextOffset
		i.readEntry()
	}

	i.decodeInternalKey(i.key)
	return &i.ikey, i.val
}

// Next implements internalIterator.Next, as documented in the bitalostable
// package.
func (i *blockIter) Next() (*InternalKey, []byte) {
	if len(i.cachedBuf) > 0 {
		// We're switching from reverse iteration to forward iteration. We need to
		// populate i.fullKey with the current key we're positioned at so that
		// readEntry() can use i.fullKey for key prefix decompression. Note that we
		// don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
		// SeekLT was the previous call, i.key may be backed by i.fullKey), but
		// copying into i.fullKey works for both cases.
		//
		// TODO(peter): Rather than clearing the cache, we could instead use the
		// cache until it is exhausted. This would likely be faster than falling
		// through to the normal forward iteration code below.
		i.fullKey = append(i.fullKey[:0], i.key...)
		i.clearCache()
	}

	i.offset = i.nextOffset
	if !i.valid() {
		return nil, nil
	}
	i.readEntry()
	// Manually inlined version of i.decodeInternalKey(i.key).
	if n := len(i.key) - 8; n >= 0 {
		i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:])
		i.ikey.UserKey = i.key[:n:n]
		if i.globalSeqNum != 0 {
			i.ikey.SetSeqNum(i.globalSeqNum)
		}
	} else {
		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
		i.ikey.UserKey = nil
	}
	return &i.ikey, i.val
}

// Prev implements internalIterator.Prev, as documented in the bitalostable
// package.
func (i *blockIter) Prev() (*InternalKey, []byte) {
	if n := len(i.cached) - 1; n >= 0 {
		i.nextOffset = i.offset
		e := &i.cached[n]
		i.offset = e.offset
		i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
		// Manually inlined version of i.decodeInternalKey(i.key).
		i.key = i.cachedBuf[e.keyStart:e.keyEnd]
		if n := len(i.key) - 8; n >= 0 {
			i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:])
			i.ikey.UserKey = i.key[:n:n]
			if i.globalSeqNum != 0 {
				i.ikey.SetSeqNum(i.globalSeqNum)
			}
		} else {
			i.ikey.Trailer = uint64(InternalKeyKindInvalid)
			i.ikey.UserKey = nil
		}
		i.cached = i.cached[:n]
		return &i.ikey, i.val
	}

	i.clearCache()
	if i.offset <= 0 {
		i.offset = -1
		i.nextOffset = 0
		return nil, nil
	}

	targetOffset := i.offset
	var index int32

	{
		// NB: manually inlined sort.Search is ~5% faster.
		//
		// Define f(-1) == false and f(n) == true.
		// Invariant: f(index-1) == false, f(upper) == true.
		upper := i.numRestarts
		for index < upper {
			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
			// index ≤ h < upper
			offset := int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*h:]))
			if offset < targetOffset {
				index = h + 1 // preserves f(i-1) == false
			} else {
				upper = h // preserves f(j) == true
			}
		}
		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
		// => answer is index.
	}

	i.offset = 0
	if index > 0 {
		i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(index-1):]))
	}

	i.readEntry()

	for i.nextOffset < targetOffset {
		i.cacheEntry()
		i.offset = i.nextOffset
		i.readEntry()
	}

	i.decodeInternalKey(i.key)
	return &i.ikey, i.val
}

// Key implements internalIterator.Key, as documented in the bitalostable package.
func (i *blockIter) Key() *InternalKey {
	return &i.ikey
}

// Value implements internalIterator.Value, as documented in the bitalostable
// package.
func (i *blockIter) Value() []byte {
	return i.val
}

// Error implements internalIterator.Error, as documented in the bitalostable
// package.
func (i *blockIter) Error() error {
	return nil // infallible
}

// Close implements internalIterator.Close, as documented in the bitalostable
// package.
func (i *blockIter) Close() error {
	i.cacheHandle.Release()
	i.cacheHandle = cache.Handle{}
	i.val = nil
	return nil
}

func (i *blockIter) SetBounds(lower, upper []byte) {
	// This should never be called as bounds are handled by sstable.Iterator.
	panic("bitalostable: SetBounds unimplemented")
}

func (i *blockIter) valid() bool {
	return i.offset >= 0 && i.offset < i.restarts
}

// fragmentBlockIter wraps a blockIter, implementing the
// keyspan.FragmentIterator interface. It's used for reading range deletion and
// range key blocks.
//
// Range deletions and range keys are fragmented before they're persisted to the
// block. Overlapping fragments have identical bounds. The fragmentBlockIter
// gathers all the fragments with identical bounds within a block and returns a
// single keyspan.Span describing all the keys defined over the span.
//
// # Memory lifetime
//
// A Span returned by fragmentBlockIter is only guaranteed to be stable until
// the next fragmentBlockIter iteration positioning method. A Span's Keys slice
// may be reused, so the user must not assume it's stable.
//
// Blocks holding range deletions and range keys are configured to use a restart
// interval of 1. This provides key stability. The caller may treat the various
// byte slices (start, end, suffix, value) as stable for the lifetime of the
// iterator.
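//
// For illustration, a range deletion over [b, c) written at sequence numbers 5
// and 2 is persisted as two adjacent internal keys, b#5,RANGEDEL and
// b#2,RANGEDEL, each storing the end key c in its value. gatherForward (below)
// recombines them into a single keyspan.Span with Start "b", End "c", and two
// Keys.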
type fragmentBlockIter struct {
	blockIter blockIter
	keyBuf    [2]keyspan.Key
	span      keyspan.Span
	err       error
	dir       int8
	closeHook func(i keyspan.FragmentIterator) error
}

func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter {
	return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()}
}

func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) {
	// TODO(jackson): The use of i.span.Keys to accumulate keys across multiple
	// calls to Decode is too confusing and subtle. Refactor to make it
	// explicit.

	// Decode the contents of the fragment's value. This always includes at
	// least the end key: RANGEDELs store the end key directly as the value,
	// whereas the various range key kinds use a more complicated encoding. The
	// details of the range key internal value format are documented within the
	// internal/rangekey package.
	switch k.Kind() {
	case base.InternalKeyKindRangeDelete:
		i.span = rangedel.Decode(*k, internalValue, i.span.Keys)
		i.err = nil
	case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
		i.span, i.err = rangekey.Decode(*k, internalValue, i.span.Keys)
	default:
		i.span = keyspan.Span{}
		i.err = base.CorruptionErrorf("bitalostable: corrupt keyspan fragment of kind %d", k.Kind())
	}
}

// gatherForward gathers internal keys with identical bounds. Keys defined over
// spans of the keyspace are fragmented such that any overlapping key spans have
// identical bounds. When these spans are persisted to a range deletion or range
// key block, they may be persisted as multiple internal keys in order to encode
// multiple sequence numbers or key kinds.
//
// gatherForward iterates forward, re-combining the fragmented internal keys to
// reconstruct a keyspan.Span that holds all the keys defined over the span.
func (i *fragmentBlockIter) gatherForward(k *InternalKey, internalValue []byte) *keyspan.Span {
	i.span = keyspan.Span{}
	if k == nil || !i.blockIter.valid() {
		return nil
	}
	i.err = nil
	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
	// when a span contains few keys.
	i.span.Keys = i.keyBuf[:0]

	// Decode the span's end key and individual keys from the value.
	i.decodeSpanKeys(k, internalValue)
	if i.err != nil {
		return nil
	}
	prevEnd := i.span.End

	// There might exist additional internal keys with identical bounds encoded
	// within the block. Iterate forward, accumulating all the keys with
	// identical bounds to i.span.
	k, internalValue = i.blockIter.Next()
	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
		i.decodeSpanKeys(k, internalValue)
		if i.err != nil {
			return nil
		}

		// Since k indicates an equal start key, the encoded end key must
		// exactly equal the original end key from the first internal key.
		// Overlapping fragments are required to have exactly equal start and
		// end bounds.
		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
			i.err = base.CorruptionErrorf("bitalostable: corrupt keyspan fragmentation")
			i.span = keyspan.Span{}
			return nil
		}
		k, internalValue = i.blockIter.Next()
	}
	// i.blockIter is positioned over the first internal key for the next span.
	return &i.span
}

// gatherBackward gathers internal keys with identical bounds. Keys defined over
// spans of the keyspace are fragmented such that any overlapping key spans have
// identical bounds. When these spans are persisted to a range deletion or range
// key block, they may be persisted as multiple internal keys in order to encode
// multiple sequence numbers or key kinds.
//
// gatherBackward iterates backwards, re-combining the fragmented internal keys
// to reconstruct a keyspan.Span that holds all the keys defined over the span.
func (i *fragmentBlockIter) gatherBackward(k *InternalKey, internalValue []byte) *keyspan.Span {
	i.span = keyspan.Span{}
	if k == nil || !i.blockIter.valid() {
		return nil
	}
	i.err = nil
	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
	// when a span contains few keys.
	i.span.Keys = i.keyBuf[:0]

	// Decode the span's end key and individual keys from the value.
	i.decodeSpanKeys(k, internalValue)
	if i.err != nil {
		return nil
	}
	prevEnd := i.span.End

	// There might exist additional internal keys with identical bounds encoded
	// within the block. Iterate backward, accumulating all the keys with
	// identical bounds to i.span.
	k, internalValue = i.blockIter.Prev()
	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
		i.decodeSpanKeys(k, internalValue)
		if i.err != nil {
			return nil
		}

		// Since k indicates an equal start key, the encoded end key must
		// exactly equal the original end key from the first internal key.
		// Overlapping fragments are required to have exactly equal start and
		// end bounds.
		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
			i.err = base.CorruptionErrorf("bitalostable: corrupt keyspan fragmentation")
			i.span = keyspan.Span{}
			return nil
		}
		k, internalValue = i.blockIter.Prev()
	}
	// i.blockIter is positioned over the last internal key for the previous
	// span.

	// Backwards iteration encounters internal keys in the wrong order.
	keyspan.SortKeysByTrailer(&i.span.Keys)

	return &i.span
}

// Error implements (keyspan.FragmentIterator).Error.
func (i *fragmentBlockIter) Error() error {
	return i.err
}

// Close implements (keyspan.FragmentIterator).Close.
func (i *fragmentBlockIter) Close() error {
	var err error
	if i.closeHook != nil {
		err = i.closeHook(i)
	}
	err = firstError(err, i.blockIter.Close())
	return err
}

// First implements (keyspan.FragmentIterator).First.
func (i *fragmentBlockIter) First() *keyspan.Span {
	i.dir = +1
	return i.gatherForward(i.blockIter.First())
}

// Last implements (keyspan.FragmentIterator).Last.
func (i *fragmentBlockIter) Last() *keyspan.Span {
	i.dir = -1
	return i.gatherBackward(i.blockIter.Last())
}

// Next implements (keyspan.FragmentIterator).Next.
func (i *fragmentBlockIter) Next() *keyspan.Span {
	switch {
	case i.dir == -1 && !i.span.Valid():
		// Switching directions.
		//
		// i.blockIter is exhausted, before the first key. Move onto the first.
		i.blockIter.First()
		i.dir = +1
	case i.dir == -1 && i.span.Valid():
		// Switching directions.
		//
		// i.blockIter is currently positioned over the last internal key for
		// the previous span. Next it once to move to the first internal key
		// that makes up the current span, and gatherForward to land on the
		// first internal key making up the next span.
		//
		// In the diagram below, if the last span returned to the user during
		// reverse iteration was [b,c), i.blockIter is currently positioned at
		// [a,b). The block iter must be positioned over [d,e) to gather the
		// next span's fragments.
		//
		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
		//          ^                       ^
		//     i.blockIter                 want
		if x := i.gatherForward(i.blockIter.Next()); invariants.Enabled && !x.Valid() {
			panic("bitalostable: invariant violation: next entry unexpectedly invalid")
		}
		i.dir = +1
	}
	return i.gatherForward(&i.blockIter.ikey, i.blockIter.val)
}

// Prev implements (keyspan.FragmentIterator).Prev.
func (i *fragmentBlockIter) Prev() *keyspan.Span {
	switch {
	case i.dir == +1 && !i.span.Valid():
		// Switching directions.
		//
		// i.blockIter is exhausted, after the last key. Move onto the last.
		i.blockIter.Last()
		i.dir = -1
	case i.dir == +1 && i.span.Valid():
		// Switching directions.
		//
		// i.blockIter is currently positioned over the first internal key for
		// the next span. Prev it once to move to the last internal key that
		// makes up the current span, and gatherBackward to land on the last
		// internal key making up the previous span.
		//
		// In the diagram below, if the last span returned to the user during
		// forward iteration was [b,c), i.blockIter is currently positioned at
		// [d,e). The block iter must be positioned over [a,b) to gather the
		// previous span's fragments.
		//
		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
		//          ^                       ^
		//         want                i.blockIter
		if x := i.gatherBackward(i.blockIter.Prev()); invariants.Enabled && !x.Valid() {
			panic("bitalostable: invariant violation: previous entry unexpectedly invalid")
		}
		i.dir = -1
	}
	return i.gatherBackward(&i.blockIter.ikey, i.blockIter.val)
}

// SeekGE implements (keyspan.FragmentIterator).SeekGE.
func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span {
	i.dir = +1
	return i.gatherForward(i.blockIter.SeekGE(k, base.SeekGEFlags(0)))
}

// SeekLT implements (keyspan.FragmentIterator).SeekLT.
func (i *fragmentBlockIter) SeekLT(k []byte) *keyspan.Span {
	i.dir = -1
	return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone))
}

// String implements fmt.Stringer.
func (i *fragmentBlockIter) String() string {
	return "fragment-block-iter"
}

// SetCloseHook implements sstable.FragmentIterator.
func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) {
	i.closeHook = fn
}
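
// A hedged usage sketch (illustrative only, not part of the original source):
// iterating over the spans in a range deletion block. In the real read path the
// iterator is wired up by the sstable reader via cache handles; here the block
// bytes (rangeDelBlock) and the comparator (bytes.Compare) are assumed to be
// available directly.
//
//	var fi fragmentBlockIter
//	if err := fi.blockIter.init(bytes.Compare, rangeDelBlock, 0); err != nil {
//		return err
//	}
//	for s := fi.First(); s != nil; s = fi.Next() {
//		// s.Start, s.End, and s.Keys describe one defragmented span; the Span
//		// is only guaranteed stable until the next positioning call.
//	}
//	return fi.Close()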