github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/block.go

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"context"
     9  	"encoding/binary"
    10  	"unsafe"
    11  
    12  	"github.com/cockroachdb/errors"
    13  	"github.com/cockroachdb/pebble/internal/base"
    14  	"github.com/cockroachdb/pebble/internal/invariants"
    15  	"github.com/cockroachdb/pebble/internal/keyspan"
    16  	"github.com/cockroachdb/pebble/internal/manual"
    17  	"github.com/cockroachdb/pebble/internal/rangedel"
    18  	"github.com/cockroachdb/pebble/internal/rangekey"
    19  )
    20  
    21  func uvarintLen(v uint32) int {
    22  	i := 0
    23  	for v >= 0x80 {
    24  		v >>= 7
    25  		i++
    26  	}
    27  	return i + 1
    28  }
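
         // Illustrative sketch (not part of the original file): uvarintLen mirrors
         // the byte count binary.PutUvarint would produce for the same value, e.g.
         //
         //	uvarintLen(127) == 1 // 127 < 0x80 fits in a single byte
         //	uvarintLen(300) == 2 // 300 needs one continuation byte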
    29  
    30  type blockWriter struct {
    31  	restartInterval int
    32  	nEntries        int
    33  	nextRestart     int
    34  	buf             []byte
    35  	// For datablocks in TableFormatPebblev3, we steal the most significant bit
    36  	// in restarts for encoding setHasSameKeyPrefixSinceLastRestart. This leaves
    37  	// us with 31 bits, which is more than enough (no one needs > 2GB blocks).
    38  	// Typically, restarts occur every 16 keys, and by storing this bit with the
    39  	// restart, we can optimize for the case where a user wants to skip to the
    40  	// next prefix which happens to be in the same data block, but is > 16 keys
    41  	// away. We have seen production situations with 100+ versions per MVCC key
     42  	// (which share the same prefix). Additionally, for such writers, prefix
     43  	// compression of a key against the preceding key is limited to the prefix
     44  	// part of the preceding key -- this ensures that when doing NextPrefix
     45  	// (see blockIter) we don't need to assemble the full key for each step,
     46  	// since by limiting the length of the shared key we ensure that any of the
     47  	// keys with the same prefix can be used to assemble the full key when the
     48  	// prefix does change.
    49  	restarts []uint32
    50  	// Do not read curKey directly from outside blockWriter since it can have
    51  	// the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or
    52  	// getCurUserKey() instead.
    53  	curKey []byte
    54  	// curValue excludes the optional prefix provided to
    55  	// storeWithOptionalValuePrefix.
    56  	curValue []byte
    57  	prevKey  []byte
    58  	tmp      [4]byte
    59  	// We don't know the state of the sets that were at the end of the previous
     60  	// block, so this is initially false. It may be true for the second and later
    61  	// restarts in a block. Not having inter-block information is fine since we
    62  	// will optimize by stepping through restarts only within the same block.
    63  	// Note that the first restart is the first key in the block.
    64  	setHasSameKeyPrefixSinceLastRestart bool
    65  }
    66  
    67  func (w *blockWriter) clear() {
    68  	*w = blockWriter{
    69  		buf:      w.buf[:0],
    70  		restarts: w.restarts[:0],
    71  		curKey:   w.curKey[:0],
    72  		curValue: w.curValue[:0],
    73  		prevKey:  w.prevKey[:0],
    74  	}
    75  }
    76  
    77  // MaximumBlockSize is an extremely generous maximum block size of 256MiB. We
    78  // explicitly place this limit to reserve a few bits in the restart for
    79  // internal use.
    80  const MaximumBlockSize = 1 << 28
    81  const setHasSameKeyPrefixRestartMask uint32 = 1 << 31
    82  const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
    83  const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000
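
         // Illustrative sketch (not part of the original file) of how a restart
         // offset and the setHasSameKeyPrefix flag share a single uint32, given the
         // constants above:
         //
         //	offset := uint32(1000)                             // always < MaximumBlockSize, so bit 31 is free
         //	restart := offset | setHasSameKeyPrefixRestartMask // record the flag in bit 31
         //	hasSamePrefix := restart&setHasSameKeyPrefixRestartMask != 0
         //	offset = restart &^ setHasSameKeyPrefixRestartMask // recover the plain offset (1000)
         //
         // decodeRestart, later in this file, recovers the offset the same way by
         // masking the most significant bit of the little-endian high byte.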
    84  
    85  func (w *blockWriter) getCurKey() InternalKey {
    86  	k := base.DecodeInternalKey(w.curKey)
    87  	k.Trailer = k.Trailer & trailerObsoleteMask
    88  	return k
    89  }
    90  
    91  func (w *blockWriter) getCurUserKey() []byte {
    92  	n := len(w.curKey) - base.InternalTrailerLen
    93  	if n < 0 {
    94  		panic(errors.AssertionFailedf("corrupt key in blockWriter buffer"))
    95  	}
    96  	return w.curKey[:n:n]
    97  }
    98  
    99  // If !addValuePrefix, the valuePrefix is ignored.
   100  func (w *blockWriter) storeWithOptionalValuePrefix(
   101  	keySize int,
   102  	value []byte,
   103  	maxSharedKeyLen int,
   104  	addValuePrefix bool,
   105  	valuePrefix valuePrefix,
   106  	setHasSameKeyPrefix bool,
   107  ) {
   108  	shared := 0
   109  	if !setHasSameKeyPrefix {
   110  		w.setHasSameKeyPrefixSinceLastRestart = false
   111  	}
   112  	if w.nEntries == w.nextRestart {
   113  		w.nextRestart = w.nEntries + w.restartInterval
   114  		restart := uint32(len(w.buf))
   115  		if w.setHasSameKeyPrefixSinceLastRestart {
   116  			restart = restart | setHasSameKeyPrefixRestartMask
   117  		}
   118  		w.setHasSameKeyPrefixSinceLastRestart = true
   119  		w.restarts = append(w.restarts, restart)
   120  	} else {
   121  		// TODO(peter): Manually inlined version of base.SharedPrefixLen(). This
   122  		// is 3% faster on BenchmarkWriter on go1.16. Remove if future versions
    123  		// show this to not be a performance win. For now, functions that use
    124  		// unsafe cannot be inlined.
   125  		n := maxSharedKeyLen
   126  		if n > len(w.prevKey) {
   127  			n = len(w.prevKey)
   128  		}
   129  		asUint64 := func(b []byte, i int) uint64 {
   130  			return binary.LittleEndian.Uint64(b[i:])
   131  		}
   132  		for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) {
   133  			shared += 8
   134  		}
   135  		for shared < n && w.curKey[shared] == w.prevKey[shared] {
   136  			shared++
   137  		}
   138  	}
   139  
   140  	lenValuePlusOptionalPrefix := len(value)
   141  	if addValuePrefix {
   142  		lenValuePlusOptionalPrefix++
   143  	}
   144  	needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + lenValuePlusOptionalPrefix
   145  	n := len(w.buf)
   146  	if cap(w.buf) < n+needed {
   147  		newCap := 2 * cap(w.buf)
   148  		if newCap == 0 {
   149  			newCap = 1024
   150  		}
   151  		for newCap < n+needed {
   152  			newCap *= 2
   153  		}
   154  		newBuf := make([]byte, n, newCap)
   155  		copy(newBuf, w.buf)
   156  		w.buf = newBuf
   157  	}
   158  	w.buf = w.buf[:n+needed]
   159  
   160  	// TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15%
   161  	// faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions
   162  	// show this to not be a performance win.
   163  	{
   164  		x := uint32(shared)
   165  		for x >= 0x80 {
   166  			w.buf[n] = byte(x) | 0x80
   167  			x >>= 7
   168  			n++
   169  		}
   170  		w.buf[n] = byte(x)
   171  		n++
   172  	}
   173  
   174  	{
   175  		x := uint32(keySize - shared)
   176  		for x >= 0x80 {
   177  			w.buf[n] = byte(x) | 0x80
   178  			x >>= 7
   179  			n++
   180  		}
   181  		w.buf[n] = byte(x)
   182  		n++
   183  	}
   184  
   185  	{
   186  		x := uint32(lenValuePlusOptionalPrefix)
   187  		for x >= 0x80 {
   188  			w.buf[n] = byte(x) | 0x80
   189  			x >>= 7
   190  			n++
   191  		}
   192  		w.buf[n] = byte(x)
   193  		n++
   194  	}
   195  
   196  	n += copy(w.buf[n:], w.curKey[shared:])
   197  	if addValuePrefix {
   198  		w.buf[n : n+1][0] = byte(valuePrefix)
   199  		n++
   200  	}
   201  	n += copy(w.buf[n:], value)
   202  	w.buf = w.buf[:n]
   203  
   204  	w.curValue = w.buf[n-len(value):]
   205  
   206  	w.nEntries++
   207  }
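
         // For reference, a sketch (derived from the code above; not part of the
         // original file) of the wire format of a single entry, as written here and
         // decoded by blockIter.readEntry:
         //
         //	varint: shared key length
         //	varint: unshared key length
         //	varint: value length (including the optional 1-byte valuePrefix)
         //	unshared key bytes
         //	optional 1-byte valuePrefix (only when addValuePrefix is true)
         //	value bytes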
   208  
   209  func (w *blockWriter) add(key InternalKey, value []byte) {
   210  	w.addWithOptionalValuePrefix(
   211  		key, false, value, len(key.UserKey), false, 0, false)
   212  }
   213  
   214  // Callers that always set addValuePrefix to false should use add() instead.
   215  //
   216  // isObsolete indicates whether this key-value pair is obsolete in this
   217  // sstable (only applicable when writing data blocks) -- see the comment in
   218  // table.go and the longer one in format.go. addValuePrefix adds a 1 byte
   219  // prefix to the value, specified in valuePrefix -- this is used for data
   220  // blocks in TableFormatPebblev3 onwards for SETs (see the comment in
   221  // format.go, with more details in value_block.go). setHasSameKeyPrefix is
   222  // also used in TableFormatPebblev3 onwards for SETs.
   223  func (w *blockWriter) addWithOptionalValuePrefix(
   224  	key InternalKey,
   225  	isObsolete bool,
   226  	value []byte,
   227  	maxSharedKeyLen int,
   228  	addValuePrefix bool,
   229  	valuePrefix valuePrefix,
   230  	setHasSameKeyPrefix bool,
   231  ) {
   232  	w.curKey, w.prevKey = w.prevKey, w.curKey
   233  
   234  	size := key.Size()
   235  	if cap(w.curKey) < size {
   236  		w.curKey = make([]byte, 0, size*2)
   237  	}
   238  	w.curKey = w.curKey[:size]
   239  	if isObsolete {
   240  		key.Trailer = key.Trailer | trailerObsoleteBit
   241  	}
   242  	key.Encode(w.curKey)
   243  
   244  	w.storeWithOptionalValuePrefix(
   245  		size, value, maxSharedKeyLen, addValuePrefix, valuePrefix, setHasSameKeyPrefix)
   246  }
   247  
   248  func (w *blockWriter) finish() []byte {
   249  	// Write the restart points to the buffer.
   250  	if w.nEntries == 0 {
   251  		// Every block must have at least one restart point.
   252  		if cap(w.restarts) > 0 {
   253  			w.restarts = w.restarts[:1]
   254  			w.restarts[0] = 0
   255  		} else {
   256  			w.restarts = append(w.restarts, 0)
   257  		}
   258  	}
   259  	tmp4 := w.tmp[:4]
   260  	for _, x := range w.restarts {
   261  		binary.LittleEndian.PutUint32(tmp4, x)
   262  		w.buf = append(w.buf, tmp4...)
   263  	}
   264  	binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts)))
   265  	w.buf = append(w.buf, tmp4...)
   266  	result := w.buf
   267  
   268  	// Reset the block state.
   269  	w.nEntries = 0
   270  	w.nextRestart = 0
   271  	w.buf = w.buf[:0]
   272  	w.restarts = w.restarts[:0]
   273  	return result
   274  }
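
         // Putting the pieces together, a finished block as laid out by finish()
         // looks like the following (illustrative; derived from the code above):
         //
         //	entry 0 | entry 1 | ... | entry n-1
         //	restart offset 0 (uint32 LE) | ... | restart offset r-1 (uint32 LE)
         //	number of restart points r (uint32 LE)
         //
         // A minimal usage sketch (not part of the original file; assumes the
         // base.MakeInternalKey helper and the InternalKeyKindSet alias available in
         // this package):
         //
         //	var w blockWriter
         //	w.restartInterval = 16
         //	w.add(base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet), []byte("v1"))
         //	w.add(base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet), []byte("v2"))
         //	blk := w.finish() // entries followed by the restart array and restart count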
   275  
   276  // emptyBlockSize holds the size of an empty block. Every block ends
   277  // in a uint32 trailer encoding the number of restart points within the
   278  // block.
   279  const emptyBlockSize = 4
   280  
   281  func (w *blockWriter) estimatedSize() int {
   282  	return len(w.buf) + 4*len(w.restarts) + emptyBlockSize
   283  }
   284  
   285  type blockEntry struct {
   286  	offset   int32
   287  	keyStart int32
   288  	keyEnd   int32
   289  	valStart int32
   290  	valSize  int32
   291  }
   292  
   293  // blockIter is an iterator over a single block of data.
   294  //
   295  // A blockIter provides an additional guarantee around key stability when a
   296  // block has a restart interval of 1 (i.e. when there is no prefix
   297  // compression). Key stability refers to whether the InternalKey.UserKey bytes
   298  // returned by a positioning call will remain stable after a subsequent
   299  // positioning call. The normal case is that a positioning call will invalidate
   300  // any previously returned InternalKey.UserKey. If a block has a restart
   301  // interval of 1 (no prefix compression), blockIter guarantees that
   302  // InternalKey.UserKey will point to the key as stored in the block itself
   303  // which will remain valid until the blockIter is closed. The key stability
   304  // guarantee is used by the range tombstone and range key code, which knows that
   305  // the respective blocks are always encoded with a restart interval of 1. This
   306  // per-block key stability guarantee is sufficient for range tombstones and
    307  // range keys as they are always encoded in a single block.
   308  //
   309  // A blockIter also provides a value stability guarantee for range deletions and
   310  // range keys since there is only a single range deletion and range key block
   311  // per sstable and the blockIter will not release the bytes for the block until
   312  // it is closed.
   313  //
   314  // Note on why blockIter knows about lazyValueHandling:
   315  //
   316  // blockIter's positioning functions (that return a LazyValue), are too
   317  // complex to inline even prior to lazyValueHandling. blockIter.Next and
   318  // blockIter.First were by far the cheapest and had costs 195 and 180
   319  // respectively, which exceeds the budget of 80. We initially tried to keep
   320  // the lazyValueHandling logic out of blockIter by wrapping it with a
   321  // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
   322  // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
   323  // they called the corresponding blockIter func and then decided whether the
   324  // value was in fact in-place (so return immediately) or needed further
   325  // handling. But these also turned out too costly for mid-stack inlining since
   326  // simple calls like the following have a high cost that is barely under the
   327  // budget of 80
   328  //
   329  //	k, v := i.data.SeekGE(key, flags)  // cost 74
   330  //	k, v := i.data.Next()              // cost 72
   331  //
   332  // We have 2 options for minimizing performance regressions:
   333  //   - Include the lazyValueHandling logic in the already non-inlineable
   334  //     blockIter functions: Since most of the time is spent in data block iters,
   335  //     it is acceptable to take the small hit of unnecessary branching (which
   336  //     hopefully branch prediction will predict correctly) for other kinds of
   337  //     blocks.
   338  //   - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
   339  //     v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
   340  //     v3 sstable. We would want to manage these copies via code generation.
   341  //
   342  // We have picked the first option here.
   343  type blockIter struct {
   344  	cmp Compare
   345  	// offset is the byte index that marks where the current key/value is
   346  	// encoded in the block.
   347  	offset int32
   348  	// nextOffset is the byte index where the next key/value is encoded in the
   349  	// block.
   350  	nextOffset int32
   351  	// A "restart point" in a block is a point where the full key is encoded,
   352  	// instead of just having a suffix of the key encoded. See readEntry() for
   353  	// how prefix compression of keys works. Keys in between two restart points
   354  	// only have a suffix encoded in the block. When restart interval is 1, no
   355  	// prefix compression of keys happens. This is the case with range tombstone
   356  	// blocks.
   357  	//
   358  	// All restart offsets are listed in increasing order in
   359  	// i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
   360  	// 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
   361  	// therefore be seen as the point where data in the block ends, and a list
   362  	// of offsets of all restart points begins.
   363  	restarts int32
   364  	// Number of restart points in this block. Encoded at the end of the block
   365  	// as a uint32.
   366  	numRestarts  int32
   367  	globalSeqNum uint64
   368  	ptr          unsafe.Pointer
   369  	data         []byte
   370  	// key contains the raw key the iterator is currently pointed at. This may
   371  	// point directly to data stored in the block (for a key which has no prefix
   372  	// compression), to fullKey (for a prefix compressed key), or to a slice of
   373  	// data stored in cachedBuf (during reverse iteration).
   374  	key []byte
   375  	// fullKey is a buffer used for key prefix decompression.
   376  	fullKey []byte
   377  	// val contains the value the iterator is currently pointed at. If non-nil,
   378  	// this points to a slice of the block data.
   379  	val []byte
   380  	// lazyValue is val turned into a LazyValue, whenever a positioning method
   381  	// returns a non-nil key-value pair.
   382  	lazyValue base.LazyValue
   383  	// ikey contains the decoded InternalKey the iterator is currently pointed
   384  	// at. Note that the memory backing ikey.UserKey is either data stored
   385  	// directly in the block, fullKey, or cachedBuf. The key stability guarantee
   386  	// for blocks built with a restart interval of 1 is achieved by having
   387  	// ikey.UserKey always point to data stored directly in the block.
   388  	ikey InternalKey
   389  	// cached and cachedBuf are used during reverse iteration. They are needed
   390  	// because we can't perform prefix decoding in reverse, only in the forward
   391  	// direction. In order to iterate in reverse, we decode and cache the entries
   392  	// between two restart points.
   393  	//
   394  	// Note that cached[len(cached)-1] contains the previous entry to the one the
   395  	// blockIter is currently pointed at. As usual, nextOffset will contain the
   396  	// offset of the next entry. During reverse iteration, nextOffset will be
   397  	// updated to point to offset, and we'll set the blockIter to point at the
   398  	// entry cached[len(cached)-1]. See Prev() for more details.
   399  	//
   400  	// For a block encoded with a restart interval of 1, cached and cachedBuf
   401  	// will not be used as there are no prefix compressed entries between the
   402  	// restart points.
   403  	cached    []blockEntry
   404  	cachedBuf []byte
   405  	handle    bufferHandle
    406  	// firstUserKey is used for block iteration over already-loaded blocks.
   407  	firstUserKey      []byte
   408  	lazyValueHandling struct {
   409  		vbr            *valueBlockReader
   410  		hasValuePrefix bool
   411  	}
   412  	hideObsoletePoints bool
   413  }
   414  
   415  // blockIter implements the base.InternalIterator interface.
   416  var _ base.InternalIterator = (*blockIter)(nil)
   417  
   418  func newBlockIter(cmp Compare, block block) (*blockIter, error) {
   419  	i := &blockIter{}
   420  	return i, i.init(cmp, block, 0, false)
   421  }
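
         // A minimal iteration sketch (not part of the original file), assuming blk
         // is a []byte produced by blockWriter.finish and block is the package's
         // []byte-based block type:
         //
         //	it, err := newBlockIter(base.DefaultComparer.Compare, block(blk))
         //	if err != nil {
         //		// handle corruption
         //	}
         //	for k, v := it.First(); k != nil; k, v = it.Next() {
         //		_ = v // base.LazyValue; in-place unless value blocks are in use
         //	}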
   422  
   423  func (i *blockIter) String() string {
   424  	return "block"
   425  }
   426  
   427  func (i *blockIter) init(
   428  	cmp Compare, block block, globalSeqNum uint64, hideObsoletePoints bool,
   429  ) error {
   430  	numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
   431  	if numRestarts == 0 {
   432  		return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
   433  	}
   434  	i.cmp = cmp
   435  	i.restarts = int32(len(block)) - 4*(1+numRestarts)
   436  	i.numRestarts = numRestarts
   437  	i.globalSeqNum = globalSeqNum
   438  	i.ptr = unsafe.Pointer(&block[0])
   439  	i.data = block
   440  	i.fullKey = i.fullKey[:0]
   441  	i.val = nil
   442  	i.hideObsoletePoints = hideObsoletePoints
   443  	i.clearCache()
   444  	if i.restarts > 0 {
   445  		if err := i.readFirstKey(); err != nil {
   446  			return err
   447  		}
   448  	} else {
   449  		// Block is empty.
   450  		i.firstUserKey = nil
   451  	}
   452  	return nil
   453  }
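
         // Worked example (not part of the original file): for a block holding 100
         // bytes of entries and 2 restart points, the encoded block is
         //
         //	100 (entries) + 2*4 (restart offsets) + 4 (numRestarts) = 112 bytes
         //
         // and init computes i.restarts = 112 - 4*(1+2) = 100, i.e. the offset at
         // which entry data ends and the restart array begins.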
   454  
   455  // NB: two cases of hideObsoletePoints:
   456  //   - Local sstable iteration: globalSeqNum will be set iff the sstable was
   457  //     ingested.
   458  //   - Foreign sstable iteration: globalSeqNum is always set.
   459  func (i *blockIter) initHandle(
   460  	cmp Compare, block bufferHandle, globalSeqNum uint64, hideObsoletePoints bool,
   461  ) error {
   462  	i.handle.Release()
   463  	i.handle = block
   464  	return i.init(cmp, block.Get(), globalSeqNum, hideObsoletePoints)
   465  }
   466  
   467  func (i *blockIter) invalidate() {
   468  	i.clearCache()
   469  	i.offset = 0
   470  	i.nextOffset = 0
   471  	i.restarts = 0
   472  	i.numRestarts = 0
   473  	i.data = nil
   474  }
   475  
   476  // isDataInvalidated returns true when the blockIter has been invalidated
   477  // using an invalidate call. NB: this is different from blockIter.Valid
   478  // which is part of the InternalIterator implementation.
   479  func (i *blockIter) isDataInvalidated() bool {
   480  	return i.data == nil
   481  }
   482  
   483  func (i *blockIter) resetForReuse() blockIter {
   484  	return blockIter{
   485  		fullKey:   i.fullKey[:0],
   486  		cached:    i.cached[:0],
   487  		cachedBuf: i.cachedBuf[:0],
   488  		data:      nil,
   489  	}
   490  }
   491  
   492  func (i *blockIter) readEntry() {
   493  	ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
   494  
   495  	// This is an ugly performance hack. Reading entries from blocks is one of
   496  	// the inner-most routines and decoding the 3 varints per-entry takes
    497  	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
   498  	// us, so we do it manually. This provides a 10-15% performance improvement
   499  	// on blockIter benchmarks on both go1.11 and go1.12.
   500  	//
   501  	// TODO(peter): remove this hack if go:inline is ever supported.
   502  
   503  	var shared uint32
   504  	if a := *((*uint8)(ptr)); a < 128 {
   505  		shared = uint32(a)
   506  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   507  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   508  		shared = uint32(b)<<7 | uint32(a)
   509  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   510  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   511  		shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   512  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   513  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   514  		shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   515  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   516  	} else {
   517  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   518  		shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   519  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   520  	}
   521  
   522  	var unshared uint32
   523  	if a := *((*uint8)(ptr)); a < 128 {
   524  		unshared = uint32(a)
   525  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   526  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   527  		unshared = uint32(b)<<7 | uint32(a)
   528  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   529  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   530  		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   531  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   532  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   533  		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   534  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   535  	} else {
   536  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   537  		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   538  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   539  	}
   540  
   541  	var value uint32
   542  	if a := *((*uint8)(ptr)); a < 128 {
   543  		value = uint32(a)
   544  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   545  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   546  		value = uint32(b)<<7 | uint32(a)
   547  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   548  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   549  		value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   550  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   551  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   552  		value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   553  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   554  	} else {
   555  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   556  		value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   557  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   558  	}
   559  
   560  	unsharedKey := getBytes(ptr, int(unshared))
   561  	// TODO(sumeer): move this into the else block below.
   562  	i.fullKey = append(i.fullKey[:shared], unsharedKey...)
   563  	if shared == 0 {
   564  		// Provide stability for the key across positioning calls if the key
   565  		// doesn't share a prefix with the previous key. This removes requiring the
   566  		// key to be copied if the caller knows the block has a restart interval of
   567  		// 1. An important example of this is range-del blocks.
   568  		i.key = unsharedKey
   569  	} else {
   570  		i.key = i.fullKey
   571  	}
   572  	ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
   573  	i.val = getBytes(ptr, int(value))
   574  	i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
   575  }
   576  
   577  func (i *blockIter) readFirstKey() error {
   578  	ptr := i.ptr
   579  
   580  	// This is an ugly performance hack. Reading entries from blocks is one of
   581  	// the inner-most routines and decoding the 3 varints per-entry takes
    582  	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
   583  	// us, so we do it manually. This provides a 10-15% performance improvement
   584  	// on blockIter benchmarks on both go1.11 and go1.12.
   585  	//
   586  	// TODO(peter): remove this hack if go:inline is ever supported.
   587  
   588  	if shared := *((*uint8)(ptr)); shared == 0 {
   589  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   590  	} else {
   591  		// The shared length is != 0, which is invalid.
   592  		panic("first key in block must have zero shared length")
   593  	}
   594  
   595  	var unshared uint32
   596  	if a := *((*uint8)(ptr)); a < 128 {
   597  		unshared = uint32(a)
   598  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   599  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   600  		unshared = uint32(b)<<7 | uint32(a)
   601  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   602  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   603  		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   604  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   605  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   606  		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   607  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   608  	} else {
   609  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   610  		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   611  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   612  	}
   613  
   614  	// Skip the value length.
   615  	if a := *((*uint8)(ptr)); a < 128 {
   616  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   617  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
   618  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   619  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
   620  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   621  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
   622  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   623  	} else {
   624  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   625  	}
   626  
   627  	firstKey := getBytes(ptr, int(unshared))
   628  	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
   629  	// BlockIter benchmarks.
   630  	if n := len(firstKey) - 8; n >= 0 {
   631  		i.firstUserKey = firstKey[:n:n]
   632  	} else {
   633  		i.firstUserKey = nil
   634  		return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
   635  	}
   636  	return nil
   637  }
   638  
   639  // The sstable internal obsolete bit is set when writing a block and unset by
   640  // blockIter, so no code outside block writing/reading code ever sees it.
   641  const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
   642  const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
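
         // Sketch of the trailer layout these masks rely on (not part of the
         // original file): an InternalKey trailer packs (seqNum << 8) | kind into a
         // uint64, and the obsolete bit lives inside the kind byte. So, for a key
         // marked obsolete by blockWriter:
         //
         //	trailer := (seqNum << 8) | uint64(kind) | trailerObsoleteBit
         //	trailer & trailerObsoleteMask // clears only the obsolete bit
         //
         // which is how blockIter.decodeInternalKey below recovers the original
         // trailer.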
   643  
   644  func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
   645  	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
   646  	// BlockIter benchmarks.
   647  	if n := len(key) - 8; n >= 0 {
   648  		trailer := binary.LittleEndian.Uint64(key[n:])
   649  		hiddenPoint = i.hideObsoletePoints &&
   650  			(trailer&trailerObsoleteBit != 0)
   651  		i.ikey.Trailer = trailer & trailerObsoleteMask
   652  		i.ikey.UserKey = key[:n:n]
   653  		if i.globalSeqNum != 0 {
   654  			i.ikey.SetSeqNum(i.globalSeqNum)
   655  		}
   656  	} else {
   657  		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
   658  		i.ikey.UserKey = nil
   659  	}
   660  	return hiddenPoint
   661  }
   662  
   663  func (i *blockIter) clearCache() {
   664  	i.cached = i.cached[:0]
   665  	i.cachedBuf = i.cachedBuf[:0]
   666  }
   667  
   668  func (i *blockIter) cacheEntry() {
   669  	var valStart int32
   670  	valSize := int32(len(i.val))
   671  	if valSize > 0 {
   672  		valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
   673  	}
   674  
   675  	i.cached = append(i.cached, blockEntry{
   676  		offset:   i.offset,
   677  		keyStart: int32(len(i.cachedBuf)),
   678  		keyEnd:   int32(len(i.cachedBuf) + len(i.key)),
   679  		valStart: valStart,
   680  		valSize:  valSize,
   681  	})
   682  	i.cachedBuf = append(i.cachedBuf, i.key...)
   683  }
   684  
   685  func (i *blockIter) getFirstUserKey() []byte {
   686  	return i.firstUserKey
   687  }
   688  
   689  // SeekGE implements internalIterator.SeekGE, as documented in the pebble
   690  // package.
   691  func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
   692  	if invariants.Enabled && i.isDataInvalidated() {
   693  		panic(errors.AssertionFailedf("invalidated blockIter used"))
   694  	}
   695  
   696  	i.clearCache()
    697  	// Find the index of the smallest restart point whose key is >= the key
   698  	// sought; index will be numRestarts if there is no such restart point.
   699  	i.offset = 0
   700  	var index int32
   701  
   702  	{
    703  		// NB: manually inlined sort.Search is ~5% faster.
   704  		//
   705  		// Define f(-1) == false and f(n) == true.
   706  		// Invariant: f(index-1) == false, f(upper) == true.
   707  		upper := i.numRestarts
   708  		for index < upper {
   709  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
   710  			// index ≤ h < upper
   711  			offset := decodeRestart(i.data[i.restarts+4*h:])
   712  			// For a restart point, there are 0 bytes shared with the previous key.
   713  			// The varint encoding of 0 occupies 1 byte.
   714  			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
   715  
   716  			// Decode the key at that restart point, and compare it to the key
   717  			// sought. See the comment in readEntry for why we manually inline the
   718  			// varint decoding.
   719  			var v1 uint32
   720  			if a := *((*uint8)(ptr)); a < 128 {
   721  				v1 = uint32(a)
   722  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   723  			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   724  				v1 = uint32(b)<<7 | uint32(a)
   725  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   726  			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   727  				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   728  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   729  			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   730  				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   731  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   732  			} else {
   733  				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   734  				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   735  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   736  			}
   737  
   738  			if *((*uint8)(ptr)) < 128 {
   739  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   740  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
   741  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   742  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
   743  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   744  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
   745  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   746  			} else {
   747  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   748  			}
   749  
   750  			// Manually inlining part of base.DecodeInternalKey provides a 5-10%
   751  			// speedup on BlockIter benchmarks.
   752  			s := getBytes(ptr, int(v1))
   753  			var k []byte
   754  			if n := len(s) - 8; n >= 0 {
   755  				k = s[:n:n]
   756  			}
   757  			// Else k is invalid, and left as nil
   758  
   759  			if i.cmp(key, k) > 0 {
   760  				// The search key is greater than the user key at this restart point.
   761  				// Search beyond this restart point, since we are trying to find the
   762  				// first restart point with a user key >= the search key.
   763  				index = h + 1 // preserves f(i-1) == false
   764  			} else {
   765  				// k >= search key, so prune everything after index (since index
   766  				// satisfies the property we are looking for).
   767  				upper = h // preserves f(j) == true
   768  			}
   769  		}
   770  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
   771  		// => answer is index.
   772  	}
   773  
   774  	// index is the first restart point with key >= search key. Define the keys
   775  	// between a restart point and the next restart point as belonging to that
   776  	// restart point.
   777  	//
   778  	// Since keys are strictly increasing, if index > 0 then the restart point
   779  	// at index-1 will be the first one that has some keys belonging to it that
   780  	// could be equal to the search key.  If index == 0, then all keys in this
   781  	// block are larger than the key sought, and offset remains at zero.
   782  	if index > 0 {
   783  		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
   784  	}
   785  	i.readEntry()
   786  	hiddenPoint := i.decodeInternalKey(i.key)
   787  
   788  	// Iterate from that restart point to somewhere >= the key sought.
   789  	if !i.valid() {
   790  		return nil, base.LazyValue{}
   791  	}
   792  	if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 {
   793  		// Initialize i.lazyValue
   794  		if !i.lazyValueHandling.hasValuePrefix ||
   795  			base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
   796  			i.lazyValue = base.MakeInPlaceValue(i.val)
   797  		} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
   798  			i.lazyValue = base.MakeInPlaceValue(i.val[1:])
   799  		} else {
   800  			i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
   801  		}
   802  		return &i.ikey, i.lazyValue
   803  	}
   804  	for i.Next(); i.valid(); i.Next() {
   805  		if i.cmp(i.ikey.UserKey, key) >= 0 {
   806  			// i.Next() has already initialized i.lazyValue.
   807  			return &i.ikey, i.lazyValue
   808  		}
   809  	}
   810  	return nil, base.LazyValue{}
   811  }
   812  
   813  // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
   814  // pebble package.
   815  func (i *blockIter) SeekPrefixGE(
   816  	prefix, key []byte, flags base.SeekGEFlags,
   817  ) (*base.InternalKey, base.LazyValue) {
   818  	// This should never be called as prefix iteration is handled by sstable.Iterator.
   819  	panic("pebble: SeekPrefixGE unimplemented")
   820  }
   821  
   822  // SeekLT implements internalIterator.SeekLT, as documented in the pebble
   823  // package.
   824  func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
   825  	if invariants.Enabled && i.isDataInvalidated() {
   826  		panic(errors.AssertionFailedf("invalidated blockIter used"))
   827  	}
   828  
   829  	i.clearCache()
   830  	// Find the index of the smallest restart point whose key is >= the key
   831  	// sought; index will be numRestarts if there is no such restart point.
   832  	i.offset = 0
   833  	var index int32
   834  
   835  	{
   836  		// NB: manually inlined sort.Search is ~5% faster.
   837  		//
   838  		// Define f(-1) == false and f(n) == true.
   839  		// Invariant: f(index-1) == false, f(upper) == true.
   840  		upper := i.numRestarts
   841  		for index < upper {
   842  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
   843  			// index ≤ h < upper
   844  			offset := decodeRestart(i.data[i.restarts+4*h:])
   845  			// For a restart point, there are 0 bytes shared with the previous key.
   846  			// The varint encoding of 0 occupies 1 byte.
   847  			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
   848  
   849  			// Decode the key at that restart point, and compare it to the key
   850  			// sought. See the comment in readEntry for why we manually inline the
   851  			// varint decoding.
   852  			var v1 uint32
   853  			if a := *((*uint8)(ptr)); a < 128 {
   854  				v1 = uint32(a)
   855  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   856  			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   857  				v1 = uint32(b)<<7 | uint32(a)
   858  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   859  			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   860  				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   861  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   862  			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   863  				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   864  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   865  			} else {
   866  				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   867  				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   868  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   869  			}
   870  
   871  			if *((*uint8)(ptr)) < 128 {
   872  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   873  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
   874  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   875  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
   876  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   877  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
   878  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   879  			} else {
   880  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   881  			}
   882  
   883  			// Manually inlining part of base.DecodeInternalKey provides a 5-10%
   884  			// speedup on BlockIter benchmarks.
   885  			s := getBytes(ptr, int(v1))
   886  			var k []byte
   887  			if n := len(s) - 8; n >= 0 {
   888  				k = s[:n:n]
   889  			}
   890  			// Else k is invalid, and left as nil
   891  
   892  			if i.cmp(key, k) > 0 {
   893  				// The search key is greater than the user key at this restart point.
   894  				// Search beyond this restart point, since we are trying to find the
   895  				// first restart point with a user key >= the search key.
   896  				index = h + 1 // preserves f(i-1) == false
   897  			} else {
   898  				// k >= search key, so prune everything after index (since index
   899  				// satisfies the property we are looking for).
   900  				upper = h // preserves f(j) == true
   901  			}
   902  		}
   903  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
   904  		// => answer is index.
   905  	}
   906  
   907  	// index is the first restart point with key >= search key. Define the keys
   908  	// between a restart point and the next restart point as belonging to that
   909  	// restart point. Note that index could be equal to i.numRestarts, i.e., we
   910  	// are past the last restart.
   911  	//
   912  	// Since keys are strictly increasing, if index > 0 then the restart point
   913  	// at index-1 will be the first one that has some keys belonging to it that
   914  	// are less than the search key.  If index == 0, then all keys in this block
   915  	// are larger than the search key, so there is no match.
   916  	targetOffset := i.restarts
   917  	if index > 0 {
   918  		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
   919  		if index < i.numRestarts {
   920  			targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
   921  		}
   922  	} else if index == 0 {
   923  		// If index == 0 then all keys in this block are larger than the key
   924  		// sought.
   925  		i.offset = -1
   926  		i.nextOffset = 0
   927  		return nil, base.LazyValue{}
   928  	}
   929  
   930  	// Iterate from that restart point to somewhere >= the key sought, then back
   931  	// up to the previous entry. The expectation is that we'll be performing
   932  	// reverse iteration, so we cache the entries as we advance forward.
   933  	i.nextOffset = i.offset
   934  
   935  	for {
   936  		i.offset = i.nextOffset
   937  		i.readEntry()
   938  		// When hidden keys are common, there is additional optimization possible
   939  		// by not caching entries that are hidden (note that some calls to
   940  		// cacheEntry don't decode the internal key before caching, but checking
   941  		// whether a key is hidden does not require full decoding). However, we do
   942  		// need to use the blockEntry.offset in the cache for the first entry at
    943  		// the restart point to do the binary search when the cache is empty -- so
    944  		// we would need to cache that first entry (though not the key) even if it
    945  		// was hidden. Our current assumption is that if there are large numbers
   946  		// of hidden keys we will be able to skip whole blocks (using block
   947  		// property filters) so we don't bother optimizing.
   948  		hiddenPoint := i.decodeInternalKey(i.key)
   949  
   950  		// NB: we don't use the hiddenPoint return value of decodeInternalKey
    951  		// since we want to stop as soon as we reach a key >= the key sought, so
   952  		// that we can reverse.
   953  		if i.cmp(i.ikey.UserKey, key) >= 0 {
   954  			// The current key is greater than or equal to our search key. Back up to
   955  			// the previous key which was less than our search key. Note that this for
   956  			// loop will execute at least once with this if-block not being true, so
   957  			// the key we are backing up to is the last one this loop cached.
   958  			return i.Prev()
   959  		}
   960  
   961  		if i.nextOffset >= targetOffset {
   962  			// We've reached the end of the current restart block. Return the
   963  			// current key if not hidden, else call Prev().
   964  			//
   965  			// When the restart interval is 1, the first iteration of the for loop
   966  			// will bring us here. In that case ikey is backed by the block so we
   967  			// get the desired key stability guarantee for the lifetime of the
   968  			// blockIter. That is, we never cache anything and therefore never
   969  			// return a key backed by cachedBuf.
   970  			if hiddenPoint {
   971  				return i.Prev()
   972  			}
   973  			break
   974  		}
   975  
   976  		i.cacheEntry()
   977  	}
   978  
   979  	if !i.valid() {
   980  		return nil, base.LazyValue{}
   981  	}
   982  	if !i.lazyValueHandling.hasValuePrefix ||
   983  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
   984  		i.lazyValue = base.MakeInPlaceValue(i.val)
   985  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
   986  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
   987  	} else {
   988  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
   989  	}
   990  	return &i.ikey, i.lazyValue
   991  }
   992  
   993  // First implements internalIterator.First, as documented in the pebble
   994  // package.
   995  func (i *blockIter) First() (*InternalKey, base.LazyValue) {
   996  	if invariants.Enabled && i.isDataInvalidated() {
   997  		panic(errors.AssertionFailedf("invalidated blockIter used"))
   998  	}
   999  
  1000  	i.offset = 0
  1001  	if !i.valid() {
  1002  		return nil, base.LazyValue{}
  1003  	}
  1004  	i.clearCache()
  1005  	i.readEntry()
  1006  	hiddenPoint := i.decodeInternalKey(i.key)
  1007  	if hiddenPoint {
  1008  		return i.Next()
  1009  	}
  1010  	if !i.lazyValueHandling.hasValuePrefix ||
  1011  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1012  		i.lazyValue = base.MakeInPlaceValue(i.val)
  1013  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1014  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1015  	} else {
  1016  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1017  	}
  1018  	return &i.ikey, i.lazyValue
  1019  }
  1020  
  1021  func decodeRestart(b []byte) int32 {
  1022  	_ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
  1023  	return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
  1024  		uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
  1025  }
  1026  
  1027  // Last implements internalIterator.Last, as documented in the pebble package.
  1028  func (i *blockIter) Last() (*InternalKey, base.LazyValue) {
  1029  	if invariants.Enabled && i.isDataInvalidated() {
  1030  		panic(errors.AssertionFailedf("invalidated blockIter used"))
  1031  	}
  1032  
  1033  	// Seek forward from the last restart point.
  1034  	i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
  1035  	if !i.valid() {
  1036  		return nil, base.LazyValue{}
  1037  	}
  1038  
  1039  	i.readEntry()
  1040  	i.clearCache()
  1041  
  1042  	for i.nextOffset < i.restarts {
  1043  		i.cacheEntry()
  1044  		i.offset = i.nextOffset
  1045  		i.readEntry()
  1046  	}
  1047  
  1048  	hiddenPoint := i.decodeInternalKey(i.key)
  1049  	if hiddenPoint {
  1050  		return i.Prev()
  1051  	}
  1052  	if !i.lazyValueHandling.hasValuePrefix ||
  1053  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1054  		i.lazyValue = base.MakeInPlaceValue(i.val)
  1055  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1056  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1057  	} else {
  1058  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1059  	}
  1060  	return &i.ikey, i.lazyValue
  1061  }
  1062  
  1063  // Next implements internalIterator.Next, as documented in the pebble
  1064  // package.
  1065  func (i *blockIter) Next() (*InternalKey, base.LazyValue) {
  1066  	if len(i.cachedBuf) > 0 {
  1067  		// We're switching from reverse iteration to forward iteration. We need to
  1068  		// populate i.fullKey with the current key we're positioned at so that
  1069  		// readEntry() can use i.fullKey for key prefix decompression. Note that we
  1070  		// don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
  1071  		// SeekLT was the previous call, i.key may be backed by i.fullKey), but
  1072  		// copying into i.fullKey works for both cases.
  1073  		//
  1074  		// TODO(peter): Rather than clearing the cache, we could instead use the
  1075  		// cache until it is exhausted. This would likely be faster than falling
  1076  		// through to the normal forward iteration code below.
  1077  		i.fullKey = append(i.fullKey[:0], i.key...)
  1078  		i.clearCache()
  1079  	}
  1080  
  1081  start:
  1082  	i.offset = i.nextOffset
  1083  	if !i.valid() {
  1084  		return nil, base.LazyValue{}
  1085  	}
  1086  	i.readEntry()
  1087  	// Manually inlined version of i.decodeInternalKey(i.key).
  1088  	if n := len(i.key) - 8; n >= 0 {
  1089  		trailer := binary.LittleEndian.Uint64(i.key[n:])
  1090  		hiddenPoint := i.hideObsoletePoints &&
  1091  			(trailer&trailerObsoleteBit != 0)
  1092  		i.ikey.Trailer = trailer & trailerObsoleteMask
  1093  		i.ikey.UserKey = i.key[:n:n]
  1094  		if i.globalSeqNum != 0 {
  1095  			i.ikey.SetSeqNum(i.globalSeqNum)
  1096  		}
  1097  		if hiddenPoint {
  1098  			goto start
  1099  		}
  1100  	} else {
  1101  		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
  1102  		i.ikey.UserKey = nil
  1103  	}
  1104  	if !i.lazyValueHandling.hasValuePrefix ||
  1105  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1106  		i.lazyValue = base.MakeInPlaceValue(i.val)
  1107  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1108  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1109  	} else {
  1110  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1111  	}
  1112  	return &i.ikey, i.lazyValue
  1113  }
  1114  
  1115  // NextPrefix implements (base.InternalIterator).NextPrefix.
  1116  func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
  1117  	if i.lazyValueHandling.hasValuePrefix {
  1118  		return i.nextPrefixV3(succKey)
  1119  	}
  1120  	const nextsBeforeSeek = 3
  1121  	k, v := i.Next()
  1122  	for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ {
  1123  		if j >= nextsBeforeSeek {
  1124  			return i.SeekGE(succKey, base.SeekGEFlagsNone)
  1125  		}
  1126  		k, v = i.Next()
  1127  	}
  1128  	return k, v
  1129  }
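
         // Illustrative example (not part of the original file): if the block holds
         // the versions a#3,SET a#2,SET a#1,SET followed by b#7,SET and the iterator
         // is positioned on a#3,SET, then NextPrefix with succKey being the immediate
         // successor of the prefix "a" (e.g. "b" for a simple byte comparer) steps
         // over the remaining "a" versions, using at most nextsBeforeSeek calls to
         // Next before falling back to SeekGE, and returns b#7,SET.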
  1130  
  1131  func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) {
  1132  	// Doing nexts that involve a key comparison can be expensive (and the cost
  1133  	// depends on the key length), so we use the same threshold of 3 that we use
   1134  	// for TableFormatPebblev2 in blockIter.NextPrefix above. The next fast path
  1135  	// that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
  1136  	// than doing a SeekGE within the block, so we do this 16 times
  1137  	// (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
  1138  	// search for the restart consumes > 100ns. If the number of versions is >
  1139  	// 17, we will increment nextFastCount to 17, then do a binary search, and
  1140  	// on average need to find a key between two restarts, so another 8 steps
  1141  	// corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
  1142  	// steps.
  1143  	//
  1144  	// TODO(sumeer): use the configured restartInterval for the sstable when it
  1145  	// was written (which we don't currently store) instead of the default value
  1146  	// of 16.
  1147  	const nextCmpThresholdBeforeSeek = 3
  1148  	const nextFastThresholdBeforeRestarts = 16
  1149  	nextCmpCount := 0
  1150  	nextFastCount := 0
  1151  	usedRestarts := false
  1152  	// INVARIANT: blockIter is valid.
  1153  	if invariants.Enabled && !i.valid() {
  1154  		panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
  1155  	}
  1156  	prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet
  1157  	for {
  1158  		i.offset = i.nextOffset
  1159  		if !i.valid() {
  1160  			return nil, base.LazyValue{}
  1161  		}
  1162  		// Need to decode the length integers, so we can compute nextOffset.
  1163  		ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
  1164  		// This is an ugly performance hack. Reading entries from blocks is one of
  1165  		// the inner-most routines and decoding the 3 varints per-entry takes
   1166  		// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
  1167  		// us, so we do it manually. This provides a 10-15% performance improvement
  1168  		// on blockIter benchmarks on both go1.11 and go1.12.
  1169  		//
  1170  		// TODO(peter): remove this hack if go:inline is ever supported.
  1171  
  1172  		// Decode the shared key length integer.
  1173  		var shared uint32
  1174  		if a := *((*uint8)(ptr)); a < 128 {
  1175  			shared = uint32(a)
  1176  			ptr = unsafe.Pointer(uintptr(ptr) + 1)
  1177  		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1178  			shared = uint32(b)<<7 | uint32(a)
  1179  			ptr = unsafe.Pointer(uintptr(ptr) + 2)
  1180  		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1181  			shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1182  			ptr = unsafe.Pointer(uintptr(ptr) + 3)
  1183  		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1184  			shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1185  			ptr = unsafe.Pointer(uintptr(ptr) + 4)
  1186  		} else {
  1187  			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1188  			shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1189  			ptr = unsafe.Pointer(uintptr(ptr) + 5)
  1190  		}
  1191  		// Decode the unshared key length integer.
  1192  		var unshared uint32
  1193  		if a := *((*uint8)(ptr)); a < 128 {
  1194  			unshared = uint32(a)
  1195  			ptr = unsafe.Pointer(uintptr(ptr) + 1)
  1196  		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1197  			unshared = uint32(b)<<7 | uint32(a)
  1198  			ptr = unsafe.Pointer(uintptr(ptr) + 2)
  1199  		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1200  			unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1201  			ptr = unsafe.Pointer(uintptr(ptr) + 3)
  1202  		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1203  			unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1204  			ptr = unsafe.Pointer(uintptr(ptr) + 4)
  1205  		} else {
  1206  			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1207  			unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1208  			ptr = unsafe.Pointer(uintptr(ptr) + 5)
  1209  		}
  1210  		// Decode the value length integer.
  1211  		var value uint32
  1212  		if a := *((*uint8)(ptr)); a < 128 {
  1213  			value = uint32(a)
  1214  			ptr = unsafe.Pointer(uintptr(ptr) + 1)
  1215  		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1216  			value = uint32(b)<<7 | uint32(a)
  1217  			ptr = unsafe.Pointer(uintptr(ptr) + 2)
  1218  		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1219  			value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1220  			ptr = unsafe.Pointer(uintptr(ptr) + 3)
  1221  		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1222  			value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1223  			ptr = unsafe.Pointer(uintptr(ptr) + 4)
  1224  		} else {
  1225  			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1226  			value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1227  			ptr = unsafe.Pointer(uintptr(ptr) + 5)
  1228  		}
  1229  		// The starting position of the value.
  1230  		valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
  1231  		i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
  1232  		if invariants.Enabled && unshared < 8 {
  1233  			// This should not happen since only the key prefix is shared, so even
  1234  			// if the prefix length is the same as the user key length, the unshared
  1235  			// will include the trailer.
  1236  			panic(errors.AssertionFailedf("unshared %d is too small", unshared))
  1237  		}
  1238  		// The trailer is written in little endian, so the key kind is the first
  1239  		// byte in the trailer that is encoded in the slice [unshared-8:unshared].
  1240  		keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
  1241  		keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
  1242  		prefixChanged := false
  1243  		if keyKind == InternalKeyKindSet {
  1244  			if invariants.Enabled && value == 0 {
  1245  				panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
  1246  			}
  1247  			valPrefix := *((*valuePrefix)(valuePtr))
  1248  			if setHasSamePrefix(valPrefix) {
  1249  				// Fast-path. No need to assemble i.fullKey, or update i.key. We know
  1250  				// that subsequent keys will not have a shared length that is greater
  1251  				// than the prefix of the current key, which is also the prefix of
  1252  				// i.key. Since we are continuing to iterate, we don't need to
  1253  				// initialize i.ikey and i.lazyValue (these are initialized before
  1254  				// returning).
  1255  				nextFastCount++
  1256  				if nextFastCount > nextFastThresholdBeforeRestarts {
  1257  					if usedRestarts {
  1258  						// Exhausted iteration budget. This will never happen unless
  1259  						// someone is using a restart interval > 16. It is just to guard
  1260  						// against long restart intervals causing too much iteration.
  1261  						break
  1262  					}
  1263  					// Haven't used restarts yet, so find the first restart at or beyond
  1264  					// the current offset.
  1265  					targetOffset := i.offset
  1266  					var index int32
  1267  					{
  1268  						// NB: manually inlined sort.Sort is ~5% faster.
  1269  						//
  1270  						// f defined for a restart point is true iff the offset >=
  1271  						// targetOffset.
  1272  						// Define f(-1) == false and f(i.numRestarts) == true.
  1273  						// Invariant: f(index-1) == false, f(upper) == true.
  1274  						upper := i.numRestarts
  1275  						for index < upper {
  1276  							h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
  1277  							// index ≤ h < upper
  1278  							offset := decodeRestart(i.data[i.restarts+4*h:])
  1279  							if offset < targetOffset {
  1280  								index = h + 1 // preserves f(index-1) == false
  1281  							} else {
  1282  								upper = h // preserves f(upper) == true
  1283  							}
  1284  						}
  1285  						// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
  1286  						// => answer is index.
  1287  					}
  1288  					usedRestarts = true
  1289  					nextFastCount = 0
  1290  					if index == i.numRestarts {
  1291  						// Already past the last real restart, so iterate a bit more until
  1292  						// we are done with the block.
  1293  						continue
  1294  					}
  1295  					// Have some real restarts after index. NB: index is the first
  1296  					// restart at or beyond the current offset.
  1297  					startingIndex := index
  1298  					for index != i.numRestarts &&
  1299  						// The restart at index is 4 bytes written in little endian format
  1300  						// starting at i.restart+4*index. The 0th byte is the least
  1301  						// significant and the 3rd byte is the most significant. Since the
  1302  						// most significant bit of the 3rd byte is what we use for
  1303  						// encoding the set-has-same-prefix information, the indexing
  1304  						// below has +3.
  1305  						i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
  1306  						// We still have the same prefix, so move to the next restart.
  1307  						index++
  1308  					}
  1309  					// index is the first restart that did not have the same prefix.
  1310  					if index != startingIndex {
  1311  						// Managed to skip past at least one restart. Resume iteration
  1312  						// from index-1. Since nextFastCount has been reset to 0, we
  1313  						// should be able to iterate to the next prefix.
  1314  						i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
  1315  						i.readEntry()
  1316  					}
  1317  					// Else, unable to skip past any restart. Resume iteration. Since
  1318  					// nextFastCount has been reset to 0, we should be able to iterate
  1319  					// to the next prefix.
  1320  					continue
  1321  				}
  1322  				continue
  1323  			} else if prevKeyIsSet {
  1324  				prefixChanged = true
  1325  			}
  1326  		} else {
  1327  			prevKeyIsSet = false
  1328  		}
  1329  		// Slow-path cases:
  1330  		// - (Likely) The prefix has changed.
  1331  		// - (Unlikely) The prefix has not changed.
  1332  		// We assemble the key etc. under the assumption that it is the likely
  1333  		// case.
  1334  		unsharedKey := getBytes(ptr, int(unshared))
  1335  		// TODO(sumeer): move this into the else block below. This is a bit tricky
  1336  		// since the current logic assumes we have always copied the latest key
  1337  		// into fullKey, which is why when we get to the next key we can (a)
  1338  		// access i.fullKey[:shared], (b) append only the unsharedKey to
  1339  		// i.fullKey. For (a), we can access i.key[:shared] since that memory is
  1340  		// valid (even if unshared). For (b), we will need to remember whether
  1341  		// i.key refers to i.fullKey or not, and can append the unsharedKey only
  1342  		// in the former case and for the latter case need to copy the shared part
  1343  		// too. This same comment applies to the other place where we can do this
  1344  		// optimization, in readEntry().
  1345  		i.fullKey = append(i.fullKey[:shared], unsharedKey...)
  1346  		i.val = getBytes(valuePtr, int(value))
  1347  		if shared == 0 {
  1348  			// Provide stability for the key across positioning calls if the key
  1349  			// doesn't share a prefix with the previous key. This avoids requiring the
  1350  			// key to be copied if the caller knows the block has a restart interval of
  1351  			// 1. An important example of this is range-del blocks.
  1352  			i.key = unsharedKey
  1353  		} else {
  1354  			i.key = i.fullKey
  1355  		}
  1356  		// Manually inlined version of i.decodeInternalKey(i.key).
  1357  		hiddenPoint := false
  1358  		if n := len(i.key) - 8; n >= 0 {
  1359  			trailer := binary.LittleEndian.Uint64(i.key[n:])
  1360  			hiddenPoint = i.hideObsoletePoints &&
  1361  				(trailer&trailerObsoleteBit != 0)
  1362  			i.ikey.Trailer = trailer & trailerObsoleteMask
  1363  			i.ikey.UserKey = i.key[:n:n]
  1364  			if i.globalSeqNum != 0 {
  1365  				i.ikey.SetSeqNum(i.globalSeqNum)
  1366  			}
  1367  		} else {
  1368  			i.ikey.Trailer = uint64(InternalKeyKindInvalid)
  1369  			i.ikey.UserKey = nil
  1370  		}
  1371  		nextCmpCount++
  1372  		if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 {
  1373  			panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
  1374  				i.ikey.UserKey, succKey))
  1375  		}
  1376  		if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 {
  1377  			// Prefix has changed.
  1378  			if hiddenPoint {
  1379  				return i.Next()
  1380  			}
  1381  			if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
  1382  				panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
  1383  			}
  1384  			if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1385  				i.lazyValue = base.MakeInPlaceValue(i.val)
  1386  			} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1387  				i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1388  			} else {
  1389  				i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1390  			}
  1391  			return &i.ikey, i.lazyValue
  1392  		}
  1393  		// Else prefix has not changed.
  1394  
  1395  		if nextCmpCount >= nextCmpThresholdBeforeSeek {
  1396  			break
  1397  		}
  1398  	}
  1399  	return i.SeekGE(succKey, base.SeekGEFlagsNone)
  1400  }
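
// The unrolled branches in nextPrefixV3 above decode unsigned varints by
// hand: each byte contributes 7 bits, least-significant group first, and a
// byte < 128 terminates the integer. decodeUvarint32Sketch is a hypothetical,
// illustrative equivalent added for exposition; it is not used by the
// iterator, which favors the unrolled form and unsafe pointer arithmetic.
func decodeUvarint32Sketch(b []byte) (v uint32, n int) {
	var shift uint
	for n < len(b) {
		c := b[n]
		n++
		if c < 128 {
			// Terminal byte: its 7 bits are the most significant decoded so far.
			return v | uint32(c)<<shift, n
		}
		v |= uint32(c&0x7f) << shift
		shift += 7
	}
	return 0, 0 // truncated input
}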
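
// The manually inlined decodeInternalKey logic above (and in Prev) relies on
// the encoded-key layout: the user key is followed by an 8-byte little-endian
// trailer whose low byte holds the key kind, possibly with the obsolete bit
// set. decodeTrailerSketch is a hypothetical, illustrative helper showing
// that split; blockIter performs it inline to avoid call overhead.
func decodeTrailerSketch(encodedKey []byte) (userKey []byte, trailer uint64, ok bool) {
	n := len(encodedKey) - 8
	if n < 0 {
		// Too short to contain a trailer; the real code marks the key invalid.
		return nil, 0, false
	}
	// The three-index slice expression caps capacity so appends by the caller
	// cannot scribble over the trailer bytes.
	return encodedKey[:n:n], binary.LittleEndian.Uint64(encodedKey[n:]), true
}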
  1401  
  1402  // Prev implements internalIterator.Prev, as documented in the pebble
  1403  // package.
  1404  func (i *blockIter) Prev() (*InternalKey, base.LazyValue) {
  1405  start:
  1406  	for n := len(i.cached) - 1; n >= 0; n-- {
  1407  		i.nextOffset = i.offset
  1408  		e := &i.cached[n]
  1409  		i.offset = e.offset
  1410  		i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
  1411  		// Manually inlined version of i.decodeInternalKey(i.key).
  1412  		i.key = i.cachedBuf[e.keyStart:e.keyEnd]
  1413  		if n := len(i.key) - 8; n >= 0 {
  1414  			trailer := binary.LittleEndian.Uint64(i.key[n:])
  1415  			hiddenPoint := i.hideObsoletePoints &&
  1416  				(trailer&trailerObsoleteBit != 0)
  1417  			if hiddenPoint {
  1418  				continue
  1419  			}
  1420  			i.ikey.Trailer = trailer & trailerObsoleteMask
  1421  			i.ikey.UserKey = i.key[:n:n]
  1422  			if i.globalSeqNum != 0 {
  1423  				i.ikey.SetSeqNum(i.globalSeqNum)
  1424  			}
  1425  		} else {
  1426  			i.ikey.Trailer = uint64(InternalKeyKindInvalid)
  1427  			i.ikey.UserKey = nil
  1428  		}
  1429  		i.cached = i.cached[:n]
  1430  		if !i.lazyValueHandling.hasValuePrefix ||
  1431  			base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1432  			i.lazyValue = base.MakeInPlaceValue(i.val)
  1433  		} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1434  			i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1435  		} else {
  1436  			i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1437  		}
  1438  		return &i.ikey, i.lazyValue
  1439  	}
  1440  
  1441  	i.clearCache()
  1442  	if i.offset <= 0 {
  1443  		i.offset = -1
  1444  		i.nextOffset = 0
  1445  		return nil, base.LazyValue{}
  1446  	}
  1447  
  1448  	targetOffset := i.offset
  1449  	var index int32
  1450  
  1451  	{
  1452  		// NB: manually inlined sort.Sort is ~5% faster.
  1453  		//
  1454  		// Define f(-1) == false and f(i.numRestarts) == true.
  1455  		// Invariant: f(index-1) == false, f(upper) == true.
  1456  		upper := i.numRestarts
  1457  		for index < upper {
  1458  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
  1459  			// index ≤ h < upper
  1460  			offset := decodeRestart(i.data[i.restarts+4*h:])
  1461  			if offset < targetOffset {
  1462  				// Looking for the first restart that has offset >= targetOffset, so
  1463  				// ignore h and earlier.
  1464  				index = h + 1 // preserves f(index-1) == false
  1465  			} else {
  1466  				upper = h // preserves f(upper) == true
  1467  			}
  1468  		}
  1469  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
  1470  		// => answer is index.
  1471  	}
  1472  
  1473  	// index is first restart with offset >= targetOffset. Note that
  1474  	// targetOffset may not be at a restart point since one can call Prev()
  1475  	// after Next() (so the cache was not populated) and targetOffset refers to
  1476  	// the current entry. index-1 must have an offset < targetOffset (it can't
  1477  	// be equal to targetOffset since the binary search would have selected that
  1478  	// as the index).
  1479  	i.offset = 0
  1480  	if index > 0 {
  1481  		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
  1482  	}
  1483  	// TODO(sumeer): why is the else case not an error given targetOffset is a
  1484  	// valid offset.
  1485  
  1486  	i.readEntry()
  1487  
  1488  	// We stop when i.nextOffset == targetOffset since the targetOffset is the
  1489  	// entry we are stepping back from, and we don't need to cache the entry
  1490  	// before it, since it is the candidate to return.
  1491  	for i.nextOffset < targetOffset {
  1492  		i.cacheEntry()
  1493  		i.offset = i.nextOffset
  1494  		i.readEntry()
  1495  	}
  1496  
  1497  	hiddenPoint := i.decodeInternalKey(i.key)
  1498  	if hiddenPoint {
  1499  		// Use the cache.
  1500  		goto start
  1501  	}
  1502  	if !i.lazyValueHandling.hasValuePrefix ||
  1503  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1504  		i.lazyValue = base.MakeInPlaceValue(i.val)
  1505  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1506  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1507  	} else {
  1508  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1509  	}
  1510  	return &i.ikey, i.lazyValue
  1511  }
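
// Both Prev above and nextPrefixV3 binary search the restart array. Each
// restart is a 4-byte little-endian word: the most significant bit stores the
// set-has-same-prefix flag used by nextPrefixV3, and the remaining 31 bits
// store the entry offset. The hypothetical helpers below sketch the decoding
// and the search over a raw restart array; they are illustrative only, and
// the iterator itself uses decodeRestart and the manually inlined searches.
func decodeRestartWordSketch(b []byte) (offset uint32, setHasSamePrefix bool) {
	word := binary.LittleEndian.Uint32(b[:4])
	return word & (1<<31 - 1), word&(1<<31) != 0
}

func firstRestartAtOrAboveSketch(restartArray []byte, numRestarts int32, targetOffset uint32) int32 {
	// Equivalent to sort.Search: returns the first index whose restart offset
	// is >= targetOffset, or numRestarts if every restart is below it.
	index, upper := int32(0), numRestarts
	for index < upper {
		h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
		offset, _ := decodeRestartWordSketch(restartArray[4*h:])
		if offset < targetOffset {
			index = h + 1
		} else {
			upper = h
		}
	}
	return index
}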
  1512  
  1513  // Key implements internalIterator.Key, as documented in the pebble package.
  1514  func (i *blockIter) Key() *InternalKey {
  1515  	return &i.ikey
  1516  }
  1517  
  1518  func (i *blockIter) value() base.LazyValue {
  1519  	return i.lazyValue
  1520  }
  1521  
  1522  // Error implements internalIterator.Error, as documented in the pebble
  1523  // package.
  1524  func (i *blockIter) Error() error {
  1525  	return nil // infallible
  1526  }
  1527  
  1528  // Close implements internalIterator.Close, as documented in the pebble
  1529  // package.
  1530  func (i *blockIter) Close() error {
  1531  	i.handle.Release()
  1532  	i.handle = bufferHandle{}
  1533  	i.val = nil
  1534  	i.lazyValue = base.LazyValue{}
  1535  	i.lazyValueHandling.vbr = nil
  1536  	return nil
  1537  }
  1538  
  1539  func (i *blockIter) SetBounds(lower, upper []byte) {
  1540  	// This should never be called as bounds are handled by sstable.Iterator.
  1541  	panic("pebble: SetBounds unimplemented")
  1542  }
  1543  
  1544  func (i *blockIter) SetContext(_ context.Context) {}
  1545  
  1546  func (i *blockIter) valid() bool {
  1547  	return i.offset >= 0 && i.offset < i.restarts
  1548  }
  1549  
  1550  // fragmentBlockIter wraps a blockIter, implementing the
  1551  // keyspan.FragmentIterator interface. It's used for reading range deletion and
  1552  // range key blocks.
  1553  //
  1554  // Range deletions and range keys are fragmented before they're persisted to the
  1555  // block. Overlapping fragments have identical bounds.  The fragmentBlockIter
  1556  // gathers all the fragments with identical bounds within a block and returns a
  1557  // single keyspan.Span describing all the keys defined over the span.
  1558  //
  1559  // # Memory lifetime
  1560  //
  1561  // A Span returned by fragmentBlockIter is only guaranteed to be stable until
  1562  // the next call to a fragmentBlockIter positioning method. A Span's Keys slice
  1563  // may be reused, so the user must not assume it's stable.
  1564  //
  1565  // Blocks holding range deletions and range keys are configured to use a restart
  1566  // interval of 1. This provides key stability. The caller may treat the various
  1567  // byte slices (start, end, suffix, value) as stable for the lifetime of the
  1568  // iterator.
  1569  type fragmentBlockIter struct {
  1570  	blockIter blockIter
  1571  	keyBuf    [2]keyspan.Key
  1572  	span      keyspan.Span
  1573  	err       error
  1574  	dir       int8
  1575  	closeHook func(i keyspan.FragmentIterator) error
  1576  
  1577  	// elideSameSeqnum, if true, returns only the first-occurring (in forward
  1578  	// order) Key for each sequence number.
  1579  	elideSameSeqnum bool
  1580  }
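
// iterateSpansSketch is a minimal, hypothetical sketch of how a caller might
// consume a fragmentBlockIter through the keyspan.FragmentIterator interface,
// under the memory-lifetime caveats documented above: the returned Span and
// its Keys slice are only stable until the next positioning call, so the
// visit callback must copy anything it wants to retain. It is not part of
// this file's API.
func iterateSpansSketch(it keyspan.FragmentIterator, visit func(keyspan.Span)) error {
	for s := it.First(); s != nil; s = it.Next() {
		// s points at iterator-owned state; consume it before advancing.
		visit(*s)
	}
	return it.Error()
}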
  1581  
  1582  func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter {
  1583  	return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()}
  1584  }
  1585  
  1586  func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) {
  1587  	// TODO(jackson): The use of i.span.Keys to accumulate keys across multiple
  1588  	// calls to Decode is too confusing and subtle. Refactor to make it
  1589  	// explicit.
  1590  
  1591  	// Decode the contents of the fragment's value. This always includes at
  1592  	// least the end key: RANGEDELs store the end key directly as the value,
  1593  	// whereas the various range key kinds store more complicated values. The
  1594  	// details of the range key internal value format are documented within the
  1595  	// internal/rangekey package.
  1596  	switch k.Kind() {
  1597  	case base.InternalKeyKindRangeDelete:
  1598  		i.span = rangedel.Decode(*k, internalValue, i.span.Keys)
  1599  		i.err = nil
  1600  	case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
  1601  		i.span, i.err = rangekey.Decode(*k, internalValue, i.span.Keys)
  1602  	default:
  1603  		i.span = keyspan.Span{}
  1604  		i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragment of kind %d", k.Kind())
  1605  	}
  1606  }
  1607  
  1608  func (i *fragmentBlockIter) elideKeysOfSameSeqNum() {
  1609  	if invariants.Enabled {
  1610  		if !i.elideSameSeqnum || len(i.span.Keys) == 0 {
  1611  			panic("elideKeysOfSameSeqNum called when it should not be")
  1612  		}
  1613  	}
  1614  	lastSeqNum := i.span.Keys[0].SeqNum()
  1615  	k := 1
  1616  	for j := 1; j < len(i.span.Keys); j++ {
  1617  		if lastSeqNum != i.span.Keys[j].SeqNum() {
  1618  			lastSeqNum = i.span.Keys[j].SeqNum()
  1619  			i.span.Keys[k] = i.span.Keys[j]
  1620  			k++
  1621  		}
  1622  	}
  1623  	i.span.Keys = i.span.Keys[:k]
  1624  }
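
// elideAdjacentSeqNumsSketch mirrors the in-place filtering performed by
// elideKeysOfSameSeqNum above, but over a plain slice of sequence numbers so
// the effect is easy to see: only the first element of each run of equal
// adjacent values survives, e.g. [5, 5, 5, 3, 3, 1] becomes [5, 3, 1]. It is
// an illustrative, hypothetical helper and is not used by fragmentBlockIter.
func elideAdjacentSeqNumsSketch(seqNums []uint64) []uint64 {
	if len(seqNums) == 0 {
		return seqNums
	}
	k := 1
	for j := 1; j < len(seqNums); j++ {
		if seqNums[j] != seqNums[k-1] {
			seqNums[k] = seqNums[j]
			k++
		}
	}
	return seqNums[:k]
}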
  1625  
  1626  // gatherForward gathers internal keys with identical bounds. Keys defined over
  1627  // spans of the keyspace are fragmented such that any overlapping key spans have
  1628  // identical bounds. When these spans are persisted to a range deletion or range
  1629  // key block, they may be persisted as multiple internal keys in order to encode
  1630  // multiple sequence numbers or key kinds.
  1631  //
  1632  // gatherForward iterates forward, re-combining the fragmented internal keys to
  1633  // reconstruct a keyspan.Span that holds all the keys defined over the span.
  1634  func (i *fragmentBlockIter) gatherForward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span {
  1635  	i.span = keyspan.Span{}
  1636  	if k == nil || !i.blockIter.valid() {
  1637  		return nil
  1638  	}
  1639  	i.err = nil
  1640  	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
  1641  	// when a span contains few keys.
  1642  	i.span.Keys = i.keyBuf[:0]
  1643  
  1644  	// Decode the span's end key and individual keys from the value.
  1645  	internalValue := lazyValue.InPlaceValue()
  1646  	i.decodeSpanKeys(k, internalValue)
  1647  	if i.err != nil {
  1648  		return nil
  1649  	}
  1650  	prevEnd := i.span.End
  1651  
  1652  	// There might exist additional internal keys with identical bounds encoded
  1653  	// within the block. Iterate forward, accumulating all the keys with
  1654  	// identical bounds into i.span.
  1655  	k, lazyValue = i.blockIter.Next()
  1656  	internalValue = lazyValue.InPlaceValue()
  1657  	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
  1658  		i.decodeSpanKeys(k, internalValue)
  1659  		if i.err != nil {
  1660  			return nil
  1661  		}
  1662  
  1663  		// Since k indicates an equal start key, the encoded end key must
  1664  		// exactly equal the original end key from the first internal key.
  1665  		// Overlapping fragments are required to have exactly equal start and
  1666  		// end bounds.
  1667  		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
  1668  			i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
  1669  			i.span = keyspan.Span{}
  1670  			return nil
  1671  		}
  1672  		k, lazyValue = i.blockIter.Next()
  1673  		internalValue = lazyValue.InPlaceValue()
  1674  	}
  1675  	if i.elideSameSeqnum && len(i.span.Keys) > 0 {
  1676  		i.elideKeysOfSameSeqNum()
  1677  	}
  1678  	// i.blockIter is positioned over the first internal key for the next span.
  1679  	return &i.span
  1680  }
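
// gatherIdenticalBoundsSketch shows the shape of the gathering performed by
// gatherForward and gatherBackward, but over an in-memory slice of
// already-decoded spans instead of a block: consecutive fragments with
// identical [Start, End) bounds are merged into a single span whose Keys
// slice holds all of their keys. It is a hypothetical, illustrative helper;
// the real iterators decode fragments lazily and reuse i.span.
func gatherIdenticalBoundsSketch(cmp base.Compare, frags []keyspan.Span) []keyspan.Span {
	var out []keyspan.Span
	for _, f := range frags {
		if n := len(out); n > 0 &&
			cmp(out[n-1].Start, f.Start) == 0 && cmp(out[n-1].End, f.End) == 0 {
			// Same bounds as the previous fragment: fold its keys in.
			out[n-1].Keys = append(out[n-1].Keys, f.Keys...)
			continue
		}
		out = append(out, f)
	}
	return out
}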
  1681  
  1682  // gatherBackward gathers internal keys with identical bounds. Keys defined over
  1683  // spans of the keyspace are fragmented such that any overlapping key spans have
  1684  // identical bounds. When these spans are persisted to a range deletion or range
  1685  // key block, they may be persisted as multiple internal keys in order to encode
  1686  // multiple sequence numbers or key kinds.
  1687  //
  1688  // gatherBackward iterates backwards, re-combining the fragmented internal keys
  1689  // to reconstruct a keyspan.Span that holds all the keys defined over the span.
  1690  func (i *fragmentBlockIter) gatherBackward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span {
  1691  	i.span = keyspan.Span{}
  1692  	if k == nil || !i.blockIter.valid() {
  1693  		return nil
  1694  	}
  1695  	i.err = nil
  1696  	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
  1697  	// when a span contains few keys.
  1698  	i.span.Keys = i.keyBuf[:0]
  1699  
  1700  	// Decode the span's end key and individual keys from the value.
  1701  	internalValue := lazyValue.InPlaceValue()
  1702  	i.decodeSpanKeys(k, internalValue)
  1703  	if i.err != nil {
  1704  		return nil
  1705  	}
  1706  	prevEnd := i.span.End
  1707  
  1708  	// There might exist additional internal keys with identical bounds encoded
  1709  	// within the block. Iterate backward, accumulating all the keys with
  1710  	// identical bounds into i.span.
  1711  	k, lazyValue = i.blockIter.Prev()
  1712  	internalValue = lazyValue.InPlaceValue()
  1713  	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
  1714  		i.decodeSpanKeys(k, internalValue)
  1715  		if i.err != nil {
  1716  			return nil
  1717  		}
  1718  
  1719  		// Since k indicates an equal start key, the encoded end key must
  1720  		// exactly equal the original end key from the first internal key.
  1721  		// Overlapping fragments are required to have exactly equal start and
  1722  		// end bounds.
  1723  		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
  1724  			i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
  1725  			i.span = keyspan.Span{}
  1726  			return nil
  1727  		}
  1728  		k, lazyValue = i.blockIter.Prev()
  1729  		internalValue = lazyValue.InPlaceValue()
  1730  	}
  1731  	// i.blockIter is positioned over the last internal key for the previous
  1732  	// span.
  1733  
  1734  	// Backwards iteration encounters internal keys in the wrong order.
  1735  	keyspan.SortKeysByTrailer(&i.span.Keys)
  1736  
  1737  	if i.elideSameSeqnum && len(i.span.Keys) > 0 {
  1738  		i.elideKeysOfSameSeqNum()
  1739  	}
  1740  	return &i.span
  1741  }
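
// Backward gathering appends keys in the reverse of their on-disk order, so
// gatherBackward re-sorts them with keyspan.SortKeysByTrailer. Assuming that
// helper orders keys by descending trailer (newest sequence number first,
// matching forward iteration), sortKeysByTrailerDescSketch is a hypothetical
// stand-in using a dependency-free insertion sort; it exists purely for
// illustration.
func sortKeysByTrailerDescSketch(keys []keyspan.Key) {
	for j := 1; j < len(keys); j++ {
		for k := j; k > 0 && keys[k-1].Trailer < keys[k].Trailer; k-- {
			keys[k-1], keys[k] = keys[k], keys[k-1]
		}
	}
}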
  1742  
  1743  // Error implements (keyspan.FragmentIterator).Error.
  1744  func (i *fragmentBlockIter) Error() error {
  1745  	return i.err
  1746  }
  1747  
  1748  // Close implements (keyspan.FragmentIterator).Close.
  1749  func (i *fragmentBlockIter) Close() error {
  1750  	var err error
  1751  	if i.closeHook != nil {
  1752  		err = i.closeHook(i)
  1753  	}
  1754  	err = firstError(err, i.blockIter.Close())
  1755  	return err
  1756  }
  1757  
  1758  // First implements (keyspan.FragmentIterator).First.
  1759  func (i *fragmentBlockIter) First() *keyspan.Span {
  1760  	i.dir = +1
  1761  	return i.gatherForward(i.blockIter.First())
  1762  }
  1763  
  1764  // Last implements (keyspan.FragmentIterator).Last.
  1765  func (i *fragmentBlockIter) Last() *keyspan.Span {
  1766  	i.dir = -1
  1767  	return i.gatherBackward(i.blockIter.Last())
  1768  }
  1769  
  1770  // Next implements (keyspan.FragmentIterator).Next.
  1771  func (i *fragmentBlockIter) Next() *keyspan.Span {
  1772  	switch {
  1773  	case i.dir == -1 && !i.span.Valid():
  1774  		// Switching directions.
  1775  		//
  1776  		// i.blockIter is exhausted, before the first key. Move onto the first.
  1777  		i.blockIter.First()
  1778  		i.dir = +1
  1779  	case i.dir == -1 && i.span.Valid():
  1780  		// Switching directions.
  1781  		//
  1782  		// i.blockIter is currently positioned over the last internal key for
  1783  		// the previous span. Next it once to move to the first internal key
  1784  		// that makes up the current span, and gatherForward to land on the
  1785  		// first internal key making up the next span.
  1786  		//
  1787  		// In the diagram below, if the last span returned to the user during
  1788  		// reverse iteration was [b,c), i.blockIter is currently positioned at
  1789  		// [a,b). The block iter must be positioned over [d,e) to gather the
  1790  		// next span's fragments.
  1791  		//
  1792  		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
  1793  		//          ^                       ^
  1794  		//     i.blockIter                 want
  1795  		if x := i.gatherForward(i.blockIter.Next()); invariants.Enabled && !x.Valid() {
  1796  			panic("pebble: invariant violation: next entry unexpectedly invalid")
  1797  		}
  1798  		i.dir = +1
  1799  	}
  1800  	// We know that this blockIter has in-place values.
  1801  	return i.gatherForward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
  1802  }
  1803  
  1804  // Prev implements (keyspan.FragmentIterator).Prev.
  1805  func (i *fragmentBlockIter) Prev() *keyspan.Span {
  1806  	switch {
  1807  	case i.dir == +1 && !i.span.Valid():
  1808  		// Switching directions.
  1809  		//
  1810  		// i.blockIter is exhausted, after the last key. Move onto the last.
  1811  		i.blockIter.Last()
  1812  		i.dir = -1
  1813  	case i.dir == +1 && i.span.Valid():
  1814  		// Switching directions.
  1815  		//
  1816  		// i.blockIter is currently positioned over the first internal key for
  1817  		// the next span. Prev it once to move to the last internal key that
  1818  		// makes up the current span, and gatherBackward to land on the last
  1819  		// internal key making up the previous span.
  1820  		//
  1821  		// In the diagram below, if the last span returned to the user during
  1822  		// forward iteration was [b,c), i.blockIter is currently positioned at
  1823  		// [d,e). The block iter must be positioned over [a,b) to gather the
  1824  		// previous span's fragments.
  1825  		//
  1826  		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
  1827  		//          ^                       ^
  1828  		//        want                  i.blockIter
  1829  		if x := i.gatherBackward(i.blockIter.Prev()); invariants.Enabled && !x.Valid() {
  1830  			panic("pebble: invariant violation: previous entry unexpectedly invalid")
  1831  		}
  1832  		i.dir = -1
  1833  	}
  1834  	// We know that this blockIter has in-place values.
  1835  	return i.gatherBackward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
  1836  }
  1837  
  1838  // SeekGE implements (keyspan.FragmentIterator).SeekGE.
  1839  func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span {
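	// An illustrative walk-through of why SeekGE can be answered via SeekLT:
	// SeekLT(k) returns the last span whose Start is < k. If k is also < that
	// span's End, the span overlaps k and is exactly the SeekGE answer;
	// otherwise the answer is the following span, which Next() below produces.
	// For example, with spans [a,c) and [d,f), SeekGE(b) returns [a,c) from
	// the SeekLT fast path (b < c), while SeekGE(c) falls through to Next()
	// and returns [d,f).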
  1840  	if s := i.SeekLT(k); s != nil && i.blockIter.cmp(k, s.End) < 0 {
  1841  		return s
  1842  	}
  1843  	// TODO(jackson): If the above i.SeekLT(k) discovers a span but the span
  1844  	// doesn't meet the k < s.End comparison, then there's no need for the
  1845  	// SeekLT to gatherBackward.
  1846  	return i.Next()
  1847  }
  1848  
  1849  // SeekLT implements (keyspan.FragmentIterator).SeekLT.
  1850  func (i *fragmentBlockIter) SeekLT(k []byte) *keyspan.Span {
  1851  	i.dir = -1
  1852  	return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone))
  1853  }
  1854  
  1855  // String implements fmt.Stringer.
  1856  func (i *fragmentBlockIter) String() string {
  1857  	return "fragment-block-iter"
  1858  }
  1859  
  1860  // SetCloseHook implements sstable.FragmentIterator.
  1861  func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) {
  1862  	i.closeHook = fn
  1863  }