github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/block.go

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"encoding/binary"
     9  	"unsafe"
    10  
    11  	"github.com/cockroachdb/errors"
    12  	"github.com/cockroachdb/pebble/internal/base"
    13  	"github.com/cockroachdb/pebble/internal/invariants"
    14  	"github.com/cockroachdb/pebble/internal/keyspan"
    15  	"github.com/cockroachdb/pebble/internal/manual"
    16  	"github.com/cockroachdb/pebble/internal/rangedel"
    17  	"github.com/cockroachdb/pebble/internal/rangekey"
    18  )
    19  
    20  func uvarintLen(v uint32) int {
    21  	i := 0
    22  	for v >= 0x80 {
    23  		v >>= 7
    24  		i++
    25  	}
    26  	return i + 1
    27  }
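
         // Illustrative check (a sketch only, not used anywhere in this file):
         // uvarintLen agrees with the byte count produced by binary.PutUvarint for
         // any 32-bit value:
         //
         //	var tmp [binary.MaxVarintLen32]byte
         //	for _, v := range []uint32{0, 127, 128, 1 << 14, 1 << 28} {
         //		if uvarintLen(v) != binary.PutUvarint(tmp[:], uint64(v)) {
         //			panic("uvarintLen disagrees with binary.PutUvarint")
         //		}
         //	}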
    28  
    29  type blockWriter struct {
    30  	restartInterval int
    31  	nEntries        int
    32  	nextRestart     int
    33  	buf             []byte
    34  	// For datablocks in TableFormatPebblev3, we steal the most significant bit
    35  	// in restarts for encoding setHasSameKeyPrefixSinceLastRestart. This leaves
    36  	// us with 31 bits, which is more than enough (no one needs > 2GB blocks).
    37  	// Typically, restarts occur every 16 keys, and by storing this bit with the
    38  	// restart, we can optimize for the case where a user wants to skip to the
    39  	// next prefix which happens to be in the same data block, but is > 16 keys
    40  	// away. We have seen production situations with 100+ versions per MVCC key
    41  	// (which share the same prefix). Additionally, for such writers, the prefix
     42  	// compression of a key (the part shared with the preceding key) is
    43  	// limited to the prefix part of the preceding key -- this ensures that when
     44  	// doing NextPrefix (see blockIter) we don't need to assemble the full key
    45  	// for each step since by limiting the length of the shared key we are
    46  	// ensuring that any of the keys with the same prefix can be used to
    47  	// assemble the full key when the prefix does change.
    48  	restarts []uint32
    49  	// Do not read curKey directly from outside blockWriter since it can have
    50  	// the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or
    51  	// getCurUserKey() instead.
    52  	curKey []byte
    53  	// curValue excludes the optional prefix provided to
    54  	// storeWithOptionalValuePrefix.
    55  	curValue []byte
    56  	prevKey  []byte
    57  	tmp      [4]byte
    58  	// We don't know the state of the sets that were at the end of the previous
     59  	// block, so this is initially false. It may be true for the second and later
    60  	// restarts in a block. Not having inter-block information is fine since we
    61  	// will optimize by stepping through restarts only within the same block.
    62  	// Note that the first restart is the first key in the block.
    63  	setHasSameKeyPrefixSinceLastRestart bool
    64  }
    65  
    66  func (w *blockWriter) clear() {
    67  	*w = blockWriter{
    68  		buf:      w.buf[:0],
    69  		restarts: w.restarts[:0],
    70  		curKey:   w.curKey[:0],
    71  		curValue: w.curValue[:0],
    72  		prevKey:  w.prevKey[:0],
    73  	}
    74  }
    75  
    76  // MaximumBlockSize is an extremely generous maximum block size of 256MiB. We
    77  // explicitly place this limit to reserve a few bits in the restart for
    78  // internal use.
    79  const MaximumBlockSize = 1 << 28
    80  const setHasSameKeyPrefixRestartMask uint32 = 1 << 31
    81  const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111
    82  const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000
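
         // Illustrative sketch of how a restart entry packs a 31-bit block offset
         // together with the set-has-same-prefix bit (offset here is a hypothetical
         // value < MaximumBlockSize):
         //
         //	packed := uint32(offset) | setHasSameKeyPrefixRestartMask    // set the bit
         //	offsetOnly := packed &^ setHasSameKeyPrefixRestartMask       // 31-bit offset
         //	hasSamePrefix := packed&setHasSameKeyPrefixRestartMask != 0  // read the bit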
    83  
    84  func (w *blockWriter) getCurKey() InternalKey {
    85  	k := base.DecodeInternalKey(w.curKey)
    86  	k.Trailer = k.Trailer & trailerObsoleteMask
    87  	return k
    88  }
    89  
    90  func (w *blockWriter) getCurUserKey() []byte {
    91  	n := len(w.curKey) - base.InternalTrailerLen
    92  	if n < 0 {
    93  		panic(errors.AssertionFailedf("corrupt key in blockWriter buffer"))
    94  	}
    95  	return w.curKey[:n:n]
    96  }
    97  
    98  // If !addValuePrefix, the valuePrefix is ignored.
    99  func (w *blockWriter) storeWithOptionalValuePrefix(
   100  	keySize int,
   101  	value []byte,
   102  	maxSharedKeyLen int,
   103  	addValuePrefix bool,
   104  	valuePrefix valuePrefix,
   105  	setHasSameKeyPrefix bool,
   106  ) {
   107  	shared := 0
   108  	if !setHasSameKeyPrefix {
   109  		w.setHasSameKeyPrefixSinceLastRestart = false
   110  	}
   111  	if w.nEntries == w.nextRestart {
   112  		w.nextRestart = w.nEntries + w.restartInterval
   113  		restart := uint32(len(w.buf))
   114  		if w.setHasSameKeyPrefixSinceLastRestart {
   115  			restart = restart | setHasSameKeyPrefixRestartMask
   116  		}
   117  		w.setHasSameKeyPrefixSinceLastRestart = true
   118  		w.restarts = append(w.restarts, restart)
   119  	} else {
   120  		// TODO(peter): Manually inlined version of base.SharedPrefixLen(). This
   121  		// is 3% faster on BenchmarkWriter on go1.16. Remove if future versions
    122  		// show this to not be a performance win. For now, functions that use
   123  		// unsafe cannot be inlined.
   124  		n := maxSharedKeyLen
   125  		if n > len(w.prevKey) {
   126  			n = len(w.prevKey)
   127  		}
   128  		asUint64 := func(b []byte, i int) uint64 {
   129  			return binary.LittleEndian.Uint64(b[i:])
   130  		}
   131  		for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) {
   132  			shared += 8
   133  		}
   134  		for shared < n && w.curKey[shared] == w.prevKey[shared] {
   135  			shared++
   136  		}
   137  	}
   138  
   139  	lenValuePlusOptionalPrefix := len(value)
   140  	if addValuePrefix {
   141  		lenValuePlusOptionalPrefix++
   142  	}
   143  	needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + lenValuePlusOptionalPrefix
   144  	n := len(w.buf)
   145  	if cap(w.buf) < n+needed {
   146  		newCap := 2 * cap(w.buf)
   147  		if newCap == 0 {
   148  			newCap = 1024
   149  		}
   150  		for newCap < n+needed {
   151  			newCap *= 2
   152  		}
   153  		newBuf := make([]byte, n, newCap)
   154  		copy(newBuf, w.buf)
   155  		w.buf = newBuf
   156  	}
   157  	w.buf = w.buf[:n+needed]
   158  
   159  	// TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15%
   160  	// faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions
   161  	// show this to not be a performance win.
   162  	{
   163  		x := uint32(shared)
   164  		for x >= 0x80 {
   165  			w.buf[n] = byte(x) | 0x80
   166  			x >>= 7
   167  			n++
   168  		}
   169  		w.buf[n] = byte(x)
   170  		n++
   171  	}
   172  
   173  	{
   174  		x := uint32(keySize - shared)
   175  		for x >= 0x80 {
   176  			w.buf[n] = byte(x) | 0x80
   177  			x >>= 7
   178  			n++
   179  		}
   180  		w.buf[n] = byte(x)
   181  		n++
   182  	}
   183  
   184  	{
   185  		x := uint32(lenValuePlusOptionalPrefix)
   186  		for x >= 0x80 {
   187  			w.buf[n] = byte(x) | 0x80
   188  			x >>= 7
   189  			n++
   190  		}
   191  		w.buf[n] = byte(x)
   192  		n++
   193  	}
   194  
   195  	n += copy(w.buf[n:], w.curKey[shared:])
   196  	if addValuePrefix {
   197  		w.buf[n : n+1][0] = byte(valuePrefix)
   198  		n++
   199  	}
   200  	n += copy(w.buf[n:], value)
   201  	w.buf = w.buf[:n]
   202  
   203  	w.curValue = w.buf[n-len(value):]
   204  
   205  	w.nEntries++
   206  }
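
         // For reference, the three manually inlined loops above emit exactly what
         // binary.PutUvarint would, so each entry appended to w.buf has the layout
         // (a sketch of the encoding, not additional code):
         //
         //	varint(shared) varint(keySize-shared) varint(len(value)[+1 if prefixed])
         //	curKey[shared:] [valuePrefix] value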
   207  
   208  func (w *blockWriter) add(key InternalKey, value []byte) {
   209  	w.addWithOptionalValuePrefix(
   210  		key, false, value, len(key.UserKey), false, 0, false)
   211  }
   212  
   213  // Callers that always set addValuePrefix to false should use add() instead.
   214  //
   215  // isObsolete indicates whether this key-value pair is obsolete in this
   216  // sstable (only applicable when writing data blocks) -- see the comment in
   217  // table.go and the longer one in format.go. addValuePrefix adds a 1 byte
   218  // prefix to the value, specified in valuePrefix -- this is used for data
   219  // blocks in TableFormatPebblev3 onwards for SETs (see the comment in
   220  // format.go, with more details in value_block.go). setHasSameKeyPrefix is
   221  // also used in TableFormatPebblev3 onwards for SETs.
   222  func (w *blockWriter) addWithOptionalValuePrefix(
   223  	key InternalKey,
   224  	isObsolete bool,
   225  	value []byte,
   226  	maxSharedKeyLen int,
   227  	addValuePrefix bool,
   228  	valuePrefix valuePrefix,
   229  	setHasSameKeyPrefix bool,
   230  ) {
   231  	w.curKey, w.prevKey = w.prevKey, w.curKey
   232  
   233  	size := key.Size()
   234  	if cap(w.curKey) < size {
   235  		w.curKey = make([]byte, 0, size*2)
   236  	}
   237  	w.curKey = w.curKey[:size]
   238  	if isObsolete {
   239  		key.Trailer = key.Trailer | trailerObsoleteBit
   240  	}
   241  	key.Encode(w.curKey)
   242  
   243  	w.storeWithOptionalValuePrefix(
   244  		size, value, maxSharedKeyLen, addValuePrefix, valuePrefix, setHasSameKeyPrefix)
   245  }
   246  
   247  func (w *blockWriter) finish() []byte {
   248  	// Write the restart points to the buffer.
   249  	if w.nEntries == 0 {
   250  		// Every block must have at least one restart point.
   251  		if cap(w.restarts) > 0 {
   252  			w.restarts = w.restarts[:1]
   253  			w.restarts[0] = 0
   254  		} else {
   255  			w.restarts = append(w.restarts, 0)
   256  		}
   257  	}
   258  	tmp4 := w.tmp[:4]
   259  	for _, x := range w.restarts {
   260  		binary.LittleEndian.PutUint32(tmp4, x)
   261  		w.buf = append(w.buf, tmp4...)
   262  	}
   263  	binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts)))
   264  	w.buf = append(w.buf, tmp4...)
   265  	result := w.buf
   266  
   267  	// Reset the block state.
   268  	w.nEntries = 0
   269  	w.nextRestart = 0
   270  	w.buf = w.buf[:0]
   271  	w.restarts = w.restarts[:0]
   272  	return result
   273  }
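
         // A finished block therefore has the layout (sketch; block being the
         // returned []byte):
         //
         //	entries... | restart[0] ... restart[numRestarts-1] | uint32(numRestarts)
         //
         // where each restart offset and the trailing count are little-endian
         // uint32s, so the count can be recovered with, e.g.:
         //
         //	numRestarts := binary.LittleEndian.Uint32(block[len(block)-4:])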
   274  
   275  // emptyBlockSize holds the size of an empty block. Every block ends
   276  // in a uint32 trailer encoding the number of restart points within the
   277  // block.
   278  const emptyBlockSize = 4
   279  
   280  func (w *blockWriter) estimatedSize() int {
   281  	return len(w.buf) + 4*len(w.restarts) + emptyBlockSize
   282  }
   283  
   284  type blockEntry struct {
   285  	offset   int32
   286  	keyStart int32
   287  	keyEnd   int32
   288  	valStart int32
   289  	valSize  int32
   290  }
   291  
   292  // blockIter is an iterator over a single block of data.
   293  //
   294  // A blockIter provides an additional guarantee around key stability when a
   295  // block has a restart interval of 1 (i.e. when there is no prefix
   296  // compression). Key stability refers to whether the InternalKey.UserKey bytes
   297  // returned by a positioning call will remain stable after a subsequent
   298  // positioning call. The normal case is that a positioning call will invalidate
   299  // any previously returned InternalKey.UserKey. If a block has a restart
   300  // interval of 1 (no prefix compression), blockIter guarantees that
   301  // InternalKey.UserKey will point to the key as stored in the block itself
   302  // which will remain valid until the blockIter is closed. The key stability
   303  // guarantee is used by the range tombstone and range key code, which knows that
   304  // the respective blocks are always encoded with a restart interval of 1. This
   305  // per-block key stability guarantee is sufficient for range tombstones and
    306  // range keys as they are always encoded in a single block.
   307  //
   308  // A blockIter also provides a value stability guarantee for range deletions and
   309  // range keys since there is only a single range deletion and range key block
   310  // per sstable and the blockIter will not release the bytes for the block until
   311  // it is closed.
   312  //
   313  // Note on why blockIter knows about lazyValueHandling:
   314  //
   315  // blockIter's positioning functions (that return a LazyValue), are too
   316  // complex to inline even prior to lazyValueHandling. blockIter.Next and
   317  // blockIter.First were by far the cheapest and had costs 195 and 180
    318  // respectively, both of which exceed the budget of 80. We initially tried to keep
   319  // the lazyValueHandling logic out of blockIter by wrapping it with a
   320  // lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this
   321  // wrapped iter. The functions in lazyValueDataBlockIter were simple, in that
   322  // they called the corresponding blockIter func and then decided whether the
   323  // value was in fact in-place (so return immediately) or needed further
   324  // handling. But these also turned out too costly for mid-stack inlining since
   325  // simple calls like the following have a high cost that is barely under the
   326  // budget of 80
   327  //
   328  //	k, v := i.data.SeekGE(key, flags)  // cost 74
   329  //	k, v := i.data.Next()              // cost 72
   330  //
   331  // We have 2 options for minimizing performance regressions:
   332  //   - Include the lazyValueHandling logic in the already non-inlineable
   333  //     blockIter functions: Since most of the time is spent in data block iters,
   334  //     it is acceptable to take the small hit of unnecessary branching (which
   335  //     hopefully branch prediction will predict correctly) for other kinds of
   336  //     blocks.
   337  //   - Duplicate the logic of singleLevelIterator and twoLevelIterator for the
   338  //     v3 sstable and only use the aforementioned lazyValueDataBlockIter for a
   339  //     v3 sstable. We would want to manage these copies via code generation.
   340  //
   341  // We have picked the first option here.
   342  type blockIter struct {
   343  	cmp Compare
   344  	// offset is the byte index that marks where the current key/value is
   345  	// encoded in the block.
   346  	offset int32
   347  	// nextOffset is the byte index where the next key/value is encoded in the
   348  	// block.
   349  	nextOffset int32
   350  	// A "restart point" in a block is a point where the full key is encoded,
   351  	// instead of just having a suffix of the key encoded. See readEntry() for
   352  	// how prefix compression of keys works. Keys in between two restart points
   353  	// only have a suffix encoded in the block. When restart interval is 1, no
   354  	// prefix compression of keys happens. This is the case with range tombstone
   355  	// blocks.
   356  	//
   357  	// All restart offsets are listed in increasing order in
   358  	// i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
   359  	// 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
   360  	// therefore be seen as the point where data in the block ends, and a list
   361  	// of offsets of all restart points begins.
   362  	restarts int32
   363  	// Number of restart points in this block. Encoded at the end of the block
   364  	// as a uint32.
   365  	numRestarts  int32
   366  	globalSeqNum uint64
   367  	ptr          unsafe.Pointer
   368  	data         []byte
   369  	// key contains the raw key the iterator is currently pointed at. This may
   370  	// point directly to data stored in the block (for a key which has no prefix
   371  	// compression), to fullKey (for a prefix compressed key), or to a slice of
   372  	// data stored in cachedBuf (during reverse iteration).
   373  	key []byte
   374  	// fullKey is a buffer used for key prefix decompression.
   375  	fullKey []byte
   376  	// val contains the value the iterator is currently pointed at. If non-nil,
   377  	// this points to a slice of the block data.
   378  	val []byte
   379  	// lazyValue is val turned into a LazyValue, whenever a positioning method
   380  	// returns a non-nil key-value pair.
   381  	lazyValue base.LazyValue
   382  	// ikey contains the decoded InternalKey the iterator is currently pointed
   383  	// at. Note that the memory backing ikey.UserKey is either data stored
   384  	// directly in the block, fullKey, or cachedBuf. The key stability guarantee
   385  	// for blocks built with a restart interval of 1 is achieved by having
   386  	// ikey.UserKey always point to data stored directly in the block.
   387  	ikey InternalKey
   388  	// cached and cachedBuf are used during reverse iteration. They are needed
   389  	// because we can't perform prefix decoding in reverse, only in the forward
   390  	// direction. In order to iterate in reverse, we decode and cache the entries
   391  	// between two restart points.
   392  	//
   393  	// Note that cached[len(cached)-1] contains the previous entry to the one the
   394  	// blockIter is currently pointed at. As usual, nextOffset will contain the
   395  	// offset of the next entry. During reverse iteration, nextOffset will be
   396  	// updated to point to offset, and we'll set the blockIter to point at the
   397  	// entry cached[len(cached)-1]. See Prev() for more details.
   398  	//
   399  	// For a block encoded with a restart interval of 1, cached and cachedBuf
   400  	// will not be used as there are no prefix compressed entries between the
   401  	// restart points.
   402  	cached    []blockEntry
   403  	cachedBuf []byte
   404  	handle    bufferHandle
    405  	// firstUserKey is used for block iteration over already-loaded blocks.
   406  	firstUserKey      []byte
   407  	lazyValueHandling struct {
   408  		vbr            *valueBlockReader
   409  		hasValuePrefix bool
   410  	}
   411  	hideObsoletePoints bool
   412  }
   413  
   414  // blockIter implements the base.InternalIterator interface.
   415  var _ base.InternalIterator = (*blockIter)(nil)
   416  
   417  func newBlockIter(cmp Compare, block block) (*blockIter, error) {
   418  	i := &blockIter{}
   419  	return i, i.init(cmp, block, 0, false)
   420  }
   421  
   422  func (i *blockIter) String() string {
   423  	return "block"
   424  }
   425  
   426  func (i *blockIter) init(
   427  	cmp Compare, block block, globalSeqNum uint64, hideObsoletePoints bool,
   428  ) error {
   429  	numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
   430  	if numRestarts == 0 {
   431  		return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)")
   432  	}
   433  	i.cmp = cmp
   434  	i.restarts = int32(len(block)) - 4*(1+numRestarts)
   435  	i.numRestarts = numRestarts
   436  	i.globalSeqNum = globalSeqNum
   437  	i.ptr = unsafe.Pointer(&block[0])
   438  	i.data = block
   439  	i.fullKey = i.fullKey[:0]
   440  	i.val = nil
   441  	i.hideObsoletePoints = hideObsoletePoints
   442  	i.clearCache()
   443  	if i.restarts > 0 {
   444  		if err := i.readFirstKey(); err != nil {
   445  			return err
   446  		}
   447  	} else {
   448  		// Block is empty.
   449  		i.firstUserKey = nil
   450  	}
   451  	return nil
   452  }
   453  
   454  // NB: two cases of hideObsoletePoints:
   455  //   - Local sstable iteration: globalSeqNum will be set iff the sstable was
   456  //     ingested.
   457  //   - Foreign sstable iteration: globalSeqNum is always set.
   458  func (i *blockIter) initHandle(
   459  	cmp Compare, block bufferHandle, globalSeqNum uint64, hideObsoletePoints bool,
   460  ) error {
   461  	i.handle.Release()
   462  	i.handle = block
   463  	return i.init(cmp, block.Get(), globalSeqNum, hideObsoletePoints)
   464  }
   465  
   466  func (i *blockIter) invalidate() {
   467  	i.clearCache()
   468  	i.offset = 0
   469  	i.nextOffset = 0
   470  	i.restarts = 0
   471  	i.numRestarts = 0
   472  	i.data = nil
   473  }
   474  
   475  // isDataInvalidated returns true when the blockIter has been invalidated
   476  // using an invalidate call. NB: this is different from blockIter.Valid
   477  // which is part of the InternalIterator implementation.
   478  func (i *blockIter) isDataInvalidated() bool {
   479  	return i.data == nil
   480  }
   481  
   482  func (i *blockIter) resetForReuse() blockIter {
   483  	return blockIter{
   484  		fullKey:   i.fullKey[:0],
   485  		cached:    i.cached[:0],
   486  		cachedBuf: i.cachedBuf[:0],
   487  		data:      nil,
   488  	}
   489  }
   490  
   491  func (i *blockIter) readEntry() {
   492  	ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
   493  
   494  	// This is an ugly performance hack. Reading entries from blocks is one of
   495  	// the inner-most routines and decoding the 3 varints per-entry takes
    496  	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
   497  	// us, so we do it manually. This provides a 10-15% performance improvement
   498  	// on blockIter benchmarks on both go1.11 and go1.12.
   499  	//
   500  	// TODO(peter): remove this hack if go:inline is ever supported.
   501  
   502  	var shared uint32
   503  	if a := *((*uint8)(ptr)); a < 128 {
   504  		shared = uint32(a)
   505  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   506  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   507  		shared = uint32(b)<<7 | uint32(a)
   508  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   509  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   510  		shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   511  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   512  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   513  		shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   514  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   515  	} else {
   516  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   517  		shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   518  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   519  	}
   520  
   521  	var unshared uint32
   522  	if a := *((*uint8)(ptr)); a < 128 {
   523  		unshared = uint32(a)
   524  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   525  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   526  		unshared = uint32(b)<<7 | uint32(a)
   527  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   528  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   529  		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   530  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   531  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   532  		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   533  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   534  	} else {
   535  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   536  		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   537  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   538  	}
   539  
   540  	var value uint32
   541  	if a := *((*uint8)(ptr)); a < 128 {
   542  		value = uint32(a)
   543  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   544  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   545  		value = uint32(b)<<7 | uint32(a)
   546  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   547  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   548  		value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   549  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   550  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   551  		value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   552  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   553  	} else {
   554  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   555  		value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   556  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   557  	}
   558  
   559  	unsharedKey := getBytes(ptr, int(unshared))
   560  	// TODO(sumeer): move this into the else block below.
   561  	i.fullKey = append(i.fullKey[:shared], unsharedKey...)
   562  	if shared == 0 {
   563  		// Provide stability for the key across positioning calls if the key
    564  		// doesn't share a prefix with the previous key. This avoids requiring the
   565  		// key to be copied if the caller knows the block has a restart interval of
   566  		// 1. An important example of this is range-del blocks.
   567  		i.key = unsharedKey
   568  	} else {
   569  		i.key = i.fullKey
   570  	}
   571  	ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
   572  	i.val = getBytes(ptr, int(value))
   573  	i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
   574  }
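
         // For reference, the manually inlined varint decoding in readEntry is
         // equivalent to the following slower sketch using encoding/binary (not used
         // on the hot path):
         //
         //	shared, n1 := binary.Uvarint(i.data[i.offset:])
         //	unshared, n2 := binary.Uvarint(i.data[int(i.offset)+n1:])
         //	value, n3 := binary.Uvarint(i.data[int(i.offset)+n1+n2:])
         //	keyStart := int(i.offset) + n1 + n2 + n3
         //	// unshared key bytes: i.data[keyStart : keyStart+int(unshared)]
         //	// value bytes (of length value) follow immediately after the key bytes.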
   575  
   576  func (i *blockIter) readFirstKey() error {
   577  	ptr := i.ptr
   578  
   579  	// This is an ugly performance hack. Reading entries from blocks is one of
   580  	// the inner-most routines and decoding the 3 varints per-entry takes
    581  	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
   582  	// us, so we do it manually. This provides a 10-15% performance improvement
   583  	// on blockIter benchmarks on both go1.11 and go1.12.
   584  	//
   585  	// TODO(peter): remove this hack if go:inline is ever supported.
   586  
   587  	if shared := *((*uint8)(ptr)); shared == 0 {
   588  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   589  	} else {
   590  		// The shared length is != 0, which is invalid.
   591  		panic("first key in block must have zero shared length")
   592  	}
   593  
   594  	var unshared uint32
   595  	if a := *((*uint8)(ptr)); a < 128 {
   596  		unshared = uint32(a)
   597  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   598  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   599  		unshared = uint32(b)<<7 | uint32(a)
   600  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   601  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   602  		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   603  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   604  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   605  		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   606  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   607  	} else {
   608  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   609  		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   610  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   611  	}
   612  
   613  	// Skip the value length.
   614  	if a := *((*uint8)(ptr)); a < 128 {
   615  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   616  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
   617  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   618  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
   619  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   620  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
   621  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   622  	} else {
   623  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   624  	}
   625  
   626  	firstKey := getBytes(ptr, int(unshared))
   627  	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
   628  	// BlockIter benchmarks.
   629  	if n := len(firstKey) - 8; n >= 0 {
   630  		i.firstUserKey = firstKey[:n:n]
   631  	} else {
   632  		i.firstUserKey = nil
   633  		return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
   634  	}
   635  	return nil
   636  }
   637  
   638  // The sstable internal obsolete bit is set when writing a block and unset by
   639  // blockIter, so no code outside block writing/reading code ever sees it.
   640  const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit)
   641  const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask)
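
         // For illustration, the 8-byte little-endian trailer packs the sequence
         // number and kind as seqNum<<8 | kind (seqNum and kind being hypothetical
         // values), with the obsolete bit stolen from the kind byte:
         //
         //	written := (seqNum<<8 | uint64(kind)) | trailerObsoleteBit // obsolete key as stored
         //	visible := written & trailerObsoleteMask                   // obsolete bit stripped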
   642  
   643  func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) {
   644  	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
   645  	// BlockIter benchmarks.
   646  	if n := len(key) - 8; n >= 0 {
   647  		trailer := binary.LittleEndian.Uint64(key[n:])
   648  		hiddenPoint = i.hideObsoletePoints &&
   649  			(trailer&trailerObsoleteBit != 0)
   650  		i.ikey.Trailer = trailer & trailerObsoleteMask
   651  		i.ikey.UserKey = key[:n:n]
   652  		if i.globalSeqNum != 0 {
   653  			i.ikey.SetSeqNum(i.globalSeqNum)
   654  		}
   655  	} else {
   656  		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
   657  		i.ikey.UserKey = nil
   658  	}
   659  	return hiddenPoint
   660  }
   661  
   662  func (i *blockIter) clearCache() {
   663  	i.cached = i.cached[:0]
   664  	i.cachedBuf = i.cachedBuf[:0]
   665  }
   666  
   667  func (i *blockIter) cacheEntry() {
   668  	var valStart int32
   669  	valSize := int32(len(i.val))
   670  	if valSize > 0 {
   671  		valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
   672  	}
   673  
   674  	i.cached = append(i.cached, blockEntry{
   675  		offset:   i.offset,
   676  		keyStart: int32(len(i.cachedBuf)),
   677  		keyEnd:   int32(len(i.cachedBuf) + len(i.key)),
   678  		valStart: valStart,
   679  		valSize:  valSize,
   680  	})
   681  	i.cachedBuf = append(i.cachedBuf, i.key...)
   682  }
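
         // During reverse iteration a cached entry is rematerialized roughly as
         // follows (a sketch; the actual logic lives in Prev, outside this excerpt):
         //
         //	e := i.cached[len(i.cached)-1]
         //	key := i.cachedBuf[e.keyStart:e.keyEnd]
         //	val := getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))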
   683  
   684  func (i *blockIter) getFirstUserKey() []byte {
   685  	return i.firstUserKey
   686  }
   687  
   688  // SeekGE implements internalIterator.SeekGE, as documented in the pebble
   689  // package.
   690  func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
   691  	if invariants.Enabled && i.isDataInvalidated() {
   692  		panic(errors.AssertionFailedf("invalidated blockIter used"))
   693  	}
   694  
   695  	i.clearCache()
    696  	// Find the index of the smallest restart point whose key is >= the key
   697  	// sought; index will be numRestarts if there is no such restart point.
   698  	i.offset = 0
   699  	var index int32
   700  
   701  	{
    702  		// NB: manually inlined sort.Search is ~5% faster.
   703  		//
   704  		// Define f(-1) == false and f(n) == true.
   705  		// Invariant: f(index-1) == false, f(upper) == true.
   706  		upper := i.numRestarts
   707  		for index < upper {
   708  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
   709  			// index ≤ h < upper
   710  			offset := decodeRestart(i.data[i.restarts+4*h:])
   711  			// For a restart point, there are 0 bytes shared with the previous key.
   712  			// The varint encoding of 0 occupies 1 byte.
   713  			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
   714  
   715  			// Decode the key at that restart point, and compare it to the key
   716  			// sought. See the comment in readEntry for why we manually inline the
   717  			// varint decoding.
   718  			var v1 uint32
   719  			if a := *((*uint8)(ptr)); a < 128 {
   720  				v1 = uint32(a)
   721  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   722  			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   723  				v1 = uint32(b)<<7 | uint32(a)
   724  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   725  			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   726  				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   727  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   728  			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   729  				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   730  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   731  			} else {
   732  				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   733  				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   734  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   735  			}
   736  
   737  			if *((*uint8)(ptr)) < 128 {
   738  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   739  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
   740  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   741  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
   742  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   743  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
   744  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   745  			} else {
   746  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   747  			}
   748  
   749  			// Manually inlining part of base.DecodeInternalKey provides a 5-10%
   750  			// speedup on BlockIter benchmarks.
   751  			s := getBytes(ptr, int(v1))
   752  			var k []byte
   753  			if n := len(s) - 8; n >= 0 {
   754  				k = s[:n:n]
   755  			}
   756  			// Else k is invalid, and left as nil
   757  
   758  			if i.cmp(key, k) > 0 {
   759  				// The search key is greater than the user key at this restart point.
   760  				// Search beyond this restart point, since we are trying to find the
   761  				// first restart point with a user key >= the search key.
   762  				index = h + 1 // preserves f(i-1) == false
   763  			} else {
   764  				// k >= search key, so prune everything after index (since index
   765  				// satisfies the property we are looking for).
   766  				upper = h // preserves f(j) == true
   767  			}
   768  		}
   769  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
   770  		// => answer is index.
   771  	}
   772  
   773  	// index is the first restart point with key >= search key. Define the keys
   774  	// between a restart point and the next restart point as belonging to that
   775  	// restart point.
   776  	//
   777  	// Since keys are strictly increasing, if index > 0 then the restart point
   778  	// at index-1 will be the first one that has some keys belonging to it that
   779  	// could be equal to the search key.  If index == 0, then all keys in this
   780  	// block are larger than the key sought, and offset remains at zero.
   781  	if index > 0 {
   782  		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
   783  	}
   784  	i.readEntry()
   785  	hiddenPoint := i.decodeInternalKey(i.key)
   786  
   787  	// Iterate from that restart point to somewhere >= the key sought.
   788  	if !i.valid() {
   789  		return nil, base.LazyValue{}
   790  	}
   791  	if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 {
   792  		// Initialize i.lazyValue
   793  		if !i.lazyValueHandling.hasValuePrefix ||
   794  			base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
   795  			i.lazyValue = base.MakeInPlaceValue(i.val)
   796  		} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
   797  			i.lazyValue = base.MakeInPlaceValue(i.val[1:])
   798  		} else {
   799  			i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
   800  		}
   801  		return &i.ikey, i.lazyValue
   802  	}
   803  	for i.Next(); i.valid(); i.Next() {
   804  		if i.cmp(i.ikey.UserKey, key) >= 0 {
   805  			// i.Next() has already initialized i.lazyValue.
   806  			return &i.ikey, i.lazyValue
   807  		}
   808  	}
   809  	return nil, base.LazyValue{}
   810  }
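
         // The manually inlined binary search in SeekGE corresponds to the following
         // sort.Search formulation (a sketch; restartUserKey is a hypothetical helper
         // that decodes the user key at restart point h):
         //
         //	index := sort.Search(int(i.numRestarts), func(h int) bool {
         //		return i.cmp(key, restartUserKey(h)) <= 0
         //	})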
   811  
   812  // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
   813  // pebble package.
   814  func (i *blockIter) SeekPrefixGE(
   815  	prefix, key []byte, flags base.SeekGEFlags,
   816  ) (*base.InternalKey, base.LazyValue) {
   817  	// This should never be called as prefix iteration is handled by sstable.Iterator.
   818  	panic("pebble: SeekPrefixGE unimplemented")
   819  }
   820  
   821  // SeekLT implements internalIterator.SeekLT, as documented in the pebble
   822  // package.
   823  func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
   824  	if invariants.Enabled && i.isDataInvalidated() {
   825  		panic(errors.AssertionFailedf("invalidated blockIter used"))
   826  	}
   827  
   828  	i.clearCache()
   829  	// Find the index of the smallest restart point whose key is >= the key
   830  	// sought; index will be numRestarts if there is no such restart point.
   831  	i.offset = 0
   832  	var index int32
   833  
   834  	{
   835  		// NB: manually inlined sort.Search is ~5% faster.
   836  		//
   837  		// Define f(-1) == false and f(n) == true.
   838  		// Invariant: f(index-1) == false, f(upper) == true.
   839  		upper := i.numRestarts
   840  		for index < upper {
   841  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
   842  			// index ≤ h < upper
   843  			offset := decodeRestart(i.data[i.restarts+4*h:])
   844  			// For a restart point, there are 0 bytes shared with the previous key.
   845  			// The varint encoding of 0 occupies 1 byte.
   846  			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
   847  
   848  			// Decode the key at that restart point, and compare it to the key
   849  			// sought. See the comment in readEntry for why we manually inline the
   850  			// varint decoding.
   851  			var v1 uint32
   852  			if a := *((*uint8)(ptr)); a < 128 {
   853  				v1 = uint32(a)
   854  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   855  			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   856  				v1 = uint32(b)<<7 | uint32(a)
   857  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   858  			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   859  				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   860  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   861  			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   862  				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   863  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   864  			} else {
   865  				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   866  				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   867  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   868  			}
   869  
   870  			if *((*uint8)(ptr)) < 128 {
   871  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   872  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
   873  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   874  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
   875  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   876  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
   877  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   878  			} else {
   879  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   880  			}
   881  
   882  			// Manually inlining part of base.DecodeInternalKey provides a 5-10%
   883  			// speedup on BlockIter benchmarks.
   884  			s := getBytes(ptr, int(v1))
   885  			var k []byte
   886  			if n := len(s) - 8; n >= 0 {
   887  				k = s[:n:n]
   888  			}
   889  			// Else k is invalid, and left as nil
   890  
   891  			if i.cmp(key, k) > 0 {
   892  				// The search key is greater than the user key at this restart point.
   893  				// Search beyond this restart point, since we are trying to find the
   894  				// first restart point with a user key >= the search key.
   895  				index = h + 1 // preserves f(i-1) == false
   896  			} else {
   897  				// k >= search key, so prune everything after index (since index
   898  				// satisfies the property we are looking for).
   899  				upper = h // preserves f(j) == true
   900  			}
   901  		}
   902  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
   903  		// => answer is index.
   904  	}
   905  
   906  	// index is the first restart point with key >= search key. Define the keys
   907  	// between a restart point and the next restart point as belonging to that
   908  	// restart point. Note that index could be equal to i.numRestarts, i.e., we
   909  	// are past the last restart.
   910  	//
   911  	// Since keys are strictly increasing, if index > 0 then the restart point
   912  	// at index-1 will be the first one that has some keys belonging to it that
   913  	// are less than the search key.  If index == 0, then all keys in this block
   914  	// are larger than the search key, so there is no match.
   915  	targetOffset := i.restarts
   916  	if index > 0 {
   917  		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
   918  		if index < i.numRestarts {
   919  			targetOffset = decodeRestart(i.data[i.restarts+4*(index):])
   920  		}
   921  	} else if index == 0 {
   922  		// If index == 0 then all keys in this block are larger than the key
   923  		// sought.
   924  		i.offset = -1
   925  		i.nextOffset = 0
   926  		return nil, base.LazyValue{}
   927  	}
   928  
   929  	// Iterate from that restart point to somewhere >= the key sought, then back
   930  	// up to the previous entry. The expectation is that we'll be performing
   931  	// reverse iteration, so we cache the entries as we advance forward.
   932  	i.nextOffset = i.offset
   933  
   934  	for {
   935  		i.offset = i.nextOffset
   936  		i.readEntry()
   937  		// When hidden keys are common, there is additional optimization possible
   938  		// by not caching entries that are hidden (note that some calls to
   939  		// cacheEntry don't decode the internal key before caching, but checking
   940  		// whether a key is hidden does not require full decoding). However, we do
   941  		// need to use the blockEntry.offset in the cache for the first entry at
   942  		// the reset point to do the binary search when the cache is empty -- so
    943  		// the restart point to do the binary search when the cache is empty -- so
    944  		// we would need to cache that first entry (though not the key) even if it
    945  		// was hidden. Our current assumption is that if there are large numbers
   946  		// property filters) so we don't bother optimizing.
   947  		hiddenPoint := i.decodeInternalKey(i.key)
   948  
   949  		// NB: we don't use the hiddenPoint return value of decodeInternalKey
    950  		// since we want to stop as soon as we reach a key >= the key sought, so
   951  		// that we can reverse.
   952  		if i.cmp(i.ikey.UserKey, key) >= 0 {
   953  			// The current key is greater than or equal to our search key. Back up to
   954  			// the previous key which was less than our search key. Note that this for
   955  			// loop will execute at least once with this if-block not being true, so
   956  			// the key we are backing up to is the last one this loop cached.
   957  			return i.Prev()
   958  		}
   959  
   960  		if i.nextOffset >= targetOffset {
   961  			// We've reached the end of the current restart block. Return the
   962  			// current key if not hidden, else call Prev().
   963  			//
   964  			// When the restart interval is 1, the first iteration of the for loop
   965  			// will bring us here. In that case ikey is backed by the block so we
   966  			// get the desired key stability guarantee for the lifetime of the
   967  			// blockIter. That is, we never cache anything and therefore never
   968  			// return a key backed by cachedBuf.
   969  			if hiddenPoint {
   970  				return i.Prev()
   971  			}
   972  			break
   973  		}
   974  
   975  		i.cacheEntry()
   976  	}
   977  
   978  	if !i.valid() {
   979  		return nil, base.LazyValue{}
   980  	}
   981  	if !i.lazyValueHandling.hasValuePrefix ||
   982  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
   983  		i.lazyValue = base.MakeInPlaceValue(i.val)
   984  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
   985  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
   986  	} else {
   987  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
   988  	}
   989  	return &i.ikey, i.lazyValue
   990  }
   991  
   992  // First implements internalIterator.First, as documented in the pebble
   993  // package.
   994  func (i *blockIter) First() (*InternalKey, base.LazyValue) {
   995  	if invariants.Enabled && i.isDataInvalidated() {
   996  		panic(errors.AssertionFailedf("invalidated blockIter used"))
   997  	}
   998  
   999  	i.offset = 0
  1000  	if !i.valid() {
  1001  		return nil, base.LazyValue{}
  1002  	}
  1003  	i.clearCache()
  1004  	i.readEntry()
  1005  	hiddenPoint := i.decodeInternalKey(i.key)
  1006  	if hiddenPoint {
  1007  		return i.Next()
  1008  	}
  1009  	if !i.lazyValueHandling.hasValuePrefix ||
  1010  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1011  		i.lazyValue = base.MakeInPlaceValue(i.val)
  1012  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1013  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1014  	} else {
  1015  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1016  	}
  1017  	return &i.ikey, i.lazyValue
  1018  }
  1019  
  1020  func decodeRestart(b []byte) int32 {
  1021  	_ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
  1022  	return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
  1023  		uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24)
  1024  }
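
         // decodeRestart deliberately masks off the stolen high bit. Where the
         // set-has-same-prefix bit itself is needed (see nextPrefixV3), it is read from
         // the most significant byte of the little-endian restart entry, e.g. for a
         // restart index h:
         //
         //	hasSamePrefix := i.data[i.restarts+4*h+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0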
  1025  
  1026  // Last implements internalIterator.Last, as documented in the pebble package.
  1027  func (i *blockIter) Last() (*InternalKey, base.LazyValue) {
  1028  	if invariants.Enabled && i.isDataInvalidated() {
  1029  		panic(errors.AssertionFailedf("invalidated blockIter used"))
  1030  	}
  1031  
  1032  	// Seek forward from the last restart point.
  1033  	i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):])
  1034  	if !i.valid() {
  1035  		return nil, base.LazyValue{}
  1036  	}
  1037  
  1038  	i.readEntry()
  1039  	i.clearCache()
  1040  
  1041  	for i.nextOffset < i.restarts {
  1042  		i.cacheEntry()
  1043  		i.offset = i.nextOffset
  1044  		i.readEntry()
  1045  	}
  1046  
  1047  	hiddenPoint := i.decodeInternalKey(i.key)
  1048  	if hiddenPoint {
  1049  		return i.Prev()
  1050  	}
  1051  	if !i.lazyValueHandling.hasValuePrefix ||
  1052  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1053  		i.lazyValue = base.MakeInPlaceValue(i.val)
  1054  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1055  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1056  	} else {
  1057  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1058  	}
  1059  	return &i.ikey, i.lazyValue
  1060  }
  1061  
  1062  // Next implements internalIterator.Next, as documented in the pebble
  1063  // package.
  1064  func (i *blockIter) Next() (*InternalKey, base.LazyValue) {
  1065  	if len(i.cachedBuf) > 0 {
  1066  		// We're switching from reverse iteration to forward iteration. We need to
  1067  		// populate i.fullKey with the current key we're positioned at so that
  1068  		// readEntry() can use i.fullKey for key prefix decompression. Note that we
  1069  		// don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
  1070  		// SeekLT was the previous call, i.key may be backed by i.fullKey), but
  1071  		// copying into i.fullKey works for both cases.
  1072  		//
  1073  		// TODO(peter): Rather than clearing the cache, we could instead use the
  1074  		// cache until it is exhausted. This would likely be faster than falling
  1075  		// through to the normal forward iteration code below.
  1076  		i.fullKey = append(i.fullKey[:0], i.key...)
  1077  		i.clearCache()
  1078  	}
  1079  
  1080  start:
  1081  	i.offset = i.nextOffset
  1082  	if !i.valid() {
  1083  		return nil, base.LazyValue{}
  1084  	}
  1085  	i.readEntry()
  1086  	// Manually inlined version of i.decodeInternalKey(i.key).
  1087  	if n := len(i.key) - 8; n >= 0 {
  1088  		trailer := binary.LittleEndian.Uint64(i.key[n:])
  1089  		hiddenPoint := i.hideObsoletePoints &&
  1090  			(trailer&trailerObsoleteBit != 0)
  1091  		i.ikey.Trailer = trailer & trailerObsoleteMask
  1092  		i.ikey.UserKey = i.key[:n:n]
  1093  		if i.globalSeqNum != 0 {
  1094  			i.ikey.SetSeqNum(i.globalSeqNum)
  1095  		}
  1096  		if hiddenPoint {
  1097  			goto start
  1098  		}
  1099  	} else {
  1100  		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
  1101  		i.ikey.UserKey = nil
  1102  	}
  1103  	if !i.lazyValueHandling.hasValuePrefix ||
  1104  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1105  		i.lazyValue = base.MakeInPlaceValue(i.val)
  1106  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1107  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1108  	} else {
  1109  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1110  	}
  1111  	return &i.ikey, i.lazyValue
  1112  }
  1113  
  1114  // NextPrefix implements (base.InternalIterator).NextPrefix.
  1115  func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
  1116  	if i.lazyValueHandling.hasValuePrefix {
  1117  		return i.nextPrefixV3(succKey)
  1118  	}
  1119  	const nextsBeforeSeek = 3
  1120  	k, v := i.Next()
  1121  	for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ {
  1122  		if j >= nextsBeforeSeek {
  1123  			return i.SeekGE(succKey, base.SeekGEFlagsNone)
  1124  		}
  1125  		k, v = i.Next()
  1126  	}
  1127  	return k, v
  1128  }
  1129  
  1130  func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) {
  1131  	// Doing nexts that involve a key comparison can be expensive (and the cost
  1132  	// depends on the key length), so we use the same threshold of 3 that we use
   1133  	// for TableFormatPebblev2 in blockIter.NextPrefix above. The next fast path
  1134  	// that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster
  1135  	// than doing a SeekGE within the block, so we do this 16 times
  1136  	// (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary
  1137  	// search for the restart consumes > 100ns. If the number of versions is >
  1138  	// 17, we will increment nextFastCount to 17, then do a binary search, and
  1139  	// on average need to find a key between two restarts, so another 8 steps
  1140  	// corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such
  1141  	// steps.
  1142  	//
  1143  	// TODO(sumeer): use the configured restartInterval for the sstable when it
  1144  	// was written (which we don't currently store) instead of the default value
  1145  	// of 16.
  1146  	const nextCmpThresholdBeforeSeek = 3
  1147  	const nextFastThresholdBeforeRestarts = 16
  1148  	nextCmpCount := 0
  1149  	nextFastCount := 0
  1150  	usedRestarts := false
  1151  	// INVARIANT: blockIter is valid.
  1152  	if invariants.Enabled && !i.valid() {
  1153  		panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter"))
  1154  	}
  1155  	prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet
  1156  	for {
  1157  		i.offset = i.nextOffset
  1158  		if !i.valid() {
  1159  			return nil, base.LazyValue{}
  1160  		}
  1161  		// Need to decode the length integers, so we can compute nextOffset.
  1162  		ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
  1163  		// This is an ugly performance hack. Reading entries from blocks is one of
  1164  		// the inner-most routines and decoding the 3 varints per-entry takes
   1165  		// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
  1166  		// us, so we do it manually. This provides a 10-15% performance improvement
  1167  		// on blockIter benchmarks on both go1.11 and go1.12.
  1168  		//
  1169  		// TODO(peter): remove this hack if go:inline is ever supported.
  1170  
  1171  		// Decode the shared key length integer.
  1172  		var shared uint32
  1173  		if a := *((*uint8)(ptr)); a < 128 {
  1174  			shared = uint32(a)
  1175  			ptr = unsafe.Pointer(uintptr(ptr) + 1)
  1176  		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1177  			shared = uint32(b)<<7 | uint32(a)
  1178  			ptr = unsafe.Pointer(uintptr(ptr) + 2)
  1179  		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1180  			shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1181  			ptr = unsafe.Pointer(uintptr(ptr) + 3)
  1182  		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1183  			shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1184  			ptr = unsafe.Pointer(uintptr(ptr) + 4)
  1185  		} else {
  1186  			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1187  			shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1188  			ptr = unsafe.Pointer(uintptr(ptr) + 5)
  1189  		}
  1190  		// Decode the unshared key length integer.
  1191  		var unshared uint32
  1192  		if a := *((*uint8)(ptr)); a < 128 {
  1193  			unshared = uint32(a)
  1194  			ptr = unsafe.Pointer(uintptr(ptr) + 1)
  1195  		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1196  			unshared = uint32(b)<<7 | uint32(a)
  1197  			ptr = unsafe.Pointer(uintptr(ptr) + 2)
  1198  		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1199  			unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1200  			ptr = unsafe.Pointer(uintptr(ptr) + 3)
  1201  		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1202  			unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1203  			ptr = unsafe.Pointer(uintptr(ptr) + 4)
  1204  		} else {
  1205  			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1206  			unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1207  			ptr = unsafe.Pointer(uintptr(ptr) + 5)
  1208  		}
  1209  		// Decode the value length integer.
  1210  		var value uint32
  1211  		if a := *((*uint8)(ptr)); a < 128 {
  1212  			value = uint32(a)
  1213  			ptr = unsafe.Pointer(uintptr(ptr) + 1)
  1214  		} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1215  			value = uint32(b)<<7 | uint32(a)
  1216  			ptr = unsafe.Pointer(uintptr(ptr) + 2)
  1217  		} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1218  			value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1219  			ptr = unsafe.Pointer(uintptr(ptr) + 3)
  1220  		} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1221  			value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1222  			ptr = unsafe.Pointer(uintptr(ptr) + 4)
  1223  		} else {
  1224  			d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1225  			value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1226  			ptr = unsafe.Pointer(uintptr(ptr) + 5)
  1227  		}
  1228  		// The starting position of the value.
  1229  		valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
  1230  		i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value)
  1231  		if invariants.Enabled && unshared < 8 {
  1232  			// This should not happen since only the key prefix is shared, so even
   1233  			// if the prefix length is the same as the user key length, the unshared
   1234  			// length will include the trailer.
  1235  			panic(errors.AssertionFailedf("unshared %d is too small", unshared))
  1236  		}
  1237  		// The trailer is written in little endian, so the key kind is the first
  1238  		// byte in the trailer that is encoded in the slice [unshared-8:unshared].
  1239  		keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8])
  1240  		keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask
  1241  		prefixChanged := false
  1242  		if keyKind == InternalKeyKindSet {
  1243  			if invariants.Enabled && value == 0 {
  1244  				panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix"))
  1245  			}
  1246  			valPrefix := *((*valuePrefix)(valuePtr))
  1247  			if setHasSamePrefix(valPrefix) {
  1248  				// Fast-path. No need to assemble i.fullKey, or update i.key. We know
  1249  				// that subsequent keys will not have a shared length that is greater
  1250  				// than the prefix of the current key, which is also the prefix of
  1251  				// i.key. Since we are continuing to iterate, we don't need to
  1252  				// initialize i.ikey and i.lazyValue (these are initialized before
  1253  				// returning).
  1254  				nextFastCount++
  1255  				if nextFastCount > nextFastThresholdBeforeRestarts {
  1256  					if usedRestarts {
  1257  						// Exhausted iteration budget. This will never happen unless
  1258  						// someone is using a restart interval > 16. It is just to guard
  1259  						// against long restart intervals causing too much iteration.
  1260  						break
  1261  					}
  1262  					// Haven't used restarts yet, so find the first restart at or beyond
  1263  					// the current offset.
  1264  					targetOffset := i.offset
  1265  					var index int32
  1266  					{
  1267  						// NB: manually inlined sort.Search is ~5% faster.
  1268  						//
  1269  						// f defined for a restart point is true iff the offset >=
  1270  						// targetOffset.
  1271  						// Define f(-1) == false and f(i.numRestarts) == true.
  1272  						// Invariant: f(index-1) == false, f(upper) == true.
  1273  						upper := i.numRestarts
  1274  						for index < upper {
  1275  							h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
  1276  							// index ≤ h < upper
  1277  							offset := decodeRestart(i.data[i.restarts+4*h:])
  1278  							if offset < targetOffset {
  1279  								index = h + 1 // preserves f(index-1) == false
  1280  							} else {
  1281  								upper = h // preserves f(upper) == true
  1282  							}
  1283  						}
  1284  						// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
  1285  						// => answer is index.
  1286  					}
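        					// NB: the loop above is a hand-inlined binary search. A rough
        					// equivalent using the standard library (shown only as a sketch;
        					// it would require importing sort) is:
        					//
        					//	index = int32(sort.Search(int(i.numRestarts), func(j int) bool {
        					//		return decodeRestart(i.data[i.restarts+4*int32(j):]) >= targetOffset
        					//	}))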
  1287  					usedRestarts = true
  1288  					nextFastCount = 0
  1289  					if index == i.numRestarts {
  1290  						// Already past the last real restart, so iterate a bit more until
  1291  						// we are done with the block.
  1292  						continue
  1293  					}
  1294  					// Have some real restarts after index. NB: index is the first
  1295  					// restart at or beyond the current offset.
  1296  					startingIndex := index
  1297  					for index != i.numRestarts &&
  1298  						// The restart at index is 4 bytes written in little endian format
  1299  						// starting at i.restarts+4*index. The 0th byte is the least
  1300  						// significant and the 3rd byte is the most significant. Since the
  1301  						// most significant bit of the 3rd byte is what we use for
  1302  						// encoding the set-has-same-prefix information, the indexing
  1303  						// below has +3.
  1304  						i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 {
  1305  						// We still have the same prefix, so move to the next restart.
  1306  						index++
  1307  					}
  1308  					// index is the first restart that did not have the same prefix.
  1309  					if index != startingIndex {
  1310  						// Managed to skip past at least one restart. Resume iteration
  1311  						// from index-1. Since nextFastCount has been reset to 0, we
  1312  						// should be able to iterate to the next prefix.
  1313  						i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
  1314  						i.readEntry()
  1315  					}
  1316  					// Else, unable to skip past any restart. Resume iteration. Since
  1317  					// nextFastCount has been reset to 0, we should be able to iterate
  1318  					// to the next prefix.
  1319  					continue
  1320  				}
  1321  				continue
  1322  			} else if prevKeyIsSet {
  1323  				prefixChanged = true
  1324  			}
  1325  		} else {
  1326  			prevKeyIsSet = false
  1327  		}
  1328  		// Slow-path cases:
  1329  		// - (Likely) The prefix has changed.
  1330  		// - (Unlikely) The prefix has not changed.
  1331  		// We assemble the key etc. under the assumption that it is the likely
  1332  		// case.
  1333  		unsharedKey := getBytes(ptr, int(unshared))
  1334  		// TODO(sumeer): move this into the else block below. This is a bit tricky
  1335  		// since the current logic assumes we have always copied the latest key
  1336  		// into fullKey, which is why when we get to the next key we can (a)
  1337  		// access i.fullKey[:shared], (b) append only the unsharedKey to
  1338  		// i.fullKey. For (a), we can access i.key[:shared] since that memory is
  1339  		// valid (even if unshared). For (b), we will need to remember whether
  1340  		// i.key refers to i.fullKey or not, and can append the unsharedKey only
  1341  		// in the former case and for the latter case need to copy the shared part
  1342  		// too. This same comment applies to the other place where we can do this
  1343  		// optimization, in readEntry().
  1344  		i.fullKey = append(i.fullKey[:shared], unsharedKey...)
  1345  		i.val = getBytes(valuePtr, int(value))
  1346  		if shared == 0 {
  1347  			// Provide stability for the key across positioning calls if the key
  1348  			// doesn't share a prefix with the previous key. This avoids requiring the
  1349  			// key to be copied if the caller knows the block has a restart interval of
  1350  			// 1. An important example of this is range-del blocks.
  1351  			i.key = unsharedKey
  1352  		} else {
  1353  			i.key = i.fullKey
  1354  		}
  1355  		// Manually inlined version of i.decodeInternalKey(i.key).
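        		// The 8-byte trailer packs the sequence number and key kind as
        		// (seqNum<<8 | kind); the obsolete marker bit travels in the kind byte
        		// and is stripped below via trailerObsoleteMask.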
  1356  		hiddenPoint := false
  1357  		if n := len(i.key) - 8; n >= 0 {
  1358  			trailer := binary.LittleEndian.Uint64(i.key[n:])
  1359  			hiddenPoint = i.hideObsoletePoints &&
  1360  				(trailer&trailerObsoleteBit != 0)
  1361  			i.ikey.Trailer = trailer & trailerObsoleteMask
  1362  			i.ikey.UserKey = i.key[:n:n]
  1363  			if i.globalSeqNum != 0 {
  1364  				i.ikey.SetSeqNum(i.globalSeqNum)
  1365  			}
  1366  		} else {
  1367  			i.ikey.Trailer = uint64(InternalKeyKindInvalid)
  1368  			i.ikey.UserKey = nil
  1369  		}
  1370  		nextCmpCount++
  1371  		if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 {
  1372  			panic(errors.AssertionFailedf("prefix should have changed but %x < %x",
  1373  				i.ikey.UserKey, succKey))
  1374  		}
  1375  		if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 {
  1376  			// Prefix has changed.
  1377  			if hiddenPoint {
  1378  				return i.Next()
  1379  			}
  1380  			if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix {
  1381  				panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable"))
  1382  			}
  1383  			if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1384  				i.lazyValue = base.MakeInPlaceValue(i.val)
  1385  			} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1386  				i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1387  			} else {
  1388  				i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1389  			}
  1390  			return &i.ikey, i.lazyValue
  1391  		}
  1392  		// Else prefix has not changed.
  1393  
  1394  		if nextCmpCount >= nextCmpThresholdBeforeSeek {
  1395  			break
  1396  		}
  1397  	}
  1398  	return i.SeekGE(succKey, base.SeekGEFlagsNone)
  1399  }
  1400  
  1401  // Prev implements internalIterator.Prev, as documented in the pebble
  1402  // package.
  1403  func (i *blockIter) Prev() (*InternalKey, base.LazyValue) {
  1404  start:
  1405  	for n := len(i.cached) - 1; n >= 0; n-- {
  1406  		i.nextOffset = i.offset
  1407  		e := &i.cached[n]
  1408  		i.offset = e.offset
  1409  		i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
  1410  		// Manually inlined version of i.decodeInternalKey(i.key).
  1411  		i.key = i.cachedBuf[e.keyStart:e.keyEnd]
  1412  		if n := len(i.key) - 8; n >= 0 {
  1413  			trailer := binary.LittleEndian.Uint64(i.key[n:])
  1414  			hiddenPoint := i.hideObsoletePoints &&
  1415  				(trailer&trailerObsoleteBit != 0)
  1416  			if hiddenPoint {
  1417  				continue
  1418  			}
  1419  			i.ikey.Trailer = trailer & trailerObsoleteMask
  1420  			i.ikey.UserKey = i.key[:n:n]
  1421  			if i.globalSeqNum != 0 {
  1422  				i.ikey.SetSeqNum(i.globalSeqNum)
  1423  			}
  1424  		} else {
  1425  			i.ikey.Trailer = uint64(InternalKeyKindInvalid)
  1426  			i.ikey.UserKey = nil
  1427  		}
  1428  		i.cached = i.cached[:n]
  1429  		if !i.lazyValueHandling.hasValuePrefix ||
  1430  			base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1431  			i.lazyValue = base.MakeInPlaceValue(i.val)
  1432  		} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1433  			i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1434  		} else {
  1435  			i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1436  		}
  1437  		return &i.ikey, i.lazyValue
  1438  	}
  1439  
  1440  	i.clearCache()
  1441  	if i.offset <= 0 {
  1442  		i.offset = -1
  1443  		i.nextOffset = 0
  1444  		return nil, base.LazyValue{}
  1445  	}
  1446  
  1447  	targetOffset := i.offset
  1448  	var index int32
  1449  
  1450  	{
  1451  		// NB: manually inlined sort.Search is ~5% faster.
  1452  		//
  1453  		// Define f(-1) == false and f(i.numRestarts) == true.
  1454  		// Invariant: f(index-1) == false, f(upper) == true.
  1455  		upper := i.numRestarts
  1456  		for index < upper {
  1457  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
  1458  			// index ≤ h < upper
  1459  			offset := decodeRestart(i.data[i.restarts+4*h:])
  1460  			if offset < targetOffset {
  1461  				// Looking for the first restart that has offset >= targetOffset, so
  1462  				// ignore h and earlier.
  1463  				index = h + 1 // preserves f(index-1) == false
  1464  			} else {
  1465  				upper = h // preserves f(upper) == true
  1466  			}
  1467  		}
  1468  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
  1469  		// => answer is index.
  1470  	}
  1471  
  1472  	// index is first restart with offset >= targetOffset. Note that
  1473  	// targetOffset may not be at a restart point since one can call Prev()
  1474  	// after Next() (so the cache was not populated) and targetOffset refers to
  1475  	// the current entry. index-1 must have an offset < targetOffset (it can't
  1476  	// be equal to targetOffset since the binary search would have selected that
  1477  	// as the index).
  1478  	i.offset = 0
  1479  	if index > 0 {
  1480  		i.offset = decodeRestart(i.data[i.restarts+4*(index-1):])
  1481  	}
  1482  	// TODO(sumeer): why is the else case not an error, given that targetOffset
  1483  	// is a valid offset?
  1484  
  1485  	i.readEntry()
  1486  
  1487  	// We stop when i.nextOffset == targetOffset since targetOffset is the offset
  1488  	// of the entry we are stepping back from. We don't need to cache the entry
  1489  	// before it, since that entry is the candidate to return.
  1490  	for i.nextOffset < targetOffset {
  1491  		i.cacheEntry()
  1492  		i.offset = i.nextOffset
  1493  		i.readEntry()
  1494  	}
  1495  
  1496  	hiddenPoint := i.decodeInternalKey(i.key)
  1497  	if hiddenPoint {
  1498  		// Use the cache.
  1499  		goto start
  1500  	}
  1501  	if !i.lazyValueHandling.hasValuePrefix ||
  1502  		base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet {
  1503  		i.lazyValue = base.MakeInPlaceValue(i.val)
  1504  	} else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) {
  1505  		i.lazyValue = base.MakeInPlaceValue(i.val[1:])
  1506  	} else {
  1507  		i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val)
  1508  	}
  1509  	return &i.ikey, i.lazyValue
  1510  }
  1511  
  1512  // Key implements internalIterator.Key, as documented in the pebble package.
  1513  func (i *blockIter) Key() *InternalKey {
  1514  	return &i.ikey
  1515  }
  1516  
  1517  func (i *blockIter) value() base.LazyValue {
  1518  	return i.lazyValue
  1519  }
  1520  
  1521  // Error implements internalIterator.Error, as documented in the pebble
  1522  // package.
  1523  func (i *blockIter) Error() error {
  1524  	return nil // infallible
  1525  }
  1526  
  1527  // Close implements internalIterator.Close, as documented in the pebble
  1528  // package.
  1529  func (i *blockIter) Close() error {
  1530  	i.handle.Release()
  1531  	i.handle = bufferHandle{}
  1532  	i.val = nil
  1533  	i.lazyValue = base.LazyValue{}
  1534  	i.lazyValueHandling.vbr = nil
  1535  	return nil
  1536  }
  1537  
  1538  func (i *blockIter) SetBounds(lower, upper []byte) {
  1539  	// This should never be called as bounds are handled by sstable.Iterator.
  1540  	panic("pebble: SetBounds unimplemented")
  1541  }
  1542  
  1543  func (i *blockIter) valid() bool {
  1544  	return i.offset >= 0 && i.offset < i.restarts
  1545  }
  1546  
  1547  // fragmentBlockIter wraps a blockIter, implementing the
  1548  // keyspan.FragmentIterator interface. It's used for reading range deletion and
  1549  // range key blocks.
  1550  //
  1551  // Range deletions and range keys are fragmented before they're persisted to the
  1552  // block. Overlapping fragments have identical bounds.  The fragmentBlockIter
  1553  // gathers all the fragments with identical bounds within a block and returns a
  1554  // single keyspan.Span describing all the keys defined over the span.
  1555  //
  1556  // # Memory lifetime
  1557  //
  1558  // A Span returned by fragmentBlockIter is only guaranteed to be stable until
  1559  // the next call to a fragmentBlockIter positioning method. A Span's Keys slice
  1560  // may be reused, so the user must not assume it's stable.
  1561  //
  1562  // Blocks holding range deletions and range keys are configured to use a restart
  1563  // interval of 1. This provides key stability. The caller may treat the various
  1564  // byte slices (start, end, suffix, value) as stable for the lifetime of the
  1565  // iterator.
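        //
        // A minimal usage sketch (hypothetical caller; the concrete wiring lives in
        // the sstable Reader and its iterators):
        //
        //	var it fragmentBlockIter
        //	// ... initialize it.blockIter over a range-del or range-key block ...
        //	for s := it.First(); s != nil; s = it.Next() {
        //		_ = s // s is a fully gathered keyspan.Span; copy it if retaining
        //	}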
  1566  type fragmentBlockIter struct {
  1567  	blockIter blockIter
  1568  	keyBuf    [2]keyspan.Key
  1569  	span      keyspan.Span
  1570  	err       error
  1571  	dir       int8
  1572  	closeHook func(i keyspan.FragmentIterator) error
  1573  
  1574  	// elideSameSeqnum, if true, causes only the first-occurring (in forward
  1575  	// order) Key for each sequence number to be returned.
  1576  	elideSameSeqnum bool
  1577  }
  1578  
  1579  func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter {
  1580  	return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()}
  1581  }
  1582  
  1583  func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) {
  1584  	// TODO(jackson): The use of i.span.Keys to accumulate keys across multiple
  1585  	// calls to Decode is too confusing and subtle. Refactor to make it
  1586  	// explicit.
  1587  
  1588  	// Decode the contents of the fragment's value. This always includes at
  1589  	// least the end key: RANGEDELs store the end key directly as the value,
  1590  	// whereas the various range key kinds use a more complicated encoding. The
  1591  	// details of the range key internal value format are documented within the
  1592  	// internal/rangekey package.
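        	// For example (illustrative values), a RANGEDEL fragment with internal key
        	// a#5,RANGEDEL and value "c" decodes to the span [a, c) carrying a single
        	// keyspan.Key with that trailer.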
  1593  	switch k.Kind() {
  1594  	case base.InternalKeyKindRangeDelete:
  1595  		i.span = rangedel.Decode(*k, internalValue, i.span.Keys)
  1596  		i.err = nil
  1597  	case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
  1598  		i.span, i.err = rangekey.Decode(*k, internalValue, i.span.Keys)
  1599  	default:
  1600  		i.span = keyspan.Span{}
  1601  		i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragment of kind %d", k.Kind())
  1602  	}
  1603  }
  1604  
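        // elideKeysOfSameSeqNum collapses adjacent Keys that share a sequence
        // number, keeping only the first-occurring Key of each run. For example
        // (hypothetical seqnums), Keys with seqnums [5, 5, 3, 2, 2] are reduced to
        // [5, 3, 2].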
  1605  func (i *fragmentBlockIter) elideKeysOfSameSeqNum() {
  1606  	if invariants.Enabled {
  1607  		if !i.elideSameSeqnum || len(i.span.Keys) == 0 {
  1608  			panic("elideKeysOfSameSeqNum called when it should not be")
  1609  		}
  1610  	}
  1611  	lastSeqNum := i.span.Keys[0].SeqNum()
  1612  	k := 1
  1613  	for j := 1; j < len(i.span.Keys); j++ {
  1614  		if lastSeqNum != i.span.Keys[j].SeqNum() {
  1615  			lastSeqNum = i.span.Keys[j].SeqNum()
  1616  			i.span.Keys[k] = i.span.Keys[j]
  1617  			k++
  1618  		}
  1619  	}
  1620  	i.span.Keys = i.span.Keys[:k]
  1621  }
  1622  
  1623  // gatherForward gathers internal keys with identical bounds. Keys defined over
  1624  // spans of the keyspace are fragmented such that any overlapping key spans have
  1625  // identical bounds. When these spans are persisted to a range deletion or range
  1626  // key block, they may be persisted as multiple internal keys in order to encode
  1627  // multiple sequence numbers or key kinds.
  1628  //
  1629  // gatherForward iterates forward, re-combining the fragmented internal keys to
  1630  // reconstruct a keyspan.Span that holds all the keys defined over the span.
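        //
        // For example (illustrative), consecutive fragments a-c#5,RANGEKEYSET and
        // a-c#3,RANGEKEYUNSET would be gathered into a single Span with Start=a,
        // End=c, and Keys holding both the #5 set and the #3 unset.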
  1631  func (i *fragmentBlockIter) gatherForward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span {
  1632  	i.span = keyspan.Span{}
  1633  	if k == nil || !i.blockIter.valid() {
  1634  		return nil
  1635  	}
  1636  	i.err = nil
  1637  	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
  1638  	// when a span contains few keys.
  1639  	i.span.Keys = i.keyBuf[:0]
  1640  
  1641  	// Decode the span's end key and individual keys from the value.
  1642  	internalValue := lazyValue.InPlaceValue()
  1643  	i.decodeSpanKeys(k, internalValue)
  1644  	if i.err != nil {
  1645  		return nil
  1646  	}
  1647  	prevEnd := i.span.End
  1648  
  1649  	// There might exist additional internal keys with identical bounds encoded
  1650  	// within the block. Iterate forward, accumulating all the keys with
  1651  	// identical bounds into i.span.
  1652  	k, lazyValue = i.blockIter.Next()
  1653  	internalValue = lazyValue.InPlaceValue()
  1654  	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
  1655  		i.decodeSpanKeys(k, internalValue)
  1656  		if i.err != nil {
  1657  			return nil
  1658  		}
  1659  
  1660  		// Since k indicates an equal start key, the encoded end key must
  1661  		// exactly equal the original end key from the first internal key.
  1662  		// Overlapping fragments are required to have exactly equal start and
  1663  		// end bounds.
  1664  		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
  1665  			i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
  1666  			i.span = keyspan.Span{}
  1667  			return nil
  1668  		}
  1669  		k, lazyValue = i.blockIter.Next()
  1670  		internalValue = lazyValue.InPlaceValue()
  1671  	}
  1672  	if i.elideSameSeqnum && len(i.span.Keys) > 0 {
  1673  		i.elideKeysOfSameSeqNum()
  1674  	}
  1675  	// i.blockIter is positioned over the first internal key for the next span.
  1676  	return &i.span
  1677  }
  1678  
  1679  // gatherBackward gathers internal keys with identical bounds. Keys defined over
  1680  // spans of the keyspace are fragmented such that any overlapping key spans have
  1681  // identical bounds. When these spans are persisted to a range deletion or range
  1682  // key block, they may be persisted as multiple internal keys in order to encode
  1683  // multiple sequence numbers or key kinds.
  1684  //
  1685  // gatherBackward iterates backwards, re-combining the fragmented internal keys
  1686  // to reconstruct a keyspan.Span that holds all the keys defined over the span.
  1687  func (i *fragmentBlockIter) gatherBackward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span {
  1688  	i.span = keyspan.Span{}
  1689  	if k == nil || !i.blockIter.valid() {
  1690  		return nil
  1691  	}
  1692  	i.err = nil
  1693  	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
  1694  	// when a span contains few keys.
  1695  	i.span.Keys = i.keyBuf[:0]
  1696  
  1697  	// Decode the span's end key and individual keys from the value.
  1698  	internalValue := lazyValue.InPlaceValue()
  1699  	i.decodeSpanKeys(k, internalValue)
  1700  	if i.err != nil {
  1701  		return nil
  1702  	}
  1703  	prevEnd := i.span.End
  1704  
  1705  	// There might exist additional internal keys with identical bounds encoded
  1706  	// within the block. Iterate backward, accumulating all the keys with
  1707  	// identical bounds into i.span.
  1708  	k, lazyValue = i.blockIter.Prev()
  1709  	internalValue = lazyValue.InPlaceValue()
  1710  	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
  1711  		i.decodeSpanKeys(k, internalValue)
  1712  		if i.err != nil {
  1713  			return nil
  1714  		}
  1715  
  1716  		// Since k indicates an equal start key, the encoded end key must
  1717  		// exactly equal the original end key from the first internal key.
  1718  		// Overlapping fragments are required to have exactly equal start and
  1719  		// end bounds.
  1720  		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
  1721  			i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation")
  1722  			i.span = keyspan.Span{}
  1723  			return nil
  1724  		}
  1725  		k, lazyValue = i.blockIter.Prev()
  1726  		internalValue = lazyValue.InPlaceValue()
  1727  	}
  1728  	// i.blockIter is positioned over the last internal key for the previous
  1729  	// span.
  1730  
  1731  	// Backwards iteration encounters internal keys in the wrong order.
  1732  	keyspan.SortKeysByTrailer(&i.span.Keys)
  1733  
  1734  	if i.elideSameSeqnum && len(i.span.Keys) > 0 {
  1735  		i.elideKeysOfSameSeqNum()
  1736  	}
  1737  	return &i.span
  1738  }
  1739  
  1740  // Error implements (keyspan.FragmentIterator).Error.
  1741  func (i *fragmentBlockIter) Error() error {
  1742  	return i.err
  1743  }
  1744  
  1745  // Close implements (keyspan.FragmentIterator).Close.
  1746  func (i *fragmentBlockIter) Close() error {
  1747  	var err error
  1748  	if i.closeHook != nil {
  1749  		err = i.closeHook(i)
  1750  	}
  1751  	err = firstError(err, i.blockIter.Close())
  1752  	return err
  1753  }
  1754  
  1755  // First implements (keyspan.FragmentIterator).First.
  1756  func (i *fragmentBlockIter) First() *keyspan.Span {
  1757  	i.dir = +1
  1758  	return i.gatherForward(i.blockIter.First())
  1759  }
  1760  
  1761  // Last implements (keyspan.FragmentIterator).Last.
  1762  func (i *fragmentBlockIter) Last() *keyspan.Span {
  1763  	i.dir = -1
  1764  	return i.gatherBackward(i.blockIter.Last())
  1765  }
  1766  
  1767  // Next implements (keyspan.FragmentIterator).Next.
  1768  func (i *fragmentBlockIter) Next() *keyspan.Span {
  1769  	switch {
  1770  	case i.dir == -1 && !i.span.Valid():
  1771  		// Switching directions.
  1772  		//
  1773  		// i.blockIter is exhausted, before the first key. Move onto the first.
  1774  		i.blockIter.First()
  1775  		i.dir = +1
  1776  	case i.dir == -1 && i.span.Valid():
  1777  		// Switching directions.
  1778  		//
  1779  		// i.blockIter is currently positioned over the last internal key for
  1780  		// the previous span. Next it once to move to the first internal key
  1781  		// that makes up the current span, and gatherForward to land on the
  1782  		// first internal key making up the next span.
  1783  		//
  1784  		// In the diagram below, if the last span returned to the user during
  1785  		// reverse iteration was [b,c), i.blockIter is currently positioned at
  1786  		// [a,b). The block iter must be positioned over [d,e) to gather the
  1787  		// next span's fragments.
  1788  		//
  1789  		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
  1790  		//          ^                       ^
  1791  		//     i.blockIter                 want
  1792  		if x := i.gatherForward(i.blockIter.Next()); invariants.Enabled && !x.Valid() {
  1793  			panic("pebble: invariant violation: next entry unexpectedly invalid")
  1794  		}
  1795  		i.dir = +1
  1796  	}
  1797  	// We know that this blockIter has in-place values.
  1798  	return i.gatherForward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
  1799  }
  1800  
  1801  // Prev implements (keyspan.FragmentIterator).Prev.
  1802  func (i *fragmentBlockIter) Prev() *keyspan.Span {
  1803  	switch {
  1804  	case i.dir == +1 && !i.span.Valid():
  1805  		// Switching directions.
  1806  		//
  1807  		// i.blockIter is exhausted, after the last key. Move onto the last.
  1808  		i.blockIter.Last()
  1809  		i.dir = -1
  1810  	case i.dir == +1 && i.span.Valid():
  1811  		// Switching directions.
  1812  		//
  1813  		// i.blockIter is currently positioned over the first internal key for
  1814  		// the next span. Prev it once to move to the last internal key that
  1815  		// makes up the current span, and gatherBackward to land on the last
  1816  		// internal key making up the previous span.
  1817  		//
  1818  		// In the diagram below, if the last span returned to the user during
  1819  		// forward iteration was [b,c), i.blockIter is currently positioned at
  1820  		// [d,e). The block iter must be positioned over [a,b) to gather the
  1821  		// previous span's fragments.
  1822  		//
  1823  		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
  1824  		//          ^                       ^
  1825  		//        want                  i.blockIter
  1826  		if x := i.gatherBackward(i.blockIter.Prev()); invariants.Enabled && !x.Valid() {
  1827  			panic("pebble: invariant violation: previous entry unexpectedly invalid")
  1828  		}
  1829  		i.dir = -1
  1830  	}
  1831  	// We know that this blockIter has in-place values.
  1832  	return i.gatherBackward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val))
  1833  }
  1834  
  1835  // SeekGE implements (keyspan.FragmentIterator).SeekGE.
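        //
        // A span s with s.Start < k may still cover k when k < s.End, so the seek
        // first looks backward with SeekLT; only when that span ends at or before k
        // is the answer the next span in the forward direction.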
  1836  func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span {
  1837  	if s := i.SeekLT(k); s != nil && i.blockIter.cmp(k, s.End) < 0 {
  1838  		return s
  1839  	}
  1840  	// TODO(jackson): If the above i.SeekLT(k) discovers a span but the span
  1841  	// doesn't meet the k < s.End comparison, then there's no need for the
  1842  	// SeekLT to gatherBackward.
  1843  	return i.Next()
  1844  }
  1845  
  1846  // SeekLT implements (keyspan.FragmentIterator).SeekLT.
  1847  func (i *fragmentBlockIter) SeekLT(k []byte) *keyspan.Span {
  1848  	i.dir = -1
  1849  	return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone))
  1850  }
  1851  
  1852  // String implements fmt.Stringer.
  1853  func (i *fragmentBlockIter) String() string {
  1854  	return "fragment-block-iter"
  1855  }
  1856  
  1857  // SetCloseHook implements sstable.FragmentIterator.
  1858  func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) {
  1859  	i.closeHook = fn
  1860  }