github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/block.go

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"encoding/binary"
     9  	"unsafe"
    10  
    11  	"github.com/zuoyebang/bitalostable/internal/base"
    12  	"github.com/zuoyebang/bitalostable/internal/cache"
    13  	"github.com/zuoyebang/bitalostable/internal/invariants"
    14  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    15  	"github.com/zuoyebang/bitalostable/internal/rangedel"
    16  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    17  )
    18  
    19  func uvarintLen(v uint32) int {
    20  	i := 0
    21  	for v >= 0x80 {
    22  		v >>= 7
    23  		i++
    24  	}
    25  	return i + 1
    26  }
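
         // Illustrative note (not part of the original file): uvarintLen returns the
         // number of bytes binary.PutUvarint would emit for a 32-bit value, without
         // writing anything. For example, 0x7f fits in one byte while 0x80 needs two:
         //
         //	var tmp [binary.MaxVarintLen32]byte
         //	n := binary.PutUvarint(tmp[:], 0x80) // n == 2 == uvarintLen(0x80)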
    27  
    28  type blockWriter struct {
    29  	restartInterval int
    30  	nEntries        int
    31  	nextRestart     int
    32  	buf             []byte
    33  	restarts        []uint32
    34  	curKey          []byte
    35  	curValue        []byte
    36  	prevKey         []byte
    37  	tmp             [4]byte
    38  }
    39  
    40  func (w *blockWriter) clear() {
    41  	*w = blockWriter{
    42  		buf:      w.buf[:0],
    43  		restarts: w.restarts[:0],
    44  		curKey:   w.curKey[:0],
    45  		curValue: w.curValue[:0],
    46  		prevKey:  w.prevKey[:0],
    47  	}
    48  }
    49  
    50  func (w *blockWriter) store(keySize int, value []byte) {
    51  	shared := 0
    52  	if w.nEntries == w.nextRestart {
    53  		w.nextRestart = w.nEntries + w.restartInterval
    54  		w.restarts = append(w.restarts, uint32(len(w.buf)))
    55  	} else {
    56  		// TODO(peter): Manually inlined version of base.SharedPrefixLen(). This
    57  		// is 3% faster on BenchmarkWriter on go1.16. Remove if future versions
     58  		// show this to not be a performance win. For now, functions that use
     59  		// unsafe cannot be inlined.
    60  		n := len(w.curKey)
    61  		if n > len(w.prevKey) {
    62  			n = len(w.prevKey)
    63  		}
    64  		asUint64 := func(b []byte, i int) uint64 {
    65  			return binary.LittleEndian.Uint64(b[i:])
    66  		}
    67  		for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) {
    68  			shared += 8
    69  		}
    70  		for shared < n && w.curKey[shared] == w.prevKey[shared] {
    71  			shared++
    72  		}
    73  	}
    74  
    75  	needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + len(value)
    76  	n := len(w.buf)
    77  	if cap(w.buf) < n+needed {
    78  		newCap := 2 * cap(w.buf)
    79  		if newCap == 0 {
    80  			newCap = 1024
    81  		}
    82  		for newCap < n+needed {
    83  			newCap *= 2
    84  		}
    85  		newBuf := make([]byte, n, newCap)
    86  		copy(newBuf, w.buf)
    87  		w.buf = newBuf
    88  	}
    89  	w.buf = w.buf[:n+needed]
    90  
    91  	// TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15%
    92  	// faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions
    93  	// show this to not be a performance win.
    94  	{
    95  		x := uint32(shared)
    96  		for x >= 0x80 {
    97  			w.buf[n] = byte(x) | 0x80
    98  			x >>= 7
    99  			n++
   100  		}
   101  		w.buf[n] = byte(x)
   102  		n++
   103  	}
   104  
   105  	{
   106  		x := uint32(keySize - shared)
   107  		for x >= 0x80 {
   108  			w.buf[n] = byte(x) | 0x80
   109  			x >>= 7
   110  			n++
   111  		}
   112  		w.buf[n] = byte(x)
   113  		n++
   114  	}
   115  
   116  	{
   117  		x := uint32(len(value))
   118  		for x >= 0x80 {
   119  			w.buf[n] = byte(x) | 0x80
   120  			x >>= 7
   121  			n++
   122  		}
   123  		w.buf[n] = byte(x)
   124  		n++
   125  	}
   126  
   127  	n += copy(w.buf[n:], w.curKey[shared:])
   128  	n += copy(w.buf[n:], value)
   129  	w.buf = w.buf[:n]
   130  
   131  	w.curValue = w.buf[n-len(value):]
   132  
   133  	w.nEntries++
   134  }
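
         // Illustrative sketch of the entry layout that store appends (a descriptive
         // note, not part of the original file). Each entry is three varints followed
         // by the unshared suffix of the encoded key and the raw value bytes:
         //
         //	varint(shared) varint(unshared) varint(len(value)) key[shared:] value
         //
         // shared is the number of leading bytes the encoded key has in common with
         // the previous entry's key; at a restart point it is always 0, so the full
         // key is stored.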
   135  
   136  func (w *blockWriter) add(key InternalKey, value []byte) {
   137  	w.curKey, w.prevKey = w.prevKey, w.curKey
   138  
   139  	size := key.Size()
   140  	if cap(w.curKey) < size {
   141  		w.curKey = make([]byte, 0, size*2)
   142  	}
   143  	w.curKey = w.curKey[:size]
   144  	key.Encode(w.curKey)
   145  
   146  	w.store(size, value)
   147  }
   148  
   149  func (w *blockWriter) finish() []byte {
   150  	// Write the restart points to the buffer.
   151  	if w.nEntries == 0 {
   152  		// Every block must have at least one restart point.
   153  		if cap(w.restarts) > 0 {
   154  			w.restarts = w.restarts[:1]
   155  			w.restarts[0] = 0
   156  		} else {
   157  			w.restarts = append(w.restarts, 0)
   158  		}
   159  	}
   160  	tmp4 := w.tmp[:4]
   161  	for _, x := range w.restarts {
   162  		binary.LittleEndian.PutUint32(tmp4, x)
   163  		w.buf = append(w.buf, tmp4...)
   164  	}
   165  	binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts)))
   166  	w.buf = append(w.buf, tmp4...)
   167  	result := w.buf
   168  
   169  	// Reset the block state.
   170  	w.nEntries = 0
   171  	w.nextRestart = 0
   172  	w.buf = w.buf[:0]
   173  	w.restarts = w.restarts[:0]
   174  	return result
   175  }
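
         // buildExampleBlock is an illustrative sketch, not part of the original file:
         // it shows the typical write path. Keys must be added in ascending order, and
         // finish appends the restart offsets plus the 4-byte restart-count trailer.
         // The keys, kinds and values below are arbitrary example data.
         func buildExampleBlock() []byte {
         	w := &blockWriter{restartInterval: 16}
         	w.add(base.MakeInternalKey([]byte("apple"), 2, base.InternalKeyKindSet), []byte("v1"))
         	w.add(base.MakeInternalKey([]byte("applied"), 1, base.InternalKeyKindSet), []byte("v2"))
         	return w.finish()
         }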
   176  
   177  // emptyBlockSize holds the size of an empty block. Every block ends
   178  // in a uint32 trailer encoding the number of restart points within the
   179  // block.
   180  const emptyBlockSize = 4
   181  
   182  func (w *blockWriter) estimatedSize() int {
   183  	return len(w.buf) + 4*len(w.restarts) + emptyBlockSize
   184  }
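
         // Illustrative arithmetic (not part of the original file): a block holding 100
         // bytes of encoded entries with 3 restart points has an estimated size of
         // 100 + 4*3 + 4 = 116 bytes, the final 4 bytes being the restart-count trailer.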
   185  
   186  type blockEntry struct {
   187  	offset   int32
   188  	keyStart int32
   189  	keyEnd   int32
   190  	valStart int32
   191  	valSize  int32
   192  }
   193  
   194  // blockIter is an iterator over a single block of data.
   195  //
   196  // A blockIter provides an additional guarantee around key stability when a
   197  // block has a restart interval of 1 (i.e. when there is no prefix
   198  // compression). Key stability refers to whether the InternalKey.UserKey bytes
   199  // returned by a positioning call will remain stable after a subsequent
   200  // positioning call. The normal case is that a positioning call will invalidate
   201  // any previously returned InternalKey.UserKey. If a block has a restart
   202  // interval of 1 (no prefix compression), blockIter guarantees that
   203  // InternalKey.UserKey will point to the key as stored in the block itself
   204  // which will remain valid until the blockIter is closed. The key stability
   205  // guarantee is used by the range tombstone and range key code, which knows that
   206  // the respective blocks are always encoded with a restart interval of 1. This
   207  // per-block key stability guarantee is sufficient for range tombstones and
    208  // range keys as they are always encoded in a single block.
   209  //
   210  // A blockIter also provides a value stability guarantee for range deletions and
   211  // range keys since there is only a single range deletion and range key block
   212  // per sstable and the blockIter will not release the bytes for the block until
   213  // it is closed.
   214  type blockIter struct {
   215  	cmp Compare
   216  	// offset is the byte index that marks where the current key/value is
   217  	// encoded in the block.
   218  	offset int32
   219  	// nextOffset is the byte index where the next key/value is encoded in the
   220  	// block.
   221  	nextOffset int32
   222  	// A "restart point" in a block is a point where the full key is encoded,
   223  	// instead of just having a suffix of the key encoded. See readEntry() for
   224  	// how prefix compression of keys works. Keys in between two restart points
   225  	// only have a suffix encoded in the block. When restart interval is 1, no
   226  	// prefix compression of keys happens. This is the case with range tombstone
   227  	// blocks.
   228  	//
   229  	// All restart offsets are listed in increasing order in
   230  	// i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
   231  	// 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
   232  	// therefore be seen as the point where data in the block ends, and a list
   233  	// of offsets of all restart points begins.
   234  	restarts int32
   235  	// Number of restart points in this block. Encoded at the end of the block
   236  	// as a uint32.
   237  	numRestarts  int32
   238  	globalSeqNum uint64
   239  	ptr          unsafe.Pointer
   240  	data         []byte
   241  	// key contains the raw key the iterator is currently pointed at. This may
   242  	// point directly to data stored in the block (for a key which has no prefix
   243  	// compression), to fullKey (for a prefix compressed key), or to a slice of
   244  	// data stored in cachedBuf (during reverse iteration).
   245  	key []byte
   246  	// fullKey is a buffer used for key prefix decompression.
   247  	fullKey []byte
   248  	// val contains the value the iterator is currently pointed at. If non-nil,
   249  	// this points to a slice of the block data.
   250  	val []byte
   251  	// ikey contains the decoded InternalKey the iterator is currently pointed
   252  	// at. Note that the memory backing ikey.UserKey is either data stored
   253  	// directly in the block, fullKey, or cachedBuf. The key stability guarantee
   254  	// for blocks built with a restart interval of 1 is achieved by having
   255  	// ikey.UserKey always point to data stored directly in the block.
   256  	ikey InternalKey
   257  	// cached and cachedBuf are used during reverse iteration. They are needed
   258  	// because we can't perform prefix decoding in reverse, only in the forward
   259  	// direction. In order to iterate in reverse, we decode and cache the entries
   260  	// between two restart points.
   261  	//
   262  	// Note that cached[len(cached)-1] contains the previous entry to the one the
   263  	// blockIter is currently pointed at. As usual, nextOffset will contain the
   264  	// offset of the next entry. During reverse iteration, nextOffset will be
   265  	// updated to point to offset, and we'll set the blockIter to point at the
   266  	// entry cached[len(cached)-1]. See Prev() for more details.
   267  	//
   268  	// For a block encoded with a restart interval of 1, cached and cachedBuf
   269  	// will not be used as there are no prefix compressed entries between the
   270  	// restart points.
   271  	cached      []blockEntry
   272  	cachedBuf   []byte
   273  	cacheHandle cache.Handle
   274  	// The first key in the block. This is used by the caller to set bounds
   275  	// for block iteration for already loaded blocks.
   276  	firstKey InternalKey
   277  }
   278  
   279  // blockIter implements the base.InternalIterator interface.
   280  var _ base.InternalIterator = (*blockIter)(nil)
   281  
   282  func newBlockIter(cmp Compare, block block) (*blockIter, error) {
   283  	i := &blockIter{}
   284  	return i, i.init(cmp, block, 0)
   285  }
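
         // iterateExampleBlock is an illustrative sketch, not part of the original
         // file: it reads back a block such as one produced by blockWriter.finish.
         // base.DefaultComparer.Compare is an assumption; a real caller passes the
         // Compare function the sstable was written with.
         func iterateExampleBlock(data []byte) error {
         	it, err := newBlockIter(base.DefaultComparer.Compare, block(data))
         	if err != nil {
         		return err
         	}
         	for k, v := it.First(); k != nil; k, v = it.Next() {
         		_ = v // k is the decoded InternalKey, v the raw value bytes
         	}
         	return it.Close()
         }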
   286  
   287  func (i *blockIter) String() string {
   288  	return "block"
   289  }
   290  
   291  func (i *blockIter) init(cmp Compare, block block, globalSeqNum uint64) error {
   292  	numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:]))
   293  	if numRestarts == 0 {
   294  		return base.CorruptionErrorf("bitalostable/table: invalid table (block has no restart points)")
   295  	}
   296  	i.cmp = cmp
   297  	i.restarts = int32(len(block)) - 4*(1+numRestarts)
   298  	i.numRestarts = numRestarts
   299  	i.globalSeqNum = globalSeqNum
   300  	i.ptr = unsafe.Pointer(&block[0])
   301  	i.data = block
   302  	i.fullKey = i.fullKey[:0]
   303  	i.val = nil
   304  	i.clearCache()
   305  	if i.restarts > 0 {
   306  		if err := i.readFirstKey(); err != nil {
   307  			return err
   308  		}
   309  	} else {
   310  		// Block is empty.
   311  		i.firstKey = InternalKey{}
   312  	}
   313  	return nil
   314  }
   315  
   316  func (i *blockIter) initHandle(cmp Compare, block cache.Handle, globalSeqNum uint64) error {
   317  	i.cacheHandle.Release()
   318  	i.cacheHandle = block
   319  	return i.init(cmp, block.Get(), globalSeqNum)
   320  }
   321  
   322  func (i *blockIter) invalidate() {
   323  	i.clearCache()
   324  	i.offset = 0
   325  	i.nextOffset = 0
   326  	i.restarts = 0
   327  	i.numRestarts = 0
   328  	i.data = nil
   329  }
   330  
   331  // isDataInvalidated returns true when the blockIter has been invalidated
   332  // using an invalidate call. NB: this is different from blockIter.Valid
   333  // which is part of the InternalIterator implementation.
   334  func (i *blockIter) isDataInvalidated() bool {
   335  	return i.data == nil
   336  }
   337  
   338  func (i *blockIter) resetForReuse() blockIter {
   339  	return blockIter{
   340  		fullKey:   i.fullKey[:0],
   341  		cached:    i.cached[:0],
   342  		cachedBuf: i.cachedBuf[:0],
   343  		data:      nil,
   344  	}
   345  }
   346  
   347  func (i *blockIter) readEntry() {
   348  	ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset))
   349  
   350  	// This is an ugly performance hack. Reading entries from blocks is one of
   351  	// the inner-most routines and decoding the 3 varints per-entry takes
    352  	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
   353  	// us, so we do it manually. This provides a 10-15% performance improvement
   354  	// on blockIter benchmarks on both go1.11 and go1.12.
   355  	//
   356  	// TODO(peter): remove this hack if go:inline is ever supported.
   357  
   358  	var shared uint32
   359  	if a := *((*uint8)(ptr)); a < 128 {
   360  		shared = uint32(a)
   361  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   362  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   363  		shared = uint32(b)<<7 | uint32(a)
   364  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   365  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   366  		shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   367  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   368  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   369  		shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   370  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   371  	} else {
   372  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   373  		shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   374  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   375  	}
   376  
   377  	var unshared uint32
   378  	if a := *((*uint8)(ptr)); a < 128 {
   379  		unshared = uint32(a)
   380  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   381  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   382  		unshared = uint32(b)<<7 | uint32(a)
   383  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   384  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   385  		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   386  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   387  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   388  		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   389  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   390  	} else {
   391  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   392  		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   393  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   394  	}
   395  
   396  	var value uint32
   397  	if a := *((*uint8)(ptr)); a < 128 {
   398  		value = uint32(a)
   399  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   400  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   401  		value = uint32(b)<<7 | uint32(a)
   402  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   403  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   404  		value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   405  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   406  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   407  		value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   408  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   409  	} else {
   410  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   411  		value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   412  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   413  	}
   414  
   415  	unsharedKey := getBytes(ptr, int(unshared))
   416  	i.fullKey = append(i.fullKey[:shared], unsharedKey...)
   417  	if shared == 0 {
   418  		// Provide stability for the key across positioning calls if the key
   419  		// doesn't share a prefix with the previous key. This removes requiring the
   420  		// key to be copied if the caller knows the block has a restart interval of
   421  		// 1. An important example of this is range-del blocks.
   422  		i.key = unsharedKey
   423  	} else {
   424  		i.key = i.fullKey
   425  	}
   426  	ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared))
   427  	i.val = getBytes(ptr, int(value))
   428  	i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value)
   429  }
   430  
   431  func (i *blockIter) readFirstKey() error {
   432  	ptr := i.ptr
   433  
   434  	// This is an ugly performance hack. Reading entries from blocks is one of
   435  	// the inner-most routines and decoding the 3 varints per-entry takes
    436  	// significant time. Neither go1.11 nor go1.12 will inline decodeVarint for
   437  	// us, so we do it manually. This provides a 10-15% performance improvement
   438  	// on blockIter benchmarks on both go1.11 and go1.12.
   439  	//
   440  	// TODO(peter): remove this hack if go:inline is ever supported.
   441  
   442  	if shared := *((*uint8)(ptr)); shared == 0 {
   443  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   444  	} else {
   445  		// The shared length is != 0, which is invalid.
   446  		panic("first key in block must have zero shared length")
   447  	}
   448  
   449  	var unshared uint32
   450  	if a := *((*uint8)(ptr)); a < 128 {
   451  		unshared = uint32(a)
   452  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   453  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   454  		unshared = uint32(b)<<7 | uint32(a)
   455  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   456  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   457  		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   458  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   459  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   460  		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   461  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   462  	} else {
   463  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   464  		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   465  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   466  	}
   467  
   468  	// Skip the value length.
   469  	if a := *((*uint8)(ptr)); a < 128 {
   470  		ptr = unsafe.Pointer(uintptr(ptr) + 1)
   471  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 {
   472  		ptr = unsafe.Pointer(uintptr(ptr) + 2)
   473  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 {
   474  		ptr = unsafe.Pointer(uintptr(ptr) + 3)
   475  	} else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 {
   476  		ptr = unsafe.Pointer(uintptr(ptr) + 4)
   477  	} else {
   478  		ptr = unsafe.Pointer(uintptr(ptr) + 5)
   479  	}
   480  
   481  	firstKey := getBytes(ptr, int(unshared))
   482  	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
   483  	// BlockIter benchmarks.
   484  	if n := len(firstKey) - 8; n >= 0 {
   485  		i.firstKey.Trailer = binary.LittleEndian.Uint64(firstKey[n:])
   486  		i.firstKey.UserKey = firstKey[:n:n]
   487  		if i.globalSeqNum != 0 {
   488  			i.firstKey.SetSeqNum(i.globalSeqNum)
   489  		}
   490  	} else {
   491  		i.firstKey.Trailer = uint64(InternalKeyKindInvalid)
   492  		i.firstKey.UserKey = nil
   493  		return base.CorruptionErrorf("bitalostable/table: invalid firstKey in block")
   494  	}
   495  	return nil
   496  }
   497  
   498  func (i *blockIter) decodeInternalKey(key []byte) {
   499  	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
   500  	// BlockIter benchmarks.
   501  	if n := len(key) - 8; n >= 0 {
   502  		i.ikey.Trailer = binary.LittleEndian.Uint64(key[n:])
   503  		i.ikey.UserKey = key[:n:n]
   504  		if i.globalSeqNum != 0 {
   505  			i.ikey.SetSeqNum(i.globalSeqNum)
   506  		}
   507  	} else {
   508  		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
   509  		i.ikey.UserKey = nil
   510  	}
   511  }
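
         // Illustrative note (not part of the original file): the 8-byte trailer read
         // above packs the sequence number and key kind as
         //
         //	trailer = (seqNum << 8) | uint64(kind)
         //
         // so, for example, a SET (kind 1) at sequence number 7 decodes to trailer
         // 0x701.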
   512  
   513  func (i *blockIter) clearCache() {
   514  	i.cached = i.cached[:0]
   515  	i.cachedBuf = i.cachedBuf[:0]
   516  }
   517  
   518  func (i *blockIter) cacheEntry() {
   519  	var valStart int32
   520  	valSize := int32(len(i.val))
   521  	if valSize > 0 {
   522  		valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
   523  	}
   524  
   525  	i.cached = append(i.cached, blockEntry{
   526  		offset:   i.offset,
   527  		keyStart: int32(len(i.cachedBuf)),
   528  		keyEnd:   int32(len(i.cachedBuf) + len(i.key)),
   529  		valStart: valStart,
   530  		valSize:  valSize,
   531  	})
   532  	i.cachedBuf = append(i.cachedBuf, i.key...)
   533  }
   534  
   535  // SeekGE implements internalIterator.SeekGE, as documented in the bitalostable
   536  // package.
   537  func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
   538  	i.clearCache()
   539  
   540  	ikey := base.MakeSearchKey(key)
   541  
   542  	// Find the index of the smallest restart point whose key is > the key
   543  	// sought; index will be numRestarts if there is no such restart point.
   544  	i.offset = 0
   545  	var index int32
   546  
   547  	{
    548  		// NB: manually inlined sort.Search is ~5% faster.
   549  		//
   550  		// Define f(-1) == false and f(n) == true.
   551  		// Invariant: f(index-1) == false, f(upper) == true.
   552  		upper := i.numRestarts
   553  		for index < upper {
   554  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
   555  			// index ≤ h < upper
   556  			offset := int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*h:]))
   557  			// For a restart point, there are 0 bytes shared with the previous key.
   558  			// The varint encoding of 0 occupies 1 byte.
   559  			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
   560  
   561  			// Decode the key at that restart point, and compare it to the key
   562  			// sought. See the comment in readEntry for why we manually inline the
   563  			// varint decoding.
   564  			var v1 uint32
   565  			if a := *((*uint8)(ptr)); a < 128 {
   566  				v1 = uint32(a)
   567  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   568  			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   569  				v1 = uint32(b)<<7 | uint32(a)
   570  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   571  			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   572  				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   573  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   574  			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   575  				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   576  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   577  			} else {
   578  				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   579  				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   580  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   581  			}
   582  
   583  			if *((*uint8)(ptr)) < 128 {
   584  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   585  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
   586  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   587  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
   588  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   589  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
   590  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   591  			} else {
   592  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   593  			}
   594  
   595  			// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
   596  			// BlockIter benchmarks.
   597  			s := getBytes(ptr, int(v1))
   598  			var k InternalKey
   599  			if n := len(s) - 8; n >= 0 {
   600  				k.Trailer = binary.LittleEndian.Uint64(s[n:])
   601  				k.UserKey = s[:n:n]
   602  				// NB: We can't have duplicate keys if the globalSeqNum != 0, so we
   603  				// leave the seqnum on this key as 0 as it won't affect our search
   604  				// since ikey has the maximum seqnum.
   605  			} else {
   606  				k.Trailer = uint64(InternalKeyKindInvalid)
   607  			}
   608  
   609  			if base.InternalCompare(i.cmp, ikey, k) >= 0 {
   610  				index = h + 1 // preserves f(i-1) == false
   611  			} else {
   612  				upper = h // preserves f(j) == true
   613  			}
   614  		}
   615  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
   616  		// => answer is index.
   617  	}
   618  
   619  	// Since keys are strictly increasing, if index > 0 then the restart point at
   620  	// index-1 will be the largest whose key is <= the key sought.  If index ==
   621  	// 0, then all keys in this block are larger than the key sought, and offset
   622  	// remains at zero.
   623  	if index > 0 {
   624  		i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(index-1):]))
   625  	}
   626  	i.readEntry()
   627  	i.decodeInternalKey(i.key)
   628  
   629  	// Iterate from that restart point to somewhere >= the key sought.
   630  	for ; i.valid(); i.Next() {
   631  		if base.InternalCompare(i.cmp, i.ikey, ikey) >= 0 {
   632  			return &i.ikey, i.val
   633  		}
   634  	}
   635  
   636  	return nil, nil
   637  }
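
         // Illustrative usage (not part of the original file): a lookup within an
         // already-loaded block. The zero flags value mirrors how this file invokes
         // SeekGE elsewhere.
         //
         //	if k, v := it.SeekGE([]byte("apple"), base.SeekGEFlags(0)); k != nil {
         //		_ = v // first entry whose user key is >= "apple"
         //	}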
   638  
   639  // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
   640  // bitalostable package.
   641  func (i *blockIter) SeekPrefixGE(
   642  	prefix, key []byte, flags base.SeekGEFlags,
   643  ) (*base.InternalKey, []byte) {
   644  	// This should never be called as prefix iteration is handled by sstable.Iterator.
   645  	panic("bitalostable: SeekPrefixGE unimplemented")
   646  }
   647  
   648  // SeekLT implements internalIterator.SeekLT, as documented in the bitalostable
   649  // package.
   650  func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
   651  	i.clearCache()
   652  
   653  	ikey := base.MakeSearchKey(key)
   654  
   655  	// Find the index of the smallest restart point whose key is >= the key
   656  	// sought; index will be numRestarts if there is no such restart point.
   657  	i.offset = 0
   658  	var index int32
   659  
   660  	{
   661  		// NB: manually inlined sort.Search is ~5% faster.
   662  		//
   663  		// Define f(-1) == false and f(n) == true.
   664  		// Invariant: f(index-1) == false, f(upper) == true.
   665  		upper := i.numRestarts
   666  		for index < upper {
   667  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
   668  			// index ≤ h < upper
   669  			offset := int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*h:]))
   670  			// For a restart point, there are 0 bytes shared with the previous key.
   671  			// The varint encoding of 0 occupies 1 byte.
   672  			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
   673  
   674  			// Decode the key at that restart point, and compare it to the key
   675  			// sought. See the comment in readEntry for why we manually inline the
   676  			// varint decoding.
   677  			var v1 uint32
   678  			if a := *((*uint8)(ptr)); a < 128 {
   679  				v1 = uint32(a)
   680  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   681  			} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
   682  				v1 = uint32(b)<<7 | uint32(a)
   683  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   684  			} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
   685  				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   686  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   687  			} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
   688  				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   689  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   690  			} else {
   691  				d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
   692  				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
   693  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   694  			}
   695  
   696  			if *((*uint8)(ptr)) < 128 {
   697  				ptr = unsafe.Pointer(uintptr(ptr) + 1)
   698  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 {
   699  				ptr = unsafe.Pointer(uintptr(ptr) + 2)
   700  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 {
   701  				ptr = unsafe.Pointer(uintptr(ptr) + 3)
   702  			} else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 {
   703  				ptr = unsafe.Pointer(uintptr(ptr) + 4)
   704  			} else {
   705  				ptr = unsafe.Pointer(uintptr(ptr) + 5)
   706  			}
   707  
   708  			// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
   709  			// BlockIter benchmarks.
   710  			s := getBytes(ptr, int(v1))
   711  			var k InternalKey
   712  			if n := len(s) - 8; n >= 0 {
   713  				k.Trailer = binary.LittleEndian.Uint64(s[n:])
   714  				k.UserKey = s[:n:n]
   715  				// NB: We can't have duplicate keys if the globalSeqNum != 0, so we
   716  				// leave the seqnum on this key as 0 as it won't affect our search
   717  				// since ikey has the maximum seqnum.
   718  			} else {
   719  				k.Trailer = uint64(InternalKeyKindInvalid)
   720  			}
   721  
   722  			if base.InternalCompare(i.cmp, ikey, k) > 0 {
   723  				index = h + 1 // preserves f(i-1) == false
   724  			} else {
   725  				upper = h // preserves f(j) == true
   726  			}
   727  		}
   728  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
   729  		// => answer is index.
   730  	}
   731  
   732  	// Since keys are strictly increasing, if index > 0 then the restart point at
   733  	// index-1 will be the largest whose key is < the key sought.
   734  	targetOffset := i.restarts
   735  	if index > 0 {
   736  		i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(index-1):]))
   737  		if index < i.numRestarts {
   738  			targetOffset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(index):]))
   739  		}
   740  	} else if index == 0 {
   741  		// If index == 0 then all keys in this block are larger than the key
   742  		// sought.
   743  		i.offset = -1
   744  		i.nextOffset = 0
   745  		return nil, nil
   746  	}
   747  
   748  	// Iterate from that restart point to somewhere >= the key sought, then back
   749  	// up to the previous entry. The expectation is that we'll be performing
   750  	// reverse iteration, so we cache the entries as we advance forward.
   751  	i.nextOffset = i.offset
   752  
   753  	for {
   754  		i.offset = i.nextOffset
   755  		i.readEntry()
   756  		i.decodeInternalKey(i.key)
   757  
   758  		if i.cmp(i.ikey.UserKey, ikey.UserKey) >= 0 {
   759  			// The current key is greater than or equal to our search key. Back up to
    760  			// the previous key which was less than our search key. Note that this for
   761  			// loop will execute at least once with this if-block not being true, so
   762  			// the key we are backing up to is the last one this loop cached.
   763  			i.Prev()
   764  			return &i.ikey, i.val
   765  		}
   766  
   767  		if i.nextOffset >= targetOffset {
   768  			// We've reached the end of the current restart block. Return the current
   769  			// key. When the restart interval is 1, the first iteration of the for
   770  			// loop will bring us here. In that case ikey is backed by the block so
   771  			// we get the desired key stability guarantee for the lifetime of the
   772  			// blockIter.
   773  			break
   774  		}
   775  
   776  		i.cacheEntry()
   777  	}
   778  
   779  	if !i.valid() {
   780  		return nil, nil
   781  	}
   782  	return &i.ikey, i.val
   783  }
   784  
   785  // First implements internalIterator.First, as documented in the bitalostable
   786  // package.
   787  func (i *blockIter) First() (*InternalKey, []byte) {
   788  	i.offset = 0
   789  	if !i.valid() {
   790  		return nil, nil
   791  	}
   792  	i.clearCache()
   793  	i.readEntry()
   794  	i.decodeInternalKey(i.key)
   795  	return &i.ikey, i.val
   796  }
   797  
   798  // Last implements internalIterator.Last, as documented in the bitalostable package.
   799  func (i *blockIter) Last() (*InternalKey, []byte) {
   800  	// Seek forward from the last restart point.
   801  	i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(i.numRestarts-1):]))
   802  	if !i.valid() {
   803  		return nil, nil
   804  	}
   805  
   806  	i.readEntry()
   807  	i.clearCache()
   808  
   809  	for i.nextOffset < i.restarts {
   810  		i.cacheEntry()
   811  		i.offset = i.nextOffset
   812  		i.readEntry()
   813  	}
   814  
   815  	i.decodeInternalKey(i.key)
   816  	return &i.ikey, i.val
   817  }
   818  
   819  // Next implements internalIterator.Next, as documented in the bitalostable
   820  // package.
   821  func (i *blockIter) Next() (*InternalKey, []byte) {
   822  	if len(i.cachedBuf) > 0 {
   823  		// We're switching from reverse iteration to forward iteration. We need to
   824  		// populate i.fullKey with the current key we're positioned at so that
   825  		// readEntry() can use i.fullKey for key prefix decompression. Note that we
   826  		// don't know whether i.key is backed by i.cachedBuf or i.fullKey (if
   827  		// SeekLT was the previous call, i.key may be backed by i.fullKey), but
   828  		// copying into i.fullKey works for both cases.
   829  		//
   830  		// TODO(peter): Rather than clearing the cache, we could instead use the
   831  		// cache until it is exhausted. This would likely be faster than falling
   832  		// through to the normal forward iteration code below.
   833  		i.fullKey = append(i.fullKey[:0], i.key...)
   834  		i.clearCache()
   835  	}
   836  
   837  	i.offset = i.nextOffset
   838  	if !i.valid() {
   839  		return nil, nil
   840  	}
   841  	i.readEntry()
   842  	// Manually inlined version of i.decodeInternalKey(i.key).
   843  	if n := len(i.key) - 8; n >= 0 {
   844  		i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:])
   845  		i.ikey.UserKey = i.key[:n:n]
   846  		if i.globalSeqNum != 0 {
   847  			i.ikey.SetSeqNum(i.globalSeqNum)
   848  		}
   849  	} else {
   850  		i.ikey.Trailer = uint64(InternalKeyKindInvalid)
   851  		i.ikey.UserKey = nil
   852  	}
   853  	return &i.ikey, i.val
   854  }
   855  
   856  // Prev implements internalIterator.Prev, as documented in the bitalostable
   857  // package.
   858  func (i *blockIter) Prev() (*InternalKey, []byte) {
   859  	if n := len(i.cached) - 1; n >= 0 {
   860  		i.nextOffset = i.offset
   861  		e := &i.cached[n]
   862  		i.offset = e.offset
   863  		i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize))
   864  		// Manually inlined version of i.decodeInternalKey(i.key).
   865  		i.key = i.cachedBuf[e.keyStart:e.keyEnd]
   866  		if n := len(i.key) - 8; n >= 0 {
   867  			i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:])
   868  			i.ikey.UserKey = i.key[:n:n]
   869  			if i.globalSeqNum != 0 {
   870  				i.ikey.SetSeqNum(i.globalSeqNum)
   871  			}
   872  		} else {
   873  			i.ikey.Trailer = uint64(InternalKeyKindInvalid)
   874  			i.ikey.UserKey = nil
   875  		}
   876  		i.cached = i.cached[:n]
   877  		return &i.ikey, i.val
   878  	}
   879  
   880  	i.clearCache()
   881  	if i.offset <= 0 {
   882  		i.offset = -1
   883  		i.nextOffset = 0
   884  		return nil, nil
   885  	}
   886  
   887  	targetOffset := i.offset
   888  	var index int32
   889  
   890  	{
    891  		// NB: manually inlined sort.Search is ~5% faster.
   892  		//
   893  		// Define f(-1) == false and f(n) == true.
   894  		// Invariant: f(index-1) == false, f(upper) == true.
   895  		upper := i.numRestarts
   896  		for index < upper {
   897  			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
   898  			// index ≤ h < upper
   899  			offset := int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*h:]))
   900  			if offset < targetOffset {
   901  				index = h + 1 // preserves f(i-1) == false
   902  			} else {
   903  				upper = h // preserves f(j) == true
   904  			}
   905  		}
   906  		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
   907  		// => answer is index.
   908  	}
   909  
   910  	i.offset = 0
   911  	if index > 0 {
   912  		i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(index-1):]))
   913  	}
   914  
   915  	i.readEntry()
   916  
   917  	for i.nextOffset < targetOffset {
   918  		i.cacheEntry()
   919  		i.offset = i.nextOffset
   920  		i.readEntry()
   921  	}
   922  
   923  	i.decodeInternalKey(i.key)
   924  	return &i.ikey, i.val
   925  }
   926  
   927  // Key implements internalIterator.Key, as documented in the bitalostable package.
   928  func (i *blockIter) Key() *InternalKey {
   929  	return &i.ikey
   930  }
   931  
   932  // Value implements internalIterator.Value, as documented in the bitalostable
   933  // package.
   934  func (i *blockIter) Value() []byte {
   935  	return i.val
   936  }
   937  
   938  // Error implements internalIterator.Error, as documented in the bitalostable
   939  // package.
   940  func (i *blockIter) Error() error {
   941  	return nil // infallible
   942  }
   943  
   944  // Close implements internalIterator.Close, as documented in the bitalostable
   945  // package.
   946  func (i *blockIter) Close() error {
   947  	i.cacheHandle.Release()
   948  	i.cacheHandle = cache.Handle{}
   949  	i.val = nil
   950  	return nil
   951  }
   952  
   953  func (i *blockIter) SetBounds(lower, upper []byte) {
   954  	// This should never be called as bounds are handled by sstable.Iterator.
   955  	panic("bitalostable: SetBounds unimplemented")
   956  }
   957  
   958  func (i *blockIter) valid() bool {
   959  	return i.offset >= 0 && i.offset < i.restarts
   960  }
   961  
   962  // fragmentBlockIter wraps a blockIter, implementing the
   963  // keyspan.FragmentIterator interface. It's used for reading range deletion and
   964  // range key blocks.
   965  //
   966  // Range deletions and range keys are fragmented before they're persisted to the
   967  // block. Overlapping fragments have identical bounds.  The fragmentBlockIter
   968  // gathers all the fragments with identical bounds within a block and returns a
   969  // single keyspan.Span describing all the keys defined over the span.
   970  //
   971  // # Memory lifetime
   972  //
   973  // A Span returned by fragmentBlockIter is only guaranteed to be stable until
   974  // the next fragmentBlockIter iteration positioning method. A Span's Keys slice
   975  // may be reused, so the user must not assume it's stable.
   976  //
   977  // Blocks holding range deletions and range keys are configured to use a restart
   978  // interval of 1. This provides key stability. The caller may treat the various
   979  // byte slices (start, end, suffix, value) as stable for the lifetime of the
   980  // iterator.
   981  type fragmentBlockIter struct {
   982  	blockIter blockIter
   983  	keyBuf    [2]keyspan.Key
   984  	span      keyspan.Span
   985  	err       error
   986  	dir       int8
   987  	closeHook func(i keyspan.FragmentIterator) error
   988  }
   989  
   990  func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter {
   991  	return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()}
   992  }
   993  
   994  func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) {
   995  	// TODO(jackson): The use of i.span.Keys to accumulate keys across multiple
   996  	// calls to Decode is too confusing and subtle. Refactor to make it
   997  	// explicit.
   998  
    999  	// Decode the contents of the fragment's value. This always includes at
   1000  	// least the end key: RANGEDELs store the end key directly as the value,
   1001  	// whereas the various range key kinds store more complicated values. The
   1002  	// details of the range key internal value format are documented within the
   1003  	// internal/rangekey package.
  1004  	switch k.Kind() {
  1005  	case base.InternalKeyKindRangeDelete:
  1006  		i.span = rangedel.Decode(*k, internalValue, i.span.Keys)
  1007  		i.err = nil
  1008  	case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
  1009  		i.span, i.err = rangekey.Decode(*k, internalValue, i.span.Keys)
  1010  	default:
  1011  		i.span = keyspan.Span{}
  1012  		i.err = base.CorruptionErrorf("bitalostable: corrupt keyspan fragment of kind %d", k.Kind())
  1013  	}
  1014  }
  1015  
  1016  // gatherForward gathers internal keys with identical bounds. Keys defined over
  1017  // spans of the keyspace are fragmented such that any overlapping key spans have
  1018  // identical bounds. When these spans are persisted to a range deletion or range
  1019  // key block, they may be persisted as multiple internal keys in order to encode
  1020  // multiple sequence numbers or key kinds.
  1021  //
  1022  // gatherForward iterates forward, re-combining the fragmented internal keys to
  1023  // reconstruct a keyspan.Span that holds all the keys defined over the span.
  1024  func (i *fragmentBlockIter) gatherForward(k *InternalKey, internalValue []byte) *keyspan.Span {
  1025  	i.span = keyspan.Span{}
  1026  	if k == nil || !i.blockIter.valid() {
  1027  		return nil
  1028  	}
  1029  	i.err = nil
  1030  	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
  1031  	// when a span contains few keys.
  1032  	i.span.Keys = i.keyBuf[:0]
  1033  
  1034  	// Decode the span's end key and individual keys from the value.
  1035  	i.decodeSpanKeys(k, internalValue)
  1036  	if i.err != nil {
  1037  		return nil
  1038  	}
  1039  	prevEnd := i.span.End
  1040  
  1041  	// There might exist additional internal keys with identical bounds encoded
  1042  	// within the block. Iterate forward, accumulating all the keys with
  1043  	// identical bounds to s.
  1044  	k, internalValue = i.blockIter.Next()
  1045  	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
  1046  		i.decodeSpanKeys(k, internalValue)
  1047  		if i.err != nil {
  1048  			return nil
  1049  		}
  1050  
  1051  		// Since k indicates an equal start key, the encoded end key must
  1052  		// exactly equal the original end key from the first internal key.
  1053  		// Overlapping fragments are required to have exactly equal start and
  1054  		// end bounds.
  1055  		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
  1056  			i.err = base.CorruptionErrorf("bitalostable: corrupt keyspan fragmentation")
  1057  			i.span = keyspan.Span{}
  1058  			return nil
  1059  		}
  1060  		k, internalValue = i.blockIter.Next()
  1061  	}
  1062  	// i.blockIter is positioned over the first internal key for the next span.
  1063  	return &i.span
  1064  }
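
         // Illustrative example (not part of the original file): if a range key block
         // contains the fragments
         //
         //	[b,c)#3,RANGEKEYSET  [b,c)#2,RANGEKEYUNSET  [d,e)#1,RANGEKEYSET
         //
         // then gatherForward, starting from the first fragment, returns a single Span
         // with Start=b, End=c and two Keys, leaving the blockIter positioned over
         // [d,e)#1 for the next call.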
  1065  
  1066  // gatherBackward gathers internal keys with identical bounds. Keys defined over
  1067  // spans of the keyspace are fragmented such that any overlapping key spans have
  1068  // identical bounds. When these spans are persisted to a range deletion or range
  1069  // key block, they may be persisted as multiple internal keys in order to encode
  1070  // multiple sequence numbers or key kinds.
  1071  //
  1072  // gatherBackward iterates backwards, re-combining the fragmented internal keys
  1073  // to reconstruct a keyspan.Span that holds all the keys defined over the span.
  1074  func (i *fragmentBlockIter) gatherBackward(k *InternalKey, internalValue []byte) *keyspan.Span {
  1075  	i.span = keyspan.Span{}
  1076  	if k == nil || !i.blockIter.valid() {
  1077  		return nil
  1078  	}
  1079  	i.err = nil
  1080  	// Use the i.keyBuf array to back the Keys slice to prevent an allocation
  1081  	// when a span contains few keys.
  1082  	i.span.Keys = i.keyBuf[:0]
  1083  
  1084  	// Decode the span's end key and individual keys from the value.
  1085  	i.decodeSpanKeys(k, internalValue)
  1086  	if i.err != nil {
  1087  		return nil
  1088  	}
  1089  	prevEnd := i.span.End
  1090  
  1091  	// There might exist additional internal keys with identical bounds encoded
  1092  	// within the block. Iterate backward, accumulating all the keys with
  1093  	// identical bounds to s.
  1094  	k, internalValue = i.blockIter.Prev()
  1095  	for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 {
  1096  		i.decodeSpanKeys(k, internalValue)
  1097  		if i.err != nil {
  1098  			return nil
  1099  		}
  1100  
  1101  		// Since k indicates an equal start key, the encoded end key must
  1102  		// exactly equal the original end key from the first internal key.
  1103  		// Overlapping fragments are required to have exactly equal start and
  1104  		// end bounds.
  1105  		if i.blockIter.cmp(prevEnd, i.span.End) != 0 {
  1106  			i.err = base.CorruptionErrorf("bitalostable: corrupt keyspan fragmentation")
  1107  			i.span = keyspan.Span{}
  1108  			return nil
  1109  		}
  1110  		k, internalValue = i.blockIter.Prev()
  1111  	}
  1112  	// i.blockIter is positioned over the last internal key for the previous
  1113  	// span.
  1114  
  1115  	// Backwards iteration encounters internal keys in the wrong order.
  1116  	keyspan.SortKeysByTrailer(&i.span.Keys)
  1117  
  1118  	return &i.span
  1119  }
  1120  
  1121  // Error implements (keyspan.FragmentIterator).Error.
  1122  func (i *fragmentBlockIter) Error() error {
  1123  	return i.err
  1124  }
  1125  
  1126  // Close implements (keyspan.FragmentIterator).Close.
  1127  func (i *fragmentBlockIter) Close() error {
  1128  	var err error
  1129  	if i.closeHook != nil {
  1130  		err = i.closeHook(i)
  1131  	}
  1132  	err = firstError(err, i.blockIter.Close())
  1133  	return err
  1134  }
  1135  
  1136  // First implements (keyspan.FragmentIterator).First
  1137  func (i *fragmentBlockIter) First() *keyspan.Span {
  1138  	i.dir = +1
  1139  	return i.gatherForward(i.blockIter.First())
  1140  }
  1141  
  1142  // Last implements (keyspan.FragmentIterator).Last.
  1143  func (i *fragmentBlockIter) Last() *keyspan.Span {
  1144  	i.dir = -1
  1145  	return i.gatherBackward(i.blockIter.Last())
  1146  }
  1147  
  1148  // Next implements (keyspan.FragmentIterator).Next.
  1149  func (i *fragmentBlockIter) Next() *keyspan.Span {
  1150  	switch {
  1151  	case i.dir == -1 && !i.span.Valid():
  1152  		// Switching directions.
  1153  		//
  1154  		// i.blockIter is exhausted, before the first key. Move onto the first.
  1155  		i.blockIter.First()
  1156  		i.dir = +1
  1157  	case i.dir == -1 && i.span.Valid():
  1158  		// Switching directions.
  1159  		//
  1160  		// i.blockIter is currently positioned over the last internal key for
  1161  		// the previous span. Next it once to move to the first internal key
   1162  		// that makes up the current span, and gatherForward to land on the
  1163  		// first internal key making up the next span.
  1164  		//
  1165  		// In the diagram below, if the last span returned to the user during
  1166  		// reverse iteration was [b,c), i.blockIter is currently positioned at
  1167  		// [a,b). The block iter must be positioned over [d,e) to gather the
  1168  		// next span's fragments.
  1169  		//
  1170  		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
  1171  		//          ^                       ^
  1172  		//     i.blockIter                 want
  1173  		if x := i.gatherForward(i.blockIter.Next()); invariants.Enabled && !x.Valid() {
  1174  			panic("bitalostable: invariant violation: next entry unexpectedly invalid")
  1175  		}
  1176  		i.dir = +1
  1177  	}
  1178  	return i.gatherForward(&i.blockIter.ikey, i.blockIter.val)
  1179  }
  1180  
  1181  // Prev implements (keyspan.FragmentIterator).Prev.
  1182  func (i *fragmentBlockIter) Prev() *keyspan.Span {
  1183  	switch {
  1184  	case i.dir == +1 && !i.span.Valid():
  1185  		// Switching directions.
  1186  		//
  1187  		// i.blockIter is exhausted, after the last key. Move onto the last.
  1188  		i.blockIter.Last()
  1189  		i.dir = -1
  1190  	case i.dir == +1 && i.span.Valid():
  1191  		// Switching directions.
  1192  		//
  1193  		// i.blockIter is currently positioned over the first internal key for
  1194  		// the next span. Prev it once to move to the last internal key that
  1195  		// makes up the current span, and gatherBackward to land on the last
  1196  		// internal key making up the previous span.
  1197  		//
  1198  		// In the diagram below, if the last span returned to the user during
  1199  		// forward iteration was [b,c), i.blockIter is currently positioned at
  1200  		// [d,e). The block iter must be positioned over [a,b) to gather the
  1201  		// previous span's fragments.
  1202  		//
  1203  		//    ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
  1204  		//          ^                       ^
  1205  		//        want                  i.blockIter
  1206  		if x := i.gatherBackward(i.blockIter.Prev()); invariants.Enabled && !x.Valid() {
  1207  			panic("bitalostable: invariant violation: previous entry unexpectedly invalid")
  1208  		}
  1209  		i.dir = -1
  1210  	}
  1211  	return i.gatherBackward(&i.blockIter.ikey, i.blockIter.val)
  1212  }
  1213  
  1214  // SeekGE implements (keyspan.FragmentIterator).SeekGE.
  1215  func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span {
  1216  	i.dir = +1
  1217  	return i.gatherForward(i.blockIter.SeekGE(k, base.SeekGEFlags(0)))
  1218  }
  1219  
  1220  // SeekLT implements (keyspan.FragmentIterator).SeekLT.
  1221  func (i *fragmentBlockIter) SeekLT(k []byte) *keyspan.Span {
  1222  	i.dir = -1
  1223  	return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone))
  1224  }
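
         // collectSpans is an illustrative sketch, not part of the original file: it
         // drains every span from a fragmentBlockIter, copying the Keys slice because a
         // returned Span is only guaranteed stable until the next positioning call.
         func collectSpans(fi *fragmentBlockIter) []keyspan.Span {
         	var spans []keyspan.Span
         	for s := fi.First(); s != nil; s = fi.Next() {
         		spans = append(spans, keyspan.Span{
         			Start: s.Start,
         			End:   s.End,
         			Keys:  append([]keyspan.Key(nil), s.Keys...),
         		})
         	}
         	return spans
         }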
  1225  
  1226  // String implements fmt.Stringer.
  1227  func (i *fragmentBlockIter) String() string {
  1228  	return "fragment-block-iter"
  1229  }
  1230  
  1231  // SetCloseHook implements sstable.FragmentIterator.
  1232  func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) {
  1233  	i.closeHook = fn
  1234  }