github.com/cockroachdb/pebble@v1.1.2/sstable/value_block.go (about)

     1  // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"context"
     9  	"encoding/binary"
    10  	"io"
    11  	"sync"
    12  	"unsafe"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/pebble/internal/base"
    16  	"github.com/cockroachdb/pebble/internal/invariants"
    17  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
    18  	"golang.org/x/exp/rand"
    19  )
    20  
    21  // Value blocks are supported in TableFormatPebblev3.
    22  //
    23  // 1. Motivation and overview
    24  //
    25  // Value blocks are a mechanism designed for sstables storing MVCC data, where
    26  // there can be many versions of a key that need to be kept, but only the
    27  // latest value is typically read (see the documentation for Comparer.Split
    28  // regarding MVCC keys). The goal is faster reads. Unlike Pebble versions,
    29  // which can be eagerly thrown away (except when there are snapshots), MVCC
    30  // versions are long-lived (e.g. default CockroachDB garbage collection
    31  // threshold for older versions is 24 hours) and can significantly slow down
    32  // reads. We have seen CockroachDB production workloads with very slow reads
    33  // due to:
    34  // - 100s of versions for each key in a table.
    35  //
    36  // - Tables with mostly MVCC garbage consisting of 2 versions per key -- a
    37  //   real key-value pair, followed by a key-value pair whose value (usually
    38  //   with zero byte length) indicates it is an MVCC tombstone.
    39  //
    40  // The value blocks mechanism attempts to improve read throughput in these
    41  // cases when the key size is smaller than the value sizes of older versions.
    42  // This is done by moving the value of an older version to a value block in a
    43  // different part of the sstable. This improves spatial locality of the data
    44  // being read by the workload, which increases caching effectiveness.
    45  //
    46  // Additionally, even when the key size is not smaller than the value of older
    47  // versions (e.g. secondary indexes in CockroachDB), TableFormatPebblev3
    48  // stores the result of key comparisons done at write time inside the sstable,
    49  // which makes stepping from one key prefix to the next prefix (i.e., skipping
    50  // over older versions of a MVCC key) more efficient by avoiding key
    51  // comparisons and key decoding. See the results in
    52  // https://github.com/cockroachdb/pebble/pull/2149 and more details in the
    53  // comment inside BenchmarkIteratorScanNextPrefix. These improvements are also
    54  // visible in end-to-end CockroachDB tests, as outlined in
    55  // https://github.com/cockroachdb/cockroach/pull/96652.
    56  //
    57  // In TableFormatPebblev3, each SET has a one byte value prefix that tells us
    58  // whether the value is in-place or in a value block. This 1 byte prefix
    59  // encodes additional information:
    60  //
    61  // - ShortAttribute: This is an attribute of the value. Currently, CockroachDB
    62  //   uses it to represent whether the value is a tombstone or not. This avoids
    63  //   the need to fetch a value from the value block if the caller only wants
    64  //   to figure out whether it is an MVCC tombstone. The length of the value is
    65  //   another attribute that the caller can be interested in, and it is also
    66  //   accessible without reading the value in the value block (see the value
    67  //   handle in the details section).
    68  //
    69  // - SET-same-prefix: this enables the aforementioned optimization when
    70  //   stepping from one key prefix to the next key prefix.
    71  //
    72  // We further optimize this iteration over prefixes by using the restart
    73  // points in a block to encode whether the SET at a restart point has the same
    74  // prefix since the last restart point. This allows us to skip over restart
    75  // points within the same block. See the comment in blockWriter, and how both
    76  // SET-same-prefix and the restart point information is used in
    77  // blockIter.nextPrefixV3.
    78  //
    79  // This flexibility of values that are in-place or in value blocks requires
    80  // flexibility in the iterator interface. The InternalIterator interface
    81  // returns a LazyValue instead of a byte slice. Additionally, pebble.Iterator
    82  // allows the caller to ask for a LazyValue. See lazy_value.go for details,
    83  // including the memory lifetime management.
    84  //
    85  // For historical discussions about this feature, see the issue
    86  // https://github.com/cockroachdb/pebble/issues/1170 and the prototype in
    87  // https://github.com/cockroachdb/pebble/pull/1443.
    88  //
    89  // The code in this file mainly covers value block and related encodings. We
    90  // discuss these in the next section.
    91  //
    92  // 2. Details
    93  //
    94  // Note that the notion of the latest value is local to the sstable. It is
    95  // possible that the latest value has been deleted by a sstable in a higher
    96  // level, and what is the latest value from the perspective of the whole LSM
    97  // is an older MVCC version. This only affects performance and not
    98  // correctness. This local knowledge is also why we continue to store these
    99  // older versions in the same sstable -- we need to be able to conveniently
   100  // read them. The code in this file is agnostic to the policy regarding what
   101  // should be stored in value blocks -- it allows even the latest MVCC version
   102  // to be stored in a value block. The policy decision is made in the
   103  // sstable.Writer. See Writer.makeAddPointDecisionV3.
   104  //
   105  // Data blocks contain two kinds of SET keys: those with in-place values and
   106  // those with a value handle. To distinguish these two cases we use a single
   107  // byte prefix (valuePrefix). This single byte prefix is split into multiple
   108  // parts, where nb represents information that is encoded in n bits.
   109  //
   110  // +---------------+--------------------+-----------+--------------------+
   111  // | value-kind 2b | SET-same-prefix 1b | unused 2b | short-attribute 3b |
   112  // +---------------+--------------------+-----------+--------------------+
   113  //
   114  // The 2 bit value-kind specifies whether this is an in-place value or a value
   115  // handle pointing to a value block. We use 2 bits here for future
   116  // representation of values that are in separate files. The 1 bit
   117  // SET-same-prefix is true if this key is a SET and is immediately preceded by
   118  // a SET that shares the same prefix. The 3 bit short-attribute is described
   119  // in base.ShortAttribute -- it stores user-defined attributes about the
   120  // value. It is unused for in-place values.
   121  //
   122  // Value Handle and Value Blocks:
   123  // valueHandles refer to values in value blocks. Value blocks are simpler than
   124  // normal data blocks (that contain key-value pairs, and allow for binary
   125  // search), which makes them cheap for value retrieval purposes. A valueHandle
   126  // is a tuple (valueLen, blockNum, offsetInBlock), where blockNum is the 0
   127  // indexed value block number and offsetInBlock is the byte offset in that
   128  // block containing the value. The valueHandle.valueLen is included since
   129  // there are multiple use cases in CockroachDB that need the value length but
   130  // not the value, for which we can avoid reading the value in the value block
   131  // (see
   132  // https://github.com/cockroachdb/pebble/issues/1170#issuecomment-958203245).
   133  //
   134  // A value block has a checksum like other blocks, and is optionally
   135  // compressed. An uncompressed value block is a sequence of values with no
   136  // separator or length (we rely on the valueHandle to demarcate). The
   137  // valueHandle.offsetInBlock points to the value, of length
   138  // valueHandle.valueLen. While writing a sstable, all the (possibly
   139  // compressed) value blocks need to be held in-memory until they can be
   140  // written. Value blocks are placed after the "meta rangedel" and "meta range
   141  // key" blocks since value blocks are considered less likely to be read.
   142  //
   143  // Meta Value Index Block:
   144  // Since the (key, valueHandle) pair are written before there is any knowledge
   145  // of the byte offset of the value block in the file, or its compressed
   146  // length, we need another lookup to map the valueHandle.blockNum to the
   147  // information needed to read it from the file. This information is provided
   148  // by the "value index block". The "value index block" is referred to by the
   149  // metaindex block. The design intentionally avoids making the "value index
   150  // block" a general purpose key-value block, since each caller wants to lookup
   151  // the information for a particular blockNum (there is no need for SeekGE
   152  // etc.). Instead, this index block stores a sequence of (blockNum,
   153  // blockOffset, blockLength) tuples, where the blockNums are consecutive
   154  // integers, and the tuples are encoded with a fixed width encoding. This
   155  // allows a reader to find the tuple for block K by looking at the offset
   156  // K*fixed-width. The fixed width for each field is decided by looking at the
   157  // maximum value of each of these fields. As a concrete example of a large
   158  // sstable with many value blocks, we constructed a 100MB sstable with many
   159  // versions and had 2475 value blocks (~32KB each). This sstable had this
   160  // tuple encoded using 2+4+2=8 bytes, which means the uncompressed value index
   161  // block was 2475*8=~19KB, which is modest. Therefore, we don't support more
   162  // than one value index block. Consider the example of 2 byte blockNum, 4 byte
   163  // blockOffset and 2 byte blockLen. The value index block will look like:
   164  //
   165  //   +---------------+------------------+---------------+
   166  //   | blockNum (2B) | blockOffset (4B) | blockLen (2B) |
   167  //   +---------------+------------------+---------------+
   168  //   |       0       |    7,123,456     |  30,000       |
   169  //   +---------------+------------------+---------------+
   170  //   |       1       |    7,153,456     |  20,000       |
   171  //   +---------------+------------------+---------------+
   172  //   |       2       |    7,173,456     |  25,567       |
   173  //   +---------------+------------------+---------------+
   174  //   |     ....      |      ...         |    ...        |
   175  //
   176  //
   177  // The metaindex block contains the valueBlocksIndexHandle which in addition
   178  // to the BlockHandle also specifies the widths of these tuple fields. In the
   179  // above example, the
   180  // valueBlockIndexHandle.{blockNumByteLength,blockOffsetByteLength,blockLengthByteLength}
   181  // will be (2,4,2).
   182  
// valueHandle is stored with a key when the value is in a value block. This
// handle is the pointer to that value.
//
// The fields are:
//   - valueLen: the length of the value in bytes.
//   - blockNum: the 0-indexed number of the value block holding the value.
//   - offsetInBlock: the byte offset of the value within that block.
//
// All three fields are 32 bits wide, so a handle cannot describe a value
// whose length or offset needs more than 32 bits.
type valueHandle struct {
	valueLen      uint32
	blockNum      uint32
	offsetInBlock uint32
}
   190  
// valuePrefix is the single byte prefix for either the in-place value or the
// encoded valueHandle. It encodes multiple kinds of information: the
// value-kind, the SET-same-prefix bit and the user-defined short attribute
// (see the mask constants declared for this type).
type valuePrefix byte
   194  
// Bit layout of a valuePrefix, from most to least significant bit:
// value-kind (2 bits), SET-same-prefix (1 bit), unused (2 bits),
// short-attribute (3 bits).
const (
	// 2 most-significant bits of valuePrefix encodes the value-kind. Only two
	// of the four possible kinds are defined here: a value handle (0x80) and
	// an in-place value (0x00).
	valueKindMask           valuePrefix = '\xC0'
	valueKindIsValueHandle  valuePrefix = '\x80'
	valueKindIsInPlaceValue valuePrefix = '\x00'

	// 1 bit indicates SET has same key prefix as immediately preceding key that
	// is also a SET. If the immediately preceding key in the same block is a
	// SET, AND this bit is 0, the prefix must have changed.
	//
	// Note that the current policy of only storing older MVCC versions in value
	// blocks means that valueKindIsValueHandle => SET has same prefix. But no
	// code should rely on this behavior. Also, SET has same prefix does *not*
	// imply valueKindIsValueHandle.
	setHasSameKeyPrefixMask valuePrefix = '\x20'

	// 3 least-significant bits for the user-defined base.ShortAttribute.
	// Undefined for valueKindIsInPlaceValue.
	userDefinedShortAttributeMask valuePrefix = '\x07'
)
   215  
// valueHandle fields are varint encoded, so maximum 5 bytes each, plus 1 byte
// for the valuePrefix. This could alternatively be group varint encoded, but
// experiments were inconclusive
// (https://github.com/cockroachdb/pebble/pull/1443#issuecomment-1270298802).
const valueHandleMaxLen = 5*3 + 1

// Assert blockHandleLikelyMaxLen >= valueHandleMaxLen.
//
// This is a compile-time check: if the difference were negative, the constant
// conversion to uint would not be representable and the package would fail to
// compile.
const _ = uint(blockHandleLikelyMaxLen - valueHandleMaxLen)
   224  
   225  func encodeValueHandle(dst []byte, v valueHandle) int {
   226  	n := 0
   227  	n += binary.PutUvarint(dst[n:], uint64(v.valueLen))
   228  	n += binary.PutUvarint(dst[n:], uint64(v.blockNum))
   229  	n += binary.PutUvarint(dst[n:], uint64(v.offsetInBlock))
   230  	return n
   231  }
   232  
   233  func makePrefixForValueHandle(setHasSameKeyPrefix bool, attribute base.ShortAttribute) valuePrefix {
   234  	prefix := valueKindIsValueHandle | valuePrefix(attribute)
   235  	if setHasSameKeyPrefix {
   236  		prefix = prefix | setHasSameKeyPrefixMask
   237  	}
   238  	return prefix
   239  }
   240  
   241  func makePrefixForInPlaceValue(setHasSameKeyPrefix bool) valuePrefix {
   242  	prefix := valueKindIsInPlaceValue
   243  	if setHasSameKeyPrefix {
   244  		prefix = prefix | setHasSameKeyPrefixMask
   245  	}
   246  	return prefix
   247  }
   248  
   249  func isValueHandle(b valuePrefix) bool {
   250  	return b&valueKindMask == valueKindIsValueHandle
   251  }
   252  
   253  // REQUIRES: isValueHandle(b)
   254  func getShortAttribute(b valuePrefix) base.ShortAttribute {
   255  	return base.ShortAttribute(b & userDefinedShortAttributeMask)
   256  }
   257  
   258  func setHasSamePrefix(b valuePrefix) bool {
   259  	return b&setHasSameKeyPrefixMask == setHasSameKeyPrefixMask
   260  }
   261  
// decodeLenFromValueHandle decodes the uvarint at the start of src -- the
// valueLen field, which encodeValueHandle writes first -- and returns it
// together with the part of src that follows it.
//
// The uvarint decoding is unrolled by hand for encodings of 1 to 5 bytes.
// Taking &src[0] panics if src is empty. Bytes after the first are read
// through raw pointer arithmetic with no bounds check, so src must contain a
// complete encoded uvarint; only the final reslice of src is bounds-checked.
func decodeLenFromValueHandle(src []byte) (uint32, []byte) {
	ptr := unsafe.Pointer(&src[0])
	var v uint32
	// Each branch handles one encoding length. A byte below 128 has its
	// continuation bit clear and terminates the varint; earlier bytes have
	// their continuation bit stripped with &0x7f before being combined.
	if a := *((*uint8)(ptr)); a < 128 {
		v = uint32(a)
		src = src[1:]
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		v = uint32(b)<<7 | uint32(a)
		src = src[2:]
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		src = src[3:]
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		src = src[4:]
	} else {
		// Fifth byte: used as-is without checking its continuation bit; bits
		// that do not fit in 32 bits are dropped by the shift.
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		src = src[5:]
	}
	return v, src
}
   284  
// decodeRemainingValueHandle decodes the two uvarints at the start of src
// into blockNum and offsetInBlock, in that order, and returns them in a
// valueHandle whose valueLen is left at zero. src is expected to be the
// remainder returned by decodeLenFromValueHandle.
//
// Taking &src[0] panics if src is empty. All other bytes are read through raw
// pointer arithmetic with no bounds check, so src must contain both complete
// encoded uvarints.
func decodeRemainingValueHandle(src []byte) valueHandle {
	var vh valueHandle
	ptr := unsafe.Pointer(&src[0])
	// Manually inlined uvarint decoding. Saves ~25% in benchmarks. Unrolling
	// a loop for i:=0; i<2; i++, saves ~6%.
	var v uint32
	// First uvarint: blockNum. Each branch handles one encoding length (1 to
	// 5 bytes) and advances ptr past the bytes it consumed.
	if a := *((*uint8)(ptr)); a < 128 {
		v = uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 1)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		v = uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 2)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 3)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 4)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
		ptr = unsafe.Pointer(uintptr(ptr) + 5)
	}
	vh.blockNum = v

	// Second uvarint: offsetInBlock. Nothing follows it, so ptr is not
	// advanced.
	if a := *((*uint8)(ptr)); a < 128 {
		v = uint32(a)
	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
		v = uint32(b)<<7 | uint32(a)
	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
		v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
		v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
	} else {
		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
		v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
	}
	vh.offsetInBlock = v

	return vh
}
   326  
   327  func decodeValueHandle(src []byte) valueHandle {
   328  	valLen, src := decodeLenFromValueHandle(src)
   329  	vh := decodeRemainingValueHandle(src)
   330  	vh.valueLen = valLen
   331  	return vh
   332  }
   333  
// valueBlocksIndexHandle is placed in the metaindex if there are any value
// blocks. If there are no value blocks, there is no value blocks index, and
// no entry in the metaindex. Note that the lack of entry in the metaindex
// should not be used to ascertain whether the values are prefixed, since the
// former is an emergent property of the data that was written and not known
// until all the key-value pairs in the sstable are written.
//
// h is the BlockHandle of the value index block. The three byte-length fields
// are the fixed widths, in bytes, of the blockNum, blockOffset and blockLength
// fields of every tuple in that index block.
type valueBlocksIndexHandle struct {
	h                     BlockHandle
	blockNumByteLength    uint8
	blockOffsetByteLength uint8
	blockLengthByteLength uint8
}
   346  
// valueBlocksIndexHandleMaxLen is the maximum encoded length of a
// valueBlocksIndexHandle: an encoded BlockHandle followed by the three
// one-byte field widths.
const valueBlocksIndexHandleMaxLen = blockHandleMaxLenWithoutProperties + 3

// Assert blockHandleLikelyMaxLen >= valueBlocksIndexHandleMaxLen.
//
// This is a compile-time check: if the difference were negative, the constant
// conversion to uint would not be representable and the package would fail to
// compile.
const _ = uint(blockHandleLikelyMaxLen - valueBlocksIndexHandleMaxLen)
   351  
   352  func encodeValueBlocksIndexHandle(dst []byte, v valueBlocksIndexHandle) int {
   353  	n := encodeBlockHandle(dst, v.h)
   354  	dst[n] = v.blockNumByteLength
   355  	n++
   356  	dst[n] = v.blockOffsetByteLength
   357  	n++
   358  	dst[n] = v.blockLengthByteLength
   359  	n++
   360  	return n
   361  }
   362  
   363  func decodeValueBlocksIndexHandle(src []byte) (valueBlocksIndexHandle, int, error) {
   364  	var vbih valueBlocksIndexHandle
   365  	var n int
   366  	vbih.h, n = decodeBlockHandle(src)
   367  	if n <= 0 {
   368  		return vbih, 0, errors.Errorf("bad BlockHandle %x", src)
   369  	}
   370  	if len(src) != n+3 {
   371  		return vbih, 0, errors.Errorf("bad BlockHandle %x", src)
   372  	}
   373  	vbih.blockNumByteLength = src[n]
   374  	vbih.blockOffsetByteLength = src[n+1]
   375  	vbih.blockLengthByteLength = src[n+2]
   376  	return vbih, n + 3, nil
   377  }
   378  
// valueBlocksAndIndexStats summarizes what valueBlockWriter.finish wrote:
// the number of value blocks, the number of values added to them, and the
// total number of bytes written.
type valueBlocksAndIndexStats struct {
	numValueBlocks         uint64
	numValuesInValueBlocks uint64
	// Includes both value blocks and value index block, with their block
	// trailers.
	valueBlocksAndIndexSize uint64
}
   385  
// valueBlockWriter writes a sequence of value blocks, and the value blocks
// index, for a sstable. Values are accumulated with addValue; finished blocks
// are kept in memory until finish writes them, followed by the index.
// Writers are recycled through valueBlockWriterPool.
type valueBlockWriter struct {
	// The configured uncompressed block size and size threshold. addValue
	// uses them to decide when the current block is finished.
	blockSize, blockSizeThreshold int
	// Configured compression.
	compression Compression
	// checksummer with configured checksum type.
	checksummer checksummer
	// Block finished callback. It is called each time a value block is
	// finished, with the block's length excluding the block trailer.
	blockFinishedFunc func(compressedSize int)

	// buf is the current block being written to (uncompressed).
	buf *blockBuffer
	// compressedBuf is used for compressing the block.
	compressedBuf *blockBuffer
	// Sequence of blocks that are finished.
	blocks []blockAndHandle
	// Cumulative value block bytes written so far (including block trailers),
	// followed by the number of values passed to addValue.
	totalBlockBytes uint64
	numValues       uint64
}
   408  
// blockAndHandle is a finished value block held in memory by
// valueBlockWriter. block holds the block bytes including the block trailer.
// handle.Length excludes the trailer; handle.Offset is relative to the first
// value block until finish adds the file offset. compressed records whether
// block holds compressed data, which also determines the pool the buffer is
// returned to.
type blockAndHandle struct {
	block      *blockBuffer
	handle     BlockHandle
	compressed bool
}
   414  
// blockBuffer wraps a byte slice so that the slice can be recycled through a
// sync.Pool by pointer.
type blockBuffer struct {
	b []byte
}
   418  
   419  // Pool of block buffers that should be roughly the blockSize.
   420  var uncompressedValueBlockBufPool = sync.Pool{
   421  	New: func() interface{} {
   422  		return &blockBuffer{}
   423  	},
   424  }
   425  
   426  // Pool of block buffers for compressed value blocks. These may widely vary in
   427  // size based on compression ratios.
   428  var compressedValueBlockBufPool = sync.Pool{
   429  	New: func() interface{} {
   430  		return &blockBuffer{}
   431  	},
   432  }
   433  
   434  func releaseToValueBlockBufPool(pool *sync.Pool, b *blockBuffer) {
   435  	// Don't pool buffers larger than 128KB, in case we had some rare large
   436  	// values.
   437  	if len(b.b) > 128*1024 {
   438  		return
   439  	}
   440  	if invariants.Enabled {
   441  		// Set the bytes to a random value. Cap the number of bytes being
   442  		// randomized to prevent test timeouts.
   443  		length := cap(b.b)
   444  		if length > 1000 {
   445  			length = 1000
   446  		}
   447  		b.b = b.b[:length:length]
   448  		rand.Read(b.b)
   449  	}
   450  	pool.Put(b)
   451  }
   452  
   453  var valueBlockWriterPool = sync.Pool{
   454  	New: func() interface{} {
   455  		return &valueBlockWriter{}
   456  	},
   457  }
   458  
   459  func newValueBlockWriter(
   460  	blockSize int,
   461  	blockSizeThreshold int,
   462  	compression Compression,
   463  	checksumType ChecksumType,
   464  	// compressedSize should exclude the block trailer.
   465  	blockFinishedFunc func(compressedSize int),
   466  ) *valueBlockWriter {
   467  	w := valueBlockWriterPool.Get().(*valueBlockWriter)
   468  	*w = valueBlockWriter{
   469  		blockSize:          blockSize,
   470  		blockSizeThreshold: blockSizeThreshold,
   471  		compression:        compression,
   472  		checksummer: checksummer{
   473  			checksumType: checksumType,
   474  		},
   475  		blockFinishedFunc: blockFinishedFunc,
   476  		buf:               uncompressedValueBlockBufPool.Get().(*blockBuffer),
   477  		compressedBuf:     compressedValueBlockBufPool.Get().(*blockBuffer),
   478  		blocks:            w.blocks[:0],
   479  	}
   480  	w.buf.b = w.buf.b[:0]
   481  	w.compressedBuf.b = w.compressedBuf.b[:0]
   482  	return w
   483  }
   484  
   485  func releaseValueBlockWriter(w *valueBlockWriter) {
   486  	for i := range w.blocks {
   487  		if w.blocks[i].compressed {
   488  			releaseToValueBlockBufPool(&compressedValueBlockBufPool, w.blocks[i].block)
   489  		} else {
   490  			releaseToValueBlockBufPool(&uncompressedValueBlockBufPool, w.blocks[i].block)
   491  		}
   492  		w.blocks[i].block = nil
   493  	}
   494  	if w.buf != nil {
   495  		releaseToValueBlockBufPool(&uncompressedValueBlockBufPool, w.buf)
   496  	}
   497  	if w.compressedBuf != nil {
   498  		releaseToValueBlockBufPool(&compressedValueBlockBufPool, w.compressedBuf)
   499  	}
   500  	*w = valueBlockWriter{
   501  		blocks: w.blocks[:0],
   502  	}
   503  	valueBlockWriterPool.Put(w)
   504  }
   505  
   506  func (w *valueBlockWriter) addValue(v []byte) (valueHandle, error) {
   507  	if invariants.Enabled && len(v) == 0 {
   508  		return valueHandle{}, errors.Errorf("cannot write empty value to value block")
   509  	}
   510  	w.numValues++
   511  	blockLen := len(w.buf.b)
   512  	valueLen := len(v)
   513  	if blockLen >= w.blockSize ||
   514  		(blockLen > w.blockSizeThreshold && blockLen+valueLen > w.blockSize) {
   515  		// Block is not currently empty and adding this value will become too big,
   516  		// so finish this block.
   517  		w.compressAndFlush()
   518  		blockLen = len(w.buf.b)
   519  		if invariants.Enabled && blockLen != 0 {
   520  			panic("blockLen of new block should be 0")
   521  		}
   522  	}
   523  	vh := valueHandle{
   524  		valueLen:      uint32(valueLen),
   525  		blockNum:      uint32(len(w.blocks)),
   526  		offsetInBlock: uint32(blockLen),
   527  	}
   528  	blockLen = int(vh.offsetInBlock + vh.valueLen)
   529  	if cap(w.buf.b) < blockLen {
   530  		size := 2 * cap(w.buf.b)
   531  		if size < 1024 {
   532  			size = 1024
   533  		}
   534  		for size < blockLen {
   535  			size *= 2
   536  		}
   537  		buf := make([]byte, blockLen, size)
   538  		_ = copy(buf, w.buf.b)
   539  		w.buf.b = buf
   540  	} else {
   541  		w.buf.b = w.buf.b[:blockLen]
   542  	}
   543  	buf := w.buf.b[vh.offsetInBlock:]
   544  	n := copy(buf, v)
   545  	if n != len(buf) {
   546  		panic("incorrect length computation")
   547  	}
   548  	return vh, nil
   549  }
   550  
   551  func (w *valueBlockWriter) compressAndFlush() {
   552  	// Compress the buffer, discarding the result if the improvement isn't at
   553  	// least 12.5%.
   554  	blockType := noCompressionBlockType
   555  	b := w.buf
   556  	if w.compression != NoCompression {
   557  		blockType, w.compressedBuf.b =
   558  			compressBlock(w.compression, w.buf.b, w.compressedBuf.b[:cap(w.compressedBuf.b)])
   559  		if len(w.compressedBuf.b) < len(w.buf.b)-len(w.buf.b)/8 {
   560  			b = w.compressedBuf
   561  		} else {
   562  			blockType = noCompressionBlockType
   563  		}
   564  	}
   565  	n := len(b.b)
   566  	if n+blockTrailerLen > cap(b.b) {
   567  		block := make([]byte, n+blockTrailerLen)
   568  		copy(block, b.b)
   569  		b.b = block
   570  	} else {
   571  		b.b = b.b[:n+blockTrailerLen]
   572  	}
   573  	b.b[n] = byte(blockType)
   574  	w.computeChecksum(b.b)
   575  	bh := BlockHandle{Offset: w.totalBlockBytes, Length: uint64(n)}
   576  	w.totalBlockBytes += uint64(len(b.b))
   577  	// blockFinishedFunc length excludes the block trailer.
   578  	w.blockFinishedFunc(n)
   579  	compressed := blockType != noCompressionBlockType
   580  	w.blocks = append(w.blocks, blockAndHandle{
   581  		block:      b,
   582  		handle:     bh,
   583  		compressed: compressed,
   584  	})
   585  	// Handed off a buffer to w.blocks, so need get a new one.
   586  	if compressed {
   587  		w.compressedBuf = compressedValueBlockBufPool.Get().(*blockBuffer)
   588  	} else {
   589  		w.buf = uncompressedValueBlockBufPool.Get().(*blockBuffer)
   590  	}
   591  	w.buf.b = w.buf.b[:0]
   592  }
   593  
   594  func (w *valueBlockWriter) computeChecksum(block []byte) {
   595  	n := len(block) - blockTrailerLen
   596  	checksum := w.checksummer.checksum(block[:n], block[n:n+1])
   597  	binary.LittleEndian.PutUint32(block[n+1:], checksum)
   598  }
   599  
   600  func (w *valueBlockWriter) finish(
   601  	writer io.Writer, fileOffset uint64,
   602  ) (valueBlocksIndexHandle, valueBlocksAndIndexStats, error) {
   603  	if len(w.buf.b) > 0 {
   604  		w.compressAndFlush()
   605  	}
   606  	n := len(w.blocks)
   607  	if n == 0 {
   608  		return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, nil
   609  	}
   610  	largestOffset := uint64(0)
   611  	largestLength := uint64(0)
   612  	for i := range w.blocks {
   613  		_, err := writer.Write(w.blocks[i].block.b)
   614  		if err != nil {
   615  			return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, err
   616  		}
   617  		w.blocks[i].handle.Offset += fileOffset
   618  		largestOffset = w.blocks[i].handle.Offset
   619  		if largestLength < w.blocks[i].handle.Length {
   620  			largestLength = w.blocks[i].handle.Length
   621  		}
   622  	}
   623  	vbihOffset := fileOffset + w.totalBlockBytes
   624  
   625  	vbih := valueBlocksIndexHandle{
   626  		h: BlockHandle{
   627  			Offset: vbihOffset,
   628  		},
   629  		blockNumByteLength:    uint8(lenLittleEndian(uint64(n - 1))),
   630  		blockOffsetByteLength: uint8(lenLittleEndian(largestOffset)),
   631  		blockLengthByteLength: uint8(lenLittleEndian(largestLength)),
   632  	}
   633  	var err error
   634  	if vbih, err = w.writeValueBlocksIndex(writer, vbih); err != nil {
   635  		return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, err
   636  	}
   637  	stats := valueBlocksAndIndexStats{
   638  		numValueBlocks:          uint64(n),
   639  		numValuesInValueBlocks:  w.numValues,
   640  		valueBlocksAndIndexSize: w.totalBlockBytes + vbih.h.Length + blockTrailerLen,
   641  	}
   642  	return vbih, stats, err
   643  }
   644  
   645  func (w *valueBlockWriter) writeValueBlocksIndex(
   646  	writer io.Writer, h valueBlocksIndexHandle,
   647  ) (valueBlocksIndexHandle, error) {
   648  	blockLen :=
   649  		int(h.blockNumByteLength+h.blockOffsetByteLength+h.blockLengthByteLength) * len(w.blocks)
   650  	h.h.Length = uint64(blockLen)
   651  	blockLen += blockTrailerLen
   652  	var buf []byte
   653  	if cap(w.buf.b) < blockLen {
   654  		buf = make([]byte, blockLen)
   655  		w.buf.b = buf
   656  	} else {
   657  		buf = w.buf.b[:blockLen]
   658  	}
   659  	b := buf
   660  	for i := range w.blocks {
   661  		littleEndianPut(uint64(i), b, int(h.blockNumByteLength))
   662  		b = b[int(h.blockNumByteLength):]
   663  		littleEndianPut(w.blocks[i].handle.Offset, b, int(h.blockOffsetByteLength))
   664  		b = b[int(h.blockOffsetByteLength):]
   665  		littleEndianPut(w.blocks[i].handle.Length, b, int(h.blockLengthByteLength))
   666  		b = b[int(h.blockLengthByteLength):]
   667  	}
   668  	if len(b) != blockTrailerLen {
   669  		panic("incorrect length calculation")
   670  	}
   671  	b[0] = byte(noCompressionBlockType)
   672  	w.computeChecksum(buf)
   673  	if _, err := writer.Write(buf); err != nil {
   674  		return valueBlocksIndexHandle{}, err
   675  	}
   676  	return h, nil
   677  }
   678  
   679  // littleEndianPut writes v to b using little endian encoding, under the
   680  // assumption that v can be represented using n bytes.
   681  func littleEndianPut(v uint64, b []byte, n int) {
   682  	_ = b[n-1] // bounds check
   683  	for i := 0; i < n; i++ {
   684  		b[i] = byte(v)
   685  		v = v >> 8
   686  	}
   687  }
   688  
   689  // lenLittleEndian returns the minimum number of bytes needed to encode v
   690  // using little endian encoding.
   691  func lenLittleEndian(v uint64) int {
   692  	n := 0
   693  	for i := 0; i < 8; i++ {
   694  		n++
   695  		v = v >> 8
   696  		if v == 0 {
   697  			break
   698  		}
   699  	}
   700  	return n
   701  }
   702  
   703  func littleEndianGet(b []byte, n int) uint64 {
   704  	_ = b[n-1] // bounds check
   705  	v := uint64(b[0])
   706  	for i := 1; i < n; i++ {
   707  		v |= uint64(b[i]) << (8 * i)
   708  	}
   709  	return v
   710  }
   711  
   712  // UserKeyPrefixBound represents a [Lower,Upper) bound of user key prefixes.
   713  // If both are nil, there is no bound specified. Else, Compare(Lower,Upper)
   714  // must be < 0.
   715  type UserKeyPrefixBound struct {
   716  	// Lower is a lower bound user key prefix.
   717  	Lower []byte
   718  	// Upper is an upper bound user key prefix.
   719  	Upper []byte
   720  }
   721  
   722  // IsEmpty returns true iff the bound is empty.
   723  func (ukb *UserKeyPrefixBound) IsEmpty() bool {
   724  	return len(ukb.Lower) == 0 && len(ukb.Upper) == 0
   725  }
   726  
// blockProviderWhenOpen is the source of blocks for a valueBlockReader (its
// bpOpen field). The reader uses it to load both the value blocks index and
// the individual value blocks.
type blockProviderWhenOpen interface {
	// readBlockForVBR reads the block identified by h and returns a handle
	// to its contents. stats is forwarded to the read.
	readBlockForVBR(
		ctx context.Context, h BlockHandle, stats *base.InternalIteratorStats,
	) (bufferHandle, error)
}
   732  
// blockProviderWhenClosed reads blocks through a Reader that it acquires on
// demand from a ReaderProvider. valueBlockReader.Fetch uses it when the
// valueBlockReader has already been closed. rp is the provider; r is the
// Reader set by open and cleared by close.
type blockProviderWhenClosed struct {
	rp ReaderProvider
	r  *Reader
}
   737  
   738  func (bpwc *blockProviderWhenClosed) open() error {
   739  	var err error
   740  	bpwc.r, err = bpwc.rp.GetReader()
   741  	return err
   742  }
   743  
// close calls Close on the ReaderProvider, pairing with the GetReader call
// made by open, and then clears bpwc.r.
func (bpwc *blockProviderWhenClosed) close() {
	bpwc.rp.Close()
	bpwc.r = nil
}
   748  
   749  func (bpwc blockProviderWhenClosed) readBlockForVBR(
   750  	ctx context.Context, h BlockHandle, stats *base.InternalIteratorStats,
   751  ) (bufferHandle, error) {
   752  	ctx = objiotracing.WithBlockType(ctx, objiotracing.ValueBlock)
   753  	// TODO(jackson,sumeer): Consider whether to use a buffer pool in this case.
   754  	// The bpwc is not allowed to outlive the iterator tree, so it cannot
   755  	// outlive the buffer pool.
   756  	return bpwc.r.readBlock(ctx, h, nil, nil, stats, nil /* buffer pool */)
   757  }
   758  
// ReaderProvider supports the implementation of blockProviderWhenClosed.
// GetReader and Close can be called multiple times in pairs.
type ReaderProvider interface {
	// GetReader returns the Reader to read blocks from, or an error if one
	// cannot be provided.
	GetReader() (r *Reader, err error)
	// Close is called when the caller is done with the Reader returned by
	// the paired GetReader call.
	Close()
}
   765  
// TrivialReaderProvider implements ReaderProvider for a Reader that will
// outlive the top-level iterator in the iterator tree. It simply wraps the
// embedded Reader: GetReader returns it as-is and Close does nothing.
type TrivialReaderProvider struct {
	*Reader
}

// Compile-time check that TrivialReaderProvider satisfies ReaderProvider.
var _ ReaderProvider = TrivialReaderProvider{}
   773  
// GetReader implements ReaderProvider. It returns the embedded Reader and
// always reports a nil error.
func (trp TrivialReaderProvider) GetReader() (*Reader, error) {
	return trp.Reader, nil
}
   778  
// Close implements ReaderProvider. It is a no-op; the embedded Reader is
// left untouched.
func (trp TrivialReaderProvider) Close() {}
   781  
// valueBlockReader is used to retrieve values in value
// blocks. It is used when the sstable was written with
// Properties.ValueBlocksAreEnabled.
//
// While open, blocks are read through bpOpen. After close, Fetch can still
// serve values by temporarily acquiring a Reader through rp.
type valueBlockReader struct {
	// ctx is passed to every block read. bpOpen supplies blocks while the
	// reader is open and is set to nil by close. rp is used by Fetch to
	// acquire a Reader once the reader has been closed. vbih locates the
	// value blocks index block (vbih.h) and gives the byte widths of the
	// fields in each index entry. stats, when non-nil, accumulates the
	// SeparatedPointValue counters.
	ctx    context.Context
	bpOpen blockProviderWhenOpen
	rp     ReaderProvider
	vbih   valueBlocksIndexHandle
	stats  *base.InternalIteratorStats

	// The value blocks index is lazily retrieved the first time the reader
	// needs to read a value that resides in a value block.
	// vbiBlock holds the index contents and vbiCache is the handle that was
	// used to obtain them; close releases the handle.
	vbiBlock []byte
	vbiCache bufferHandle
	// When sequentially iterating through all key-value pairs, the cost of
	// repeatedly getting a block that is already in the cache and releasing the
	// bufferHandle can be ~40% of the cpu overhead. So the reader remembers the
	// last value block it retrieved, in case there is locality of access, and
	// this value block can be used for the next value retrieval.
	//
	// valueBlockNum is the number of the remembered block, valueBlock its
	// contents, valueBlockPtr the address of its first byte, and valueCache
	// the handle that was used to obtain it. lazyFetcher is the single
	// LazyFetcher handed out by getLazyValueForPrefixAndValueHandle; it is
	// overwritten on each call. closed is set by close. bufToMangle is the
	// copy of the last value returned by doValueMangling (invariants builds
	// only).
	valueBlockNum uint32
	valueBlock    []byte
	valueBlockPtr unsafe.Pointer
	valueCache    bufferHandle
	lazyFetcher   base.LazyFetcher
	closed        bool
	bufToMangle   []byte
}
   809  
   810  func (r *valueBlockReader) getLazyValueForPrefixAndValueHandle(handle []byte) base.LazyValue {
   811  	fetcher := &r.lazyFetcher
   812  	valLen, h := decodeLenFromValueHandle(handle[1:])
   813  	*fetcher = base.LazyFetcher{
   814  		Fetcher: r,
   815  		Attribute: base.AttributeAndLen{
   816  			ValueLen:       int32(valLen),
   817  			ShortAttribute: getShortAttribute(valuePrefix(handle[0])),
   818  		},
   819  	}
   820  	if r.stats != nil {
   821  		r.stats.SeparatedPointValue.Count++
   822  		r.stats.SeparatedPointValue.ValueBytes += uint64(valLen)
   823  	}
   824  	return base.LazyValue{
   825  		ValueOrHandle: h,
   826  		Fetcher:       fetcher,
   827  	}
   828  }
   829  
   830  func (r *valueBlockReader) close() {
   831  	r.bpOpen = nil
   832  	r.vbiBlock = nil
   833  	r.vbiCache.Release()
   834  	// Set the handle to empty since Release does not nil the Handle.value. If
   835  	// we were to reopen this valueBlockReader and retrieve the same
   836  	// Handle.value from the cache, we don't want to accidentally unref it when
   837  	// attempting to unref the old handle.
   838  	r.vbiCache = bufferHandle{}
   839  	r.valueBlock = nil
   840  	r.valueBlockPtr = nil
   841  	r.valueCache.Release()
   842  	// See comment above.
   843  	r.valueCache = bufferHandle{}
   844  	r.closed = true
   845  	// rp, vbih, stats remain valid, so that LazyFetcher.ValueFetcher can be
   846  	// implemented.
   847  }
   848  
   849  // Fetch implements base.ValueFetcher.
   850  func (r *valueBlockReader) Fetch(
   851  	handle []byte, valLen int32, buf []byte,
   852  ) (val []byte, callerOwned bool, err error) {
   853  	if !r.closed {
   854  		val, err := r.getValueInternal(handle, valLen)
   855  		if invariants.Enabled {
   856  			val = r.doValueMangling(val)
   857  		}
   858  		return val, false, err
   859  	}
   860  
   861  	bp := blockProviderWhenClosed{rp: r.rp}
   862  	err = bp.open()
   863  	if err != nil {
   864  		return nil, false, err
   865  	}
   866  	defer bp.close()
   867  	defer r.close()
   868  	r.bpOpen = bp
   869  	var v []byte
   870  	v, err = r.getValueInternal(handle, valLen)
   871  	if err != nil {
   872  		return nil, false, err
   873  	}
   874  	buf = append(buf[:0], v...)
   875  	return buf, true, nil
   876  }
   877  
   878  // doValueMangling attempts to uncover violations of the contract listed in
   879  // the declaration comment of LazyValue. It is expensive, hence only called
   880  // when invariants.Enabled.
   881  func (r *valueBlockReader) doValueMangling(v []byte) []byte {
   882  	// Randomly set the bytes in the previous retrieved value to 0, since
   883  	// property P1 only requires the valueBlockReader to maintain the memory of
   884  	// one fetched value.
   885  	if rand.Intn(2) == 0 {
   886  		for i := range r.bufToMangle {
   887  			r.bufToMangle[i] = 0
   888  		}
   889  	}
   890  	// Store the current value in a new buffer for future mangling.
   891  	r.bufToMangle = append([]byte(nil), v...)
   892  	return r.bufToMangle
   893  }
   894  
   895  func (r *valueBlockReader) getValueInternal(handle []byte, valLen int32) (val []byte, err error) {
   896  	vh := decodeRemainingValueHandle(handle)
   897  	vh.valueLen = uint32(valLen)
   898  	if r.vbiBlock == nil {
   899  		ch, err := r.bpOpen.readBlockForVBR(r.ctx, r.vbih.h, r.stats)
   900  		if err != nil {
   901  			return nil, err
   902  		}
   903  		r.vbiCache = ch
   904  		r.vbiBlock = ch.Get()
   905  	}
   906  	if r.valueBlock == nil || r.valueBlockNum != vh.blockNum {
   907  		vbh, err := r.getBlockHandle(vh.blockNum)
   908  		if err != nil {
   909  			return nil, err
   910  		}
   911  		vbCacheHandle, err := r.bpOpen.readBlockForVBR(r.ctx, vbh, r.stats)
   912  		if err != nil {
   913  			return nil, err
   914  		}
   915  		r.valueBlockNum = vh.blockNum
   916  		r.valueCache.Release()
   917  		r.valueCache = vbCacheHandle
   918  		r.valueBlock = vbCacheHandle.Get()
   919  		r.valueBlockPtr = unsafe.Pointer(&r.valueBlock[0])
   920  	}
   921  	if r.stats != nil {
   922  		r.stats.SeparatedPointValue.ValueBytesFetched += uint64(valLen)
   923  	}
   924  	return r.valueBlock[vh.offsetInBlock : vh.offsetInBlock+vh.valueLen], nil
   925  }
   926  
   927  func (r *valueBlockReader) getBlockHandle(blockNum uint32) (BlockHandle, error) {
   928  	indexEntryLen :=
   929  		int(r.vbih.blockNumByteLength + r.vbih.blockOffsetByteLength + r.vbih.blockLengthByteLength)
   930  	offsetInIndex := indexEntryLen * int(blockNum)
   931  	if len(r.vbiBlock) < offsetInIndex+indexEntryLen {
   932  		return BlockHandle{}, errors.Errorf(
   933  			"cannot read at offset %d and length %d from block of length %d",
   934  			offsetInIndex, indexEntryLen, len(r.vbiBlock))
   935  	}
   936  	b := r.vbiBlock[offsetInIndex : offsetInIndex+indexEntryLen]
   937  	n := int(r.vbih.blockNumByteLength)
   938  	bn := littleEndianGet(b, n)
   939  	if uint32(bn) != blockNum {
   940  		return BlockHandle{},
   941  			errors.Errorf("expected block num %d but found %d", blockNum, bn)
   942  	}
   943  	b = b[n:]
   944  	n = int(r.vbih.blockOffsetByteLength)
   945  	blockOffset := littleEndianGet(b, n)
   946  	b = b[n:]
   947  	n = int(r.vbih.blockLengthByteLength)
   948  	blockLen := littleEndianGet(b, n)
   949  	return BlockHandle{Offset: blockOffset, Length: blockLen}, nil
   950  }