github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/writer.go

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"fmt"
    11  	"math"
    12  	"runtime"
    13  	"sort"
    14  	"sync"
    15  
    16  	"github.com/cespare/xxhash/v2"
    17  	"github.com/cockroachdb/errors"
    18  	"github.com/cockroachdb/pebble/internal/base"
    19  	"github.com/cockroachdb/pebble/internal/bytealloc"
    20  	"github.com/cockroachdb/pebble/internal/cache"
    21  	"github.com/cockroachdb/pebble/internal/crc"
    22  	"github.com/cockroachdb/pebble/internal/invariants"
    23  	"github.com/cockroachdb/pebble/internal/keyspan"
    24  	"github.com/cockroachdb/pebble/internal/private"
    25  	"github.com/cockroachdb/pebble/internal/rangekey"
    26  	"github.com/cockroachdb/pebble/objstorage"
    27  )
    28  
    29  // encodedBHPEstimatedSize estimates the size of the encoded BlockHandleWithProperties.
    30  // It would also be nice to account for the length of the data block properties here,
     31  // but it isn't necessary since this is an estimate.
    32  const encodedBHPEstimatedSize = binary.MaxVarintLen64 * 2
    33  
    34  var errWriterClosed = errors.New("pebble: writer is closed")
    35  
    36  // WriterMetadata holds info about a finished sstable.
    37  type WriterMetadata struct {
    38  	Size          uint64
    39  	SmallestPoint InternalKey
    40  	// LargestPoint, LargestRangeKey, LargestRangeDel should not be accessed
    41  	// before Writer.Close is called, because they may only be set on
    42  	// Writer.Close.
    43  	LargestPoint     InternalKey
    44  	SmallestRangeDel InternalKey
    45  	LargestRangeDel  InternalKey
    46  	SmallestRangeKey InternalKey
    47  	LargestRangeKey  InternalKey
    48  	HasPointKeys     bool
    49  	HasRangeDelKeys  bool
    50  	HasRangeKeys     bool
    51  	SmallestSeqNum   uint64
    52  	LargestSeqNum    uint64
    53  	Properties       Properties
    54  }
    55  
    56  // SetSmallestPointKey sets the smallest point key to the given key.
     57  // NB: this method sets the "absolute" smallest point key. Any existing key is
    58  // overridden.
    59  func (m *WriterMetadata) SetSmallestPointKey(k InternalKey) {
    60  	m.SmallestPoint = k
    61  	m.HasPointKeys = true
    62  }
    63  
    64  // SetSmallestRangeDelKey sets the smallest rangedel key to the given key.
     65  // NB: this method sets the "absolute" smallest rangedel key. Any existing key is
    66  // overridden.
    67  func (m *WriterMetadata) SetSmallestRangeDelKey(k InternalKey) {
    68  	m.SmallestRangeDel = k
    69  	m.HasRangeDelKeys = true
    70  }
    71  
    72  // SetSmallestRangeKey sets the smallest range key to the given key.
     73  // NB: this method sets the "absolute" smallest range key. Any existing key is
    74  // overridden.
    75  func (m *WriterMetadata) SetSmallestRangeKey(k InternalKey) {
    76  	m.SmallestRangeKey = k
    77  	m.HasRangeKeys = true
    78  }
    79  
    80  // SetLargestPointKey sets the largest point key to the given key.
     81  // NB: this method sets the "absolute" largest point key. Any existing key is
    82  // overridden.
    83  func (m *WriterMetadata) SetLargestPointKey(k InternalKey) {
    84  	m.LargestPoint = k
    85  	m.HasPointKeys = true
    86  }
    87  
    88  // SetLargestRangeDelKey sets the largest rangedel key to the given key.
     89  // NB: this method sets the "absolute" largest rangedel key. Any existing key is
    90  // overridden.
    91  func (m *WriterMetadata) SetLargestRangeDelKey(k InternalKey) {
    92  	m.LargestRangeDel = k
    93  	m.HasRangeDelKeys = true
    94  }
    95  
    96  // SetLargestRangeKey sets the largest range key to the given key.
     97  // NB: this method sets the "absolute" largest range key. Any existing key is
    98  // overridden.
    99  func (m *WriterMetadata) SetLargestRangeKey(k InternalKey) {
   100  	m.LargestRangeKey = k
   101  	m.HasRangeKeys = true
   102  }
   103  
   104  func (m *WriterMetadata) updateSeqNum(seqNum uint64) {
   105  	if m.SmallestSeqNum > seqNum {
   106  		m.SmallestSeqNum = seqNum
   107  	}
   108  	if m.LargestSeqNum < seqNum {
   109  		m.LargestSeqNum = seqNum
   110  	}
   111  }
   112  
   113  // Writer is a table writer.
   114  type Writer struct {
   115  	writable objstorage.Writable
   116  	meta     WriterMetadata
   117  	err      error
   118  	// cacheID and fileNum are used to remove blocks written to the sstable from
   119  	// the cache, providing a defense in depth against bugs which cause cache
   120  	// collisions.
   121  	cacheID uint64
   122  	fileNum base.DiskFileNum
   123  	// The following fields are copied from Options.
   124  	blockSize               int
   125  	blockSizeThreshold      int
   126  	indexBlockSize          int
   127  	indexBlockSizeThreshold int
   128  	compare                 Compare
   129  	split                   Split
   130  	formatKey               base.FormatKey
   131  	compression             Compression
   132  	separator               Separator
   133  	successor               Successor
   134  	tableFormat             TableFormat
   135  	isStrictObsolete        bool
   136  	writingToLowestLevel    bool
   137  	cache                   *cache.Cache
   138  	restartInterval         int
   139  	checksumType            ChecksumType
   140  	// disableKeyOrderChecks disables the checks that keys are added to an
   141  	// sstable in order. It is intended for internal use only in the construction
   142  	// of invalid sstables for testing. See tool/make_test_sstables.go.
   143  	disableKeyOrderChecks bool
    144  	// With two level indexes, the index/filter of an SST file is partitioned into
   145  	// smaller blocks with an additional top-level index on them. When reading an
   146  	// index/filter, only the top-level index is loaded into memory. The two level
   147  	// index/filter then uses the top-level index to load on demand into the block
   148  	// cache the partitions that are required to perform the index/filter query.
   149  	//
   150  	// Two level indexes are enabled automatically when there is more than one
   151  	// index block.
   152  	//
   153  	// This is useful when there are very large index blocks, which generally occurs
   154  	// with the usage of large keys. With large index blocks, the index blocks fight
   155  	// the data blocks for block cache space and the index blocks are likely to be
   156  	// re-read many times from the disk. The top level index, which has a much
   157  	// smaller memory footprint, can be used to prevent the entire index block from
   158  	// being loaded into the block cache.
   159  	twoLevelIndex bool
   160  	// Internal flag to allow creation of range-del-v1 format blocks. Only used
   161  	// for testing. Note that v2 format blocks are backwards compatible with v1
   162  	// format blocks.
   163  	rangeDelV1Format    bool
   164  	indexBlock          *indexBlockBuf
   165  	rangeDelBlock       blockWriter
   166  	rangeKeyBlock       blockWriter
   167  	topLevelIndexBlock  blockWriter
   168  	props               Properties
   169  	propCollectors      []TablePropertyCollector
   170  	blockPropCollectors []BlockPropertyCollector
   171  	obsoleteCollector   obsoleteKeyBlockPropertyCollector
   172  	blockPropsEncoder   blockPropertiesEncoder
   173  	// filter accumulates the filter block. If populated, the filter ingests
   174  	// either the output of w.split (i.e. a prefix extractor) if w.split is not
   175  	// nil, or the full keys otherwise.
   176  	filter          filterWriter
   177  	indexPartitions []indexBlockAndBlockProperties
   178  
   179  	// indexBlockAlloc is used to bulk-allocate byte slices used to store index
   180  	// blocks in indexPartitions. These live until the index finishes.
   181  	indexBlockAlloc []byte
   182  	// indexSepAlloc is used to bulk-allocate index block separator slices stored
   183  	// in indexPartitions. These live until the index finishes.
   184  	indexSepAlloc bytealloc.A
   185  
    186  	// To allow potentially overlapping (i.e. un-fragmented) range key spans to
   187  	// be added to the Writer, a keyspan.Fragmenter is used to retain the keys
   188  	// and values, emitting fragmented, coalesced spans as appropriate. Range
   189  	// keys must be added in order of their start user-key.
   190  	fragmenter        keyspan.Fragmenter
   191  	rangeKeyEncoder   rangekey.Encoder
   192  	rangeKeysBySuffix keyspan.KeysBySuffix
   193  	rangeKeySpan      keyspan.Span
   194  	rkBuf             []byte
   195  	// dataBlockBuf consists of the state which is currently owned by and used by
   196  	// the Writer client goroutine. This state can be handed off to other goroutines.
   197  	dataBlockBuf *dataBlockBuf
   198  	// blockBuf consists of the state which is owned by and used by the Writer client
   199  	// goroutine.
   200  	blockBuf blockBuf
   201  
   202  	coordination coordinationState
   203  
   204  	// Information (other than the byte slice) about the last point key, to
   205  	// avoid extracting it again.
   206  	lastPointKeyInfo pointKeyInfo
   207  
   208  	// For value blocks.
   209  	shortAttributeExtractor   base.ShortAttributeExtractor
   210  	requiredInPlaceValueBound UserKeyPrefixBound
   211  	valueBlockWriter          *valueBlockWriter
   212  }
   213  
   214  type pointKeyInfo struct {
   215  	trailer uint64
   216  	// Only computed when w.valueBlockWriter is not nil.
   217  	userKeyLen int
   218  	// prefixLen uses w.split, if not nil. Only computed when w.valueBlockWriter
   219  	// is not nil.
   220  	prefixLen int
   221  	// True iff the point was marked obsolete.
   222  	isObsolete bool
   223  }
   224  
   225  type coordinationState struct {
   226  	parallelismEnabled bool
   227  
   228  	// writeQueue is used to write data blocks to disk. The writeQueue is primarily
   229  	// used to maintain the order in which data blocks must be written to disk. For
   230  	// this reason, every single data block write must be done through the writeQueue.
   231  	writeQueue *writeQueue
   232  
   233  	sizeEstimate dataBlockEstimates
   234  }
   235  
   236  func (c *coordinationState) init(parallelismEnabled bool, writer *Writer) {
   237  	c.parallelismEnabled = parallelismEnabled
   238  	// useMutex is false regardless of parallelismEnabled, because we do not do
   239  	// parallel compression yet.
   240  	c.sizeEstimate.useMutex = false
   241  
   242  	// writeQueueSize determines the size of the write queue, or the number
   243  	// of items which can be added to the queue without blocking. By default, we
   244  	// use a writeQueue size of 0, since we won't be doing any block writes in
   245  	// parallel.
   246  	writeQueueSize := 0
   247  	if parallelismEnabled {
   248  		writeQueueSize = runtime.GOMAXPROCS(0)
   249  	}
   250  	c.writeQueue = newWriteQueue(writeQueueSize, writer)
   251  }
   252  
   253  // sizeEstimate is a general purpose helper for estimating two kinds of sizes:
   254  // A. The compressed sstable size, which is useful for deciding when to start
   255  //
   256  //	a new sstable during flushes or compactions. In practice, we use this in
   257  //	estimating the data size (excluding the index).
   258  //
   259  // B. The size of index blocks to decide when to start a new index block.
   260  //
   261  // There are some terminology peculiarities which are due to the origin of
   262  // sizeEstimate for use case A with parallel compression enabled (for which
   263  // the code has not been merged). Specifically this relates to the terms
   264  // "written" and "compressed".
   265  //   - The notion of "written" for case A is sufficiently defined by saying that
   266  //     the data block is compressed. Waiting for the actual data block write to
   267  //     happen can result in unnecessary estimation, when we already know how big
   268  //     it will be in compressed form. Additionally, with the forthcoming value
    269  //     blocks containing older MVCC values, these compressed blocks will be held
    270  //     in memory until late in the sstable writing, and we do want to accurately
   271  //     account for them without waiting for the actual write.
   272  //     For case B, "written" means that the index entry has been fully
   273  //     generated, and has been added to the uncompressed block buffer for that
   274  //     index block. It does not include actually writing a potentially
   275  //     compressed index block.
    276  //   - The notion of "compressed" is to differentiate between an "inflight" size
   277  //     and the actual size, and is handled via computing a compression ratio
   278  //     observed so far (defaults to 1).
   279  //     For case A, this is actual data block compression, so the "inflight" size
   280  //     is uncompressed blocks (that are no longer being written to) and the
   281  //     "compressed" size is after they have been compressed.
   282  //     For case B the inflight size is for a key-value pair in the index for
   283  //     which the value size (the encoded size of the BlockHandleWithProperties)
   284  //     is not accurately known, while the compressed size is the size of that
    285  //     entry when it has been added to the (in-progress) index block.
   286  //
   287  // Usage: To update state, one can optionally provide an inflight write value
   288  // using addInflight (used for case B). When something is "written" the state
   289  // can be updated using either writtenWithDelta or writtenWithTotal, which
   290  // provide the actual delta size or the total size (latter must be
    291  // monotonically non-decreasing). If there are no calls to addInflight, there
   292  // isn't any real estimation happening here. So case A does not do any real
   293  // estimation. However, when we introduce parallel compression, there will be
   294  // estimation in that the client goroutine will call addInFlight and the
   295  // compression goroutines will call writtenWithDelta.
   296  type sizeEstimate struct {
   297  	// emptySize is the size when there is no inflight data, and numEntries is 0.
   298  	// emptySize is constant once set.
   299  	emptySize uint64
   300  
   301  	// inflightSize is the estimated size of some inflight data which hasn't
   302  	// been written yet.
   303  	inflightSize uint64
   304  
   305  	// totalSize is the total size of the data which has already been written.
   306  	totalSize uint64
   307  
   308  	// numWrittenEntries is the total number of entries which have already been
   309  	// written.
   310  	numWrittenEntries uint64
   311  	// numInflightEntries is the total number of entries which are inflight, and
   312  	// haven't been written.
   313  	numInflightEntries uint64
   314  
   315  	// maxEstimatedSize stores the maximum result returned from sizeEstimate.size.
   316  	// It ensures that values returned from subsequent calls to Writer.EstimatedSize
   317  	// never decrease.
   318  	maxEstimatedSize uint64
   319  
   320  	// We assume that the entries added to the sizeEstimate can be compressed.
   321  	// For this reason, we keep track of a compressedSize and an uncompressedSize
   322  	// to compute a compression ratio for the inflight entries. If the entries
   323  	// aren't being compressed, then compressedSize and uncompressedSize must be
   324  	// equal.
   325  	compressedSize   uint64
   326  	uncompressedSize uint64
   327  }
   328  
   329  func (s *sizeEstimate) init(emptySize uint64) {
   330  	s.emptySize = emptySize
   331  }
   332  
   333  func (s *sizeEstimate) size() uint64 {
   334  	ratio := float64(1)
   335  	if s.uncompressedSize > 0 {
   336  		ratio = float64(s.compressedSize) / float64(s.uncompressedSize)
   337  	}
   338  	estimatedInflightSize := uint64(float64(s.inflightSize) * ratio)
   339  	total := s.totalSize + estimatedInflightSize
   340  	if total > s.maxEstimatedSize {
   341  		s.maxEstimatedSize = total
   342  	} else {
   343  		total = s.maxEstimatedSize
   344  	}
   345  
   346  	if total == 0 {
   347  		return s.emptySize
   348  	}
   349  
   350  	return total
   351  }
   352  
   353  func (s *sizeEstimate) numTotalEntries() uint64 {
   354  	return s.numWrittenEntries + s.numInflightEntries
   355  }
   356  
   357  func (s *sizeEstimate) addInflight(size int) {
   358  	s.numInflightEntries++
   359  	s.inflightSize += uint64(size)
   360  }
   361  
   362  func (s *sizeEstimate) writtenWithTotal(newTotalSize uint64, inflightSize int) {
   363  	finalEntrySize := int(newTotalSize - s.totalSize)
   364  	s.writtenWithDelta(finalEntrySize, inflightSize)
   365  }
   366  
   367  func (s *sizeEstimate) writtenWithDelta(finalEntrySize int, inflightSize int) {
   368  	if inflightSize > 0 {
   369  		// This entry was previously inflight, so we should decrement inflight
   370  		// entries and update the "compression" stats for future estimation.
   371  		s.numInflightEntries--
   372  		s.inflightSize -= uint64(inflightSize)
   373  		s.uncompressedSize += uint64(inflightSize)
   374  		s.compressedSize += uint64(finalEntrySize)
   375  	}
   376  	s.numWrittenEntries++
   377  	s.totalSize += uint64(finalEntrySize)
   378  }
   379  
   380  func (s *sizeEstimate) clear() {
   381  	*s = sizeEstimate{emptySize: s.emptySize}
   382  }
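// Illustrative sketch (not part of the original source): a worked example of
// the inflight/compression-ratio arithmetic described above. Suppose two
// 100-byte entries are added as inflight and are later written at 60 and 40
// bytes respectively:
//
//	var s sizeEstimate
//	s.init(emptyBlockSize)
//	s.addInflight(100)          // inflightSize = 100
//	s.addInflight(100)          // inflightSize = 200
//	s.writtenWithDelta(60, 100) // totalSize = 60, observed ratio = 60/100 = 0.6
//	_ = s.size()                // 60 + 100*0.6 = 120
//	s.writtenWithDelta(40, 100) // totalSize = 100, observed ratio = 100/200 = 0.5
//	_ = s.size()                // max(100, 120) = 120; maxEstimatedSize keeps results monotone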
   383  
   384  type indexBlockBuf struct {
   385  	// block will only be accessed from the writeQueue.
   386  	block blockWriter
   387  
   388  	size struct {
   389  		useMutex bool
   390  		mu       sync.Mutex
   391  		estimate sizeEstimate
   392  	}
   393  
   394  	// restartInterval matches indexBlockBuf.block.restartInterval. We store it twice, because the `block`
   395  	// must only be accessed from the writeQueue goroutine.
   396  	restartInterval int
   397  }
   398  
   399  func (i *indexBlockBuf) clear() {
   400  	i.block.clear()
   401  	if i.size.useMutex {
   402  		i.size.mu.Lock()
   403  		defer i.size.mu.Unlock()
   404  	}
   405  	i.size.estimate.clear()
   406  	i.restartInterval = 0
   407  }
   408  
   409  var indexBlockBufPool = sync.Pool{
   410  	New: func() interface{} {
   411  		return &indexBlockBuf{}
   412  	},
   413  }
   414  
   415  const indexBlockRestartInterval = 1
   416  
   417  func newIndexBlockBuf(useMutex bool) *indexBlockBuf {
   418  	i := indexBlockBufPool.Get().(*indexBlockBuf)
   419  	i.size.useMutex = useMutex
   420  	i.restartInterval = indexBlockRestartInterval
   421  	i.block.restartInterval = indexBlockRestartInterval
   422  	i.size.estimate.init(emptyBlockSize)
   423  	return i
   424  }
   425  
   426  func (i *indexBlockBuf) shouldFlush(
   427  	sep InternalKey, valueLen, targetBlockSize, sizeThreshold int,
   428  ) bool {
   429  	if i.size.useMutex {
   430  		i.size.mu.Lock()
   431  		defer i.size.mu.Unlock()
   432  	}
   433  
   434  	nEntries := i.size.estimate.numTotalEntries()
   435  	return shouldFlush(
   436  		sep, valueLen, i.restartInterval, int(i.size.estimate.size()),
   437  		int(nEntries), targetBlockSize, sizeThreshold)
   438  }
   439  
   440  func (i *indexBlockBuf) add(key InternalKey, value []byte, inflightSize int) {
   441  	i.block.add(key, value)
   442  	size := i.block.estimatedSize()
   443  	if i.size.useMutex {
   444  		i.size.mu.Lock()
   445  		defer i.size.mu.Unlock()
   446  	}
   447  	i.size.estimate.writtenWithTotal(uint64(size), inflightSize)
   448  }
   449  
   450  func (i *indexBlockBuf) finish() []byte {
   451  	b := i.block.finish()
   452  	return b
   453  }
   454  
   455  func (i *indexBlockBuf) addInflight(inflightSize int) {
   456  	if i.size.useMutex {
   457  		i.size.mu.Lock()
   458  		defer i.size.mu.Unlock()
   459  	}
   460  	i.size.estimate.addInflight(inflightSize)
   461  }
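// Illustrative sketch (not part of the original source): the "case B" flow
// from the sizeEstimate comment, applied to index entries. When a data block
// is finished, the size of its index entry is not yet known precisely, so an
// estimate is registered as inflight and finalized once the encoded
// BlockHandleWithProperties is actually added to the index block. sep and
// encodedBHP are placeholder names for the separator key and encoded handle:
//
//	i.addInflight(encodedBHPEstimatedSize)
//	// ... later, once the block handle and properties have been encoded ...
//	i.add(sep, encodedBHP, encodedBHPEstimatedSize)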
   462  
   463  func (i *indexBlockBuf) estimatedSize() uint64 {
   464  	if i.size.useMutex {
   465  		i.size.mu.Lock()
   466  		defer i.size.mu.Unlock()
   467  	}
   468  
   469  	// Make sure that the size estimation works as expected when parallelism
   470  	// is disabled.
   471  	if invariants.Enabled && !i.size.useMutex {
   472  		if i.size.estimate.inflightSize != 0 {
   473  			panic("unexpected inflight entry in index block size estimation")
   474  		}
   475  
   476  		// NB: The i.block should only be accessed from the writeQueue goroutine,
   477  		// when parallelism is enabled. We break that invariant here, but that's
   478  		// okay since parallelism is disabled.
   479  		if i.size.estimate.size() != uint64(i.block.estimatedSize()) {
   480  			panic("index block size estimation sans parallelism is incorrect")
   481  		}
   482  	}
   483  	return i.size.estimate.size()
   484  }
   485  
    486  // dataBlockEstimates is used for sstable data size estimation. It can be
    487  // accessed by the Writer client and compressionQueue goroutines. Fields
    488  // should only be read/updated through the functions defined on the
    489  // *dataBlockEstimates type.
   490  type dataBlockEstimates struct {
   491  	// If we don't do block compression in parallel, then we don't need to take
   492  	// the performance hit of synchronizing using this mutex.
   493  	useMutex bool
   494  	mu       sync.Mutex
   495  
   496  	estimate sizeEstimate
   497  }
   498  
   499  // inflightSize is the uncompressed block size estimate which has been
   500  // previously provided to addInflightDataBlock(). If addInflightDataBlock()
   501  // has not been called, this must be set to 0. compressedSize is the
   502  // compressed size of the block.
   503  func (d *dataBlockEstimates) dataBlockCompressed(compressedSize int, inflightSize int) {
   504  	if d.useMutex {
   505  		d.mu.Lock()
   506  		defer d.mu.Unlock()
   507  	}
   508  	d.estimate.writtenWithDelta(compressedSize+blockTrailerLen, inflightSize)
   509  }
   510  
   511  // size is an estimated size of datablock data which has been written to disk.
   512  func (d *dataBlockEstimates) size() uint64 {
   513  	if d.useMutex {
   514  		d.mu.Lock()
   515  		defer d.mu.Unlock()
   516  	}
   517  	// If there is no parallel compression, there should not be any inflight bytes.
   518  	if invariants.Enabled && !d.useMutex {
   519  		if d.estimate.inflightSize != 0 {
   520  			panic("unexpected inflight entry in data block size estimation")
   521  		}
   522  	}
   523  	return d.estimate.size()
   524  }
   525  
   526  // Avoid linter unused error.
   527  var _ = (&dataBlockEstimates{}).addInflightDataBlock
   528  
   529  // NB: unused since no parallel compression.
   530  func (d *dataBlockEstimates) addInflightDataBlock(size int) {
   531  	if d.useMutex {
   532  		d.mu.Lock()
   533  		defer d.mu.Unlock()
   534  	}
   535  
   536  	d.estimate.addInflight(size)
   537  }
   538  
   539  var writeTaskPool = sync.Pool{
   540  	New: func() interface{} {
   541  		t := &writeTask{}
   542  		t.compressionDone = make(chan bool, 1)
   543  		return t
   544  	},
   545  }
   546  
   547  type checksummer struct {
   548  	checksumType ChecksumType
   549  	xxHasher     *xxhash.Digest
   550  }
   551  
   552  func (c *checksummer) checksum(block []byte, blockType []byte) (checksum uint32) {
   553  	// Calculate the checksum.
   554  	switch c.checksumType {
   555  	case ChecksumTypeCRC32c:
   556  		checksum = crc.New(block).Update(blockType).Value()
   557  	case ChecksumTypeXXHash64:
   558  		if c.xxHasher == nil {
   559  			c.xxHasher = xxhash.New()
   560  		} else {
   561  			c.xxHasher.Reset()
   562  		}
   563  		c.xxHasher.Write(block)
   564  		c.xxHasher.Write(blockType)
   565  		checksum = uint32(c.xxHasher.Sum64())
   566  	default:
   567  		panic(errors.Newf("unsupported checksum type: %d", c.checksumType))
   568  	}
   569  	return checksum
   570  }
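// Illustrative sketch (not part of the original source): how a block trailer
// checksum could be produced with the checksummer above. The trailer layout
// assumed here (one block-type/compression byte followed by a 4-byte
// checksum) is an assumption; see the block-writing code for the
// authoritative layout.
//
//	c := checksummer{checksumType: ChecksumTypeCRC32c}
//	var trailer [blockTrailerLen]byte
//	trailer[0] = 0 // block type / compression indicator byte (assumed value)
//	sum := c.checksum(blockData, trailer[:1])
//	binary.LittleEndian.PutUint32(trailer[1:5], sum)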
   571  
   572  type blockBuf struct {
    573  	// tmp is a scratch buffer, large enough to hold footerLen bytes,
    574  	// blockTrailerLen bytes, or (5 * binary.MaxVarintLen64) bytes, and is most
    575  	// likely large enough for a block handle with properties.
   576  	tmp [blockHandleLikelyMaxLen]byte
   577  	// compressedBuf is the destination buffer for compression. It is re-used over the
   578  	// lifetime of the blockBuf, avoiding the allocation of a temporary buffer for each block.
   579  	compressedBuf []byte
   580  	checksummer   checksummer
   581  }
   582  
   583  func (b *blockBuf) clear() {
   584  	// We can't assign b.compressedBuf[:0] to compressedBuf because snappy relies
    585  	// on the length of the buffer, and not the capacity, to determine if it needs
   586  	// to make an allocation.
   587  	*b = blockBuf{
   588  		compressedBuf: b.compressedBuf, checksummer: b.checksummer,
   589  	}
   590  }
   591  
   592  // A dataBlockBuf holds all the state required to compress and write a data block to disk.
   593  // A dataBlockBuf begins its lifecycle owned by the Writer client goroutine. The Writer
   594  // client goroutine adds keys to the sstable, writing directly into a dataBlockBuf's blockWriter
   595  // until the block is full. Once a dataBlockBuf's block is full, the dataBlockBuf may be passed
   596  // to other goroutines for compression and file I/O.
   597  type dataBlockBuf struct {
   598  	blockBuf
   599  	dataBlock blockWriter
   600  
   601  	// uncompressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the
   602  	// next byte slice to be compressed. The uncompressed byte slice will be backed by the
   603  	// dataBlock.buf.
   604  	uncompressed []byte
   605  	// compressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the
   606  	// compressed byte slice which must be written to disk. The compressed byte slice may be
   607  	// backed by the dataBlock.buf, or the dataBlockBuf.compressedBuf, depending on whether
   608  	// we use the result of the compression.
   609  	compressed []byte
   610  
    611  	// We're making calls to BlockPropertyCollectors from the Writer client goroutine. We need to
    612  	// pass the encoded block properties over to the write queue. To prevent copies and allocations,
    613  	// we give each dataBlockBuf a blockPropertiesEncoder.
   614  	blockPropsEncoder blockPropertiesEncoder
   615  	// dataBlockProps is set when Writer.finishDataBlockProps is called. The dataBlockProps slice is
   616  	// a shallow copy of the internal buffer of the dataBlockBuf.blockPropsEncoder.
   617  	dataBlockProps []byte
   618  
   619  	// sepScratch is reusable scratch space for computing separator keys.
   620  	sepScratch []byte
   621  }
   622  
   623  func (d *dataBlockBuf) clear() {
   624  	d.blockBuf.clear()
   625  	d.dataBlock.clear()
   626  
   627  	d.uncompressed = nil
   628  	d.compressed = nil
   629  	d.dataBlockProps = nil
   630  	d.sepScratch = d.sepScratch[:0]
   631  }
   632  
   633  var dataBlockBufPool = sync.Pool{
   634  	New: func() interface{} {
   635  		return &dataBlockBuf{}
   636  	},
   637  }
   638  
   639  func newDataBlockBuf(restartInterval int, checksumType ChecksumType) *dataBlockBuf {
   640  	d := dataBlockBufPool.Get().(*dataBlockBuf)
   641  	d.dataBlock.restartInterval = restartInterval
   642  	d.checksummer.checksumType = checksumType
   643  	return d
   644  }
   645  
   646  func (d *dataBlockBuf) finish() {
   647  	d.uncompressed = d.dataBlock.finish()
   648  }
   649  
   650  func (d *dataBlockBuf) compressAndChecksum(c Compression) {
   651  	d.compressed = compressAndChecksum(d.uncompressed, c, &d.blockBuf)
   652  }
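// Illustrative sketch (not part of the original source): the dataBlockBuf
// lifecycle described above. The Writer goroutine fills d.dataBlock until the
// block is deemed full, seals it with finish, and then (potentially on a
// write-queue goroutine) compresses and checksums it before the file write.
// restartInterval and compression stand in for the Writer's configured values:
//
//	d := newDataBlockBuf(restartInterval, ChecksumTypeCRC32c)
//	// ... keys are added to d.dataBlock until shouldFlush reports true ...
//	d.finish()                         // d.uncompressed now references d.dataBlock.buf
//	d.compressAndChecksum(compression) // d.compressed is ready for the writeQueue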
   653  
   654  func (d *dataBlockBuf) shouldFlush(
   655  	key InternalKey, valueLen, targetBlockSize, sizeThreshold int,
   656  ) bool {
   657  	return shouldFlush(
   658  		key, valueLen, d.dataBlock.restartInterval, d.dataBlock.estimatedSize(),
   659  		d.dataBlock.nEntries, targetBlockSize, sizeThreshold)
   660  }
   661  
   662  type indexBlockAndBlockProperties struct {
   663  	nEntries int
   664  	// sep is the last key added to this block, for computing a separator later.
   665  	sep        InternalKey
   666  	properties []byte
   667  	// block is the encoded block produced by blockWriter.finish.
   668  	block []byte
   669  }
   670  
   671  // Set sets the value for the given key. The sequence number is set to 0.
   672  // Intended for use to externally construct an sstable before ingestion into a
   673  // DB. For a given Writer, the keys passed to Set must be in strictly increasing
   674  // order.
   675  //
   676  // TODO(peter): untested
   677  func (w *Writer) Set(key, value []byte) error {
   678  	if w.err != nil {
   679  		return w.err
   680  	}
   681  	if w.isStrictObsolete {
   682  		return errors.Errorf("use AddWithForceObsolete")
   683  	}
   684  	// forceObsolete is false based on the assumption that no RANGEDELs in the
   685  	// sstable delete the added points.
   686  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value, false)
   687  }
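// Illustrative sketch (not part of the original source): externally building a
// small sstable for ingestion with the convenience methods above. NewWriter
// and WriterOptions are assumed from the surrounding package (they are not
// defined in this file); keys must be supplied in strictly increasing order.
//
//	w := NewWriter(writable, WriterOptions{TableFormat: TableFormatPebblev4})
//	_ = w.Set([]byte("a"), []byte("v1"))
//	_ = w.Set([]byte("b"), []byte("v2"))
//	_ = w.DeleteRange([]byte("c"), []byte("d"))
//	if err := w.Close(); err != nil {
//		// handle error
//	}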
   688  
   689  // Delete deletes the value for the given key. The sequence number is set to
   690  // 0. Intended for use to externally construct an sstable before ingestion into
   691  // a DB.
   692  //
   693  // TODO(peter): untested
   694  func (w *Writer) Delete(key []byte) error {
   695  	if w.err != nil {
   696  		return w.err
   697  	}
   698  	if w.isStrictObsolete {
   699  		return errors.Errorf("use AddWithForceObsolete")
   700  	}
   701  	// forceObsolete is false based on the assumption that no RANGEDELs in the
   702  	// sstable delete the added points.
   703  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil, false)
   704  }
   705  
   706  // DeleteRange deletes all of the keys (and values) in the range [start,end)
   707  // (inclusive on start, exclusive on end). The sequence number is set to
   708  // 0. Intended for use to externally construct an sstable before ingestion into
   709  // a DB.
   710  //
   711  // TODO(peter): untested
   712  func (w *Writer) DeleteRange(start, end []byte) error {
   713  	if w.err != nil {
   714  		return w.err
   715  	}
   716  	return w.addTombstone(base.MakeInternalKey(start, 0, InternalKeyKindRangeDelete), end)
   717  }
   718  
   719  // Merge adds an action to the DB that merges the value at key with the new
   720  // value. The details of the merge are dependent upon the configured merge
   721  // operator. The sequence number is set to 0. Intended for use to externally
   722  // construct an sstable before ingestion into a DB.
   723  //
   724  // TODO(peter): untested
   725  func (w *Writer) Merge(key, value []byte) error {
   726  	if w.err != nil {
   727  		return w.err
   728  	}
   729  	if w.isStrictObsolete {
   730  		return errors.Errorf("use AddWithForceObsolete")
   731  	}
   732  	// forceObsolete is false based on the assumption that no RANGEDELs in the
    733  	// sstable delete the added points. If the user configured this writer
   734  	// to be strict-obsolete, addPoint will reject the addition of this MERGE.
   735  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value, false)
   736  }
   737  
   738  // Add adds a key/value pair to the table being written. For a given Writer,
   739  // the keys passed to Add must be in increasing order. The exception to this
   740  // rule is range deletion tombstones. Range deletion tombstones need to be
   741  // added ordered by their start key, but they can be added out of order from
   742  // point entries. Additionally, range deletion tombstones must be fragmented
   743  // (i.e. by keyspan.Fragmenter).
   744  func (w *Writer) Add(key InternalKey, value []byte) error {
   745  	if w.isStrictObsolete {
   746  		return errors.Errorf("use AddWithForceObsolete")
   747  	}
   748  	return w.AddWithForceObsolete(key, value, false)
   749  }
   750  
   751  // AddWithForceObsolete must be used when writing a strict-obsolete sstable.
   752  //
   753  // forceObsolete indicates whether the caller has determined that this key is
   754  // obsolete even though it may be the latest point key for this userkey. This
   755  // should be set to true for keys obsoleted by RANGEDELs, and is required for
   756  // strict-obsolete sstables.
   757  //
   758  // Note that there are two properties, S1 and S2 (see comment in format.go)
   759  // that strict-obsolete ssts must satisfy. S2, due to RANGEDELs, is solely the
   760  // responsibility of the caller. S1 is solely the responsibility of the
   761  // callee.
   762  func (w *Writer) AddWithForceObsolete(key InternalKey, value []byte, forceObsolete bool) error {
   763  	if w.err != nil {
   764  		return w.err
   765  	}
   766  
   767  	switch key.Kind() {
   768  	case InternalKeyKindRangeDelete:
   769  		return w.addTombstone(key, value)
   770  	case base.InternalKeyKindRangeKeyDelete,
   771  		base.InternalKeyKindRangeKeySet,
   772  		base.InternalKeyKindRangeKeyUnset:
   773  		w.err = errors.Errorf(
   774  			"pebble: range keys must be added via one of the RangeKey* functions")
   775  		return w.err
   776  	}
   777  	return w.addPoint(key, value, forceObsolete)
   778  }
   779  
   780  func (w *Writer) makeAddPointDecisionV2(key InternalKey) error {
   781  	prevTrailer := w.lastPointKeyInfo.trailer
   782  	w.lastPointKeyInfo.trailer = key.Trailer
   783  	if w.dataBlockBuf.dataBlock.nEntries == 0 {
   784  		return nil
   785  	}
   786  	if !w.disableKeyOrderChecks {
   787  		prevPointUserKey := w.dataBlockBuf.dataBlock.getCurUserKey()
   788  		cmpUser := w.compare(prevPointUserKey, key.UserKey)
   789  		if cmpUser > 0 || (cmpUser == 0 && prevTrailer <= key.Trailer) {
   790  			return errors.Errorf(
   791  				"pebble: keys must be added in strictly increasing order: %s, %s",
   792  				InternalKey{UserKey: prevPointUserKey, Trailer: prevTrailer}.Pretty(w.formatKey),
   793  				key.Pretty(w.formatKey))
   794  		}
   795  	}
   796  	return nil
   797  }
   798  
   799  // REQUIRES: at least one point has been written to the Writer.
   800  func (w *Writer) getLastPointUserKey() []byte {
   801  	if w.dataBlockBuf.dataBlock.nEntries == 0 {
   802  		panic(errors.AssertionFailedf("no point keys added to writer"))
   803  	}
   804  	return w.dataBlockBuf.dataBlock.getCurUserKey()
   805  }
   806  
   807  func (w *Writer) makeAddPointDecisionV3(
   808  	key InternalKey, valueLen int,
   809  ) (setHasSamePrefix bool, writeToValueBlock bool, isObsolete bool, err error) {
   810  	prevPointKeyInfo := w.lastPointKeyInfo
   811  	w.lastPointKeyInfo.userKeyLen = len(key.UserKey)
   812  	w.lastPointKeyInfo.prefixLen = w.lastPointKeyInfo.userKeyLen
   813  	if w.split != nil {
   814  		w.lastPointKeyInfo.prefixLen = w.split(key.UserKey)
   815  	}
   816  	w.lastPointKeyInfo.trailer = key.Trailer
   817  	w.lastPointKeyInfo.isObsolete = false
   818  	if !w.meta.HasPointKeys {
   819  		return false, false, false, nil
   820  	}
   821  	keyKind := base.TrailerKind(key.Trailer)
   822  	prevPointUserKey := w.getLastPointUserKey()
   823  	prevPointKey := InternalKey{UserKey: prevPointUserKey, Trailer: prevPointKeyInfo.trailer}
   824  	prevKeyKind := base.TrailerKind(prevPointKeyInfo.trailer)
   825  	considerWriteToValueBlock := prevKeyKind == InternalKeyKindSet &&
   826  		keyKind == InternalKeyKindSet
   827  	if considerWriteToValueBlock && !w.requiredInPlaceValueBound.IsEmpty() {
   828  		keyPrefix := key.UserKey[:w.lastPointKeyInfo.prefixLen]
   829  		cmpUpper := w.compare(
   830  			w.requiredInPlaceValueBound.Upper, keyPrefix)
   831  		if cmpUpper <= 0 {
   832  			// Common case for CockroachDB. Make it empty since all future keys in
   833  			// this sstable will also have cmpUpper <= 0.
   834  			w.requiredInPlaceValueBound = UserKeyPrefixBound{}
   835  		} else if w.compare(keyPrefix, w.requiredInPlaceValueBound.Lower) >= 0 {
   836  			considerWriteToValueBlock = false
   837  		}
   838  	}
   839  	// cmpPrefix is initialized iff considerWriteToValueBlock.
   840  	var cmpPrefix int
   841  	var cmpUser int
   842  	if considerWriteToValueBlock {
   843  		// Compare the prefixes.
   844  		cmpPrefix = w.compare(prevPointUserKey[:prevPointKeyInfo.prefixLen],
   845  			key.UserKey[:w.lastPointKeyInfo.prefixLen])
   846  		cmpUser = cmpPrefix
   847  		if cmpPrefix == 0 {
   848  			// Need to compare suffixes to compute cmpUser.
   849  			cmpUser = w.compare(prevPointUserKey[prevPointKeyInfo.prefixLen:],
   850  				key.UserKey[w.lastPointKeyInfo.prefixLen:])
   851  		}
   852  	} else {
   853  		cmpUser = w.compare(prevPointUserKey, key.UserKey)
   854  	}
   855  	// Ensure that no one adds a point key kind without considering the obsolete
   856  	// handling for that kind.
   857  	switch keyKind {
   858  	case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge,
   859  		InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
   860  	default:
   861  		panic(errors.AssertionFailedf("unexpected key kind %s", keyKind.String()))
   862  	}
   863  	// If same user key, then the current key is obsolete if any of the
   864  	// following is true:
   865  	// C1 The prev key was obsolete.
   866  	// C2 The prev key was not a MERGE. When the previous key is a MERGE we must
   867  	//    preserve SET* and MERGE since their values will be merged into the
   868  	//    previous key. We also must preserve DEL* since there may be an older
   869  	//    SET*/MERGE in a lower level that must not be merged with the MERGE --
   870  	//    if we omit the DEL* that lower SET*/MERGE will become visible.
   871  	//
    872  	// Regardless of whether it is the same user key or not:
    873  	// C3 If the current key is some kind of point delete, and we are writing to
    874  	//    the lowest level, then it is also obsolete. The correctness of this
   875  	//    relies on the same user key not spanning multiple sstables in a level.
   876  	//
   877  	// C1 ensures that for a user key there is at most one transition from
   878  	// !obsolete to obsolete. Consider a user key k, for which the first n keys
    879  	// are not obsolete. We consider the various values of n:
   880  	//
   881  	// n = 0: This happens due to forceObsolete being set by the caller, or due
    882  	// to C3. forceObsolete must only be set due to a RANGEDEL, and that RANGEDEL
   883  	// must also delete all the lower seqnums for the same user key. C3 triggers
   884  	// due to a point delete and that deletes all the lower seqnums for the same
   885  	// user key.
   886  	//
   887  	// n = 1: This is the common case. It happens when the first key is not a
   888  	// MERGE, or the current key is some kind of point delete.
   889  	//
   890  	// n > 1: This is due to a sequence of MERGE keys, potentially followed by a
   891  	// single non-MERGE key.
   892  	isObsoleteC1AndC2 := cmpUser == 0 &&
   893  		(prevPointKeyInfo.isObsolete || prevKeyKind != InternalKeyKindMerge)
   894  	isObsoleteC3 := w.writingToLowestLevel &&
   895  		(keyKind == InternalKeyKindDelete || keyKind == InternalKeyKindSingleDelete ||
   896  			keyKind == InternalKeyKindDeleteSized)
   897  	isObsolete = isObsoleteC1AndC2 || isObsoleteC3
   898  	// TODO(sumeer): storing isObsolete SET and SETWITHDEL in value blocks is
   899  	// possible, but requires some care in documenting and checking invariants.
    900  	// There is code that assumes nothing is in value blocks because of single MVCC
   901  	// version (those should be ok). We have to ensure setHasSamePrefix is
   902  	// correctly initialized here etc.
   903  
   904  	if !w.disableKeyOrderChecks &&
   905  		(cmpUser > 0 || (cmpUser == 0 && prevPointKeyInfo.trailer <= key.Trailer)) {
   906  		return false, false, false, errors.Errorf(
   907  			"pebble: keys must be added in strictly increasing order: %s, %s",
   908  			prevPointKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
   909  	}
   910  	if !considerWriteToValueBlock {
   911  		return false, false, isObsolete, nil
   912  	}
   913  	// NB: it is possible that cmpUser == 0, i.e., these two SETs have identical
   914  	// user keys (because of an open snapshot). This should be the rare case.
   915  	setHasSamePrefix = cmpPrefix == 0
   916  	considerWriteToValueBlock = setHasSamePrefix
   917  	// Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of
   918  	// valueHandle, this should be > 3. But tiny values are common in test and
   919  	// unlikely in production, so we use 0 here for better test coverage.
   920  	const tinyValueThreshold = 0
   921  	if considerWriteToValueBlock && valueLen <= tinyValueThreshold {
   922  		considerWriteToValueBlock = false
   923  	}
   924  	return setHasSamePrefix, considerWriteToValueBlock, isObsolete, nil
   925  }
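// Illustrative example (not part of the original source) of the C1/C2/C3
// rules above, with keys for the same user key k added newest-to-oldest:
//
//	k.MERGE#5 -> not obsolete (first key seen for k)
//	k.SET#4   -> not obsolete (previous key is a MERGE, so C2 does not apply)
//	k.SET#3   -> obsolete via C2 (previous key SET#4 is not a MERGE)
//	k.DEL#2   -> obsolete via C1 (previous key is already obsolete)
//
// If writingToLowestLevel were set, the DEL would be obsolete via C3 even as
// the first key for k.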
   926  
   927  func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) error {
   928  	if w.isStrictObsolete && key.Kind() == InternalKeyKindMerge {
   929  		return errors.Errorf("MERGE not supported in a strict-obsolete sstable")
   930  	}
   931  	var err error
   932  	var setHasSameKeyPrefix, writeToValueBlock, addPrefixToValueStoredWithKey bool
   933  	var isObsolete bool
   934  	maxSharedKeyLen := len(key.UserKey)
   935  	if w.valueBlockWriter != nil {
   936  		// maxSharedKeyLen is limited to the prefix of the preceding key. If the
   937  		// preceding key was in a different block, then the blockWriter will
   938  		// ignore this maxSharedKeyLen.
   939  		maxSharedKeyLen = w.lastPointKeyInfo.prefixLen
   940  		setHasSameKeyPrefix, writeToValueBlock, isObsolete, err =
   941  			w.makeAddPointDecisionV3(key, len(value))
   942  		addPrefixToValueStoredWithKey = base.TrailerKind(key.Trailer) == InternalKeyKindSet
   943  	} else {
   944  		err = w.makeAddPointDecisionV2(key)
   945  	}
   946  	if err != nil {
   947  		return err
   948  	}
   949  	isObsolete = w.tableFormat >= TableFormatPebblev4 && (isObsolete || forceObsolete)
   950  	w.lastPointKeyInfo.isObsolete = isObsolete
   951  	var valueStoredWithKey []byte
   952  	var prefix valuePrefix
   953  	var valueStoredWithKeyLen int
   954  	if writeToValueBlock {
   955  		vh, err := w.valueBlockWriter.addValue(value)
   956  		if err != nil {
   957  			return err
   958  		}
   959  		n := encodeValueHandle(w.blockBuf.tmp[:], vh)
   960  		valueStoredWithKey = w.blockBuf.tmp[:n]
   961  		valueStoredWithKeyLen = len(valueStoredWithKey) + 1
   962  		var attribute base.ShortAttribute
   963  		if w.shortAttributeExtractor != nil {
   964  			// TODO(sumeer): for compactions, it is possible that the input sstable
   965  			// already has this value in the value section and so we have already
   966  			// extracted the ShortAttribute. Avoid extracting it again. This will
   967  			// require changing the Writer.Add interface.
   968  			if attribute, err = w.shortAttributeExtractor(
   969  				key.UserKey, w.lastPointKeyInfo.prefixLen, value); err != nil {
   970  				return err
   971  			}
   972  		}
   973  		prefix = makePrefixForValueHandle(setHasSameKeyPrefix, attribute)
   974  	} else {
   975  		valueStoredWithKey = value
   976  		valueStoredWithKeyLen = len(value)
   977  		if addPrefixToValueStoredWithKey {
   978  			valueStoredWithKeyLen++
   979  		}
   980  		prefix = makePrefixForInPlaceValue(setHasSameKeyPrefix)
   981  	}
   982  
   983  	if err := w.maybeFlush(key, valueStoredWithKeyLen); err != nil {
   984  		return err
   985  	}
   986  
   987  	for i := range w.propCollectors {
   988  		if err := w.propCollectors[i].Add(key, value); err != nil {
   989  			w.err = err
   990  			return err
   991  		}
   992  	}
   993  	for i := range w.blockPropCollectors {
   994  		v := value
   995  		if addPrefixToValueStoredWithKey {
   996  			// Values for SET are not required to be in-place, and in the future may
   997  			// not even be read by the compaction, so pass nil values. Block
    998  			// property collectors in such Pebble DBs must not look at the value.
   999  			v = nil
  1000  		}
  1001  		if err := w.blockPropCollectors[i].Add(key, v); err != nil {
  1002  			w.err = err
  1003  			return err
  1004  		}
  1005  	}
  1006  	if w.tableFormat >= TableFormatPebblev4 {
  1007  		w.obsoleteCollector.AddPoint(isObsolete)
  1008  	}
  1009  
  1010  	w.maybeAddToFilter(key.UserKey)
  1011  	w.dataBlockBuf.dataBlock.addWithOptionalValuePrefix(
  1012  		key, isObsolete, valueStoredWithKey, maxSharedKeyLen, addPrefixToValueStoredWithKey, prefix,
  1013  		setHasSameKeyPrefix)
  1014  
  1015  	w.meta.updateSeqNum(key.SeqNum())
  1016  
  1017  	if !w.meta.HasPointKeys {
  1018  		k := w.dataBlockBuf.dataBlock.getCurKey()
  1019  		// NB: We need to ensure that SmallestPoint.UserKey is set, so we create
  1020  		// an InternalKey which is semantically identical to the key, but won't
  1021  		// have a nil UserKey. We do this, because key.UserKey could be nil, and
  1022  		// we don't want SmallestPoint.UserKey to be nil.
  1023  		//
  1024  		// todo(bananabrick): Determine if it's okay to have a nil SmallestPoint
  1025  		// .UserKey now that we don't rely on a nil UserKey to determine if the
  1026  		// key has been set or not.
  1027  		w.meta.SetSmallestPointKey(k.Clone())
  1028  	}
  1029  
  1030  	w.props.NumEntries++
  1031  	switch key.Kind() {
  1032  	case InternalKeyKindDelete, InternalKeyKindSingleDelete:
  1033  		w.props.NumDeletions++
  1034  		w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey))
  1035  	case InternalKeyKindDeleteSized:
  1036  		var size uint64
  1037  		if len(value) > 0 {
  1038  			var n int
  1039  			size, n = binary.Uvarint(value)
  1040  			if n <= 0 {
  1041  				w.err = errors.Newf("%s key's value (%x) does not parse as uvarint",
  1042  					errors.Safe(key.Kind().String()), value)
  1043  				return w.err
  1044  			}
  1045  		}
  1046  		w.props.NumDeletions++
  1047  		w.props.NumSizedDeletions++
  1048  		w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey))
  1049  		w.props.RawPointTombstoneValueSize += size
  1050  	case InternalKeyKindMerge:
  1051  		w.props.NumMergeOperands++
  1052  	}
  1053  	w.props.RawKeySize += uint64(key.Size())
  1054  	w.props.RawValueSize += uint64(len(value))
  1055  	return nil
  1056  }
  1057  
  1058  func (w *Writer) prettyTombstone(k InternalKey, value []byte) fmt.Formatter {
  1059  	return keyspan.Span{
  1060  		Start: k.UserKey,
  1061  		End:   value,
  1062  		Keys:  []keyspan.Key{{Trailer: k.Trailer}},
  1063  	}.Pretty(w.formatKey)
  1064  }
  1065  
  1066  func (w *Writer) addTombstone(key InternalKey, value []byte) error {
  1067  	if !w.disableKeyOrderChecks && !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 {
  1068  		// Check that tombstones are being added in fragmented order. If the two
  1069  		// tombstones overlap, their start and end keys must be identical.
  1070  		prevKey := w.rangeDelBlock.getCurKey()
  1071  		switch c := w.compare(prevKey.UserKey, key.UserKey); {
  1072  		case c > 0:
  1073  			w.err = errors.Errorf("pebble: keys must be added in order: %s, %s",
  1074  				prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
  1075  			return w.err
  1076  		case c == 0:
  1077  			prevValue := w.rangeDelBlock.curValue
  1078  			if w.compare(prevValue, value) != 0 {
  1079  				w.err = errors.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
  1080  					w.prettyTombstone(prevKey, prevValue),
  1081  					w.prettyTombstone(key, value))
  1082  				return w.err
  1083  			}
  1084  			if prevKey.SeqNum() <= key.SeqNum() {
  1085  				w.err = errors.Errorf("pebble: keys must be added in strictly increasing order: %s, %s",
  1086  					prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
  1087  				return w.err
  1088  			}
  1089  		default:
  1090  			prevValue := w.rangeDelBlock.curValue
  1091  			if w.compare(prevValue, key.UserKey) > 0 {
  1092  				w.err = errors.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
  1093  					w.prettyTombstone(prevKey, prevValue),
  1094  					w.prettyTombstone(key, value))
  1095  				return w.err
  1096  			}
  1097  		}
  1098  	}
  1099  
  1100  	if key.Trailer == InternalKeyRangeDeleteSentinel {
  1101  		w.err = errors.Errorf("pebble: cannot add range delete sentinel: %s", key.Pretty(w.formatKey))
  1102  		return w.err
  1103  	}
  1104  
  1105  	for i := range w.propCollectors {
  1106  		if err := w.propCollectors[i].Add(key, value); err != nil {
  1107  			w.err = err
  1108  			return err
  1109  		}
  1110  	}
  1111  
  1112  	w.meta.updateSeqNum(key.SeqNum())
  1113  
  1114  	switch {
  1115  	case w.rangeDelV1Format:
  1116  		// Range tombstones are not fragmented in the v1 (i.e. RocksDB) range
  1117  		// deletion block format, so we need to track the largest range tombstone
  1118  		// end key as every range tombstone is added.
  1119  		//
  1120  		// Note that writing the v1 format is only supported for tests.
  1121  		if w.props.NumRangeDeletions == 0 {
  1122  			w.meta.SetSmallestRangeDelKey(key.Clone())
  1123  			w.meta.SetLargestRangeDelKey(base.MakeRangeDeleteSentinelKey(value).Clone())
  1124  		} else {
  1125  			if base.InternalCompare(w.compare, w.meta.SmallestRangeDel, key) > 0 {
  1126  				w.meta.SetSmallestRangeDelKey(key.Clone())
  1127  			}
  1128  			end := base.MakeRangeDeleteSentinelKey(value)
  1129  			if base.InternalCompare(w.compare, w.meta.LargestRangeDel, end) < 0 {
  1130  				w.meta.SetLargestRangeDelKey(end.Clone())
  1131  			}
  1132  		}
  1133  
  1134  	default:
  1135  		// Range tombstones are fragmented in the v2 range deletion block format,
  1136  		// so the start key of the first range tombstone added will be the smallest
  1137  		// range tombstone key. The largest range tombstone key will be determined
  1138  		// in Writer.Close() as the end key of the last range tombstone added.
  1139  		if w.props.NumRangeDeletions == 0 {
  1140  			w.meta.SetSmallestRangeDelKey(key.Clone())
  1141  		}
  1142  	}
  1143  
  1144  	w.props.NumEntries++
  1145  	w.props.NumDeletions++
  1146  	w.props.NumRangeDeletions++
  1147  	w.props.RawKeySize += uint64(key.Size())
  1148  	w.props.RawValueSize += uint64(len(value))
  1149  	w.rangeDelBlock.add(key, value)
  1150  	return nil
  1151  }
  1152  
  1153  // RangeKeySet sets a range between start (inclusive) and end (exclusive) with
  1154  // the given suffix to the given value. The resulting range key is given the
  1155  // sequence number zero, with the expectation that the resulting sstable will be
  1156  // ingested.
  1157  //
  1158  // Keys must be added to the table in increasing order of start key. Spans are
  1159  // not required to be fragmented. The same suffix may not be set or unset twice
  1160  // over the same keyspan, because it would result in inconsistent state. Both
  1161  // the Set and Unset would share the zero sequence number, and a key cannot be
  1162  // both simultaneously set and unset.
  1163  func (w *Writer) RangeKeySet(start, end, suffix, value []byte) error {
  1164  	return w.addRangeKeySpan(keyspan.Span{
  1165  		Start: w.tempRangeKeyCopy(start),
  1166  		End:   w.tempRangeKeyCopy(end),
  1167  		Keys: []keyspan.Key{
  1168  			{
  1169  				Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeySet),
  1170  				Suffix:  w.tempRangeKeyCopy(suffix),
  1171  				Value:   w.tempRangeKeyCopy(value),
  1172  			},
  1173  		},
  1174  	})
  1175  }
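// Illustrative sketch (not part of the original source): adding range keys for
// ingestion with the methods in this file, where w is an existing *Writer.
// Spans must be added in order of start key but need not be fragmented;
// overlapping spans are fragmented and coalesced by the Writer's
// keyspan.Fragmenter before being encoded.
//
//	_ = w.RangeKeySet([]byte("a"), []byte("c"), []byte("@5"), []byte("v"))
//	_ = w.RangeKeyUnset([]byte("b"), []byte("d"), []byte("@3"))
//	_ = w.RangeKeyDelete([]byte("d"), []byte("e"))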
  1176  
  1177  // RangeKeyUnset un-sets a range between start (inclusive) and end (exclusive)
  1178  // with the given suffix. The resulting range key is given the
  1179  // sequence number zero, with the expectation that the resulting sstable will be
  1180  // ingested.
  1181  //
  1182  // Keys must be added to the table in increasing order of start key. Spans are
  1183  // not required to be fragmented. The same suffix may not be set or unset twice
  1184  // over the same keyspan, because it would result in inconsistent state. Both
  1185  // the Set and Unset would share the zero sequence number, and a key cannot be
  1186  // both simultaneously set and unset.
  1187  func (w *Writer) RangeKeyUnset(start, end, suffix []byte) error {
  1188  	return w.addRangeKeySpan(keyspan.Span{
  1189  		Start: w.tempRangeKeyCopy(start),
  1190  		End:   w.tempRangeKeyCopy(end),
  1191  		Keys: []keyspan.Key{
  1192  			{
  1193  				Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyUnset),
  1194  				Suffix:  w.tempRangeKeyCopy(suffix),
  1195  			},
  1196  		},
  1197  	})
  1198  }
  1199  
  1200  // RangeKeyDelete deletes a range between start (inclusive) and end (exclusive).
  1201  //
  1202  // Keys must be added to the table in increasing order of start key. Spans are
  1203  // not required to be fragmented.
  1204  func (w *Writer) RangeKeyDelete(start, end []byte) error {
  1205  	return w.addRangeKeySpan(keyspan.Span{
  1206  		Start: w.tempRangeKeyCopy(start),
  1207  		End:   w.tempRangeKeyCopy(end),
  1208  		Keys: []keyspan.Key{
  1209  			{Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyDelete)},
  1210  		},
  1211  	})
  1212  }
  1213  
  1214  // AddRangeKey adds a range key set, unset, or delete key/value pair to the
  1215  // table being written.
  1216  //
  1217  // Range keys must be supplied in strictly ascending order of start key (i.e.
  1218  // user key ascending, sequence number descending, and key type descending).
  1219  // Ranges added must also be supplied in fragmented span order - i.e. other than
  1220  // spans that are perfectly aligned (same start and end keys), spans may not
  1221  // overlap. Range keys may be added out of order relative to point keys and
  1222  // range deletions.
  1223  func (w *Writer) AddRangeKey(key InternalKey, value []byte) error {
  1224  	if w.err != nil {
  1225  		return w.err
  1226  	}
  1227  	return w.addRangeKey(key, value)
  1228  }
  1229  
  1230  func (w *Writer) addRangeKeySpan(span keyspan.Span) error {
  1231  	if w.compare(span.Start, span.End) >= 0 {
  1232  		return errors.Errorf(
  1233  			"pebble: start key must be strictly less than end key",
  1234  		)
  1235  	}
  1236  	if w.fragmenter.Start() != nil && w.compare(w.fragmenter.Start(), span.Start) > 0 {
  1237  		return errors.Errorf("pebble: spans must be added in order: %s > %s",
  1238  			w.formatKey(w.fragmenter.Start()), w.formatKey(span.Start))
  1239  	}
  1240  	// Add this span to the fragmenter.
  1241  	w.fragmenter.Add(span)
  1242  	return w.err
  1243  }
  1244  
  1245  func (w *Writer) encodeRangeKeySpan(span keyspan.Span) {
  1246  	// This method is the emit function of the Fragmenter.
  1247  	//
  1248  	// NB: The span should only contain range keys and be internally consistent
  1249  	// (eg, no duplicate suffixes, no additional keys after a RANGEKEYDEL).
  1250  	//
  1251  	// We use w.rangeKeysBySuffix and w.rangeKeySpan to avoid allocations.
  1252  
  1253  	// Sort the keys by suffix. Iteration doesn't *currently* depend on it, but
  1254  	// we may want to in the future.
  1255  	w.rangeKeysBySuffix.Cmp = w.compare
  1256  	w.rangeKeysBySuffix.Keys = span.Keys
  1257  	sort.Sort(&w.rangeKeysBySuffix)
  1258  
  1259  	w.rangeKeySpan = span
  1260  	w.rangeKeySpan.Keys = w.rangeKeysBySuffix.Keys
  1261  	w.err = firstError(w.err, w.rangeKeyEncoder.Encode(&w.rangeKeySpan))
  1262  }
  1263  
  1264  func (w *Writer) addRangeKey(key InternalKey, value []byte) error {
  1265  	if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 {
  1266  		prevStartKey := w.rangeKeyBlock.getCurKey()
  1267  		prevEndKey, _, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue)
  1268  		if !ok {
  1269  			// We panic here as we should have previously decoded and validated this
  1270  			// key and value when it was first added to the range key block.
  1271  			panic(errors.Errorf("pebble: invalid end key for span: %s",
  1272  				prevStartKey.Pretty(w.formatKey)))
  1273  		}
  1274  
  1275  		curStartKey := key
  1276  		curEndKey, _, ok := rangekey.DecodeEndKey(curStartKey.Kind(), value)
  1277  		if !ok {
  1278  			w.err = errors.Errorf("pebble: invalid end key for span: %s",
  1279  				curStartKey.Pretty(w.formatKey))
  1280  			return w.err
  1281  		}
  1282  
  1283  		// Start keys must be strictly increasing.
  1284  		if base.InternalCompare(w.compare, prevStartKey, curStartKey) >= 0 {
  1285  			w.err = errors.Errorf(
  1286  				"pebble: range keys starts must be added in increasing order: %s, %s",
  1287  				prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
  1288  			return w.err
  1289  		}
  1290  
  1291  		// Start keys are increasing. If the start user keys are equal, the
  1292  		// end keys must be equal (i.e. aligned spans).
  1293  		if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 {
  1294  			if w.compare(prevEndKey, curEndKey) != 0 {
  1295  				w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s",
  1296  					prevStartKey.Pretty(w.formatKey),
  1297  					curStartKey.Pretty(w.formatKey))
  1298  				return w.err
  1299  			}
  1300  		} else if w.compare(prevEndKey, curStartKey.UserKey) > 0 {
  1301  			// If the start user keys are NOT equal, the spans must be disjoint (i.e.
  1302  			// no overlap).
  1303  			// NOTE: the inequality excludes zero, as we allow the end key of the
  1304  			// lower span to be the same as the start key of the upper span, because
  1305  			// the range end key is considered an exclusive bound.
  1306  			w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s",
  1307  				prevStartKey.Pretty(w.formatKey),
  1308  				curStartKey.Pretty(w.formatKey))
  1309  			return w.err
  1310  		}
  1311  	}
  1312  
  1313  	// TODO(travers): Add an invariant-gated check to ensure that suffix-values
  1314  	// are sorted within coalesced spans.
  1315  
  1316  	// Range-keys and point-keys are intended to live in "parallel" keyspaces.
  1317  	// However, we track a single seqnum in the table metadata that spans both of
  1318  	// these keyspaces.
  1319  	// TODO(travers): Consider tracking range key seqnums separately.
  1320  	w.meta.updateSeqNum(key.SeqNum())
  1321  
  1322  	// Range keys are fragmented, so the start key of the first range key
  1323  	// added will be the smallest. The largest range key is determined in
  1324  	// Writer.Close() as the end key of the last range key added to the block.
  1325  	if w.props.NumRangeKeys() == 0 {
  1326  		w.meta.SetSmallestRangeKey(key.Clone())
  1327  	}
  1328  
  1329  	// Update block properties.
  1330  	w.props.RawRangeKeyKeySize += uint64(key.Size())
  1331  	w.props.RawRangeKeyValueSize += uint64(len(value))
  1332  	switch key.Kind() {
  1333  	case base.InternalKeyKindRangeKeyDelete:
  1334  		w.props.NumRangeKeyDels++
  1335  	case base.InternalKeyKindRangeKeySet:
  1336  		w.props.NumRangeKeySets++
  1337  	case base.InternalKeyKindRangeKeyUnset:
  1338  		w.props.NumRangeKeyUnsets++
  1339  	default:
  1340  		panic(errors.Errorf("pebble: invalid range key type: %s", key.Kind()))
  1341  	}
  1342  
  1343  	for i := range w.blockPropCollectors {
  1344  		if err := w.blockPropCollectors[i].Add(key, value); err != nil {
  1345  			return err
  1346  		}
  1347  	}
  1348  
  1349  	// Add the key to the block.
  1350  	w.rangeKeyBlock.add(key, value)
  1351  	return nil
  1352  }
  1353  
  1354  // tempRangeKeyBuf returns a slice of length n from the Writer's rkBuf byte
  1355  // slice. Any byte written to the returned slice is retained for the lifetime of
  1356  // the Writer.
  1357  func (w *Writer) tempRangeKeyBuf(n int) []byte {
  1358  	if cap(w.rkBuf)-len(w.rkBuf) < n {
  1359  		size := len(w.rkBuf) + 2*n
  1360  		if size < 2*cap(w.rkBuf) {
  1361  			size = 2 * cap(w.rkBuf)
  1362  		}
  1363  		buf := make([]byte, len(w.rkBuf), size)
  1364  		copy(buf, w.rkBuf)
  1365  		w.rkBuf = buf
  1366  	}
  1367  	b := w.rkBuf[len(w.rkBuf) : len(w.rkBuf)+n]
  1368  	w.rkBuf = w.rkBuf[:len(w.rkBuf)+n]
  1369  	return b
  1370  }
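
        // As a worked example of the growth policy above (numbers illustrative): with
        // len(w.rkBuf)=60 and cap(w.rkBuf)=64, a request for n=16 doesn't fit, so a
        // new buffer is sized at max(60+2*16, 2*64) = 128, the 60 live bytes are
        // copied over, and the returned slice is w.rkBuf[60:76].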
  1371  
  1372  // tempRangeKeyCopy returns a copy of the provided slice, stored in the Writer's
  1373  // range key buffer.
  1374  func (w *Writer) tempRangeKeyCopy(k []byte) []byte {
  1375  	if len(k) == 0 {
  1376  		return nil
  1377  	}
  1378  	buf := w.tempRangeKeyBuf(len(k))
  1379  	copy(buf, k)
  1380  	return buf
  1381  }
  1382  
  1383  func (w *Writer) maybeAddToFilter(key []byte) {
  1384  	if w.filter != nil {
  1385  		if w.split != nil {
  1386  			prefix := key[:w.split(key)]
  1387  			w.filter.addKey(prefix)
  1388  		} else {
  1389  			w.filter.addKey(key)
  1390  		}
  1391  	}
  1392  }
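
        // For example, with a hypothetical Comparer whose Split returns the length of
        // the portion before an '@' separator, a point key "apple@7" contributes only
        // its prefix to the filter:
        //
        //	w.split([]byte("apple@7")) == 5      // assumed Split behavior
        //	w.filter.addKey([]byte("apple"))     // key[:5]
        //
        // so a prefix bloom filter built this way can serve seeks over the "apple"
        // prefix regardless of suffix. With no Split configured, the whole key is
        // added.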
  1393  
  1394  func (w *Writer) flush(key InternalKey) error {
  1395  	// We're finishing a data block.
  1396  	err := w.finishDataBlockProps(w.dataBlockBuf)
  1397  	if err != nil {
  1398  		return err
  1399  	}
  1400  	w.dataBlockBuf.finish()
  1401  	w.dataBlockBuf.compressAndChecksum(w.compression)
  1402  	// Since dataBlockEstimates.addInflightDataBlock was never called, the
  1403  	// inflightSize is set to 0.
  1404  	w.coordination.sizeEstimate.dataBlockCompressed(len(w.dataBlockBuf.compressed), 0)
  1405  
  1406  	// Determine if the index block should be flushed. Since we're accessing the
  1407  	// dataBlockBuf.dataBlock.curKey here, we have to make sure that once we start
  1408  	// to pool the dataBlockBufs, the curKey isn't used by the Writer once the
  1409  	// dataBlockBuf is added back to a sync.Pool. In this particular case, the
  1410  	// byte slice which supports "sep" will eventually be copied when "sep" is
  1411  	// added to the index block.
  1412  	prevKey := w.dataBlockBuf.dataBlock.getCurKey()
  1413  	sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf)
  1414  	// We determine that we should flush an index block from the Writer client
  1415  	// goroutine, but we actually finish the index block from the writeQueue.
  1416  	// When we determine that an index block should be flushed, we need to call
  1417  	// BlockPropertyCollector.FinishIndexBlock. But block property collector
  1418  	// calls must happen sequentially from the Writer client. Therefore, we need
  1419  	// to determine that we are going to flush the index block from the Writer
  1420  	// client.
  1421  	shouldFlushIndexBlock := supportsTwoLevelIndex(w.tableFormat) && w.indexBlock.shouldFlush(
  1422  		sep, encodedBHPEstimatedSize, w.indexBlockSize, w.indexBlockSizeThreshold,
  1423  	)
  1424  
  1425  	var indexProps []byte
  1426  	var flushableIndexBlock *indexBlockBuf
  1427  	if shouldFlushIndexBlock {
  1428  		flushableIndexBlock = w.indexBlock
  1429  		w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled)
  1430  		// Call BlockPropertyCollector.FinishIndexBlock, since we've decided to
  1431  		// flush the index block.
  1432  		indexProps, err = w.finishIndexBlockProps()
  1433  		if err != nil {
  1434  			return err
  1435  		}
  1436  	}
  1437  
  1438  	// We've called BlockPropertyCollector.FinishDataBlock, and, if necessary,
  1439  	// BlockPropertyCollector.FinishIndexBlock. Since we've decided to finish
  1440  	// the data block, we can call
  1441  	// BlockPropertyCollector.AddPrevDataBlockToIndexBlock.
  1442  	w.addPrevDataBlockToIndexBlockProps()
  1443  
  1444  	// Schedule a write.
  1445  	writeTask := writeTaskPool.Get().(*writeTask)
  1446  	// We're setting compressionDone to indicate that compression of this block
  1447  	// has already been completed.
  1448  	writeTask.compressionDone <- true
  1449  	writeTask.buf = w.dataBlockBuf
  1450  	writeTask.indexEntrySep = sep
  1451  	writeTask.currIndexBlock = w.indexBlock
  1452  	writeTask.indexInflightSize = sep.Size() + encodedBHPEstimatedSize
  1453  	writeTask.finishedIndexProps = indexProps
  1454  	writeTask.flushableIndexBlock = flushableIndexBlock
  1455  
  1456  	// The writeTask corresponds to an unwritten index entry.
  1457  	w.indexBlock.addInflight(writeTask.indexInflightSize)
  1458  
  1459  	w.dataBlockBuf = nil
  1460  	if w.coordination.parallelismEnabled {
  1461  		w.coordination.writeQueue.add(writeTask)
  1462  	} else {
  1463  		err = w.coordination.writeQueue.addSync(writeTask)
  1464  	}
  1465  	w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType)
  1466  
  1467  	return err
  1468  }
  1469  
  1470  func (w *Writer) maybeFlush(key InternalKey, valueLen int) error {
  1471  	if !w.dataBlockBuf.shouldFlush(key, valueLen, w.blockSize, w.blockSizeThreshold) {
  1472  		return nil
  1473  	}
  1474  
  1475  	err := w.flush(key)
  1476  
  1477  	if err != nil {
  1478  		w.err = err
  1479  		return err
  1480  	}
  1481  
  1482  	return nil
  1483  }
  1484  
  1485  // dataBlockBuf.dataBlockProps set by this method must be encoded before any future use of the
  1486  // dataBlockBuf.blockPropsEncoder, since the properties slice will get reused by the
  1487  // blockPropsEncoder.
  1488  func (w *Writer) finishDataBlockProps(buf *dataBlockBuf) error {
  1489  	if len(w.blockPropCollectors) == 0 {
  1490  		return nil
  1491  	}
  1492  	var err error
  1493  	buf.blockPropsEncoder.resetProps()
  1494  	for i := range w.blockPropCollectors {
  1495  		scratch := buf.blockPropsEncoder.getScratchForProp()
  1496  		if scratch, err = w.blockPropCollectors[i].FinishDataBlock(scratch); err != nil {
  1497  			return err
  1498  		}
  1499  		if len(scratch) > 0 {
  1500  			buf.blockPropsEncoder.addProp(shortID(i), scratch)
  1501  		}
  1502  	}
  1503  
  1504  	buf.dataBlockProps = buf.blockPropsEncoder.unsafeProps()
  1505  	return nil
  1506  }
  1507  
  1508  // The BlockHandleWithProperties returned by this method must be encoded before any future use of
  1509  // the Writer.blockPropsEncoder, since the properties slice will get reused by the blockPropsEncoder.
  1510  // maybeAddBlockPropertiesToBlockHandle should only be called if the block is being written synchronously
  1511  // with the Writer client.
  1512  func (w *Writer) maybeAddBlockPropertiesToBlockHandle(
  1513  	bh BlockHandle,
  1514  ) (BlockHandleWithProperties, error) {
  1515  	err := w.finishDataBlockProps(w.dataBlockBuf)
  1516  	if err != nil {
  1517  		return BlockHandleWithProperties{}, err
  1518  	}
  1519  	return BlockHandleWithProperties{BlockHandle: bh, Props: w.dataBlockBuf.dataBlockProps}, nil
  1520  }
  1521  
  1522  func (w *Writer) indexEntrySep(prevKey, key InternalKey, dataBlockBuf *dataBlockBuf) InternalKey {
  1523  	// Make a rough guess that we want key-sized scratch to compute the separator.
  1524  	if cap(dataBlockBuf.sepScratch) < key.Size() {
  1525  		dataBlockBuf.sepScratch = make([]byte, 0, key.Size()*2)
  1526  	}
  1527  
  1528  	var sep InternalKey
  1529  	if key.UserKey == nil && key.Trailer == 0 {
  1530  		sep = prevKey.Successor(w.compare, w.successor, dataBlockBuf.sepScratch[:0])
  1531  	} else {
  1532  		sep = prevKey.Separator(w.compare, w.separator, dataBlockBuf.sepScratch[:0], key)
  1533  	}
  1534  	return sep
  1535  }
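
        // For the default bytewise comparer the results look roughly like (values
        // illustrative):
        //
        //	Separator("helloworld", "hellozoo") -> "hellox"  // prevKey <= sep < key
        //	Successor("helloworld")             -> "i"       // sep >= prevKey
        //
        // i.e. index separators are shortened as much as possible while still sorting
        // between the last key of the finished block and the first key of the next.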
  1536  
  1537  // addIndexEntry adds an index entry for the specified key and block handle.
  1538  // addIndexEntry can be called from both the Writer client goroutine, and the
  1539  // writeQueue goroutine. If flushIndexBuf != nil, then indexProps must also be
  1540  // provided, as they're used when the index block is finished.
  1541  //
  1542  // Invariant:
  1543  //  1. addIndexEntry must not store references to the sep InternalKey, the tmp
  1544  //     byte slice, bhp.Props. That is, these must be either deep copied or
  1545  //     encoded.
  1546  //  2. addIndexEntry must not hold references to the flushIndexBuf, and the writeTo
  1547  //     indexBlockBufs.
  1548  func (w *Writer) addIndexEntry(
  1549  	sep InternalKey,
  1550  	bhp BlockHandleWithProperties,
  1551  	tmp []byte,
  1552  	flushIndexBuf *indexBlockBuf,
  1553  	writeTo *indexBlockBuf,
  1554  	inflightSize int,
  1555  	indexProps []byte,
  1556  ) error {
  1557  	if bhp.Length == 0 {
  1558  		// A valid blockHandle must be non-zero.
  1559  		// In particular, it must have a non-zero length.
  1560  		return nil
  1561  	}
  1562  
  1563  	encoded := encodeBlockHandleWithProperties(tmp, bhp)
  1564  
  1565  	if flushIndexBuf != nil {
  1566  		if cap(w.indexPartitions) == 0 {
  1567  			w.indexPartitions = make([]indexBlockAndBlockProperties, 0, 32)
  1568  		}
  1569  		// Enable two level indexes if there is more than one index block.
  1570  		w.twoLevelIndex = true
  1571  		if err := w.finishIndexBlock(flushIndexBuf, indexProps); err != nil {
  1572  			return err
  1573  		}
  1574  	}
  1575  
  1576  	writeTo.add(sep, encoded, inflightSize)
  1577  	return nil
  1578  }
  1579  
  1580  func (w *Writer) addPrevDataBlockToIndexBlockProps() {
  1581  	for i := range w.blockPropCollectors {
  1582  		w.blockPropCollectors[i].AddPrevDataBlockToIndexBlock()
  1583  	}
  1584  }
  1585  
  1586  // addIndexEntrySync adds an index entry for the specified key and block handle.
  1587  // It is the synchronous variant of addIndexEntry, used once Writer.Close has
  1588  // been called. addIndexEntrySync should only be called if we're sure that
  1589  // index entries aren't being written asynchronously.
  1590  //
  1591  // Invariant:
  1592  //  1. addIndexEntrySync must not store references to the prevKey, key InternalKey's,
  1593  //     the tmp byte slice. That is, these must be either deep copied or encoded.
  1594  func (w *Writer) addIndexEntrySync(
  1595  	prevKey, key InternalKey, bhp BlockHandleWithProperties, tmp []byte,
  1596  ) error {
  1597  	sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf)
  1598  	shouldFlush := supportsTwoLevelIndex(
  1599  		w.tableFormat) && w.indexBlock.shouldFlush(
  1600  		sep, encodedBHPEstimatedSize, w.indexBlockSize, w.indexBlockSizeThreshold,
  1601  	)
  1602  	var flushableIndexBlock *indexBlockBuf
  1603  	var props []byte
  1604  	var err error
  1605  	if shouldFlush {
  1606  		flushableIndexBlock = w.indexBlock
  1607  		w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled)
  1608  
  1609  		// Call BlockPropertyCollector.FinishIndexBlock, since we've decided to
  1610  		// flush the index block.
  1611  		props, err = w.finishIndexBlockProps()
  1612  		if err != nil {
  1613  			return err
  1614  		}
  1615  	}
  1616  
  1617  	err = w.addIndexEntry(sep, bhp, tmp, flushableIndexBlock, w.indexBlock, 0, props)
  1618  	if flushableIndexBlock != nil {
  1619  		flushableIndexBlock.clear()
  1620  		indexBlockBufPool.Put(flushableIndexBlock)
  1621  	}
  1622  	w.addPrevDataBlockToIndexBlockProps()
  1623  	return err
  1624  }
  1625  
  1626  func shouldFlush(
  1627  	key InternalKey,
  1628  	valueLen int,
  1629  	restartInterval, estimatedBlockSize, numEntries, targetBlockSize, sizeThreshold int,
  1630  ) bool {
  1631  	if numEntries == 0 {
  1632  		return false
  1633  	}
  1634  
  1635  	if estimatedBlockSize >= targetBlockSize {
  1636  		return true
  1637  	}
  1638  
  1639  	// The block is currently smaller than the target size.
  1640  	if estimatedBlockSize <= sizeThreshold {
  1641  		// The block is smaller than the threshold size at which we'll consider
  1642  		// flushing it.
  1643  		return false
  1644  	}
  1645  
  1646  	newSize := estimatedBlockSize + key.Size() + valueLen
  1647  	if numEntries%restartInterval == 0 {
  1648  		newSize += 4
  1649  	}
  1650  	newSize += 4                              // varint for shared prefix length
  1651  	newSize += uvarintLen(uint32(key.Size())) // varint for unshared key bytes
  1652  	newSize += uvarintLen(uint32(valueLen))   // varint for value size
  1653  	// Flush if the block plus the new entry is larger than the target size.
  1654  	return newSize > targetBlockSize
  1655  }
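
        // A worked example of the heuristic above (all numbers illustrative): with
        // targetBlockSize=4096, sizeThreshold=3687 (90%), restartInterval=16, an
        // estimatedBlockSize of 3700, and a pending entry with key.Size()=24 and
        // valueLen=100:
        //
        //	newSize = 3700 + 24 + 100  // current block + key + value bytes
        //	        + 4                // new restart point (numEntries%16 == 0)
        //	        + 4 + 1 + 1        // varints: shared prefix, unshared key len, value len
        //	        = 3834
        //
        // 3834 <= 4096, so shouldFlush returns false and the entry joins the current
        // block; had newSize exceeded 4096, the block would be flushed first.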
  1656  
  1657  func cloneKeyWithBuf(k InternalKey, a bytealloc.A) (bytealloc.A, InternalKey) {
  1658  	if len(k.UserKey) == 0 {
  1659  		return a, k
  1660  	}
  1661  	a, keyCopy := a.Copy(k.UserKey)
  1662  	return a, InternalKey{UserKey: keyCopy, Trailer: k.Trailer}
  1663  }
  1664  
  1665  // Invariants: The byte slice returned by finishIndexBlockProps is
  1666  // heap-allocated and has its own lifetime, independent of the Writer and the
  1667  // blockPropsEncoder, and it is safe to:
  1668  //
  1669  //  1. Reuse w.blockPropsEncoder without first encoding the byte slice
  1670  //     returned.
  1671  //  2. Store the byte slice in the Writer since it is a copy and not backed by
  1672  //     an underlying buffer.
  1673  func (w *Writer) finishIndexBlockProps() ([]byte, error) {
  1674  	w.blockPropsEncoder.resetProps()
  1675  	for i := range w.blockPropCollectors {
  1676  		scratch := w.blockPropsEncoder.getScratchForProp()
  1677  		var err error
  1678  		if scratch, err = w.blockPropCollectors[i].FinishIndexBlock(scratch); err != nil {
  1679  			return nil, err
  1680  		}
  1681  		if len(scratch) > 0 {
  1682  			w.blockPropsEncoder.addProp(shortID(i), scratch)
  1683  		}
  1684  	}
  1685  	return w.blockPropsEncoder.props(), nil
  1686  }
  1687  
  1688  // finishIndexBlock finishes the current index block and adds it to the top
  1689  // level index block. This is only used when two level indexes are enabled.
  1690  //
  1691  // Invariants:
  1692  //  1. The props slice passed into finishIndexBlock must not be owned by any
  1693  //     other struct, since it will be stored in the Writer.indexPartitions
  1694  //     slice.
  1695  //  2. None of the buffers owned by indexBuf will be shallow copied and stored elsewhere.
  1696  //     That is, it must be safe to reuse indexBuf after finishIndexBlock has been called.
  1697  func (w *Writer) finishIndexBlock(indexBuf *indexBlockBuf, props []byte) error {
  1698  	part := indexBlockAndBlockProperties{
  1699  		nEntries: indexBuf.block.nEntries, properties: props,
  1700  	}
  1701  	w.indexSepAlloc, part.sep = cloneKeyWithBuf(
  1702  		indexBuf.block.getCurKey(), w.indexSepAlloc,
  1703  	)
  1704  	bk := indexBuf.finish()
  1705  	if len(w.indexBlockAlloc) < len(bk) {
  1706  		// Allocate enough bytes for approximately 16 index blocks.
  1707  		w.indexBlockAlloc = make([]byte, len(bk)*16)
  1708  	}
  1709  	n := copy(w.indexBlockAlloc, bk)
  1710  	part.block = w.indexBlockAlloc[:n:n]
  1711  	w.indexBlockAlloc = w.indexBlockAlloc[n:]
  1712  	w.indexPartitions = append(w.indexPartitions, part)
  1713  	return nil
  1714  }
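
        // The copy above amortizes allocations: for example, if the finished index
        // block bk is 300 bytes and w.indexBlockAlloc has fewer than 300 bytes left,
        // a fresh 300*16 = 4800 byte buffer is allocated; this partition and roughly
        // the next fifteen similarly-sized ones are then carved out of it without
        // further allocations.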
  1715  
  1716  func (w *Writer) writeTwoLevelIndex() (BlockHandle, error) {
  1717  	props, err := w.finishIndexBlockProps()
  1718  	if err != nil {
  1719  		return BlockHandle{}, err
  1720  	}
  1721  	// Add the final unfinished index.
  1722  	if err = w.finishIndexBlock(w.indexBlock, props); err != nil {
  1723  		return BlockHandle{}, err
  1724  	}
  1725  
  1726  	for i := range w.indexPartitions {
  1727  		b := &w.indexPartitions[i]
  1728  		w.props.NumDataBlocks += uint64(b.nEntries)
  1729  
  1730  		data := b.block
  1731  		w.props.IndexSize += uint64(len(data))
  1732  		bh, err := w.writeBlock(data, w.compression, &w.blockBuf)
  1733  		if err != nil {
  1734  			return BlockHandle{}, err
  1735  		}
  1736  		bhp := BlockHandleWithProperties{
  1737  			BlockHandle: bh,
  1738  			Props:       b.properties,
  1739  		}
  1740  		encoded := encodeBlockHandleWithProperties(w.blockBuf.tmp[:], bhp)
  1741  		w.topLevelIndexBlock.add(b.sep, encoded)
  1742  	}
  1743  
  1744  	// NB: RocksDB includes the block trailer length in the index size
  1745  	// property, though it doesn't include the trailer in the top level
  1746  	// index size property.
  1747  	w.props.IndexPartitions = uint64(len(w.indexPartitions))
  1748  	w.props.TopLevelIndexSize = uint64(w.topLevelIndexBlock.estimatedSize())
  1749  	w.props.IndexSize += w.props.TopLevelIndexSize + blockTrailerLen
  1750  
  1751  	return w.writeBlock(w.topLevelIndexBlock.finish(), w.compression, &w.blockBuf)
  1752  }
  1753  
  1754  func compressAndChecksum(b []byte, compression Compression, blockBuf *blockBuf) []byte {
  1755  	// Compress the buffer, discarding the result if the improvement isn't at
  1756  	// least 12.5%.
  1757  	blockType, compressed := compressBlock(compression, b, blockBuf.compressedBuf)
  1758  	if blockType != noCompressionBlockType && cap(compressed) > cap(blockBuf.compressedBuf) {
  1759  		blockBuf.compressedBuf = compressed[:cap(compressed)]
  1760  	}
  1761  	if len(compressed) < len(b)-len(b)/8 {
  1762  		b = compressed
  1763  	} else {
  1764  		blockType = noCompressionBlockType
  1765  	}
  1766  
  1767  	blockBuf.tmp[0] = byte(blockType)
  1768  
  1769  	// Calculate the checksum.
  1770  	checksum := blockBuf.checksummer.checksum(b, blockBuf.tmp[:1])
  1771  	binary.LittleEndian.PutUint32(blockBuf.tmp[1:5], checksum)
  1772  	return b
  1773  }
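
        // As a concrete illustration of the 12.5% rule above: a 4096-byte block is
        // written in compressed form only if the compressed output is shorter than
        // 4096 - 4096/8 = 3584 bytes; otherwise the uncompressed bytes are written
        // and the block type is set to noCompressionBlockType. Either way,
        // blockBuf.tmp[:5] ends up holding the 5-byte trailer that follows the block
        // on disk:
        //
        //	tmp[0]   = block type (compression indicator)
        //	tmp[1:5] = little-endian checksum over the block contents plus tmp[0]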
  1774  
  1775  func (w *Writer) writeCompressedBlock(block []byte, blockTrailerBuf []byte) (BlockHandle, error) {
  1776  	bh := BlockHandle{Offset: w.meta.Size, Length: uint64(len(block))}
  1777  
  1778  	if w.cacheID != 0 && w.fileNum.FileNum() != 0 {
  1779  		// Remove the block being written from the cache. This provides defense in
  1780  		// depth against bugs which cause cache collisions.
  1781  		//
  1782  		// TODO(peter): Alternatively, we could add the uncompressed value to the
  1783  		// cache.
  1784  		w.cache.Delete(w.cacheID, w.fileNum, bh.Offset)
  1785  	}
  1786  
  1787  	// Write the bytes to the file.
  1788  	if err := w.writable.Write(block); err != nil {
  1789  		return BlockHandle{}, err
  1790  	}
  1791  	w.meta.Size += uint64(len(block))
  1792  	if err := w.writable.Write(blockTrailerBuf[:blockTrailerLen]); err != nil {
  1793  		return BlockHandle{}, err
  1794  	}
  1795  	w.meta.Size += blockTrailerLen
  1796  
  1797  	return bh, nil
  1798  }
  1799  
  1800  // Write implements io.Writer. This is analogous to writeCompressedBlock for
  1801  // blocks that already incorporate the trailer, and don't need the callee to
  1802  // return a BlockHandle.
  1803  func (w *Writer) Write(blockWithTrailer []byte) (n int, err error) {
  1804  	offset := w.meta.Size
  1805  	if w.cacheID != 0 && w.fileNum.FileNum() != 0 {
  1806  		// Remove the block being written from the cache. This provides defense in
  1807  		// depth against bugs which cause cache collisions.
  1808  		//
  1809  		// TODO(peter): Alternatively, we could add the uncompressed value to the
  1810  		// cache.
  1811  		w.cache.Delete(w.cacheID, w.fileNum, offset)
  1812  	}
  1813  	w.meta.Size += uint64(len(blockWithTrailer))
  1814  	if err := w.writable.Write(blockWithTrailer); err != nil {
  1815  		return 0, err
  1816  	}
  1817  	return len(blockWithTrailer), nil
  1818  }
  1819  
  1820  func (w *Writer) writeBlock(
  1821  	b []byte, compression Compression, blockBuf *blockBuf,
  1822  ) (BlockHandle, error) {
  1823  	b = compressAndChecksum(b, compression, blockBuf)
  1824  	return w.writeCompressedBlock(b, blockBuf.tmp[:])
  1825  }
  1826  
  1827  // assertFormatCompatibility ensures that the features present on the table are
  1828  // compatible with the table format version.
  1829  func (w *Writer) assertFormatCompatibility() error {
  1830  	// PebbleDBv1: block properties.
  1831  	if len(w.blockPropCollectors) > 0 && w.tableFormat < TableFormatPebblev1 {
  1832  		return errors.Newf(
  1833  			"table format version %s is less than the minimum required version %s for block properties",
  1834  			w.tableFormat, TableFormatPebblev1,
  1835  		)
  1836  	}
  1837  
  1838  	// PebbleDBv2: range keys.
  1839  	if w.props.NumRangeKeys() > 0 && w.tableFormat < TableFormatPebblev2 {
  1840  		return errors.Newf(
  1841  			"table format version %s is less than the minimum required version %s for range keys",
  1842  			w.tableFormat, TableFormatPebblev2,
  1843  		)
  1844  	}
  1845  
  1846  	// PebbleDBv3: value blocks.
  1847  	if (w.props.NumValueBlocks > 0 || w.props.NumValuesInValueBlocks > 0 ||
  1848  		w.props.ValueBlocksSize > 0) && w.tableFormat < TableFormatPebblev3 {
  1849  		return errors.Newf(
  1850  			"table format version %s is less than the minimum required version %s for value blocks",
  1851  			w.tableFormat, TableFormatPebblev3)
  1852  	}
  1853  
  1854  	// PebbleDBv4: DELSIZED tombstones.
  1855  	if w.props.NumSizedDeletions > 0 && w.tableFormat < TableFormatPebblev4 {
  1856  		return errors.Newf(
  1857  			"table format version %s is less than the minimum required version %s for sized deletion tombstones",
  1858  			w.tableFormat, TableFormatPebblev4)
  1859  	}
  1860  	return nil
  1861  }
  1862  
  1863  // Close finishes writing the table and closes the underlying file that the
  1864  // table was written to.
  1865  func (w *Writer) Close() (err error) {
  1866  	defer func() {
  1867  		if w.valueBlockWriter != nil {
  1868  			releaseValueBlockWriter(w.valueBlockWriter)
  1869  			// Defensive code in case Close gets called again. We don't want to put
  1870  			// the same object into a sync.Pool twice.
  1871  			w.valueBlockWriter = nil
  1872  		}
  1873  		if w.writable != nil {
  1874  			w.writable.Abort()
  1875  			w.writable = nil
  1876  		}
  1877  		// Record any error in the writer (so we can exit early if Close is called
  1878  		// again).
  1879  		if err != nil {
  1880  			w.err = err
  1881  		}
  1882  	}()
  1883  
  1884  	// finish must be called before we check for an error, because finish will
  1885  	// block until every single task added to the writeQueue has been processed,
  1886  	// and an error could be encountered while any of those tasks are processed.
  1887  	if err := w.coordination.writeQueue.finish(); err != nil {
  1888  		return err
  1889  	}
  1890  
  1891  	if w.err != nil {
  1892  		return w.err
  1893  	}
  1894  
  1895  	// The w.meta.LargestPointKey is only used once the Writer is closed, so it is safe to set it
  1896  	// when the Writer is closed.
  1897  	//
  1898  	// The following invariants ensure that setting the largest key at this point of a Writer close
  1899  	// is correct:
  1900  	// 1. Keys must only be added to the Writer in an increasing order.
  1901  	// 2. The current w.dataBlockBuf is guaranteed to have the latest key added to the Writer. This
  1902  	//    must be true, because a w.dataBlockBuf is only switched out when a dataBlock is flushed,
  1903  	//    however, if a dataBlock is flushed, then we add a key to the new w.dataBlockBuf in the
  1904  	//    addPoint function after the flush occurs.
  1905  	if w.dataBlockBuf.dataBlock.nEntries >= 1 {
  1906  		w.meta.SetLargestPointKey(w.dataBlockBuf.dataBlock.getCurKey().Clone())
  1907  	}
  1908  
  1909  	// Finish the last data block, or force an empty data block if there
  1910  	// aren't any data blocks at all.
  1911  	if w.dataBlockBuf.dataBlock.nEntries > 0 || w.indexBlock.block.nEntries == 0 {
  1912  		bh, err := w.writeBlock(w.dataBlockBuf.dataBlock.finish(), w.compression, &w.dataBlockBuf.blockBuf)
  1913  		if err != nil {
  1914  			return err
  1915  		}
  1916  		bhp, err := w.maybeAddBlockPropertiesToBlockHandle(bh)
  1917  		if err != nil {
  1918  			return err
  1919  		}
  1920  		prevKey := w.dataBlockBuf.dataBlock.getCurKey()
  1921  		if err := w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil {
  1922  			return err
  1923  		}
  1924  	}
  1925  	w.props.DataSize = w.meta.Size
  1926  
  1927  	// Write the filter block.
  1928  	var metaindex rawBlockWriter
  1929  	metaindex.restartInterval = 1
  1930  	if w.filter != nil {
  1931  		b, err := w.filter.finish()
  1932  		if err != nil {
  1933  			return err
  1934  		}
  1935  		bh, err := w.writeBlock(b, NoCompression, &w.blockBuf)
  1936  		if err != nil {
  1937  			return err
  1938  		}
  1939  		n := encodeBlockHandle(w.blockBuf.tmp[:], bh)
  1940  		metaindex.add(InternalKey{UserKey: []byte(w.filter.metaName())}, w.blockBuf.tmp[:n])
  1941  		w.props.FilterPolicyName = w.filter.policyName()
  1942  		w.props.FilterSize = bh.Length
  1943  	}
  1944  
  1945  	var indexBH BlockHandle
  1946  	if w.twoLevelIndex {
  1947  		w.props.IndexType = twoLevelIndex
  1948  		// Write the two level index block.
  1949  		indexBH, err = w.writeTwoLevelIndex()
  1950  		if err != nil {
  1951  			return err
  1952  		}
  1953  	} else {
  1954  		w.props.IndexType = binarySearchIndex
  1955  		// NB: RocksDB includes the block trailer length in the index size
  1956  		// property, though it doesn't include the trailer in the filter size
  1957  		// property.
  1958  		w.props.IndexSize = uint64(w.indexBlock.estimatedSize()) + blockTrailerLen
  1959  		w.props.NumDataBlocks = uint64(w.indexBlock.block.nEntries)
  1960  
  1961  		// Write the single level index block.
  1962  		indexBH, err = w.writeBlock(w.indexBlock.finish(), w.compression, &w.blockBuf)
  1963  		if err != nil {
  1964  			return err
  1965  		}
  1966  	}
  1967  
  1968  	// Write the range-del block. The block handle must be added to the metaindex
  1969  	// block after the properties block has been written. This is because the
  1970  	// entries in the metaindex block must be sorted by key.
  1971  	var rangeDelBH BlockHandle
  1972  	if w.props.NumRangeDeletions > 0 {
  1973  		if !w.rangeDelV1Format {
  1974  			// Because the range tombstones are fragmented in the v2 format, the end
  1975  			// key of the last added range tombstone will be the largest range
  1976  			// tombstone key. Note that we need to make this into a range deletion
  1977  			// sentinel because sstable boundaries are inclusive while the end key of
  1978  			// a range deletion tombstone is exclusive. A Clone() is necessary as
  1979  			// rangeDelBlock.curValue is the same slice that will get passed
  1980  			// into w.writer, and some implementations of vfs.File mutate the
  1981  			// slice passed into Write(). Also, w.meta will often outlive the
  1982  			// blockWriter, and so cloning curValue allows the rangeDelBlock's
  1983  			// internal buffer to get gc'd.
  1984  			k := base.MakeRangeDeleteSentinelKey(w.rangeDelBlock.curValue).Clone()
  1985  			w.meta.SetLargestRangeDelKey(k)
  1986  		}
  1987  		rangeDelBH, err = w.writeBlock(w.rangeDelBlock.finish(), NoCompression, &w.blockBuf)
  1988  		if err != nil {
  1989  			return err
  1990  		}
  1991  	}
  1992  
  1993  	// Write the range-key block, flushing any remaining spans from the
  1994  	// fragmenter first.
  1995  	w.fragmenter.Finish()
  1996  
  1997  	var rangeKeyBH BlockHandle
  1998  	if w.props.NumRangeKeys() > 0 {
  1999  		key := w.rangeKeyBlock.getCurKey()
  2000  		kind := key.Kind()
  2001  		endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue)
  2002  		if !ok {
  2003  			return errors.Newf("invalid end key: %s", w.rangeKeyBlock.curValue)
  2004  		}
  2005  		k := base.MakeExclusiveSentinelKey(kind, endKey).Clone()
  2006  		w.meta.SetLargestRangeKey(k)
  2007  		// TODO(travers): The lack of compression on the range key block matches the
  2008  		// lack of compression on the range-del block. Revisit whether we want to
  2009  		// enable compression on this block.
  2010  		rangeKeyBH, err = w.writeBlock(w.rangeKeyBlock.finish(), NoCompression, &w.blockBuf)
  2011  		if err != nil {
  2012  			return err
  2013  		}
  2014  	}
  2015  
  2016  	if w.valueBlockWriter != nil {
  2017  		vbiHandle, vbStats, err := w.valueBlockWriter.finish(w, w.meta.Size)
  2018  		if err != nil {
  2019  			return err
  2020  		}
  2021  		w.props.NumValueBlocks = vbStats.numValueBlocks
  2022  		w.props.NumValuesInValueBlocks = vbStats.numValuesInValueBlocks
  2023  		w.props.ValueBlocksSize = vbStats.valueBlocksAndIndexSize
  2024  		if vbStats.numValueBlocks > 0 {
  2025  			n := encodeValueBlocksIndexHandle(w.blockBuf.tmp[:], vbiHandle)
  2026  			metaindex.add(InternalKey{UserKey: []byte(metaValueIndexName)}, w.blockBuf.tmp[:n])
  2027  		}
  2028  	}
  2029  
  2030  	// Add the range key block handle to the metaindex block. Note that we add the
  2031  	// block handle to the metaindex block before the other meta blocks as the
  2032  	// metaindex block entries must be sorted, and the range key block name sorts
  2033  	// before the other block names.
  2034  	if w.props.NumRangeKeys() > 0 {
  2035  		n := encodeBlockHandle(w.blockBuf.tmp[:], rangeKeyBH)
  2036  		metaindex.add(InternalKey{UserKey: []byte(metaRangeKeyName)}, w.blockBuf.tmp[:n])
  2037  	}
  2038  
  2039  	{
  2040  		userProps := make(map[string]string)
  2041  		for i := range w.propCollectors {
  2042  			if err := w.propCollectors[i].Finish(userProps); err != nil {
  2043  				return err
  2044  			}
  2045  		}
  2046  		for i := range w.blockPropCollectors {
  2047  			scratch := w.blockPropsEncoder.getScratchForProp()
  2048  			// Place the shortID in the first byte.
  2049  			scratch = append(scratch, byte(i))
  2050  			buf, err := w.blockPropCollectors[i].FinishTable(scratch)
  2051  			if err != nil {
  2052  				return err
  2053  			}
  2054  			var prop string
  2055  			if len(buf) > 0 {
  2056  				prop = string(buf)
  2057  			}
  2058  			// NB: The property is populated in the map even if it is the
  2059  			// empty string, since the presence in the map is what indicates
  2060  			// that the block property collector was used when writing.
  2061  			userProps[w.blockPropCollectors[i].Name()] = prop
  2062  		}
  2063  		if len(userProps) > 0 {
  2064  			w.props.UserProperties = userProps
  2065  		}
  2066  
  2067  		// Write the properties block.
  2068  		var raw rawBlockWriter
  2069  		// The restart interval is set to infinity because the properties block
  2070  		// is always read sequentially and cached in a heap-located object. This
  2071  		// reduces table size without a significant impact on performance.
  2072  		raw.restartInterval = propertiesBlockRestartInterval
  2073  		w.props.CompressionOptions = rocksDBCompressionOptions
  2074  		w.props.save(w.tableFormat, &raw)
  2075  		bh, err := w.writeBlock(raw.finish(), NoCompression, &w.blockBuf)
  2076  		if err != nil {
  2077  			return err
  2078  		}
  2079  		n := encodeBlockHandle(w.blockBuf.tmp[:], bh)
  2080  		metaindex.add(InternalKey{UserKey: []byte(metaPropertiesName)}, w.blockBuf.tmp[:n])
  2081  	}
  2082  
  2083  	// Add the range deletion block handle to the metaindex block.
  2084  	if w.props.NumRangeDeletions > 0 {
  2085  		n := encodeBlockHandle(w.blockBuf.tmp[:], rangeDelBH)
  2086  		// The v2 range-del block encoding is backwards compatible with the v1
  2087  		// encoding. We add meta-index entries for both the old name and the new
  2088  		// name so that old code can continue to find the range-del block and new
  2089  		// code knows that the range tombstones in the block are fragmented and
  2090  		// sorted.
  2091  		metaindex.add(InternalKey{UserKey: []byte(metaRangeDelName)}, w.blockBuf.tmp[:n])
  2092  		if !w.rangeDelV1Format {
  2093  			metaindex.add(InternalKey{UserKey: []byte(metaRangeDelV2Name)}, w.blockBuf.tmp[:n])
  2094  		}
  2095  	}
  2096  
  2097  	// Write the metaindex block. It might be an empty block, if the filter
  2098  	// policy is nil. NoCompression is specified because a) RocksDB never
  2099  	// compresses the meta-index block and b) RocksDB has some code paths which
  2100  	// expect the meta-index block to not be compressed.
  2101  	metaindexBH, err := w.writeBlock(metaindex.blockWriter.finish(), NoCompression, &w.blockBuf)
  2102  	if err != nil {
  2103  		return err
  2104  	}
  2105  
  2106  	// Write the table footer.
  2107  	footer := footer{
  2108  		format:      w.tableFormat,
  2109  		checksum:    w.blockBuf.checksummer.checksumType,
  2110  		metaindexBH: metaindexBH,
  2111  		indexBH:     indexBH,
  2112  	}
  2113  	encoded := footer.encode(w.blockBuf.tmp[:])
  2114  	if err := w.writable.Write(footer.encode(w.blockBuf.tmp[:])); err != nil {
  2115  		return err
  2116  	}
  2117  	w.meta.Size += uint64(len(encoded))
  2118  	w.meta.Properties = w.props
  2119  
  2120  	// Check that the features present in the table are compatible with the format
  2121  	// configured for the table.
  2122  	if err = w.assertFormatCompatibility(); err != nil {
  2123  		return err
  2124  	}
  2125  
  2126  	if err := w.writable.Finish(); err != nil {
  2127  		w.writable = nil
  2128  		return err
  2129  	}
  2130  	w.writable = nil
  2131  
  2132  	w.dataBlockBuf.clear()
  2133  	dataBlockBufPool.Put(w.dataBlockBuf)
  2134  	w.dataBlockBuf = nil
  2135  	w.indexBlock.clear()
  2136  	indexBlockBufPool.Put(w.indexBlock)
  2137  	w.indexBlock = nil
  2138  
  2139  	// Make any future calls to Set or Close return an error.
  2140  	w.err = errWriterClosed
  2141  	return nil
  2142  }
  2143  
  2144  // EstimatedSize returns the estimated size of the sstable being written if a
  2145  // call to Finish() was made without adding additional keys.
  2146  func (w *Writer) EstimatedSize() uint64 {
  2147  	return w.coordination.sizeEstimate.size() +
  2148  		uint64(w.dataBlockBuf.dataBlock.estimatedSize()) +
  2149  		w.indexBlock.estimatedSize()
  2150  }
  2151  
  2152  // Metadata returns the metadata for the finished sstable. Only valid to call
  2153  // after the sstable has been finished.
  2154  func (w *Writer) Metadata() (*WriterMetadata, error) {
  2155  	if w.writable != nil {
  2156  		return nil, errors.New("pebble: writer is not closed")
  2157  	}
  2158  	return &w.meta, nil
  2159  }
  2160  
  2161  // WriterOption provides an interface to do work on a Writer while it is being
  2162  // opened.
  2163  type WriterOption interface {
  2164  	// writerApply is called on the writer during opening in order to set
  2165  	// internal parameters.
  2166  	writerApply(*Writer)
  2167  }
  2168  
  2169  // PreviousPointKeyOpt is a WriterOption that provides access to the last
  2170  // point key written to the writer while building an sstable.
  2171  type PreviousPointKeyOpt struct {
  2172  	w *Writer
  2173  }
  2174  
  2175  // UnsafeKey returns the last point key written to the writer to which this
  2176  // option was passed during creation. The returned key points directly into
  2177  // a buffer belonging to the Writer. The value's lifetime ends the next time a
  2178  // point key is added to the Writer.
  2179  // Invariant: UnsafeKey must not be called after the Writer is closed.
  2180  func (o PreviousPointKeyOpt) UnsafeKey() base.InternalKey {
  2181  	if o.w == nil {
  2182  		return base.InvalidInternalKey
  2183  	}
  2184  
  2185  	if o.w.dataBlockBuf.dataBlock.nEntries >= 1 {
  2186  		// o.w.dataBlockBuf.dataBlock.curKey is guaranteed to point to the last point key
  2187  		// which was added to the Writer.
  2188  		return o.w.dataBlockBuf.dataBlock.getCurKey()
  2189  	}
  2190  	return base.InternalKey{}
  2191  }
  2192  
  2193  func (o *PreviousPointKeyOpt) writerApply(w *Writer) {
  2194  	o.w = w
  2195  }
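
        // A minimal usage sketch (writable and opts are assumed to be set up
        // elsewhere): the option is passed to NewWriter and later queried for the
        // most recently written point key.
        //
        //	prevOpt := &PreviousPointKeyOpt{}
        //	w := NewWriter(writable, opts, prevOpt)
        //	_ = w.Set([]byte("a"), []byte("1"))
        //	last := prevOpt.UnsafeKey() // points into w's buffers; copy before the next add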
  2196  
  2197  // NewWriter returns a new table writer for the file. Closing the writer will
  2198  // close the file.
  2199  func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...WriterOption) *Writer {
  2200  	o = o.ensureDefaults()
  2201  	w := &Writer{
  2202  		writable: writable,
  2203  		meta: WriterMetadata{
  2204  			SmallestSeqNum: math.MaxUint64,
  2205  		},
  2206  		blockSize:               o.BlockSize,
  2207  		blockSizeThreshold:      (o.BlockSize*o.BlockSizeThreshold + 99) / 100,
  2208  		indexBlockSize:          o.IndexBlockSize,
  2209  		indexBlockSizeThreshold: (o.IndexBlockSize*o.BlockSizeThreshold + 99) / 100,
  2210  		compare:                 o.Comparer.Compare,
  2211  		split:                   o.Comparer.Split,
  2212  		formatKey:               o.Comparer.FormatKey,
  2213  		compression:             o.Compression,
  2214  		separator:               o.Comparer.Separator,
  2215  		successor:               o.Comparer.Successor,
  2216  		tableFormat:             o.TableFormat,
  2217  		isStrictObsolete:        o.IsStrictObsolete,
  2218  		writingToLowestLevel:    o.WritingToLowestLevel,
  2219  		cache:                   o.Cache,
  2220  		restartInterval:         o.BlockRestartInterval,
  2221  		checksumType:            o.Checksum,
  2222  		indexBlock:              newIndexBlockBuf(o.Parallelism),
  2223  		rangeDelBlock: blockWriter{
  2224  			restartInterval: 1,
  2225  		},
  2226  		rangeKeyBlock: blockWriter{
  2227  			restartInterval: 1,
  2228  		},
  2229  		topLevelIndexBlock: blockWriter{
  2230  			restartInterval: 1,
  2231  		},
  2232  		fragmenter: keyspan.Fragmenter{
  2233  			Cmp:    o.Comparer.Compare,
  2234  			Format: o.Comparer.FormatKey,
  2235  		},
  2236  	}
  2237  	if w.tableFormat >= TableFormatPebblev3 {
  2238  		w.shortAttributeExtractor = o.ShortAttributeExtractor
  2239  		w.requiredInPlaceValueBound = o.RequiredInPlaceValueBound
  2240  		w.valueBlockWriter = newValueBlockWriter(
  2241  			w.blockSize, w.blockSizeThreshold, w.compression, w.checksumType, func(compressedSize int) {
  2242  				w.coordination.sizeEstimate.dataBlockCompressed(compressedSize, 0)
  2243  			})
  2244  	}
  2245  
  2246  	w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType)
  2247  
  2248  	w.blockBuf = blockBuf{
  2249  		checksummer: checksummer{checksumType: o.Checksum},
  2250  	}
  2251  
  2252  	w.coordination.init(o.Parallelism, w)
  2253  
  2254  	if writable == nil {
  2255  		w.err = errors.New("pebble: nil writable")
  2256  		return w
  2257  	}
  2258  
  2259  	// Note that WriterOptions are applied in two places; the ones with a
  2260  	// preApply() method are applied here. The rest are applied down below after
  2261  	// default properties are set.
  2262  	type preApply interface{ preApply() }
  2263  	for _, opt := range extraOpts {
  2264  		if _, ok := opt.(preApply); ok {
  2265  			opt.writerApply(w)
  2266  		}
  2267  	}
  2268  
  2269  	w.props.PrefixExtractorName = "nullptr"
  2270  	if o.FilterPolicy != nil {
  2271  		switch o.FilterType {
  2272  		case TableFilter:
  2273  			w.filter = newTableFilterWriter(o.FilterPolicy)
  2274  			if w.split != nil {
  2275  				w.props.PrefixExtractorName = o.Comparer.Name
  2276  				w.props.PrefixFiltering = true
  2277  			} else {
  2278  				w.props.WholeKeyFiltering = true
  2279  			}
  2280  		default:
  2281  			panic(fmt.Sprintf("unknown filter type: %v", o.FilterType))
  2282  		}
  2283  	}
  2284  
  2285  	w.props.ComparerName = o.Comparer.Name
  2286  	w.props.CompressionName = o.Compression.String()
  2287  	w.props.MergerName = o.MergerName
  2288  	w.props.PropertyCollectorNames = "[]"
  2289  	w.props.ExternalFormatVersion = rocksDBExternalFormatVersion
  2290  
  2291  	if len(o.TablePropertyCollectors) > 0 || len(o.BlockPropertyCollectors) > 0 ||
  2292  		w.tableFormat >= TableFormatPebblev4 {
  2293  		var buf bytes.Buffer
  2294  		buf.WriteString("[")
  2295  		if len(o.TablePropertyCollectors) > 0 {
  2296  			w.propCollectors = make([]TablePropertyCollector, len(o.TablePropertyCollectors))
  2297  			for i := range o.TablePropertyCollectors {
  2298  				w.propCollectors[i] = o.TablePropertyCollectors[i]()
  2299  				if i > 0 {
  2300  					buf.WriteString(",")
  2301  				}
  2302  				buf.WriteString(w.propCollectors[i].Name())
  2303  			}
  2304  		}
  2305  		numBlockPropertyCollectors := len(o.BlockPropertyCollectors)
  2306  		if w.tableFormat >= TableFormatPebblev4 {
  2307  			numBlockPropertyCollectors++
  2308  		}
  2309  		// shortID is a uint8, so we cannot exceed that number of block
  2310  		// property collectors.
  2311  		if numBlockPropertyCollectors > math.MaxUint8 {
  2312  			w.err = errors.New("pebble: too many block property collectors")
  2313  			return w
  2314  		}
  2315  		if numBlockPropertyCollectors > 0 {
  2316  			w.blockPropCollectors = make([]BlockPropertyCollector, numBlockPropertyCollectors)
  2317  		}
  2318  		if len(o.BlockPropertyCollectors) > 0 {
  2319  			// The shortID assigned to a collector is the same as its index in
  2320  			// this slice.
  2321  			for i := range o.BlockPropertyCollectors {
  2322  				w.blockPropCollectors[i] = o.BlockPropertyCollectors[i]()
  2323  				if i > 0 || len(o.TablePropertyCollectors) > 0 {
  2324  					buf.WriteString(",")
  2325  				}
  2326  				buf.WriteString(w.blockPropCollectors[i].Name())
  2327  			}
  2328  		}
  2329  		if w.tableFormat >= TableFormatPebblev4 {
  2330  			if numBlockPropertyCollectors > 1 || len(o.TablePropertyCollectors) > 0 {
  2331  				buf.WriteString(",")
  2332  			}
  2333  			w.blockPropCollectors[numBlockPropertyCollectors-1] = &w.obsoleteCollector
  2334  			buf.WriteString(w.obsoleteCollector.Name())
  2335  		}
  2336  		buf.WriteString("]")
  2337  		w.props.PropertyCollectorNames = buf.String()
  2338  	}
  2339  
  2340  	// Apply the remaining WriterOptions that do not have a preApply() method.
  2341  	for _, opt := range extraOpts {
  2342  		if _, ok := opt.(preApply); ok {
  2343  			continue
  2344  		}
  2345  		opt.writerApply(w)
  2346  	}
  2347  
  2348  	// Initialize the range key fragmenter and encoder.
  2349  	w.fragmenter.Emit = w.encodeRangeKeySpan
  2350  	w.rangeKeyEncoder.Emit = w.addRangeKey
  2351  	return w
  2352  }
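
        // A minimal end-to-end sketch (writable is any objstorage.Writable; the
        // options and keys are placeholders, and keys must be added in the comparer's
        // order):
        //
        //	w := NewWriter(writable, WriterOptions{
        //		Comparer:    base.DefaultComparer,
        //		TableFormat: TableFormatPebblev4,
        //	})
        //	_ = w.Set([]byte("a"), []byte("1"))
        //	_ = w.Set([]byte("b"), []byte("2"))
        //	if err := w.Close(); err != nil {
        //		// handle the error
        //	}
        //	meta, _ := w.Metadata() // only valid after Close
        //	_ = meta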
  2353  
  2354  // internalGetProperties is a private, internal-use-only function that takes a
  2355  // Writer and returns a pointer to its Properties, allowing direct mutation.
  2356  // It's used by internal Pebble flushes and compactions to set internal
  2357  // properties. It gets installed in private.
  2358  func internalGetProperties(w *Writer) *Properties {
  2359  	return &w.props
  2360  }
  2361  
  2362  func init() {
  2363  	private.SSTableWriterDisableKeyOrderChecks = func(i interface{}) {
  2364  		w := i.(*Writer)
  2365  		w.disableKeyOrderChecks = true
  2366  	}
  2367  	private.SSTableInternalProperties = internalGetProperties
  2368  }
  2369  
  2370  type obsoleteKeyBlockPropertyCollector struct {
  2371  	blockIsNonObsolete bool
  2372  	indexIsNonObsolete bool
  2373  	tableIsNonObsolete bool
  2374  }
  2375  
  2376  func encodeNonObsolete(isNonObsolete bool, buf []byte) []byte {
  2377  	if isNonObsolete {
  2378  		return buf
  2379  	}
  2380  	return append(buf, 't')
  2381  }
  2382  
  2383  func (o *obsoleteKeyBlockPropertyCollector) Name() string {
  2384  	return "obsolete-key"
  2385  }
  2386  
  2387  func (o *obsoleteKeyBlockPropertyCollector) Add(key InternalKey, value []byte) error {
  2388  	// Ignore.
  2389  	return nil
  2390  }
  2391  
  2392  func (o *obsoleteKeyBlockPropertyCollector) AddPoint(isObsolete bool) {
  2393  	o.blockIsNonObsolete = o.blockIsNonObsolete || !isObsolete
  2394  }
  2395  
  2396  func (o *obsoleteKeyBlockPropertyCollector) FinishDataBlock(buf []byte) ([]byte, error) {
  2397  	o.tableIsNonObsolete = o.tableIsNonObsolete || o.blockIsNonObsolete
  2398  	return encodeNonObsolete(o.blockIsNonObsolete, buf), nil
  2399  }
  2400  
  2401  func (o *obsoleteKeyBlockPropertyCollector) AddPrevDataBlockToIndexBlock() {
  2402  	o.indexIsNonObsolete = o.indexIsNonObsolete || o.blockIsNonObsolete
  2403  	o.blockIsNonObsolete = false
  2404  }
  2405  
  2406  func (o *obsoleteKeyBlockPropertyCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
  2407  	indexIsNonObsolete := o.indexIsNonObsolete
  2408  	o.indexIsNonObsolete = false
  2409  	return encodeNonObsolete(indexIsNonObsolete, buf), nil
  2410  }
  2411  
  2412  func (o *obsoleteKeyBlockPropertyCollector) FinishTable(buf []byte) ([]byte, error) {
  2413  	return encodeNonObsolete(o.tableIsNonObsolete, buf), nil
  2414  }
  2415  
  2416  func (o *obsoleteKeyBlockPropertyCollector) UpdateKeySuffixes(
  2417  	oldProp []byte, oldSuffix, newSuffix []byte,
  2418  ) error {
  2419  	_, err := propToIsObsolete(oldProp)
  2420  	if err != nil {
  2421  		return err
  2422  	}
  2423  	// Suffix rewriting currently loses the obsolete bit.
  2424  	o.blockIsNonObsolete = true
  2425  	return nil
  2426  }
  2427  
  2428  // NB: obsoleteKeyBlockPropertyFilter is stateless. This aspect of the filter
  2429  // is used in table_cache.go for in-place modification of a filters slice.
  2430  type obsoleteKeyBlockPropertyFilter struct{}
  2431  
  2432  func (o obsoleteKeyBlockPropertyFilter) Name() string {
  2433  	return "obsolete-key"
  2434  }
  2435  
  2436  // Intersects returns true if the set represented by prop intersects with
  2437  // the set in the filter.
  2438  func (o obsoleteKeyBlockPropertyFilter) Intersects(prop []byte) (bool, error) {
  2439  	return propToIsObsolete(prop)
  2440  }
  2441  
  2442  func propToIsObsolete(prop []byte) (bool, error) {
  2443  	if len(prop) == 0 {
  2444  		return true, nil
  2445  	}
  2446  	if len(prop) > 1 || prop[0] != 't' {
  2447  		return false, errors.Errorf("unexpected property %x", prop)
  2448  	}
  2449  	return false, nil
  2450  }
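
        // To summarize the block-property encoding consumed above (and produced by
        // encodeNonObsolete): an empty property means the block (or index block, or
        // table) still contains at least one non-obsolete key, while the single byte
        // 't' means every key is obsolete. Intersects therefore behaves as:
        //
        //	Intersects([]byte{})    -> true   // block has live keys; must be read
        //	Intersects([]byte("t")) -> false  // block is entirely obsolete; skippable
        //
        // for iterators configured to hide obsolete keys.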