github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/writer.go

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"encoding/binary"
    11  	"fmt"
    12  	"io"
    13  	"math"
    14  	"runtime"
    15  	"sync"
    16  
    17  	"github.com/cespare/xxhash/v2"
    18  	"github.com/cockroachdb/errors"
    19  	"github.com/zuoyebang/bitalostable/internal/base"
    20  	"github.com/zuoyebang/bitalostable/internal/cache"
    21  	"github.com/zuoyebang/bitalostable/internal/crc"
    22  	"github.com/zuoyebang/bitalostable/internal/invariants"
    23  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    24  	"github.com/zuoyebang/bitalostable/internal/private"
    25  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    26  )
    27  
    28  // encodedBHPEstimatedSize estimates the size of the encoded BlockHandleWithProperties.
    29  // It would also be nice to account for the length of the data block properties here,
    30  // but that isn't necessary since this is an estimate.
    31  const encodedBHPEstimatedSize = binary.MaxVarintLen64 * 2
    32  
    33  var errWriterClosed = errors.New("bitalostable: writer is closed")
    34  
    35  // WriterMetadata holds info about a finished sstable.
    36  type WriterMetadata struct {
    37  	Size          uint64
    38  	SmallestPoint InternalKey
    39  	// LargestPoint, LargestRangeKey, LargestRangeDel should not be accessed
    40  	// before Writer.Close is called, because they may only be set on
    41  	// Writer.Close.
    42  	LargestPoint     InternalKey
    43  	SmallestRangeDel InternalKey
    44  	LargestRangeDel  InternalKey
    45  	SmallestRangeKey InternalKey
    46  	LargestRangeKey  InternalKey
    47  	HasPointKeys     bool
    48  	HasRangeDelKeys  bool
    49  	HasRangeKeys     bool
    50  	SmallestSeqNum   uint64
    51  	LargestSeqNum    uint64
    52  	Properties       Properties
    53  }
    54  
    55  // SetSmallestPointKey sets the smallest point key to the given key.
    56  // NB: this method sets the "absolute" smallest point key. Any existing key is
    57  // overridden.
    58  func (m *WriterMetadata) SetSmallestPointKey(k InternalKey) {
    59  	m.SmallestPoint = k
    60  	m.HasPointKeys = true
    61  }
    62  
    63  // SetSmallestRangeDelKey sets the smallest rangedel key to the given key.
    64  // NB: this method sets the "absolute" smallest rangedel key. Any existing key is
    65  // overridden.
    66  func (m *WriterMetadata) SetSmallestRangeDelKey(k InternalKey) {
    67  	m.SmallestRangeDel = k
    68  	m.HasRangeDelKeys = true
    69  }
    70  
    71  // SetSmallestRangeKey sets the smallest range key to the given key.
    72  // NB: this method sets the "absolute" smallest range key. Any existing key is
    73  // overridden.
    74  func (m *WriterMetadata) SetSmallestRangeKey(k InternalKey) {
    75  	m.SmallestRangeKey = k
    76  	m.HasRangeKeys = true
    77  }
    78  
    79  // SetLargestPointKey sets the largest point key to the given key.
    80  // NB: this method sets the "absolute" largest point key. Any existing key is
    81  // overridden.
    82  func (m *WriterMetadata) SetLargestPointKey(k InternalKey) {
    83  	m.LargestPoint = k
    84  	m.HasPointKeys = true
    85  }
    86  
    87  // SetLargestRangeDelKey sets the largest rangedel key to the given key.
    88  // NB: this method sets the "absolute" largest rangedel key. Any existing key is
    89  // overridden.
    90  func (m *WriterMetadata) SetLargestRangeDelKey(k InternalKey) {
    91  	m.LargestRangeDel = k
    92  	m.HasRangeDelKeys = true
    93  }
    94  
    95  // SetLargestRangeKey sets the largest range key to the given key.
    96  // NB: this method sets the "absolute" largest range key. Any existing key is
    97  // overridden.
    98  func (m *WriterMetadata) SetLargestRangeKey(k InternalKey) {
    99  	m.LargestRangeKey = k
   100  	m.HasRangeKeys = true
   101  }
   102  
   103  func (m *WriterMetadata) updateSeqNum(seqNum uint64) {
   104  	if m.SmallestSeqNum > seqNum {
   105  		m.SmallestSeqNum = seqNum
   106  	}
   107  	if m.LargestSeqNum < seqNum {
   108  		m.LargestSeqNum = seqNum
   109  	}
   110  }
   111  
   112  type flusher interface {
   113  	Flush() error
   114  }
   115  
   116  type writeCloseSyncer interface {
   117  	io.WriteCloser
   118  	Sync() error
   119  }
   120  
   121  // Writer is a table writer.
   122  type Writer struct {
   123  	writer    io.Writer
   124  	bufWriter *bufio.Writer
   125  	syncer    writeCloseSyncer
   126  	meta      WriterMetadata
   127  	err       error
   128  	// cacheID and fileNum are used to remove blocks written to the sstable from
   129  	// the cache, providing a defense in depth against bugs which cause cache
   130  	// collisions.
   131  	cacheID uint64
   132  	fileNum base.FileNum
   133  	// The following fields are copied from Options.
   134  	blockSize               int
   135  	blockSizeThreshold      int
   136  	indexBlockSize          int
   137  	indexBlockSizeThreshold int
   138  	compare                 Compare
   139  	split                   Split
   140  	formatKey               base.FormatKey
   141  	compression             Compression
   142  	separator               Separator
   143  	successor               Successor
   144  	tableFormat             TableFormat
   145  	cache                   *cache.Cache
   146  	restartInterval         int
   147  	checksumType            ChecksumType
   148  	// disableKeyOrderChecks disables the checks that keys are added to an
   149  	// sstable in order. It is intended for internal use only in the construction
   150  	// of invalid sstables for testing. See tool/make_test_sstables.go.
   151  	disableKeyOrderChecks bool
   152  	// With two level indexes, the index/filter of a SST file is partitioned into
   153  	// smaller blocks with an additional top-level index on them. When reading an
   154  	// index/filter, only the top-level index is loaded into memory. The two level
   155  	// index/filter then uses the top-level index to load on demand into the block
   156  	// cache the partitions that are required to perform the index/filter query.
   157  	//
   158  	// Two level indexes are enabled automatically when there is more than one
   159  	// index block.
   160  	//
   161  	// This is useful when there are very large index blocks, which generally occur
   162  	// with the use of large keys. With large index blocks, the index blocks fight
   163  	// the data blocks for block cache space and the index blocks are likely to be
   164  	// re-read many times from the disk. The top level index, which has a much
   165  	// smaller memory footprint, can be used to prevent the entire index block from
   166  	// being loaded into the block cache.
   167  	twoLevelIndex bool
   168  	// Internal flag to allow creation of range-del-v1 format blocks. Only used
   169  	// for testing. Note that v2 format blocks are backwards compatible with v1
   170  	// format blocks.
   171  	rangeDelV1Format    bool
   172  	indexBlock          *indexBlockBuf
   173  	rangeDelBlock       blockWriter
   174  	rangeKeyBlock       blockWriter
   175  	topLevelIndexBlock  blockWriter
   176  	props               Properties
   177  	propCollectors      []TablePropertyCollector
   178  	blockPropCollectors []BlockPropertyCollector
   179  	blockPropsEncoder   blockPropertiesEncoder
   180  	// filter accumulates the filter block. If populated, the filter ingests
   181  	// either the output of w.split (i.e. a prefix extractor) if w.split is not
   182  	// nil, or the full keys otherwise.
   183  	filter          filterWriter
   184  	indexPartitions []indexBlockAndBlockProperties
   185  
   186  	// indexBlockAlloc is used to bulk-allocate byte slices used to store index
   187  	// blocks in indexPartitions. These live until the index finishes.
   188  	indexBlockAlloc []byte
   189  	// indexSepAlloc is used to bulk-allocate index block separator slices stored
   190  	// in indexPartitions. These live until the index finishes.
   191  	indexSepAlloc []byte
   192  
   193  	// To allow potentially overlapping (i.e. un-fragmented) range key spans to
   194  	// be added to the Writer, a keyspan.Fragmenter is used to retain the keys
   195  	// and values, emitting fragmented, coalesced spans as appropriate. Range
   196  	// keys must be added in order of their start user-key.
   197  	fragmenter        keyspan.Fragmenter
   198  	rangeKeyEncoder   rangekey.Encoder
   199  	rangeKeyCoalesced keyspan.Span
   200  	rkBuf             []byte
   201  	// dataBlockBuf consists of the state which is currently owned by and used by
   202  	// the Writer client goroutine. This state can be handed off to other goroutines.
   203  	dataBlockBuf *dataBlockBuf
   204  	// blockBuf consists of the state which is owned by and used by the Writer client
   205  	// goroutine.
   206  	blockBuf blockBuf
   207  
   208  	coordination coordinationState
   209  }
   210  
   211  type coordinationState struct {
   212  	parallelismEnabled bool
   213  
   214  	// writeQueue is used to write data blocks to disk. The writeQueue is primarily
   215  	// used to maintain the order in which data blocks must be written to disk. For
   216  	// this reason, every single data block write must be done through the writeQueue.
   217  	writeQueue *writeQueue
   218  
   219  	sizeEstimate dataBlockEstimates
   220  }
   221  
   222  func (c *coordinationState) init(parallelismEnabled bool, writer *Writer) {
   223  	c.parallelismEnabled = parallelismEnabled
   224  	c.sizeEstimate.useMutex = parallelismEnabled
   225  
   226  	// writeQueueSize determines the size of the write queue, or the number
   227  	// of items which can be added to the queue without blocking. By default, we
   228  	// use a writeQueue size of 0, since we won't be doing any block writes in
   229  	// parallel.
   230  	writeQueueSize := 0
   231  	if parallelismEnabled {
   232  		writeQueueSize = runtime.GOMAXPROCS(0)
   233  	}
   234  	c.writeQueue = newWriteQueue(writeQueueSize, writer)
   235  }
   236  
   237  type sizeEstimate struct {
   238  	// emptySize is the size when there is no inflight data, and numEntries is 0.
   239  	// emptySize is constant once set.
   240  	emptySize uint64
   241  
   242  	// inflightSize is the estimated size of some inflight data which hasn't
   243  	// been written yet.
   244  	inflightSize uint64
   245  
   246  	// totalSize is the total size of the data which has already been written.
   247  	totalSize uint64
   248  
   249  	// numWrittenEntries is the total number of entries which have already been
   250  	// written.
   251  	numWrittenEntries uint64
   252  	// numInflightEntries is the total number of entries which are inflight, and
   253  	// haven't been written.
   254  	numInflightEntries uint64
   255  
   256  	// maxEstimatedSize stores the maximum result returned from sizeEstimate.size.
   257  	// It ensures that values returned from subsequent calls to Writer.EstimatedSize
   258  	// never decrease.
   259  	maxEstimatedSize uint64
   260  
   261  	// We assume that the entries added to the sizeEstimate can be compressed.
   262  	// For this reason, we keep track of a compressedSize and an uncompressedSize
   263  	// to compute a compression ratio for the inflight entries. If the entries
   264  	// aren't being compressed, then compressedSize and uncompressedSize must be
   265  	// equal.
   266  	compressedSize   uint64
   267  	uncompressedSize uint64
   268  }
   269  
   270  func (s *sizeEstimate) init(emptySize uint64) {
   271  	s.emptySize = emptySize
   272  }
   273  
   274  func (s *sizeEstimate) size() uint64 {
   275  	ratio := float64(1)
   276  	if s.uncompressedSize > 0 {
   277  		ratio = float64(s.compressedSize) / float64(s.uncompressedSize)
   278  	}
   279  	estimatedInflightSize := uint64(float64(s.inflightSize) * ratio)
   280  	total := s.totalSize + estimatedInflightSize
   281  	if total > s.maxEstimatedSize {
   282  		s.maxEstimatedSize = total
   283  	} else {
   284  		total = s.maxEstimatedSize
   285  	}
   286  
   287  	if total == 0 {
   288  		return s.emptySize
   289  	}
   290  
   291  	return total
   292  }
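
// A worked example of this estimate (all values hypothetical): with
// compressedSize=500, uncompressedSize=1000, inflightSize=200 and
// totalSize=4000, size() computes
//
//	ratio                 = 500.0 / 1000.0 = 0.5
//	estimatedInflightSize = 200 * 0.5      = 100
//	total                 = 4000 + 100     = 4100
//
// and then returns the larger of total and maxEstimatedSize, so successive
// calls to Writer.EstimatedSize never report a shrinking table.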
   293  
   294  func (s *sizeEstimate) numTotalEntries() uint64 {
   295  	return s.numWrittenEntries + s.numInflightEntries
   296  }
   297  
   298  func (s *sizeEstimate) addInflight(size int) {
   299  	s.numInflightEntries++
   300  	s.inflightSize += uint64(size)
   301  }
   302  
   303  func (s *sizeEstimate) written(newTotalSize uint64, inflightSize int, finalEntrySize int) {
   304  	s.inflightSize -= uint64(inflightSize)
   305  	if inflightSize > 0 {
   306  		// This entry was previously inflight, so we should decrement inflight
   307  		// entries.
   308  		s.numInflightEntries--
   309  	}
   310  	s.numWrittenEntries++
   311  	s.totalSize = newTotalSize
   312  
   313  	s.uncompressedSize += uint64(inflightSize)
   314  	s.compressedSize += uint64(finalEntrySize)
   315  }
   316  
   317  func (s *sizeEstimate) clear() {
   318  	*s = sizeEstimate{emptySize: s.emptySize}
   319  }
   320  
   321  type indexBlockBuf struct {
   322  	// block will only be accessed from the writeQueue.
   323  	block blockWriter
   324  
   325  	size struct {
   326  		useMutex bool
   327  		mu       sync.Mutex
   328  		estimate sizeEstimate
   329  	}
   330  
   331  	// restartInterval matches indexBlockBuf.block.restartInterval. We store it twice, because the `block`
   332  	// must only be accessed from the writeQueue goroutine.
   333  	restartInterval int
   334  }
   335  
   336  func (i *indexBlockBuf) clear() {
   337  	i.block.clear()
   338  	if i.size.useMutex {
   339  		i.size.mu.Lock()
   340  		defer i.size.mu.Unlock()
   341  	}
   342  	i.size.estimate.clear()
   343  	i.restartInterval = 0
   344  }
   345  
   346  var indexBlockBufPool = sync.Pool{
   347  	New: func() interface{} {
   348  		return &indexBlockBuf{}
   349  	},
   350  }
   351  
   352  const indexBlockRestartInterval = 1
   353  
   354  func newIndexBlockBuf(useMutex bool) *indexBlockBuf {
   355  	i := indexBlockBufPool.Get().(*indexBlockBuf)
   356  	i.size.useMutex = useMutex
   357  	i.restartInterval = indexBlockRestartInterval
   358  	i.block.restartInterval = indexBlockRestartInterval
   359  	i.size.estimate.init(emptyBlockSize)
   360  	return i
   361  }
   362  
   363  func (i *indexBlockBuf) shouldFlush(
   364  	sep InternalKey, valueLen, targetBlockSize, sizeThreshold int,
   365  ) bool {
   366  	if i.size.useMutex {
   367  		i.size.mu.Lock()
   368  		defer i.size.mu.Unlock()
   369  	}
   370  
   371  	// nEntries := i.size.estimate.numWrittenEntries + i.size.estimate.numInflightEntries
   372  	nEntries := i.size.estimate.numTotalEntries()
   373  	return shouldFlush(
   374  		sep, valueLen, i.restartInterval, int(i.size.estimate.size()),
   375  		int(nEntries), targetBlockSize, sizeThreshold)
   376  }
   377  
   378  func (i *indexBlockBuf) add(key InternalKey, value []byte, inflightSize int) {
   379  	i.block.add(key, value)
   380  	size := i.block.estimatedSize()
   381  	if i.size.useMutex {
   382  		i.size.mu.Lock()
   383  		defer i.size.mu.Unlock()
   384  	}
   385  	// Since we're not compressing index entries when adding them to index blocks,
   386  	// we assume that the size of an entry written to the index block is equal to the
   387  	// size of the inflight entry, giving us a compression ratio of 1.
   388  	i.size.estimate.written(uint64(size), inflightSize, inflightSize)
   389  }
   390  
   391  func (i *indexBlockBuf) finish() []byte {
   392  	b := i.block.finish()
   393  	return b
   394  }
   395  
   396  func (i *indexBlockBuf) addInflight(inflightSize int) {
   397  	if i.size.useMutex {
   398  		i.size.mu.Lock()
   399  		defer i.size.mu.Unlock()
   400  	}
   401  	i.size.estimate.addInflight(inflightSize)
   402  }
   403  
   404  func (i *indexBlockBuf) estimatedSize() uint64 {
   405  	if i.size.useMutex {
   406  		i.size.mu.Lock()
   407  		defer i.size.mu.Unlock()
   408  	}
   409  
   410  	// Make sure that the size estimation works as expected when parallelism
   411  	// is disabled.
   412  	if invariants.Enabled && !i.size.useMutex {
   413  		if i.size.estimate.inflightSize != 0 {
   414  			panic("unexpected inflight entry in index block size estimation")
   415  		}
   416  
   417  		// NB: The i.block should only be accessed from the writeQueue goroutine,
   418  		// when parallelism is enabled. We break that invariant here, but that's
   419  		// okay since parallelism is disabled.
   420  		if i.size.estimate.size() != uint64(i.block.estimatedSize()) {
   421  			panic("index block size estimation sans parallelism is incorrect")
   422  		}
   423  	}
   424  	return i.size.estimate.size()
   425  }
   426  
   427  // dataBlockEstimates is used for sstable size estimation. It can be accessed by
   428  // the Writer client, writeQueue, and compressionQueue goroutines. Fields should
   429  // only be read/updated through the methods defined on the *dataBlockEstimates type.
   430  type dataBlockEstimates struct {
   431  	// If we don't do block compression or block writes in parallel, then we don't need
   432  	// to take the performance hit of synchronizing using this mutex.
   433  	useMutex bool
   434  	mu       sync.Mutex
   435  
   436  	estimate sizeEstimate
   437  }
   438  
   439  // newTotalSize is the new w.meta.Size. inflightSize is the uncompressed block size estimate which
   440  // was previously added to sizeEstimate.inflightSize. writtenSize is the compressed size of the block
   441  // which was written to disk.
   442  func (d *dataBlockEstimates) dataBlockWritten(
   443  	newTotalSize uint64, inflightSize int, writtenSize int,
   444  ) {
   445  	if d.useMutex {
   446  		d.mu.Lock()
   447  		defer d.mu.Unlock()
   448  	}
   449  
   450  	d.estimate.written(newTotalSize, inflightSize, writtenSize)
   451  }
   452  
   453  // size is an estimated size of the data block data which has been written to disk.
   454  func (d *dataBlockEstimates) size() uint64 {
   455  	if d.useMutex {
   456  		d.mu.Lock()
   457  		defer d.mu.Unlock()
   458  	}
   459  
   460  	// Use invariants to make sure that the size estimation works as expected
   461  	// when parallelism is disabled.
   462  	if invariants.Enabled && !d.useMutex {
   463  		if d.estimate.inflightSize != 0 {
   464  			panic("unexpected inflight entry in data block size estimation")
   465  		}
   466  	}
   467  
   468  	return d.estimate.size()
   469  }
   470  
   471  func (d *dataBlockEstimates) addInflightDataBlock(size int) {
   472  	if d.useMutex {
   473  		d.mu.Lock()
   474  		defer d.mu.Unlock()
   475  	}
   476  
   477  	d.estimate.addInflight(size)
   478  }
   479  
   480  var writeTaskPool = sync.Pool{
   481  	New: func() interface{} {
   482  		t := &writeTask{}
   483  		t.compressionDone = make(chan bool, 1)
   484  		return t
   485  	},
   486  }
   487  
   488  type checksummer struct {
   489  	checksumType ChecksumType
   490  	xxHasher     *xxhash.Digest
   491  }
   492  
   493  func (c *checksummer) checksum(block []byte, blockType []byte) (checksum uint32) {
   494  	// Calculate the checksum.
   495  	switch c.checksumType {
   496  	case ChecksumTypeCRC32c:
   497  		checksum = crc.New(block).Update(blockType).Value()
   498  	case ChecksumTypeXXHash64:
   499  		if c.xxHasher == nil {
   500  			c.xxHasher = xxhash.New()
   501  		} else {
   502  			c.xxHasher.Reset()
   503  		}
   504  		c.xxHasher.Write(block)
   505  		c.xxHasher.Write(blockType)
   506  		checksum = uint32(c.xxHasher.Sum64())
   507  	default:
   508  		panic(errors.Newf("unsupported checksum type: %d", c.checksumType))
   509  	}
   510  	return checksum
   511  }
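
// An illustrative sketch of how this helper is typically consumed when a block
// trailer is assembled (blockPayload, blockType and trailer are hypothetical
// names; the layout mirrors the 1-byte type + 4-byte checksum block trailer):
//
//	c := checksummer{checksumType: ChecksumTypeCRC32c}
//	trailer[0] = blockType
//	sum := c.checksum(blockPayload, trailer[:1])
//	binary.LittleEndian.PutUint32(trailer[1:5], sum)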
   512  
   513  type blockBuf struct {
   514  	// tmp is a scratch buffer, large enough to hold footerLen bytes,
   515  	// blockTrailerLen bytes, or (5 * binary.MaxVarintLen64) bytes, and most
   516  	// likely large enough for a block handle with properties.
   517  	tmp [blockHandleLikelyMaxLen]byte
   518  	// compressedBuf is the destination buffer for compression. It is re-used over the
   519  	// lifetime of the blockBuf, avoiding the allocation of a temporary buffer for each block.
   520  	compressedBuf []byte
   521  	checksummer   checksummer
   522  }
   523  
   524  func (b *blockBuf) clear() {
   525  	// We can't assign b.compressedBuf[:0] to compressedBuf because snappy relies
   526  	// on the length of the buffer, and not the capacity to determine if it needs
   527  	// to make an allocation.
   528  	*b = blockBuf{
   529  		compressedBuf: b.compressedBuf, checksummer: b.checksummer,
   530  	}
   531  }
   532  
   533  // A dataBlockBuf holds all the state required to compress and write a data block to disk.
   534  // A dataBlockBuf begins its lifecycle owned by the Writer client goroutine. The Writer
   535  // client goroutine adds keys to the sstable, writing directly into a dataBlockBuf's blockWriter
   536  // until the block is full. Once a dataBlockBuf's block is full, the dataBlockBuf may be passed
   537  // to other goroutines for compression and file I/O.
   538  type dataBlockBuf struct {
   539  	blockBuf
   540  	dataBlock blockWriter
   541  
   542  	// uncompressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the
   543  	// next byte slice to be compressed. The uncompressed byte slice will be backed by the
   544  	// dataBlock.buf.
   545  	uncompressed []byte
   546  	// compressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the
   547  	// compressed byte slice which must be written to disk. The compressed byte slice may be
   548  	// backed by the dataBlock.buf, or the dataBlockBuf.compressedBuf, depending on whether
   549  	// we use the result of the compression.
   550  	compressed []byte
   551  
   552  	// We're making calls to BlockPropertyCollectors from the Writer client goroutine. We need to
   553  	// pass the encoded block properties over to the write queue. To prevent copies and allocations,
   554  	// we give each dataBlockBuf a blockPropertiesEncoder.
   555  	blockPropsEncoder blockPropertiesEncoder
   556  	// dataBlockProps is set when Writer.finishDataBlockProps is called. The dataBlockProps slice is
   557  	// a shallow copy of the internal buffer of the dataBlockBuf.blockPropsEncoder.
   558  	dataBlockProps []byte
   559  
   560  	// sepScratch is reusable scratch space for computing separator keys.
   561  	sepScratch []byte
   562  }
   563  
   564  func (d *dataBlockBuf) clear() {
   565  	d.blockBuf.clear()
   566  	d.dataBlock.clear()
   567  
   568  	d.uncompressed = nil
   569  	d.compressed = nil
   570  	d.dataBlockProps = nil
   571  	d.sepScratch = d.sepScratch[:0]
   572  }
   573  
   574  var dataBlockBufPool = sync.Pool{
   575  	New: func() interface{} {
   576  		return &dataBlockBuf{}
   577  	},
   578  }
   579  
   580  func newDataBlockBuf(restartInterval int, checksumType ChecksumType) *dataBlockBuf {
   581  	d := dataBlockBufPool.Get().(*dataBlockBuf)
   582  	d.dataBlock.restartInterval = restartInterval
   583  	d.checksummer.checksumType = checksumType
   584  	return d
   585  }
   586  
   587  func (d *dataBlockBuf) finish() {
   588  	d.uncompressed = d.dataBlock.finish()
   589  }
   590  
   591  func (d *dataBlockBuf) compressAndChecksum(c Compression) {
   592  	d.compressed = compressAndChecksum(d.uncompressed, c, &d.blockBuf)
   593  }
   594  
   595  func (d *dataBlockBuf) shouldFlush(
   596  	key InternalKey, valueLen, targetBlockSize, sizeThreshold int,
   597  ) bool {
   598  	return shouldFlush(
   599  		key, valueLen, d.dataBlock.restartInterval, d.dataBlock.estimatedSize(),
   600  		d.dataBlock.nEntries, targetBlockSize, sizeThreshold)
   601  }
   602  
   603  type indexBlockAndBlockProperties struct {
   604  	nEntries int
   605  	// sep is the last key added to this block, for computing a separator later.
   606  	sep        InternalKey
   607  	properties []byte
   608  	// block is the encoded block produced by blockWriter.finish.
   609  	block []byte
   610  }
   611  
   612  // Set sets the value for the given key. The sequence number is set to 0.
   613  // Intended for use to externally construct an sstable before ingestion into a
   614  // DB. For a given Writer, the keys passed to Set must be in strictly increasing
   615  // order.
   616  //
   617  // TODO(peter): untested
   618  func (w *Writer) Set(key, value []byte) error {
   619  	if w.err != nil {
   620  		return w.err
   621  	}
   622  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value)
   623  }
   624  
   625  // Delete deletes the value for the given key. The sequence number is set to
   626  // 0. Intended for use to externally construct an sstable before ingestion into
   627  // a DB.
   628  //
   629  // TODO(peter): untested
   630  func (w *Writer) Delete(key []byte) error {
   631  	if w.err != nil {
   632  		return w.err
   633  	}
   634  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil)
   635  }
   636  
   637  // DeleteRange deletes all of the keys (and values) in the range [start,end)
   638  // (inclusive on start, exclusive on end). The sequence number is set to
   639  // 0. Intended for use to externally construct an sstable before ingestion into
   640  // a DB.
   641  //
   642  // TODO(peter): untested
   643  func (w *Writer) DeleteRange(start, end []byte) error {
   644  	if w.err != nil {
   645  		return w.err
   646  	}
   647  	return w.addTombstone(base.MakeInternalKey(start, 0, InternalKeyKindRangeDelete), end)
   648  }
   649  
   650  // Merge adds an action to the DB that merges the value at key with the new
   651  // value. The details of the merge are dependent upon the configured merge
   652  // operator. The sequence number is set to 0. Intended for use to externally
   653  // construct an sstable before ingestion into a DB.
   654  //
   655  // TODO(peter): untested
   656  func (w *Writer) Merge(key, value []byte) error {
   657  	if w.err != nil {
   658  		return w.err
   659  	}
   660  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value)
   661  }
   662  
   663  // Add adds a key/value pair to the table being written. For a given Writer,
   664  // the keys passed to Add must be in increasing order. The exception to this
   665  // rule is range deletion tombstones. Range deletion tombstones need to be
   666  // added ordered by their start key, but they can be added out of order from
   667  // point entries. Additionally, range deletion tombstones must be fragmented
   668  // (i.e. by keyspan.Fragmenter).
   669  func (w *Writer) Add(key InternalKey, value []byte) error {
   670  	if w.err != nil {
   671  		return w.err
   672  	}
   673  
   674  	switch key.Kind() {
   675  	case InternalKeyKindRangeDelete:
   676  		return w.addTombstone(key, value)
   677  	case base.InternalKeyKindRangeKeyDelete,
   678  		base.InternalKeyKindRangeKeySet,
   679  		base.InternalKeyKindRangeKeyUnset:
   680  		w.err = errors.Errorf(
   681  			"bitalostable: range keys must be added via one of the RangeKey* functions")
   682  		return w.err
   683  	}
   684  	return w.addPoint(key, value)
   685  }
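
// An illustrative usage sketch for the point-key helpers above (the NewWriter
// call assumes this package keeps upstream Pebble's writeCloseSyncer +
// WriterOptions signature; keys and values are arbitrary):
//
//	w := NewWriter(f, WriterOptions{})
//	_ = w.Set([]byte("a"), []byte("1"))         // point keys in strictly
//	_ = w.Set([]byte("b"), []byte("2"))         // increasing order
//	_ = w.DeleteRange([]byte("c"), []byte("d")) // rangedels ordered by start key
//	err := w.Close()                            // flushes the remaining blocks and the footer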
   686  
   687  func (w *Writer) addPoint(key InternalKey, value []byte) error {
   688  	if !w.disableKeyOrderChecks && w.dataBlockBuf.dataBlock.nEntries >= 1 {
   689  		// curKey is guaranteed to be the last point key which was added to the Writer.
   690  		// Inlining base.DecodeInternalKey yields a 2-3% improvement in the BenchmarkWriter
   691  		// benchmark.
   692  		encodedKey := w.dataBlockBuf.dataBlock.curKey
   693  		n := len(encodedKey) - base.InternalTrailerLen
   694  		var trailer uint64
   695  		if n >= 0 {
   696  			trailer = binary.LittleEndian.Uint64(encodedKey[n:])
   697  			encodedKey = encodedKey[:n:n]
   698  		} else {
   699  			trailer = uint64(InternalKeyKindInvalid)
   700  			encodedKey = nil
   701  		}
   702  		largestPointKey := InternalKey{
   703  			UserKey: encodedKey,
   704  			Trailer: trailer,
   705  		}
   706  
   707  		if largestPointKey.UserKey != nil {
   708  			// TODO(peter): Manually inlined version of base.InternalCompare(). This is
   709  			// 3.5% faster on BenchmarkWriter on go1.13. Remove if go1.14 or future
   710  			// versions show this to not be a performance win.
   711  			x := w.compare(largestPointKey.UserKey, key.UserKey)
   712  			if x > 0 || (x == 0 && largestPointKey.Trailer <= key.Trailer) {
   713  				w.err = errors.Errorf("bitalostable: keys must be added in strictly increasing order: %s, %s",
   714  					largestPointKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
   715  				return w.err
   716  			}
   717  		}
   718  	}
   719  
   720  	if err := w.maybeFlush(key, value); err != nil {
   721  		return err
   722  	}
   723  
   724  	for i := range w.propCollectors {
   725  		if err := w.propCollectors[i].Add(key, value); err != nil {
   726  			w.err = err
   727  			return err
   728  		}
   729  	}
   730  	for i := range w.blockPropCollectors {
   731  		if err := w.blockPropCollectors[i].Add(key, value); err != nil {
   732  			w.err = err
   733  			return err
   734  		}
   735  	}
   736  
   737  	w.maybeAddToFilter(key.UserKey)
   738  	w.dataBlockBuf.dataBlock.add(key, value)
   739  
   740  	w.meta.updateSeqNum(key.SeqNum())
   741  
   742  	if !w.meta.HasPointKeys {
   743  		k := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey)
   744  		// NB: We need to ensure that SmallestPoint.UserKey is set, so we create
   745  		// an InternalKey which is semantically identical to the key, but won't
   746  		// have a nil UserKey. We do this, because key.UserKey could be nil, and
   747  		// have a nil UserKey. We do this because key.UserKey could be nil, and
   748  		//
   749  		// todo(bananabrick): Determine if it's okay to have a nil SmallestPoint
   750  		// .UserKey now that we don't rely on a nil UserKey to determine if the
   751  		// key has been set or not.
   752  		w.meta.SetSmallestPointKey(k.Clone())
   753  	}
   754  
   755  	w.props.NumEntries++
   756  	switch key.Kind() {
   757  	case InternalKeyKindDelete:
   758  		w.props.NumDeletions++
   759  	case InternalKeyKindMerge:
   760  		w.props.NumMergeOperands++
   761  	}
   762  	w.props.RawKeySize += uint64(key.Size())
   763  	w.props.RawValueSize += uint64(len(value))
   764  	return nil
   765  }
   766  
   767  func (w *Writer) prettyTombstone(k InternalKey, value []byte) fmt.Formatter {
   768  	return keyspan.Span{
   769  		Start: k.UserKey,
   770  		End:   value,
   771  		Keys:  []keyspan.Key{{Trailer: k.Trailer}},
   772  	}.Pretty(w.formatKey)
   773  }
   774  
   775  func (w *Writer) addTombstone(key InternalKey, value []byte) error {
   776  	if !w.disableKeyOrderChecks && !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 {
   777  		// Check that tombstones are being added in fragmented order. If the two
   778  		// tombstones overlap, their start and end keys must be identical.
   779  		prevKey := base.DecodeInternalKey(w.rangeDelBlock.curKey)
   780  		switch c := w.compare(prevKey.UserKey, key.UserKey); {
   781  		case c > 0:
   782  			w.err = errors.Errorf("bitalostable: keys must be added in order: %s, %s",
   783  				prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
   784  			return w.err
   785  		case c == 0:
   786  			prevValue := w.rangeDelBlock.curValue
   787  			if w.compare(prevValue, value) != 0 {
   788  				w.err = errors.Errorf("bitalostable: overlapping tombstones must be fragmented: %s vs %s",
   789  					w.prettyTombstone(prevKey, prevValue),
   790  					w.prettyTombstone(key, value))
   791  				return w.err
   792  			}
   793  			if prevKey.SeqNum() <= key.SeqNum() {
   794  				w.err = errors.Errorf("bitalostable: keys must be added in strictly increasing order: %s, %s",
   795  					prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
   796  				return w.err
   797  			}
   798  		default:
   799  			prevValue := w.rangeDelBlock.curValue
   800  			if w.compare(prevValue, key.UserKey) > 0 {
   801  				w.err = errors.Errorf("bitalostable: overlapping tombstones must be fragmented: %s vs %s",
   802  					w.prettyTombstone(prevKey, prevValue),
   803  					w.prettyTombstone(key, value))
   804  				return w.err
   805  			}
   806  		}
   807  	}
   808  
   809  	if key.Trailer == InternalKeyRangeDeleteSentinel {
   810  		w.err = errors.Errorf("bitalostable: cannot add range delete sentinel: %s", key.Pretty(w.formatKey))
   811  		return w.err
   812  	}
   813  
   814  	for i := range w.propCollectors {
   815  		if err := w.propCollectors[i].Add(key, value); err != nil {
   816  			w.err = err
   817  			return err
   818  		}
   819  	}
   820  
   821  	w.meta.updateSeqNum(key.SeqNum())
   822  
   823  	switch {
   824  	case w.rangeDelV1Format:
   825  		// Range tombstones are not fragmented in the v1 (i.e. RocksDB) range
   826  		// deletion block format, so we need to track the largest range tombstone
   827  		// end key as every range tombstone is added.
   828  		//
   829  		// Note that writing the v1 format is only supported for tests.
   830  		if w.props.NumRangeDeletions == 0 {
   831  			w.meta.SetSmallestRangeDelKey(key.Clone())
   832  			w.meta.SetLargestRangeDelKey(base.MakeRangeDeleteSentinelKey(value).Clone())
   833  		} else {
   834  			if base.InternalCompare(w.compare, w.meta.SmallestRangeDel, key) > 0 {
   835  				w.meta.SetSmallestRangeDelKey(key.Clone())
   836  			}
   837  			end := base.MakeRangeDeleteSentinelKey(value)
   838  			if base.InternalCompare(w.compare, w.meta.LargestRangeDel, end) < 0 {
   839  				w.meta.SetLargestRangeDelKey(end.Clone())
   840  			}
   841  		}
   842  
   843  	default:
   844  		// Range tombstones are fragmented in the v2 range deletion block format,
   845  		// so the start key of the first range tombstone added will be the smallest
   846  		// range tombstone key. The largest range tombstone key will be determined
   847  		// in Writer.Close() as the end key of the last range tombstone added.
   848  		if w.props.NumRangeDeletions == 0 {
   849  			w.meta.SetSmallestRangeDelKey(key.Clone())
   850  		}
   851  	}
   852  
   853  	w.props.NumEntries++
   854  	w.props.NumDeletions++
   855  	w.props.NumRangeDeletions++
   856  	w.props.RawKeySize += uint64(key.Size())
   857  	w.props.RawValueSize += uint64(len(value))
   858  	w.rangeDelBlock.add(key, value)
   859  	return nil
   860  }
   861  
   862  // RangeKeySet sets a range between start (inclusive) and end (exclusive) with
   863  // the given suffix to the given value.
   864  //
   865  // Keys must be added to the table in increasing order of start key. Spans are
   866  // not required to be fragmented.
   867  func (w *Writer) RangeKeySet(start, end, suffix, value []byte) error {
   868  	return w.addRangeKeySpan(keyspan.Span{
   869  		Start: w.tempRangeKeyCopy(start),
   870  		End:   w.tempRangeKeyCopy(end),
   871  		Keys: []keyspan.Key{
   872  			{
   873  				Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeySet),
   874  				Suffix:  w.tempRangeKeyCopy(suffix),
   875  				Value:   w.tempRangeKeyCopy(value),
   876  			},
   877  		},
   878  	})
   879  }
   880  
   881  // RangeKeyUnset un-sets a range between start (inclusive) and end (exclusive)
   882  // with the given suffix.
   883  //
   884  // Keys must be added to the table in increasing order of start key. Spans are
   885  // not required to be fragmented.
   886  func (w *Writer) RangeKeyUnset(start, end, suffix []byte) error {
   887  	return w.addRangeKeySpan(keyspan.Span{
   888  		Start: w.tempRangeKeyCopy(start),
   889  		End:   w.tempRangeKeyCopy(end),
   890  		Keys: []keyspan.Key{
   891  			{
   892  				Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyUnset),
   893  				Suffix:  w.tempRangeKeyCopy(suffix),
   894  			},
   895  		},
   896  	})
   897  }
   898  
   899  // RangeKeyDelete deletes a range between start (inclusive) and end (exclusive).
   900  //
   901  // Keys must be added to the table in increasing order of start key. Spans are
   902  // not required to be fragmented.
   903  func (w *Writer) RangeKeyDelete(start, end []byte) error {
   904  	return w.addRangeKeySpan(keyspan.Span{
   905  		Start: w.tempRangeKeyCopy(start),
   906  		End:   w.tempRangeKeyCopy(end),
   907  		Keys: []keyspan.Key{
   908  			{Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyDelete)},
   909  		},
   910  	})
   911  }
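
// An illustrative usage sketch for the RangeKey* helpers above (the "@5"/"@7"
// suffixes and the values are hypothetical): spans only need to be ordered by
// start key, and the fragmenter splits any overlap before the block is written.
//
//	_ = w.RangeKeySet([]byte("a"), []byte("m"), []byte("@5"), []byte("v1"))
//	_ = w.RangeKeySet([]byte("c"), []byte("z"), []byte("@7"), []byte("v2"))
//
// On Close these emerge as the fragments [a,c), [c,m) and [m,z), with the
// middle fragment carrying both suffixed keys.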
   912  
   913  // AddRangeKey adds a range key set, unset, or delete key/value pair to the
   914  // table being written.
   915  //
   916  // Range keys must be supplied in strictly ascending order of start key (i.e.
   917  // user key ascending, sequence number descending, and key type descending).
   918  // Ranges added must also be supplied in fragmented span order - i.e. other than
   919  // spans that are perfectly aligned (same start and end keys), spans may not
   920  // overlap. Range keys may be added out of order relative to point keys and
   921  // range deletions.
   922  func (w *Writer) AddRangeKey(key InternalKey, value []byte) error {
   923  	if w.err != nil {
   924  		return w.err
   925  	}
   926  	return w.addRangeKey(key, value)
   927  }
   928  
   929  func (w *Writer) addRangeKeySpan(span keyspan.Span) error {
   930  	if w.fragmenter.Start() != nil && w.compare(w.fragmenter.Start(), span.Start) > 0 {
   931  		return errors.Errorf("bitalostable: spans must be added in order: %s > %s",
   932  			w.formatKey(w.fragmenter.Start()), w.formatKey(span.Start))
   933  	}
   934  	// Add this span to the fragmenter.
   935  	w.fragmenter.Add(span)
   936  	return w.err
   937  }
   938  
   939  func (w *Writer) coalesceSpans(span keyspan.Span) {
   940  	// This method is the emit function of the Fragmenter, so span.Keys is only
   941  	// owned by this span and it's safe to mutate.
   942  	w.rangeKeyCoalesced.Start = span.Start
   943  	w.rangeKeyCoalesced.End = span.End
   944  	err := rangekey.Coalesce(w.compare, span.Keys, &w.rangeKeyCoalesced.Keys)
   945  	if err != nil {
   946  		w.err = errors.Newf("sstable: could not coalesce span: %s", err)
   947  		return
   948  	}
   949  
   950  	// NB: The span only contains range keys and is internally consistent (eg,
   951  	// no duplicate suffixes, no additional keys after a RANGEKEYDEL).
   952  	w.err = firstError(w.err, w.rangeKeyEncoder.Encode(&w.rangeKeyCoalesced))
   953  }
   954  
   955  func (w *Writer) addRangeKey(key InternalKey, value []byte) error {
   956  	if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 {
   957  		prevStartKey := base.DecodeInternalKey(w.rangeKeyBlock.curKey)
   958  		prevEndKey, _, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue)
   959  		if !ok {
   960  			// We panic here as we should have previously decoded and validated this
   961  			// key and value when it was first added to the range key block.
   962  			panic(errors.Errorf("bitalostable: invalid end key for span: %s",
   963  				prevStartKey.Pretty(w.formatKey)))
   964  		}
   965  
   966  		curStartKey := key
   967  		curEndKey, _, ok := rangekey.DecodeEndKey(curStartKey.Kind(), value)
   968  		if !ok {
   969  			w.err = errors.Errorf("bitalostable: invalid end key for span: %s",
   970  				curStartKey.Pretty(w.formatKey))
   971  			return w.err
   972  		}
   973  
   974  		// Start keys must be strictly increasing.
   975  		if base.InternalCompare(w.compare, prevStartKey, curStartKey) >= 0 {
   976  			w.err = errors.Errorf(
   977  				"bitalostable: range key starts must be added in increasing order: %s, %s",
   978  				prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey))
   979  			return w.err
   980  		}
   981  
   982  		// Start keys are increasing. If the start user keys are equal, the
   983  		// end keys must be equal (i.e. aligned spans).
   984  		if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 {
   985  			if w.compare(prevEndKey, curEndKey) != 0 {
   986  				w.err = errors.Errorf("bitalostable: overlapping range keys must be fragmented: %s, %s",
   987  					prevStartKey.Pretty(w.formatKey),
   988  					curStartKey.Pretty(w.formatKey))
   989  				return w.err
   990  			}
   991  		} else if w.compare(prevEndKey, curStartKey.UserKey) > 0 {
   992  			// If the start user keys are NOT equal, the spans must be disjoint (i.e.
   993  			// no overlap).
   994  			// NOTE: the inequality excludes zero, as we allow the end key of the
   995  			// lower span to be the same as the start key of the upper span, because
   996  			// the range end key is considered an exclusive bound.
   997  			w.err = errors.Errorf("bitalostable: overlapping range keys must be fragmented: %s, %s",
   998  				prevStartKey.Pretty(w.formatKey),
   999  				curStartKey.Pretty(w.formatKey))
  1000  			return w.err
  1001  		}
  1002  	}
  1003  
  1004  	// TODO(travers): Add an invariant-gated check to ensure that suffix-values
  1005  	// are sorted within coalesced spans.
  1006  
  1007  	// Range-keys and point-keys are intended to live in "parallel" keyspaces.
  1008  	// However, we track a single seqnum in the table metadata that spans both of
  1009  	// these keyspaces.
  1010  	// TODO(travers): Consider tracking range key seqnums separately.
  1011  	w.meta.updateSeqNum(key.SeqNum())
  1012  
  1013  	// Range keys are fragmented, so the start key of the first range key
  1014  	// added will be the smallest. The largest range key is determined in
  1015  	// Writer.Close() as the end key of the last range key added to the block.
  1016  	if w.props.NumRangeKeys() == 0 {
  1017  		w.meta.SetSmallestRangeKey(key.Clone())
  1018  	}
  1019  
  1020  	// Update block properties.
  1021  	w.props.RawRangeKeyKeySize += uint64(key.Size())
  1022  	w.props.RawRangeKeyValueSize += uint64(len(value))
  1023  	switch key.Kind() {
  1024  	case base.InternalKeyKindRangeKeyDelete:
  1025  		w.props.NumRangeKeyDels++
  1026  	case base.InternalKeyKindRangeKeySet:
  1027  		w.props.NumRangeKeySets++
  1028  	case base.InternalKeyKindRangeKeyUnset:
  1029  		w.props.NumRangeKeyUnsets++
  1030  	default:
  1031  		panic(errors.Errorf("bitalostable: invalid range key type: %s", key.Kind()))
  1032  	}
  1033  
  1034  	for i := range w.blockPropCollectors {
  1035  		if err := w.blockPropCollectors[i].Add(key, value); err != nil {
  1036  			return err
  1037  		}
  1038  	}
  1039  
  1040  	// Add the key to the block.
  1041  	w.rangeKeyBlock.add(key, value)
  1042  	return nil
  1043  }
  1044  
  1045  // tempRangeKeyBuf returns a slice of length n from the Writer's rkBuf byte
  1046  // slice. Any byte written to the returned slice is retained for the lifetime of
  1047  // the Writer.
  1048  func (w *Writer) tempRangeKeyBuf(n int) []byte {
  1049  	if cap(w.rkBuf)-len(w.rkBuf) < n {
  1050  		size := len(w.rkBuf) + 2*n
  1051  		if size < 2*cap(w.rkBuf) {
  1052  			size = 2 * cap(w.rkBuf)
  1053  		}
  1054  		buf := make([]byte, len(w.rkBuf), size)
  1055  		copy(buf, w.rkBuf)
  1056  		w.rkBuf = buf
  1057  	}
  1058  	b := w.rkBuf[len(w.rkBuf) : len(w.rkBuf)+n]
  1059  	w.rkBuf = w.rkBuf[:len(w.rkBuf)+n]
  1060  	return b
  1061  }
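
// A worked example of the growth policy above (sizes hypothetical): with
// len(w.rkBuf)=10, cap(w.rkBuf)=16 and a request for n=20 bytes, the spare
// capacity (6) is insufficient, so a new buffer is allocated with
//
//	size = max(len+2*n, 2*cap) = max(10+40, 32) = 50
//
// the existing 10 bytes are copied over, and the returned slice aliases
// w.rkBuf[10:30].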
  1062  
  1063  // tempRangeKeyCopy returns a copy of the provided slice, stored in the Writer's
  1064  // range key buffer.
  1065  func (w *Writer) tempRangeKeyCopy(k []byte) []byte {
  1066  	if len(k) == 0 {
  1067  		return nil
  1068  	}
  1069  	buf := w.tempRangeKeyBuf(len(k))
  1070  	copy(buf, k)
  1071  	return buf
  1072  }
  1073  
  1074  func (w *Writer) maybeAddToFilter(key []byte) {
  1075  	if w.filter != nil {
  1076  		if w.split != nil {
  1077  			prefix := key[:w.split(key)]
  1078  			w.filter.addKey(prefix)
  1079  		} else {
  1080  			w.filter.addKey(key)
  1081  		}
  1082  	}
  1083  }
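
// An illustrative example (the key format and the Split behavior are
// hypothetical): with a prefix extractor that returns 4 for the key
// "user1234@ts9", only the prefix reaches the filter,
//
//	prefix := key[:w.split(key)] // "user1234@ts9"[:4] == "user"
//
// so the bloom filter stores "user" and remains usable for prefix seeks.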
  1084  
  1085  func (w *Writer) flush(key InternalKey) error {
  1086  	estimatedUncompressedSize := w.dataBlockBuf.dataBlock.estimatedSize()
  1087  	w.coordination.sizeEstimate.addInflightDataBlock(estimatedUncompressedSize)
  1088  
  1089  	var err error
  1090  
  1091  	// We're finishing a data block.
  1092  	err = w.finishDataBlockProps(w.dataBlockBuf)
  1093  	if err != nil {
  1094  		return err
  1095  	}
  1096  
  1097  	w.dataBlockBuf.finish()
  1098  	w.dataBlockBuf.compressAndChecksum(w.compression)
  1099  
  1100  	// Determine if the index block should be flushed. Since we're accessing the
  1101  	// dataBlockBuf.dataBlock.curKey here, we have to make sure that once we start
  1102  	// to pool the dataBlockBufs, the curKey isn't used by the Writer once the
  1103  	// dataBlockBuf is added back to a sync.Pool. In this particular case, the
  1104  	// byte slice which supports "sep" will eventually be copied when "sep" is
  1105  	// added to the index block.
  1106  	prevKey := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey)
  1107  	sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf)
  1108  	// We determine that we should flush an index block from the Writer client
  1109  	// goroutine, but we actually finish the index block from the writeQueue.
  1110  	// When we determine that an index block should be flushed, we need to call
  1111  	// BlockPropertyCollector.FinishIndexBlock. But block property collector
  1112  	// calls must happen sequentially from the Writer client. Therefore, we need
  1113  	// to determine that we are going to flush the index block from the Writer
  1114  	// client.
  1115  	shouldFlushIndexBlock := supportsTwoLevelIndex(w.tableFormat) && w.indexBlock.shouldFlush(
  1116  		sep, encodedBHPEstimatedSize, w.indexBlockSize, w.indexBlockSizeThreshold,
  1117  	)
  1118  
  1119  	var indexProps []byte
  1120  	var flushableIndexBlock *indexBlockBuf
  1121  	if shouldFlushIndexBlock {
  1122  		flushableIndexBlock = w.indexBlock
  1123  		w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled)
  1124  		// Call BlockPropertyCollector.FinishIndexBlock, since we've decided to
  1125  		// flush the index block.
  1126  		indexProps, err = w.finishIndexBlockProps()
  1127  		if err != nil {
  1128  			return err
  1129  		}
  1130  	}
  1131  
  1132  	// We've called BlockPropertyCollector.FinishDataBlock, and, if necessary,
  1133  	// BlockPropertyCollector.FinishIndexBlock. Since we've decided to finish
  1134  	// the data block, we can call
  1135  	// BlockPropertyCollector.AddPrevDataBlockToIndexBlock.
  1136  	w.addPrevDataBlockToIndexBlockProps()
  1137  
  1138  	// Schedule a write.
  1139  	writeTask := writeTaskPool.Get().(*writeTask)
  1140  	// We're setting compressionDone to indicate that compression of this block
  1141  	// has already been completed.
  1142  	writeTask.compressionDone <- true
  1143  	writeTask.buf = w.dataBlockBuf
  1144  	writeTask.indexEntrySep = sep
  1145  	writeTask.inflightSize = estimatedUncompressedSize
  1146  	writeTask.currIndexBlock = w.indexBlock
  1147  	writeTask.indexInflightSize = sep.Size() + encodedBHPEstimatedSize
  1148  	writeTask.finishedIndexProps = indexProps
  1149  	writeTask.flushableIndexBlock = flushableIndexBlock
  1150  
  1151  	// The writeTask corresponds to an unwritten index entry.
  1152  	w.indexBlock.addInflight(writeTask.indexInflightSize)
  1153  
  1154  	w.dataBlockBuf = nil
  1155  	if w.coordination.parallelismEnabled {
  1156  		w.coordination.writeQueue.add(writeTask)
  1157  	} else {
  1158  		err = w.coordination.writeQueue.addSync(writeTask)
  1159  	}
  1160  	w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType)
  1161  
  1162  	return err
  1163  }
  1164  
  1165  func (w *Writer) maybeFlush(key InternalKey, value []byte) error {
  1166  	if !w.dataBlockBuf.shouldFlush(key, len(value), w.blockSize, w.blockSizeThreshold) {
  1167  		return nil
  1168  	}
  1169  
  1170  	err := w.flush(key)
  1171  
  1172  	if err != nil {
  1173  		w.err = err
  1174  		return err
  1175  	}
  1176  
  1177  	return nil
  1178  }
  1179  
  1180  // dataBlockBuf.dataBlockProps set by this method must be encoded before any future use of the
  1181  // dataBlockBuf.blockPropsEncoder, since the properties slice will get reused by the
  1182  // blockPropsEncoder.
  1183  func (w *Writer) finishDataBlockProps(buf *dataBlockBuf) error {
  1184  	if len(w.blockPropCollectors) == 0 {
  1185  		return nil
  1186  	}
  1187  	var err error
  1188  	buf.blockPropsEncoder.resetProps()
  1189  	for i := range w.blockPropCollectors {
  1190  		scratch := buf.blockPropsEncoder.getScratchForProp()
  1191  		if scratch, err = w.blockPropCollectors[i].FinishDataBlock(scratch); err != nil {
  1192  			return err
  1193  		}
  1194  		if len(scratch) > 0 {
  1195  			buf.blockPropsEncoder.addProp(shortID(i), scratch)
  1196  		}
  1197  	}
  1198  
  1199  	buf.dataBlockProps = buf.blockPropsEncoder.unsafeProps()
  1200  	return nil
  1201  }
  1202  
  1203  // The BlockHandleWithProperties returned by this method must be encoded before any future use of
  1204  // the Writer.blockPropsEncoder, since the properties slice will get reused by the blockPropsEncoder.
  1205  // maybeAddBlockPropertiesToBlockHandle should only be called if the block is being written synchronously
  1206  // with the Writer client.
  1207  func (w *Writer) maybeAddBlockPropertiesToBlockHandle(
  1208  	bh BlockHandle,
  1209  ) (BlockHandleWithProperties, error) {
  1210  	err := w.finishDataBlockProps(w.dataBlockBuf)
  1211  	if err != nil {
  1212  		return BlockHandleWithProperties{}, err
  1213  	}
  1214  	return BlockHandleWithProperties{BlockHandle: bh, Props: w.dataBlockBuf.dataBlockProps}, nil
  1215  }
  1216  
  1217  func (w *Writer) indexEntrySep(prevKey, key InternalKey, dataBlockBuf *dataBlockBuf) InternalKey {
  1218  	// Make a rough guess that we want key-sized scratch to compute the separator.
  1219  	if cap(dataBlockBuf.sepScratch) < key.Size() {
  1220  		dataBlockBuf.sepScratch = make([]byte, 0, key.Size()*2)
  1221  	}
  1222  
  1223  	var sep InternalKey
  1224  	if key.UserKey == nil && key.Trailer == 0 {
  1225  		sep = prevKey.Successor(w.compare, w.successor, dataBlockBuf.sepScratch[:0])
  1226  	} else {
  1227  		sep = prevKey.Separator(w.compare, w.separator, dataBlockBuf.sepScratch[:0], key)
  1228  	}
  1229  	return sep
  1230  }
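
// An illustrative example, assuming the default bytewise comparer (keys are
// hypothetical): for prevKey.UserKey = "blue" and key.UserKey = "green",
//
//	sep := prevKey.Separator(...) // may shorten to "c": "blue" <= "c" < "green"
//
// while for the final index entry (key is the zero value), Successor yields a
// short key >= "blue", e.g. "c". Shorter separators keep index blocks small
// without changing which data block a lookup lands in.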
  1231  
  1232  // addIndexEntry adds an index entry for the specified key and block handle.
  1233  // addIndexEntry can be called from both the Writer client goroutine, and the
  1234  // writeQueue goroutine. If flushIndexBuf != nil, indexProps must be passed in
  1235  // too, as they're used when the index block is finished.
  1236  //
  1237  // Invariant:
  1238  //  1. addIndexEntry must not store references to the sep InternalKey, the tmp
  1239  //     byte slice, or bhp.Props. That is, these must be either deep copied or
  1240  //     encoded.
  1241  //  2. addIndexEntry must not hold references to the flushIndexBuf, and the writeTo
  1242  //     indexBlockBufs.
  1243  func (w *Writer) addIndexEntry(
  1244  	sep InternalKey,
  1245  	bhp BlockHandleWithProperties,
  1246  	tmp []byte,
  1247  	flushIndexBuf *indexBlockBuf,
  1248  	writeTo *indexBlockBuf,
  1249  	inflightSize int,
  1250  	indexProps []byte,
  1251  ) error {
  1252  	if bhp.Length == 0 {
  1253  		// A valid blockHandle must be non-zero.
  1254  		// In particular, it must have a non-zero length.
  1255  		return nil
  1256  	}
  1257  
  1258  	encoded := encodeBlockHandleWithProperties(tmp, bhp)
  1259  
  1260  	if flushIndexBuf != nil {
  1261  		if cap(w.indexPartitions) == 0 {
  1262  			w.indexPartitions = make([]indexBlockAndBlockProperties, 0, 32)
  1263  		}
  1264  		// Enable two level indexes if there is more than one index block.
  1265  		w.twoLevelIndex = true
  1266  		if err := w.finishIndexBlock(flushIndexBuf, indexProps); err != nil {
  1267  			return err
  1268  		}
  1269  	}
  1270  
  1271  	writeTo.add(sep, encoded, inflightSize)
  1272  	return nil
  1273  }
  1274  
  1275  func (w *Writer) addPrevDataBlockToIndexBlockProps() {
  1276  	for i := range w.blockPropCollectors {
  1277  		w.blockPropCollectors[i].AddPrevDataBlockToIndexBlock()
  1278  	}
  1279  }
  1280  
  1281  // addIndexEntrySync adds an index entry for the specified key and block handle.
  1282  // addIndexEntrySync is only called synchronously, once Writer.Close has been called.
  1283  // addIndexEntrySync should only be called if we're sure that index entries
  1284  // aren't being written asynchronously.
  1285  //
  1286  // Invariant:
  1287  //  1. addIndexEntrySync must not store references to the prevKey or key InternalKeys,
  1288  //     or to the tmp byte slice. That is, these must be either deep copied or encoded.
  1289  func (w *Writer) addIndexEntrySync(
  1290  	prevKey, key InternalKey, bhp BlockHandleWithProperties, tmp []byte,
  1291  ) error {
  1292  	sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf)
  1293  	shouldFlush := supportsTwoLevelIndex(
  1294  		w.tableFormat) && w.indexBlock.shouldFlush(
  1295  		sep, encodedBHPEstimatedSize, w.indexBlockSize, w.indexBlockSizeThreshold,
  1296  	)
  1297  	var flushableIndexBlock *indexBlockBuf
  1298  	var props []byte
  1299  	var err error
  1300  	if shouldFlush {
  1301  		flushableIndexBlock = w.indexBlock
  1302  		w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled)
  1303  
  1304  		// Call BlockPropertyCollector.FinishIndexBlock, since we've decided to
  1305  		// flush the index block.
  1306  		props, err = w.finishIndexBlockProps()
  1307  		if err != nil {
  1308  			return err
  1309  		}
  1310  	}
  1311  
  1312  	err = w.addIndexEntry(sep, bhp, tmp, flushableIndexBlock, w.indexBlock, 0, props)
  1313  	if flushableIndexBlock != nil {
  1314  		flushableIndexBlock.clear()
  1315  		indexBlockBufPool.Put(flushableIndexBlock)
  1316  	}
  1317  	w.addPrevDataBlockToIndexBlockProps()
  1318  	return err
  1319  }
  1320  
  1321  func shouldFlush(
  1322  	key InternalKey,
  1323  	valueLen int,
  1324  	restartInterval, estimatedBlockSize, numEntries, targetBlockSize, sizeThreshold int,
  1325  ) bool {
  1326  	if numEntries == 0 {
  1327  		return false
  1328  	}
  1329  
  1330  	if estimatedBlockSize >= targetBlockSize {
  1331  		return true
  1332  	}
  1333  
  1334  	// The block is currently smaller than the target size.
  1335  	if estimatedBlockSize <= sizeThreshold {
  1336  		// The block is smaller than the threshold size at which we'll consider
  1337  		// flushing it.
  1338  		return false
  1339  	}
  1340  
  1341  	newSize := estimatedBlockSize + key.Size() + valueLen
  1342  	if numEntries%restartInterval == 0 {
  1343  		newSize += 4
  1344  	}
  1345  	newSize += 4                              // varint for shared prefix length
  1346  	newSize += uvarintLen(uint32(key.Size())) // varint for unshared key bytes
  1347  	newSize += uvarintLen(uint32(valueLen))   // varint for value size
  1348  	// Flush if the block plus the new entry is larger than the target size.
  1349  	return newSize > targetBlockSize
  1350  }
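
// A worked example of the flush decision above (all sizes hypothetical): with
// targetBlockSize=32768, sizeThreshold=29491, estimatedBlockSize=30000,
// key.Size()=24, valueLen=100, restartInterval=16 and numEntries not on a
// restart boundary,
//
//	newSize = 30000 + 24 + 100 // existing block plus the new entry
//	        + 4                // shared-prefix varint allowance
//	        + 1 + 1            // uvarintLen(24), uvarintLen(100)
//	        = 30130
//
// 30130 <= 32768, so the entry is admitted and the block is not flushed yet.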
  1351  
  1352  const keyAllocSize = 256 << 10
  1353  
  1354  func cloneKeyWithBuf(k InternalKey, buf []byte) ([]byte, InternalKey) {
  1355  	if len(k.UserKey) == 0 {
  1356  		return buf, k
  1357  	}
  1358  	if len(buf) < len(k.UserKey) {
  1359  		buf = make([]byte, len(k.UserKey)+keyAllocSize)
  1360  	}
  1361  	n := copy(buf, k.UserKey)
  1362  	return buf[n:], InternalKey{UserKey: buf[:n:n], Trailer: k.Trailer}
  1363  }
  1364  
  1365  // Invariants: The byte slice returned by finishIndexBlockProps is
  1366  // heap-allocated and has its own lifetime, independent of the Writer and the
  1367  // blockPropsEncoder, and it is safe to:
  1368  //
  1369  //  1. Reuse w.blockPropsEncoder without first encoding the byte slice
  1370  //     returned.
  1371  //  2. Store the byte slice in the Writer since it is a copy and not supported
  1372  //     by an underlying buffer.
  1373  func (w *Writer) finishIndexBlockProps() ([]byte, error) {
  1374  	w.blockPropsEncoder.resetProps()
  1375  	for i := range w.blockPropCollectors {
  1376  		scratch := w.blockPropsEncoder.getScratchForProp()
  1377  		var err error
  1378  		if scratch, err = w.blockPropCollectors[i].FinishIndexBlock(scratch); err != nil {
  1379  			return nil, err
  1380  		}
  1381  		if len(scratch) > 0 {
  1382  			w.blockPropsEncoder.addProp(shortID(i), scratch)
  1383  		}
  1384  	}
  1385  	return w.blockPropsEncoder.props(), nil
  1386  }
  1387  
  1388  // finishIndexBlock finishes the current index block and buffers it as a partition
  1389  // of the two-level index. This is only used when two-level indexes are enabled.
  1390  //
  1391  // Invariants:
  1392  //  1. The props slice passed into finishIndexBlock must not be owned by any
  1393  //     other struct, since it will be stored in the Writer.indexPartitions
  1394  //     slice.
  1395  //  2. None of the buffers owned by indexBuf will be shallow copied and stored elsewhere.
  1396  //     That is, it must be safe to reuse indexBuf after finishIndexBlock has been called.
  1397  func (w *Writer) finishIndexBlock(indexBuf *indexBlockBuf, props []byte) error {
  1398  	part := indexBlockAndBlockProperties{
  1399  		nEntries: indexBuf.block.nEntries, properties: props,
  1400  	}
  1401  	w.indexSepAlloc, part.sep = cloneKeyWithBuf(
  1402  		base.DecodeInternalKey(indexBuf.block.curKey), w.indexSepAlloc,
  1403  	)
  1404  	bk := indexBuf.finish()
  1405  	if len(w.indexBlockAlloc) < len(bk) {
  1406  		// Allocate enough bytes for approximately 16 index blocks.
  1407  		w.indexBlockAlloc = make([]byte, len(bk)*16)
  1408  	}
  1409  	n := copy(w.indexBlockAlloc, bk)
  1410  	part.block = w.indexBlockAlloc[:n:n]
  1411  	w.indexBlockAlloc = w.indexBlockAlloc[n:]
  1412  	w.indexPartitions = append(w.indexPartitions, part)
  1413  	return nil
  1414  }
  1415  
  1416  func (w *Writer) writeTwoLevelIndex() (BlockHandle, error) {
  1417  	props, err := w.finishIndexBlockProps()
  1418  	if err != nil {
  1419  		return BlockHandle{}, err
  1420  	}
  1421  	// Add the final unfinished index.
  1422  	if err = w.finishIndexBlock(w.indexBlock, props); err != nil {
  1423  		return BlockHandle{}, err
  1424  	}
  1425  
  1426  	for i := range w.indexPartitions {
  1427  		b := &w.indexPartitions[i]
  1428  		w.props.NumDataBlocks += uint64(b.nEntries)
  1429  
  1430  		data := b.block
  1431  		w.props.IndexSize += uint64(len(data))
  1432  		bh, err := w.writeBlock(data, w.compression, &w.blockBuf)
  1433  		if err != nil {
  1434  			return BlockHandle{}, err
  1435  		}
  1436  		bhp := BlockHandleWithProperties{
  1437  			BlockHandle: bh,
  1438  			Props:       b.properties,
  1439  		}
  1440  		encoded := encodeBlockHandleWithProperties(w.blockBuf.tmp[:], bhp)
  1441  		w.topLevelIndexBlock.add(b.sep, encoded)
  1442  	}
  1443  
  1444  	// NB: RocksDB includes the block trailer length in the index size
  1445  	// property, though it doesn't include the trailer in the top level
  1446  	// index size property.
  1447  	w.props.IndexPartitions = uint64(len(w.indexPartitions))
  1448  	w.props.TopLevelIndexSize = uint64(w.topLevelIndexBlock.estimatedSize())
  1449  	w.props.IndexSize += w.props.TopLevelIndexSize + blockTrailerLen
  1450  
  1451  	return w.writeBlock(w.topLevelIndexBlock.finish(), w.compression, &w.blockBuf)
  1452  }
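
// Schematically, the two-level index produced above looks like the following
// (separator names illustrative):
//
//	top-level index block:
//	    sep_0 -> BlockHandleWithProperties(index partition 0)
//	    sep_1 -> BlockHandleWithProperties(index partition 1)
//	    ...
//	index partition i:
//	    data-block separator -> BlockHandleWithProperties(data block)
//	    ...
//
// where each sep_i is the separator cloned in finishIndexBlock and each
// partition's properties come from finishIndexBlockProps.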
  1453  
  1454  func compressAndChecksum(b []byte, compression Compression, blockBuf *blockBuf) []byte {
  1455  	// Compress the buffer, discarding the result if the improvement isn't at
  1456  	// least 12.5%.
  1457  	blockType, compressed := compressBlock(compression, b, blockBuf.compressedBuf)
  1458  	if blockType != noCompressionBlockType && cap(compressed) > cap(blockBuf.compressedBuf) {
  1459  		blockBuf.compressedBuf = compressed[:cap(compressed)]
  1460  	}
  1461  	if len(compressed) < len(b)-len(b)/8 {
  1462  		b = compressed
  1463  	} else {
  1464  		blockType = noCompressionBlockType
  1465  	}
  1466  
  1467  	blockBuf.tmp[0] = byte(blockType)
  1468  
  1469  	// Calculate the checksum.
  1470  	checksum := blockBuf.checksummer.checksum(b, blockBuf.tmp[:1])
  1471  	binary.LittleEndian.PutUint32(blockBuf.tmp[1:5], checksum)
  1472  	return b
  1473  }
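
// For example (sizes illustrative): a 4096-byte block is kept in compressed
// form only if the compressed output is smaller than 4096 - 4096/8 = 3584
// bytes; otherwise the original bytes are used and the trailer's block type is
// set to noCompressionBlockType.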
  1474  
  1475  func (w *Writer) writeCompressedBlock(block []byte, blockTrailerBuf []byte) (BlockHandle, error) {
  1476  	bh := BlockHandle{Offset: w.meta.Size, Length: uint64(len(block))}
  1477  
  1478  	if w.cacheID != 0 && w.fileNum != 0 {
  1479  		// Remove the block being written from the cache. This provides defense in
  1480  		// depth against bugs which cause cache collisions.
  1481  		//
  1482  		// TODO(peter): Alternatively, we could add the uncompressed value to the
  1483  		// cache.
  1484  		w.cache.Delete(w.cacheID, w.fileNum, bh.Offset)
  1485  	}
  1486  
  1487  	// Write the bytes to the file.
  1488  	n, err := w.writer.Write(block)
  1489  	if err != nil {
  1490  		return BlockHandle{}, err
  1491  	}
  1492  	w.meta.Size += uint64(n)
  1493  	n, err = w.writer.Write(blockTrailerBuf[:blockTrailerLen])
  1494  	if err != nil {
  1495  		return BlockHandle{}, err
  1496  	}
  1497  	w.meta.Size += uint64(n)
  1498  
  1499  	return bh, nil
  1500  }
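
// The bytes written above follow the standard block layout, with the trailer
// populated earlier by compressAndChecksum:
//
//	<block contents> <1-byte block/compression type> <4-byte little-endian checksum>
//
// Note that BlockHandle.Length covers only the block contents, not the
// blockTrailerLen trailer bytes.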
  1501  
  1502  func (w *Writer) writeBlock(
  1503  	b []byte, compression Compression, blockBuf *blockBuf,
  1504  ) (BlockHandle, error) {
  1505  	b = compressAndChecksum(b, compression, blockBuf)
  1506  	return w.writeCompressedBlock(b, blockBuf.tmp[:])
  1507  }
  1508  
  1509  // assertFormatCompatibility ensures that the features present on the table are
  1510  // compatible with the table format version.
  1511  func (w *Writer) assertFormatCompatibility() error {
  1512  	// PebbleDBv1: block properties.
  1513  	if len(w.blockPropCollectors) > 0 && w.tableFormat < TableFormatPebblev1 {
  1514  		return errors.Newf(
  1515  			"table format version %s is less than the minimum required version %s for block properties",
  1516  			w.tableFormat, TableFormatPebblev1,
  1517  		)
  1518  	}
  1519  
  1520  	// PebbleDBv2: range keys.
  1521  	if w.props.NumRangeKeys() > 0 && w.tableFormat < TableFormatPebblev2 {
  1522  		return errors.Newf(
  1523  			"table format version %s is less than the minimum required version %s for range keys",
  1524  			w.tableFormat, TableFormatPebblev2,
  1525  		)
  1526  	}
  1527  
  1528  	return nil
  1529  }
  1530  
  1531  // Close finishes writing the table and closes the underlying file that the
  1532  // table was written to.
  1533  func (w *Writer) Close() (err error) {
  1534  	defer func() {
  1535  		if w.syncer == nil {
  1536  			return
  1537  		}
  1538  		err1 := w.syncer.Close()
  1539  		if err == nil {
  1540  			err = err1
  1541  		}
  1542  		w.syncer = nil
  1543  	}()
  1544  
  1545  	// finish must be called before we check for an error, because finish will
  1546  	// block until every single task added to the writeQueue has been processed,
  1547  	// and an error could be encountered while any of those tasks are processed.
  1548  	if err = w.coordination.writeQueue.finish(); err != nil {
  1549  		w.err = err
  1550  	}
  1551  
  1552  	if w.err != nil {
  1553  		return w.err
  1554  	}
  1555  
  1556  // w.meta.LargestPointKey is only consumed once the Writer is closed, so it is
  1557  // safe to defer setting it until Close.
  1558  //
  1559  // The following invariants ensure that setting the largest key at this point
  1560  // is correct:
  1561  	// 1. Keys must only be added to the Writer in an increasing order.
  1562  // 2. The current w.dataBlockBuf is guaranteed to hold the latest key added to the Writer.
  1563  //    This must be true because a w.dataBlockBuf is only switched out when a data block is
  1564  //    flushed, and when that happens, addPoint adds the triggering key to the new
  1565  //    w.dataBlockBuf right after the flush.
  1566  	if w.dataBlockBuf.dataBlock.nEntries >= 1 {
  1567  		w.meta.SetLargestPointKey(base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey).Clone())
  1568  	}
  1569  
  1570  	// Finish the last data block, or force an empty data block if there
  1571  	// aren't any data blocks at all.
  1572  	if w.dataBlockBuf.dataBlock.nEntries > 0 || w.indexBlock.block.nEntries == 0 {
  1573  		bh, err := w.writeBlock(w.dataBlockBuf.dataBlock.finish(), w.compression, &w.dataBlockBuf.blockBuf)
  1574  		if err != nil {
  1575  			w.err = err
  1576  			return w.err
  1577  		}
  1578  		var bhp BlockHandleWithProperties
  1579  		if bhp, err = w.maybeAddBlockPropertiesToBlockHandle(bh); err != nil {
  1580  			w.err = err
  1581  			return err
  1582  		}
  1583  		prevKey := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey)
  1584  		if err = w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil {
  1585  			w.err = err
  1586  			return err
  1587  		}
  1588  	}
  1589  	w.props.DataSize = w.meta.Size
  1590  
  1591  	// Write the filter block.
  1592  	var metaindex rawBlockWriter
  1593  	metaindex.restartInterval = 1
  1594  	if w.filter != nil {
  1595  		b, err := w.filter.finish()
  1596  		if err != nil {
  1597  			w.err = err
  1598  			return w.err
  1599  		}
  1600  		bh, err := w.writeBlock(b, NoCompression, &w.blockBuf)
  1601  		if err != nil {
  1602  			w.err = err
  1603  			return w.err
  1604  		}
  1605  		n := encodeBlockHandle(w.blockBuf.tmp[:], bh)
  1606  		metaindex.add(InternalKey{UserKey: []byte(w.filter.metaName())}, w.blockBuf.tmp[:n])
  1607  		w.props.FilterPolicyName = w.filter.policyName()
  1608  		w.props.FilterSize = bh.Length
  1609  	}
  1610  
  1611  	var indexBH BlockHandle
  1612  	if w.twoLevelIndex {
  1613  		w.props.IndexType = twoLevelIndex
  1614  		// Write the two level index block.
  1615  		indexBH, err = w.writeTwoLevelIndex()
  1616  		if err != nil {
  1617  			w.err = err
  1618  			return w.err
  1619  		}
  1620  	} else {
  1621  		w.props.IndexType = binarySearchIndex
  1622  		// NB: RocksDB includes the block trailer length in the index size
  1623  		// property, though it doesn't include the trailer in the filter size
  1624  		// property.
  1625  		w.props.IndexSize = uint64(w.indexBlock.estimatedSize()) + blockTrailerLen
  1626  		w.props.NumDataBlocks = uint64(w.indexBlock.block.nEntries)
  1627  
  1628  		// Write the single level index block.
  1629  		indexBH, err = w.writeBlock(w.indexBlock.finish(), w.compression, &w.blockBuf)
  1630  		if err != nil {
  1631  			w.err = err
  1632  			return w.err
  1633  		}
  1634  	}
  1635  
  1636  // Write the range-del block. The block handle must be added to the metaindex
  1637  // block only after the properties block has been written, because the entries
  1638  // in the metaindex block must be sorted by key.
  1639  	var rangeDelBH BlockHandle
  1640  	if w.props.NumRangeDeletions > 0 {
  1641  		if !w.rangeDelV1Format {
  1642  			// Because the range tombstones are fragmented in the v2 format, the end
  1643  			// key of the last added range tombstone will be the largest range
  1644  			// tombstone key. Note that we need to make this into a range deletion
  1645  			// sentinel because sstable boundaries are inclusive while the end key of
  1646  			// a range deletion tombstone is exclusive. A Clone() is necessary as
  1647  			// rangeDelBlock.curValue is the same slice that will get passed
  1648  			// into w.writer, and some implementations of vfs.File mutate the
  1649  			// slice passed into Write(). Also, w.meta will often outlive the
  1650  			// blockWriter, and so cloning curValue allows the rangeDelBlock's
  1651  			// internal buffer to get gc'd.
  1652  			k := base.MakeRangeDeleteSentinelKey(w.rangeDelBlock.curValue).Clone()
  1653  			w.meta.SetLargestRangeDelKey(k)
  1654  		}
  1655  		rangeDelBH, err = w.writeBlock(w.rangeDelBlock.finish(), NoCompression, &w.blockBuf)
  1656  		if err != nil {
  1657  			w.err = err
  1658  			return w.err
  1659  		}
  1660  	}
  1661  
  1662  	// Write the range-key block, flushing any remaining spans from the
  1663  	// fragmenter first.
  1664  	w.fragmenter.Finish()
  1665  
  1666  	var rangeKeyBH BlockHandle
  1667  	if w.props.NumRangeKeys() > 0 {
  1668  		key := base.DecodeInternalKey(w.rangeKeyBlock.curKey)
  1669  		kind := key.Kind()
  1670  		endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue)
  1671  		if !ok {
  1672  			w.err = errors.Newf("invalid end key: %s", w.rangeKeyBlock.curValue)
  1673  			return w.err
  1674  		}
  1675  		k := base.MakeExclusiveSentinelKey(kind, endKey).Clone()
  1676  		w.meta.SetLargestRangeKey(k)
  1677  		// TODO(travers): The lack of compression on the range key block matches the
  1678  		// lack of compression on the range-del block. Revisit whether we want to
  1679  		// enable compression on this block.
  1680  		rangeKeyBH, err = w.writeBlock(w.rangeKeyBlock.finish(), NoCompression, &w.blockBuf)
  1681  		if err != nil {
  1682  			w.err = err
  1683  			return w.err
  1684  		}
  1685  	}
  1686  
  1687  	// Add the range key block handle to the metaindex block. Note that we add the
  1688  	// block handle to the metaindex block before the other meta blocks as the
  1689  	// metaindex block entries must be sorted, and the range key block name sorts
  1690  	// before the other block names.
  1691  	if w.props.NumRangeKeys() > 0 {
  1692  		n := encodeBlockHandle(w.blockBuf.tmp[:], rangeKeyBH)
  1693  		metaindex.add(InternalKey{UserKey: []byte(metaRangeKeyName)}, w.blockBuf.tmp[:n])
  1694  	}
  1695  
  1696  	{
  1697  		userProps := make(map[string]string)
  1698  		for i := range w.propCollectors {
  1699  			if err := w.propCollectors[i].Finish(userProps); err != nil {
  1700  				w.err = err
  1701  				return err
  1702  			}
  1703  		}
  1704  		for i := range w.blockPropCollectors {
  1705  			scratch := w.blockPropsEncoder.getScratchForProp()
  1706  			// Place the shortID in the first byte.
  1707  			scratch = append(scratch, byte(i))
  1708  			buf, err :=
  1709  				w.blockPropCollectors[i].FinishTable(scratch)
  1710  			if err != nil {
  1711  				w.err = err
  1712  				return err
  1713  			}
  1714  			var prop string
  1715  			if len(buf) > 0 {
  1716  				prop = string(buf)
  1717  			}
  1718  			// NB: The property is populated in the map even if it is the
  1719  			// empty string, since the presence in the map is what indicates
  1720  			// that the block property collector was used when writing.
  1721  			userProps[w.blockPropCollectors[i].Name()] = prop
  1722  		}
  1723  		if len(userProps) > 0 {
  1724  			w.props.UserProperties = userProps
  1725  		}
  1726  
  1727  		// Write the properties block.
  1728  		var raw rawBlockWriter
  1729  		// The restart interval is set to infinity because the properties block
  1730  		// is always read sequentially and cached in a heap-located object. This
  1731  		// reduces table size without a significant impact on performance.
  1732  		raw.restartInterval = propertiesBlockRestartInterval
  1733  		w.props.CompressionOptions = rocksDBCompressionOptions
  1734  		w.props.save(&raw)
  1735  		bh, err := w.writeBlock(raw.finish(), NoCompression, &w.blockBuf)
  1736  		if err != nil {
  1737  			w.err = err
  1738  			return w.err
  1739  		}
  1740  		n := encodeBlockHandle(w.blockBuf.tmp[:], bh)
  1741  		metaindex.add(InternalKey{UserKey: []byte(metaPropertiesName)}, w.blockBuf.tmp[:n])
  1742  	}
  1743  
  1744  	// Add the range deletion block handle to the metaindex block.
  1745  	if w.props.NumRangeDeletions > 0 {
  1746  		n := encodeBlockHandle(w.blockBuf.tmp[:], rangeDelBH)
  1747  		// The v2 range-del block encoding is backwards compatible with the v1
  1748  		// encoding. We add meta-index entries for both the old name and the new
  1749  		// name so that old code can continue to find the range-del block and new
  1750  		// code knows that the range tombstones in the block are fragmented and
  1751  		// sorted.
  1752  		metaindex.add(InternalKey{UserKey: []byte(metaRangeDelName)}, w.blockBuf.tmp[:n])
  1753  		if !w.rangeDelV1Format {
  1754  			metaindex.add(InternalKey{UserKey: []byte(metaRangeDelV2Name)}, w.blockBuf.tmp[:n])
  1755  		}
  1756  	}
  1757  
  1758  	// Write the metaindex block. It might be an empty block, if the filter
  1759  	// policy is nil. NoCompression is specified because a) RocksDB never
  1760  	// compresses the meta-index block and b) RocksDB has some code paths which
  1761  	// expect the meta-index block to not be compressed.
  1762  	metaindexBH, err := w.writeBlock(metaindex.blockWriter.finish(), NoCompression, &w.blockBuf)
  1763  	if err != nil {
  1764  		w.err = err
  1765  		return w.err
  1766  	}
  1767  
  1768  	// Write the table footer.
  1769  	footer := footer{
  1770  		format:      w.tableFormat,
  1771  		checksum:    w.blockBuf.checksummer.checksumType,
  1772  		metaindexBH: metaindexBH,
  1773  		indexBH:     indexBH,
  1774  	}
  1775  	var n int
  1776  	if n, err = w.writer.Write(footer.encode(w.blockBuf.tmp[:])); err != nil {
  1777  		w.err = err
  1778  		return w.err
  1779  	}
  1780  	w.meta.Size += uint64(n)
  1781  	w.meta.Properties = w.props
  1782  
  1783  	// Flush the buffer.
  1784  	if w.bufWriter != nil {
  1785  		if err := w.bufWriter.Flush(); err != nil {
  1786  			w.err = err
  1787  			return err
  1788  		}
  1789  	}
  1790  
  1791  	// Check that the features present in the table are compatible with the format
  1792  	// configured for the table.
  1793  	if err = w.assertFormatCompatibility(); err != nil {
  1794  		w.err = err
  1795  		return w.err
  1796  	}
  1797  
  1798  	if err := w.syncer.Sync(); err != nil {
  1799  		w.err = err
  1800  		return err
  1801  	}
  1802  
  1803  	w.dataBlockBuf.clear()
  1804  	dataBlockBufPool.Put(w.dataBlockBuf)
  1805  	w.dataBlockBuf = nil
  1806  	w.indexBlock.clear()
  1807  	indexBlockBufPool.Put(w.indexBlock)
  1808  	w.indexBlock = nil
  1809  
  1810  	// Make any future calls to Set or Close return an error.
  1811  	if w.err != nil {
  1812  		return w.err
  1813  	}
  1814  	w.err = errWriterClosed
  1815  	return nil
  1816  }
  1817  
  1818  // EstimatedSize returns the estimated size of the sstable being written if
  1819  // Close were called now, without adding any additional keys.
  1820  func (w *Writer) EstimatedSize() uint64 {
  1821  	if invariants.Enabled && !w.coordination.parallelismEnabled {
  1822  		// w.meta.Size should only be accessed from the writeQueue goroutine when
  1823  		// parallelism is enabled; since parallelism is disabled here, reading it
  1824  		// on this goroutine for the invariant check is safe.
  1825  		if w.coordination.sizeEstimate.size() != w.meta.Size {
  1826  			panic("sstable size estimation sans parallelism is incorrect")
  1827  		}
  1828  	}
  1829  	return w.coordination.sizeEstimate.size() +
  1830  		uint64(w.dataBlockBuf.dataBlock.estimatedSize()) +
  1831  		w.indexBlock.estimatedSize()
  1832  }
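
// For example (figures illustrative): if 1 MiB of blocks has already been
// flushed (or queued, when parallelism is enabled), the in-progress data block
// holds ~3 KiB and the in-progress index block ~200 bytes, then EstimatedSize
// returns roughly 1<<20 + 3<<10 + 200 bytes.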
  1833  
  1834  // Metadata returns the metadata for the finished sstable. Only valid to call
  1835  // after the sstable has been finished.
  1836  func (w *Writer) Metadata() (*WriterMetadata, error) {
  1837  	if w.syncer != nil {
  1838  		return nil, errors.New("bitalostable: writer is not closed")
  1839  	}
  1840  	return &w.meta, nil
  1841  }
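
// A minimal sketch of the intended call order (error handling elided; keys and
// values hypothetical):
//
//	_ = w.Set([]byte("a"), []byte("1")) // add keys in sorted order
//	if err := w.Close(); err != nil {   // flush remaining blocks, write footer, sync
//		return err
//	}
//	meta, _ := w.Metadata() // only valid once Close has released the syncer
//	_ = meta.Size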
  1842  
  1843  // WriterOption provides an interface to do work on the Writer while it is
  1844  // being opened.
  1845  type WriterOption interface {
  1846  	// writerApply is called on the writer during opening in order to set
  1847  	// internal parameters.
  1848  	writerApply(*Writer)
  1849  }
  1850  
  1851  // PreviousPointKeyOpt is a WriterOption that provides access to the last
  1852  // point key written to the writer while building an sstable.
  1853  type PreviousPointKeyOpt struct {
  1854  	w *Writer
  1855  }
  1856  
  1857  // UnsafeKey returns the last point key written to the writer to which this
  1858  // option was passed during creation. The returned key points directly into
  1859  // a buffer belonging to the Writer. The returned key remains valid only until
  1860  // the next point key is added to the Writer.
  1861  // Invariant: UnsafeKey must not be called after the Writer is closed.
  1862  func (o PreviousPointKeyOpt) UnsafeKey() base.InternalKey {
  1863  	if o.w == nil {
  1864  		return base.InvalidInternalKey
  1865  	}
  1866  
  1867  	if o.w.dataBlockBuf.dataBlock.nEntries >= 1 {
  1868  		// o.w.dataBlockBuf.dataBlock.curKey is guaranteed to point to the last point key
  1869  		// which was added to the Writer.
  1870  		return base.DecodeInternalKey(o.w.dataBlockBuf.dataBlock.curKey)
  1871  	}
  1872  	return base.InternalKey{}
  1873  }
  1874  
  1875  func (o *PreviousPointKeyOpt) writerApply(w *Writer) {
  1876  	o.w = w
  1877  }
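
// A minimal usage sketch for PreviousPointKeyOpt (f and the keys below are
// hypothetical; error handling elided):
//
//	var prev PreviousPointKeyOpt
//	w := NewWriter(f, WriterOptions{}, &prev)
//	_ = w.Set([]byte("a"), []byte("1"))
//	last := prev.UnsafeKey() // aliases the Writer's current data block buffer
//	_ = last                 // use (or clone) before the next key is added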
  1878  
  1879  // internalTableOpt is a WriterOption that sets properties for sstables being
  1880  // created by the db itself (i.e. through flushes and compactions), as opposed
  1881  // to those meant for ingestion.
  1882  type internalTableOpt struct{}
  1883  
  1884  func (i internalTableOpt) writerApply(w *Writer) {
  1885  	// Set the external sst version to 0. This is what RocksDB expects for
  1886  	// db-internal sstables; otherwise, it could apply a global sequence number.
  1887  	w.props.ExternalFormatVersion = 0
  1888  }
  1889  
  1890  // NewWriter returns a new table writer for the file. Closing the writer will
  1891  // close the file.
  1892  func NewWriter(f writeCloseSyncer, o WriterOptions, extraOpts ...WriterOption) *Writer {
  1893  	o = o.ensureDefaults()
  1894  	w := &Writer{
  1895  		syncer: f,
  1896  		meta: WriterMetadata{
  1897  			SmallestSeqNum: math.MaxUint64,
  1898  		},
  1899  		blockSize:               o.BlockSize,
  1900  		blockSizeThreshold:      (o.BlockSize*o.BlockSizeThreshold + 99) / 100,
  1901  		indexBlockSize:          o.IndexBlockSize,
  1902  		indexBlockSizeThreshold: (o.IndexBlockSize*o.BlockSizeThreshold + 99) / 100,
  1903  		compare:                 o.Comparer.Compare,
  1904  		split:                   o.Comparer.Split,
  1905  		formatKey:               o.Comparer.FormatKey,
  1906  		compression:             o.Compression,
  1907  		separator:               o.Comparer.Separator,
  1908  		successor:               o.Comparer.Successor,
  1909  		tableFormat:             o.TableFormat,
  1910  		cache:                   o.Cache,
  1911  		restartInterval:         o.BlockRestartInterval,
  1912  		checksumType:            o.Checksum,
  1913  		indexBlock:              newIndexBlockBuf(o.Parallelism),
  1914  		rangeDelBlock: blockWriter{
  1915  			restartInterval: 1,
  1916  		},
  1917  		rangeKeyBlock: blockWriter{
  1918  			restartInterval: 1,
  1919  		},
  1920  		topLevelIndexBlock: blockWriter{
  1921  			restartInterval: 1,
  1922  		},
  1923  		fragmenter: keyspan.Fragmenter{
  1924  			Cmp:    o.Comparer.Compare,
  1925  			Format: o.Comparer.FormatKey,
  1926  		},
  1927  	}
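
	// As an illustration of the threshold arithmetic above (values assumed):
	// with BlockSize = 4096 and BlockSizeThreshold = 90, blockSizeThreshold is
	// (4096*90+99)/100 = 3687, i.e. the percentage is rounded up to whole
	// bytes. The same formula is applied to IndexBlockSize to derive
	// indexBlockSizeThreshold.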
  1928  
  1929  	w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType)
  1930  
  1931  	w.blockBuf = blockBuf{
  1932  		checksummer: checksummer{checksumType: o.Checksum},
  1933  	}
  1934  
  1935  	w.coordination.init(o.Parallelism, w)
  1936  
  1937  	if f == nil {
  1938  		w.err = errors.New("bitalostable: nil file")
  1939  		return w
  1940  	}
  1941  
  1942  	// Note that the extra WriterOption values are applied in two places: those
  1943  	// with a preApply() method are applied here, and the rest are applied after
  1944  	// the default properties have been set.
  1945  	type preApply interface{ preApply() }
  1946  	for _, opt := range extraOpts {
  1947  		if _, ok := opt.(preApply); ok {
  1948  			opt.writerApply(w)
  1949  		}
  1950  	}
  1951  
  1952  	w.props.PrefixExtractorName = "nullptr"
  1953  	if o.FilterPolicy != nil {
  1954  		switch o.FilterType {
  1955  		case TableFilter:
  1956  			w.filter = newTableFilterWriter(o.FilterPolicy)
  1957  			if w.split != nil {
  1958  				w.props.PrefixExtractorName = o.Comparer.Name
  1959  				w.props.PrefixFiltering = true
  1960  			} else {
  1961  				w.props.WholeKeyFiltering = true
  1962  			}
  1963  		default:
  1964  			panic(fmt.Sprintf("unknown filter type: %v", o.FilterType))
  1965  		}
  1966  	}
  1967  
  1968  	w.props.ColumnFamilyID = math.MaxInt32
  1969  	w.props.ComparerName = o.Comparer.Name
  1970  	w.props.CompressionName = o.Compression.String()
  1971  	w.props.MergerName = o.MergerName
  1972  	w.props.PropertyCollectorNames = "[]"
  1973  	w.props.ExternalFormatVersion = rocksDBExternalFormatVersion
  1974  
  1975  	if len(o.TablePropertyCollectors) > 0 || len(o.BlockPropertyCollectors) > 0 {
  1976  		var buf bytes.Buffer
  1977  		buf.WriteString("[")
  1978  		if len(o.TablePropertyCollectors) > 0 {
  1979  			w.propCollectors = make([]TablePropertyCollector, len(o.TablePropertyCollectors))
  1980  			for i := range o.TablePropertyCollectors {
  1981  				w.propCollectors[i] = o.TablePropertyCollectors[i]()
  1982  				if i > 0 {
  1983  					buf.WriteString(",")
  1984  				}
  1985  				buf.WriteString(w.propCollectors[i].Name())
  1986  			}
  1987  		}
  1988  		if len(o.BlockPropertyCollectors) > 0 {
  1989  			// shortID is a uint8, so we cannot exceed that number of block
  1990  			// property collectors.
  1991  			if len(o.BlockPropertyCollectors) > math.MaxUint8 {
  1992  				w.err = errors.New("bitalostable: too many block property collectors")
  1993  				return w
  1994  			}
  1995  			// The shortID assigned to a collector is the same as its index in
  1996  			// this slice.
  1997  			w.blockPropCollectors = make([]BlockPropertyCollector, len(o.BlockPropertyCollectors))
  1998  			for i := range o.BlockPropertyCollectors {
  1999  				w.blockPropCollectors[i] = o.BlockPropertyCollectors[i]()
  2000  				if i > 0 || len(o.TablePropertyCollectors) > 0 {
  2001  					buf.WriteString(",")
  2002  				}
  2003  				buf.WriteString(w.blockPropCollectors[i].Name())
  2004  			}
  2005  		}
  2006  		buf.WriteString("]")
  2007  		w.props.PropertyCollectorNames = buf.String()
  2008  	}
  2009  
  2010  	// Apply the remaining WriterOptions that do not have a preApply() method.
  2011  	for _, opt := range extraOpts {
  2012  		if _, ok := opt.(preApply); !ok {
  2013  			opt.writerApply(w)
  2014  		}
  2015  	}
  2016  
  2017  	// Initialize the range key fragmenter and encoder.
  2018  	w.fragmenter.Emit = w.coalesceSpans
  2019  	w.rangeKeyEncoder.Emit = w.addRangeKey
  2020  
  2021  	// If f does not have a Flush method, do our own buffering.
  2022  	if _, ok := f.(flusher); ok {
  2023  		w.writer = f
  2024  	} else {
  2025  		w.bufWriter = bufio.NewWriter(f)
  2026  		w.writer = w.bufWriter
  2027  	}
  2028  	return w
  2029  }
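
// A minimal construction sketch (the file handle returned by fs.Create and the
// option values are assumptions; error handling elided):
//
//	f, _ := fs.Create("000001.sst")
//	w := NewWriter(f, WriterOptions{
//		BlockSize:   32 << 10,
//		Compression: SnappyCompression,
//		Parallelism: true,
//	})
//	_ = w.Set([]byte("k"), []byte("v"))
//	_ = w.Close() // Close also closes f via the syncer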
  2030  
  2031  func init() {
  2032  	private.SSTableWriterDisableKeyOrderChecks = func(i interface{}) {
  2033  		w := i.(*Writer)
  2034  		w.disableKeyOrderChecks = true
  2035  	}
  2036  	private.SSTableInternalTableOpt = internalTableOpt{}
  2037  }