github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/sstable/writer.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"encoding/binary"
    11  	"errors"
    12  	"fmt"
    13  	"io"
    14  	"math"
    15  
    16  	"github.com/golang/snappy"
    17  	"github.com/petermattis/pebble/internal/base"
    18  	"github.com/petermattis/pebble/internal/crc"
    19  	"github.com/petermattis/pebble/internal/rangedel"
    20  )
    21  
// WriterMetadata holds info about a finished sstable.
type WriterMetadata struct {
	// Size is the number of bytes written to the table so far.
	Size uint64
	// Smallest/Largest point and range-deletion keys observed. A nil UserKey
	// indicates that no key of that category has been added yet.
	SmallestPoint InternalKey
	SmallestRange InternalKey
	LargestPoint  InternalKey
	LargestRange  InternalKey
	// SmallestSeqNum and LargestSeqNum bound the sequence numbers of all
	// added entries (points and range tombstones).
	SmallestSeqNum uint64
	LargestSeqNum  uint64
}
    32  
    33  func (m *WriterMetadata) updateSeqNum(seqNum uint64) {
    34  	if m.SmallestSeqNum > seqNum {
    35  		m.SmallestSeqNum = seqNum
    36  	}
    37  	if m.LargestSeqNum < seqNum {
    38  		m.LargestSeqNum = seqNum
    39  	}
    40  }
    41  
    42  func (m *WriterMetadata) updateLargestPoint(key InternalKey) {
    43  	// Avoid the memory allocation in InternalKey.Clone() by reusing the buffer.
    44  	m.LargestPoint.UserKey = append(m.LargestPoint.UserKey[:0], key.UserKey...)
    45  	m.LargestPoint.Trailer = key.Trailer
    46  }
    47  
    48  // Smallest returns the smaller of SmallestPoint and SmallestRange.
    49  func (m *WriterMetadata) Smallest(cmp Compare) InternalKey {
    50  	if m.SmallestPoint.UserKey == nil {
    51  		return m.SmallestRange
    52  	}
    53  	if m.SmallestRange.UserKey == nil {
    54  		return m.SmallestPoint
    55  	}
    56  	if base.InternalCompare(cmp, m.SmallestPoint, m.SmallestRange) < 0 {
    57  		return m.SmallestPoint
    58  	}
    59  	return m.SmallestRange
    60  }
    61  
    62  // Largest returns the larget of LargestPoint and LargestRange.
    63  func (m *WriterMetadata) Largest(cmp Compare) InternalKey {
    64  	if m.LargestPoint.UserKey == nil {
    65  		return m.LargestRange
    66  	}
    67  	if m.LargestRange.UserKey == nil {
    68  		return m.LargestPoint
    69  	}
    70  	if base.InternalCompare(cmp, m.LargestPoint, m.LargestRange) > 0 {
    71  		return m.LargestPoint
    72  	}
    73  	return m.LargestRange
    74  }
    75  
// flusher is implemented by sinks that buffer writes internally and can
// flush them on demand (e.g. *bufio.Writer). NewWriter uses it to decide
// whether to add its own buffering.
type flusher interface {
	Flush() error
}
    79  
// writeCloseSyncer is the interface the Writer requires of its destination
// file: writing, closing, and syncing to stable storage.
type writeCloseSyncer interface {
	io.WriteCloser
	Sync() error
}
    84  
// Writer is a table writer.
type Writer struct {
	// writer is the destination for all block writes: either syncer directly
	// (when it implements flusher) or bufWriter wrapping it.
	writer io.Writer
	// bufWriter is non-nil only when NewWriter added its own buffering.
	bufWriter *bufio.Writer
	// syncer is the underlying file. It is set to nil by Close; Metadata uses
	// a nil syncer to detect that the writer has been closed.
	syncer writeCloseSyncer
	// meta accumulates the metadata returned by Metadata.
	meta WriterMetadata
	// err is sticky: once set, subsequent mutating calls return it.
	err error
	// The following fields are copied from Options.
	blockSize               int
	blockSizeThreshold      int
	indexBlockSize          int
	indexBlockSizeThreshold int
	compare                 Compare
	split                   Split
	compression             Compression
	separator               Separator
	successor               Successor
	tableFormat             TableFormat
	// With two level indexes, the index/filter of a SST file is partitioned into
	// smaller blocks with an additional top-level index on them. When reading an
	// index/filter, only the top-level index is loaded into memory. The two level
	// index/filter then uses the top-level index to load on demand into the block
	// cache the partitions that are required to perform the index/filter query.
	//
	// Two level indexes are enabled automatically when there is more than one
	// index block.
	//
	// This is useful when there are very large index blocks, which generally occurs
	// with the usage of large keys. With large index blocks, the index blocks fight
	// the data blocks for block cache space and the index blocks are likely to be
	// re-read many times from the disk. The top level index, which has a much
	// smaller memory footprint, can be used to prevent the entire index block from
	// being loaded into the block cache.
	twoLevelIndex bool
	// Internal flag to allow creation of range-del-v1 format blocks. Only used
	// for testing. Note that v2 format blocks are backwards compatible with v1
	// format blocks.
	rangeDelV1Format bool
	// A table is a series of blocks and a block's index entry contains a
	// separator key between one block and the next. Thus, a finished block
	// cannot be written until the first key in the next block is seen.
	// pendingBH is the blockHandle of a finished block that is waiting for
	// the next call to Set. If the writer is not in this state, pendingBH
	// is zero.
	pendingBH BlockHandle
	// In-progress data, index, and range tombstone blocks.
	block         blockWriter
	indexBlock    blockWriter
	rangeDelBlock blockWriter
	// props accumulates the table properties written in the properties block.
	props Properties
	// propCollectors are the user-supplied property collectors instantiated
	// by NewWriter from Options.TablePropertyCollectors.
	propCollectors []TablePropertyCollector
	// compressedBuf is the destination buffer for snappy compression. It is
	// re-used over the lifetime of the writer, avoiding the allocation of a
	// temporary buffer for each block.
	compressedBuf []byte
	// filter accumulates the filter block. If populated, the filter ingests
	// either the output of w.split (i.e. a prefix extractor) if w.split is not
	// nil, or the full keys otherwise.
	filter filterWriter
	// tmp is a scratch buffer, large enough to hold either footerLen bytes,
	// blockTrailerLen bytes, or (5 * binary.MaxVarintLen64) bytes.
	tmp [rocksDBFooterLen]byte

	// topLevelIndexBlock and indexPartitions are used only when a two level
	// index is being built: finished index partitions accumulate in
	// indexPartitions and are pointed to by topLevelIndexBlock.
	topLevelIndexBlock blockWriter
	indexPartitions    []blockWriter
}
   150  
   151  // Set sets the value for the given key. The sequence number is set to
   152  // 0. Intended for use to externally construct an sstable before ingestion into
   153  // a DB.
   154  //
   155  // TODO(peter): untested
   156  func (w *Writer) Set(key, value []byte) error {
   157  	if w.err != nil {
   158  		return w.err
   159  	}
   160  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value)
   161  }
   162  
   163  // Delete deletes the value for the given key. The sequence number is set to
   164  // 0. Intended for use to externally construct an sstable before ingestion into
   165  // a DB.
   166  //
   167  // TODO(peter): untested
   168  func (w *Writer) Delete(key []byte) error {
   169  	if w.err != nil {
   170  		return w.err
   171  	}
   172  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil)
   173  }
   174  
   175  // DeleteRange deletes all of the keys (and values) in the range [start,end)
   176  // (inclusive on start, exclusive on end). The sequence number is set to
   177  // 0. Intended for use to externally construct an sstable before ingestion into
   178  // a DB.
   179  //
   180  // TODO(peter): untested
   181  func (w *Writer) DeleteRange(start, end []byte) error {
   182  	if w.err != nil {
   183  		return w.err
   184  	}
   185  	return w.addTombstone(base.MakeInternalKey(start, 0, InternalKeyKindRangeDelete), end)
   186  }
   187  
   188  // Merge adds an action to the DB that merges the value at key with the new
   189  // value. The details of the merge are dependent upon the configured merge
   190  // operator. The sequence number is set to 0. Intended for use to externally
   191  // construct an sstable before ingestion into a DB.
   192  //
   193  // TODO(peter): untested
   194  func (w *Writer) Merge(key, value []byte) error {
   195  	if w.err != nil {
   196  		return w.err
   197  	}
   198  	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value)
   199  }
   200  
   201  // Add adds a key/value pair to the table being written. For a given Writer,
   202  // the keys passed to Add must be in increasing order. The exception to this
   203  // rule is range deletion tombstones. Range deletion tombstones need to be
   204  // added ordered by their start key, but they can be added out of order from
   205  // point entries. Additionally, range deletion tombstones must be fragmented
   206  // (i.e. by rangedel.Fragmenter).
   207  func (w *Writer) Add(key InternalKey, value []byte) error {
   208  	if w.err != nil {
   209  		return w.err
   210  	}
   211  
   212  	if key.Kind() == InternalKeyKindRangeDelete {
   213  		return w.addTombstone(key, value)
   214  	}
   215  	return w.addPoint(key, value)
   216  }
   217  
   218  func (w *Writer) addPoint(key InternalKey, value []byte) error {
   219  	if base.InternalCompare(w.compare, w.meta.LargestPoint, key) >= 0 {
   220  		w.err = fmt.Errorf("pebble: keys must be added in order: %s, %s", w.meta.LargestPoint, key)
   221  		return w.err
   222  	}
   223  
   224  	if err := w.maybeFlush(key, value); err != nil {
   225  		return err
   226  	}
   227  
   228  	for i := range w.propCollectors {
   229  		if err := w.propCollectors[i].Add(key, value); err != nil {
   230  			return err
   231  		}
   232  	}
   233  
   234  	w.meta.updateSeqNum(key.SeqNum())
   235  	w.meta.updateLargestPoint(key)
   236  
   237  	w.maybeAddToFilter(key.UserKey)
   238  
   239  	if w.props.NumEntries == 0 {
   240  		w.meta.SmallestPoint = key.Clone()
   241  	}
   242  	w.props.NumEntries++
   243  	switch key.Kind() {
   244  	case InternalKeyKindDelete:
   245  		w.props.NumDeletions++
   246  	case InternalKeyKindMerge:
   247  		w.props.NumMergeOperands++
   248  	}
   249  	w.props.RawKeySize += uint64(key.Size())
   250  	w.props.RawValueSize += uint64(len(value))
   251  	w.block.add(key, value)
   252  	return nil
   253  }
   254  
// addTombstone adds a range deletion tombstone covering [key.UserKey, value)
// to the range-del block. In the v2 (fragmented) format, tombstones must be
// added ordered by start key and already fragmented; overlapping tombstones
// must share identical start and end keys.
func (w *Writer) addTombstone(key InternalKey, value []byte) error {
	if !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 {
		// Check that tombstones are being added in fragmented order. If the two
		// tombstones overlap, their start and end keys must be identical.
		prevKey := base.DecodeInternalKey(w.rangeDelBlock.curKey)
		switch c := w.compare(prevKey.UserKey, key.UserKey); {
		case c > 0:
			// Start keys must be non-decreasing.
			w.err = fmt.Errorf("pebble: keys must be added in order: %s, %s", prevKey, key)
			return w.err
		case c == 0:
			// Same start key: the fragmentation invariant requires identical
			// end keys, and tombstones with equal bounds must arrive in
			// decreasing seqnum order.
			prevValue := w.rangeDelBlock.curValue
			if w.compare(prevValue, value) != 0 {
				w.err = fmt.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
					rangedel.Tombstone{Start: prevKey, End: prevValue},
					rangedel.Tombstone{Start: key, End: value})
				return w.err
			}
			if prevKey.SeqNum() <= key.SeqNum() {
				w.err = fmt.Errorf("pebble: keys must be added in order: %s, %s", prevKey, key)
				return w.err
			}
		default:
			// Strictly larger start key: the previous tombstone's end key must
			// not extend past it, otherwise the two fragments overlap.
			prevValue := w.rangeDelBlock.curValue
			if w.compare(prevValue, key.UserKey) > 0 {
				w.err = fmt.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
					rangedel.Tombstone{Start: prevKey, End: prevValue},
					rangedel.Tombstone{Start: key, End: value})
				return w.err
			}
		}
	}

	for i := range w.propCollectors {
		if err := w.propCollectors[i].Add(key, value); err != nil {
			return err
		}
	}

	w.meta.updateSeqNum(key.SeqNum())

	if w.props.NumRangeDeletions == 0 {
		// First tombstone: initialize the range key bounds. The end bound is a
		// range-deletion sentinel because tombstone end keys are exclusive
		// while sstable boundaries are inclusive.
		w.meta.SmallestRange = key.Clone()
		w.meta.LargestRange = base.MakeRangeDeleteSentinelKey(value).Clone()
	} else if w.rangeDelV1Format {
		// v1 tombstones may arrive unordered, so the bounds must be maintained
		// incrementally. (In the v2 format the bounds are recomputed in Close
		// from the first/last fragments.)
		if base.InternalCompare(w.compare, w.meta.SmallestRange, key) > 0 {
			w.meta.SmallestRange = key.Clone()
		}
		end := base.MakeRangeDeleteSentinelKey(value)
		if base.InternalCompare(w.compare, w.meta.LargestRange, end) < 0 {
			w.meta.LargestRange = end.Clone()
		}
	}
	w.props.NumEntries++
	w.props.NumDeletions++
	w.props.NumRangeDeletions++
	w.props.RawKeySize += uint64(key.Size())
	w.props.RawValueSize += uint64(len(value))
	w.rangeDelBlock.add(key, value)
	return nil
}
   315  
   316  func (w *Writer) maybeAddToFilter(key []byte) {
   317  	if w.filter != nil {
   318  		if w.split != nil {
   319  			prefix := key[:w.split(key)]
   320  			w.filter.addKey(prefix)
   321  		} else {
   322  			w.filter.addKey(key)
   323  		}
   324  	}
   325  }
   326  
   327  func (w *Writer) maybeFlush(key InternalKey, value []byte) error {
   328  	if !shouldFlush(key, value, w.block, w.blockSize, w.blockSizeThreshold) {
   329  		return nil
   330  	}
   331  
   332  	bh, err := w.finishBlock(&w.block)
   333  	if err != nil {
   334  		w.err = err
   335  		return w.err
   336  	}
   337  	w.pendingBH = bh
   338  	w.flushPendingBH(key)
   339  	return nil
   340  }
   341  
// flushPendingBH adds any pending block handle to the index entries. key is
// the first key of the next data block, used to compute a short separator; a
// zero key means there is no next block (end of table).
func (w *Writer) flushPendingBH(key InternalKey) {
	if w.pendingBH.Length == 0 {
		// A valid blockHandle must be non-zero.
		// In particular, it must have a non-zero length.
		return
	}
	// prevKey is the last key of the just-finished data block.
	// NOTE(review): this reads w.block.curKey after finishBlock has called
	// block.reset() — assumes reset preserves curKey; verify in blockWriter.
	prevKey := base.DecodeInternalKey(w.block.curKey)
	var sep InternalKey
	if key.UserKey == nil && key.Trailer == 0 {
		// End of table: use a successor of the last key as the separator.
		sep = prevKey.Successor(w.compare, w.successor, nil)
	} else {
		// Use a short separator between the last key and the next block's
		// first key.
		sep = prevKey.Separator(w.compare, w.separator, nil, key)
	}
	n := encodeBlockHandle(w.tmp[:], w.pendingBH)

	if shouldFlush(sep, w.tmp[:n], w.indexBlock, w.indexBlockSize, w.indexBlockSizeThreshold) {
		// Enable two level indexes if there is more than one index block.
		w.twoLevelIndex = true
		w.finishIndexBlock()
	}

	w.indexBlock.add(sep, w.tmp[:n])

	w.pendingBH = BlockHandle{}
}
   368  
   369  func shouldFlush(key InternalKey, value []byte, block blockWriter, blockSize, sizeThreshold int) bool {
   370  	if size := block.estimatedSize(); size < blockSize {
   371  		// The block is currently smaller than the target size.
   372  		if size <= sizeThreshold {
   373  			// The block is smaller than the threshold size at which we'll consider
   374  			// flushing it.
   375  			return false
   376  		}
   377  		newSize := size + key.Size() + len(value)
   378  		if block.nEntries%block.restartInterval == 0 {
   379  			newSize += 4
   380  		}
   381  		newSize += 4                              // varint for shared prefix length
   382  		newSize += uvarintLen(uint32(key.Size())) // varint for unshared key bytes
   383  		newSize += uvarintLen(uint32(len(value))) // varint for value size
   384  		if newSize <= blockSize {
   385  			// The block plus the new entry is smaller than the target size.
   386  			return false
   387  		}
   388  	}
   389  
   390  	return true
   391  }
   392  
   393  // finishBlock finishes the current block and returns its block handle, which is
   394  // its offset and length in the table.
   395  func (w *Writer) finishBlock(block *blockWriter) (BlockHandle, error) {
   396  	bh, err := w.writeRawBlock(block.finish(), w.compression)
   397  
   398  	// Calculate filters.
   399  	if w.filter != nil {
   400  		w.filter.finishBlock(w.meta.Size)
   401  	}
   402  
   403  	// Reset the per-block state.
   404  	block.reset()
   405  	return bh, err
   406  }
   407  
   408  // finishIndexBlock finishes the current index block and adds it to the top
   409  // level index block. This is only used when two level indexes are enabled.
   410  func (w *Writer) finishIndexBlock() {
   411  	w.indexPartitions = append(w.indexPartitions, w.indexBlock)
   412  	w.indexBlock = blockWriter{
   413  		restartInterval: 1,
   414  	}
   415  }
   416  
   417  func (w *Writer) writeTwoLevelIndex() (BlockHandle, error) {
   418  	// Add the final unfinished index.
   419  	w.finishIndexBlock()
   420  
   421  	for _, b := range w.indexPartitions {
   422  		sep := base.DecodeInternalKey(b.curKey)
   423  		bh, _ := w.writeRawBlock(b.finish(), w.compression)
   424  
   425  		if w.filter != nil {
   426  			w.filter.finishBlock(w.meta.Size)
   427  		}
   428  
   429  		n := encodeBlockHandle(w.tmp[:], bh)
   430  		w.topLevelIndexBlock.add(sep, w.tmp[:n])
   431  
   432  		w.props.IndexSize += uint64(len(b.buf))
   433  		w.props.NumDataBlocks += uint64(b.nEntries)
   434  	}
   435  
   436  	// NB: RocksDB includes the block trailer length in the index size
   437  	// property, though it doesn't include the trailer in the top level
   438  	// index size property.
   439  	w.props.IndexPartitions = uint64(len(w.indexPartitions))
   440  	w.props.TopLevelIndexSize = uint64(w.topLevelIndexBlock.estimatedSize())
   441  	w.props.IndexSize += w.props.TopLevelIndexSize + blockTrailerLen
   442  
   443  	return w.finishBlock(&w.topLevelIndexBlock)
   444  }
   445  
// writeRawBlock writes b to the file as one block: the (possibly snappy
// compressed) contents followed by the block trailer, which holds the one
// byte compression type and a CRC computed over contents plus type. It
// returns the handle (offset, length) of the contents, excluding the
// trailer, and advances w.meta.Size past everything written.
func (w *Writer) writeRawBlock(b []byte, compression Compression) (BlockHandle, error) {
	blockType := noCompressionBlockType
	if compression == SnappyCompression {
		// Compress the buffer, discarding the result if the improvement isn't at
		// least 12.5%.
		compressed := snappy.Encode(w.compressedBuf, b)
		// Keep the full-capacity slice so later calls reuse the buffer.
		w.compressedBuf = compressed[:cap(compressed)]
		if len(compressed) < len(b)-len(b)/8 {
			blockType = snappyCompressionBlockType
			b = compressed
		}
	}
	// w.tmp[:blockTrailerLen] is assembled as the trailer: type byte, then
	// the checksum.
	w.tmp[0] = blockType

	// Calculate the checksum over the block contents followed by the type
	// byte.
	checksum := crc.New(b).Update(w.tmp[:1]).Value()
	binary.LittleEndian.PutUint32(w.tmp[1:5], checksum)
	bh := BlockHandle{w.meta.Size, uint64(len(b))}

	// Write the bytes to the file.
	n, err := w.writer.Write(b)
	if err != nil {
		return BlockHandle{}, err
	}
	w.meta.Size += uint64(n)
	n, err = w.writer.Write(w.tmp[:blockTrailerLen])
	if err != nil {
		return BlockHandle{}, err
	}
	w.meta.Size += uint64(n)

	return bh, nil
}
   479  
// Close finishes writing the table and closes the underlying file that the
// table was written to.
func (w *Writer) Close() (err error) {
	defer func() {
		// Close the underlying file exactly once, even on error paths. A nil
		// syncer is how Metadata detects that the writer has been closed;
		// only surface the Close error if nothing earlier failed.
		if w.syncer == nil {
			return
		}
		err1 := w.syncer.Close()
		if err == nil {
			err = err1
		}
		w.syncer = nil
	}()
	if w.err != nil {
		return w.err
	}

	// Finish the last data block, or force an empty data block if there
	// aren't any data blocks at all.
	w.flushPendingBH(InternalKey{})
	if w.block.nEntries > 0 || w.indexBlock.nEntries == 0 {
		bh, err := w.finishBlock(&w.block)
		if err != nil {
			w.err = err
			return w.err
		}
		w.pendingBH = bh
		// Zero key: end of table, so a successor of the last key is used as
		// the index separator.
		w.flushPendingBH(InternalKey{})
	}
	w.props.DataSize = w.meta.Size

	// Write the filter block.
	var metaindex rawBlockWriter
	metaindex.restartInterval = 1
	if w.filter != nil {
		b, err := w.filter.finish()
		if err != nil {
			w.err = err
			return w.err
		}
		bh, err := w.writeRawBlock(b, NoCompression)
		if err != nil {
			w.err = err
			return w.err
		}
		n := encodeBlockHandle(w.tmp[:], bh)
		metaindex.add(InternalKey{UserKey: []byte(w.filter.metaName())}, w.tmp[:n])
		w.props.FilterPolicyName = w.filter.policyName()
		w.props.FilterSize = bh.Length
	}

	var indexBH BlockHandle
	if w.twoLevelIndex {
		w.props.IndexType = twoLevelIndex
		// Write the two level index block.
		indexBH, err = w.writeTwoLevelIndex()
		if err != nil {
			w.err = err
			return w.err
		}
	} else {
		w.props.IndexType = binarySearchIndex
		// NB: RocksDB includes the block trailer length in the index size
		// property, though it doesn't include the trailer in the filter size
		// property.
		w.props.IndexSize = uint64(w.indexBlock.estimatedSize()) + blockTrailerLen
		w.props.NumDataBlocks = uint64(w.indexBlock.nEntries)

		// Write the single level index block.
		indexBH, err = w.finishBlock(&w.indexBlock)
		if err != nil {
			w.err = err
			return w.err
		}
	}

	// Write the range-del block. The block handle must be added to the meta
	// index block after the properties block has been written. This is because
	// the entries in the metaindex block must be sorted by key.
	var rangeDelBH BlockHandle
	if w.props.NumRangeDeletions > 0 {
		if !w.rangeDelV1Format {
			// Because the range tombstones are fragmented, the end key of the last
			// added range tombstone will be the largest range tombstone key. Note
			// that we need to make this into a range deletion sentinel because
			// sstable boundaries are inclusive while the end key of a range deletion
			// tombstone is exclusive.
			w.meta.LargestRange = base.MakeRangeDeleteSentinelKey(w.rangeDelBlock.curValue)
		}
		b := w.rangeDelBlock.finish()
		rangeDelBH, err = w.writeRawBlock(b, NoCompression)
		if err != nil {
			w.err = err
			return w.err
		}
	}

	{
		// Gather user properties from the collectors before writing the
		// properties block.
		userProps := make(map[string]string)
		for i := range w.propCollectors {
			if err := w.propCollectors[i].Finish(userProps); err != nil {
				return err
			}
		}
		if len(userProps) > 0 {
			w.props.UserProperties = userProps
		}

		// Write the properties block.
		var raw rawBlockWriter
		// The restart interval is set to infinity because the properties block
		// is always read sequentially and cached in a heap located object. This
		// reduces table size without a significant impact on performance.
		raw.restartInterval = propertiesBlockRestartInterval
		w.props.CompressionOptions = rocksDBCompressionOptions
		w.props.save(&raw)
		bh, err := w.writeRawBlock(raw.finish(), NoCompression)
		if err != nil {
			w.err = err
			return w.err
		}
		n := encodeBlockHandle(w.tmp[:], bh)
		metaindex.add(InternalKey{UserKey: []byte(metaPropertiesName)}, w.tmp[:n])
	}

	// Add the range deletion block handle to the metaindex block.
	if w.props.NumRangeDeletions > 0 {
		n := encodeBlockHandle(w.tmp[:], rangeDelBH)
		// The v2 range-del block encoding is backwards compatible with the v1
		// encoding. We add meta-index entries for both the old name and the new
		// name so that old code can continue to find the range-del block and new
		// code knows that the range tombstones in the block are fragmented and
		// sorted.
		metaindex.add(InternalKey{UserKey: []byte(metaRangeDelName)}, w.tmp[:n])
		if !w.rangeDelV1Format {
			metaindex.add(InternalKey{UserKey: []byte(metaRangeDelV2Name)}, w.tmp[:n])
		}
	}

	// Write the metaindex block. It might be an empty block, if the filter
	// policy is nil.
	metaindexBH, err := w.finishBlock(&metaindex.blockWriter)
	if err != nil {
		w.err = err
		return w.err
	}

	// Write the table footer.
	footer := footer{
		format:      w.tableFormat,
		checksum:    checksumCRC32c,
		metaindexBH: metaindexBH,
		indexBH:     indexBH,
	}
	var n int
	if n, err = w.writer.Write(footer.encode(w.tmp[:])); err != nil {
		w.err = err
		return w.err
	}
	w.meta.Size += uint64(n)

	// Flush the buffer.
	if w.bufWriter != nil {
		if err := w.bufWriter.Flush(); err != nil {
			w.err = err
			return err
		}
	}

	// Durably sync the file before declaring success.
	if err := w.syncer.Sync(); err != nil {
		w.err = err
		return err
	}

	// Make any future calls to Set or Close return an error.
	w.err = errors.New("pebble: writer is closed")
	return nil
}
   658  
   659  // EstimatedSize returns the estimated size of the sstable being written if a
   660  // called to Finish() was made without adding additional keys.
   661  func (w *Writer) EstimatedSize() uint64 {
   662  	return w.meta.Size + uint64(w.block.estimatedSize()+w.indexBlock.estimatedSize())
   663  }
   664  
   665  // Metadata returns the metadata for the finished sstable. Only valid to call
   666  // after the sstable has been finished.
   667  func (w *Writer) Metadata() (*WriterMetadata, error) {
   668  	if w.syncer != nil {
   669  		return nil, errors.New("pebble: writer is not closed")
   670  	}
   671  	return &w.meta, nil
   672  }
   673  
// NewWriter returns a new table writer for the file. Closing the writer will
// close the file.
func NewWriter(f writeCloseSyncer, o *Options, lo TableOptions) *Writer {
	o = o.EnsureDefaults()
	lo = *lo.EnsureDefaults()

	w := &Writer{
		syncer: f,
		meta: WriterMetadata{
			// Start at the maximum so the first updateSeqNum call lowers it.
			SmallestSeqNum: math.MaxUint64,
		},
		// The thresholds are percentages (BlockSizeThreshold) of the target
		// block sizes, rounded up.
		blockSize:               lo.BlockSize,
		blockSizeThreshold:      (lo.BlockSize*lo.BlockSizeThreshold + 99) / 100,
		indexBlockSize:          lo.IndexBlockSize,
		indexBlockSizeThreshold: (lo.IndexBlockSize*lo.BlockSizeThreshold + 99) / 100,
		compare:                 o.Comparer.Compare,
		split:                   o.Comparer.Split,
		compression:             lo.Compression,
		separator:               o.Comparer.Separator,
		successor:               o.Comparer.Successor,
		tableFormat:             o.TableFormat,
		block: blockWriter{
			restartInterval: lo.BlockRestartInterval,
		},
		indexBlock: blockWriter{
			restartInterval: 1,
		},
		rangeDelBlock: blockWriter{
			restartInterval: 1,
		},
		topLevelIndexBlock: blockWriter{
			restartInterval: 1,
		},
	}
	if f == nil {
		// Defer the error to the first operation on the writer.
		w.err = errors.New("pebble: nil file")
		return w
	}

	// "nullptr" mirrors RocksDB's property value when no prefix extractor is
	// configured.
	w.props.PrefixExtractorName = "nullptr"
	if lo.FilterPolicy != nil {
		switch lo.FilterType {
		case TableFilter:
			w.filter = newTableFilterWriter(lo.FilterPolicy)
			if w.split != nil {
				w.props.PrefixExtractorName = o.Comparer.Name
				w.props.PrefixFiltering = true
			} else {
				w.props.WholeKeyFiltering = true
			}
		default:
			panic(fmt.Sprintf("unknown filter type: %v", lo.FilterType))
		}
	}

	w.props.ColumnFamilyID = math.MaxInt32
	w.props.ComparerName = o.Comparer.Name
	w.props.CompressionName = lo.Compression.String()
	w.props.MergerName = o.Merger.Name
	w.props.PropertyCollectorNames = "[]"
	w.props.Version = 2 // TODO(peter): what is this?

	if len(o.TablePropertyCollectors) > 0 {
		// Instantiate the collectors and record their names as a
		// bracketed, comma-separated list in the table properties.
		w.propCollectors = make([]TablePropertyCollector, len(o.TablePropertyCollectors))
		var buf bytes.Buffer
		buf.WriteString("[")
		for i := range o.TablePropertyCollectors {
			w.propCollectors[i] = o.TablePropertyCollectors[i]()
			if i > 0 {
				buf.WriteString(",")
			}
			buf.WriteString(w.propCollectors[i].Name())
		}
		buf.WriteString("]")
		w.props.PropertyCollectorNames = buf.String()
	}

	// If f does not have a Flush method, do our own buffering.
	if _, ok := f.(flusher); ok {
		w.writer = f
	} else {
		w.bufWriter = bufio.NewWriter(f)
		w.writer = w.bufWriter
	}
	return w
}