github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/compaction.go

     1  // Copyright 2013 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  	"runtime/pprof"
    14  	"sort"
    15  	"strings"
    16  	"sync/atomic"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/errors"
    20  	"github.com/cockroachdb/errors/oserror"
    21  	"github.com/zuoyebang/bitalostable/internal/base"
    22  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    23  	"github.com/zuoyebang/bitalostable/internal/manifest"
    24  	"github.com/zuoyebang/bitalostable/internal/private"
    25  	"github.com/zuoyebang/bitalostable/internal/rangedel"
    26  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    27  	"github.com/zuoyebang/bitalostable/sstable"
    28  	"github.com/zuoyebang/bitalostable/vfs"
    29  )
    30  
    31  var errEmptyTable = errors.New("bitalostable: empty table")
    32  var errFlushInvariant = errors.New("bitalostable: flush next log number is unset")
    33  
    34  var compactLabels = pprof.Labels("bitalostable", "compact")
    35  var flushLabels = pprof.Labels("bitalostable", "flush")
    36  var gcLabels = pprof.Labels("bitalostable", "gc")
    37  
    38  // expandedCompactionByteSizeLimit is the maximum number of bytes in all
    39  // compacted files. We avoid expanding the lower level file set of a compaction
    40  // if it would make the total compaction cover more than this many bytes.
    41  func expandedCompactionByteSizeLimit(opts *Options, level int, availBytes uint64) uint64 {
    42  	v := uint64(25 * opts.Level(level).TargetFileSize)
    43  
    44  	// Never expand a compaction beyond half the available capacity, divided
    45  	// by the maximum number of concurrent compactions. Each of the concurrent
    46  	// compactions may expand up to this limit, so this attempts to limit
    47  	// compactions to half of available disk space. Note that this will not
    48  	// prevent compaction picking from pursuing compactions that are larger
    49  	// than this threshold before expansion.
    50  	diskMax := (availBytes / 2) / uint64(opts.MaxConcurrentCompactions())
    51  	if v > diskMax {
    52  		v = diskMax
    53  	}
    54  	return v
    55  }
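
         // A rough worked example of the limit above (illustrative numbers, not part
         // of the original source): with a TargetFileSize of 2 MiB at this level, v
         // starts at 25 * 2 MiB = 50 MiB. If availBytes is 10 GiB and
         // MaxConcurrentCompactions() returns 3, diskMax = (10 GiB / 2) / 3, about
         // 1.7 GiB, so the 50 MiB value is returned unchanged; diskMax only becomes
         // the binding constraint when the disk is nearly full.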
    56  
     57  // maxGrandparentOverlapBytes is the maximum bytes of overlap with the grandparent
     58  // level (level+1) before we stop building a single file in a (level-1)-to-level compaction.
    59  func maxGrandparentOverlapBytes(opts *Options, level int) uint64 {
    60  	return uint64(10 * opts.Level(level).TargetFileSize)
    61  }
    62  
    63  // maxReadCompactionBytes is used to prevent read compactions which
    64  // are too wide.
    65  func maxReadCompactionBytes(opts *Options, level int) uint64 {
    66  	return uint64(10 * opts.Level(level).TargetFileSize)
    67  }
    68  
    69  // noCloseIter wraps around a FragmentIterator, intercepting and eliding
    70  // calls to Close. It is used during compaction to ensure that rangeDelIters
    71  // are not closed prematurely.
    72  type noCloseIter struct {
    73  	keyspan.FragmentIterator
    74  }
    75  
    76  func (i noCloseIter) Close() error {
    77  	return nil
    78  }
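
         // A minimal sketch of the intended usage pattern (see newInputIter below,
         // where this is done for real):
         //
         //	c.closers = append(c.closers, rangeDelIter) // closed when the compaction finishes
         //	rangeDelIter = noCloseIter{rangeDelIter}     // callers' Close() calls become no-ops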
    79  
    80  type compactionLevel struct {
    81  	level int
    82  	files manifest.LevelSlice
    83  }
    84  
     85  // compactionSplitSuggestion is the value returned by a compactionOutputSplitter.
     86  // See the comment on compactionOutputSplitter.shouldSplitBefore() for how it is used.
    87  type compactionSplitSuggestion int
    88  
    89  const (
    90  	noSplit compactionSplitSuggestion = iota
    91  	splitNow
    92  )
    93  
    94  // String implements the Stringer interface.
    95  func (c compactionSplitSuggestion) String() string {
    96  	if c == noSplit {
    97  		return "no-split"
    98  	}
    99  	return "split-now"
   100  }
   101  
   102  // compactionOutputSplitter is an interface for encapsulating logic around
   103  // switching the output of a compaction to a new output file. Additional
   104  // constraints around switching compaction outputs that are specific to that
    105  // compaction type (e.g. flush splits) are implemented in
   106  // compactionOutputSplitters that compose other child compactionOutputSplitters.
   107  type compactionOutputSplitter interface {
   108  	// shouldSplitBefore returns whether we should split outputs before the
   109  	// specified "current key". The return value is splitNow or noSplit.
   110  	// splitNow means a split is advised before the specified key, and noSplit
   111  	// means no split is advised. If shouldSplitBefore(a) advises a split then
   112  	// shouldSplitBefore(b) should also advise a split given b >= a, until
   113  	// onNewOutput is called.
   114  	shouldSplitBefore(key *InternalKey, tw *sstable.Writer) compactionSplitSuggestion
   115  	// onNewOutput updates internal splitter state when the compaction switches
   116  	// to a new sstable, and returns the next limit for the new output which
   117  	// would get used to truncate range tombstones if the compaction iterator
   118  	// runs out of keys. The limit returned MUST be > key according to the
   119  	// compaction's comparator. The specified key is the first key in the new
   120  	// output, or nil if this sstable will only contain range tombstones already
   121  	// in the fragmenter.
   122  	onNewOutput(key *InternalKey) []byte
   123  }
   124  
   125  // fileSizeSplitter is a compactionOutputSplitter that makes a determination
   126  // to split outputs based on the estimated file size of the current output.
   127  // Note that, unlike most other splitters, this splitter does not guarantee
   128  // that it will advise splits only at user key change boundaries.
   129  type fileSizeSplitter struct {
   130  	maxFileSize uint64
   131  }
   132  
   133  func (f *fileSizeSplitter) shouldSplitBefore(
   134  	key *InternalKey, tw *sstable.Writer,
   135  ) compactionSplitSuggestion {
   136  	// The Kind != RangeDelete part exists because EstimatedSize doesn't grow
    137  	// right away when a range tombstone is added to the fragmenter. It's always
   138  	// better to make a sequence of range tombstones visible to the fragmenter.
   139  	if key.Kind() != InternalKeyKindRangeDelete && tw != nil &&
   140  		tw.EstimatedSize() >= f.maxFileSize {
   141  		return splitNow
   142  	}
   143  	return noSplit
   144  }
   145  
   146  func (f *fileSizeSplitter) onNewOutput(key *InternalKey) []byte {
   147  	return nil
   148  }
   149  
   150  type limitFuncSplitter struct {
   151  	c         *compaction
   152  	limitFunc func(userKey []byte) []byte
   153  	limit     []byte
   154  }
   155  
   156  func (lf *limitFuncSplitter) shouldSplitBefore(
   157  	key *InternalKey, tw *sstable.Writer,
   158  ) compactionSplitSuggestion {
   159  	// NB: The limit must be applied using >= since lf.limit may be used as the
   160  	// `splitterSuggestion` ultimately passed to `compactionIter.Tombstones` to
    161  	// serve as an *exclusive* end boundary truncation point. If we used >, then
   162  	// we may have already added a key with the user key `lf.limit` to the
   163  	// previous sstable.
   164  	if lf.limit != nil && lf.c.cmp(key.UserKey, lf.limit) >= 0 {
   165  		return splitNow
   166  	}
   167  	return noSplit
   168  }
   169  
   170  func (lf *limitFuncSplitter) onNewOutput(key *InternalKey) []byte {
   171  	lf.limit = nil
   172  	if key != nil {
   173  		lf.limit = lf.limitFunc(key.UserKey)
   174  	} else {
   175  		// Use the start key of the first pending tombstone to find the
   176  		// next limit. All pending tombstones have the same start key.
   177  		// We use this as opposed to the end key of the
   178  		// last written sstable to effectively handle cases like these:
   179  		//
   180  		// a.SET.3
   181  		// (lf.limit at b)
   182  		// d.RANGEDEL.4:f
   183  		//
   184  		// In this case, the partition after b has only range deletions,
   185  		// so if we were to find the limit after the last written key at
   186  		// the split point (key a), we'd get the limit b again, and
   187  		// finishOutput() would not advance any further because the next
   188  		// range tombstone to write does not start until after the L0
   189  		// split point.
   190  		if startKey := lf.c.rangeDelFrag.Start(); startKey != nil {
   191  			lf.limit = lf.limitFunc(startKey)
   192  		}
   193  	}
   194  	return lf.limit
   195  }
   196  
   197  // splitterGroup is a compactionOutputSplitter that splits whenever one of its
   198  // child splitters advises a compaction split.
   199  type splitterGroup struct {
   200  	cmp       Compare
   201  	splitters []compactionOutputSplitter
   202  }
   203  
   204  func (a *splitterGroup) shouldSplitBefore(
   205  	key *InternalKey, tw *sstable.Writer,
   206  ) (suggestion compactionSplitSuggestion) {
   207  	for _, splitter := range a.splitters {
   208  		if splitter.shouldSplitBefore(key, tw) == splitNow {
   209  			return splitNow
   210  		}
   211  	}
   212  	return noSplit
   213  }
   214  
   215  func (a *splitterGroup) onNewOutput(key *InternalKey) []byte {
   216  	var earliestLimit []byte
   217  	for _, splitter := range a.splitters {
   218  		limit := splitter.onNewOutput(key)
   219  		if limit == nil {
   220  			continue
   221  		}
   222  		if earliestLimit == nil || a.cmp(limit, earliestLimit) < 0 {
   223  			earliestLimit = limit
   224  		}
   225  	}
   226  	return earliestLimit
   227  }
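
         // As an illustration of how these compose (a simplified sketch; the real
         // wiring in the compaction loop includes additional splitters): a size-based
         // splitter and a grandparent-limit splitter can be combined so that an
         // output is cut as soon as either condition fires:
         //
         //	sg := &splitterGroup{cmp: c.cmp, splitters: []compactionOutputSplitter{
         //		&fileSizeSplitter{maxFileSize: c.maxOutputFileSize},
         //		&limitFuncSplitter{c: c, limitFunc: c.findGrandparentLimit},
         //	}}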
   228  
   229  // userKeyChangeSplitter is a compactionOutputSplitter that takes in a child
   230  // splitter, and splits when 1) that child splitter has advised a split, and 2)
   231  // the compaction output is at the boundary between two user keys (also
   232  // the boundary between atomic compaction units). Use this splitter to wrap
   233  // any splitters that don't guarantee user key splits (i.e. splitters that make
   234  // their determination in ways other than comparing the current key against a
    235  // limit key). If a wrapped splitter advises a split, it must continue
   236  // to advise a split until a new output.
   237  type userKeyChangeSplitter struct {
   238  	cmp               Compare
   239  	splitter          compactionOutputSplitter
   240  	unsafePrevUserKey func() []byte
   241  }
   242  
   243  func (u *userKeyChangeSplitter) shouldSplitBefore(
   244  	key *InternalKey, tw *sstable.Writer,
   245  ) compactionSplitSuggestion {
   246  	if split := u.splitter.shouldSplitBefore(key, tw); split != splitNow {
   247  		return split
   248  	}
   249  	if u.cmp(key.UserKey, u.unsafePrevUserKey()) > 0 {
   250  		return splitNow
   251  	}
   252  	return noSplit
   253  }
   254  
   255  func (u *userKeyChangeSplitter) onNewOutput(key *InternalKey) []byte {
   256  	return u.splitter.onNewOutput(key)
   257  }
   258  
   259  // compactionFile is a vfs.File wrapper that, on every write, updates a metric
   260  // in `versions` on bytes written by in-progress compactions so far. It also
   261  // increments a per-compaction `written` int.
   262  type compactionFile struct {
   263  	vfs.File
   264  
   265  	versions *versionSet
   266  	written  *int64
   267  }
   268  
   269  // Write implements the io.Writer interface.
   270  func (c *compactionFile) Write(p []byte) (n int, err error) {
   271  	n, err = c.File.Write(p)
   272  	if err != nil {
   273  		return n, err
   274  	}
   275  
   276  	*c.written += int64(n)
   277  	c.versions.incrementCompactionBytes(int64(n))
   278  	return n, err
   279  }
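
         // A sketch of how the wrapper is attached where compaction output files are
         // created (variable names illustrative):
         //
         //	file = &compactionFile{
         //		File:     file,
         //		versions: d.mu.versions,
         //		written:  &c.bytesWritten,
         //	}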
   280  
   281  type compactionKind int
   282  
   283  const (
   284  	compactionKindDefault compactionKind = iota
   285  	compactionKindFlush
   286  	compactionKindMove
   287  	compactionKindDeleteOnly
   288  	compactionKindElisionOnly
   289  	compactionKindRead
   290  	compactionKindRewrite
   291  )
   292  
   293  func (k compactionKind) String() string {
   294  	switch k {
   295  	case compactionKindDefault:
   296  		return "default"
   297  	case compactionKindFlush:
   298  		return "flush"
   299  	case compactionKindMove:
   300  		return "move"
   301  	case compactionKindDeleteOnly:
   302  		return "delete-only"
   303  	case compactionKindElisionOnly:
   304  		return "elision-only"
   305  	case compactionKindRead:
   306  		return "read"
   307  	case compactionKindRewrite:
   308  		return "rewrite"
   309  	}
   310  	return "?"
   311  }
   312  
   313  // rangeKeyCompactionTransform is used to transform range key spans as part of the
   314  // keyspan.MergingIter. As part of this transformation step, we can elide range
   315  // keys in the last snapshot stripe, as well as coalesce range keys within
   316  // snapshot stripes.
   317  func rangeKeyCompactionTransform(
   318  	snapshots []uint64, elideRangeKey func(start, end []byte) bool,
   319  ) keyspan.Transformer {
   320  	return keyspan.TransformerFunc(func(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error {
   321  		elideInLastStripe := func(keys []keyspan.Key) []keyspan.Key {
   322  			// Unsets and deletes in the last snapshot stripe can be elided.
   323  			k := 0
   324  			for j := range keys {
   325  				if elideRangeKey(s.Start, s.End) &&
   326  					(keys[j].Kind() == InternalKeyKindRangeKeyUnset || keys[j].Kind() == InternalKeyKindRangeKeyDelete) {
   327  					continue
   328  				}
   329  				keys[k] = keys[j]
   330  				k++
   331  			}
   332  			keys = keys[:k]
   333  			return keys
   334  		}
    335  		// snapshots are in ascending order, while s.Keys are in descending seqnum
   336  		// order. Partition s.keys by snapshot stripes, and call rangekey.Coalesce
   337  		// on each partition.
   338  		dst.Start = s.Start
   339  		dst.End = s.End
   340  		dst.Keys = dst.Keys[:0]
   341  		i, j := len(snapshots)-1, 0
   342  		usedLen := 0
   343  		for i >= 0 {
   344  			start := j
   345  			for j < len(s.Keys) && !base.Visible(s.Keys[j].SeqNum(), snapshots[i]) {
   346  				// Include j in current partition.
   347  				j++
   348  			}
   349  			if j > start {
   350  				keysDst := dst.Keys[usedLen:cap(dst.Keys)]
   351  				if err := rangekey.Coalesce(cmp, s.Keys[start:j], &keysDst); err != nil {
   352  					return err
   353  				}
   354  				if j == len(s.Keys) {
   355  					// This is the last snapshot stripe. Unsets and deletes can be elided.
   356  					keysDst = elideInLastStripe(keysDst)
   357  				}
   358  				usedLen += len(keysDst)
   359  				dst.Keys = append(dst.Keys, keysDst...)
   360  			}
   361  			i--
   362  		}
   363  		if j < len(s.Keys) {
   364  			keysDst := dst.Keys[usedLen:cap(dst.Keys)]
   365  			if err := rangekey.Coalesce(cmp, s.Keys[j:], &keysDst); err != nil {
   366  				return err
   367  			}
   368  			keysDst = elideInLastStripe(keysDst)
   369  			usedLen += len(keysDst)
   370  			dst.Keys = append(dst.Keys, keysDst...)
   371  		}
   372  		return nil
   373  	})
   374  }
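
         // A worked example of the stripe partitioning above (hypothetical seqnums):
         // with snapshots = [5, 10] and s.Keys carrying seqnums [12, 8, 3] in
         // descending order, the loop forms the partitions [12] (not visible at
         // snapshot 10) and [8] (not visible at snapshot 5), and the trailing block
         // handles [3] as the last snapshot stripe, where RangeKeyUnset and
         // RangeKeyDelete keys may additionally be elided. Each partition is
         // coalesced independently so that every snapshot still observes the keys
         // it needs.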
   375  
   376  // compaction is a table compaction from one level to the next, starting from a
   377  // given version.
   378  type compaction struct {
   379  	kind      compactionKind
   380  	cmp       Compare
   381  	equal     Equal
   382  	comparer  *base.Comparer
   383  	formatKey base.FormatKey
   384  	logger    Logger
   385  	version   *version
   386  	stats     base.InternalIteratorStats
   387  
   388  	score float64
   389  
   390  	// startLevel is the level that is being compacted. Inputs from startLevel
   391  	// and outputLevel will be merged to produce a set of outputLevel files.
   392  	startLevel *compactionLevel
   393  
   394  	// outputLevel is the level that files are being produced in. outputLevel is
   395  	// equal to startLevel+1 except when:
   396  	//    - if startLevel is 0, the output level equals compactionPicker.baseLevel().
   397  	//    - in multilevel compaction, the output level is the lowest level involved in
   398  	//      the compaction
   399  	outputLevel *compactionLevel
   400  
   401  	// extraLevels point to additional levels in between the input and output
   402  	// levels that get compacted in multilevel compactions
   403  	extraLevels []*compactionLevel
   404  
   405  	inputs []compactionLevel
   406  
   407  	// maxOutputFileSize is the maximum size of an individual table created
   408  	// during compaction.
   409  	maxOutputFileSize uint64
   410  	// maxOverlapBytes is the maximum number of bytes of overlap allowed for a
   411  	// single output table with the tables in the grandparent level.
   412  	maxOverlapBytes uint64
   413  	// disableSpanElision disables elision of range tombstones and range keys. Used
   414  	// by tests to allow range tombstones or range keys to be added to tables where
   415  	// they would otherwise be elided.
   416  	disableSpanElision bool
   417  
   418  	// flushing contains the flushables (aka memtables) that are being flushed.
   419  	flushing flushableList
   420  	// bytesIterated contains the number of bytes that have been flushed/compacted.
   421  	bytesIterated uint64
   422  	// bytesWritten contains the number of bytes that have been written to outputs.
   423  	bytesWritten int64
   424  
   425  	// The boundaries of the input data.
   426  	smallest InternalKey
   427  	largest  InternalKey
   428  
   429  	// The range deletion tombstone fragmenter. Adds range tombstones as they are
   430  	// returned from `compactionIter` and fragments them for output to files.
   431  	// Referenced by `compactionIter` which uses it to check whether keys are deleted.
   432  	rangeDelFrag keyspan.Fragmenter
   433  	// The range key fragmenter. Similar to rangeDelFrag in that it gets range
   434  	// keys from the compaction iter and fragments them for output to files.
   435  	rangeKeyFrag keyspan.Fragmenter
   436  	// The range deletion tombstone iterator, that merges and fragments
   437  	// tombstones across levels. This iterator is included within the compaction
   438  	// input iterator as a single level.
   439  	// TODO(jackson): Remove this when the refactor of FragmentIterator,
   440  	// InterleavingIterator, etc is complete.
   441  	rangeDelIter keyspan.InternalIteratorShim
   442  	// rangeKeyInterleaving is the interleaving iter for range keys.
   443  	rangeKeyInterleaving keyspan.InterleavingIter
   444  
   445  	// A list of objects to close when the compaction finishes. Used by input
   446  	// iteration to keep rangeDelIters open for the lifetime of the compaction,
   447  	// and only close them when the compaction finishes.
   448  	closers []io.Closer
   449  
   450  	// grandparents are the tables in level+2 that overlap with the files being
    451  	// compacted. Used to determine output table boundaries. Do not assume that
    452  	// the grandparent level will still contain the same files when this compaction finishes.
   453  	grandparents manifest.LevelSlice
   454  
   455  	// Boundaries at which flushes to L0 should be split. Determined by
   456  	// L0Sublevels. If nil, flushes aren't split.
   457  	l0Limits [][]byte
   458  
   459  	// L0 sublevel info is used for compactions out of L0. It is nil for all
   460  	// other compactions.
   461  	l0SublevelInfo []sublevelInfo
   462  
   463  	// List of disjoint inuse key ranges the compaction overlaps with in
   464  	// grandparent and lower levels. See setupInuseKeyRanges() for the
   465  	// construction. Used by elideTombstone() and elideRangeTombstone() to
   466  	// determine if keys affected by a tombstone possibly exist at a lower level.
   467  	inuseKeyRanges []manifest.UserKeyRange
   468  	// inuseEntireRange is set if the above inuse key ranges wholly contain the
   469  	// compaction's key range. This allows compactions in higher levels to often
   470  	// elide key comparisons.
   471  	inuseEntireRange    bool
   472  	elideTombstoneIndex int
   473  
   474  	// allowedZeroSeqNum is true if seqnums can be zeroed if there are no
   475  	// snapshots requiring them to be kept. This determination is made by
   476  	// looking for an sstable which overlaps the bounds of the compaction at a
   477  	// lower level in the LSM during runCompaction.
   478  	allowedZeroSeqNum bool
   479  
   480  	metrics map[int]*LevelMetrics
   481  }
   482  
   483  func (c *compaction) makeInfo(jobID int) CompactionInfo {
   484  	info := CompactionInfo{
   485  		JobID:  jobID,
   486  		Reason: c.kind.String(),
   487  		Input:  make([]LevelInfo, 0, len(c.inputs)),
   488  	}
   489  	for _, cl := range c.inputs {
   490  		inputInfo := LevelInfo{Level: cl.level, Tables: nil}
   491  		iter := cl.files.Iter()
   492  		for m := iter.First(); m != nil; m = iter.Next() {
   493  			inputInfo.Tables = append(inputInfo.Tables, m.TableInfo())
   494  		}
   495  		info.Input = append(info.Input, inputInfo)
   496  	}
   497  	if c.outputLevel != nil {
   498  		info.Output.Level = c.outputLevel.level
   499  
    500  		// If there are no inputs from the output level (e.g., a move
   501  		// compaction), add an empty LevelInfo to info.Input.
   502  		if len(c.inputs) > 0 && c.inputs[len(c.inputs)-1].level != c.outputLevel.level {
   503  			info.Input = append(info.Input, LevelInfo{Level: c.outputLevel.level})
   504  		}
   505  	} else {
   506  		// For a delete-only compaction, set the output level to L6. The
   507  		// output level is not meaningful here, but complicating the
   508  		// info.Output interface with a pointer doesn't seem worth the
   509  		// semantic distinction.
   510  		info.Output.Level = numLevels - 1
   511  	}
   512  	return info
   513  }
   514  
   515  func newCompaction(pc *pickedCompaction, opts *Options) *compaction {
   516  	c := &compaction{
   517  		kind:              compactionKindDefault,
   518  		cmp:               pc.cmp,
   519  		equal:             opts.equal(),
   520  		comparer:          opts.Comparer,
   521  		formatKey:         opts.Comparer.FormatKey,
   522  		score:             pc.score,
   523  		inputs:            pc.inputs,
   524  		smallest:          pc.smallest,
   525  		largest:           pc.largest,
   526  		logger:            opts.Logger,
   527  		version:           pc.version,
   528  		maxOutputFileSize: pc.maxOutputFileSize,
   529  		maxOverlapBytes:   pc.maxOverlapBytes,
   530  		l0SublevelInfo:    pc.l0SublevelInfo,
   531  	}
   532  	c.startLevel = &c.inputs[0]
   533  	c.outputLevel = &c.inputs[1]
   534  
   535  	if len(pc.extraLevels) > 0 {
   536  		c.extraLevels = pc.extraLevels
   537  		c.outputLevel = &c.inputs[len(c.inputs)-1]
   538  	}
   539  	// Compute the set of outputLevel+1 files that overlap this compaction (these
   540  	// are the grandparent sstables).
   541  	if c.outputLevel.level+1 < numLevels {
   542  		c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.cmp,
   543  			c.smallest.UserKey, c.largest.UserKey, c.largest.IsExclusiveSentinel())
   544  	}
   545  	c.setupInuseKeyRanges()
   546  
   547  	c.kind = pc.kind
   548  	if c.kind == compactionKindDefault && c.outputLevel.files.Empty() && !c.hasExtraLevelData() &&
   549  		c.startLevel.files.Len() == 1 && c.grandparents.SizeSum() <= c.maxOverlapBytes {
   550  		// This compaction can be converted into a trivial move from one level
   551  		// to the next. We avoid such a move if there is lots of overlapping
   552  		// grandparent data. Otherwise, the move could create a parent file
   553  		// that will require a very expensive merge later on.
   554  		c.kind = compactionKindMove
   555  	}
   556  	return c
   557  }
   558  
   559  func newDeleteOnlyCompaction(opts *Options, cur *version, inputs []compactionLevel) *compaction {
   560  	c := &compaction{
   561  		kind:      compactionKindDeleteOnly,
   562  		cmp:       opts.Comparer.Compare,
   563  		equal:     opts.equal(),
   564  		comparer:  opts.Comparer,
   565  		formatKey: opts.Comparer.FormatKey,
   566  		logger:    opts.Logger,
   567  		version:   cur,
   568  		inputs:    inputs,
   569  	}
   570  
   571  	// Set c.smallest, c.largest.
   572  	files := make([]manifest.LevelIterator, 0, len(inputs))
   573  	for _, in := range inputs {
   574  		files = append(files, in.files.Iter())
   575  	}
   576  	c.smallest, c.largest = manifest.KeyRange(opts.Comparer.Compare, files...)
   577  	return c
   578  }
   579  
   580  func adjustGrandparentOverlapBytesForFlush(c *compaction, flushingBytes uint64) {
   581  	// Heuristic to place a lower bound on compaction output file size
   582  	// caused by Lbase. Prior to this heuristic we have observed an L0 in
   583  	// production with 310K files of which 290K files were < 10KB in size.
   584  	// Our hypothesis is that it was caused by L1 having 2600 files and
   585  	// ~10GB, such that each flush got split into many tiny files due to
   586  	// overlapping with most of the files in Lbase.
   587  	//
   588  	// The computation below is general in that it accounts
   589  	// for flushing different volumes of data (e.g. we may be flushing
   590  	// many memtables). For illustration, we consider the typical
   591  	// example of flushing a 64MB memtable. So 12.8MB output,
   592  	// based on the compression guess below. If the compressed bytes
   593  	// guess is an over-estimate we will end up with smaller files,
   594  	// and if an under-estimate we will end up with larger files.
   595  	// With a 2MB target file size, 7 files. We are willing to accept
   596  	// 4x the number of files, if it results in better write amplification
   597  	// when later compacting to Lbase, i.e., ~450KB files (target file
   598  	// size / 4).
   599  	//
   600  	// Note that this is a pessimistic heuristic in that
   601  	// fileCountUpperBoundDueToGrandparents could be far from the actual
   602  	// number of files produced due to the grandparent limits. For
   603  	// example, in the extreme, consider a flush that overlaps with 1000
   604  	// files in Lbase f0...f999, and the initially calculated value of
   605  	// maxOverlapBytes will cause splits at f10, f20,..., f990, which
   606  	// means an upper bound file count of 100 files. Say the input bytes
   607  	// in the flush are such that acceptableFileCount=10. We will fatten
   608  	// up maxOverlapBytes by 10x to ensure that the upper bound file count
   609  	// drops to 10. However, it is possible that in practice, even without
   610  	// this change, we would have produced no more than 10 files, and that
   611  	// this change makes the files unnecessarily wide. Say the input bytes
   612  	// are distributed such that 10% are in f0...f9, 10% in f10...f19, ...
   613  	// 10% in f80...f89 and 10% in f990...f999. The original value of
   614  	// maxOverlapBytes would have actually produced only 10 sstables. But
   615  	// by increasing maxOverlapBytes by 10x, we may produce 1 sstable that
   616  	// spans f0...f89, i.e., a much wider sstable than necessary.
   617  	//
   618  	// We could produce a tighter estimate of
   619  	// fileCountUpperBoundDueToGrandparents if we had knowledge of the key
   620  	// distribution of the flush. The 4x multiplier mentioned earlier is
   621  	// a way to try to compensate for this pessimism.
   622  	//
   623  	// TODO(sumeer): we don't have compression info for the data being
   624  	// flushed, but it is likely that existing files that overlap with
   625  	// this flush in Lbase are representative wrt compression ratio. We
   626  	// could store the uncompressed size in FileMetadata and estimate
   627  	// the compression ratio.
   628  	const approxCompressionRatio = 0.2
   629  	approxOutputBytes := approxCompressionRatio * float64(flushingBytes)
   630  	approxNumFilesBasedOnTargetSize :=
   631  		int(math.Ceil(approxOutputBytes / float64(c.maxOutputFileSize)))
   632  	acceptableFileCount := float64(4 * approxNumFilesBasedOnTargetSize)
   633  	// The byte calculation is linear in numGrandparentFiles, but we will
   634  	// incur this linear cost in findGrandparentLimit too, so we are also
   635  	// willing to pay it now. We could approximate this cheaply by using
   636  	// the mean file size of Lbase.
   637  	grandparentFileBytes := c.grandparents.SizeSum()
   638  	fileCountUpperBoundDueToGrandparents :=
   639  		float64(grandparentFileBytes) / float64(c.maxOverlapBytes)
   640  	if fileCountUpperBoundDueToGrandparents > acceptableFileCount {
   641  		c.maxOverlapBytes = uint64(
   642  			float64(c.maxOverlapBytes) *
   643  				(fileCountUpperBoundDueToGrandparents / acceptableFileCount))
   644  	}
   645  }
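
         // Plugging concrete numbers into the heuristic above (illustrative only): a
         // 64 MB flush yields approxOutputBytes = 0.2 * 64 MB = 12.8 MB. With a 2 MB
         // target file size that is ceil(12.8 / 2) = 7 files, so acceptableFileCount
         // = 28. If the grandparents sum to 1000 MB and maxOverlapBytes is 20 MB,
         // fileCountUpperBoundDueToGrandparents = 50 > 28, so maxOverlapBytes is
         // scaled up by 50/28, i.e. roughly 1.8x.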
   646  
   647  func newFlush(opts *Options, cur *version, baseLevel int, flushing flushableList) *compaction {
   648  	c := &compaction{
   649  		kind:              compactionKindFlush,
   650  		cmp:               opts.Comparer.Compare,
   651  		equal:             opts.equal(),
   652  		comparer:          opts.Comparer,
   653  		formatKey:         opts.Comparer.FormatKey,
   654  		logger:            opts.Logger,
   655  		version:           cur,
   656  		inputs:            []compactionLevel{{level: -1}, {level: 0}},
   657  		maxOutputFileSize: math.MaxUint64,
   658  		maxOverlapBytes:   math.MaxUint64,
   659  		flushing:          flushing,
   660  	}
   661  	c.startLevel = &c.inputs[0]
   662  	c.outputLevel = &c.inputs[1]
   663  	if cur.L0Sublevels != nil {
   664  		c.l0Limits = cur.L0Sublevels.FlushSplitKeys()
   665  	}
   666  
   667  	smallestSet, largestSet := false, false
   668  	updatePointBounds := func(iter internalIterator) {
   669  		if key, _ := iter.First(); key != nil {
   670  			if !smallestSet ||
   671  				base.InternalCompare(c.cmp, c.smallest, *key) > 0 {
   672  				smallestSet = true
   673  				c.smallest = key.Clone()
   674  			}
   675  		}
   676  		if key, _ := iter.Last(); key != nil {
   677  			if !largestSet ||
   678  				base.InternalCompare(c.cmp, c.largest, *key) < 0 {
   679  				largestSet = true
   680  				c.largest = key.Clone()
   681  			}
   682  		}
   683  	}
   684  
   685  	updateRangeBounds := func(iter keyspan.FragmentIterator) {
   686  		// File bounds require s != nil && !s.Empty(). We only need to check for
   687  		// s != nil here, as the memtable's FragmentIterator would never surface
   688  		// empty spans.
   689  		if s := iter.First(); s != nil {
   690  			if key := s.SmallestKey(); !smallestSet ||
   691  				base.InternalCompare(c.cmp, c.smallest, key) > 0 {
   692  				smallestSet = true
   693  				c.smallest = key.Clone()
   694  			}
   695  		}
   696  		if s := iter.Last(); s != nil {
   697  			if key := s.LargestKey(); !largestSet ||
   698  				base.InternalCompare(c.cmp, c.largest, key) < 0 {
   699  				largestSet = true
   700  				c.largest = key.Clone()
   701  			}
   702  		}
   703  	}
   704  
   705  	var flushingBytes uint64
   706  	for i := range flushing {
   707  		f := flushing[i]
   708  		updatePointBounds(f.newIter(nil))
   709  		if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil {
   710  			updateRangeBounds(rangeDelIter)
   711  		}
   712  		if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
   713  			updateRangeBounds(rangeKeyIter)
   714  		}
   715  		flushingBytes += f.inuseBytes()
   716  	}
   717  
   718  	if opts.FlushSplitBytes > 0 {
   719  		c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize)
   720  		c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0)
   721  		c.grandparents = c.version.Overlaps(baseLevel, c.cmp, c.smallest.UserKey,
   722  			c.largest.UserKey, c.largest.IsExclusiveSentinel())
   723  		adjustGrandparentOverlapBytesForFlush(c, flushingBytes)
   724  	}
   725  
   726  	c.setupInuseKeyRanges()
   727  	return c
   728  }
   729  
   730  func (c *compaction) hasExtraLevelData() bool {
   731  	if len(c.extraLevels) == 0 {
   732  		// not a multi level compaction
   733  		return false
   734  	} else if c.extraLevels[0].files.Empty() {
   735  		// a multi level compaction without data in the intermediate input level;
    736  		// e.g. for a multi level compaction with levels 4, 5, and 6, this could
    737  		// occur if there are no files to compact in 5, or in 5 and 6 (i.e. a move).
   738  		return false
   739  	}
   740  	return true
   741  }
   742  
   743  func (c *compaction) setupInuseKeyRanges() {
   744  	level := c.outputLevel.level + 1
   745  	if c.outputLevel.level == 0 {
   746  		level = 0
   747  	}
   748  	// calculateInuseKeyRanges will return a series of sorted spans. Overlapping
   749  	// or abutting spans have already been merged.
   750  	c.inuseKeyRanges = calculateInuseKeyRanges(
   751  		c.version, c.cmp, level, numLevels-1, c.smallest.UserKey, c.largest.UserKey,
   752  	)
   753  	// Check if there's a single in-use span that encompasses the entire key
   754  	// range of the compaction. This is an optimization to avoid key comparisons
   755  	// against inuseKeyRanges during the compaction when every key within the
   756  	// compaction overlaps with an in-use span.
   757  	if len(c.inuseKeyRanges) > 0 {
   758  		c.inuseEntireRange = c.cmp(c.inuseKeyRanges[0].Start, c.smallest.UserKey) <= 0 &&
   759  			c.cmp(c.inuseKeyRanges[0].End, c.largest.UserKey) >= 0
   760  	}
   761  }
   762  
   763  func calculateInuseKeyRanges(
   764  	v *version, cmp base.Compare, level, maxLevel int, smallest, largest []byte,
   765  ) []manifest.UserKeyRange {
   766  	// Use two slices, alternating which one is input and which one is output
   767  	// as we descend the LSM.
   768  	var input, output []manifest.UserKeyRange
   769  
   770  	// L0 requires special treatment, since sstables within L0 may overlap.
   771  	// We use the L0 Sublevels structure to efficiently calculate the merged
   772  	// in-use key ranges.
   773  	if level == 0 {
   774  		output = v.L0Sublevels.InUseKeyRanges(smallest, largest)
   775  		level++
   776  	}
   777  
   778  	for ; level <= maxLevel; level++ {
   779  		// NB: We always treat `largest` as inclusive for simplicity, because
   780  		// there's little consequence to calculating slightly broader in-use key
   781  		// ranges.
   782  		overlaps := v.Overlaps(level, cmp, smallest, largest, false /* exclusiveEnd */)
   783  		iter := overlaps.Iter()
   784  
   785  		// We may already have in-use key ranges from higher levels. Iterate
   786  		// through both our accumulated in-use key ranges and this level's
   787  		// files, merging the two.
   788  		//
   789  		// Tables higher within the LSM have broader key spaces. We use this
   790  		// when possible to seek past a level's files that are contained by
   791  		// our current accumulated in-use key ranges. This helps avoid
   792  		// per-sstable work during flushes or compactions in high levels which
   793  		// overlap the majority of the LSM's sstables.
   794  		input, output = output, input
   795  		output = output[:0]
   796  
   797  		var currFile *fileMetadata
   798  		var currAccum *manifest.UserKeyRange
   799  		if len(input) > 0 {
   800  			currAccum, input = &input[0], input[1:]
   801  		}
   802  
   803  		// If we have an accumulated key range and its start is ≤ smallest,
   804  		// we can seek to the accumulated range's end. Otherwise, we need to
   805  		// start at the first overlapping file within the level.
   806  		if currAccum != nil && cmp(currAccum.Start, smallest) <= 0 {
   807  			currFile = seekGT(&iter, cmp, currAccum.End)
   808  		} else {
   809  			currFile = iter.First()
   810  		}
   811  
   812  		for currFile != nil || currAccum != nil {
   813  			// If we've exhausted either the files in the level or the
   814  			// accumulated key ranges, we just need to append the one we have.
   815  			// If we have both a currFile and a currAccum, they either overlap
   816  			// or they're disjoint. If they're disjoint, we append whichever
   817  			// one sorts first and move on to the next file or range. If they
   818  			// overlap, we merge them into currAccum and proceed to the next
   819  			// file.
   820  			switch {
   821  			case currAccum == nil || (currFile != nil && cmp(currFile.Largest.UserKey, currAccum.Start) < 0):
   822  				// This file is strictly before the current accumulated range,
   823  				// or there are no more accumulated ranges.
   824  				output = append(output, manifest.UserKeyRange{
   825  					Start: currFile.Smallest.UserKey,
   826  					End:   currFile.Largest.UserKey,
   827  				})
   828  				currFile = iter.Next()
   829  			case currFile == nil || (currAccum != nil && cmp(currAccum.End, currFile.Smallest.UserKey) < 0):
   830  				// The current accumulated key range is strictly before the
   831  				// current file, or there are no more files.
   832  				output = append(output, *currAccum)
   833  				currAccum = nil
   834  				if len(input) > 0 {
   835  					currAccum, input = &input[0], input[1:]
   836  				}
   837  			default:
   838  				// The current accumulated range and the current file overlap.
   839  				// Adjust the accumulated range to be the union.
   840  				if cmp(currFile.Smallest.UserKey, currAccum.Start) < 0 {
   841  					currAccum.Start = currFile.Smallest.UserKey
   842  				}
   843  				if cmp(currFile.Largest.UserKey, currAccum.End) > 0 {
   844  					currAccum.End = currFile.Largest.UserKey
   845  				}
   846  
   847  				// Extending `currAccum`'s end boundary may have caused it to
   848  				// overlap with `input` key ranges that we haven't processed
   849  				// yet. Merge any such key ranges.
   850  				for len(input) > 0 && cmp(input[0].Start, currAccum.End) <= 0 {
   851  					if cmp(input[0].End, currAccum.End) > 0 {
   852  						currAccum.End = input[0].End
   853  					}
   854  					input = input[1:]
   855  				}
   856  				// Seek the level iterator past our current accumulated end.
   857  				currFile = seekGT(&iter, cmp, currAccum.End)
   858  			}
   859  		}
   860  	}
   861  	return output
   862  }
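
         // To illustrate one level of the merge (a sketch): suppose the accumulated
         // ranges from higher levels are [a,c] and [h,k], and this level contains
         // files spanning [b,e] and [j,m]. [a,c] overlaps [b,e] and widens to [a,e];
         // [h,k] overlaps [j,m] and widens to [h,m]; the output passed down to the
         // next level is [[a,e], [h,m]].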
   863  
   864  func seekGT(iter *manifest.LevelIterator, cmp base.Compare, key []byte) *manifest.FileMetadata {
   865  	f := iter.SeekGE(cmp, key)
   866  	for f != nil && cmp(f.Largest.UserKey, key) == 0 {
   867  		f = iter.Next()
   868  	}
   869  	return f
   870  }
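
         // For example (a sketch): with level files [a-c], [c-f], and [g-h], seekGT
         // with key c first lands on [a-c] via SeekGE; since that file's largest user
         // key equals c it cannot contain anything past c, so the iterator advances
         // and [c-f] is returned.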
   871  
   872  // findGrandparentLimit takes the start user key for a table and returns the
   873  // user key to which that table can extend without excessively overlapping
   874  // the grandparent level. If no limit is needed considering the grandparent
   875  // files, this function returns nil. This is done in order to prevent a table
   876  // at level N from overlapping too much data at level N+1. We want to avoid
   877  // such large overlaps because they translate into large compactions. The
   878  // current heuristic stops output of a table if the addition of another key
   879  // would cause the table to overlap more than 10x the target file size at
   880  // level N. See maxGrandparentOverlapBytes.
   881  func (c *compaction) findGrandparentLimit(start []byte) []byte {
   882  	iter := c.grandparents.Iter()
   883  	var overlappedBytes uint64
   884  	var greater bool
   885  	for f := iter.SeekGE(c.cmp, start); f != nil; f = iter.Next() {
   886  		overlappedBytes += f.Size
   887  		// To ensure forward progress we always return a larger user
   888  		// key than where we started. See comments above clients of
   889  		// this function for how this is used.
   890  		greater = greater || c.cmp(f.Smallest.UserKey, start) > 0
   891  		if !greater {
   892  			continue
   893  		}
   894  
   895  		// We return the smallest bound of a sstable rather than the
   896  		// largest because the smallest is always inclusive, and limits
    897  		// are used exclusively when truncating range tombstones. If we
   898  		// truncated an output to the largest key while there's a
   899  		// pending tombstone, the next output file would also overlap
   900  		// the same grandparent f.
   901  		if overlappedBytes > c.maxOverlapBytes {
   902  			return f.Smallest.UserKey
   903  		}
   904  	}
   905  	return nil
   906  }
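
         // For example (a sketch): starting at key a with maxOverlapBytes = 20 MB and
         // grandparents g1 [a-c] (15 MB) and g2 [d-f] (10 MB): after g1 the overlap
         // is 15 MB and g1's smallest key is not greater than a; after g2 the overlap
         // is 25 MB > 20 MB and g2's smallest key d is greater than a, so d is
         // returned and the current output is cut before d.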
   907  
   908  // findL0Limit takes the start key for a table and returns the user key to which
   909  // that table can be extended without hitting the next l0Limit. Having flushed
   910  // sstables "bridging across" an l0Limit could lead to increased L0 -> LBase
   911  // compaction sizes as well as elevated read amplification.
   912  func (c *compaction) findL0Limit(start []byte) []byte {
   913  	if c.startLevel.level > -1 || c.outputLevel.level != 0 || len(c.l0Limits) == 0 {
   914  		return nil
   915  	}
   916  	index := sort.Search(len(c.l0Limits), func(i int) bool {
   917  		return c.cmp(c.l0Limits[i], start) > 0
   918  	})
   919  	if index < len(c.l0Limits) {
   920  		return c.l0Limits[index]
   921  	}
   922  	return nil
   923  }
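
         // For instance (hypothetical flush split keys): with c.l0Limits = [d, m, t]
         // and start = f, sort.Search selects the first limit greater than f, namely
         // m, so the flushed sstable beginning at f is cut before it crosses m.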
   924  
   925  // errorOnUserKeyOverlap returns an error if the last two written sstables in
   926  // this compaction have revisions of the same user key present in both sstables,
    927  // when it shouldn't (e.g. when splitting flushes).
   928  func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error {
   929  	if n := len(ve.NewFiles); n > 1 {
   930  		meta := ve.NewFiles[n-1].Meta
   931  		prevMeta := ve.NewFiles[n-2].Meta
   932  		if !prevMeta.Largest.IsExclusiveSentinel() &&
   933  			c.cmp(prevMeta.Largest.UserKey, meta.Smallest.UserKey) >= 0 {
   934  			return errors.Errorf("bitalostable: compaction split user key across two sstables: %s in %s and %s",
   935  				prevMeta.Largest.Pretty(c.formatKey),
   936  				prevMeta.FileNum,
   937  				meta.FileNum)
   938  		}
   939  	}
   940  	return nil
   941  }
   942  
    943  // allowZeroSeqNum returns true if seqnums can be zeroed if there are no
   944  // snapshots requiring them to be kept. It performs this determination by
   945  // looking for an sstable which overlaps the bounds of the compaction at a
   946  // lower level in the LSM.
   947  func (c *compaction) allowZeroSeqNum() bool {
   948  	return c.elideRangeTombstone(c.smallest.UserKey, c.largest.UserKey)
   949  }
   950  
   951  // elideTombstone returns true if it is ok to elide a tombstone for the
   952  // specified key. A return value of true guarantees that there are no key/value
    953  // pairs at c.outputLevel.level+1 or higher that possibly contain the specified user
   954  // key. The keys in multiple invocations to elideTombstone must be supplied in
   955  // order.
   956  func (c *compaction) elideTombstone(key []byte) bool {
   957  	if c.inuseEntireRange || len(c.flushing) != 0 {
   958  		return false
   959  	}
   960  
   961  	for ; c.elideTombstoneIndex < len(c.inuseKeyRanges); c.elideTombstoneIndex++ {
   962  		r := &c.inuseKeyRanges[c.elideTombstoneIndex]
   963  		if c.cmp(key, r.End) <= 0 {
   964  			if c.cmp(key, r.Start) >= 0 {
   965  				return false
   966  			}
   967  			break
   968  		}
   969  	}
   970  	return true
   971  }
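
         // For example (a sketch): with inuseKeyRanges = [[b,e], [j,n]], a point
         // tombstone at key g lies between the two ranges, so it may be elided
         // (return true); a tombstone at key c lies inside [b,e] and must be kept
         // (return false). Because callers supply keys in order, elideTombstoneIndex
         // only ever advances rather than re-scanning from the start.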
   972  
   973  // elideRangeTombstone returns true if it is ok to elide the specified range
   974  // tombstone. A return value of true guarantees that there are no key/value
   975  // pairs at c.outputLevel.level+1 or higher that possibly overlap the specified
   976  // tombstone.
   977  func (c *compaction) elideRangeTombstone(start, end []byte) bool {
   978  	// Disable range tombstone elision if the testing knob for that is enabled,
   979  	// or if we are flushing memtables. The latter requirement is due to
   980  	// inuseKeyRanges not accounting for key ranges in other memtables that are
   981  	// being flushed in the same compaction. It's possible for a range tombstone
   982  	// in one memtable to overlap keys in a preceding memtable in c.flushing.
   983  	//
   984  	// This function is also used in setting allowZeroSeqNum, so disabling
   985  	// elision of range tombstones also disables zeroing of SeqNums.
   986  	//
   987  	// TODO(peter): we disable zeroing of seqnums during flushing to match
   988  	// RocksDB behavior and to avoid generating overlapping sstables during
   989  	// DB.replayWAL. When replaying WAL files at startup, we flush after each
   990  	// WAL is replayed building up a single version edit that is
   991  	// applied. Because we don't apply the version edit after each flush, this
   992  	// code doesn't know that L0 contains files and zeroing of seqnums should
   993  	// be disabled. That is fixable, but it seems safer to just match the
   994  	// RocksDB behavior for now.
   995  	if c.disableSpanElision || len(c.flushing) != 0 {
   996  		return false
   997  	}
   998  
   999  	lower := sort.Search(len(c.inuseKeyRanges), func(i int) bool {
  1000  		return c.cmp(c.inuseKeyRanges[i].End, start) >= 0
  1001  	})
  1002  	upper := sort.Search(len(c.inuseKeyRanges), func(i int) bool {
  1003  		return c.cmp(c.inuseKeyRanges[i].Start, end) > 0
  1004  	})
  1005  	return lower >= upper
  1006  }
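
         // A worked example of the two binary searches (a sketch): with
         // inuseKeyRanges = [[b,e], [j,n]] and a tombstone spanning [f, i), lower is
         // the first range whose End >= f (index 1, [j,n]) and upper is the first
         // range whose Start > i (also index 1), so lower >= upper and the tombstone
         // may be elided. A tombstone spanning [f, k) instead yields upper = 2 and
         // must be kept.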
  1007  
  1008  // elideRangeKey returns true if it is ok to elide the specified range key. A
  1009  // return value of true guarantees that there are no key/value pairs at
  1010  // c.outputLevel.level+1 or higher that possibly overlap the specified range key.
  1011  func (c *compaction) elideRangeKey(start, end []byte) bool {
  1012  	// TODO(bilal): Track inuseKeyRanges separately for the range keyspace as
  1013  	// opposed to the point keyspace. Once that is done, elideRangeTombstone
  1014  	// can just check in the point keyspace, and this function can check for
  1015  	// inuseKeyRanges in the range keyspace.
  1016  	return c.elideRangeTombstone(start, end)
  1017  }
  1018  
  1019  // newInputIter returns an iterator over all the input tables in a compaction.
  1020  func (c *compaction) newInputIter(
  1021  	newIters tableNewIters, newRangeKeyIter keyspan.TableNewSpanIter, snapshots []uint64,
  1022  ) (_ internalIterator, retErr error) {
  1023  	var rangeDelIters []keyspan.FragmentIterator
  1024  	var rangeKeyIters []keyspan.FragmentIterator
  1025  
  1026  	if len(c.flushing) != 0 {
  1027  		if len(c.flushing) == 1 {
  1028  			f := c.flushing[0]
  1029  			iter := f.newFlushIter(nil, &c.bytesIterated)
  1030  			if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil {
  1031  				c.rangeDelIter.Init(c.cmp, rangeDelIter)
  1032  				iter = newMergingIter(c.logger, &c.stats, c.cmp, nil, iter, &c.rangeDelIter)
  1033  			}
  1034  			if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
  1035  				mi := &keyspan.MergingIter{}
  1036  				mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIter)
  1037  				c.rangeKeyInterleaving.Init(c.comparer, iter, mi, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */)
  1038  				iter = &c.rangeKeyInterleaving
  1039  			}
  1040  			return iter, nil
  1041  		}
  1042  		iters := make([]internalIterator, 0, len(c.flushing)+1)
  1043  		rangeDelIters = make([]keyspan.FragmentIterator, 0, len(c.flushing))
  1044  		rangeKeyIters = make([]keyspan.FragmentIterator, 0, len(c.flushing))
  1045  		for i := range c.flushing {
  1046  			f := c.flushing[i]
  1047  			iters = append(iters, f.newFlushIter(nil, &c.bytesIterated))
  1048  			rangeDelIter := f.newRangeDelIter(nil)
  1049  			if rangeDelIter != nil {
  1050  				rangeDelIters = append(rangeDelIters, rangeDelIter)
  1051  			}
  1052  			if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
  1053  				rangeKeyIters = append(rangeKeyIters, rangeKeyIter)
  1054  			}
  1055  		}
  1056  		if len(rangeDelIters) > 0 {
  1057  			c.rangeDelIter.Init(c.cmp, rangeDelIters...)
  1058  			iters = append(iters, &c.rangeDelIter)
  1059  		}
  1060  		var iter internalIterator = newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...)
  1061  		if len(rangeKeyIters) > 0 {
  1062  			mi := &keyspan.MergingIter{}
  1063  			mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIters...)
  1064  			c.rangeKeyInterleaving.Init(c.comparer, iter, mi, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */)
  1065  			iter = &c.rangeKeyInterleaving
  1066  		}
  1067  		return iter, nil
  1068  	}
  1069  
  1070  	if c.startLevel.level >= 0 {
  1071  		err := manifest.CheckOrdering(c.cmp, c.formatKey,
  1072  			manifest.Level(c.startLevel.level), c.startLevel.files.Iter())
  1073  		if err != nil {
  1074  			return nil, err
  1075  		}
  1076  	}
  1077  	err := manifest.CheckOrdering(c.cmp, c.formatKey,
  1078  		manifest.Level(c.outputLevel.level), c.outputLevel.files.Iter())
  1079  	if err != nil {
  1080  		return nil, err
  1081  	}
  1082  
  1083  	if c.startLevel.level == 0 {
  1084  		if c.l0SublevelInfo == nil {
  1085  			panic("l0SublevelInfo not created for compaction out of L0")
  1086  		}
  1087  
  1088  		for _, info := range c.l0SublevelInfo {
  1089  			err := manifest.CheckOrdering(c.cmp, c.formatKey,
  1090  				info.sublevel, info.Iter())
  1091  			if err != nil {
  1092  				return nil, err
  1093  			}
  1094  		}
  1095  	}
  1096  
  1097  	if len(c.extraLevels) > 0 {
  1098  		if len(c.extraLevels) > 1 {
  1099  			panic("n>2 multi level compaction not implemented yet")
  1100  		}
  1101  		interLevel := c.extraLevels[0]
  1102  		err := manifest.CheckOrdering(c.cmp, c.formatKey,
  1103  			manifest.Level(interLevel.level), interLevel.files.Iter())
  1104  		if err != nil {
  1105  			return nil, err
  1106  		}
  1107  	}
  1108  	iters := make([]internalIterator, 0, len(c.inputs)*c.startLevel.files.Len()+1)
  1109  	defer func() {
  1110  		if retErr != nil {
  1111  			for _, iter := range iters {
  1112  				if iter != nil {
  1113  					iter.Close()
  1114  				}
  1115  			}
  1116  			for _, rangeDelIter := range rangeDelIters {
  1117  				rangeDelIter.Close()
  1118  			}
  1119  		}
  1120  	}()
  1121  
  1122  	// In normal operation, levelIter iterates over the point operations in a
  1123  	// level, and initializes a rangeDelIter pointer for the range deletions in
  1124  	// each table. During compaction, we want to iterate over the merged view of
  1125  	// point operations and range deletions. In order to do this we create one
  1126  	// levelIter per level to iterate over the point operations, and collect up
  1127  	// all the range deletion files.
  1128  	//
  1129  	// The range deletion levels are first combined with a keyspan.MergingIter
  1130  	// (currently wrapped by a keyspan.InternalIteratorShim to satisfy the
  1131  	// internal iterator interface). The resulting merged rangedel iterator is
  1132  	// then included with the point levels in a single mergingIter.
  1133  	newRangeDelIter := func(
  1134  		f *manifest.FileMetadata, slice manifest.LevelSlice, _ *IterOptions, bytesIterated *uint64,
  1135  	) (keyspan.FragmentIterator, error) {
  1136  		iter, rangeDelIter, err := newIters(f, nil, /* iter options */
  1137  			internalIterOpts{bytesIterated: &c.bytesIterated})
  1138  		if err == nil {
  1139  			// TODO(peter): It is mildly wasteful to open the point iterator only to
  1140  			// immediately close it. One way to solve this would be to add new
  1141  			// methods to tableCache for creating point and range-deletion iterators
  1142  			// independently. We'd only want to use those methods here,
  1143  			// though. Doesn't seem worth the hassle in the near term.
  1144  			if err = iter.Close(); err != nil {
  1145  				rangeDelIter.Close()
  1146  				rangeDelIter = nil
  1147  			}
  1148  		}
  1149  		if rangeDelIter != nil {
  1150  			// Ensure that rangeDelIter is not closed until the compaction is
  1151  			// finished. This is necessary because range tombstone processing
  1152  			// requires the range tombstones to be held in memory for up to the
  1153  			// lifetime of the compaction.
  1154  			c.closers = append(c.closers, rangeDelIter)
  1155  			rangeDelIter = noCloseIter{rangeDelIter}
  1156  
  1157  			// Truncate the range tombstones returned by the iterator to the
   1158  			// upper bound of the atomic compaction unit. Note that we need to do
  1159  			// this truncation at read time in order to handle sstables
  1160  			// generated by RocksDB and earlier versions of Pebble which do not
  1161  			// truncate range tombstones to atomic compaction unit boundaries at
  1162  			// write time.
  1163  			//
  1164  			// The current Pebble compaction logic DOES truncate tombstones to
  1165  			// atomic unit boundaries at compaction time too.
  1166  			atomicUnit, _ := expandToAtomicUnit(c.cmp, slice, true /* disableIsCompacting */)
  1167  			lowerBound, upperBound := manifest.KeyRange(c.cmp, atomicUnit.Iter())
  1168  			// Range deletion tombstones are often written to sstables
  1169  			// untruncated on the end key side. However, they are still only
  1170  			// valid within a given file's bounds. The logic for writing range
  1171  			// tombstones to an output file sometimes has an incomplete view
  1172  			// of range tombstones outside the file's internal key bounds. Skip
  1173  			// any range tombstones completely outside file bounds.
  1174  			rangeDelIter = keyspan.Truncate(
  1175  				c.cmp, rangeDelIter, lowerBound.UserKey, upperBound.UserKey, &f.Smallest, &f.Largest)
  1176  		}
  1177  		if rangeDelIter == nil {
  1178  			rangeDelIter = emptyKeyspanIter
  1179  		}
  1180  		return rangeDelIter, err
  1181  	}
  1182  
  1183  	iterOpts := IterOptions{logger: c.logger}
  1184  	// TODO(bananabrick): Get rid of the extra manifest.Level parameter and fold it into
  1185  	// compactionLevel.
  1186  	addItersForLevel := func(level *compactionLevel, l manifest.Level) error {
  1187  		iters = append(iters, newLevelIter(iterOpts, c.cmp, nil /* split */, newIters,
  1188  			level.files.Iter(), l, &c.bytesIterated))
  1189  		// Create a wrapping closure to turn newRangeDelIter into a
  1190  		// keyspan.TableNewSpanIter, and return a LevelIter that lazily creates
  1191  		// rangedel iterators. This is safe now that range deletions are truncated
  1192  		// at file bounds; the merging iterator no longer needs to see all range
  1193  		// deletes for correctness.
  1194  		wrapper := func(file *manifest.FileMetadata, iterOptions *keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) {
  1195  			return newRangeDelIter(file, level.files, nil, &c.bytesIterated)
  1196  		}
  1197  		li := &keyspan.LevelIter{}
  1198  		li.Init(keyspan.SpanIterOptions{}, c.cmp, wrapper, level.files.Iter(), l, c.logger, manifest.KeyTypePoint)
  1199  		rangeDelIters = append(rangeDelIters, li)
  1200  		// Check if this level has any range keys.
  1201  		hasRangeKeys := false
  1202  		iter := level.files.Iter()
  1203  		for f := iter.First(); f != nil; f = iter.Next() {
  1204  			if f.HasRangeKeys {
  1205  				hasRangeKeys = true
  1206  				break
  1207  			}
  1208  		}
  1209  		if hasRangeKeys {
  1210  			li := &keyspan.LevelIter{}
  1211  			newRangeKeyIterWrapper := func(file *manifest.FileMetadata, iterOptions *keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) {
  1212  				iter, err := newRangeKeyIter(file, iterOptions)
  1213  				if iter != nil {
  1214  					// Ensure that the range key iter is not closed until the compaction is
  1215  					// finished. This is necessary because range key processing
  1216  					// requires the range keys to be held in memory for up to the
  1217  					// lifetime of the compaction.
  1218  					c.closers = append(c.closers, iter)
  1219  					iter = noCloseIter{iter}
  1220  
  1221  					// We do not need to truncate range keys to sstable boundaries, or
  1222  					// only read within the file's atomic compaction units, unlike with
  1223  					// range tombstones. This is because range keys were added after we
  1224  					// stopped splitting user keys across sstables, so all the range keys
  1225  					// in this sstable must wholly lie within the file's bounds.
  1226  				}
  1227  				if iter == nil {
  1228  					iter = emptyKeyspanIter
  1229  				}
  1230  				return iter, err
  1231  			}
  1232  			li.Init(keyspan.SpanIterOptions{}, c.cmp, newRangeKeyIterWrapper, level.files.Iter(), l, c.logger, manifest.KeyTypeRange)
  1233  			rangeKeyIters = append(rangeKeyIters, li)
  1234  		}
  1235  		return nil
  1236  	}
  1237  
  1238  	if c.startLevel.level != 0 {
  1239  		if err = addItersForLevel(c.startLevel, manifest.Level(c.startLevel.level)); err != nil {
  1240  			return nil, err
  1241  		}
  1242  	} else {
  1243  		for _, info := range c.l0SublevelInfo {
  1244  			if err = addItersForLevel(
  1245  				&compactionLevel{0, info.LevelSlice}, info.sublevel); err != nil {
  1246  				return nil, err
  1247  			}
  1248  		}
  1249  	}
  1250  	if len(c.extraLevels) > 0 {
  1251  		if err = addItersForLevel(c.extraLevels[0], manifest.Level(c.extraLevels[0].level)); err != nil {
  1252  			return nil, err
  1253  		}
  1254  	}
  1255  	if err = addItersForLevel(c.outputLevel, manifest.Level(c.outputLevel.level)); err != nil {
  1256  		return nil, err
  1257  	}
  1258  
  1259  	// Combine all the rangedel iterators using a keyspan.MergingIter and an
  1260  	// InternalIteratorShim so that the range deletions may be interleaved in
  1261  	// the compaction input.
  1262  	// TODO(jackson): Replace the InternalIteratorShim with an interleaving
  1263  	// iterator.
  1264  	if len(rangeDelIters) > 0 {
  1265  		c.rangeDelIter.Init(c.cmp, rangeDelIters...)
  1266  		iters = append(iters, &c.rangeDelIter)
  1267  	}
  1268  	pointKeyIter := newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...)
  1269  	if len(rangeKeyIters) > 0 {
  1270  		mi := &keyspan.MergingIter{}
  1271  		mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIters...)
  1272  		di := &keyspan.DefragmentingIter{}
  1273  		di.Init(c.comparer, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer)
  1274  		c.rangeKeyInterleaving.Init(c.comparer, pointKeyIter, di, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */)
  1275  		return &c.rangeKeyInterleaving, nil
  1276  	}
  1277  
  1278  	return pointKeyIter, nil
  1279  }
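
        // In sketch form, the iterator stack assembled above looks like this
        // (the interleaving layer is present only when some input level
        // contains range keys):
        //
        //	c.rangeKeyInterleaving
        //	  -> pointKeyIter: a mergingIter over the per-level point iterators,
        //	     plus (when range deletions exist) c.rangeDelIter, an
        //	     InternalIteratorShim over the per-level keyspan.LevelIters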
  1280  
  1281  func (c *compaction) String() string {
  1282  	if len(c.flushing) != 0 {
  1283  		return "flush\n"
  1284  	}
  1285  
  1286  	var buf bytes.Buffer
  1287  	for level := c.startLevel.level; level <= c.outputLevel.level; level++ {
  1288  		i := level - c.startLevel.level
  1289  		fmt.Fprintf(&buf, "%d:", level)
  1290  		iter := c.inputs[i].files.Iter()
  1291  		for f := iter.First(); f != nil; f = iter.Next() {
  1292  			fmt.Fprintf(&buf, " %s:%s-%s", f.FileNum, f.Smallest, f.Largest)
  1293  		}
  1294  		fmt.Fprintf(&buf, "\n")
  1295  	}
  1296  	return buf.String()
  1297  }
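
        // For illustration, a compaction from L1 into L2 might render via
        // String as follows (file numbers and keys are made-up values):
        //
        //	1: 000012:a#5,SET-j#7,SET 000013:k#3,SET-z#9,SET
        //	2: 000007:c#1,SET-f#2,SET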
  1298  
  1299  type manualCompaction struct {
  1300  	// Count of retries, due either to too many concurrent compactions or to a
  1301  	// concurrent compaction on overlapping levels.
  1302  	retries     int
  1303  	level       int
  1304  	outputLevel int
  1305  	done        chan error
  1306  	start       []byte
  1307  	end         []byte
  1308  	split       bool
  1309  }
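
        // A minimal sketch of how a manualCompaction flows through this file
        // (the enqueue shown here mirrors the scheduler below; the public
        // entry point is the DB's manual compaction API):
        //
        //	done := make(chan error, 1)
        //	mc := &manualCompaction{
        //		level:       2,
        //		outputLevel: 3,
        //		start:       []byte("a"),
        //		end:         []byte("z"),
        //		done:        done,
        //	}
        //	// Enqueue while holding d.mu, then let the picker run it.
        //	d.mu.compact.manual = append(d.mu.compact.manual, mc)
        //	d.maybeScheduleCompaction()
        //	err := <-done // nil on success (or on a no-op pick)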
  1310  
  1311  type readCompaction struct {
  1312  	level int
  1313  	// [start, end] key ranges are used for de-duping.
  1314  	start []byte
  1315  	end   []byte
  1316  
  1317  	// The file associated with the compaction.
  1318  	// If the file no longer belongs in the same
  1319  	// level, then we skip the compaction.
  1320  	fileNum base.FileNum
  1321  }
  1322  
  1323  func (d *DB) addInProgressCompaction(c *compaction) {
  1324  	d.mu.compact.inProgress[c] = struct{}{}
  1325  	var isBase, isIntraL0 bool
  1326  	for _, cl := range c.inputs {
  1327  		iter := cl.files.Iter()
  1328  		for f := iter.First(); f != nil; f = iter.Next() {
  1329  			if f.IsCompacting() {
  1330  				d.opts.Logger.Fatalf("L%d->L%d: %s already being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum)
  1331  			}
  1332  			f.SetCompactionState(manifest.CompactionStateCompacting)
  1333  			if c.startLevel != nil && c.outputLevel != nil && c.startLevel.level == 0 {
  1334  				if c.outputLevel.level == 0 {
  1335  					f.IsIntraL0Compacting = true
  1336  					isIntraL0 = true
  1337  				} else {
  1338  					isBase = true
  1339  				}
  1340  			}
  1341  		}
  1342  	}
  1343  
  1344  	if (isIntraL0 || isBase) && c.version.L0Sublevels != nil {
  1345  		l0Inputs := []manifest.LevelSlice{c.startLevel.files}
  1346  		if isIntraL0 {
  1347  			l0Inputs = append(l0Inputs, c.outputLevel.files)
  1348  		}
  1349  		if err := c.version.L0Sublevels.UpdateStateForStartedCompaction(l0Inputs, isBase); err != nil {
  1350  			d.opts.Logger.Fatalf("could not update state for compaction: %s", err)
  1351  		}
  1352  	}
  1353  
  1354  	if false {
  1355  		// TODO(peter): Do we want to keep this? It is useful for seeing the
  1356  		// concurrent compactions/flushes that are taking place. Right now, this
  1357  		// spams the logs and test output. Figure out a way to usefully
  1358  		// expose it.
  1359  		strs := make([]string, 0, len(d.mu.compact.inProgress))
  1360  		for c := range d.mu.compact.inProgress {
  1361  			var s string
  1362  			if c.startLevel.level == -1 {
  1363  				s = fmt.Sprintf("mem->L%d", c.outputLevel.level)
  1364  			} else {
  1365  				s = fmt.Sprintf("L%d->L%d:%.1f", c.startLevel.level, c.outputLevel.level, c.score)
  1366  			}
  1367  			strs = append(strs, s)
  1368  		}
  1369  		// This odd sorting function is intended to sort "mem" before "L*".
  1370  		sort.Slice(strs, func(i, j int) bool {
  1371  			if strs[i][0] == strs[j][0] {
  1372  				return strs[i] < strs[j]
  1373  			}
  1374  			return strs[i] > strs[j]
  1375  		})
  1376  		d.opts.Logger.Infof("compactions: %s", strings.Join(strs, " "))
  1377  	}
  1378  }
  1379  
  1380  // removeInProgressCompaction removes compaction markers from files in a
  1381  // compaction. The rollback parameter indicates whether the compaction state
  1382  // should be rolled back to its original state when the compaction is unsuccessful.
  1383  //
  1384  // DB.mu must be held when calling this method. All writes to the manifest for
  1385  // this compaction should have completed by this point.
  1386  func (d *DB) removeInProgressCompaction(c *compaction, rollback bool) {
  1387  	for _, cl := range c.inputs {
  1388  		iter := cl.files.Iter()
  1389  		for f := iter.First(); f != nil; f = iter.Next() {
  1390  			if !f.IsCompacting() {
  1391  				d.opts.Logger.Fatalf("L%d->L%d: %s not being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum)
  1392  			}
  1393  			if !rollback {
  1394  				// On success all compactions other than move-compactions transition the
  1395  				// file into the Compacted state. Move-compacted files become eligible
  1396  				// for compaction again and transition back to NotCompacting.
  1397  				if c.kind != compactionKindMove {
  1398  					f.SetCompactionState(manifest.CompactionStateCompacted)
  1399  				} else {
  1400  					f.SetCompactionState(manifest.CompactionStateNotCompacting)
  1401  				}
  1402  			} else {
  1403  				// Else, on rollback, all input files unconditionally transition back to
  1404  				// NotCompacting.
  1405  				f.SetCompactionState(manifest.CompactionStateNotCompacting)
  1406  			}
  1407  			f.IsIntraL0Compacting = false
  1408  		}
  1409  	}
  1410  	delete(d.mu.compact.inProgress, c)
  1411  
  1412  	l0InProgress := inProgressL0Compactions(d.getInProgressCompactionInfoLocked(c))
  1413  	d.mu.versions.currentVersion().L0Sublevels.InitCompactingFileInfo(l0InProgress)
  1414  }
  1415  
  1416  func (d *DB) calculateDiskAvailableBytes() uint64 {
  1417  	if space, err := d.opts.FS.GetDiskUsage(d.dirname); err == nil {
  1418  		atomic.StoreUint64(&d.atomic.diskAvailBytes, space.AvailBytes)
  1419  		return space.AvailBytes
  1420  	} else if !errors.Is(err, vfs.ErrUnsupported) {
  1421  		d.opts.EventListener.BackgroundError(err)
  1422  	}
  1423  	return atomic.LoadUint64(&d.atomic.diskAvailBytes)
  1424  }
  1425  
  1426  func (d *DB) getDiskAvailableBytesCached() uint64 {
  1427  	return atomic.LoadUint64(&d.atomic.diskAvailBytes)
  1428  }
  1429  
  1430  func (d *DB) getDeletionPacerInfo() deletionPacerInfo {
  1431  	var pacerInfo deletionPacerInfo
  1432  	// Call GetDiskUsage after every file deletion. This may seem inefficient,
  1433  	// but in practice this was observed to take constant time, regardless of
  1434  	// volume size used, at least on Linux with ext4 and ZFS. All invocations
  1435  	// take 10 microseconds or less.
  1436  	pacerInfo.freeBytes = d.calculateDiskAvailableBytes()
  1437  	d.mu.Lock()
  1438  	pacerInfo.obsoleteBytes = d.mu.versions.metrics.Table.ObsoleteSize
  1439  	pacerInfo.liveBytes = uint64(d.mu.versions.metrics.Total().Size)
  1440  	d.mu.Unlock()
  1441  	return pacerInfo
  1442  }
  1443  
  1444  // maybeScheduleFlush schedules a flush if necessary.
  1445  //
  1446  // d.mu must be held when calling this.
  1447  func (d *DB) maybeScheduleFlush(needReport bool) {
  1448  	if d.mu.compact.flushing || d.closed.Load() != nil || d.opts.ReadOnly {
  1449  		return
  1450  	}
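        	// The final entry in d.mu.mem.queue is the mutable memtable; if it is
        	// the only entry, there are no immutable memtables to flush.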
  1451  	if len(d.mu.mem.queue) <= 1 {
  1452  		return
  1453  	}
  1454  
  1455  	if !d.passedFlushThreshold() {
  1456  		return
  1457  	}
  1458  
  1459  	d.mu.compact.flushing = true
  1460  	go d.flush(needReport)
  1461  }
  1462  
  1463  func (d *DB) passedFlushThreshold() bool {
  1464  	var n int
  1465  	var size uint64
  1466  	for ; n < len(d.mu.mem.queue)-1; n++ {
  1467  		if !d.mu.mem.queue[n].readyForFlush() {
  1468  			break
  1469  		}
  1470  		if d.mu.mem.queue[n].flushForced {
  1471  			// A flush was forced. Pretend the memtable size is the configured
  1472  			// size. See minFlushSize below.
  1473  			size += uint64(d.opts.MemTableSize)
  1474  		} else {
  1475  			size += d.mu.mem.queue[n].totalBytes()
  1476  		}
  1477  	}
  1478  	if n == 0 {
  1479  		// None of the immutable memtables are ready for flushing.
  1480  		return false
  1481  	}
  1482  
  1483  	// Only flush once the sum of the queued memtable sizes exceeds half the
  1484  	// configured memtable size. This prevents flushing of memtables at startup
  1485  	// while we're undergoing the ramp period on the memtable size. See
  1486  	// DB.newMemTable().
  1487  	minFlushSize := uint64(d.opts.MemTableSize) / 2
  1488  	return size >= minFlushSize
  1489  }
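
        // As a worked example of the threshold above (illustrative sizes): with
        // MemTableSize = 64 MiB, minFlushSize is 32 MiB. Two ready immutable
        // memtables of 20 MiB each (40 MiB total) pass the threshold, while a
        // single 20 MiB memtable does not. A memtable with flushForced set counts
        // as a full 64 MiB, so it always passes on its own.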
  1490  
  1491  func (d *DB) maybeScheduleDelayedFlush(tbl *memTable, dur time.Duration) {
  1492  	var mem *flushableEntry
  1493  	for _, m := range d.mu.mem.queue {
  1494  		if m.flushable == tbl {
  1495  			mem = m
  1496  			break
  1497  		}
  1498  	}
  1499  	if mem == nil || mem.flushForced {
  1500  		return
  1501  	}
  1502  	deadline := d.timeNow().Add(dur)
  1503  	if !mem.delayedFlushForcedAt.IsZero() && deadline.After(mem.delayedFlushForcedAt) {
  1504  		// Already scheduled to flush sooner than within `dur`.
  1505  		return
  1506  	}
  1507  	mem.delayedFlushForcedAt = deadline
  1508  	go func() {
  1509  		timer := time.NewTimer(dur)
  1510  		defer timer.Stop()
  1511  
  1512  		select {
  1513  		case <-d.closedCh:
  1514  			return
  1515  		case <-mem.flushed:
  1516  			return
  1517  		case <-timer.C:
  1518  			d.commit.mu.Lock()
  1519  			defer d.commit.mu.Unlock()
  1520  			d.mu.Lock()
  1521  			defer d.mu.Unlock()
  1522  
  1523  			// NB: The timer may fire concurrently with a call to Close.  If a
  1524  			// Close call beat us to acquiring d.mu, d.closed holds ErrClosed,
  1525  			// and it's too late to flush anything. Otherwise, the Close call
  1526  			// will block on locking d.mu until we've finished scheduling the
  1527  			// flush and set `d.mu.compact.flushing` to true. Close will wait
  1528  			// for the current flush to complete.
  1529  			if d.closed.Load() != nil {
  1530  				return
  1531  			}
  1532  
  1533  			if d.mu.mem.mutable == tbl {
  1534  				d.makeRoomForWrite(nil, true)
  1535  			} else {
  1536  				mem.flushForced = true
  1537  				d.maybeScheduleFlush(true)
  1538  			}
  1539  		}
  1540  	}()
  1541  }
  1542  
  1543  func (d *DB) flush(needReport bool) {
  1544  	pprof.Do(context.Background(), flushLabels, func(context.Context) {
  1545  		flushingWorkStart := time.Now()
  1546  		d.mu.Lock()
  1547  		defer d.mu.Unlock()
  1548  		idleDuration := flushingWorkStart.Sub(d.mu.compact.noOngoingFlushStartTime)
  1549  		var bytesFlushed uint64
  1550  		var err error
  1551  		if bytesFlushed, err = d.flush1(needReport); err != nil {
  1552  			// TODO(peter): count consecutive flush errors and backoff.
  1553  			d.opts.EventListener.BackgroundError(err)
  1554  		}
  1555  		d.mu.compact.flushing = false
  1556  		d.mu.compact.noOngoingFlushStartTime = time.Now()
  1557  		workDuration := d.mu.compact.noOngoingFlushStartTime.Sub(flushingWorkStart)
  1558  		d.mu.compact.flushWriteThroughput.Bytes += int64(bytesFlushed)
  1559  		d.mu.compact.flushWriteThroughput.WorkDuration += workDuration
  1560  		d.mu.compact.flushWriteThroughput.IdleDuration += idleDuration
  1561  		// More flush work may have arrived while we were flushing, so schedule
  1562  		// another flush if needed.
  1563  		d.maybeScheduleFlush(true)
  1564  		// The flush may have produced too many files in a level, so schedule a
  1565  		// compaction if needed.
  1566  		d.maybeScheduleCompaction()
  1567  		d.mu.compact.cond.Broadcast()
  1568  	})
  1569  }
  1570  
  1571  // flush1 runs a compaction that copies the immutable memtables from memory to
  1572  // disk.
  1573  //
  1574  // d.mu must be held when calling this, but the mutex may be dropped and
  1575  // re-acquired during the course of this method.
  1576  func (d *DB) flush1(needReport bool) (bytesFlushed uint64, err error) {
  1577  	var n int
  1578  	for ; n < len(d.mu.mem.queue)-1; n++ {
  1579  		if !d.mu.mem.queue[n].readyForFlush() {
  1580  			break
  1581  		}
  1582  	}
  1583  	if n == 0 {
  1584  		// None of the immutable memtables are ready for flushing.
  1585  		return 0, nil
  1586  	}
  1587  
  1588  	// Require that every memtable being flushed has a log number less than the
  1589  	// new minimum unflushed log number.
  1590  	minUnflushedLogNum := d.mu.mem.queue[n].logNum
  1591  	if !d.opts.DisableWAL {
  1592  		for i := 0; i < n; i++ {
  1593  			logNum := d.mu.mem.queue[i].logNum
  1594  			if logNum >= minUnflushedLogNum {
  1595  				return 0, errFlushInvariant
  1596  			}
  1597  		}
  1598  	}
  1599  
  1600  	if needReport && d.opts.FlushReporter != nil {
  1601  		d.opts.FlushReporter(d.opts.Id)
  1602  	}
  1603  
  1604  	c := newFlush(d.opts, d.mu.versions.currentVersion(),
  1605  		d.mu.versions.picker.getBaseLevel(), d.mu.mem.queue[:n])
  1606  	d.addInProgressCompaction(c)
  1607  
  1608  	jobID := d.mu.nextJobID
  1609  	d.mu.nextJobID++
  1610  	d.opts.EventListener.FlushBegin(FlushInfo{
  1611  		JobID: jobID,
  1612  		Input: n,
  1613  	})
  1614  	startTime := d.timeNow()
  1615  
  1616  	ve, pendingOutputs, err := d.runCompaction(jobID, c)
  1617  
  1618  	info := FlushInfo{
  1619  		JobID:    jobID,
  1620  		Input:    n,
  1621  		Duration: d.timeNow().Sub(startTime),
  1622  		Done:     true,
  1623  		Err:      err,
  1624  	}
  1625  	if err == nil {
  1626  		for i := range ve.NewFiles {
  1627  			e := &ve.NewFiles[i]
  1628  			info.Output = append(info.Output, e.Meta.TableInfo())
  1629  		}
  1630  		if len(ve.NewFiles) == 0 {
  1631  			info.Err = errEmptyTable
  1632  		}
  1633  
  1634  		// The flush succeeded or it produced an empty sstable. In either case we
  1635  		// want to bump the minimum unflushed log number to the log number of the
  1636  		// oldest unflushed memtable.
  1637  		ve.MinUnflushedLogNum = minUnflushedLogNum
  1638  		metrics := c.metrics[0]
  1639  		for i := 0; i < n; i++ {
  1640  			metrics.BytesIn += d.mu.mem.queue[i].logSize
  1641  		}
  1642  
  1643  		d.mu.versions.logLock()
  1644  		err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false /* forceRotation */,
  1645  			func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) })
  1646  		if err != nil {
  1647  			info.Err = err
  1648  			// TODO(peter): untested.
  1649  			d.mu.versions.obsoleteTables = append(d.mu.versions.obsoleteTables, pendingOutputs...)
  1650  			d.mu.versions.incrementObsoleteTablesLocked(pendingOutputs)
  1651  		}
  1652  	}
  1653  
  1654  	bytesFlushed = c.bytesIterated
  1655  	d.maybeUpdateDeleteCompactionHints(c)
  1656  	d.removeInProgressCompaction(c, err != nil)
  1657  	d.mu.versions.incrementCompactions(c.kind, c.extraLevels)
  1658  	d.mu.versions.incrementCompactionBytes(-c.bytesWritten)
  1659  
  1660  	var flushed flushableList
  1661  	if err == nil {
  1662  		flushed = d.mu.mem.queue[:n]
  1663  		d.mu.mem.queue = d.mu.mem.queue[n:]
  1664  		d.updateReadStateLocked(d.opts.DebugCheck)
  1665  		d.updateTableStatsLocked(ve.NewFiles)
  1666  	}
  1667  	// Signal FlushEnd after installing the new readState. This helps for unit
  1668  	// tests that use the callback to trigger a read using an iterator with
  1669  	// IterOptions.OnlyReadGuaranteedDurable.
  1670  	info.TotalDuration = d.timeNow().Sub(startTime)
  1671  	d.opts.EventListener.FlushEnd(info)
  1672  
  1673  	d.deleteObsoleteFiles(jobID, false /* waitForOngoing */)
  1674  
  1675  	// Mark all the memtables we flushed as flushed. Note that we do this last so
  1676  	// that a synchronous call to DB.Flush() will not return until the deletion
  1677  	// of obsolete files from this job has completed. This makes testing easier
  1678  	// and provides similar behavior to manual compactions where the compaction
  1679  	// is not marked as completed until the deletion of obsolete files job has
  1680  	// completed.
  1681  	for i := range flushed {
  1682  		// The order of these operations matters here for ease of testing. Removing
  1683  		// the reader reference first allows tests to be guaranteed that the
  1684  		// memtable reservation has been released by the time a synchronous flush
  1685  		// returns.
  1686  		flushed[i].readerUnref()
  1687  		close(flushed[i].flushed)
  1688  	}
  1689  	return bytesFlushed, err
  1690  }
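
        // As a worked example of the flush invariant above: if the queue holds
        // memtables with log numbers {3, 5, 8} and the first two are flushed
        // (n = 2), minUnflushedLogNum becomes 8, and the WAL check requires
        // 3 < 8 and 5 < 8 before the versionEdit may be applied.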
  1691  
  1692  // maybeScheduleCompactionAsync should be used when we want to possibly
  1693  // schedule a compaction but don't want to pay the cost of running
  1694  // maybeScheduleCompaction synchronously on the caller's goroutine.
  1695  // This method should be launched in a separate goroutine.
  1696  // d.mu must not be held when this is called.
  1697  func (d *DB) maybeScheduleCompactionAsync() {
  1698  	defer d.compactionSchedulers.Done()
  1699  
  1700  	d.mu.Lock()
  1701  	d.maybeScheduleCompaction()
  1702  	d.mu.Unlock()
  1703  }
  1704  
  1705  // maybeScheduleCompaction schedules a compaction if necessary.
  1706  //
  1707  // d.mu must be held when calling this.
  1708  func (d *DB) maybeScheduleCompaction() {
  1709  	d.maybeScheduleCompactionPicker(pickAuto)
  1710  }
  1711  
  1712  func pickAuto(picker compactionPicker, env compactionEnv) *pickedCompaction {
  1713  	return picker.pickAuto(env)
  1714  }
  1715  
  1716  func pickElisionOnly(picker compactionPicker, env compactionEnv) *pickedCompaction {
  1717  	return picker.pickElisionOnlyCompaction(env)
  1718  }
  1719  
  1720  // maybeScheduleCompactionPicker schedules a compaction if necessary,
  1721  // calling `pickFunc` to pick automatic compactions.
  1722  //
  1723  // d.mu must be held when calling this.
  1724  func (d *DB) maybeScheduleCompactionPicker(
  1725  	pickFunc func(compactionPicker, compactionEnv) *pickedCompaction,
  1726  ) {
  1727  	if d.closed.Load() != nil || d.opts.ReadOnly {
  1728  		return
  1729  	}
  1730  	maxConcurrentCompactions := d.opts.MaxConcurrentCompactions()
  1731  	if d.mu.compact.compactingCount >= maxConcurrentCompactions {
  1732  		if len(d.mu.compact.manual) > 0 {
  1733  			// Being unable to run the queue head blocks later manual compactions.
  1734  			d.mu.compact.manual[0].retries++
  1735  		}
  1736  		return
  1737  	}
  1738  
  1739  	// Compaction picking needs a coherent view of a Version. In particular, we
  1740  	// need to exclude concurrent ingestions from making a decision on which
  1741  	// level to ingest into that conflicts with our compaction decision.
  1742  	// versionSet.logLock provides the necessary mutual exclusion.
  1743  	d.mu.versions.logLock()
  1744  	defer d.mu.versions.logUnlock()
  1745  
  1746  	// Check for the closed flag again, in case the DB was closed while we were
  1747  	// waiting for logLock().
  1748  	if d.closed.Load() != nil {
  1749  		return
  1750  	}
  1751  
  1752  	env := compactionEnv{
  1753  		earliestSnapshotSeqNum:  d.mu.snapshots.earliest(),
  1754  		earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(),
  1755  	}
  1756  
  1757  	// Check for delete-only compactions first, because they're expected to be
  1758  	// cheap and reduce future compaction work.
  1759  	if len(d.mu.compact.deletionHints) > 0 &&
  1760  		d.mu.compact.compactingCount < maxConcurrentCompactions &&
  1761  		!d.opts.DisableAutomaticCompactions {
  1762  		v := d.mu.versions.currentVersion()
  1763  		snapshots := d.mu.snapshots.toSlice()
  1764  		inputs, unresolvedHints := checkDeleteCompactionHints(d.cmp, v, d.mu.compact.deletionHints, snapshots)
  1765  		d.mu.compact.deletionHints = unresolvedHints
  1766  
  1767  		if len(inputs) > 0 {
  1768  			c := newDeleteOnlyCompaction(d.opts, v, inputs)
  1769  			d.mu.compact.compactingCount++
  1770  			d.addInProgressCompaction(c)
  1771  			go d.compact(c, nil)
  1772  		}
  1773  	}
  1774  
  1775  	for len(d.mu.compact.manual) > 0 && d.mu.compact.compactingCount < maxConcurrentCompactions {
  1776  		manual := d.mu.compact.manual[0]
  1777  		env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
  1778  		pc, retryLater := d.mu.versions.picker.pickManual(env, manual)
  1779  		if pc != nil {
  1780  			c := newCompaction(pc, d.opts)
  1781  			d.mu.compact.manual = d.mu.compact.manual[1:]
  1782  			d.mu.compact.compactingCount++
  1783  			d.addInProgressCompaction(c)
  1784  			go d.compact(c, manual.done)
  1785  		} else if !retryLater {
  1786  			// No-op: the picker found nothing to do; signal completion.
  1787  			d.mu.compact.manual = d.mu.compact.manual[1:]
  1788  			manual.done <- nil
  1789  		} else {
  1790  			// Being unable to run the queue head blocks later manual compactions.
  1791  			manual.retries++
  1792  			break
  1793  		}
  1794  	}
  1795  
  1796  	for !d.opts.DisableAutomaticCompactions && d.mu.compact.compactingCount < maxConcurrentCompactions {
  1797  		env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
  1798  		env.readCompactionEnv = readCompactionEnv{
  1799  			readCompactions:          &d.mu.compact.readCompactions,
  1800  			flushing:                 d.mu.compact.flushing || d.passedFlushThreshold(),
  1801  			rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction,
  1802  		}
  1803  		pc := pickFunc(d.mu.versions.picker, env)
  1804  		if pc == nil {
  1805  			break
  1806  		}
  1807  		c := newCompaction(pc, d.opts)
  1808  		d.mu.compact.compactingCount++
  1809  		d.addInProgressCompaction(c)
  1810  		go d.compact(c, nil)
  1811  	}
  1812  }
  1813  
  1814  // deleteCompactionHintType indicates whether the deleteCompactionHint was
  1815  // generated from a span containing a range del (point key only), a range key
  1816  // delete (range key only), or both a point and range key.
  1817  type deleteCompactionHintType uint8
  1818  
  1819  const (
  1820  	// NOTE: While these are primarily used as enumeration types, they are also
  1821  	// used for some bitwise operations. Care should be taken when updating.
  1822  	deleteCompactionHintTypeUnknown deleteCompactionHintType = iota
  1823  	deleteCompactionHintTypePointKeyOnly
  1824  	deleteCompactionHintTypeRangeKeyOnly
  1825  	deleteCompactionHintTypePointAndRangeKey
  1826  )
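
        // The iota ordering yields the bitwise identity relied upon by
        // compactionHintFromKeys below:
        //
        //	deleteCompactionHintTypePointKeyOnly | deleteCompactionHintTypeRangeKeyOnly ==
        //		deleteCompactionHintTypePointAndRangeKey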
  1827  
  1828  // String implements fmt.Stringer.
  1829  func (h deleteCompactionHintType) String() string {
  1830  	switch h {
  1831  	case deleteCompactionHintTypeUnknown:
  1832  		return "unknown"
  1833  	case deleteCompactionHintTypePointKeyOnly:
  1834  		return "point-key-only"
  1835  	case deleteCompactionHintTypeRangeKeyOnly:
  1836  		return "range-key-only"
  1837  	case deleteCompactionHintTypePointAndRangeKey:
  1838  		return "point-and-range-key"
  1839  	default:
  1840  		panic(fmt.Sprintf("unknown hint type: %d", h))
  1841  	}
  1842  }
  1843  
  1844  // compactionHintFromKeys returns a deleteCompactionHintType given a slice of
  1845  // keyspan.Keys.
  1846  func compactionHintFromKeys(keys []keyspan.Key) deleteCompactionHintType {
  1847  	var hintType deleteCompactionHintType
  1848  	for _, k := range keys {
  1849  		switch k.Kind() {
  1850  		case base.InternalKeyKindRangeDelete:
  1851  			hintType |= deleteCompactionHintTypePointKeyOnly
  1852  		case base.InternalKeyKindRangeKeyDelete:
  1853  			hintType |= deleteCompactionHintTypeRangeKeyOnly
  1854  		default:
  1855  			panic(fmt.Sprintf("unsupported key kind: %s", k.Kind()))
  1856  		}
  1857  	}
  1858  	return hintType
  1859  }
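
        // For example, a span carrying both a RANGEDEL and a RANGEKEYDEL key maps
        // to the combined hint type (a sketch assuming keyspan.Key trailers built
        // with base.MakeTrailer; sequence numbers are arbitrary):
        //
        //	keys := []keyspan.Key{
        //		{Trailer: base.MakeTrailer(5, base.InternalKeyKindRangeDelete)},
        //		{Trailer: base.MakeTrailer(4, base.InternalKeyKindRangeKeyDelete)},
        //	}
        //	hintType := compactionHintFromKeys(keys)
        //	// hintType == deleteCompactionHintTypePointAndRangeKey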
  1860  
  1861  // A deleteCompactionHint records a user key and sequence number span that has been
  1862  // deleted by a range tombstone. A hint is recorded if at least one sstable
  1863  // falls completely within both the user key and sequence number spans.
  1864  // Once the tombstones and the observed completely-contained sstables fall
  1865  // into the same snapshot stripe, a delete-only compaction may delete any
  1866  // sstables within the range.
  1867  type deleteCompactionHint struct {
  1868  	// The type of key span that generated this hint (point key, range key, or
  1869  	// both).
  1870  	hintType deleteCompactionHintType
  1871  	// start and end are user keys specifying a key range [start, end) of
  1872  	// deleted keys.
  1873  	start []byte
  1874  	end   []byte
  1875  	// The level of the file containing the range tombstone(s) when the hint
  1876  	// was created. Only lower levels need to be searched for files that may
  1877  	// be deleted.
  1878  	tombstoneLevel int
  1879  	// The file containing the range tombstone(s) that created the hint.
  1880  	tombstoneFile *fileMetadata
  1881  	// The smallest and largest sequence numbers of the abutting tombstones
  1882  	// merged to form this hint. All of a table's keys must be less than the
  1883  	// tombstone smallest sequence number to be deleted. All of a table's
  1884  	// sequence numbers must fall into the same snapshot stripe as the
  1885  	// tombstone largest sequence number to be deleted.
  1886  	tombstoneLargestSeqNum  uint64
  1887  	tombstoneSmallestSeqNum uint64
  1888  	// The smallest sequence number of a sstable that was found to be covered
  1889  	// by this hint. The hint cannot be resolved until this sequence number is
  1890  	// in the same snapshot stripe as the largest tombstone sequence number.
  1891  	// This is set when the hint is created, so by the time the hint is
  1892  	// resolved the LSM may look different and, notably, may no longer
  1893  	// contain the sstable that contained the key at this sequence number.
  1894  	fileSmallestSeqNum uint64
  1895  }
  1896  
  1897  func (h deleteCompactionHint) String() string {
  1898  	return fmt.Sprintf(
  1899  		"L%d.%s %s-%s seqnums(tombstone=%d-%d, file-smallest=%d, type=%s)",
  1900  		h.tombstoneLevel, h.tombstoneFile.FileNum, h.start, h.end,
  1901  		h.tombstoneSmallestSeqNum, h.tombstoneLargestSeqNum, h.fileSmallestSeqNum,
  1902  		h.hintType,
  1903  	)
  1904  }
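
        // For example (illustrative values), a hint from L0 tombstones covering
        // [b, r) might print as:
        //
        //	L0.000004 b-r seqnums(tombstone=200-230, file-smallest=90, type=point-key-only)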
  1905  
  1906  func (h *deleteCompactionHint) canDelete(cmp Compare, m *fileMetadata, snapshots []uint64) bool {
  1907  	// The file can only be deleted if all of its keys are older than the
  1908  	// earliest tombstone aggregated into the hint.
  1909  	if m.LargestSeqNum >= h.tombstoneSmallestSeqNum || m.SmallestSeqNum < h.fileSmallestSeqNum {
  1910  		return false
  1911  	}
  1912  
  1913  	// The file's oldest key must be in the same snapshot stripe as the
  1914  	// newest tombstone. NB: We already checked the hint's sequence numbers,
  1915  	// but this file's oldest sequence number might be lower than the hint's
  1916  	// smallest sequence number despite the file falling within the key range
  1917  	// if this file was constructed after the hint by a compaction.
  1918  	ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots)
  1919  	fi, _ := snapshotIndex(m.SmallestSeqNum, snapshots)
  1920  	if ti != fi {
  1921  		return false
  1922  	}
  1923  
  1924  	switch h.hintType {
  1925  	case deleteCompactionHintTypePointKeyOnly:
  1926  		// A hint generated by a range del span cannot delete tables that contain
  1927  		// range keys.
  1928  		if m.HasRangeKeys {
  1929  			return false
  1930  		}
  1931  	case deleteCompactionHintTypeRangeKeyOnly:
  1932  		// A hint generated by a range key del span cannot delete tables that
  1933  		// contain point keys.
  1934  		if m.HasPointKeys {
  1935  			return false
  1936  		}
  1937  	case deleteCompactionHintTypePointAndRangeKey:
  1938  		// A hint from a span that contains both range dels *and* range keys can
  1939  		// only be deleted if both bounds fall within the hint. The next check takes
  1940  		// care of this.
  1941  	default:
  1942  		panic(fmt.Sprintf("bitalostable: unknown delete compaction hint type: %d", h.hintType))
  1943  	}
  1944  
  1945  	// The file's keys must be completely contained within the hint range.
  1946  	return cmp(h.start, m.Smallest.UserKey) <= 0 && cmp(m.Largest.UserKey, h.end) < 0
  1947  }
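
        // A worked example of canDelete (assuming snapshotIndex returns the index
        // of the first snapshot above the given sequence number): with snapshots
        // {100, 180}, a hint whose tombstones span seqnums [200, 230] and a file
        // with seqnums [110, 150] is not deletable. The file's keys are all older
        // than the earliest tombstone (150 < 200), but snapshot 180 separates the
        // file's smallest seqnum (110) from the largest tombstone seqnum (230),
        // so ti != fi and the two fall in different snapshot stripes.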
  1948  
  1949  func (d *DB) maybeUpdateDeleteCompactionHints(c *compaction) {
  1950  	// Compactions that zero sequence numbers can interfere with compaction
  1951  	// deletion hints. Deletion hints apply to tables containing keys older
  1952  	// than a threshold. If a key more recent than the threshold is zeroed in
  1953  	// a compaction, a delete-only compaction may mistake it as meeting the
  1954  	// threshold and drop a table containing live data.
  1955  	//
  1956  	// To avoid this scenario, compactions that zero sequence numbers remove
  1957  	// any conflicting deletion hints. A deletion hint is conflicting if both
  1958  	// of the following conditions apply:
  1959  	// * its key space overlaps with the compaction
  1960  	// * at least one of its inputs contains a key as recent as one of the
  1961  	//   hint's tombstones.
  1962  	//
  1963  	if !c.allowedZeroSeqNum {
  1964  		return
  1965  	}
  1966  
  1967  	updatedHints := d.mu.compact.deletionHints[:0]
  1968  	for _, h := range d.mu.compact.deletionHints {
  1969  		// If the compaction's key space is disjoint from the hint's key
  1970  		// space, the zeroing of sequence numbers won't affect the hint. Keep
  1971  		// the hint.
  1972  		keysDisjoint := d.cmp(h.end, c.smallest.UserKey) < 0 || d.cmp(h.start, c.largest.UserKey) > 0
  1973  		if keysDisjoint {
  1974  			updatedHints = append(updatedHints, h)
  1975  			continue
  1976  		}
  1977  
  1978  		// All of the compaction's inputs must be older than the hint's
  1979  		// tombstones.
  1980  		inputsOlder := true
  1981  		for _, in := range c.inputs {
  1982  			iter := in.files.Iter()
  1983  			for f := iter.First(); f != nil; f = iter.Next() {
  1984  				inputsOlder = inputsOlder && f.LargestSeqNum < h.tombstoneSmallestSeqNum
  1985  			}
  1986  		}
  1987  		if inputsOlder {
  1988  			updatedHints = append(updatedHints, h)
  1989  			continue
  1990  		}
  1991  
  1992  		// Drop h, because the compaction c may have zeroed sequence numbers
  1993  		// of keys more recent than some of h's tombstones.
  1994  	}
  1995  	d.mu.compact.deletionHints = updatedHints
  1996  }
  1997  
  1998  func checkDeleteCompactionHints(
  1999  	cmp Compare, v *version, hints []deleteCompactionHint, snapshots []uint64,
  2000  ) ([]compactionLevel, []deleteCompactionHint) {
  2001  	var files map[*fileMetadata]bool
  2002  	var byLevel [numLevels][]*fileMetadata
  2003  
  2004  	unresolvedHints := hints[:0]
  2005  	for _, h := range hints {
  2006  		// Check each compaction hint to see if it's resolvable. Resolvable
  2007  		// hints are removed and trigger a delete-only compaction if any files
  2008  		// in the current LSM still meet their criteria. Unresolvable hints
  2009  		// are saved and don't trigger a delete-only compaction.
  2010  		//
  2011  		// When a compaction hint is created, the sequence numbers of the
  2012  		// range tombstones and the covered file with the oldest key are
  2013  		// recorded. The largest tombstone sequence number and the smallest
  2014  		// file sequence number must be in the same snapshot stripe for the
  2015  		// hint to be resolved. The below graphic models a compaction hint
  2016  		// covering the keyspace [b, r). The hint completely contains two
  2017  		// files, 000002 and 000003. The file 000003 contains the lowest
  2018  		// covered sequence number at #90. The tombstone b.RANGEDEL.230:h has
  2019  		// the highest tombstone sequence number incorporated into the hint.
  2020  		// The hint may be resolved only once the snapshots at #100, #180 and
  2021  		// #210 are all closed. File 000001 is not included within the hint
  2022  		// because it extends beyond the range tombstones in user key space.
  2023  		//
  2024  		// 250
  2025  		//
  2026  		//       |-b...230:h-|
  2027  		// _____________________________________________________ snapshot #210
  2028  		// 200               |--h.RANGEDEL.200:r--|
  2029  		//
  2030  		// _____________________________________________________ snapshot #180
  2031  		//
  2032  		// 150                     +--------+
  2033  		//           +---------+   | 000003 |
  2034  		//           | 000002  |   |        |
  2035  		//           +_________+   |        |
  2036  		// 100_____________________|________|___________________ snapshot #100
  2037  		//                         +--------+
  2038  		// _____________________________________________________ snapshot #70
  2039  		//                             +---------------+
  2040  		//  50                         | 000001        |
  2041  		//                             |               |
  2042  		//                             +---------------+
  2043  		// ______________________________________________________________
  2044  		//     a b c d e f g h i j k l m n o p q r s t u v w x y z
  2045  
  2046  		ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots)
  2047  		fi, _ := snapshotIndex(h.fileSmallestSeqNum, snapshots)
  2048  		if ti != fi {
  2049  			// Cannot resolve yet.
  2050  			unresolvedHints = append(unresolvedHints, h)
  2051  			continue
  2052  		}
  2053  
  2054  		// The hint h will be resolved and dropped, regardless of whether
  2055  		// there are any tables that can be deleted.
  2056  		for l := h.tombstoneLevel + 1; l < numLevels; l++ {
  2057  			overlaps := v.Overlaps(l, cmp, h.start, h.end, true /* exclusiveEnd */)
  2058  			iter := overlaps.Iter()
  2059  			for m := iter.First(); m != nil; m = iter.Next() {
  2060  				if m.IsCompacting() || !h.canDelete(cmp, m, snapshots) || files[m] {
  2061  					continue
  2062  				}
  2063  				if files == nil {
  2064  					// Construct files lazily, assuming most calls will not
  2065  					// produce delete-only compactions.
  2066  					files = make(map[*fileMetadata]bool)
  2067  				}
  2068  				files[m] = true
  2069  				byLevel[l] = append(byLevel[l], m)
  2070  			}
  2071  		}
  2072  	}
  2073  
  2074  	var compactLevels []compactionLevel
  2075  	for l, files := range byLevel {
  2076  		if len(files) == 0 {
  2077  			continue
  2078  		}
  2079  		compactLevels = append(compactLevels, compactionLevel{
  2080  			level: l,
  2081  			files: manifest.NewLevelSliceKeySorted(cmp, files),
  2082  		})
  2083  	}
  2084  	return compactLevels, unresolvedHints
  2085  }
  2086  
  2087  // compact runs one compaction and maybe schedules another call to compact.
  2088  func (d *DB) compact(c *compaction, errChannel chan error) {
  2089  	pprof.Do(context.Background(), compactLabels, func(context.Context) {
  2090  		d.mu.Lock()
  2091  		defer d.mu.Unlock()
  2092  		if err := d.compact1(c, errChannel); err != nil {
  2093  			// TODO(peter): count consecutive compaction errors and backoff.
  2094  			d.opts.EventListener.BackgroundError(err)
  2095  		}
  2096  		d.mu.compact.compactingCount--
  2097  		// The previous compaction may have produced too many files in a
  2098  		// level, so reschedule another compaction if needed.
  2099  		d.maybeScheduleCompaction()
  2100  		d.mu.compact.cond.Broadcast()
  2101  	})
  2102  }
  2103  
  2104  // compact1 runs one compaction.
  2105  //
  2106  // d.mu must be held when calling this, but the mutex may be dropped and
  2107  // re-acquired during the course of this method.
  2108  func (d *DB) compact1(c *compaction, errChannel chan error) (err error) {
  2109  	if errChannel != nil {
  2110  		defer func() {
  2111  			errChannel <- err
  2112  		}()
  2113  	}
  2114  
  2115  	jobID := d.mu.nextJobID
  2116  	d.mu.nextJobID++
  2117  	info := c.makeInfo(jobID)
  2118  	d.opts.EventListener.CompactionBegin(info)
  2119  	startTime := d.timeNow()
  2120  
  2121  	ve, pendingOutputs, err := d.runCompaction(jobID, c)
  2122  
  2123  	info.Duration = d.timeNow().Sub(startTime)
  2124  	if err == nil {
  2125  		d.mu.versions.logLock()
  2126  		err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false /* forceRotation */, func() []compactionInfo {
  2127  			return d.getInProgressCompactionInfoLocked(c)
  2128  		})
  2129  		if err != nil {
  2130  			// TODO(peter): untested.
  2131  			d.mu.versions.obsoleteTables = append(d.mu.versions.obsoleteTables, pendingOutputs...)
  2132  			d.mu.versions.incrementObsoleteTablesLocked(pendingOutputs)
  2133  		}
  2134  	}
  2135  
  2136  	info.Done = true
  2137  	info.Err = err
  2138  	if err == nil {
  2139  		for i := range ve.NewFiles {
  2140  			e := &ve.NewFiles[i]
  2141  			info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo())
  2142  		}
  2143  	}
  2144  
  2145  	d.maybeUpdateDeleteCompactionHints(c)
  2146  	d.removeInProgressCompaction(c, err != nil)
  2147  	d.mu.versions.incrementCompactions(c.kind, c.extraLevels)
  2148  	d.mu.versions.incrementCompactionBytes(-c.bytesWritten)
  2149  
  2150  	info.TotalDuration = d.timeNow().Sub(startTime)
  2151  	d.opts.EventListener.CompactionEnd(info)
  2152  
  2153  	// Update the read state before deleting obsolete files because the
  2154  	// read-state update will cause the previous version to be unref'd and if
  2155  	// there are no references obsolete tables will be added to the obsolete
  2156  	// table list.
  2157  	if err == nil {
  2158  		d.updateReadStateLocked(d.opts.DebugCheck)
  2159  		d.updateTableStatsLocked(ve.NewFiles)
  2160  	}
  2161  	d.deleteObsoleteFiles(jobID, true /* waitForOngoing */)
  2162  
  2163  	return err
  2164  }
  2165  
  2166  // runCompaction runs a compaction that produces new on-disk tables from
  2167  // memtables or old on-disk tables.
  2168  //
  2169  // d.mu must be held when calling this, but the mutex may be dropped and
  2170  // re-acquired during the course of this method.
  2171  func (d *DB) runCompaction(
  2172  	jobID int, c *compaction,
  2173  ) (ve *versionEdit, pendingOutputs []*fileMetadata, retErr error) {
  2174  	// As a sanity check, confirm that the smallest / largest keys for new and
  2175  	// deleted files in the new versionEdit pass a validation function before
  2176  	// returning the edit.
  2177  	defer func() {
  2178  		if ve != nil {
  2179  			err := validateVersionEdit(ve, d.opts.Experimental.KeyValidationFunc, d.opts.Comparer.FormatKey)
  2180  			if err != nil {
  2181  				d.opts.Logger.Fatalf("bitalostable: version edit validation failed: %s", err)
  2182  			}
  2183  		}
  2184  	}()
  2185  
  2186  	// Check for a delete-only compaction. This can occur when wide range
  2187  	// tombstones completely contain sstables.
  2188  	if c.kind == compactionKindDeleteOnly {
  2189  		c.metrics = make(map[int]*LevelMetrics, len(c.inputs))
  2190  		ve := &versionEdit{
  2191  			DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  2192  		}
  2193  		for _, cl := range c.inputs {
  2194  			levelMetrics := &LevelMetrics{}
  2195  			iter := cl.files.Iter()
  2196  			for f := iter.First(); f != nil; f = iter.Next() {
  2197  				levelMetrics.NumFiles--
  2198  				levelMetrics.Size -= int64(f.Size)
  2199  				ve.DeletedFiles[deletedFileEntry{
  2200  					Level:   cl.level,
  2201  					FileNum: f.FileNum,
  2202  				}] = f
  2203  			}
  2204  			c.metrics[cl.level] = levelMetrics
  2205  		}
  2206  		return ve, nil, nil
  2207  	}
  2208  
  2209  	// Check for a trivial move of one table from one level to the next. We avoid
  2210  	// such a move if there is lots of overlapping grandparent data. Otherwise,
  2211  	// the move could create a parent file that will require a very expensive
  2212  	// merge later on.
  2213  	if c.kind == compactionKindMove {
  2214  		iter := c.startLevel.files.Iter()
  2215  		meta := iter.First()
  2216  		c.metrics = map[int]*LevelMetrics{
  2217  			c.startLevel.level: {
  2218  				NumFiles: -1,
  2219  				Size:     -int64(meta.Size),
  2220  			},
  2221  			c.outputLevel.level: {
  2222  				NumFiles:    1,
  2223  				Size:        int64(meta.Size),
  2224  				BytesMoved:  meta.Size,
  2225  				TablesMoved: 1,
  2226  			},
  2227  		}
  2228  		ve := &versionEdit{
  2229  			DeletedFiles: map[deletedFileEntry]*fileMetadata{
  2230  				{Level: c.startLevel.level, FileNum: meta.FileNum}: meta,
  2231  			},
  2232  			NewFiles: []newFileEntry{
  2233  				{Level: c.outputLevel.level, Meta: meta},
  2234  			},
  2235  		}
  2236  		return ve, nil, nil
  2237  	}
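
        	// For example (illustrative file number and levels), moving table
        	// 000123 from L4 to L5 produces a versionEdit that deletes
        	// {Level: 4, FileNum: 000123} and re-adds the same *fileMetadata at
        	// level 5; no sstable bytes are rewritten.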
  2238  
  2239  	defer func() {
  2240  		if retErr != nil {
  2241  			pendingOutputs = nil
  2242  		}
  2243  	}()
  2244  
  2245  	snapshots := d.mu.snapshots.toSlice()
  2246  	formatVers := d.mu.formatVers.vers
  2247  	// The table is written at the maximum allowable format implied by the current
  2248  	// format major version of the DB.
  2249  	tableFormat := formatVers.MaxTableFormat()
  2250  
  2251  	// Release the d.mu lock while doing I/O.
  2252  	// Note the unusual order: Unlock and then Lock.
  2253  	d.mu.Unlock()
  2254  	defer d.mu.Lock()
  2255  
  2256  	iiter, err := c.newInputIter(d.newIters, d.tableNewRangeKeyIter, snapshots)
  2257  	if err != nil {
  2258  		return nil, pendingOutputs, err
  2259  	}
  2260  	c.allowedZeroSeqNum = c.allowZeroSeqNum()
  2261  	iter := newCompactionIter(c.cmp, c.equal, c.formatKey, d.merge, iiter, snapshots,
  2262  		&c.rangeDelFrag, &c.rangeKeyFrag, c.allowedZeroSeqNum, c.elideTombstone,
  2263  		c.elideRangeTombstone, d.FormatMajorVersion())
  2264  
  2265  	var (
  2266  		filenames []string
  2267  		tw        *sstable.Writer
  2268  	)
  2269  	defer func() {
  2270  		if iter != nil {
  2271  			retErr = firstError(retErr, iter.Close())
  2272  		}
  2273  		if tw != nil {
  2274  			retErr = firstError(retErr, tw.Close())
  2275  		}
  2276  		if retErr != nil {
  2277  			for _, filename := range filenames {
  2278  				d.opts.FS.Remove(filename)
  2279  			}
  2280  		}
  2281  		for _, closer := range c.closers {
  2282  			retErr = firstError(retErr, closer.Close())
  2283  		}
  2284  	}()
  2285  
  2286  	ve = &versionEdit{
  2287  		DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  2288  	}
  2289  
  2290  	outputMetrics := &LevelMetrics{
  2291  		BytesIn:   c.startLevel.files.SizeSum(),
  2292  		BytesRead: c.outputLevel.files.SizeSum(),
  2293  	}
  2294  	if len(c.extraLevels) > 0 {
  2295  		outputMetrics.BytesIn += c.extraLevels[0].files.SizeSum()
  2296  	}
  2297  	outputMetrics.BytesRead += outputMetrics.BytesIn
  2298  
  2299  	c.metrics = map[int]*LevelMetrics{
  2300  		c.outputLevel.level: outputMetrics,
  2301  	}
  2302  	if len(c.flushing) == 0 && c.metrics[c.startLevel.level] == nil {
  2303  		c.metrics[c.startLevel.level] = &LevelMetrics{}
  2304  	}
  2305  	if len(c.extraLevels) > 0 {
  2306  		c.metrics[c.extraLevels[0].level] = &LevelMetrics{}
  2307  	}
  2308  
  2309  	writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat)
  2310  	if formatVers < FormatBlockPropertyCollector {
  2311  		// Cannot yet write block properties.
  2312  		writerOpts.BlockPropertyCollectors = nil
  2313  	}
  2314  
  2315  	// prevPointKey is a sstable.WriterOption that provides access to
  2316  	// the last point key written to a writer's sstable. When a new
  2317  	// output begins in newOutput, prevPointKey is updated to point to
  2318  	// the new output's sstable.Writer. This allows the compaction loop
  2319  	// to access the last written point key without requiring the
  2320  	// compaction loop to make a copy of each key ahead of time. Users
  2321  	// must be careful, because the byte slice returned by UnsafeKey
  2322  	// points directly into the Writer's block buffer.
  2323  	var prevPointKey sstable.PreviousPointKeyOpt
  2324  	var additionalCPUProcs int
  2325  	defer func() {
  2326  		if additionalCPUProcs > 0 {
  2327  			d.opts.Experimental.CPUWorkPermissionGranter.ReturnProcs(additionalCPUProcs)
  2328  		}
  2329  	}()
  2330  
  2331  	newOutput := func() error {
  2332  		fileMeta := &fileMetadata{}
  2333  		d.mu.Lock()
  2334  		fileNum := d.mu.versions.getNextFileNum()
  2335  		fileMeta.FileNum = fileNum
  2336  		pendingOutputs = append(pendingOutputs, fileMeta)
  2337  		d.mu.Unlock()
  2338  
  2339  		filename := base.MakeFilepath(d.opts.FS, d.dirname, fileTypeTable, fileNum)
  2340  		file, err := d.opts.FS.Create(filename)
  2341  		if err != nil {
  2342  			return err
  2343  		}
  2344  		reason := "flushing"
  2345  		if c.flushing == nil {
  2346  			reason = "compacting"
  2347  		}
  2348  		d.opts.EventListener.TableCreated(TableCreateInfo{
  2349  			JobID:   jobID,
  2350  			Reason:  reason,
  2351  			Path:    filename,
  2352  			FileNum: fileNum,
  2353  		})
  2354  		file = vfs.NewSyncingFile(file, vfs.SyncingFileOptions{
  2355  			NoSyncOnClose: d.opts.NoSyncOnClose,
  2356  			BytesPerSync:  d.opts.BytesPerSync,
  2357  		})
  2358  		file = &compactionFile{
  2359  			File:     file,
  2360  			versions: d.mu.versions,
  2361  			written:  &c.bytesWritten,
  2362  		}
  2363  		filenames = append(filenames, filename)
  2364  		cacheOpts := private.SSTableCacheOpts(d.cacheID, fileNum).(sstable.WriterOption)
  2365  		internalTableOpt := private.SSTableInternalTableOpt.(sstable.WriterOption)
  2366  		if d.opts.Experimental.CPUWorkPermissionGranter != nil {
  2367  			additionalCPUProcs = d.opts.Experimental.CPUWorkPermissionGranter.TryGetProcs(1)
  2368  		}
  2369  		writerOpts.Parallelism =
  2370  			d.opts.Experimental.MaxWriterConcurrency > 0 &&
  2371  				(additionalCPUProcs > 0 || d.opts.Experimental.ForceWriterParallelism)
  2372  		tw = sstable.NewWriter(file, writerOpts, cacheOpts, internalTableOpt, &prevPointKey)
  2373  
  2374  		fileMeta.CreationTime = time.Now().Unix()
  2375  		ve.NewFiles = append(ve.NewFiles, newFileEntry{
  2376  			Level: c.outputLevel.level,
  2377  			Meta:  fileMeta,
  2378  		})
  2379  		return nil
  2380  	}
  2381  
  2382  	// splitL0Outputs is true during flushes and intra-L0 compactions with flush
  2383  	// splits enabled.
  2384  	splitL0Outputs := c.outputLevel.level == 0 && d.opts.FlushSplitBytes > 0
  2385  
  2386  	// finishOutput is called with a user key up to which all tombstones
  2387  	// should be flushed. Typically, this is the first key of the next
  2388  	// sstable or an empty key if this output is the final sstable.
  2389  	finishOutput := func(splitKey []byte) error {
  2390  		// If we haven't output any point records to the sstable (tw == nil) then the
  2391  		// sstable will only contain range tombstones and/or range keys. The smallest
  2392  		// key in the sstable will be the start key of the first range tombstone or
  2393  		// range key added. We need to ensure that this start key is distinct from
  2394  		// the splitKey passed to finishOutput (if set), otherwise we would generate
  2395  		// an sstable where the largest key is smaller than the smallest key due to
  2396  		// how the largest key boundary is set below. NB: It is permissible for the
  2397  		// range tombstone / range key start key to be the empty string.
  2398  		//
  2399  		// TODO: It is unfortunate that we have to do this check here rather than
  2400  		// when we decide to finish the sstable in the runCompaction loop. A better
  2401  		// structure currently eludes us.
  2402  		if tw == nil {
  2403  			startKey := c.rangeDelFrag.Start()
  2404  			if len(iter.tombstones) > 0 {
  2405  				startKey = iter.tombstones[0].Start
  2406  			}
  2407  			if startKey == nil {
  2408  				startKey = c.rangeKeyFrag.Start()
  2409  				if len(iter.rangeKeys) > 0 {
  2410  					startKey = iter.rangeKeys[0].Start
  2411  				}
  2412  			}
  2413  			if splitKey != nil && d.cmp(startKey, splitKey) == 0 {
  2414  				return nil
  2415  			}
  2416  		}
  2417  
  2418  		// NB: clone the key because the data can be held on to by the call to
  2419  		// compactionIter.Tombstones via keyspan.Fragmenter.FlushTo, and by the
  2420  		// WriterMetadata.LargestRangeDel.UserKey.
  2421  		splitKey = append([]byte(nil), splitKey...)
  2422  		for _, v := range iter.Tombstones(splitKey) {
  2423  			if tw == nil {
  2424  				if err := newOutput(); err != nil {
  2425  					return err
  2426  				}
  2427  			}
  2428  			// The tombstone being added could be completely outside the
  2429  			// eventual bounds of the sstable. Consider this example (bounds
  2430  			// in square brackets next to table filename):
  2431  			//
  2432  			// ./000240.sst   [tmgc#391,MERGE-tmgc#391,MERGE]
  2433  			// tmgc#391,MERGE [786e627a]
  2434  			// tmgc-udkatvs#331,RANGEDEL
  2435  			//
  2436  			// ./000241.sst   [tmgc#384,MERGE-tmgc#384,MERGE]
  2437  			// tmgc#384,MERGE [666c7070]
  2438  			// tmgc-tvsalezade#383,RANGEDEL
  2439  			// tmgc-tvsalezade#331,RANGEDEL
  2440  			//
  2441  			// ./000242.sst   [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL]
  2442  			// tmgc-tvsalezade#383,RANGEDEL
  2443  			// tmgc#375,SET [72646c78766965616c72776865676e79]
  2444  			// tmgc-tvsalezade#356,RANGEDEL
  2445  			//
  2446  			// Note that both of the top two SSTables have range tombstones
  2447  			// that start after the file's end keys. Since the file bound
  2448  			// computation happens well after all range tombstones have been
  2449  			// added to the writer, eliding out-of-file range tombstones based
  2450  			// on sequence number at this stage is difficult, and necessitates
  2451  			// read-time logic to ignore range tombstones outside file bounds.
  2452  			if err := rangedel.Encode(&v, tw.Add); err != nil {
  2453  				return err
  2454  			}
  2455  		}
  2456  		for _, v := range iter.RangeKeys(splitKey) {
  2457  			// Same logic as for range tombstones, except added using tw.AddRangeKey.
  2458  			if tw == nil {
  2459  				if err := newOutput(); err != nil {
  2460  					return err
  2461  				}
  2462  			}
  2463  			if err := rangekey.Encode(&v, tw.AddRangeKey); err != nil {
  2464  				return err
  2465  			}
  2466  		}
  2467  
  2468  		if tw == nil {
  2469  			return nil
  2470  		}
  2471  
  2472  		if err := tw.Close(); err != nil {
  2473  			tw = nil
  2474  			return err
  2475  		}
  2476  		if additionalCPUProcs > 0 {
  2477  			d.opts.Experimental.CPUWorkPermissionGranter.ReturnProcs(additionalCPUProcs)
  2478  			additionalCPUProcs = 0
  2479  		}
  2480  		writerMeta, err := tw.Metadata()
  2481  		if err != nil {
  2482  			tw = nil
  2483  			return err
  2484  		}
  2485  		tw = nil
  2486  		meta := ve.NewFiles[len(ve.NewFiles)-1].Meta
  2487  		meta.Size = writerMeta.Size
  2488  		meta.SmallestSeqNum = writerMeta.SmallestSeqNum
  2489  		meta.LargestSeqNum = writerMeta.LargestSeqNum
  2490  		// If the file didn't contain any range deletions, we can fill its
  2491  		// table stats now, avoiding unnecessarily loading the table later.
  2492  		maybeSetStatsFromProperties(meta, &writerMeta.Properties)
  2493  
  2494  		if c.flushing == nil {
  2495  			outputMetrics.TablesCompacted++
  2496  			outputMetrics.BytesCompacted += meta.Size
  2497  		} else {
  2498  			outputMetrics.TablesFlushed++
  2499  			outputMetrics.BytesFlushed += meta.Size
  2500  		}
  2501  		outputMetrics.Size += int64(meta.Size)
  2502  		outputMetrics.NumFiles++
  2503  
  2504  		if n := len(ve.NewFiles); n > 1 {
  2505  			// This is not the first output file. Ensure the sstable boundaries
  2506  			// are nonoverlapping.
  2507  			prevMeta := ve.NewFiles[n-2].Meta
  2508  			if writerMeta.SmallestRangeDel.UserKey != nil {
  2509  				c := d.cmp(writerMeta.SmallestRangeDel.UserKey, prevMeta.Largest.UserKey)
  2510  				if c < 0 {
  2511  					return errors.Errorf(
  2512  						"bitalostable: smallest range tombstone start key is less than previous sstable largest key: %s < %s",
  2513  						writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey),
  2514  						prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey))
  2515  				} else if c == 0 && !prevMeta.Largest.IsExclusiveSentinel() {
  2516  					// The user key portion of the range boundary start key is
  2517  					// equal to the previous table's largest key user key, and
  2518  					// the previous table's largest key is not exclusive. This
  2519  					// violates the invariant that tables are key-space
  2520  					// partitioned.
  2521  					return errors.Errorf(
  2522  						"bitalostable: invariant violation: previous sstable largest key %s, current sstable smallest rangedel: %s",
  2523  						prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey),
  2524  						writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey),
  2525  					)
  2526  				}
  2527  			}
  2528  		}
  2529  
  2530  		// Verify that all range deletions written to the sstable are
  2531  		// truncated to the split key.
  2532  		if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil &&
  2533  			d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) > 0 {
  2534  			return errors.Errorf(
  2535  				"bitalostable: invariant violation: rangedel largest key %q extends beyond split key %q",
  2536  				writerMeta.LargestRangeDel.Pretty(d.opts.Comparer.FormatKey),
  2537  				d.opts.Comparer.FormatKey(splitKey),
  2538  			)
  2539  		}
  2540  
  2541  		if writerMeta.HasPointKeys {
  2542  			meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestPoint, writerMeta.LargestPoint)
  2543  		}
  2544  		if writerMeta.HasRangeDelKeys {
  2545  			meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestRangeDel, writerMeta.LargestRangeDel)
  2546  		}
  2547  		if writerMeta.HasRangeKeys {
  2548  			meta.ExtendRangeKeyBounds(d.cmp, writerMeta.SmallestRangeKey, writerMeta.LargestRangeKey)
  2549  		}
  2550  
  2551  		// Verify that the sstable bounds fall within the compaction input
  2552  		// bounds. This is a sanity check that we don't have a logic error
  2553  		// elsewhere that causes the sstable bounds to accidentally expand past the
  2554  		// compaction input bounds, as doing so could lead to various badness,
  2555  		// such as keys being incorrectly deleted by a range tombstone.
  2556  		if c.smallest.UserKey != nil {
  2557  			switch v := d.cmp(meta.Smallest.UserKey, c.smallest.UserKey); {
  2558  			case v >= 0:
  2559  				// Nothing to do.
  2560  			case v < 0:
  2561  				return errors.Errorf("bitalostable: compaction output grew beyond bounds of input: %s < %s",
  2562  					meta.Smallest.Pretty(d.opts.Comparer.FormatKey),
  2563  					c.smallest.Pretty(d.opts.Comparer.FormatKey))
  2564  			}
  2565  		}
  2566  		if c.largest.UserKey != nil {
  2567  			switch v := d.cmp(meta.Largest.UserKey, c.largest.UserKey); {
  2568  			case v <= 0:
  2569  				// Nothing to do.
  2570  			case v > 0:
  2571  				return errors.Errorf("bitalostable: compaction output grew beyond bounds of input: %s > %s",
  2572  					meta.Largest.Pretty(d.opts.Comparer.FormatKey),
  2573  					c.largest.Pretty(d.opts.Comparer.FormatKey))
  2574  			}
  2575  		}
  2576  		// Verify that we never split different revisions of the same user key
  2577  		// across two different sstables.
  2578  		if err := c.errorOnUserKeyOverlap(ve); err != nil {
  2579  			return err
  2580  		}
  2581  		if err := meta.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
  2582  			return err
  2583  		}
  2584  		return nil
  2585  	}
  2586  
  2587  	// The compactionOutputSplitters contain all logic to determine whether
  2588  	// the compaction loop should stop writing to one output sstable and
  2589  	// switch to a new one. Some splitters can wrap other splitters, and a
  2590  	// splitterGroup can be composed of multiple splitters. Here we start
  2591  	// off with splitters for file sizes, grandparent limits, and (for L0
  2592  	// splits) L0 limits, before wrapping them in a splitterGroup.
  2593  	outputSplitters := []compactionOutputSplitter{
  2594  		// We do not split the same user key across different sstables within
  2595  		// one flush or compaction. The fileSizeSplitter may request a split in
  2596  		// the middle of a user key, so the userKeyChangeSplitter ensures we are
  2597  		// at a user key change boundary when doing a split.
  2598  		&userKeyChangeSplitter{
  2599  			cmp:      c.cmp,
  2600  			splitter: &fileSizeSplitter{maxFileSize: c.maxOutputFileSize},
  2601  			unsafePrevUserKey: func() []byte {
  2602  				// Return the largest point key written to tw or the start of
  2603  				// the current range deletion in the fragmenter, whichever is
  2604  				// greater.
  2605  				prevPoint := prevPointKey.UnsafeKey()
  2606  				if c.cmp(prevPoint.UserKey, c.rangeDelFrag.Start()) > 0 {
  2607  					return prevPoint.UserKey
  2608  				}
  2609  				return c.rangeDelFrag.Start()
  2610  			},
  2611  		},
  2612  		&limitFuncSplitter{c: c, limitFunc: c.findGrandparentLimit},
  2613  	}
  2614  	if splitL0Outputs {
  2615  		outputSplitters = append(outputSplitters, &limitFuncSplitter{c: c, limitFunc: c.findL0Limit})
  2616  	}
  2617  	splitter := &splitterGroup{cmp: c.cmp, splitters: outputSplitters}
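        	// In rough terms, the composed splitterGroup behaves like the sketch
        	// below (illustrative, not the exact implementation): a split happens
        	// as soon as any child splitter requests one, and the suggested limit
        	// is the smallest non-nil limit among the children.
        	//
        	//	for _, s := range g.splitters {
        	//		if s.shouldSplitBefore(key, tw) == splitNow {
        	//			return splitNow
        	//		}
        	//	}
        	//	return noSplit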
  2618  
  2619  	// Each outer loop iteration produces one output file. An iteration that
  2620  	// produces a file containing point keys (and optionally range tombstones)
  2621  	// guarantees that the input iterator advanced. An iteration that produces
  2622  	// a file containing only range tombstones guarantees the limit passed to
  2623  	// `finishOutput()` advanced to a strictly greater user key corresponding
  2624  	// to a grandparent file largest key, or nil. Taken together, these
  2625  	// progress guarantees ensure that eventually the input iterator will be
  2626  	// exhausted and the range tombstone fragments will all be flushed.
  2627  	for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty() || !c.rangeKeyFrag.Empty(); {
  2628  		splitterSuggestion := splitter.onNewOutput(key)
  2629  
  2630  		// Each inner loop iteration processes one key from the input iterator.
  2631  		for ; key != nil; key, val = iter.Next() {
  2632  			if split := splitter.shouldSplitBefore(key, tw); split == splitNow {
  2633  				break
  2634  			}
  2635  
  2636  			switch key.Kind() {
  2637  			case InternalKeyKindRangeDelete:
  2638  				// Range tombstones are handled specially. They are fragmented,
  2639  				// and they're not written until later during `finishOutput()`.
  2640  				// We add them to the `Fragmenter` now to make them visible to
  2641  				// `compactionIter` so covered keys in the same snapshot stripe
  2642  				// can be elided.
  2643  
  2644  				// The interleaved range deletion might only be one of many with
  2645  				// these bounds. Some fragmenting is performed ahead of time by
  2646  				// keyspan.MergingIter.
  2647  				if s := c.rangeDelIter.Span(); !s.Empty() {
  2648  					// The memory management here is subtle. Range deletions
  2649  					// blocks do NOT use prefix compression, which ensures that
  2650  					// range deletion spans' memory is available as long we keep
  2651  					// the iterator open. However, the keyspan.MergingIter that
  2652  					// merges spans across levels only guarantees the lifetime
  2653  					// of the [start, end) bounds until the next positioning
  2654  					// method is called.
  2655  					//
  2656  					// Additionally, the Span.Keys slice is owned by the
  2657  					// range deletion iterator stack, and it may be overwritten
  2658  					// when we advance.
  2659  					//
  2660  					// Clone the Keys slice and the start and end keys.
  2661  					//
  2662  					// TODO(jackson): Avoid the clone by removing c.rangeDelFrag
  2663  					// and performing explicit truncation of the pending
  2664  					// rangedel span as necessary.
  2665  					clone := keyspan.Span{
  2666  						Start: iter.cloneKey(s.Start),
  2667  						End:   iter.cloneKey(s.End),
  2668  						Keys:  make([]keyspan.Key, len(s.Keys)),
  2669  					}
  2670  					copy(clone.Keys, s.Keys)
  2671  					c.rangeDelFrag.Add(clone)
  2672  				}
  2673  				continue
  2674  			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  2675  				// Range keys are handled in the same way as range tombstones, except
  2676  				// with a dedicated fragmenter.
  2677  				if s := c.rangeKeyInterleaving.Span(); !s.Empty() {
  2678  					clone := keyspan.Span{
  2679  						Start: iter.cloneKey(s.Start),
  2680  						End:   iter.cloneKey(s.End),
  2681  						Keys:  make([]keyspan.Key, len(s.Keys)),
  2682  					}
  2683  					// Since the keys' Suffix and Value fields are not deep cloned, the
  2684  					// underlying blockIter must be kept open for the lifetime of the
  2685  					// compaction.
  2686  					copy(clone.Keys, s.Keys)
  2687  					c.rangeKeyFrag.Add(clone)
  2688  				}
  2689  				continue
  2690  			}
  2691  			if tw == nil {
  2692  				if err := newOutput(); err != nil {
  2693  					return nil, pendingOutputs, err
  2694  				}
  2695  			}
  2696  
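        			// Point Sets that KvCheckExpireFunc reports as expired are
        			// rewritten as Deletes, so the stale value is dropped and the
        			// key can later be elided entirely. A hypothetical expiry hook
        			// (a sketch only; how expiry is encoded in values is defined by
        			// the embedding store, not by this file):
        			//
        			//	opts.KvCheckExpireFunc = func(key, value []byte) bool {
        			//		return len(value) >= 8 && // assumed: 8-byte big-endian expiry prefix
        			//			binary.BigEndian.Uint64(value[:8]) < uint64(time.Now().UnixMilli())
        			//	}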
  2697  			if key.Kind() == InternalKeyKindSet {
  2698  				if d.opts.KvCheckExpireFunc(key.UserKey, val) {
  2699  					key.SetKind(InternalKeyKindDelete)
  2700  					val = nil
  2701  				}
  2702  			}
  2703  
  2704  			if err := tw.Add(*key, val); err != nil {
  2705  				return nil, pendingOutputs, err
  2706  			}
  2707  		}
  2708  
  2709  		// A splitter requested a split, and we're ready to finish the output.
  2710  		// We need to choose the key at which to split any pending range
  2711  		// tombstones. There are two options:
  2712  		// 1. splitterSuggestion — The key suggested by the splitter. This key
  2713  		//    is guaranteed to be greater than the last key written to the
  2714  		//    current output.
  2715  		// 2. key.UserKey — the first key of the next sstable output. This user
  2716  		//    key is also guaranteed to be greater than the last user key
  2717  		//    written to the current output (see userKeyChangeSplitter).
  2718  		//
  2719  		// Use whichever is smaller. Using the smaller of the two limits
  2720  		// overlap with grandparents. Consider the case where the
  2721  		// grandparent limit is calculated to be 'b', key is 'x', and
  2722  		// there exist many sstables between 'b' and 'x'. If the range
  2723  		// deletion fragmenter has a pending tombstone [a,x), splitting
  2724  		// at 'x' would cause the output table to overlap many
  2725  		// grandparents well beyond the calculated grandparent limit
  2726  		// 'b'. Splitting at the smaller `splitterSuggestion` avoids
  2727  		// this unbounded overlap with grandparent tables.
  2728  		splitKey := splitterSuggestion
  2729  		if key != nil && (splitKey == nil || c.cmp(splitKey, key.UserKey) > 0) {
  2730  			splitKey = key.UserKey
  2731  		}
  2732  		if err := finishOutput(splitKey); err != nil {
  2733  			return nil, pendingOutputs, err
  2734  		}
  2735  	}
  2736  
  2737  	for _, cl := range c.inputs {
  2738  		iter := cl.files.Iter()
  2739  		for f := iter.First(); f != nil; f = iter.Next() {
  2740  			c.metrics[cl.level].NumFiles--
  2741  			c.metrics[cl.level].Size -= int64(f.Size)
  2742  			ve.DeletedFiles[deletedFileEntry{
  2743  				Level:   cl.level,
  2744  				FileNum: f.FileNum,
  2745  			}] = f
  2746  		}
  2747  	}
  2748  
  2749  	if err := d.dataDir.Sync(); err != nil {
  2750  		return nil, pendingOutputs, err
  2751  	}
  2752  
  2753  	// Refresh the disk available statistic whenever a compaction/flush
  2754  	// completes, before re-acquiring the mutex.
  2755  	_ = d.calculateDiskAvailableBytes()
  2756  
  2757  	return ve, pendingOutputs, nil
  2758  }
  2759  
  2760  // validateVersionEdit validates that start and end keys across new and deleted
  2761  // files in a versionEdit pass the given validation function.
  2762  func validateVersionEdit(
  2763  	ve *versionEdit, validateFn func([]byte) error, format base.FormatKey,
  2764  ) error {
  2765  	validateMetaFn := func(f *manifest.FileMetadata) error {
  2766  		for _, key := range []InternalKey{f.Smallest, f.Largest} {
  2767  			if err := validateFn(key.UserKey); err != nil {
  2768  				return errors.Wrapf(err, "key=%q; file=%s", format(key.UserKey), f)
  2769  			}
  2770  		}
  2771  		return nil
  2772  	}
  2773  
  2774  	// Validate both new and deleted files.
  2775  	for _, f := range ve.NewFiles {
  2776  		if err := validateMetaFn(f.Meta); err != nil {
  2777  			return err
  2778  		}
  2779  	}
  2780  	for _, m := range ve.DeletedFiles {
  2781  		if err := validateMetaFn(m); err != nil {
  2782  			return err
  2783  		}
  2784  	}
  2785  
  2786  	return nil
  2787  }
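        // A minimal usage sketch for validateVersionEdit (the validateFn here is
        // hypothetical; any per-user-key check can be plugged in):
        //
        //	err := validateVersionEdit(ve, func(k []byte) error {
        //		if len(k) == 0 {
        //			return errors.New("empty user key")
        //		}
        //		return nil
        //	}, d.opts.Comparer.FormatKey)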
  2788  
  2789  // scanObsoleteFiles scans the filesystem for files that are no longer needed
  2790  // and adds those to the internal lists of obsolete files. Note that the files
  2791  // are not actually deleted by this method. A subsequent call to
  2792  // deleteObsoleteFiles must be performed. It must not be called concurrently
  2793  // with compactions and flushes. db.mu must be held when calling this function.
  2794  func (d *DB) scanObsoleteFiles(list []string) {
  2795  	// Disable automatic compactions temporarily to avoid concurrent compactions /
  2796  	// flushes from interfering. The original value is restored on completion.
  2797  	disabledPrev := d.opts.DisableAutomaticCompactions
  2798  	defer func() {
  2799  		d.opts.DisableAutomaticCompactions = disabledPrev
  2800  	}()
  2801  	d.opts.DisableAutomaticCompactions = true
  2802  
  2803  	// Wait for any ongoing compactions and flushes to complete before continuing.
  2804  	for d.mu.compact.compactingCount > 0 || d.mu.compact.flushing {
  2805  		d.mu.compact.cond.Wait()
  2806  	}
  2807  
  2808  	liveFileNums := make(map[FileNum]struct{})
  2809  	d.mu.versions.addLiveFileNums(liveFileNums)
  2810  	minUnflushedLogNum := d.mu.versions.minUnflushedLogNum
  2811  	manifestFileNum := d.mu.versions.manifestFileNum
  2812  
  2813  	var obsoleteLogs []fileInfo
  2814  	var obsoleteTables []*fileMetadata
  2815  	var obsoleteManifests []fileInfo
  2816  	var obsoleteOptions []fileInfo
  2817  
  2818  	for _, filename := range list {
  2819  		fileType, fileNum, ok := base.ParseFilename(d.opts.FS, filename)
  2820  		if !ok {
  2821  			continue
  2822  		}
  2823  		switch fileType {
  2824  		case fileTypeLog:
  2825  			if fileNum >= minUnflushedLogNum {
  2826  				continue
  2827  			}
  2828  			fi := fileInfo{fileNum: fileNum}
  2829  			if stat, err := d.opts.FS.Stat(filename); err == nil {
  2830  				fi.fileSize = uint64(stat.Size())
  2831  			}
  2832  			obsoleteLogs = append(obsoleteLogs, fi)
  2833  		case fileTypeManifest:
  2834  			if fileNum >= manifestFileNum {
  2835  				continue
  2836  			}
  2837  			fi := fileInfo{fileNum: fileNum}
  2838  			if stat, err := d.opts.FS.Stat(filename); err == nil {
  2839  				fi.fileSize = uint64(stat.Size())
  2840  			}
  2841  			obsoleteManifests = append(obsoleteManifests, fi)
  2842  		case fileTypeOptions:
  2843  			if fileNum >= d.optionsFileNum {
  2844  				continue
  2845  			}
  2846  			fi := fileInfo{fileNum: fileNum}
  2847  			if stat, err := d.opts.FS.Stat(filename); err == nil {
  2848  				fi.fileSize = uint64(stat.Size())
  2849  			}
  2850  			obsoleteOptions = append(obsoleteOptions, fi)
  2851  		case fileTypeTable:
  2852  			if _, ok := liveFileNums[fileNum]; ok {
  2853  				continue
  2854  			}
  2855  			fileMeta := &fileMetadata{
  2856  				FileNum: fileNum,
  2857  			}
  2858  			if stat, err := d.opts.FS.Stat(filename); err == nil {
  2859  				fileMeta.Size = uint64(stat.Size())
  2860  			}
  2861  			obsoleteTables = append(obsoleteTables, fileMeta)
  2862  		default:
  2863  			// Don't delete files we don't know about.
  2864  			continue
  2865  		}
  2866  	}
  2867  
  2868  	d.mu.log.queue = merge(d.mu.log.queue, obsoleteLogs)
  2869  	d.mu.versions.metrics.WAL.Files += int64(len(obsoleteLogs))
  2870  	d.mu.versions.obsoleteTables = mergeFileMetas(d.mu.versions.obsoleteTables, obsoleteTables)
  2871  	d.mu.versions.incrementObsoleteTablesLocked(obsoleteTables)
  2872  	d.mu.versions.obsoleteManifests = merge(d.mu.versions.obsoleteManifests, obsoleteManifests)
  2873  	d.mu.versions.obsoleteOptions = merge(d.mu.versions.obsoleteOptions, obsoleteOptions)
  2874  }
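        // For example, base.ParseFilename classifies directory entries by name:
        //
        //	fileType, fileNum, ok := base.ParseFilename(d.opts.FS, "000007.sst")
        //	// fileType == fileTypeTable, fileNum == 7, ok == true
        //
        // A table becomes obsolete only when its number is absent from
        // liveFileNums; a WAL only when its number is below minUnflushedLogNum.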
  2875  
  2876  // disableFileDeletions disables file deletions and then waits for any
  2877  // in-progress deletion to finish. The caller is required to call
  2878  // enableFileDeletions in order to enable file deletions again. It is ok for
  2879  // multiple callers to disable file deletions simultaneously, though they must
  2880  // all invoke enableFileDeletions in order for file deletions to be re-enabled
  2881  // (there is an internal reference count on file deletion disablement).
  2882  //
  2883  // d.mu must be held when calling this method.
  2884  func (d *DB) disableFileDeletions() {
  2885  	d.mu.cleaner.disabled++
  2886  	for d.mu.cleaner.cleaning {
  2887  		d.mu.cleaner.cond.Wait()
  2888  	}
  2889  	d.mu.cleaner.cond.Broadcast()
  2890  }
  2891  
  2892  // enableFileDeletions enables previously disabled file deletions. Note that
  2893  // if this call re-enables file deletions (i.e. the disable count drops to
  2894  // zero), the current goroutine is used to perform the queued-up deletions.
  2895  //
  2896  // d.mu must be held when calling this method.
  2897  func (d *DB) enableFileDeletions() {
  2898  	if d.mu.cleaner.disabled <= 0 || d.mu.cleaner.cleaning {
  2899  		panic("bitalostable: file deletion disablement invariant violated")
  2900  	}
  2901  	d.mu.cleaner.disabled--
  2902  	if d.mu.cleaner.disabled > 0 {
  2903  		return
  2904  	}
  2905  	jobID := d.mu.nextJobID
  2906  	d.mu.nextJobID++
  2907  	d.deleteObsoleteFiles(jobID, true /* waitForOngoing */)
  2908  }
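        // A sketch of the intended pairing (d.mu must be held around each call;
        // the work in between is illustrative):
        //
        //	d.mu.Lock()
        //	d.disableFileDeletions()
        //	d.mu.Unlock()
        //	// ... work that must not race with file deletion ...
        //	d.mu.Lock()
        //	d.enableFileDeletions() // runs queued deletions once the count drops to 0
        //	d.mu.Unlock()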
  2909  
  2910  // d.mu must be held when calling this.
  2911  func (d *DB) acquireCleaningTurn(waitForOngoing bool) bool {
  2912  	// Only allow a single delete obsolete files job to run at a time.
  2913  	for d.mu.cleaner.cleaning && d.mu.cleaner.disabled == 0 && waitForOngoing {
  2914  		d.mu.cleaner.cond.Wait()
  2915  	}
  2916  	if d.mu.cleaner.cleaning {
  2917  		return false
  2918  	}
  2919  	if d.mu.cleaner.disabled > 0 {
  2920  		// File deletions are currently disabled. When they are re-enabled a new
  2921  		// job will be created to catch up on file deletions.
  2922  		return false
  2923  	}
  2924  	d.mu.cleaner.cleaning = true
  2925  	return true
  2926  }
  2927  
  2928  // d.mu must be held when calling this.
  2929  func (d *DB) releaseCleaningTurn() {
  2930  	d.mu.cleaner.cleaning = false
  2931  	d.mu.cleaner.cond.Broadcast()
  2932  }
  2933  
  2934  // deleteObsoleteFiles deletes those files that are no longer needed. If
  2935  // waitForOngoing is true, it waits for any ongoing cleaning turns to complete,
  2936  // and if false, it returns right away if a cleaning turn is ongoing.
  2937  //
  2938  // d.mu must be held when calling this, but the mutex may be dropped and
  2939  // re-acquired during the course of this method.
  2940  func (d *DB) deleteObsoleteFiles(jobID int, waitForOngoing bool) {
  2941  	if !d.acquireCleaningTurn(waitForOngoing) {
  2942  		return
  2943  	}
  2944  	d.doDeleteObsoleteFiles(jobID)
  2945  	d.releaseCleaningTurn()
  2946  }
  2947  
  2948  // obsoleteFile holds information about a file that needs to be deleted soon.
  2949  type obsoleteFile struct {
  2950  	dir      string
  2951  	fileNum  base.FileNum
  2952  	fileType fileType
  2953  	fileSize uint64
  2954  }
  2955  
  2956  type fileInfo struct {
  2957  	fileNum  FileNum
  2958  	fileSize uint64
  2959  }
  2960  
  2961  // d.mu must be held when calling this, but the mutex may be dropped and
  2962  // re-acquired during the course of this method.
  2963  func (d *DB) doDeleteObsoleteFiles(jobID int) {
  2964  	var obsoleteTables []fileInfo
  2965  
  2966  	defer func() {
  2967  		for _, tbl := range obsoleteTables {
  2968  			delete(d.mu.versions.zombieTables, tbl.fileNum)
  2969  		}
  2970  	}()
  2971  
  2972  	var obsoleteLogs []fileInfo
  2973  	for i := range d.mu.log.queue {
  2974  		// NB: d.mu.versions.minUnflushedLogNum is the log number of the earliest
  2975  		// log that has not had its contents flushed to an sstable. We can recycle
  2976  		// the prefix of d.mu.log.queue with log numbers less than
  2977  		// minUnflushedLogNum.
  2978  		if d.mu.log.queue[i].fileNum >= d.mu.versions.minUnflushedLogNum {
  2979  			obsoleteLogs = d.mu.log.queue[:i]
  2980  			d.mu.log.queue = d.mu.log.queue[i:]
  2981  			d.mu.versions.metrics.WAL.Files -= int64(len(obsoleteLogs))
  2982  			break
  2983  		}
  2984  	}
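        	// For example, with queue fileNums [3, 4, 7] and minUnflushedLogNum
        	// == 7, logs 3 and 4 become obsolete and the queue is truncated to
        	// [7]. The live WAL always appears in the queue with a number >=
        	// minUnflushedLogNum, so the loop above always finds its break point.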
  2985  
  2986  	for _, table := range d.mu.versions.obsoleteTables {
  2987  		obsoleteTables = append(obsoleteTables, fileInfo{
  2988  			fileNum:  table.FileNum,
  2989  			fileSize: table.Size,
  2990  		})
  2991  	}
  2992  	d.mu.versions.obsoleteTables = nil
  2993  
  2994  	// Sort the manifests because we want to delete some contiguous prefix
  2995  	// of the older manifests.
  2996  	sort.Slice(d.mu.versions.obsoleteManifests, func(i, j int) bool {
  2997  		return d.mu.versions.obsoleteManifests[i].fileNum <
  2998  			d.mu.versions.obsoleteManifests[j].fileNum
  2999  	})
  3000  
  3001  	var obsoleteManifests []fileInfo
  3002  	manifestsToDelete := len(d.mu.versions.obsoleteManifests) - d.opts.NumPrevManifest
  3003  	if manifestsToDelete > 0 {
  3004  		obsoleteManifests = d.mu.versions.obsoleteManifests[:manifestsToDelete]
  3005  		d.mu.versions.obsoleteManifests = d.mu.versions.obsoleteManifests[manifestsToDelete:]
  3006  		if len(d.mu.versions.obsoleteManifests) == 0 {
  3007  			d.mu.versions.obsoleteManifests = nil
  3008  		}
  3009  	}
  3010  
  3011  	obsoleteOptions := d.mu.versions.obsoleteOptions
  3012  	d.mu.versions.obsoleteOptions = nil
  3013  
  3014  	// Release d.mu while doing I/O.
  3015  	// Note the unusual order: Unlock and then Lock.
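        	// d.mu is held on entry, so the Unlock below releases it for the
        	// deletion I/O, and the deferred Lock re-acquires it before this
        	// method returns, preserving the caller's locking contract.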
  3016  	d.mu.Unlock()
  3017  	defer d.mu.Lock()
  3018  
  3019  	files := [4]struct {
  3020  		fileType fileType
  3021  		obsolete []fileInfo
  3022  	}{
  3023  		{fileTypeLog, obsoleteLogs},
  3024  		{fileTypeTable, obsoleteTables},
  3025  		{fileTypeManifest, obsoleteManifests},
  3026  		{fileTypeOptions, obsoleteOptions},
  3027  	}
  3028  	_, noRecycle := d.opts.Cleaner.(base.NeedsFileContents)
  3029  	filesToDelete := make([]obsoleteFile, 0, len(files))
  3030  	for _, f := range files {
  3031  		// We sort to make the order of deletions deterministic, which is nice for
  3032  		// tests.
  3033  		sort.Slice(f.obsolete, func(i, j int) bool {
  3034  			return f.obsolete[i].fileNum < f.obsolete[j].fileNum
  3035  		})
  3036  		for _, fi := range f.obsolete {
  3037  			dir := d.dirname
  3038  			switch f.fileType {
  3039  			case fileTypeLog:
  3040  				if !noRecycle && d.logRecycler.add(fi) {
  3041  					continue
  3042  				}
  3043  				dir = d.walDirname
  3044  			case fileTypeTable:
  3045  				d.tableCache.evict(fi.fileNum)
  3046  			}
  3047  
  3048  			filesToDelete = append(filesToDelete, obsoleteFile{
  3049  				dir:      dir,
  3050  				fileNum:  fi.fileNum,
  3051  				fileType: f.fileType,
  3052  				fileSize: fi.fileSize,
  3053  			})
  3054  		}
  3055  	}
  3056  	if len(filesToDelete) > 0 {
  3057  		d.deleters.Add(1)
  3058  		// Delete asynchronously if the deletion could get held up in the pacer.
  3059  		if d.opts.Experimental.MinDeletionRate > 0 {
  3060  			go d.paceAndDeleteObsoleteFiles(jobID, filesToDelete)
  3061  		} else {
  3062  			d.paceAndDeleteObsoleteFiles(jobID, filesToDelete)
  3063  		}
  3064  	}
  3065  }
  3066  
  3067  // Paces and eventually deletes the list of obsolete files passed in. db.mu
  3068  // must NOT be held when calling this method.
  3069  func (d *DB) paceAndDeleteObsoleteFiles(jobID int, files []obsoleteFile) {
  3070  	defer d.deleters.Done()
  3071  	pacer := (pacer)(nilPacer)
  3072  	if d.opts.Experimental.MinDeletionRate > 0 {
  3073  		pacer = newDeletionPacer(d.deletionLimiter, d.getDeletionPacerInfo)
  3074  	}
  3075  
  3076  	for _, of := range files {
  3077  		path := base.MakeFilepath(d.opts.FS, of.dir, of.fileType, of.fileNum)
  3078  		if of.fileType == fileTypeTable {
  3079  			_ = pacer.maybeThrottle(of.fileSize)
  3080  			d.mu.Lock()
  3081  			d.mu.versions.metrics.Table.ObsoleteCount--
  3082  			d.mu.versions.metrics.Table.ObsoleteSize -= of.fileSize
  3083  			d.mu.Unlock()
  3084  		}
  3085  		d.deleteObsoleteFile(of.fileType, jobID, path, of.fileNum)
  3086  	}
  3087  }
  3088  
  3089  func (d *DB) maybeScheduleObsoleteTableDeletion() {
  3090  	d.mu.Lock()
  3091  	defer d.mu.Unlock()
  3092  
  3093  	if len(d.mu.versions.obsoleteTables) == 0 {
  3094  		return
  3095  	}
  3096  	if !d.acquireCleaningTurn(false) {
  3097  		return
  3098  	}
  3099  
  3100  	go func() {
  3101  		pprof.Do(context.Background(), gcLabels, func(context.Context) {
  3102  			d.mu.Lock()
  3103  			defer d.mu.Unlock()
  3104  
  3105  			jobID := d.mu.nextJobID
  3106  			d.mu.nextJobID++
  3107  			d.doDeleteObsoleteFiles(jobID)
  3108  			d.releaseCleaningTurn()
  3109  		})
  3110  	}()
  3111  }
  3112  
  3113  // deleteObsoleteFile deletes a file that is no longer needed.
  3114  func (d *DB) deleteObsoleteFile(fileType fileType, jobID int, path string, fileNum FileNum) {
  3115  	// TODO(peter): need to handle this error, probably by re-adding the
  3116  // file that couldn't be deleted to one of the obsolete file slices.
  3117  	err := d.opts.Cleaner.Clean(d.opts.FS, fileType, path)
  3118  	if oserror.IsNotExist(err) {
  3119  		return
  3120  	}
  3121  
  3122  	switch fileType {
  3123  	case fileTypeLog:
  3124  		d.opts.EventListener.WALDeleted(WALDeleteInfo{
  3125  			JobID:   jobID,
  3126  			Path:    path,
  3127  			FileNum: fileNum,
  3128  			Err:     err,
  3129  		})
  3130  	case fileTypeManifest:
  3131  		d.opts.EventListener.ManifestDeleted(ManifestDeleteInfo{
  3132  			JobID:   jobID,
  3133  			Path:    path,
  3134  			FileNum: fileNum,
  3135  			Err:     err,
  3136  		})
  3137  	case fileTypeTable:
  3138  		d.opts.EventListener.TableDeleted(TableDeleteInfo{
  3139  			JobID:   jobID,
  3140  			Path:    path,
  3141  			FileNum: fileNum,
  3142  			Err:     err,
  3143  		})
  3144  	}
  3145  }
  3146  
  3147  func merge(a, b []fileInfo) []fileInfo {
  3148  	if len(b) == 0 {
  3149  		return a
  3150  	}
  3151  
  3152  	a = append(a, b...)
  3153  	sort.Slice(a, func(i, j int) bool {
  3154  		return a[i].fileNum < a[j].fileNum
  3155  	})
  3156  
  3157  	n := 0
  3158  	for i := 0; i < len(a); i++ {
  3159  		if n == 0 || a[i].fileNum != a[n-1].fileNum {
  3160  			a[n] = a[i]
  3161  			n++
  3162  		}
  3163  	}
  3164  	return a[:n]
  3165  }
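        // A small worked example of merge (illustrative values):
        //
        //	a := []fileInfo{{fileNum: 1}, {fileNum: 5}}
        //	b := []fileInfo{{fileNum: 3}, {fileNum: 5}}
        //	m := merge(a, b)
        //	// m contains fileNums 1, 3, 5: sorted, duplicates removed.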
  3166  
  3167  func mergeFileMetas(a, b []*fileMetadata) []*fileMetadata {
  3168  	if len(b) == 0 {
  3169  		return a
  3170  	}
  3171  
  3172  	a = append(a, b...)
  3173  	sort.Slice(a, func(i, j int) bool {
  3174  		return a[i].FileNum < a[j].FileNum
  3175  	})
  3176  
  3177  	n := 0
  3178  	for i := 0; i < len(a); i++ {
  3179  		if n == 0 || a[i].FileNum != a[n-1].FileNum {
  3180  			a[n] = a[i]
  3181  			n++
  3182  		}
  3183  	}
  3184  	return a[:n]
  3185  }
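        // merge and mergeFileMetas share the same sort-and-dedup shape; with Go
        // 1.18+ generics the pattern could be factored out along these lines (a
        // sketch, not part of this package):
        //
        //	func mergeBy[T any](a, b []T, num func(T) FileNum) []T {
        //		if len(b) == 0 {
        //			return a
        //		}
        //		a = append(a, b...)
        //		sort.Slice(a, func(i, j int) bool { return num(a[i]) < num(a[j]) })
        //		n := 0
        //		for i := range a {
        //			if n == 0 || num(a[i]) != num(a[n-1]) {
        //				a[n] = a[i]
        //				n++
        //			}
        //		}
        //		return a[:n]
        //	}
        //
        // Usage: merge(a, b) is then mergeBy(a, b, func(f fileInfo) FileNum { return f.fileNum }).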