github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/compaction.go

     1  // Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"bytes"
     9  	"cmp"
    10  	"context"
    11  	"fmt"
    12  	"io"
    13  	"math"
    14  	"runtime/pprof"
    15  	"slices"
    16  	"sort"
    17  	"sync/atomic"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/errors"
    21  	"github.com/cockroachdb/pebble/internal/base"
    22  	"github.com/cockroachdb/pebble/internal/invalidating"
    23  	"github.com/cockroachdb/pebble/internal/invariants"
    24  	"github.com/cockroachdb/pebble/internal/keyspan"
    25  	"github.com/cockroachdb/pebble/internal/manifest"
    26  	"github.com/cockroachdb/pebble/internal/private"
    27  	"github.com/cockroachdb/pebble/internal/rangedel"
    28  	"github.com/cockroachdb/pebble/internal/rangekey"
    29  	"github.com/cockroachdb/pebble/objstorage"
    30  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
    31  	"github.com/cockroachdb/pebble/objstorage/remote"
    32  	"github.com/cockroachdb/pebble/sstable"
    33  	"github.com/cockroachdb/pebble/vfs"
    34  )
    35  
    36  var errEmptyTable = errors.New("pebble: empty table")
    37  
    38  // ErrCancelledCompaction is returned if a compaction is cancelled by a
    39  // concurrent excise or ingest-split operation.
    40  var ErrCancelledCompaction = errors.New("pebble: compaction cancelled by a concurrent operation, will retry compaction")
    41  
    42  var compactLabels = pprof.Labels("pebble", "compact")
    43  var flushLabels = pprof.Labels("pebble", "flush")
    44  var gcLabels = pprof.Labels("pebble", "gc")
    45  
    46  // getInternalWriterProperties accesses a private variable (in the
    47  // internal/private package) initialized by the sstable Writer. This indirection
    48  // is necessary to ensure non-Pebble users constructing sstables for ingestion
    49  // are unable to set internal-only properties.
    50  var getInternalWriterProperties = private.SSTableInternalProperties.(func(*sstable.Writer) *sstable.Properties)
    51  
    52  // expandedCompactionByteSizeLimit is the maximum number of bytes in all
    53  // compacted files. We avoid expanding the lower level file set of a compaction
    54  // if it would make the total compaction cover more than this many bytes.
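         //
         // For illustration (not a prescriptive example): with a 2 MiB target file
         // size for the level, the base limit is 25 * 2 MiB = 50 MiB; with 500 GiB
         // available and 3 concurrent compactions allowed, the disk-based cap is
         // (500 GiB / 2) / 3 ≈ 83 GiB, so the 50 MiB figure wins.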
    55  func expandedCompactionByteSizeLimit(opts *Options, level int, availBytes uint64) uint64 {
    56  	v := uint64(25 * opts.Level(level).TargetFileSize)
    57  
    58  	// Never expand a compaction beyond half the available capacity, divided
    59  	// by the maximum number of concurrent compactions. Each of the concurrent
    60  	// compactions may expand up to this limit, so this attempts to limit
    61  	// compactions to half of available disk space. Note that this will not
    62  	// prevent compaction picking from pursuing compactions that are larger
    63  	// than this threshold before expansion.
    64  	diskMax := (availBytes / 2) / uint64(opts.MaxConcurrentCompactions())
    65  	if v > diskMax {
    66  		v = diskMax
    67  	}
    68  	return v
    69  }
    70  
     71  // maxGrandparentOverlapBytes is the maximum bytes of overlap with level+1
     72  // before we stop building a single file in a compaction from level-1 into level.
    73  func maxGrandparentOverlapBytes(opts *Options, level int) uint64 {
    74  	return uint64(10 * opts.Level(level).TargetFileSize)
    75  }
    76  
    77  // maxReadCompactionBytes is used to prevent read compactions which
    78  // are too wide.
    79  func maxReadCompactionBytes(opts *Options, level int) uint64 {
    80  	return uint64(10 * opts.Level(level).TargetFileSize)
    81  }
    82  
    83  // noCloseIter wraps around a FragmentIterator, intercepting and eliding
    84  // calls to Close. It is used during compaction to ensure that rangeDelIters
    85  // are not closed prematurely.
    86  type noCloseIter struct {
    87  	keyspan.FragmentIterator
    88  }
    89  
    90  func (i noCloseIter) Close() error {
    91  	return nil
    92  }
    93  
    94  type compactionLevel struct {
    95  	level int
    96  	files manifest.LevelSlice
    97  	// l0SublevelInfo contains information about L0 sublevels being compacted.
    98  	// It's only set for the start level of a compaction starting out of L0 and
    99  	// is nil for all other compactions.
   100  	l0SublevelInfo []sublevelInfo
   101  }
   102  
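         // Clone returns a copy of the compactionLevel with an independent reslice of
         // files. Note that l0SublevelInfo is not copied.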
   103  func (cl compactionLevel) Clone() compactionLevel {
   104  	newCL := compactionLevel{
   105  		level: cl.level,
   106  		files: cl.files.Reslice(func(start, end *manifest.LevelIterator) {}),
   107  	}
   108  	return newCL
   109  }
   110  func (cl compactionLevel) String() string {
   111  	return fmt.Sprintf(`Level %d, Files %s`, cl.level, cl.files)
   112  }
   113  
    114  // maybeSplit is the return type of compactionOutputSplitter.shouldSplitBefore.
    115  // See the comment on shouldSplitBefore() for how this value is used.
   116  type maybeSplit int
   117  
   118  const (
   119  	noSplit maybeSplit = iota
   120  	splitNow
   121  )
   122  
   123  // String implements the Stringer interface.
   124  func (c maybeSplit) String() string {
   125  	if c == noSplit {
   126  		return "no-split"
   127  	}
   128  	return "split-now"
   129  }
   130  
   131  // compactionOutputSplitter is an interface for encapsulating logic around
   132  // switching the output of a compaction to a new output file. Additional
   133  // constraints around switching compaction outputs that are specific to that
   134  // compaction type (eg. flush splits) are implemented in
   135  // compactionOutputSplitters that compose other child compactionOutputSplitters.
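         //
         // As a rough illustration of how these splitters compose (a sketch, not
         // necessarily the exact construction used by runCompaction; `frontiers` and
         // `prevUserKey` are stand-ins for the compaction's frontier collection and an
         // accessor for the last-written user key):
         //
         //	splitter := &splitterGroup{cmp: c.cmp, splitters: []compactionOutputSplitter{
         //		&userKeyChangeSplitter{
         //			cmp:               c.cmp,
         //			splitter:          newFileSizeSplitter(frontiers, c.maxOutputFileSize, c.grandparents.Iter()),
         //			unsafePrevUserKey: prevUserKey,
         //		},
         //		newLimitFuncSplitter(frontiers, c.findL0Limit),
         //	}}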
   136  type compactionOutputSplitter interface {
   137  	// shouldSplitBefore returns whether we should split outputs before the
   138  	// specified "current key". The return value is splitNow or noSplit.
   139  	// splitNow means a split is advised before the specified key, and noSplit
   140  	// means no split is advised. If shouldSplitBefore(a) advises a split then
   141  	// shouldSplitBefore(b) should also advise a split given b >= a, until
   142  	// onNewOutput is called.
   143  	shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit
   144  	// onNewOutput updates internal splitter state when the compaction switches
   145  	// to a new sstable, and returns the next limit for the new output which
   146  	// would get used to truncate range tombstones if the compaction iterator
   147  	// runs out of keys. The limit returned MUST be > key according to the
   148  	// compaction's comparator. The specified key is the first key in the new
   149  	// output, or nil if this sstable will only contain range tombstones already
   150  	// in the fragmenter.
   151  	onNewOutput(key []byte) []byte
   152  }
   153  
   154  // fileSizeSplitter is a compactionOutputSplitter that enforces target file
   155  // sizes. This splitter splits to a new output file when the estimated file size
   156  // is 0.5x-2x the target file size. If there are overlapping grandparent files,
   157  // this splitter will attempt to split at a grandparent boundary. For example,
   158  // consider the example where a compaction wrote 'd' to the current output file,
   159  // and the next key has a user key 'g':
   160  //
   161  //	                              previous key   next key
   162  //		                                 |           |
   163  //		                                 |           |
   164  //		                 +---------------|----+   +--|----------+
   165  //		  grandparents:  |       000006  |    |   |  | 000007   |
   166  //		                 +---------------|----+   +--|----------+
   167  //		                 a    b          d    e   f  g       i
   168  //
   169  // Splitting the output file F before 'g' will ensure that the current output
   170  // file F does not overlap the grandparent file 000007. Aligning sstable
   171  // boundaries like this can significantly reduce write amplification, since a
   172  // subsequent compaction of F into the grandparent level will avoid needlessly
   173  // rewriting any keys within 000007 that do not overlap F's bounds. Consider the
   174  // following compaction:
   175  //
   176  //	                       +----------------------+
   177  //		  input            |                      |
   178  //		  level            +----------------------+
   179  //		                              \/
   180  //		           +---------------+       +---------------+
   181  //		  output   |XXXXXXX|       |       |      |XXXXXXXX|
   182  //		  level    +---------------+       +---------------+
   183  //
   184  // The input-level file overlaps two files in the output level, but only
   185  // partially. The beginning of the first output-level file and the end of the
   186  // second output-level file will be rewritten verbatim. This write I/O is
   187  // "wasted" in the sense that no merging is being performed.
   188  //
   189  // To prevent the above waste, this splitter attempts to split output files
   190  // before the start key of grandparent files. It still strives to write output
   191  // files of approximately the target file size, by constraining this splitting
   192  // at grandparent points to apply only if the current output's file size is
   193  // about the right order of magnitude.
   194  //
   195  // Note that, unlike most other splitters, this splitter does not guarantee that
   196  // it will advise splits only at user key change boundaries.
   197  type fileSizeSplitter struct {
   198  	frontier              frontier
   199  	targetFileSize        uint64
   200  	atGrandparentBoundary bool
   201  	boundariesObserved    uint64
   202  	nextGrandparent       *fileMetadata
   203  	grandparents          manifest.LevelIterator
   204  }
   205  
   206  func newFileSizeSplitter(
   207  	f *frontiers, targetFileSize uint64, grandparents manifest.LevelIterator,
   208  ) *fileSizeSplitter {
   209  	s := &fileSizeSplitter{targetFileSize: targetFileSize}
   210  	s.nextGrandparent = grandparents.First()
   211  	s.grandparents = grandparents
   212  	if s.nextGrandparent != nil {
   213  		s.frontier.Init(f, s.nextGrandparent.Smallest.UserKey, s.reached)
   214  	}
   215  	return s
   216  }
   217  
   218  func (f *fileSizeSplitter) reached(nextKey []byte) []byte {
   219  	f.atGrandparentBoundary = true
   220  	f.boundariesObserved++
   221  	// NB: f.grandparents is a bounded iterator, constrained to the compaction
   222  	// key range.
   223  	f.nextGrandparent = f.grandparents.Next()
   224  	if f.nextGrandparent == nil {
   225  		return nil
   226  	}
   227  	// TODO(jackson): Should we also split before or immediately after
   228  	// grandparents' largest keys? Splitting before the start boundary prevents
   229  	// overlap with the grandparent. Also splitting after the end boundary may
   230  	// increase the probability of move compactions.
   231  	return f.nextGrandparent.Smallest.UserKey
   232  }
   233  
   234  func (f *fileSizeSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit {
   235  	atGrandparentBoundary := f.atGrandparentBoundary
   236  
   237  	// Clear f.atGrandparentBoundary unconditionally.
   238  	//
    239  	// This is a bit subtle. Even if we do decide to split, it's possible that a
   240  	// higher-level splitter will ignore our request (eg, because we're between
   241  	// two internal keys with the same user key). In this case, the next call to
   242  	// shouldSplitBefore will find atGrandparentBoundary=false. This is
   243  	// desirable, because in this case we would've already written the earlier
   244  	// key with the same user key to the output file. The current output file is
   245  	// already doomed to overlap the grandparent whose bound triggered
   246  	// atGrandparentBoundary=true. We should continue on, waiting for the next
   247  	// grandparent boundary.
   248  	f.atGrandparentBoundary = false
   249  
   250  	// If the key is a range tombstone, the EstimatedSize may not grow right
    251  	// away when a range tombstone is added to the fragmenter: it depends on
    252  	// whether or not this new range deletion will start a new fragment.
   253  	// Range deletions are rare, so we choose to simply not split yet.
   254  	// TODO(jackson): Reconsider this, and consider range keys too as a part of
   255  	// #2321.
   256  	if key.Kind() == InternalKeyKindRangeDelete || tw == nil {
   257  		return noSplit
   258  	}
   259  
   260  	estSize := tw.EstimatedSize()
   261  	switch {
   262  	case estSize < f.targetFileSize/2:
   263  		// The estimated file size is less than half the target file size. Don't
   264  		// split it, even if currently aligned with a grandparent file because
   265  		// it's too small.
   266  		return noSplit
   267  	case estSize >= 2*f.targetFileSize:
   268  		// The estimated file size is double the target file size. Split it even
   269  		// if we were not aligned with a grandparent file boundary to avoid
   270  		// excessively exceeding the target file size.
   271  		return splitNow
   272  	case !atGrandparentBoundary:
   273  		// Don't split if we're not at a grandparent, except if we've exhausted
   274  		// all the grandparents overlapping this compaction's key range. Then we
   275  		// may want to split purely based on file size.
   276  		if f.nextGrandparent == nil {
   277  			// There are no more grandparents. Optimize for the target file size
   278  			// and split as soon as we hit the target file size.
   279  			if estSize >= f.targetFileSize {
   280  				return splitNow
   281  			}
   282  		}
   283  		return noSplit
   284  	default:
   285  		// INVARIANT: atGrandparentBoundary
   286  		// INVARIANT: targetSize/2 < estSize < 2*targetSize
   287  		//
   288  		// The estimated file size is close enough to the target file size that
   289  		// we should consider splitting.
   290  		//
   291  		// Determine whether to split now based on how many grandparent
   292  		// boundaries we have already observed while building this output file.
   293  		// The intuition here is that if the grandparent level is dense in this
   294  		// part of the keyspace, we're likely to continue to have more
   295  		// opportunities to split this file aligned with a grandparent. If this
   296  		// is the first grandparent boundary observed, we split immediately
   297  		// (we're already at ≥50% the target file size). Otherwise, each
   298  		// overlapping grandparent we've observed increases the minimum file
   299  		// size by 5% of the target file size, up to at most 90% of the target
   300  		// file size.
   301  		//
   302  		// TODO(jackson): The particular thresholds are somewhat unprincipled.
    303  		// This is the same heuristic that RocksDB implements. Is there a more
    304  		// principled formulation that can further reduce w-amp, produce files
   305  		// closer to the target file size, or is more understandable?
   306  
   307  		// NB: Subtract 1 from `boundariesObserved` to account for the current
   308  		// boundary we're considering splitting at. `reached` will have
   309  		// incremented it at the same time it set `atGrandparentBoundary`.
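         		// For example (illustrative): if this is the fourth grandparent boundary
         		// observed for this output (boundariesObserved == 4), then
         		// min(4-1, 8) == 3 and minimumPctOfTargetSize == 65, so we split only
         		// once the output is at least 65% of the target file size.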
   310  		minimumPctOfTargetSize := 50 + 5*min(f.boundariesObserved-1, 8)
   311  		if estSize < (minimumPctOfTargetSize*f.targetFileSize)/100 {
   312  			return noSplit
   313  		}
   314  		return splitNow
   315  	}
   316  }
   317  
   318  func (f *fileSizeSplitter) onNewOutput(key []byte) []byte {
   319  	f.boundariesObserved = 0
   320  	return nil
   321  }
   322  
   323  func newLimitFuncSplitter(f *frontiers, limitFunc func(userKey []byte) []byte) *limitFuncSplitter {
   324  	s := &limitFuncSplitter{limitFunc: limitFunc}
   325  	s.frontier.Init(f, nil, s.reached)
   326  	return s
   327  }
   328  
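         // limitFuncSplitter is a compactionOutputSplitter that advises a split once
         // the compaction's frontier reaches the limit computed by limitFunc for the
         // current output's first key (eg, a grandparent overlap limit or an L0 flush
         // split key).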
   329  type limitFuncSplitter struct {
   330  	frontier  frontier
   331  	limitFunc func(userKey []byte) []byte
   332  	split     maybeSplit
   333  }
   334  
   335  func (lf *limitFuncSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit {
   336  	return lf.split
   337  }
   338  
   339  func (lf *limitFuncSplitter) reached(nextKey []byte) []byte {
   340  	lf.split = splitNow
   341  	return nil
   342  }
   343  
   344  func (lf *limitFuncSplitter) onNewOutput(key []byte) []byte {
   345  	lf.split = noSplit
   346  	if key != nil {
   347  		// TODO(jackson): For some users, like L0 flush splits, there's no need
   348  		// to binary search over all the flush splits every time. The next split
   349  		// point must be ahead of the previous flush split point.
   350  		limit := lf.limitFunc(key)
   351  		lf.frontier.Update(limit)
   352  		return limit
   353  	}
   354  	lf.frontier.Update(nil)
   355  	return nil
   356  }
   357  
   358  // splitterGroup is a compactionOutputSplitter that splits whenever one of its
   359  // child splitters advises a compaction split.
   360  type splitterGroup struct {
   361  	cmp       Compare
   362  	splitters []compactionOutputSplitter
   363  }
   364  
   365  func (a *splitterGroup) shouldSplitBefore(
   366  	key *InternalKey, tw *sstable.Writer,
   367  ) (suggestion maybeSplit) {
   368  	for _, splitter := range a.splitters {
   369  		if splitter.shouldSplitBefore(key, tw) == splitNow {
   370  			return splitNow
   371  		}
   372  	}
   373  	return noSplit
   374  }
   375  
   376  func (a *splitterGroup) onNewOutput(key []byte) []byte {
   377  	var earliestLimit []byte
   378  	for _, splitter := range a.splitters {
   379  		limit := splitter.onNewOutput(key)
   380  		if limit == nil {
   381  			continue
   382  		}
   383  		if earliestLimit == nil || a.cmp(limit, earliestLimit) < 0 {
   384  			earliestLimit = limit
   385  		}
   386  	}
   387  	return earliestLimit
   388  }
   389  
   390  // userKeyChangeSplitter is a compactionOutputSplitter that takes in a child
   391  // splitter, and splits when 1) that child splitter has advised a split, and 2)
   392  // the compaction output is at the boundary between two user keys (also
   393  // the boundary between atomic compaction units). Use this splitter to wrap
   394  // any splitters that don't guarantee user key splits (i.e. splitters that make
   395  // their determination in ways other than comparing the current key against a
    396  // limit key). If a wrapped splitter advises a split, it must continue
    397  // to advise a split until a new output file is started.
   398  type userKeyChangeSplitter struct {
   399  	cmp               Compare
   400  	splitter          compactionOutputSplitter
   401  	unsafePrevUserKey func() []byte
   402  }
   403  
   404  func (u *userKeyChangeSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit {
   405  	// NB: The userKeyChangeSplitter only needs to suffer a key comparison if
   406  	// the wrapped splitter requests a split.
   407  	//
   408  	// We could implement this splitter using frontiers: When the inner splitter
   409  	// requests a split before key `k`, we'd update a frontier to be
    410  	// ImmediateSuccessor(k). Then on the next key greater than k, the
   411  	// frontier's `reached` func would be called and we'd return splitNow.
   412  	// This doesn't really save work since duplicate user keys are rare, and it
   413  	// requires us to materialize the ImmediateSuccessor key. It also prevents
   414  	// us from splitting on the same key that the inner splitter requested a
   415  	// split for—instead we need to wait until the next key. The current
   416  	// implementation uses `unsafePrevUserKey` to gain access to the previous
   417  	// key which allows it to immediately respect the inner splitter if
   418  	// possible.
   419  	if split := u.splitter.shouldSplitBefore(key, tw); split != splitNow {
   420  		return split
   421  	}
   422  	if u.cmp(key.UserKey, u.unsafePrevUserKey()) > 0 {
   423  		return splitNow
   424  	}
   425  	return noSplit
   426  }
   427  
   428  func (u *userKeyChangeSplitter) onNewOutput(key []byte) []byte {
   429  	return u.splitter.onNewOutput(key)
   430  }
   431  
    432  // compactionWritable is an objstorage.Writable wrapper that, on every write,
   433  // updates a metric in `versions` on bytes written by in-progress compactions so
   434  // far. It also increments a per-compaction `written` int.
   435  type compactionWritable struct {
   436  	objstorage.Writable
   437  
   438  	versions *versionSet
   439  	written  *int64
   440  }
   441  
   442  // Write is part of the objstorage.Writable interface.
   443  func (c *compactionWritable) Write(p []byte) error {
   444  	if err := c.Writable.Write(p); err != nil {
   445  		return err
   446  	}
   447  
   448  	*c.written += int64(len(p))
   449  	c.versions.incrementCompactionBytes(int64(len(p)))
   450  	return nil
   451  }
   452  
   453  type compactionKind int
   454  
   455  const (
   456  	compactionKindDefault compactionKind = iota
   457  	compactionKindFlush
   458  	// compactionKindMove denotes a move compaction where the input file is
   459  	// retained and linked in a new level without being obsoleted.
   460  	compactionKindMove
   461  	// compactionKindCopy denotes a copy compaction where the input file is
   462  	// copied byte-by-byte into a new file with a new FileNum in the output level.
   463  	compactionKindCopy
   464  	compactionKindDeleteOnly
   465  	compactionKindElisionOnly
   466  	compactionKindRead
   467  	compactionKindRewrite
   468  	compactionKindIngestedFlushable
   469  )
   470  
   471  func (k compactionKind) String() string {
   472  	switch k {
   473  	case compactionKindDefault:
   474  		return "default"
   475  	case compactionKindFlush:
   476  		return "flush"
   477  	case compactionKindMove:
   478  		return "move"
   479  	case compactionKindDeleteOnly:
   480  		return "delete-only"
   481  	case compactionKindElisionOnly:
   482  		return "elision-only"
   483  	case compactionKindRead:
   484  		return "read"
   485  	case compactionKindRewrite:
   486  		return "rewrite"
   487  	case compactionKindIngestedFlushable:
   488  		return "ingested-flushable"
   489  	case compactionKindCopy:
   490  		return "copy"
   491  	}
   492  	return "?"
   493  }
   494  
    495  // rangeKeyCompactionTransform is used to transform range key spans as part of the
    496  // keyspan.MergingIter. As part of this transformation step, we can elide range key
    497  // unsets and deletes in the last snapshot stripe, as well as coalesce range keys
    498  // within snapshot stripes.
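         //
         // For example (illustrative): with snapshots [10, 20] and a span whose keys
         // have seqnums 25, 15 and 5 (descending), the keys are partitioned into the
         // stripes {25}, {15} and {5}. Each stripe is coalesced independently, and
         // range key unsets/deletes are elided only in the final stripe {5}, and only
         // if elideRangeKey permits it for the span's bounds.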
   499  func rangeKeyCompactionTransform(
   500  	eq base.Equal, snapshots []uint64, elideRangeKey func(start, end []byte) bool,
   501  ) keyspan.Transformer {
   502  	return keyspan.TransformerFunc(func(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error {
   503  		elideInLastStripe := func(keys []keyspan.Key) []keyspan.Key {
   504  			// Unsets and deletes in the last snapshot stripe can be elided.
   505  			k := 0
   506  			for j := range keys {
   507  				if elideRangeKey(s.Start, s.End) &&
   508  					(keys[j].Kind() == InternalKeyKindRangeKeyUnset || keys[j].Kind() == InternalKeyKindRangeKeyDelete) {
   509  					continue
   510  				}
   511  				keys[k] = keys[j]
   512  				k++
   513  			}
   514  			keys = keys[:k]
   515  			return keys
   516  		}
    517  		// snapshots are in ascending order, while s.Keys are in descending seqnum
   518  		// order. Partition s.keys by snapshot stripes, and call rangekey.Coalesce
   519  		// on each partition.
   520  		dst.Start = s.Start
   521  		dst.End = s.End
   522  		dst.Keys = dst.Keys[:0]
   523  		i, j := len(snapshots)-1, 0
   524  		usedLen := 0
   525  		for i >= 0 {
   526  			start := j
   527  			for j < len(s.Keys) && !base.Visible(s.Keys[j].SeqNum(), snapshots[i], base.InternalKeySeqNumMax) {
   528  				// Include j in current partition.
   529  				j++
   530  			}
   531  			if j > start {
   532  				keysDst := dst.Keys[usedLen:cap(dst.Keys)]
   533  				if err := rangekey.Coalesce(cmp, eq, s.Keys[start:j], &keysDst); err != nil {
   534  					return err
   535  				}
   536  				if j == len(s.Keys) {
   537  					// This is the last snapshot stripe. Unsets and deletes can be elided.
   538  					keysDst = elideInLastStripe(keysDst)
   539  				}
   540  				usedLen += len(keysDst)
   541  				dst.Keys = append(dst.Keys, keysDst...)
   542  			}
   543  			i--
   544  		}
   545  		if j < len(s.Keys) {
   546  			keysDst := dst.Keys[usedLen:cap(dst.Keys)]
   547  			if err := rangekey.Coalesce(cmp, eq, s.Keys[j:], &keysDst); err != nil {
   548  				return err
   549  			}
   550  			keysDst = elideInLastStripe(keysDst)
   551  			usedLen += len(keysDst)
   552  			dst.Keys = append(dst.Keys, keysDst...)
   553  		}
   554  		return nil
   555  	})
   556  }
   557  
   558  // compaction is a table compaction from one level to the next, starting from a
   559  // given version.
   560  type compaction struct {
    561  	// cancel is an atomic boolean that other goroutines can set to signal that
    562  	// the compaction should be cancelled, such as if a conflicting excise operation
    563  	// raced it to manifest application. Only holders of the manifest lock will write to it.
   564  	cancel atomic.Bool
   565  
   566  	kind      compactionKind
   567  	cmp       Compare
   568  	equal     Equal
   569  	comparer  *base.Comparer
   570  	formatKey base.FormatKey
   571  	logger    Logger
   572  	version   *version
   573  	stats     base.InternalIteratorStats
   574  	beganAt   time.Time
   575  	// versionEditApplied is set to true when a compaction has completed and the
   576  	// resulting version has been installed (if successful), but the compaction
   577  	// goroutine is still cleaning up (eg, deleting obsolete files).
   578  	versionEditApplied bool
   579  	bufferPool         sstable.BufferPool
   580  
   581  	// startLevel is the level that is being compacted. Inputs from startLevel
   582  	// and outputLevel will be merged to produce a set of outputLevel files.
   583  	startLevel *compactionLevel
   584  
   585  	// outputLevel is the level that files are being produced in. outputLevel is
    586  	// equal to startLevel+1, except that:
    587  	//    - if startLevel is 0, the output level equals compactionPicker.baseLevel().
    588  	//    - in a multilevel compaction, the output level is the lowest level involved
    589  	//      in the compaction.
   590  	// A compaction's outputLevel is nil for delete-only compactions.
   591  	outputLevel *compactionLevel
   592  
   593  	// extraLevels point to additional levels in between the input and output
   594  	// levels that get compacted in multilevel compactions
   595  	extraLevels []*compactionLevel
   596  
   597  	inputs []compactionLevel
   598  
   599  	// maxOutputFileSize is the maximum size of an individual table created
   600  	// during compaction.
   601  	maxOutputFileSize uint64
   602  	// maxOverlapBytes is the maximum number of bytes of overlap allowed for a
   603  	// single output table with the tables in the grandparent level.
   604  	maxOverlapBytes uint64
   605  	// disableSpanElision disables elision of range tombstones and range keys. Used
   606  	// by tests to allow range tombstones or range keys to be added to tables where
   607  	// they would otherwise be elided.
   608  	disableSpanElision bool
   609  
   610  	// flushing contains the flushables (aka memtables) that are being flushed.
   611  	flushing flushableList
   612  	// bytesIterated contains the number of bytes that have been flushed/compacted.
   613  	bytesIterated uint64
   614  	// bytesWritten contains the number of bytes that have been written to outputs.
   615  	bytesWritten int64
   616  
   617  	// The boundaries of the input data.
   618  	smallest InternalKey
   619  	largest  InternalKey
   620  
   621  	// The range deletion tombstone fragmenter. Adds range tombstones as they are
   622  	// returned from `compactionIter` and fragments them for output to files.
   623  	// Referenced by `compactionIter` which uses it to check whether keys are deleted.
   624  	rangeDelFrag keyspan.Fragmenter
   625  	// The range key fragmenter. Similar to rangeDelFrag in that it gets range
   626  	// keys from the compaction iter and fragments them for output to files.
   627  	rangeKeyFrag keyspan.Fragmenter
   628  	// The range deletion tombstone iterator, that merges and fragments
   629  	// tombstones across levels. This iterator is included within the compaction
   630  	// input iterator as a single level.
   631  	// TODO(jackson): Remove this when the refactor of FragmentIterator,
   632  	// InterleavingIterator, etc is complete.
   633  	rangeDelIter keyspan.InternalIteratorShim
   634  	// rangeKeyInterleaving is the interleaving iter for range keys.
   635  	rangeKeyInterleaving keyspan.InterleavingIter
   636  
   637  	// A list of objects to close when the compaction finishes. Used by input
   638  	// iteration to keep rangeDelIters open for the lifetime of the compaction,
   639  	// and only close them when the compaction finishes.
   640  	closers []io.Closer
   641  
   642  	// grandparents are the tables in level+2 that overlap with the files being
    643  	// compacted. Used to determine output table boundaries. Do not assume that the
    644  	// actual files in the grandparent level will be the same when this compaction finishes.
   645  	grandparents manifest.LevelSlice
   646  
   647  	// Boundaries at which flushes to L0 should be split. Determined by
   648  	// L0Sublevels. If nil, flushes aren't split.
   649  	l0Limits [][]byte
   650  
   651  	// List of disjoint inuse key ranges the compaction overlaps with in
   652  	// grandparent and lower levels. See setupInuseKeyRanges() for the
   653  	// construction. Used by elideTombstone() and elideRangeTombstone() to
   654  	// determine if keys affected by a tombstone possibly exist at a lower level.
   655  	inuseKeyRanges []manifest.UserKeyRange
   656  	// inuseEntireRange is set if the above inuse key ranges wholly contain the
   657  	// compaction's key range. This allows compactions in higher levels to often
   658  	// elide key comparisons.
   659  	inuseEntireRange    bool
   660  	elideTombstoneIndex int
   661  
   662  	// allowedZeroSeqNum is true if seqnums can be zeroed if there are no
   663  	// snapshots requiring them to be kept. This determination is made by
   664  	// looking for an sstable which overlaps the bounds of the compaction at a
   665  	// lower level in the LSM during runCompaction.
   666  	allowedZeroSeqNum bool
   667  
   668  	metrics map[int]*LevelMetrics
   669  
   670  	pickerMetrics compactionPickerMetrics
   671  }
   672  
   673  func (c *compaction) makeInfo(jobID int) CompactionInfo {
   674  	info := CompactionInfo{
   675  		JobID:       jobID,
   676  		Reason:      c.kind.String(),
   677  		Input:       make([]LevelInfo, 0, len(c.inputs)),
   678  		Annotations: []string{},
   679  	}
   680  	for _, cl := range c.inputs {
   681  		inputInfo := LevelInfo{Level: cl.level, Tables: nil}
   682  		iter := cl.files.Iter()
   683  		for m := iter.First(); m != nil; m = iter.Next() {
   684  			inputInfo.Tables = append(inputInfo.Tables, m.TableInfo())
   685  		}
   686  		info.Input = append(info.Input, inputInfo)
   687  	}
   688  	if c.outputLevel != nil {
   689  		info.Output.Level = c.outputLevel.level
   690  
   691  		// If there are no inputs from the output level (eg, a move
   692  		// compaction), add an empty LevelInfo to info.Input.
   693  		if len(c.inputs) > 0 && c.inputs[len(c.inputs)-1].level != c.outputLevel.level {
   694  			info.Input = append(info.Input, LevelInfo{Level: c.outputLevel.level})
   695  		}
   696  	} else {
   697  		// For a delete-only compaction, set the output level to L6. The
   698  		// output level is not meaningful here, but complicating the
   699  		// info.Output interface with a pointer doesn't seem worth the
   700  		// semantic distinction.
   701  		info.Output.Level = numLevels - 1
   702  	}
   703  
   704  	for i, score := range c.pickerMetrics.scores {
   705  		info.Input[i].Score = score
   706  	}
   707  	info.SingleLevelOverlappingRatio = c.pickerMetrics.singleLevelOverlappingRatio
   708  	info.MultiLevelOverlappingRatio = c.pickerMetrics.multiLevelOverlappingRatio
   709  	if len(info.Input) > 2 {
   710  		info.Annotations = append(info.Annotations, "multilevel")
   711  	}
   712  	return info
   713  }
   714  
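         // newCompaction constructs a compaction from the given pickedCompaction. It
         // computes the compaction's grandparents and in-use key ranges, and may convert
         // a default compaction into a move or copy compaction when it consists of a
         // single input file with no output-level overlap and little grandparent overlap.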
   715  func newCompaction(
   716  	pc *pickedCompaction, opts *Options, beganAt time.Time, provider objstorage.Provider,
   717  ) *compaction {
   718  	c := &compaction{
   719  		kind:              compactionKindDefault,
   720  		cmp:               pc.cmp,
   721  		equal:             opts.equal(),
   722  		comparer:          opts.Comparer,
   723  		formatKey:         opts.Comparer.FormatKey,
   724  		inputs:            pc.inputs,
   725  		smallest:          pc.smallest,
   726  		largest:           pc.largest,
   727  		logger:            opts.Logger,
   728  		version:           pc.version,
   729  		beganAt:           beganAt,
   730  		maxOutputFileSize: pc.maxOutputFileSize,
   731  		maxOverlapBytes:   pc.maxOverlapBytes,
   732  		pickerMetrics:     pc.pickerMetrics,
   733  	}
   734  	c.startLevel = &c.inputs[0]
   735  	if pc.startLevel.l0SublevelInfo != nil {
   736  		c.startLevel.l0SublevelInfo = pc.startLevel.l0SublevelInfo
   737  	}
   738  	c.outputLevel = &c.inputs[1]
   739  
   740  	if len(pc.extraLevels) > 0 {
   741  		c.extraLevels = pc.extraLevels
   742  		c.outputLevel = &c.inputs[len(c.inputs)-1]
   743  	}
   744  	// Compute the set of outputLevel+1 files that overlap this compaction (these
   745  	// are the grandparent sstables).
   746  	if c.outputLevel.level+1 < numLevels {
   747  		c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.cmp,
   748  			c.smallest.UserKey, c.largest.UserKey, c.largest.IsExclusiveSentinel())
   749  	}
   750  	c.setupInuseKeyRanges()
   751  	c.kind = pc.kind
   752  
   753  	if c.kind == compactionKindDefault && c.outputLevel.files.Empty() && !c.hasExtraLevelData() &&
   754  		c.startLevel.files.Len() == 1 && c.grandparents.SizeSum() <= c.maxOverlapBytes {
   755  		// This compaction can be converted into a move or copy from one level
   756  		// to the next. We avoid such a move if there is lots of overlapping
   757  		// grandparent data. Otherwise, the move could create a parent file
   758  		// that will require a very expensive merge later on.
   759  		iter := c.startLevel.files.Iter()
   760  		meta := iter.First()
   761  		isRemote := false
   762  		// We should always be passed a provider, except in some unit tests.
   763  		if provider != nil {
   764  			objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum)
   765  			if err != nil {
   766  				panic(errors.Wrapf(err, "cannot lookup table %s in provider", meta.FileBacking.DiskFileNum))
   767  			}
   768  			isRemote = objMeta.IsRemote()
   769  		}
   770  		// Avoid a trivial move or copy if all of these are true, as rewriting a
   771  		// new file is better:
   772  		//
   773  		// 1) The source file is a virtual sstable
   774  		// 2) The existing file `meta` is on non-remote storage
   775  		// 3) The output level prefers shared storage
   776  		mustCopy := !isRemote && remote.ShouldCreateShared(opts.Experimental.CreateOnShared, c.outputLevel.level)
   777  		if mustCopy {
   778  			// If the source is virtual, it's best to just rewrite the file as all
   779  			// conditions in the above comment are met.
   780  			if !meta.Virtual {
   781  				c.kind = compactionKindCopy
   782  			}
   783  		} else {
   784  			c.kind = compactionKindMove
   785  		}
   786  	}
   787  	return c
   788  }
   789  
   790  func newDeleteOnlyCompaction(
   791  	opts *Options, cur *version, inputs []compactionLevel, beganAt time.Time,
   792  ) *compaction {
   793  	c := &compaction{
   794  		kind:      compactionKindDeleteOnly,
   795  		cmp:       opts.Comparer.Compare,
   796  		equal:     opts.equal(),
   797  		comparer:  opts.Comparer,
   798  		formatKey: opts.Comparer.FormatKey,
   799  		logger:    opts.Logger,
   800  		version:   cur,
   801  		beganAt:   beganAt,
   802  		inputs:    inputs,
   803  	}
   804  
   805  	// Set c.smallest, c.largest.
   806  	files := make([]manifest.LevelIterator, 0, len(inputs))
   807  	for _, in := range inputs {
   808  		files = append(files, in.files.Iter())
   809  	}
   810  	c.smallest, c.largest = manifest.KeyRange(opts.Comparer.Compare, files...)
   811  	return c
   812  }
   813  
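         // adjustGrandparentOverlapBytesForFlush may inflate c.maxOverlapBytes for
         // flushes so that overlap with Lbase does not split the flush into an excessive
         // number of tiny output files. See the comment in the function body for the
         // details of the heuristic.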
   814  func adjustGrandparentOverlapBytesForFlush(c *compaction, flushingBytes uint64) {
   815  	// Heuristic to place a lower bound on compaction output file size
   816  	// caused by Lbase. Prior to this heuristic we have observed an L0 in
   817  	// production with 310K files of which 290K files were < 10KB in size.
   818  	// Our hypothesis is that it was caused by L1 having 2600 files and
   819  	// ~10GB, such that each flush got split into many tiny files due to
   820  	// overlapping with most of the files in Lbase.
   821  	//
   822  	// The computation below is general in that it accounts
   823  	// for flushing different volumes of data (e.g. we may be flushing
   824  	// many memtables). For illustration, we consider the typical
   825  	// example of flushing a 64MB memtable. So 12.8MB output,
   826  	// based on the compression guess below. If the compressed bytes
   827  	// guess is an over-estimate we will end up with smaller files,
   828  	// and if an under-estimate we will end up with larger files.
   829  	// With a 2MB target file size, 7 files. We are willing to accept
   830  	// 4x the number of files, if it results in better write amplification
   831  	// when later compacting to Lbase, i.e., ~450KB files (target file
   832  	// size / 4).
   833  	//
   834  	// Note that this is a pessimistic heuristic in that
   835  	// fileCountUpperBoundDueToGrandparents could be far from the actual
   836  	// number of files produced due to the grandparent limits. For
   837  	// example, in the extreme, consider a flush that overlaps with 1000
   838  	// files in Lbase f0...f999, and the initially calculated value of
   839  	// maxOverlapBytes will cause splits at f10, f20,..., f990, which
   840  	// means an upper bound file count of 100 files. Say the input bytes
   841  	// in the flush are such that acceptableFileCount=10. We will fatten
   842  	// up maxOverlapBytes by 10x to ensure that the upper bound file count
   843  	// drops to 10. However, it is possible that in practice, even without
   844  	// this change, we would have produced no more than 10 files, and that
   845  	// this change makes the files unnecessarily wide. Say the input bytes
   846  	// are distributed such that 10% are in f0...f9, 10% in f10...f19, ...
   847  	// 10% in f80...f89 and 10% in f990...f999. The original value of
   848  	// maxOverlapBytes would have actually produced only 10 sstables. But
   849  	// by increasing maxOverlapBytes by 10x, we may produce 1 sstable that
   850  	// spans f0...f89, i.e., a much wider sstable than necessary.
   851  	//
   852  	// We could produce a tighter estimate of
   853  	// fileCountUpperBoundDueToGrandparents if we had knowledge of the key
   854  	// distribution of the flush. The 4x multiplier mentioned earlier is
   855  	// a way to try to compensate for this pessimism.
   856  	//
   857  	// TODO(sumeer): we don't have compression info for the data being
   858  	// flushed, but it is likely that existing files that overlap with
   859  	// this flush in Lbase are representative wrt compression ratio. We
   860  	// could store the uncompressed size in FileMetadata and estimate
   861  	// the compression ratio.
   862  	const approxCompressionRatio = 0.2
   863  	approxOutputBytes := approxCompressionRatio * float64(flushingBytes)
   864  	approxNumFilesBasedOnTargetSize :=
   865  		int(math.Ceil(approxOutputBytes / float64(c.maxOutputFileSize)))
   866  	acceptableFileCount := float64(4 * approxNumFilesBasedOnTargetSize)
   867  	// The byte calculation is linear in numGrandparentFiles, but we will
   868  	// incur this linear cost in findGrandparentLimit too, so we are also
   869  	// willing to pay it now. We could approximate this cheaply by using
   870  	// the mean file size of Lbase.
   871  	grandparentFileBytes := c.grandparents.SizeSum()
   872  	fileCountUpperBoundDueToGrandparents :=
   873  		float64(grandparentFileBytes) / float64(c.maxOverlapBytes)
   874  	if fileCountUpperBoundDueToGrandparents > acceptableFileCount {
   875  		c.maxOverlapBytes = uint64(
   876  			float64(c.maxOverlapBytes) *
   877  				(fileCountUpperBoundDueToGrandparents / acceptableFileCount))
   878  	}
   879  }
   880  
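         // newFlush constructs a compaction that flushes the provided flushables
         // (memtables, or a single ingestedFlushable) into L0 of cur. It computes the
         // flush's key bounds, records the L0 flush split keys, and, when
         // FlushSplitBytes is configured, bounds output file sizes and grandparent
         // (Lbase) overlap.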
   881  func newFlush(
   882  	opts *Options, cur *version, baseLevel int, flushing flushableList, beganAt time.Time,
   883  ) *compaction {
   884  	c := &compaction{
   885  		kind:              compactionKindFlush,
   886  		cmp:               opts.Comparer.Compare,
   887  		equal:             opts.equal(),
   888  		comparer:          opts.Comparer,
   889  		formatKey:         opts.Comparer.FormatKey,
   890  		logger:            opts.Logger,
   891  		version:           cur,
   892  		beganAt:           beganAt,
   893  		inputs:            []compactionLevel{{level: -1}, {level: 0}},
   894  		maxOutputFileSize: math.MaxUint64,
   895  		maxOverlapBytes:   math.MaxUint64,
   896  		flushing:          flushing,
   897  	}
   898  	c.startLevel = &c.inputs[0]
   899  	c.outputLevel = &c.inputs[1]
   900  
   901  	if len(flushing) > 0 {
   902  		if _, ok := flushing[0].flushable.(*ingestedFlushable); ok {
   903  			if len(flushing) != 1 {
   904  				panic("pebble: ingestedFlushable must be flushed one at a time.")
   905  			}
   906  			c.kind = compactionKindIngestedFlushable
   907  			return c
   908  		}
   909  	}
   910  
   911  	// Make sure there's no ingestedFlushable after the first flushable in the
   912  	// list.
   913  	for _, f := range flushing {
   914  		if _, ok := f.flushable.(*ingestedFlushable); ok {
   915  			panic("pebble: flushing shouldn't contain ingestedFlushable flushable")
   916  		}
   917  	}
   918  
   919  	if cur.L0Sublevels != nil {
   920  		c.l0Limits = cur.L0Sublevels.FlushSplitKeys()
   921  	}
   922  
   923  	smallestSet, largestSet := false, false
   924  	updatePointBounds := func(iter internalIterator) {
   925  		if key, _ := iter.First(); key != nil {
   926  			if !smallestSet ||
   927  				base.InternalCompare(c.cmp, c.smallest, *key) > 0 {
   928  				smallestSet = true
   929  				c.smallest = key.Clone()
   930  			}
   931  		}
   932  		if key, _ := iter.Last(); key != nil {
   933  			if !largestSet ||
   934  				base.InternalCompare(c.cmp, c.largest, *key) < 0 {
   935  				largestSet = true
   936  				c.largest = key.Clone()
   937  			}
   938  		}
   939  	}
   940  
   941  	updateRangeBounds := func(iter keyspan.FragmentIterator) {
   942  		// File bounds require s != nil && !s.Empty(). We only need to check for
   943  		// s != nil here, as the memtable's FragmentIterator would never surface
   944  		// empty spans.
   945  		if s := iter.First(); s != nil {
   946  			if key := s.SmallestKey(); !smallestSet ||
   947  				base.InternalCompare(c.cmp, c.smallest, key) > 0 {
   948  				smallestSet = true
   949  				c.smallest = key.Clone()
   950  			}
   951  		}
   952  		if s := iter.Last(); s != nil {
   953  			if key := s.LargestKey(); !largestSet ||
   954  				base.InternalCompare(c.cmp, c.largest, key) < 0 {
   955  				largestSet = true
   956  				c.largest = key.Clone()
   957  			}
   958  		}
   959  	}
   960  
   961  	var flushingBytes uint64
   962  	for i := range flushing {
   963  		f := flushing[i]
   964  		updatePointBounds(f.newIter(nil))
   965  		if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil {
   966  			updateRangeBounds(rangeDelIter)
   967  		}
   968  		if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
   969  			updateRangeBounds(rangeKeyIter)
   970  		}
   971  		flushingBytes += f.inuseBytes()
   972  	}
   973  
   974  	if opts.FlushSplitBytes > 0 {
   975  		c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize)
   976  		c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0)
   977  		c.grandparents = c.version.Overlaps(baseLevel, c.cmp, c.smallest.UserKey,
   978  			c.largest.UserKey, c.largest.IsExclusiveSentinel())
   979  		adjustGrandparentOverlapBytesForFlush(c, flushingBytes)
   980  	}
   981  
   982  	c.setupInuseKeyRanges()
   983  	return c
   984  }
   985  
   986  func (c *compaction) hasExtraLevelData() bool {
   987  	if len(c.extraLevels) == 0 {
   988  		// not a multi level compaction
   989  		return false
   990  	} else if c.extraLevels[0].files.Empty() {
   991  		// a multi level compaction without data in the intermediate input level;
   992  		// e.g. for a multi level compaction with levels 4,5, and 6, this could
    993  		// occur if there are no files to compact in 5, or in 5 and 6 (i.e. a move).
   994  		return false
   995  	}
   996  	return true
   997  }
   998  
   999  func (c *compaction) setupInuseKeyRanges() {
  1000  	level := c.outputLevel.level + 1
  1001  	if c.outputLevel.level == 0 {
  1002  		level = 0
  1003  	}
  1004  	// calculateInuseKeyRanges will return a series of sorted spans. Overlapping
  1005  	// or abutting spans have already been merged.
  1006  	c.inuseKeyRanges = calculateInuseKeyRanges(
  1007  		c.version, c.cmp, level, numLevels-1, c.smallest.UserKey, c.largest.UserKey,
  1008  	)
  1009  	// Check if there's a single in-use span that encompasses the entire key
  1010  	// range of the compaction. This is an optimization to avoid key comparisons
  1011  	// against inuseKeyRanges during the compaction when every key within the
  1012  	// compaction overlaps with an in-use span.
  1013  	if len(c.inuseKeyRanges) > 0 {
  1014  		c.inuseEntireRange = c.cmp(c.inuseKeyRanges[0].Start, c.smallest.UserKey) <= 0 &&
  1015  			c.cmp(c.inuseKeyRanges[0].End, c.largest.UserKey) >= 0
  1016  	}
  1017  }
  1018  
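         // calculateInuseKeyRanges returns the merged set of user key ranges within
         // [smallest, largest] that are in use by sstables in levels [level, maxLevel]
         // of v. The returned ranges are sorted, and overlapping or abutting ranges
         // have been merged.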
  1019  func calculateInuseKeyRanges(
  1020  	v *version, cmp base.Compare, level, maxLevel int, smallest, largest []byte,
  1021  ) []manifest.UserKeyRange {
  1022  	// Use two slices, alternating which one is input and which one is output
  1023  	// as we descend the LSM.
  1024  	var input, output []manifest.UserKeyRange
  1025  
  1026  	// L0 requires special treatment, since sstables within L0 may overlap.
  1027  	// We use the L0 Sublevels structure to efficiently calculate the merged
  1028  	// in-use key ranges.
  1029  	if level == 0 {
  1030  		output = v.L0Sublevels.InUseKeyRanges(smallest, largest)
  1031  		level++
  1032  	}
  1033  
  1034  	for ; level <= maxLevel; level++ {
  1035  		// NB: We always treat `largest` as inclusive for simplicity, because
  1036  		// there's little consequence to calculating slightly broader in-use key
  1037  		// ranges.
  1038  		overlaps := v.Overlaps(level, cmp, smallest, largest, false /* exclusiveEnd */)
  1039  		iter := overlaps.Iter()
  1040  
  1041  		// We may already have in-use key ranges from higher levels. Iterate
  1042  		// through both our accumulated in-use key ranges and this level's
  1043  		// files, merging the two.
  1044  		//
  1045  		// Tables higher within the LSM have broader key spaces. We use this
  1046  		// when possible to seek past a level's files that are contained by
  1047  		// our current accumulated in-use key ranges. This helps avoid
  1048  		// per-sstable work during flushes or compactions in high levels which
  1049  		// overlap the majority of the LSM's sstables.
  1050  		input, output = output, input
  1051  		output = output[:0]
  1052  
  1053  		var currFile *fileMetadata
  1054  		var currAccum *manifest.UserKeyRange
  1055  		if len(input) > 0 {
  1056  			currAccum, input = &input[0], input[1:]
  1057  		}
  1058  
  1059  		// If we have an accumulated key range and its start is ≤ smallest,
  1060  		// we can seek to the accumulated range's end. Otherwise, we need to
  1061  		// start at the first overlapping file within the level.
  1062  		if currAccum != nil && cmp(currAccum.Start, smallest) <= 0 {
  1063  			currFile = seekGT(&iter, cmp, currAccum.End)
  1064  		} else {
  1065  			currFile = iter.First()
  1066  		}
  1067  
  1068  		for currFile != nil || currAccum != nil {
  1069  			// If we've exhausted either the files in the level or the
  1070  			// accumulated key ranges, we just need to append the one we have.
  1071  			// If we have both a currFile and a currAccum, they either overlap
  1072  			// or they're disjoint. If they're disjoint, we append whichever
  1073  			// one sorts first and move on to the next file or range. If they
  1074  			// overlap, we merge them into currAccum and proceed to the next
  1075  			// file.
  1076  			switch {
  1077  			case currAccum == nil || (currFile != nil && cmp(currFile.Largest.UserKey, currAccum.Start) < 0):
  1078  				// This file is strictly before the current accumulated range,
  1079  				// or there are no more accumulated ranges.
  1080  				output = append(output, manifest.UserKeyRange{
  1081  					Start: currFile.Smallest.UserKey,
  1082  					End:   currFile.Largest.UserKey,
  1083  				})
  1084  				currFile = iter.Next()
  1085  			case currFile == nil || (currAccum != nil && cmp(currAccum.End, currFile.Smallest.UserKey) < 0):
  1086  				// The current accumulated key range is strictly before the
  1087  				// current file, or there are no more files.
  1088  				output = append(output, *currAccum)
  1089  				currAccum = nil
  1090  				if len(input) > 0 {
  1091  					currAccum, input = &input[0], input[1:]
  1092  				}
  1093  			default:
  1094  				// The current accumulated range and the current file overlap.
  1095  				// Adjust the accumulated range to be the union.
  1096  				if cmp(currFile.Smallest.UserKey, currAccum.Start) < 0 {
  1097  					currAccum.Start = currFile.Smallest.UserKey
  1098  				}
  1099  				if cmp(currFile.Largest.UserKey, currAccum.End) > 0 {
  1100  					currAccum.End = currFile.Largest.UserKey
  1101  				}
  1102  
  1103  				// Extending `currAccum`'s end boundary may have caused it to
  1104  				// overlap with `input` key ranges that we haven't processed
  1105  				// yet. Merge any such key ranges.
  1106  				for len(input) > 0 && cmp(input[0].Start, currAccum.End) <= 0 {
  1107  					if cmp(input[0].End, currAccum.End) > 0 {
  1108  						currAccum.End = input[0].End
  1109  					}
  1110  					input = input[1:]
  1111  				}
  1112  				// Seek the level iterator past our current accumulated end.
  1113  				currFile = seekGT(&iter, cmp, currAccum.End)
  1114  			}
  1115  		}
  1116  	}
  1117  	return output
  1118  }
  1119  
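         // seekGT seeks iter to the first file whose largest user key is strictly
         // greater than key, returning nil if no such file exists.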
  1120  func seekGT(iter *manifest.LevelIterator, cmp base.Compare, key []byte) *manifest.FileMetadata {
  1121  	f := iter.SeekGE(cmp, key)
  1122  	for f != nil && cmp(f.Largest.UserKey, key) == 0 {
  1123  		f = iter.Next()
  1124  	}
  1125  	return f
  1126  }
  1127  
  1128  // findGrandparentLimit takes the start user key for a table and returns the
  1129  // user key to which that table can extend without excessively overlapping
  1130  // the grandparent level. If no limit is needed considering the grandparent
  1131  // files, this function returns nil. This is done in order to prevent a table
  1132  // at level N from overlapping too much data at level N+1. We want to avoid
  1133  // such large overlaps because they translate into large compactions. The
  1134  // current heuristic stops output of a table if the addition of another key
  1135  // would cause the table to overlap more than 10x the target file size at
  1136  // level N. See maxGrandparentOverlapBytes.
  1137  func (c *compaction) findGrandparentLimit(start []byte) []byte {
  1138  	iter := c.grandparents.Iter()
  1139  	var overlappedBytes uint64
  1140  	var greater bool
  1141  	for f := iter.SeekGE(c.cmp, start); f != nil; f = iter.Next() {
  1142  		overlappedBytes += f.Size
  1143  		// To ensure forward progress we always return a larger user
  1144  		// key than where we started. See comments above clients of
  1145  		// this function for how this is used.
  1146  		greater = greater || c.cmp(f.Smallest.UserKey, start) > 0
  1147  		if !greater {
  1148  			continue
  1149  		}
  1150  
  1151  		// We return the smallest bound of a sstable rather than the
  1152  		// largest because the smallest is always inclusive, and limits
   1153  		// are used as exclusive bounds when truncating range tombstones. If we
  1154  		// truncated an output to the largest key while there's a
  1155  		// pending tombstone, the next output file would also overlap
  1156  		// the same grandparent f.
  1157  		if overlappedBytes > c.maxOverlapBytes {
  1158  			return f.Smallest.UserKey
  1159  		}
  1160  	}
  1161  	return nil
  1162  }
  1163  
  1164  // findL0Limit takes the start key for a table and returns the user key to which
  1165  // that table can be extended without hitting the next l0Limit. Having flushed
  1166  // sstables "bridging across" an l0Limit could lead to increased L0 -> LBase
  1167  // compaction sizes as well as elevated read amplification.
  1168  func (c *compaction) findL0Limit(start []byte) []byte {
  1169  	if c.startLevel.level > -1 || c.outputLevel.level != 0 || len(c.l0Limits) == 0 {
  1170  		return nil
  1171  	}
  1172  	index := sort.Search(len(c.l0Limits), func(i int) bool {
  1173  		return c.cmp(c.l0Limits[i], start) > 0
  1174  	})
  1175  	if index < len(c.l0Limits) {
  1176  		return c.l0Limits[index]
  1177  	}
  1178  	return nil
  1179  }
  1180  
  1181  // errorOnUserKeyOverlap returns an error if the last two written sstables in
   1182  // this compaction both contain revisions of the same user key, in contexts where
   1183  // that should not happen (eg. when splitting flushes).
  1184  func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error {
  1185  	if n := len(ve.NewFiles); n > 1 {
  1186  		meta := ve.NewFiles[n-1].Meta
  1187  		prevMeta := ve.NewFiles[n-2].Meta
  1188  		if !prevMeta.Largest.IsExclusiveSentinel() &&
  1189  			c.cmp(prevMeta.Largest.UserKey, meta.Smallest.UserKey) >= 0 {
  1190  			return errors.Errorf("pebble: compaction split user key across two sstables: %s in %s and %s",
  1191  				prevMeta.Largest.Pretty(c.formatKey),
  1192  				prevMeta.FileNum,
  1193  				meta.FileNum)
  1194  		}
  1195  	}
  1196  	return nil
  1197  }
  1198  
   1199  // allowZeroSeqNum returns true if seqnums can be zeroed if there are no
  1200  // snapshots requiring them to be kept. It performs this determination by
  1201  // looking for an sstable which overlaps the bounds of the compaction at a
  1202  // lower level in the LSM.
  1203  func (c *compaction) allowZeroSeqNum() bool {
  1204  	return c.elideRangeTombstone(c.smallest.UserKey, c.largest.UserKey)
  1205  }
  1206  
  1207  // elideTombstone returns true if it is ok to elide a tombstone for the
  1208  // specified key. A return value of true guarantees that there are no key/value
   1209  // pairs at c.outputLevel.level+1 or higher that possibly contain the specified user key.
  1210  // key. The keys in multiple invocations to elideTombstone must be supplied in
  1211  // order.
  1212  func (c *compaction) elideTombstone(key []byte) bool {
  1213  	if c.inuseEntireRange || len(c.flushing) != 0 {
  1214  		return false
  1215  	}
  1216  
  1217  	for ; c.elideTombstoneIndex < len(c.inuseKeyRanges); c.elideTombstoneIndex++ {
  1218  		r := &c.inuseKeyRanges[c.elideTombstoneIndex]
  1219  		if c.cmp(key, r.End) <= 0 {
  1220  			if c.cmp(key, r.Start) >= 0 {
  1221  				return false
  1222  			}
  1223  			break
  1224  		}
  1225  	}
  1226  	return true
  1227  }
  1228  
  1229  // elideRangeTombstone returns true if it is ok to elide the specified range
  1230  // tombstone. A return value of true guarantees that there are no key/value
  1231  // pairs at c.outputLevel.level+1 or higher that possibly overlap the specified
  1232  // tombstone.
  1233  func (c *compaction) elideRangeTombstone(start, end []byte) bool {
  1234  	// Disable range tombstone elision if the testing knob for that is enabled,
  1235  	// or if we are flushing memtables. The latter requirement is due to
  1236  	// inuseKeyRanges not accounting for key ranges in other memtables that are
  1237  	// being flushed in the same compaction. It's possible for a range tombstone
  1238  	// in one memtable to overlap keys in a preceding memtable in c.flushing.
  1239  	//
  1240  	// This function is also used in setting allowZeroSeqNum, so disabling
  1241  	// elision of range tombstones also disables zeroing of SeqNums.
  1242  	//
  1243  	// TODO(peter): we disable zeroing of seqnums during flushing to match
  1244  	// RocksDB behavior and to avoid generating overlapping sstables during
  1245  	// DB.replayWAL. When replaying WAL files at startup, we flush after each
  1246  	// WAL is replayed building up a single version edit that is
  1247  	// applied. Because we don't apply the version edit after each flush, this
  1248  	// code doesn't know that L0 contains files and zeroing of seqnums should
  1249  	// be disabled. That is fixable, but it seems safer to just match the
  1250  	// RocksDB behavior for now.
  1251  	if c.disableSpanElision || len(c.flushing) != 0 {
  1252  		return false
  1253  	}
  1254  
  1255  	lower := sort.Search(len(c.inuseKeyRanges), func(i int) bool {
  1256  		return c.cmp(c.inuseKeyRanges[i].End, start) >= 0
  1257  	})
  1258  	upper := sort.Search(len(c.inuseKeyRanges), func(i int) bool {
  1259  		return c.cmp(c.inuseKeyRanges[i].Start, end) > 0
  1260  	})
  1261  	return lower >= upper
  1262  }
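
// Editor's illustrative sketch (not part of Pebble) of the binary-search
// overlap test in elideRangeTombstone above, reusing the exampleKeyRange type
// from the previous sketch. For ranges that are sorted by start key and
// non-overlapping, [start, end] overlaps some range exactly when the first
// range ending at or after start begins before the first range starting after
// end; elision is allowed only when there is no such overlap. The function
// name is hypothetical.
func overlapsAnyInUseRange(ranges []exampleKeyRange, start, end []byte) bool {
	lower := sort.Search(len(ranges), func(i int) bool {
		return bytes.Compare(ranges[i].End, start) >= 0
	})
	upper := sort.Search(len(ranges), func(i int) bool {
		return bytes.Compare(ranges[i].Start, end) > 0
	})
	// lower < upper means at least one range intersects [start, end].
	return lower < upper
}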
  1263  
  1264  // elideRangeKey returns true if it is ok to elide the specified range key. A
  1265  // return value of true guarantees that there are no key/value pairs at
  1266  // c.outputLevel.level+1 or higher that possibly overlap the specified range key.
  1267  func (c *compaction) elideRangeKey(start, end []byte) bool {
  1268  	// TODO(bilal): Track inuseKeyRanges separately for the range keyspace as
  1269  	// opposed to the point keyspace. Once that is done, elideRangeTombstone
  1270  	// can just check in the point keyspace, and this function can check for
  1271  	// inuseKeyRanges in the range keyspace.
  1272  	return c.elideRangeTombstone(start, end)
  1273  }
  1274  
  1275  // newInputIter returns an iterator over all the input tables in a compaction.
  1276  func (c *compaction) newInputIter(
  1277  	newIters tableNewIters, newRangeKeyIter keyspan.TableNewSpanIter, snapshots []uint64,
  1278  ) (_ internalIterator, retErr error) {
  1279  	// Validate the ordering of compaction input files for defense in depth.
  1280  	// TODO(jackson): Some of the CheckOrdering calls may be adapted to pass
  1281  	// ProhibitSplitUserKeys if we thread the active format major version in. Or
  1282  	// if we remove support for earlier FMVs, we can remove the parameter
  1283  	// altogether.
  1284  	if len(c.flushing) == 0 {
  1285  		if c.startLevel.level >= 0 {
  1286  			err := manifest.CheckOrdering(c.cmp, c.formatKey,
  1287  				manifest.Level(c.startLevel.level), c.startLevel.files.Iter(),
  1288  				manifest.AllowSplitUserKeys)
  1289  			if err != nil {
  1290  				return nil, err
  1291  			}
  1292  		}
  1293  		err := manifest.CheckOrdering(c.cmp, c.formatKey,
  1294  			manifest.Level(c.outputLevel.level), c.outputLevel.files.Iter(),
  1295  			manifest.AllowSplitUserKeys)
  1296  		if err != nil {
  1297  			return nil, err
  1298  		}
  1299  		if c.startLevel.level == 0 {
  1300  			if c.startLevel.l0SublevelInfo == nil {
  1301  				panic("l0SublevelInfo not created for compaction out of L0")
  1302  			}
  1303  			for _, info := range c.startLevel.l0SublevelInfo {
  1304  				err := manifest.CheckOrdering(c.cmp, c.formatKey,
  1305  					info.sublevel, info.Iter(),
  1306  					// NB: L0 sublevels have never allowed split user keys.
  1307  					manifest.ProhibitSplitUserKeys)
  1308  				if err != nil {
  1309  					return nil, err
  1310  				}
  1311  			}
  1312  		}
  1313  		if len(c.extraLevels) > 0 {
  1314  			if len(c.extraLevels) > 1 {
  1315  				panic("n>2 multi level compaction not implemented yet")
  1316  			}
  1317  			interLevel := c.extraLevels[0]
  1318  			err := manifest.CheckOrdering(c.cmp, c.formatKey,
  1319  				manifest.Level(interLevel.level), interLevel.files.Iter(),
  1320  				manifest.AllowSplitUserKeys)
  1321  			if err != nil {
  1322  				return nil, err
  1323  			}
  1324  		}
  1325  	}
  1326  
  1327  	// There are three classes of keys that a compaction needs to process: point
  1328  	// keys, range deletion tombstones and range keys. Collect all iterators for
  1329  	// all these classes of keys from all the levels. We'll aggregate them
  1330  	// together farther below.
  1331  	//
  1332  	// numInputLevels is an approximation of the number of iterator levels. Due
  1333  	// to idiosyncrasies in iterator construction, we may (rarely) exceed this
  1334  	// initial capacity.
  1335  	numInputLevels := max(len(c.flushing), len(c.inputs))
  1336  	iters := make([]internalIterator, 0, numInputLevels)
  1337  	rangeDelIters := make([]keyspan.FragmentIterator, 0, numInputLevels)
  1338  	rangeKeyIters := make([]keyspan.FragmentIterator, 0, numInputLevels)
  1339  
  1340  	// If construction of the iterator inputs fails, ensure that we close all
  1341  	// the constituent iterators.
  1342  	defer func() {
  1343  		if retErr != nil {
  1344  			for _, iter := range iters {
  1345  				if iter != nil {
  1346  					iter.Close()
  1347  				}
  1348  			}
  1349  			for _, rangeDelIter := range rangeDelIters {
  1350  				rangeDelIter.Close()
  1351  			}
  1352  		}
  1353  	}()
  1354  	iterOpts := IterOptions{
  1355  		CategoryAndQoS: sstable.CategoryAndQoS{
  1356  			Category: "pebble-compaction",
  1357  			QoSLevel: sstable.NonLatencySensitiveQoSLevel,
  1358  		},
  1359  		logger: c.logger,
  1360  	}
  1361  
  1362  	// Populate iters, rangeDelIters and rangeKeyIters with the appropriate
  1363  	// constituent iterators. This depends on whether this is a flush or a
  1364  	// compaction.
  1365  	if len(c.flushing) != 0 {
  1366  		// If flushing, we need to build the input iterators over the memtables
  1367  		// stored in c.flushing.
  1368  		for i := range c.flushing {
  1369  			f := c.flushing[i]
  1370  			iters = append(iters, f.newFlushIter(nil, &c.bytesIterated))
  1371  			rangeDelIter := f.newRangeDelIter(nil)
  1372  			if rangeDelIter != nil {
  1373  				rangeDelIters = append(rangeDelIters, rangeDelIter)
  1374  			}
  1375  			if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
  1376  				rangeKeyIters = append(rangeKeyIters, rangeKeyIter)
  1377  			}
  1378  		}
  1379  	} else {
  1380  		addItersForLevel := func(level *compactionLevel, l manifest.Level) error {
  1381  			// Add a *levelIter for point iterators. Because we don't call
  1382  			// initRangeDel, the levelIter will close and forget the range
  1383  			// deletion iterator when it steps on to a new file. Surfacing range
  1384  			// deletions to compactions is handled below.
  1385  			iters = append(iters, newLevelIter(context.Background(),
  1386  				iterOpts, c.comparer, newIters, level.files.Iter(), l, internalIterOpts{
  1387  					bytesIterated: &c.bytesIterated,
  1388  					bufferPool:    &c.bufferPool,
  1389  				}))
  1390  			// TODO(jackson): Use keyspan.LevelIter to avoid loading all the range
  1391  			// deletions into memory upfront. (See #2015, which reverted this.)
  1392  			// There will be no user keys that are split between sstables
  1393  			// within a level in Cockroach 23.1, which unblocks this optimization.
  1394  
  1395  			// Add the range deletion iterator for each file as an independent level
  1396  			// in mergingIter, as opposed to making a levelIter out of those. This
  1397  			// is safer as levelIter expects all keys coming from underlying
  1398  			// iterators to be in order. Due to compaction / tombstone writing
  1399  			// logic in finishOutput(), it is possible for range tombstones to not
  1400  			// be strictly ordered across all files in one level.
  1401  			//
  1402  			// Consider this example from the metamorphic tests (also repeated in
  1403  			// finishOutput()), consisting of three L3 files with their bounds
  1404  			// specified in square brackets next to the file name:
  1405  			//
  1406  			// ./000240.sst   [tmgc#391,MERGE-tmgc#391,MERGE]
  1407  			// tmgc#391,MERGE [786e627a]
  1408  			// tmgc-udkatvs#331,RANGEDEL
  1409  			//
  1410  			// ./000241.sst   [tmgc#384,MERGE-tmgc#384,MERGE]
  1411  			// tmgc#384,MERGE [666c7070]
  1412  			// tmgc-tvsalezade#383,RANGEDEL
  1413  			// tmgc-tvsalezade#331,RANGEDEL
  1414  			//
  1415  			// ./000242.sst   [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL]
  1416  			// tmgc-tvsalezade#383,RANGEDEL
  1417  			// tmgc#375,SET [72646c78766965616c72776865676e79]
  1418  			// tmgc-tvsalezade#356,RANGEDEL
  1419  			//
  1420  			// Here, the range tombstone in 000240.sst falls "after" one in
  1421  			// 000241.sst, despite 000240.sst being ordered "before" 000241.sst for
  1422  			// levelIter's purposes. While each file is still consistent within its
  1423  			// bounds, it's safer to have all rangedel iterators be visible to
  1424  			// mergingIter.
  1425  			iter := level.files.Iter()
  1426  			for f := iter.First(); f != nil; f = iter.Next() {
  1427  				rangeDelIter, closer, err := c.newRangeDelIter(
  1428  					newIters, iter.Take(), iterOpts, l, &c.bytesIterated)
  1429  				if err != nil {
  1430  					// The error will already be annotated with the BackingFileNum, so
  1431  					// we annotate it with the FileNum.
  1432  					return errors.Wrapf(err, "pebble: could not open table %s", errors.Safe(f.FileNum))
  1433  				}
  1434  				if rangeDelIter == nil {
  1435  					continue
  1436  				}
  1437  				rangeDelIters = append(rangeDelIters, rangeDelIter)
  1438  				c.closers = append(c.closers, closer)
  1439  			}
  1440  
  1441  			// Check if this level has any range keys.
  1442  			hasRangeKeys := false
  1443  			for f := iter.First(); f != nil; f = iter.Next() {
  1444  				if f.HasRangeKeys {
  1445  					hasRangeKeys = true
  1446  					break
  1447  				}
  1448  			}
  1449  			if hasRangeKeys {
  1450  				li := &keyspan.LevelIter{}
  1451  				newRangeKeyIterWrapper := func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) {
  1452  					iter, err := newRangeKeyIter(file, iterOptions)
  1453  					if err != nil {
  1454  						return nil, err
  1455  					} else if iter == nil {
  1456  						return emptyKeyspanIter, nil
  1457  					}
  1458  					// Ensure that the range key iter is not closed until the compaction is
  1459  					// finished. This is necessary because range key processing
  1460  					// requires the range keys to be held in memory for up to the
  1461  					// lifetime of the compaction.
  1462  					c.closers = append(c.closers, iter)
  1463  					iter = noCloseIter{iter}
  1464  
  1465  					// We do not need to truncate range keys to sstable boundaries, or
  1466  					// only read within the file's atomic compaction units, unlike with
  1467  					// range tombstones. This is because range keys were added after we
  1468  					// stopped splitting user keys across sstables, so all the range keys
  1469  					// in this sstable must wholly lie within the file's bounds.
  1470  					return iter, err
  1471  				}
  1472  				li.Init(keyspan.SpanIterOptions{}, c.cmp, newRangeKeyIterWrapper, level.files.Iter(), l, manifest.KeyTypeRange)
  1473  				rangeKeyIters = append(rangeKeyIters, li)
  1474  			}
  1475  			return nil
  1476  		}
  1477  
  1478  		for i := range c.inputs {
  1479  			// If the level is annotated with l0SublevelInfo, expand it into one
  1480  			// level per sublevel.
  1481  			// TODO(jackson): Perform this expansion even earlier when we pick the
  1482  			// compaction?
  1483  			if len(c.inputs[i].l0SublevelInfo) > 0 {
  1484  				for _, info := range c.startLevel.l0SublevelInfo {
  1485  					sublevelCompactionLevel := &compactionLevel{0, info.LevelSlice, nil}
  1486  					if err := addItersForLevel(sublevelCompactionLevel, info.sublevel); err != nil {
  1487  						return nil, err
  1488  					}
  1489  				}
  1490  				continue
  1491  			}
  1492  			if err := addItersForLevel(&c.inputs[i], manifest.Level(c.inputs[i].level)); err != nil {
  1493  				return nil, err
  1494  			}
  1495  		}
  1496  	}
  1497  
  1498  	// In normal operation, levelIter iterates over the point operations in a
  1499  	// level, and initializes a rangeDelIter pointer for the range deletions in
  1500  	// each table. During compaction, we want to iterate over the merged view of
  1501  	// point operations and range deletions. In order to do this we create one
  1502  	// levelIter per level to iterate over the point operations, and collect up
  1503  	// all the range deletion files.
  1504  	//
  1505  	// The range deletion levels are first combined with a keyspan.MergingIter
  1506  	// (currently wrapped by a keyspan.InternalIteratorShim to satisfy the
  1507  	// internal iterator interface). The resulting merged rangedel iterator is
  1508  	// then included with the point levels in a single mergingIter.
  1509  	//
  1510  	// Combine all the rangedel iterators using a keyspan.MergingIter and an
  1511  	// InternalIteratorShim so that the range deletions may be interleaved in
  1512  	// the compaction input.
  1513  	// TODO(jackson): Replace the InternalIteratorShim with an interleaving
  1514  	// iterator.
  1515  	if len(rangeDelIters) > 0 {
  1516  		c.rangeDelIter.Init(c.cmp, rangeDelIters...)
  1517  		iters = append(iters, &c.rangeDelIter)
  1518  	}
  1519  
  1520  	// If there's only one constituent point iterator, we can avoid the overhead
  1521  	// of a *mergingIter. This is possible, for example, when performing a flush
  1522  	// of a single memtable. Otherwise, combine all the iterators into a merging
  1523  	// iter.
  1524  	iter := iters[0]
  1525  	if len(iters) > 1 {
  1526  		iter = newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...)
  1527  	}
  1528  	// If there are range key iterators, we need to combine them using
  1529  	// keyspan.MergingIter, and then interleave them among the points.
  1530  	if len(rangeKeyIters) > 0 {
  1531  		mi := &keyspan.MergingIter{}
  1532  		mi.Init(c.cmp, rangeKeyCompactionTransform(c.equal, snapshots, c.elideRangeKey), new(keyspan.MergingBuffers), rangeKeyIters...)
  1533  		di := &keyspan.DefragmentingIter{}
  1534  		di.Init(c.comparer, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, new(keyspan.DefragmentingBuffers))
  1535  		c.rangeKeyInterleaving.Init(c.comparer, iter, di, keyspan.InterleavingIterOpts{})
  1536  		iter = &c.rangeKeyInterleaving
  1537  	}
  1538  	return iter, nil
  1539  }
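
// Editor's illustrative sketch (not part of Pebble): the deferred closure near
// the top of newInputIter is a general Go idiom for building many resources at
// once without leaking on partial failure. A deferred function keyed on the
// named error result closes everything constructed so far if any later step
// fails. The helper and its openFn parameter are hypothetical.
func closeAllOnError(openFn func(i int) (io.Closer, error), n int) (_ []io.Closer, retErr error) {
	closers := make([]io.Closer, 0, n)
	defer func() {
		if retErr != nil {
			for _, c := range closers {
				// Best-effort cleanup; the construction error is returned.
				_ = c.Close()
			}
		}
	}()
	for i := 0; i < n; i++ {
		c, err := openFn(i)
		if err != nil {
			return nil, err
		}
		closers = append(closers, c)
	}
	return closers, nil
}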
  1540  
  1541  func (c *compaction) newRangeDelIter(
  1542  	newIters tableNewIters,
  1543  	f manifest.LevelFile,
  1544  	opts IterOptions,
  1545  	l manifest.Level,
  1546  	bytesIterated *uint64,
  1547  ) (keyspan.FragmentIterator, io.Closer, error) {
  1548  	opts.level = l
  1549  	iter, rangeDelIter, err := newIters(context.Background(), f.FileMetadata,
  1550  		&opts, internalIterOpts{
  1551  			bytesIterated: &c.bytesIterated,
  1552  			bufferPool:    &c.bufferPool,
  1553  		})
  1554  	if err != nil {
  1555  		return nil, nil, err
  1556  	}
  1557  	// TODO(peter): It is mildly wasteful to open the point iterator only to
  1558  	// immediately close it. One way to solve this would be to add new
  1559  	// methods to tableCache for creating point and range-deletion iterators
  1560  	// independently. We'd only want to use those methods here,
  1561  	// though. Doesn't seem worth the hassle in the near term.
  1562  	if err = iter.Close(); err != nil {
  1563  		if rangeDelIter != nil {
  1564  			err = errors.CombineErrors(err, rangeDelIter.Close())
  1565  		}
  1566  		return nil, nil, err
  1567  	}
  1568  	if rangeDelIter == nil {
  1569  		// The file doesn't contain any range deletions.
  1570  		return nil, nil, nil
  1571  	}
  1572  
  1573  	// Ensure that rangeDelIter is not closed until the compaction is
  1574  	// finished. This is necessary because range tombstone processing
  1575  	// requires the range tombstones to be held in memory for up to the
  1576  	// lifetime of the compaction.
  1577  	closer := rangeDelIter
  1578  	rangeDelIter = noCloseIter{rangeDelIter}
  1579  
  1580  	// Truncate the range tombstones returned by the iterator to the
  1581  	// upper bound of the atomic compaction unit of the file. We want to
  1582  	// truncate the range tombstone to the bounds of the file, but files
  1583  	// with split user keys pose an obstacle: The file's largest bound
  1584  	// is inclusive whereas the range tombstone's end is exclusive.
  1585  	//
  1586  	// Consider the example:
  1587  	//
  1588  	//   000001:[b-f#200]         range del [c,k)
  1589  	//   000002:[f#190-g#inf]     range del [c,k)
  1590  	//   000003:[g#500-i#3]
  1591  	//
  1592  	// Files 000001 and 000002 contain the untruncated range tombstones
  1593  	// [c,k). While the keyspace covered by 000003 was at one point
  1594  	// deleted by the tombstone [c,k), the tombstone may have already
  1595  	// been compacted away and the file does not contain an untruncated
  1596  	// range tombstone. We want to bound 000001's tombstone to the file
  1597  	// bounds, but it's not possible to encode a range tombstone with an
  1598  	// end boundary within a user key (eg, between sequence numbers
  1599  	// f#200 and f#190). Instead, we expand 000001 to its atomic
  1600  	// compaction unit (000001 and 000002) and truncate the tombstone to
  1601  	// g#inf.
  1602  	//
  1603  	// NB: We must not use the atomic compaction unit of the entire
  1604  	// compaction, because the [c,k) tombstone contained in the file
  1605  	// 000001 covers keys ≥ g. If 000001, 000002 and 000003 are all in the
  1606  	// same compaction, the compaction's atomic compaction unit includes
  1607  	// 000003. However 000003's keys must not be covered by 000001's
  1608  	// untruncated range tombstone.
  1609  	//
  1610  	// Note that we need to do this truncation at read time in order to
  1611  	// handle sstables generated by RocksDB and earlier versions of
  1612  	// Pebble which do not truncate range tombstones to atomic
  1613  	// compaction unit boundaries at write time.
  1614  	//
  1615  	// The current Pebble compaction logic DOES truncate tombstones to
  1616  	// atomic unit boundaries at compaction time too.
  1617  	atomicUnit, _ := expandToAtomicUnit(c.cmp, f.Slice(), true /* disableIsCompacting */)
  1618  	lowerBound, upperBound := manifest.KeyRange(c.cmp, atomicUnit.Iter())
  1619  	// Range deletion tombstones are often written to sstables
  1620  	// untruncated on the end key side. However, they are still only
  1621  	// valid within a given file's bounds. The logic for writing range
  1622  	// tombstones to an output file sometimes has an incomplete view
  1623  	// of range tombstones outside the file's internal key bounds. Skip
  1624  	// any range tombstones completely outside file bounds.
  1625  	rangeDelIter = keyspan.Truncate(
  1626  		c.cmp, rangeDelIter, lowerBound.UserKey, upperBound.UserKey,
  1627  		&f.Smallest, &f.Largest, false, /* panicOnUpperTruncate */
  1628  	)
  1629  	return rangeDelIter, closer, nil
  1630  }
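
// Editor's illustrative sketch (not part of Pebble): newRangeDelIter above
// transfers ownership of the iterator's Close to the compaction by retaining
// the real closer (in c.closers) and handing back a wrapper whose Close is a
// no-op, so the fragments stay valid for the compaction's lifetime. A generic
// version of that pattern, with hypothetical names:
type exampleNoCloser struct {
	io.Closer
}

// Close overrides the embedded Close with a no-op; the retained closer is
// responsible for releasing the resource later.
func (exampleNoCloser) Close() error { return nil }

// retainUntilDone records c for deferred cleanup and returns a wrapper that
// callers may "close" freely without releasing the underlying resource.
func retainUntilDone(retained *[]io.Closer, c io.Closer) io.Closer {
	*retained = append(*retained, c)
	return exampleNoCloser{c}
}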
  1631  
  1632  func (c *compaction) String() string {
  1633  	if len(c.flushing) != 0 {
  1634  		return "flush\n"
  1635  	}
  1636  
  1637  	var buf bytes.Buffer
  1638  	for level := c.startLevel.level; level <= c.outputLevel.level; level++ {
  1639  		i := level - c.startLevel.level
  1640  		fmt.Fprintf(&buf, "%d:", level)
  1641  		iter := c.inputs[i].files.Iter()
  1642  		for f := iter.First(); f != nil; f = iter.Next() {
  1643  			fmt.Fprintf(&buf, " %s:%s-%s", f.FileNum, f.Smallest, f.Largest)
  1644  		}
  1645  		fmt.Fprintf(&buf, "\n")
  1646  	}
  1647  	return buf.String()
  1648  }
  1649  
  1650  type manualCompaction struct {
  1651  	// Count of the retries either due to too many concurrent compactions, or a
  1652  	// concurrent compaction to overlapping levels.
  1653  	retries     int
  1654  	level       int
  1655  	outputLevel int
  1656  	done        chan error
  1657  	start       []byte
  1658  	end         []byte
  1659  	split       bool
  1660  }
  1661  
  1662  type readCompaction struct {
  1663  	level int
  1664  	// [start, end] key ranges are used for de-duping.
  1665  	start []byte
  1666  	end   []byte
  1667  
  1668  	// The file associated with the compaction.
  1669  	// If the file no longer belongs in the same
  1670  	// level, then we skip the compaction.
  1671  	fileNum base.FileNum
  1672  }
  1673  
  1674  type downloadSpan struct {
  1675  	start []byte
  1676  	end   []byte
  1677  	// doneChans contains a list of channels passed into compactions as done
  1678  	// channels. Each channel has a buffer size of 1 and is only passed into
  1679  	// one compaction. This slice can grow over the lifetime of a downloadSpan.
  1680  	doneChans []chan error
  1681  	// compactionsStarted is the number of compactions started for this
  1682  	// downloadSpan. Must be equal to len(doneChans)-1, i.e. there's one spare
  1683  	// doneChan created each time a compaction starts up, for the next compaction.
  1684  	compactionsStarted int
  1685  }
  1686  
  1687  func (d *DB) addInProgressCompaction(c *compaction) {
  1688  	d.mu.compact.inProgress[c] = struct{}{}
  1689  	var isBase, isIntraL0 bool
  1690  	for _, cl := range c.inputs {
  1691  		iter := cl.files.Iter()
  1692  		for f := iter.First(); f != nil; f = iter.Next() {
  1693  			if f.IsCompacting() {
  1694  				d.opts.Logger.Fatalf("L%d->L%d: %s already being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum)
  1695  			}
  1696  			f.SetCompactionState(manifest.CompactionStateCompacting)
  1697  			if c.startLevel != nil && c.outputLevel != nil && c.startLevel.level == 0 {
  1698  				if c.outputLevel.level == 0 {
  1699  					f.IsIntraL0Compacting = true
  1700  					isIntraL0 = true
  1701  				} else {
  1702  					isBase = true
  1703  				}
  1704  			}
  1705  		}
  1706  	}
  1707  
  1708  	if (isIntraL0 || isBase) && c.version.L0Sublevels != nil {
  1709  		l0Inputs := []manifest.LevelSlice{c.startLevel.files}
  1710  		if isIntraL0 {
  1711  			l0Inputs = append(l0Inputs, c.outputLevel.files)
  1712  		}
  1713  		if err := c.version.L0Sublevels.UpdateStateForStartedCompaction(l0Inputs, isBase); err != nil {
  1714  			d.opts.Logger.Fatalf("could not update state for compaction: %s", err)
  1715  		}
  1716  	}
  1717  }
  1718  
  1719  // Removes compaction markers from files in a compaction. The rollback parameter
  1720  // indicates whether the compaction state should be rolled back to its original
  1721  // state in the case of an unsuccessful compaction.
  1722  //
  1723  // DB.mu must be held when calling this method, however this method can drop and
  1724  // re-acquire that mutex. All writes to the manifest for this compaction should
  1725  // have completed by this point.
  1726  func (d *DB) clearCompactingState(c *compaction, rollback bool) {
  1727  	c.versionEditApplied = true
  1728  	for _, cl := range c.inputs {
  1729  		iter := cl.files.Iter()
  1730  		for f := iter.First(); f != nil; f = iter.Next() {
  1731  			if !f.IsCompacting() {
  1732  				d.opts.Logger.Fatalf("L%d->L%d: %s not being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum)
  1733  			}
  1734  			if !rollback {
  1735  				// On success all compactions other than move-compactions transition the
  1736  				// file into the Compacted state. Move-compacted files become eligible
  1737  				// for compaction again and transition back to NotCompacting.
  1738  				if c.kind != compactionKindMove {
  1739  					f.SetCompactionState(manifest.CompactionStateCompacted)
  1740  				} else {
  1741  					f.SetCompactionState(manifest.CompactionStateNotCompacting)
  1742  				}
  1743  			} else {
  1744  				// Else, on rollback, all input files unconditionally transition back to
  1745  				// NotCompacting.
  1746  				f.SetCompactionState(manifest.CompactionStateNotCompacting)
  1747  			}
  1748  			f.IsIntraL0Compacting = false
  1749  		}
  1750  	}
  1751  	l0InProgress := inProgressL0Compactions(d.getInProgressCompactionInfoLocked(c))
  1752  	func() {
  1753  		// InitCompactingFileInfo requires that no other manifest writes be
  1754  		// happening in parallel with it, i.e. we're not in the midst of installing
  1755  		// another version. Otherwise, it's possible that we've created another
  1756  		// L0Sublevels instance, but not added it to the versions list, causing
  1757  		// all the indices in FileMetadata to be inaccurate. To ensure this,
  1758  		// grab the manifest lock.
  1759  		d.mu.versions.logLock()
  1760  		defer d.mu.versions.logUnlock()
  1761  		d.mu.versions.currentVersion().L0Sublevels.InitCompactingFileInfo(l0InProgress)
  1762  	}()
  1763  }
  1764  
  1765  func (d *DB) calculateDiskAvailableBytes() uint64 {
  1766  	if space, err := d.opts.FS.GetDiskUsage(d.dirname); err == nil {
  1767  		d.diskAvailBytes.Store(space.AvailBytes)
  1768  		return space.AvailBytes
  1769  	} else if !errors.Is(err, vfs.ErrUnsupported) {
  1770  		d.opts.EventListener.BackgroundError(err)
  1771  	}
  1772  	return d.diskAvailBytes.Load()
  1773  }
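
// Editor's illustrative sketch (not part of Pebble): calculateDiskAvailableBytes
// above prefers a fresh measurement when the filesystem supports it, caches it
// atomically, and otherwise falls back to the last cached value. The helper
// and its fetch parameter are hypothetical.
func exampleCachedDiskUsage(cached *atomic.Uint64, fetch func() (uint64, error)) uint64 {
	if avail, err := fetch(); err == nil {
		cached.Store(avail)
		return avail
	}
	// Measurement unavailable; serve the last value we recorded.
	return cached.Load()
}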
  1774  
  1775  func (d *DB) getDeletionPacerInfo() deletionPacerInfo {
  1776  	var pacerInfo deletionPacerInfo
  1777  	// Call GetDiskUsage after every file deletion. This may seem inefficient,
  1778  	// but in practice this was observed to take constant time, regardless of
  1779  	// volume size used, at least on linux with ext4 and zfs. All invocations
  1780  	// take 10 microseconds or less.
  1781  	pacerInfo.freeBytes = d.calculateDiskAvailableBytes()
  1782  	d.mu.Lock()
  1783  	pacerInfo.obsoleteBytes = d.mu.versions.metrics.Table.ObsoleteSize
  1784  	pacerInfo.liveBytes = uint64(d.mu.versions.metrics.Total().Size)
  1785  	d.mu.Unlock()
  1786  	return pacerInfo
  1787  }
  1788  
  1789  // onObsoleteTableDelete is called to update metrics when an sstable is deleted.
  1790  func (d *DB) onObsoleteTableDelete(fileSize uint64) {
  1791  	d.mu.Lock()
  1792  	d.mu.versions.metrics.Table.ObsoleteCount--
  1793  	d.mu.versions.metrics.Table.ObsoleteSize -= fileSize
  1794  	d.mu.Unlock()
  1795  }
  1796  
  1797  // maybeScheduleFlush schedules a flush if necessary.
  1798  //
  1799  // d.mu must be held when calling this.
  1800  func (d *DB) maybeScheduleFlush() {
  1801  	if d.mu.compact.flushing || d.closed.Load() != nil || d.opts.ReadOnly {
  1802  		return
  1803  	}
  1804  	if len(d.mu.mem.queue) <= 1 {
  1805  		return
  1806  	}
  1807  
  1808  	if !d.passedFlushThreshold() {
  1809  		return
  1810  	}
  1811  
  1812  	d.mu.compact.flushing = true
  1813  	go d.flush()
  1814  }
  1815  
  1816  func (d *DB) passedFlushThreshold() bool {
  1817  	var n int
  1818  	var size uint64
  1819  	for ; n < len(d.mu.mem.queue)-1; n++ {
  1820  		if !d.mu.mem.queue[n].readyForFlush() {
  1821  			break
  1822  		}
  1823  		if d.mu.mem.queue[n].flushForced {
  1824  			// A flush was forced. Pretend the memtable size is the configured
  1825  			// size. See minFlushSize below.
  1826  			size += d.opts.MemTableSize
  1827  		} else {
  1828  			size += d.mu.mem.queue[n].totalBytes()
  1829  		}
  1830  	}
  1831  	if n == 0 {
  1832  		// None of the immutable memtables are ready for flushing.
  1833  		return false
  1834  	}
  1835  
  1836  	// Only flush once the sum of the queued memtable sizes exceeds half the
  1837  	// configured memtable size. This prevents flushing of memtables at startup
  1838  	// while we're undergoing the ramp period on the memtable size. See
  1839  	// DB.newMemTable().
  1840  	minFlushSize := d.opts.MemTableSize / 2
  1841  	return size >= minFlushSize
  1842  }
  1843  
  1844  func (d *DB) maybeScheduleDelayedFlush(tbl *memTable, dur time.Duration) {
  1845  	var mem *flushableEntry
  1846  	for _, m := range d.mu.mem.queue {
  1847  		if m.flushable == tbl {
  1848  			mem = m
  1849  			break
  1850  		}
  1851  	}
  1852  	if mem == nil || mem.flushForced {
  1853  		return
  1854  	}
  1855  	deadline := d.timeNow().Add(dur)
  1856  	if !mem.delayedFlushForcedAt.IsZero() && deadline.After(mem.delayedFlushForcedAt) {
  1857  		// Already scheduled to flush sooner than within `dur`.
  1858  		return
  1859  	}
  1860  	mem.delayedFlushForcedAt = deadline
  1861  	go func() {
  1862  		timer := time.NewTimer(dur)
  1863  		defer timer.Stop()
  1864  
  1865  		select {
  1866  		case <-d.closedCh:
  1867  			return
  1868  		case <-mem.flushed:
  1869  			return
  1870  		case <-timer.C:
  1871  			d.commit.mu.Lock()
  1872  			defer d.commit.mu.Unlock()
  1873  			d.mu.Lock()
  1874  			defer d.mu.Unlock()
  1875  
  1876  			// NB: The timer may fire concurrently with a call to Close.  If a
  1877  			// Close call beat us to acquiring d.mu, d.closed holds ErrClosed,
  1878  			// and it's too late to flush anything. Otherwise, the Close call
  1879  			// will block on locking d.mu until we've finished scheduling the
  1880  			// flush and set `d.mu.compact.flushing` to true. Close will wait
  1881  			// for the current flush to complete.
  1882  			if d.closed.Load() != nil {
  1883  				return
  1884  			}
  1885  
  1886  			if d.mu.mem.mutable == tbl {
  1887  				d.makeRoomForWrite(nil)
  1888  			} else {
  1889  				mem.flushForced = true
  1890  			}
  1891  			d.maybeScheduleFlush()
  1892  		}
  1893  	}()
  1894  }
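
// Editor's illustrative sketch (not part of Pebble): maybeScheduleDelayedFlush
// above uses a common goroutine pattern, arming a timer for the delay and
// racing it against two cancellation signals (the DB closing, or the work
// becoming unnecessary) so the goroutine never outlives either. All parameters
// below are hypothetical.
func scheduleDelayedWork(dur time.Duration, closed, done <-chan struct{}, work func()) {
	go func() {
		timer := time.NewTimer(dur)
		defer timer.Stop()
		select {
		case <-closed:
			// The owner shut down before the delay elapsed.
		case <-done:
			// The work became unnecessary (e.g. the memtable already flushed).
		case <-timer.C:
			work()
		}
	}()
}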
  1895  
  1896  func (d *DB) flush() {
  1897  	pprof.Do(context.Background(), flushLabels, func(context.Context) {
  1898  		flushingWorkStart := time.Now()
  1899  		d.mu.Lock()
  1900  		defer d.mu.Unlock()
  1901  		idleDuration := flushingWorkStart.Sub(d.mu.compact.noOngoingFlushStartTime)
  1902  		var bytesFlushed uint64
  1903  		var err error
  1904  		if bytesFlushed, err = d.flush1(); err != nil {
  1905  			// TODO(peter): count consecutive flush errors and backoff.
  1906  			d.opts.EventListener.BackgroundError(err)
  1907  		}
  1908  		d.mu.compact.flushing = false
  1909  		d.mu.compact.noOngoingFlushStartTime = time.Now()
  1910  		workDuration := d.mu.compact.noOngoingFlushStartTime.Sub(flushingWorkStart)
  1911  		d.mu.compact.flushWriteThroughput.Bytes += int64(bytesFlushed)
  1912  		d.mu.compact.flushWriteThroughput.WorkDuration += workDuration
  1913  		d.mu.compact.flushWriteThroughput.IdleDuration += idleDuration
  1914  		// More flush work may have arrived while we were flushing, so schedule
  1915  		// another flush if needed.
  1916  		d.maybeScheduleFlush()
  1917  		// The flush may have produced too many files in a level, so schedule a
  1918  		// compaction if needed.
  1919  		d.maybeScheduleCompaction()
  1920  		d.mu.compact.cond.Broadcast()
  1921  	})
  1922  }
  1923  
  1924  // runIngestFlush is used to generate a flush version edit for sstables which
  1925  // were ingested as flushables. Both DB.mu and the manifest lock must be held
  1926  // while runIngestFlush is called.
  1927  func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) {
  1928  	if len(c.flushing) != 1 {
  1929  		panic("pebble: ingestedFlushable must be flushed one at a time.")
  1930  	}
  1931  
  1932  	// Construct the VersionEdit, levelMetrics etc.
  1933  	c.metrics = make(map[int]*LevelMetrics, numLevels)
  1934  	// Finding the target level for ingestion must use the latest version
  1935  	// after the logLock has been acquired.
  1936  	c.version = d.mu.versions.currentVersion()
  1937  
  1938  	baseLevel := d.mu.versions.picker.getBaseLevel()
  1939  	iterOpts := IterOptions{logger: d.opts.Logger}
  1940  	ve := &versionEdit{}
  1941  	var level int
  1942  	var err error
  1943  	var fileToSplit *fileMetadata
  1944  	var ingestSplitFiles []ingestSplitFile
  1945  	for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files {
  1946  		suggestSplit := d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit() &&
  1947  			d.FormatMajorVersion() >= FormatVirtualSSTables
  1948  		level, fileToSplit, err = ingestTargetLevel(
  1949  			d.newIters, d.tableNewRangeKeyIter, iterOpts, d.opts.Comparer,
  1950  			c.version, baseLevel, d.mu.compact.inProgress, file.FileMetadata,
  1951  			suggestSplit,
  1952  		)
  1953  		if err != nil {
  1954  			return nil, err
  1955  		}
  1956  		ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: file.FileMetadata})
  1957  		if fileToSplit != nil {
  1958  			ingestSplitFiles = append(ingestSplitFiles, ingestSplitFile{
  1959  				ingestFile: file.FileMetadata,
  1960  				splitFile:  fileToSplit,
  1961  				level:      level,
  1962  			})
  1963  		}
  1964  		levelMetrics := c.metrics[level]
  1965  		if levelMetrics == nil {
  1966  			levelMetrics = &LevelMetrics{}
  1967  			c.metrics[level] = levelMetrics
  1968  		}
  1969  		levelMetrics.BytesIngested += file.Size
  1970  		levelMetrics.TablesIngested++
  1971  	}
  1972  
  1973  	updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) {
  1974  		levelMetrics := c.metrics[level]
  1975  		if levelMetrics == nil {
  1976  			levelMetrics = &LevelMetrics{}
  1977  			c.metrics[level] = levelMetrics
  1978  		}
  1979  		levelMetrics.NumFiles--
  1980  		levelMetrics.Size -= int64(m.Size)
  1981  		for i := range added {
  1982  			levelMetrics.NumFiles++
  1983  			levelMetrics.Size += int64(added[i].Meta.Size)
  1984  		}
  1985  	}
  1986  
  1987  	if len(ingestSplitFiles) > 0 {
  1988  		ve.DeletedFiles = make(map[manifest.DeletedFileEntry]*manifest.FileMetadata)
  1989  		replacedFiles := make(map[base.FileNum][]newFileEntry)
  1990  		if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, ingestSplitFiles, replacedFiles); err != nil {
  1991  			return nil, err
  1992  		}
  1993  	}
  1994  
  1995  	return ve, nil
  1996  }
  1997  
  1998  // flush runs a compaction that copies the immutable memtables from memory to
  1999  // disk.
  2000  //
  2001  // d.mu must be held when calling this, but the mutex may be dropped and
  2002  // re-acquired during the course of this method.
  2003  func (d *DB) flush1() (bytesFlushed uint64, err error) {
  2004  	// NB: The flushable queue can contain flushables of type ingestedFlushable.
  2005  	// The sstables in ingestedFlushable.files must be placed into the appropriate
  2006  	// level in the lsm. Let's say the flushable queue contains a prefix of
  2007  	// regular immutable memtables, then an ingestedFlushable, and then the
  2008  	// mutable memtable. When the flush of the ingestedFlushable is performed,
  2009  	// it needs an updated view of the lsm. That is, the prefix of immutable
  2010  	// memtables must have already been flushed. Similarly, if there are two
  2011  	// contiguous ingestedFlushables in the queue, then the first flushable must
  2012  	// be flushed, so that the second flushable can see an updated view of the
  2013  	// lsm.
  2014  	//
  2015  	// Given the above, we restrict flushes to either some prefix of regular
  2016  	// memtables, or a single flushable of type ingestedFlushable. The DB.flush
  2017  	// function will call DB.maybeScheduleFlush again, so a new flush to finish
  2018  	// the remaining flush work should be scheduled right away.
  2019  	//
  2020  	// NB: Large batches placed in the flushable queue share the WAL with the
  2021  	// previous memtable in the queue. We must ensure the property that both the
  2022  	// large batch and the memtable with which it shares a WAL are flushed
  2023  	// together. The property ensures that the minimum unflushed log number
  2024  	// isn't incremented incorrectly. Since a flushableBatch.readyForFlush always
  2025  	// returns true, and since the large batch will always be placed right after
  2026  	// the memtable with which it shares a WAL, the property is naturally
  2027  	// ensured. The large batch will always be placed after the memtable with
  2028  	// which it shares a WAL because we ensure it in DB.commitWrite by holding
  2029  	// the commitPipeline.mu and then holding DB.mu. As an extra defensive
  2030  	// measure, if we try to flush the memtable without also flushing the
  2031  	// flushable batch in the same flush, since the memtable and flushableBatch
  2032  	// have the same logNum, the logNum invariant check below will trigger.
  2033  	var n, inputs int
  2034  	var inputBytes uint64
  2035  	var ingest bool
  2036  	for ; n < len(d.mu.mem.queue)-1; n++ {
  2037  		if f, ok := d.mu.mem.queue[n].flushable.(*ingestedFlushable); ok {
  2038  			if n == 0 {
  2039  				// The first flushable is of type ingestedFlushable. Since these
  2040  				// must be flushed individually, we perform a flush for just
  2041  				// this.
  2042  				if !f.readyForFlush() {
  2043  					// This check is almost unnecessary, but we guard against it
  2044  					// just in case this invariant changes in the future.
  2045  					panic("pebble: ingestedFlushable should always be ready to flush.")
  2046  				}
  2047  				// By setting n = 1, we ensure that the first flushable (n == 0)
  2048  				// is scheduled for a flush. The number of tables added is equal to the
  2049  				// number of files in the ingest operation.
  2050  				n = 1
  2051  				inputs = len(f.files)
  2052  				ingest = true
  2053  				break
  2054  			} else {
  2055  				// There was some prefix of flushables which weren't of type
  2056  				// ingestedFlushable. So, perform a flush for those.
  2057  				break
  2058  			}
  2059  		}
  2060  		if !d.mu.mem.queue[n].readyForFlush() {
  2061  			break
  2062  		}
  2063  		inputBytes += d.mu.mem.queue[n].inuseBytes()
  2064  	}
  2065  	if n == 0 {
  2066  		// None of the immutable memtables are ready for flushing.
  2067  		return 0, nil
  2068  	}
  2069  	if !ingest {
  2070  		// Flushes of memtables add the prefix of n memtables from the flushable
  2071  		// queue.
  2072  		inputs = n
  2073  	}
  2074  
  2075  	// Require that every memtable being flushed has a log number less than the
  2076  	// new minimum unflushed log number.
  2077  	minUnflushedLogNum := d.mu.mem.queue[n].logNum
  2078  	if !d.opts.DisableWAL {
  2079  		for i := 0; i < n; i++ {
  2080  			if logNum := d.mu.mem.queue[i].logNum; logNum >= minUnflushedLogNum {
  2081  				panic(errors.AssertionFailedf("logNum invariant violated: flushing %d items; %d:type=%T,logNum=%d; %d:type=%T,logNum=%d",
  2082  					n,
  2083  					i, d.mu.mem.queue[i].flushable, logNum,
  2084  					n, d.mu.mem.queue[n].flushable, minUnflushedLogNum))
  2085  			}
  2086  		}
  2087  	}
  2088  
  2089  	c := newFlush(d.opts, d.mu.versions.currentVersion(),
  2090  		d.mu.versions.picker.getBaseLevel(), d.mu.mem.queue[:n], d.timeNow())
  2091  	d.addInProgressCompaction(c)
  2092  
  2093  	jobID := d.mu.nextJobID
  2094  	d.mu.nextJobID++
  2095  	d.opts.EventListener.FlushBegin(FlushInfo{
  2096  		JobID:      jobID,
  2097  		Input:      inputs,
  2098  		InputBytes: inputBytes,
  2099  		Ingest:     ingest,
  2100  	})
  2101  	startTime := d.timeNow()
  2102  
  2103  	var ve *manifest.VersionEdit
  2104  	var pendingOutputs []physicalMeta
  2105  	var stats compactStats
  2106  	// To determine the target level of the files in the ingestedFlushable, we
  2107  	// need to acquire the logLock, and not release it for that duration. Since
  2108  	// we need to acquire the logLock below to perform the logAndApply step
  2109  	// anyway, we create the VersionEdit for ingestedFlushable outside of
  2110  	// runCompaction. For all other flush cases, we construct the VersionEdit
  2111  	// inside runCompaction.
  2112  	if c.kind != compactionKindIngestedFlushable {
  2113  		ve, pendingOutputs, stats, err = d.runCompaction(jobID, c)
  2114  	}
  2115  
  2116  	// Acquire logLock. This will be released either on an error, by way of
  2117  	// logUnlock, or through a call to logAndApply if there is no error.
  2118  	d.mu.versions.logLock()
  2119  
  2120  	if c.kind == compactionKindIngestedFlushable {
  2121  		ve, err = d.runIngestFlush(c)
  2122  	}
  2123  
  2124  	info := FlushInfo{
  2125  		JobID:      jobID,
  2126  		Input:      inputs,
  2127  		InputBytes: inputBytes,
  2128  		Duration:   d.timeNow().Sub(startTime),
  2129  		Done:       true,
  2130  		Ingest:     ingest,
  2131  		Err:        err,
  2132  	}
  2133  	if err == nil {
  2134  		for i := range ve.NewFiles {
  2135  			e := &ve.NewFiles[i]
  2136  			info.Output = append(info.Output, e.Meta.TableInfo())
  2137  			// Ingested tables are not necessarily flushed to L0. Record the level of
  2138  			// each ingested file explicitly.
  2139  			if ingest {
  2140  				info.IngestLevels = append(info.IngestLevels, e.Level)
  2141  			}
  2142  		}
  2143  		if len(ve.NewFiles) == 0 {
  2144  			info.Err = errEmptyTable
  2145  		}
  2146  
  2147  		// The flush succeeded or it produced an empty sstable. In either case we
  2148  		// want to bump the minimum unflushed log number to the log number of the
  2149  		// oldest unflushed memtable.
  2150  		ve.MinUnflushedLogNum = minUnflushedLogNum
  2151  		if c.kind != compactionKindIngestedFlushable {
  2152  			metrics := c.metrics[0]
  2153  			if d.opts.DisableWAL {
  2154  				// If the WAL is disabled, every flushable has a zero [logSize],
  2155  				// resulting in zero bytes in. Instead, use the number of bytes we
  2156  				// flushed as the BytesIn. This ensures we get a reasonable w-amp
  2157  				// calculation even when the WAL is disabled.
  2158  				metrics.BytesIn = metrics.BytesFlushed
  2159  			} else {
  2160  				// Otherwise, attribute each flushed memtable's WAL bytes to BytesIn.
  2161  				for i := 0; i < n; i++ {
  2162  					metrics.BytesIn += d.mu.mem.queue[i].logSize
  2163  				}
  2164  			}
  2165  		} else if len(ve.DeletedFiles) > 0 {
  2166  			// c.kind == compactionKindIngestedFlushable && we have deleted files due
  2167  			// to ingest-time splits.
  2168  			//
  2169  			// Iterate through all other compactions, and check if their inputs have
  2170  			// been replaced due to an ingest-time split. In that case, cancel the
  2171  			// compaction.
  2172  			for c2 := range d.mu.compact.inProgress {
  2173  				for i := range c2.inputs {
  2174  					iter := c2.inputs[i].files.Iter()
  2175  					for f := iter.First(); f != nil; f = iter.Next() {
  2176  						if _, ok := ve.DeletedFiles[deletedFileEntry{FileNum: f.FileNum, Level: c2.inputs[i].level}]; ok {
  2177  							c2.cancel.Store(true)
  2178  							break
  2179  						}
  2180  					}
  2181  				}
  2182  			}
  2183  		}
  2184  		err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false, /* forceRotation */
  2185  			func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) })
  2186  		if err != nil {
  2187  			info.Err = err
  2188  			// TODO(peter): untested.
  2189  			for _, f := range pendingOutputs {
  2190  				// Note that the FileBacking for the file metadata might not have
  2191  				// been set yet. So, we directly use the FileNum. Since these
  2192  				// files were generated as compaction outputs, these must be
  2193  				// physical files on disk. This property might not hold once
  2194  				// https://github.com/cockroachdb/pebble/issues/389 is
  2195  				// implemented if #389 creates virtual sstables as output files.
  2196  				d.mu.versions.obsoleteTables = append(
  2197  					d.mu.versions.obsoleteTables,
  2198  					fileInfo{f.FileNum.DiskFileNum(), f.Size},
  2199  				)
  2200  			}
  2201  			d.mu.versions.updateObsoleteTableMetricsLocked()
  2202  		}
  2203  	} else {
  2204  		// We won't be performing the logAndApply step because of the error,
  2205  		// so logUnlock.
  2206  		d.mu.versions.logUnlock()
  2207  	}
  2208  
  2209  	bytesFlushed = c.bytesIterated
  2210  
  2211  	// If err != nil, then the flush will be retried, and we will recalculate
  2212  	// these metrics.
  2213  	if err == nil {
  2214  		d.mu.snapshots.cumulativePinnedCount += stats.cumulativePinnedKeys
  2215  		d.mu.snapshots.cumulativePinnedSize += stats.cumulativePinnedSize
  2216  		d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.countMissizedDels
  2217  		d.maybeUpdateDeleteCompactionHints(c)
  2218  	}
  2219  
  2220  	d.clearCompactingState(c, err != nil)
  2221  	delete(d.mu.compact.inProgress, c)
  2222  	d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics)
  2223  
  2224  	var flushed flushableList
  2225  	if err == nil {
  2226  		flushed = d.mu.mem.queue[:n]
  2227  		d.mu.mem.queue = d.mu.mem.queue[n:]
  2228  		d.updateReadStateLocked(d.opts.DebugCheck)
  2229  		d.updateTableStatsLocked(ve.NewFiles)
  2230  		if ingest {
  2231  			d.mu.versions.metrics.Flush.AsIngestCount++
  2232  			for _, l := range c.metrics {
  2233  				d.mu.versions.metrics.Flush.AsIngestBytes += l.BytesIngested
  2234  				d.mu.versions.metrics.Flush.AsIngestTableCount += l.TablesIngested
  2235  			}
  2236  		}
  2237  
  2238  		// Update if any eventually file-only snapshots have now transitioned to
  2239  		// being file-only.
  2240  		earliestUnflushedSeqNum := d.getEarliestUnflushedSeqNumLocked()
  2241  		currentVersion := d.mu.versions.currentVersion()
  2242  		for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; {
  2243  			if s.efos == nil {
  2244  				s = s.next
  2245  				continue
  2246  			}
  2247  			if base.Visible(earliestUnflushedSeqNum, s.efos.seqNum, InternalKeySeqNumMax) {
  2248  				s = s.next
  2249  				continue
  2250  			}
  2251  			if s.efos.excised.Load() {
  2252  				// If a concurrent excise has happened that overlaps with one of the key
  2253  				// ranges this snapshot is interested in, this EFOS cannot transition to
  2254  				// a file-only snapshot as keys in that range could now be deleted. Move
  2255  				// onto the next snapshot.
  2256  				s = s.next
  2257  				continue
  2258  			}
  2259  			currentVersion.Ref()
  2260  
  2261  			// NB: s.efos.transitionToFileOnlySnapshot could close s, in which
  2262  			// case s.next would be nil. Save it before calling it.
  2263  			next := s.next
  2264  			_ = s.efos.transitionToFileOnlySnapshot(currentVersion)
  2265  			s = next
  2266  		}
  2267  	}
  2268  	// Signal FlushEnd after installing the new readState. This helps for unit
  2269  	// tests that use the callback to trigger a read using an iterator with
  2270  	// IterOptions.OnlyReadGuaranteedDurable.
  2271  	info.TotalDuration = d.timeNow().Sub(startTime)
  2272  	d.opts.EventListener.FlushEnd(info)
  2273  
  2274  	// The order of these operations matters here for ease of testing.
  2275  	// Removing the reader reference first allows tests to be guaranteed that
  2276  	// the memtable reservation has been released by the time a synchronous
  2277  	// flush returns. readerUnrefLocked may also produce obsolete files so the
  2278  	// call to deleteObsoleteFiles must happen after it.
  2279  	for i := range flushed {
  2280  		flushed[i].readerUnrefLocked(true)
  2281  	}
  2282  
  2283  	d.deleteObsoleteFiles(jobID)
  2284  
  2285  	// Mark all the memtables we flushed as flushed.
  2286  	for i := range flushed {
  2287  		close(flushed[i].flushed)
  2288  	}
  2289  
  2290  	return bytesFlushed, err
  2291  }
  2292  
  2293  // maybeScheduleCompactionAsync should be used when
  2294  // we want to possibly schedule a compaction, but don't
  2295  // want to eat the cost of running maybeScheduleCompaction.
  2296  // This method should be launched in a separate goroutine.
  2297  // d.mu must not be held when this is called.
  2298  func (d *DB) maybeScheduleCompactionAsync() {
  2299  	defer d.compactionSchedulers.Done()
  2300  
  2301  	d.mu.Lock()
  2302  	d.maybeScheduleCompaction()
  2303  	d.mu.Unlock()
  2304  }
  2305  
  2306  // maybeScheduleCompaction schedules a compaction if necessary.
  2307  //
  2308  // d.mu must be held when calling this.
  2309  func (d *DB) maybeScheduleCompaction() {
  2310  	d.maybeScheduleCompactionPicker(pickAuto)
  2311  }
  2312  
  2313  func pickAuto(picker compactionPicker, env compactionEnv) *pickedCompaction {
  2314  	return picker.pickAuto(env)
  2315  }
  2316  
  2317  func pickElisionOnly(picker compactionPicker, env compactionEnv) *pickedCompaction {
  2318  	return picker.pickElisionOnlyCompaction(env)
  2319  }
  2320  
  2321  // maybeScheduleDownloadCompaction schedules a download compaction.
  2322  //
  2323  // Requires d.mu to be held.
  2324  func (d *DB) maybeScheduleDownloadCompaction(env compactionEnv, maxConcurrentCompactions int) {
  2325  	for len(d.mu.compact.downloads) > 0 && d.mu.compact.compactingCount < maxConcurrentCompactions {
  2326  		v := d.mu.versions.currentVersion()
  2327  		download := d.mu.compact.downloads[0]
  2328  		env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
  2329  		var externalFile *fileMetadata
  2330  		var err error
  2331  		var level int
  2332  		for i := range v.Levels {
  2333  			overlaps := v.Overlaps(i, d.cmp, download.start, download.end, true /* exclusiveEnd */)
  2334  			iter := overlaps.Iter()
  2335  			provider := d.objProvider
  2336  			for f := iter.First(); f != nil; f = iter.Next() {
  2337  				var objMeta objstorage.ObjectMetadata
  2338  				objMeta, err = provider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
  2339  				if err != nil {
  2340  					break
  2341  				}
  2342  				if objMeta.IsExternal() {
  2343  					if f.IsCompacting() {
  2344  						continue
  2345  					}
  2346  					externalFile = f
  2347  					level = i
  2348  					break
  2349  				}
  2350  			}
  2351  			if externalFile != nil || err != nil {
  2352  				break
  2353  			}
  2354  		}
  2355  		if err != nil {
  2356  			d.mu.compact.downloads = d.mu.compact.downloads[1:]
  2357  			download.doneChans[download.compactionsStarted] <- err
  2358  			continue
  2359  		}
  2360  		if externalFile == nil {
  2361  			// The entirety of this span is downloaded, or is being downloaded right
  2362  			// now. No need to schedule additional downloads for this span.
  2363  			d.mu.compact.downloads = d.mu.compact.downloads[1:]
  2364  			continue
  2365  		}
  2366  		pc := pickDownloadCompaction(v, d.opts, env, d.mu.versions.picker.getBaseLevel(), download, level, externalFile)
  2367  		if pc != nil {
  2368  			doneCh := download.doneChans[download.compactionsStarted]
  2369  			download.compactionsStarted++
  2370  			// Create another doneChan for the next compaction.
  2371  			download.doneChans = append(download.doneChans, make(chan error, 1))
  2372  
  2373  			c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider())
  2374  			d.mu.compact.compactingCount++
  2375  			d.addInProgressCompaction(c)
  2376  			go d.compact(c, doneCh)
  2377  		}
  2378  	}
  2379  }
  2380  
  2381  // maybeScheduleCompactionPicker schedules a compaction if necessary,
  2382  // calling `pickFunc` to pick automatic compactions.
  2383  //
  2384  // d.mu must be held when calling this.
  2385  func (d *DB) maybeScheduleCompactionPicker(
  2386  	pickFunc func(compactionPicker, compactionEnv) *pickedCompaction,
  2387  ) {
  2388  	if d.closed.Load() != nil || d.opts.ReadOnly {
  2389  		return
  2390  	}
  2391  	maxConcurrentCompactions := d.opts.MaxConcurrentCompactions()
  2392  	if d.mu.compact.compactingCount >= maxConcurrentCompactions {
  2393  		if len(d.mu.compact.manual) > 0 {
  2394  			// Inability to run head blocks later manual compactions.
  2395  			d.mu.compact.manual[0].retries++
  2396  		}
  2397  		return
  2398  	}
  2399  
  2400  	// Compaction picking needs a coherent view of a Version. In particular, we
  2401  	// need to exclude concurrent ingestions from making a decision on which level
  2402  	// to ingest into that conflicts with our compaction
  2403  	// decision. versionSet.logLock provides the necessary mutual exclusion.
  2404  	d.mu.versions.logLock()
  2405  	defer d.mu.versions.logUnlock()
  2406  
  2407  	// Check for the closed flag again, in case the DB was closed while we were
  2408  	// waiting for logLock().
  2409  	if d.closed.Load() != nil {
  2410  		return
  2411  	}
  2412  
  2413  	env := compactionEnv{
  2414  		diskAvailBytes:          d.diskAvailBytes.Load(),
  2415  		earliestSnapshotSeqNum:  d.mu.snapshots.earliest(),
  2416  		earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(),
  2417  	}
  2418  
  2419  	// Check for delete-only compactions first, because they're expected to be
  2420  	// cheap and reduce future compaction work.
  2421  	if !d.opts.private.disableDeleteOnlyCompactions &&
  2422  		len(d.mu.compact.deletionHints) > 0 &&
  2423  		!d.opts.DisableAutomaticCompactions {
  2424  		v := d.mu.versions.currentVersion()
  2425  		snapshots := d.mu.snapshots.toSlice()
  2426  		inputs, unresolvedHints := checkDeleteCompactionHints(d.cmp, v, d.mu.compact.deletionHints, snapshots)
  2427  		d.mu.compact.deletionHints = unresolvedHints
  2428  
  2429  		if len(inputs) > 0 {
  2430  			c := newDeleteOnlyCompaction(d.opts, v, inputs, d.timeNow())
  2431  			d.mu.compact.compactingCount++
  2432  			d.addInProgressCompaction(c)
  2433  			go d.compact(c, nil)
  2434  		}
  2435  	}
  2436  
  2437  	for len(d.mu.compact.manual) > 0 && d.mu.compact.compactingCount < maxConcurrentCompactions {
  2438  		v := d.mu.versions.currentVersion()
  2439  		manual := d.mu.compact.manual[0]
  2440  		env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
  2441  		pc, retryLater := pickManualCompaction(v, d.opts, env, d.mu.versions.picker.getBaseLevel(), manual)
  2442  		if pc != nil {
  2443  			c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider())
  2444  			d.mu.compact.manual = d.mu.compact.manual[1:]
  2445  			d.mu.compact.compactingCount++
  2446  			d.addInProgressCompaction(c)
  2447  			go d.compact(c, manual.done)
  2448  		} else if !retryLater {
  2449  			// Noop
  2450  			d.mu.compact.manual = d.mu.compact.manual[1:]
  2451  			manual.done <- nil
  2452  		} else {
  2453  			// Inability to run head blocks later manual compactions.
  2454  			manual.retries++
  2455  			break
  2456  		}
  2457  	}
  2458  
  2459  	for !d.opts.DisableAutomaticCompactions && d.mu.compact.compactingCount < maxConcurrentCompactions {
  2460  		env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
  2461  		env.readCompactionEnv = readCompactionEnv{
  2462  			readCompactions:          &d.mu.compact.readCompactions,
  2463  			flushing:                 d.mu.compact.flushing || d.passedFlushThreshold(),
  2464  			rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction,
  2465  		}
  2466  		pc := pickFunc(d.mu.versions.picker, env)
  2467  		if pc == nil {
  2468  			break
  2469  		}
  2470  		c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider())
  2471  		d.mu.compact.compactingCount++
  2472  		d.addInProgressCompaction(c)
  2473  		go d.compact(c, nil)
  2474  	}
  2475  
  2476  	d.maybeScheduleDownloadCompaction(env, maxConcurrentCompactions)
  2477  }
  2478  
  2479  // deleteCompactionHintType indicates whether the deleteCompactionHint was
  2480  // generated from a span containing a range del (point key only), a range key
  2481  // delete (range key only), or both a point and range key.
  2482  type deleteCompactionHintType uint8
  2483  
  2484  const (
  2485  	// NOTE: While these are primarily used as enumeration types, they are also
  2486  	// used for some bitwise operations. Care should be taken when updating.
  2487  	deleteCompactionHintTypeUnknown deleteCompactionHintType = iota
  2488  	deleteCompactionHintTypePointKeyOnly
  2489  	deleteCompactionHintTypeRangeKeyOnly
  2490  	deleteCompactionHintTypePointAndRangeKey
  2491  )
  2492  
  2493  // String implements fmt.Stringer.
  2494  func (h deleteCompactionHintType) String() string {
  2495  	switch h {
  2496  	case deleteCompactionHintTypeUnknown:
  2497  		return "unknown"
  2498  	case deleteCompactionHintTypePointKeyOnly:
  2499  		return "point-key-only"
  2500  	case deleteCompactionHintTypeRangeKeyOnly:
  2501  		return "range-key-only"
  2502  	case deleteCompactionHintTypePointAndRangeKey:
  2503  		return "point-and-range-key"
  2504  	default:
  2505  		panic(fmt.Sprintf("unknown hint type: %d", h))
  2506  	}
  2507  }
  2508  
  2509  // compactionHintFromKeys returns a deleteCompactionHintType given a slice of
  2510  // keyspan.Keys.
  2511  func compactionHintFromKeys(keys []keyspan.Key) deleteCompactionHintType {
  2512  	var hintType deleteCompactionHintType
  2513  	for _, k := range keys {
  2514  		switch k.Kind() {
  2515  		case base.InternalKeyKindRangeDelete:
  2516  			hintType |= deleteCompactionHintTypePointKeyOnly
  2517  		case base.InternalKeyKindRangeKeyDelete:
  2518  			hintType |= deleteCompactionHintTypeRangeKeyOnly
  2519  		default:
  2520  			panic(fmt.Sprintf("unsupported key kind: %s", k.Kind()))
  2521  		}
  2522  	}
  2523  	return hintType
  2524  }
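        
        // The hint type constants above are chosen so that combining them with a
        // bitwise OR yields the combined type: pointKeyOnly (1) | rangeKeyOnly (2)
        // == pointAndRangeKey (3). As an illustrative sketch (not part of the
        // original source; the seqnums below are hypothetical), a span carrying
        // both a RANGEDEL and a RANGEKEYDEL key maps to the combined hint type:
        //
        //	keys := []keyspan.Key{
        //		{Trailer: base.MakeTrailer(12, base.InternalKeyKindRangeDelete)},
        //		{Trailer: base.MakeTrailer(11, base.InternalKeyKindRangeKeyDelete)},
        //	}
        //	hintType := compactionHintFromKeys(keys)
        //	// hintType == deleteCompactionHintTypePointAndRangeKey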
  2525  
  2526  // A deleteCompactionHint records a user key and sequence number span that has been
  2527  // deleted by a range tombstone. A hint is recorded if at least one sstable
  2528  // falls completely within both the user key and sequence number spans.
  2529  // Once the tombstones and the observed completely-contained sstables fall
  2530  // into the same snapshot stripe, a delete-only compaction may delete any
  2531  // sstables within the range.
  2532  type deleteCompactionHint struct {
  2533  	// The type of key span that generated this hint (point key, range key, or
  2534  	// both).
  2535  	hintType deleteCompactionHintType
  2536  	// start and end are user keys specifying a key range [start, end) of
  2537  	// deleted keys.
  2538  	start []byte
  2539  	end   []byte
  2540  	// The level of the file containing the range tombstone(s) when the hint
  2541  	// was created. Only lower levels need to be searched for files that may
  2542  	// be deleted.
  2543  	tombstoneLevel int
  2544  	// The file containing the range tombstone(s) that created the hint.
  2545  	tombstoneFile *fileMetadata
  2546  	// The smallest and largest sequence numbers of the abutting tombstones
  2547  	// merged to form this hint. All of a table's keys must be less than the
  2548  	// tombstone smallest sequence number to be deleted. All of a table's
  2549  	// sequence numbers must fall into the same snapshot stripe as the
  2550  	// tombstone largest sequence number to be deleted.
  2551  	tombstoneLargestSeqNum  uint64
  2552  	tombstoneSmallestSeqNum uint64
  2553  	// The smallest sequence number of an sstable that was found to be covered
  2554  	// by this hint. The hint cannot be resolved until this sequence number is
  2555  	// in the same snapshot stripe as the largest tombstone sequence number.
  2556  	// This is recorded when the hint is created, so by the time the hint is
  2557  	// resolved the LSM may look different and may no longer contain the
  2558  	// sstable that contained the key at this sequence number.
  2559  	fileSmallestSeqNum uint64
  2560  }
  2561  
  2562  func (h deleteCompactionHint) String() string {
  2563  	return fmt.Sprintf(
  2564  		"L%d.%s %s-%s seqnums(tombstone=%d-%d, file-smallest=%d, type=%s)",
  2565  		h.tombstoneLevel, h.tombstoneFile.FileNum, h.start, h.end,
  2566  		h.tombstoneSmallestSeqNum, h.tombstoneLargestSeqNum, h.fileSmallestSeqNum,
  2567  		h.hintType,
  2568  	)
  2569  }
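        
        // For example, a hint over the key range [bar, foo) whose tombstones live
        // in L2's table 000123 would render as (hypothetical values):
        //
        //	L2.000123 bar-foo seqnums(tombstone=210-230, file-smallest=90, type=point-key-only)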
  2570  
  2571  func (h *deleteCompactionHint) canDelete(cmp Compare, m *fileMetadata, snapshots []uint64) bool {
  2572  	// The file can only be deleted if all of its keys are older than the
  2573  	// earliest tombstone aggregated into the hint.
  2574  	if m.LargestSeqNum >= h.tombstoneSmallestSeqNum || m.SmallestSeqNum < h.fileSmallestSeqNum {
  2575  		return false
  2576  	}
  2577  
  2578  	// The file's oldest key must be in the same snapshot stripe as the
  2579  	// newest tombstone. NB: We already checked the hint's sequence numbers,
  2580  	// but this file's oldest sequence number might be lower than the hint's
  2581  	// smallest sequence number despite the file falling within the key range
  2582  	// if this file was constructed after the hint by a compaction.
  2583  	ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots)
  2584  	fi, _ := snapshotIndex(m.SmallestSeqNum, snapshots)
  2585  	if ti != fi {
  2586  		return false
  2587  	}
  2588  
  2589  	switch h.hintType {
  2590  	case deleteCompactionHintTypePointKeyOnly:
  2591  		// A hint generated by a range del span cannot delete tables that contain
  2592  		// range keys.
  2593  		if m.HasRangeKeys {
  2594  			return false
  2595  		}
  2596  	case deleteCompactionHintTypeRangeKeyOnly:
  2597  		// A hint generated by a range key del span cannot delete tables that
  2598  		// contain point keys.
  2599  		if m.HasPointKeys {
  2600  			return false
  2601  		}
  2602  	case deleteCompactionHintTypePointAndRangeKey:
  2603  		// A hint generated by a span containing both range dels *and* range keys
  2604  		// can delete a table with either kind of key, provided the table's bounds
  2605  		// fall within the hint. The key-range check below takes care of this.
  2606  	default:
  2607  		panic(fmt.Sprintf("pebble: unknown delete compaction hint type: %d", h.hintType))
  2608  	}
  2609  
  2610  	// The file's keys must be completely contained within the hint range.
  2611  	return cmp(h.start, m.Smallest.UserKey) <= 0 && cmp(m.Largest.UserKey, h.end) < 0
  2612  }
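        
        // A worked sketch of the snapshot-stripe check above, mirroring the diagram
        // in checkDeleteCompactionHints below (the numbers are hypothetical): with
        // open snapshots at #100, #180 and #210, a hint whose newest tombstone is at
        // seqnum 230 and a file whose oldest key is at seqnum 90 land in different
        // stripes, so the file cannot be deleted yet:
        //
        //	snapshots := []uint64{100, 180, 210}
        //	ti, _ := snapshotIndex(230, snapshots) // ti == 3: above every snapshot
        //	fi, _ := snapshotIndex(90, snapshots)  // fi == 0: below snapshot #100
        //	// ti != fi, so canDelete returns false until snapshots #100, #180 and
        //	// #210 are closed, at which point both seqnums map to the same stripe.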
  2613  
  2614  func (d *DB) maybeUpdateDeleteCompactionHints(c *compaction) {
  2615  	// Compactions that zero sequence numbers can interfere with compaction
  2616  	// deletion hints. Deletion hints apply to tables containing keys older
  2617  	// than a threshold. If a key more recent than the threshold is zeroed in
  2618  	// a compaction, a delete-only compaction may mistake it as meeting the
  2619  	// threshold and drop a table containing live data.
  2620  	//
  2621  	// To avoid this scenario, compactions that zero sequence numbers remove
  2622  	// any conflicting deletion hints. A deletion hint is conflicting if both
  2623  	// of the following conditions apply:
  2624  	// * its key space overlaps with the compaction
  2625  	// * at least one of its inputs contains a key as recent as one of the
  2626  	//   hint's tombstones.
  2627  	//
  2628  	if !c.allowedZeroSeqNum {
  2629  		return
  2630  	}
  2631  
  2632  	updatedHints := d.mu.compact.deletionHints[:0]
  2633  	for _, h := range d.mu.compact.deletionHints {
  2634  		// If the compaction's key space is disjoint from the hint's key
  2635  		// space, the zeroing of sequence numbers won't affect the hint. Keep
  2636  		// the hint.
  2637  		keysDisjoint := d.cmp(h.end, c.smallest.UserKey) < 0 || d.cmp(h.start, c.largest.UserKey) > 0
  2638  		if keysDisjoint {
  2639  			updatedHints = append(updatedHints, h)
  2640  			continue
  2641  		}
  2642  
  2643  		// All of the compaction's inputs must be older than the hint's
  2644  		// tombstones.
  2645  		inputsOlder := true
  2646  		for _, in := range c.inputs {
  2647  			iter := in.files.Iter()
  2648  			for f := iter.First(); f != nil; f = iter.Next() {
  2649  				inputsOlder = inputsOlder && f.LargestSeqNum < h.tombstoneSmallestSeqNum
  2650  			}
  2651  		}
  2652  		if inputsOlder {
  2653  			updatedHints = append(updatedHints, h)
  2654  			continue
  2655  		}
  2656  
  2657  		// Drop h, because the compaction c may have zeroed sequence numbers
  2658  		// of keys more recent than some of h's tombstones.
  2659  	}
  2660  	d.mu.compact.deletionHints = updatedHints
  2661  }
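        
        // A worked example of the conflict rule above (hypothetical values): a hint
        // over [c, m) with tombstoneSmallestSeqNum=200 conflicts with a
        // sequence-number-zeroing compaction over [a, z) if any compaction input
        // file has LargestSeqNum >= 200, because that input may contain a key at
        // least as recent as the hint's oldest tombstone whose seqnum is about to
        // be zeroed; the hint is dropped. If every input file's LargestSeqNum were
        // below 200, or the key spans did not overlap, the hint would be kept.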
  2662  
  2663  func checkDeleteCompactionHints(
  2664  	cmp Compare, v *version, hints []deleteCompactionHint, snapshots []uint64,
  2665  ) ([]compactionLevel, []deleteCompactionHint) {
  2666  	var files map[*fileMetadata]bool
  2667  	var byLevel [numLevels][]*fileMetadata
  2668  
  2669  	unresolvedHints := hints[:0]
  2670  	for _, h := range hints {
  2671  		// Check each compaction hint to see if it's resolvable. Resolvable
  2672  		// hints are removed and trigger a delete-only compaction if any files
  2673  		// in the current LSM still meet their criteria. Unresolvable hints
  2674  		// are saved and don't trigger a delete-only compaction.
  2675  		//
  2676  		// When a compaction hint is created, the sequence numbers of the
  2677  		// range tombstones and the covered file with the oldest key are
  2678  		// recorded. The largest tombstone sequence number and the smallest
  2679  		// file sequence number must be in the same snapshot stripe for the
  2680  		// hint to be resolved. The below graphic models a compaction hint
  2681  		// covering the keyspace [b, r). The hint completely contains two
  2682  		// files, 000002 and 000003. The file 000003 contains the lowest
  2683  		// covered sequence number at #90. The tombstone b.RANGEDEL.230:h has
  2684  		// the highest tombstone sequence number incorporated into the hint.
  2685  		// The hint may be resolved only once the snapshots at #100, #180 and
  2686  		// #210 are all closed. File 000001 is not included within the hint
  2687  		// because it extends beyond the range tombstones in user key space.
  2688  		//
  2689  		// 250
  2690  		//
  2691  		//       |-b...230:h-|
  2692  		// _____________________________________________________ snapshot #210
  2693  		// 200               |--h.RANGEDEL.200:r--|
  2694  		//
  2695  		// _____________________________________________________ snapshot #180
  2696  		//
  2697  		// 150                     +--------+
  2698  		//           +---------+   | 000003 |
  2699  		//           | 000002  |   |        |
  2700  		//           +_________+   |        |
  2701  		// 100_____________________|________|___________________ snapshot #100
  2702  		//                         +--------+
  2703  		// _____________________________________________________ snapshot #70
  2704  		//                             +---------------+
  2705  		//  50                         | 000001        |
  2706  		//                             |               |
  2707  		//                             +---------------+
  2708  		// ______________________________________________________________
  2709  		//     a b c d e f g h i j k l m n o p q r s t u v w x y z
  2710  
  2711  		ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots)
  2712  		fi, _ := snapshotIndex(h.fileSmallestSeqNum, snapshots)
  2713  		if ti != fi {
  2714  			// Cannot resolve yet.
  2715  			unresolvedHints = append(unresolvedHints, h)
  2716  			continue
  2717  		}
  2718  
  2719  		// The hint h will be resolved and dropped, regardless of whether
  2720  		// there are any tables that can be deleted.
  2721  		for l := h.tombstoneLevel + 1; l < numLevels; l++ {
  2722  			overlaps := v.Overlaps(l, cmp, h.start, h.end, true /* exclusiveEnd */)
  2723  			iter := overlaps.Iter()
  2724  			for m := iter.First(); m != nil; m = iter.Next() {
  2725  				if m.IsCompacting() || !h.canDelete(cmp, m, snapshots) || files[m] {
  2726  					continue
  2727  				}
  2728  				if files == nil {
  2729  					// Construct files lazily, assuming most calls will not
  2730  					// produce delete-only compactions.
  2731  					files = make(map[*fileMetadata]bool)
  2732  				}
  2733  				files[m] = true
  2734  				byLevel[l] = append(byLevel[l], m)
  2735  			}
  2736  		}
  2737  	}
  2738  
  2739  	var compactLevels []compactionLevel
  2740  	for l, files := range byLevel {
  2741  		if len(files) == 0 {
  2742  			continue
  2743  		}
  2744  		compactLevels = append(compactLevels, compactionLevel{
  2745  			level: l,
  2746  			files: manifest.NewLevelSliceKeySorted(cmp, files),
  2747  		})
  2748  	}
  2749  	return compactLevels, unresolvedHints
  2750  }
  2751  
  2752  // compact runs one compaction and maybe schedules another call to compact.
  2753  func (d *DB) compact(c *compaction, errChannel chan error) {
  2754  	pprof.Do(context.Background(), compactLabels, func(context.Context) {
  2755  		d.mu.Lock()
  2756  		defer d.mu.Unlock()
  2757  		if err := d.compact1(c, errChannel); err != nil {
  2758  			// TODO(peter): count consecutive compaction errors and backoff.
  2759  			d.opts.EventListener.BackgroundError(err)
  2760  		}
  2761  		d.mu.compact.compactingCount--
  2762  		delete(d.mu.compact.inProgress, c)
  2763  		// Add this compaction's duration to the cumulative duration. NB: This
  2764  		// must be atomic with the above removal of c from
  2765  		// d.mu.compact.inProgress to ensure Metrics.Compact.Duration does not
  2766  		// miss or double count a completing compaction's duration.
  2767  		d.mu.compact.duration += d.timeNow().Sub(c.beganAt)
  2768  
  2769  		// The previous compaction may have produced too many files in a
  2770  		// level, so reschedule another compaction if needed.
  2771  		d.maybeScheduleCompaction()
  2772  		d.mu.compact.cond.Broadcast()
  2773  	})
  2774  }
  2775  
  2776  // compact1 runs one compaction.
  2777  //
  2778  // d.mu must be held when calling this, but the mutex may be dropped and
  2779  // re-acquired during the course of this method.
  2780  func (d *DB) compact1(c *compaction, errChannel chan error) (err error) {
  2781  	if errChannel != nil {
  2782  		defer func() {
  2783  			errChannel <- err
  2784  		}()
  2785  	}
  2786  
  2787  	jobID := d.mu.nextJobID
  2788  	d.mu.nextJobID++
  2789  	info := c.makeInfo(jobID)
  2790  	d.opts.EventListener.CompactionBegin(info)
  2791  	startTime := d.timeNow()
  2792  
  2793  	ve, pendingOutputs, stats, err := d.runCompaction(jobID, c)
  2794  
  2795  	info.Duration = d.timeNow().Sub(startTime)
  2796  	if err == nil {
  2797  		err = func() error {
  2798  			var err error
  2799  			d.mu.versions.logLock()
  2800  			// Check if this compaction had a conflicting operation (e.g. a d.excise())
  2801  			// that necessitates it restarting from scratch. Note that since we hold
  2802  			// the manifest lock, we don't expect this bool to change its value
  2803  			// as only the holder of the manifest lock will ever write to it.
  2804  			if c.cancel.Load() {
  2805  				err = firstError(err, ErrCancelledCompaction)
  2806  			}
  2807  			if err != nil {
  2808  				// logAndApply calls logUnlock. If we didn't call it, we need to call
  2809  				// logUnlock ourselves.
  2810  				d.mu.versions.logUnlock()
  2811  				return err
  2812  			}
  2813  			return d.mu.versions.logAndApply(jobID, ve, c.metrics, false /* forceRotation */, func() []compactionInfo {
  2814  				return d.getInProgressCompactionInfoLocked(c)
  2815  			})
  2816  		}()
  2817  		if err != nil {
  2818  			// TODO(peter): untested.
  2819  			for _, f := range pendingOutputs {
  2820  				// Note that the FileBacking for the file metadata might not have
  2821  				// been set yet. So, we directly use the FileNum. Since these
  2822  				// files were generated as compaction outputs, these must be
  2823  				// physical files on disk. This property might not hold once
  2824  				// https://github.com/cockroachdb/pebble/issues/389 is
  2825  				// implemented if #389 creates virtual sstables as output files.
  2826  				d.mu.versions.obsoleteTables = append(
  2827  					d.mu.versions.obsoleteTables,
  2828  					fileInfo{f.FileNum.DiskFileNum(), f.Size},
  2829  				)
  2830  			}
  2831  			d.mu.versions.updateObsoleteTableMetricsLocked()
  2832  		}
  2833  	}
  2834  
  2835  	info.Done = true
  2836  	info.Err = err
  2837  	if err == nil {
  2838  		for i := range ve.NewFiles {
  2839  			e := &ve.NewFiles[i]
  2840  			info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo())
  2841  		}
  2842  		d.mu.snapshots.cumulativePinnedCount += stats.cumulativePinnedKeys
  2843  		d.mu.snapshots.cumulativePinnedSize += stats.cumulativePinnedSize
  2844  		d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.countMissizedDels
  2845  		d.maybeUpdateDeleteCompactionHints(c)
  2846  	}
  2847  
  2848  	// NB: clearing compacting state must occur before updating the read state;
  2849  	// L0Sublevels initialization depends on it.
  2850  	d.clearCompactingState(c, err != nil)
  2851  	d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics)
  2852  	d.mu.versions.incrementCompactionBytes(-c.bytesWritten)
  2853  
  2854  	info.TotalDuration = d.timeNow().Sub(c.beganAt)
  2855  	d.opts.EventListener.CompactionEnd(info)
  2856  
  2857  	// Update the read state before deleting obsolete files because the
  2858  	// read-state update will cause the previous version to be unref'd and if
  2859  	// there are no references obsolete tables will be added to the obsolete
  2860  	// table list.
  2861  	if err == nil {
  2862  		d.updateReadStateLocked(d.opts.DebugCheck)
  2863  		d.updateTableStatsLocked(ve.NewFiles)
  2864  	}
  2865  	d.deleteObsoleteFiles(jobID)
  2866  
  2867  	return err
  2868  }
  2869  
  2870  type compactStats struct {
  2871  	cumulativePinnedKeys uint64
  2872  	cumulativePinnedSize uint64
  2873  	countMissizedDels    uint64
  2874  }
  2875  
  2876  // runCopyCompaction runs a copy compaction where a new FileNum is created that
  2877  // is a byte-for-byte copy of the input file. This is used in lieu of a move
  2878  // compaction when a file is being moved across the local/remote storage
  2879  // boundary.
  2880  //
  2881  // d.mu must be held when calling this method.
  2882  func (d *DB) runCopyCompaction(
  2883  	jobID int,
  2884  	c *compaction,
  2885  	meta *fileMetadata,
  2886  	objMeta objstorage.ObjectMetadata,
  2887  	versionEdit *versionEdit,
  2888  ) (ve *versionEdit, pendingOutputs []physicalMeta, retErr error) {
  2889  	ve = versionEdit
  2890  	if objMeta.IsRemote() || !remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level) {
  2891  		panic("pebble: scheduled a copy compaction that is not actually moving files to shared storage")
  2892  	}
  2893  	// Note that based on logic in the compaction picker, we're guaranteed
  2894  	// meta.Virtual is false.
  2895  	if meta.Virtual {
  2896  		panic(errors.AssertionFailedf("cannot do a copy compaction of a virtual sstable across local/remote storage"))
  2897  	}
  2898  	// We are in the relatively more complex case where we need to copy this
  2899  	// file to remote/shared storage. Drop the db mutex while we do the
  2900  	// copy.
  2901  	//
  2902  	// To ease up cleanup of the local file and tracking of refs, we create
  2903  	// a new FileNum. This has the potential of making the block cache less
  2904  	// effective, however.
  2905  	metaCopy := new(fileMetadata)
  2906  	*metaCopy = fileMetadata{
  2907  		Size:           meta.Size,
  2908  		CreationTime:   meta.CreationTime,
  2909  		SmallestSeqNum: meta.SmallestSeqNum,
  2910  		LargestSeqNum:  meta.LargestSeqNum,
  2911  		Stats:          meta.Stats,
  2912  		Virtual:        meta.Virtual,
  2913  	}
  2914  	if meta.HasPointKeys {
  2915  		metaCopy.ExtendPointKeyBounds(c.cmp, meta.SmallestPointKey, meta.LargestPointKey)
  2916  	}
  2917  	if meta.HasRangeKeys {
  2918  		metaCopy.ExtendRangeKeyBounds(c.cmp, meta.SmallestRangeKey, meta.LargestRangeKey)
  2919  	}
  2920  	metaCopy.FileNum = d.mu.versions.getNextFileNum()
  2921  	metaCopy.InitPhysicalBacking()
  2922  	c.metrics = map[int]*LevelMetrics{
  2923  		c.outputLevel.level: {
  2924  			BytesIn:         meta.Size,
  2925  			BytesCompacted:  meta.Size,
  2926  			TablesCompacted: 1,
  2927  		},
  2928  	}
  2929  	pendingOutputs = append(pendingOutputs, metaCopy.PhysicalMeta())
  2930  	// Before dropping the db mutex, grab a ref to the current version. This
  2931  	// prevents any concurrent excises from deleting files that this compaction
  2932  	// needs to read/maintain a reference to.
  2933  	vers := d.mu.versions.currentVersion()
  2934  	vers.Ref()
  2935  	defer vers.UnrefLocked()
  2936  
  2937  	d.mu.Unlock()
  2938  	defer d.mu.Lock()
  2939  	_, err := d.objProvider.LinkOrCopyFromLocal(context.TODO(), d.opts.FS,
  2940  		d.objProvider.Path(objMeta), fileTypeTable, metaCopy.FileBacking.DiskFileNum,
  2941  		objstorage.CreateOptions{PreferSharedStorage: true})
  2942  	if err != nil {
  2943  		return ve, pendingOutputs, err
  2944  	}
  2945  	ve.NewFiles[0].Meta = metaCopy
  2946  
  2947  	if err := d.objProvider.Sync(); err != nil {
  2948  		return nil, pendingOutputs, err
  2949  	}
  2950  	return ve, pendingOutputs, nil
  2951  }
  2952  
  2953  // runCompaction runs a compaction that produces new on-disk tables from
  2954  // memtables or old on-disk tables.
  2955  //
  2956  // d.mu must be held when calling this, but the mutex may be dropped and
  2957  // re-acquired during the course of this method.
  2958  func (d *DB) runCompaction(
  2959  	jobID int, c *compaction,
  2960  ) (ve *versionEdit, pendingOutputs []physicalMeta, stats compactStats, retErr error) {
  2961  	// As a sanity check, confirm that the smallest / largest keys for new and
  2962  	// deleted files in the new versionEdit pass a validation function before
  2963  	// returning the edit.
  2964  	defer func() {
  2965  		// If we're handling a panic, don't expect the version edit to validate.
  2966  		if r := recover(); r != nil {
  2967  			panic(r)
  2968  		} else if ve != nil {
  2969  			err := validateVersionEdit(ve, d.opts.Experimental.KeyValidationFunc, d.opts.Comparer.FormatKey)
  2970  			if err != nil {
  2971  				d.opts.Logger.Fatalf("pebble: version edit validation failed: %s", err)
  2972  			}
  2973  		}
  2974  	}()
  2975  
  2976  	// Check for a delete-only compaction. This can occur when wide range
  2977  	// tombstones completely contain sstables.
  2978  	if c.kind == compactionKindDeleteOnly {
  2979  		c.metrics = make(map[int]*LevelMetrics, len(c.inputs))
  2980  		ve := &versionEdit{
  2981  			DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  2982  		}
  2983  		for _, cl := range c.inputs {
  2984  			levelMetrics := &LevelMetrics{}
  2985  			iter := cl.files.Iter()
  2986  			for f := iter.First(); f != nil; f = iter.Next() {
  2987  				ve.DeletedFiles[deletedFileEntry{
  2988  					Level:   cl.level,
  2989  					FileNum: f.FileNum,
  2990  				}] = f
  2991  			}
  2992  			c.metrics[cl.level] = levelMetrics
  2993  		}
  2994  		return ve, nil, stats, nil
  2995  	}
  2996  
  2997  	if c.kind == compactionKindIngestedFlushable {
  2998  		panic("pebble: runCompaction cannot handle compactionKindIngestedFlushable.")
  2999  	}
  3000  
  3001  	// Check for a move or copy of one table from one level to the next. We avoid
  3002  	// such a move if there is lots of overlapping grandparent data. Otherwise,
  3003  	// the move could create a parent file that will require a very expensive
  3004  	// merge later on.
  3005  	if c.kind == compactionKindMove || c.kind == compactionKindCopy {
  3006  		iter := c.startLevel.files.Iter()
  3007  		meta := iter.First()
  3008  		if invariants.Enabled {
  3009  			if iter.Next() != nil {
  3010  				panic("got more than one file for a move or copy compaction")
  3011  			}
  3012  		}
  3013  		if c.cancel.Load() {
  3014  			return ve, nil, stats, ErrCancelledCompaction
  3015  		}
  3016  		objMeta, err := d.objProvider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum)
  3017  		if err != nil {
  3018  			return ve, pendingOutputs, stats, err
  3019  		}
  3020  		c.metrics = map[int]*LevelMetrics{
  3021  			c.outputLevel.level: {
  3022  				BytesMoved:  meta.Size,
  3023  				TablesMoved: 1,
  3024  			},
  3025  		}
  3026  		ve := &versionEdit{
  3027  			DeletedFiles: map[deletedFileEntry]*fileMetadata{
  3028  				{Level: c.startLevel.level, FileNum: meta.FileNum}: meta,
  3029  			},
  3030  			NewFiles: []newFileEntry{
  3031  				{Level: c.outputLevel.level, Meta: meta},
  3032  			},
  3033  		}
  3034  		if c.kind == compactionKindCopy {
  3035  			ve, pendingOutputs, retErr = d.runCopyCompaction(jobID, c, meta, objMeta, ve)
  3036  			if retErr != nil {
  3037  				return ve, pendingOutputs, stats, retErr
  3038  			}
  3039  		}
  3040  		return ve, nil, stats, nil
  3041  	}
  3042  
  3043  	defer func() {
  3044  		if retErr != nil {
  3045  			pendingOutputs = nil
  3046  		}
  3047  	}()
  3048  
  3049  	snapshots := d.mu.snapshots.toSlice()
  3050  	formatVers := d.FormatMajorVersion()
  3051  
  3052  	if c.flushing == nil {
  3053  		// Before dropping the db mutex, grab a ref to the current version. This
  3054  		// prevents any concurrent excises from deleting files that this compaction
  3055  		// needs to read/maintain a reference to.
  3056  		//
  3057  		// Note that unlike user iterators, compactionIter does not maintain a ref
  3058  		// of the version or read state.
  3059  		vers := d.mu.versions.currentVersion()
  3060  		vers.Ref()
  3061  		defer vers.UnrefLocked()
  3062  	}
  3063  
  3064  	if c.cancel.Load() {
  3065  		return ve, nil, stats, ErrCancelledCompaction
  3066  	}
  3067  
  3068  	// Release the d.mu lock while doing I/O.
  3069  	// Note the unusual order: Unlock and then Lock.
  3070  	d.mu.Unlock()
  3071  	defer d.mu.Lock()
  3072  
  3073  	// Compactions use a pool of buffers to read blocks, avoiding polluting the
  3074  	// block cache with blocks that will not be read again. We initialize the
  3075  	// buffer pool with a size of 12. This initial size does not need to be
  3076  	// accurate, because the pool will grow to accommodate the maximum number of
  3077  	// blocks allocated at a given time over the course of the compaction. But
  3078  	// choosing a size larger than that working set avoids any additional
  3079  	// allocations to grow the size of the pool over the course of iteration.
  3080  	//
  3081  	// Justification for initial size 12: In a two-level compaction, at any
  3082  	// given moment we'll have 2 index blocks in-use and 2 data blocks in-use.
  3083  	// Additionally, when decoding a compressed block, we'll temporarily
  3084  	// allocate 1 additional block to hold the compressed buffer. In the worst
  3085  	// case that all input sstables have two-level index blocks (+2), value
  3086  	// blocks (+2), range deletion blocks (+n) and range key blocks (+n), we'll
  3087  	// additionally require 2n+4 blocks where n is the number of input sstables.
  3088  	// Range deletion and range key blocks are relatively rare, and the cost of
  3089  	// an additional allocation or two over the course of the compaction is
  3090  	// considered to be okay. A larger initial size would cause the pool to hold
  3091  	// on to more memory, even when it's not in-use because the pool will
  3092  	// recycle buffers up to the current capacity of the pool. The memory use of
  3093  	// a 12-buffer pool is expected to be within reason, even if all the buffers
  3094  	// grow to the typical size of an index block (256 KiB) which would
  3095  	// translate to 3 MiB per compaction.
  3096  	c.bufferPool.Init(12)
  3097  	defer c.bufferPool.Release()
  3098  
  3099  	iiter, err := c.newInputIter(d.newIters, d.tableNewRangeKeyIter, snapshots)
  3100  	if err != nil {
  3101  		return nil, pendingOutputs, stats, err
  3102  	}
  3103  	c.allowedZeroSeqNum = c.allowZeroSeqNum()
  3104  	iiter = invalidating.MaybeWrapIfInvariants(iiter)
  3105  	iter := newCompactionIter(c.cmp, c.equal, c.formatKey, d.merge, iiter, snapshots,
  3106  		&c.rangeDelFrag, &c.rangeKeyFrag, c.allowedZeroSeqNum, c.elideTombstone,
  3107  		c.elideRangeTombstone, d.opts.Experimental.IneffectualSingleDeleteCallback,
  3108  		d.opts.Experimental.SingleDeleteInvariantViolationCallback,
  3109  		d.FormatMajorVersion())
  3110  
  3111  	var (
  3112  		createdFiles    []base.DiskFileNum
  3113  		tw              *sstable.Writer
  3114  		pinnedKeySize   uint64
  3115  		pinnedValueSize uint64
  3116  		pinnedCount     uint64
  3117  	)
  3118  	defer func() {
  3119  		if iter != nil {
  3120  			retErr = firstError(retErr, iter.Close())
  3121  		}
  3122  		if tw != nil {
  3123  			retErr = firstError(retErr, tw.Close())
  3124  		}
  3125  		if retErr != nil {
  3126  			for _, fileNum := range createdFiles {
  3127  				_ = d.objProvider.Remove(fileTypeTable, fileNum)
  3128  			}
  3129  		}
  3130  		for _, closer := range c.closers {
  3131  			retErr = firstError(retErr, closer.Close())
  3132  		}
  3133  	}()
  3134  
  3135  	ve = &versionEdit{
  3136  		DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  3137  	}
  3138  
  3139  	startLevelBytes := c.startLevel.files.SizeSum()
  3140  	outputMetrics := &LevelMetrics{
  3141  		BytesIn:   startLevelBytes,
  3142  		BytesRead: c.outputLevel.files.SizeSum(),
  3143  	}
  3144  	if len(c.extraLevels) > 0 {
  3145  		outputMetrics.BytesIn += c.extraLevels[0].files.SizeSum()
  3146  	}
  3147  	outputMetrics.BytesRead += outputMetrics.BytesIn
  3148  
  3149  	c.metrics = map[int]*LevelMetrics{
  3150  		c.outputLevel.level: outputMetrics,
  3151  	}
  3152  	if len(c.flushing) == 0 && c.metrics[c.startLevel.level] == nil {
  3153  		c.metrics[c.startLevel.level] = &LevelMetrics{}
  3154  	}
  3155  	if len(c.extraLevels) > 0 {
  3156  		c.metrics[c.extraLevels[0].level] = &LevelMetrics{}
  3157  		outputMetrics.MultiLevel.BytesInTop = startLevelBytes
  3158  		outputMetrics.MultiLevel.BytesIn = outputMetrics.BytesIn
  3159  		outputMetrics.MultiLevel.BytesRead = outputMetrics.BytesRead
  3160  	}
  3161  
  3162  	// The table is typically written at the maximum allowable format implied by
  3163  	// the current format major version of the DB.
  3164  	tableFormat := formatVers.MaxTableFormat()
  3165  
  3166  	// In format major versions with maximum table formats of Pebblev3, value
  3167  	// blocks were conditional on an experimental setting. In format major
  3168  	// versions with maximum table formats of Pebblev4 and higher, value blocks
  3169  	// are always enabled.
  3170  	if tableFormat == sstable.TableFormatPebblev3 &&
  3171  		(d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks()) {
  3172  		tableFormat = sstable.TableFormatPebblev2
  3173  	}
  3174  
  3175  	writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat)
  3176  	if formatVers < FormatBlockPropertyCollector {
  3177  		// Cannot yet write block properties.
  3178  		writerOpts.BlockPropertyCollectors = nil
  3179  	}
  3180  
  3181  	// prevPointKey is a sstable.WriterOption that provides access to
  3182  	// the last point key written to a writer's sstable. When a new
  3183  	// output begins in newOutput, prevPointKey is updated to point to
  3184  	// the new output's sstable.Writer. This allows the compaction loop
  3185  	// to access the last written point key without requiring the
  3186  	// compaction loop to make a copy of each key ahead of time. Users
  3187  	// must be careful, because the byte slice returned by UnsafeKey
  3188  	// points directly into the Writer's block buffer.
  3189  	var prevPointKey sstable.PreviousPointKeyOpt
  3190  	var cpuWorkHandle CPUWorkHandle
  3191  	defer func() {
  3192  		if cpuWorkHandle != nil {
  3193  			d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle)
  3194  		}
  3195  	}()
  3196  
  3197  	newOutput := func() error {
  3198  		// Check if we've been cancelled by a concurrent operation.
  3199  		if c.cancel.Load() {
  3200  			return ErrCancelledCompaction
  3201  		}
  3202  		fileMeta := &fileMetadata{}
  3203  		d.mu.Lock()
  3204  		fileNum := d.mu.versions.getNextFileNum()
  3205  		fileMeta.FileNum = fileNum
  3206  		pendingOutputs = append(pendingOutputs, fileMeta.PhysicalMeta())
  3207  		d.mu.Unlock()
  3208  
  3209  		ctx := context.TODO()
  3210  		if objiotracing.Enabled {
  3211  			ctx = objiotracing.WithLevel(ctx, c.outputLevel.level)
  3212  			switch c.kind {
  3213  			case compactionKindFlush:
  3214  				ctx = objiotracing.WithReason(ctx, objiotracing.ForFlush)
  3215  			case compactionKindIngestedFlushable:
  3216  				ctx = objiotracing.WithReason(ctx, objiotracing.ForIngestion)
  3217  			default:
  3218  				ctx = objiotracing.WithReason(ctx, objiotracing.ForCompaction)
  3219  			}
  3220  		}
  3221  		// Prefer shared storage if present.
  3222  		createOpts := objstorage.CreateOptions{
  3223  			PreferSharedStorage: remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level),
  3224  		}
  3225  		writable, objMeta, err := d.objProvider.Create(ctx, fileTypeTable, fileNum.DiskFileNum(), createOpts)
  3226  		if err != nil {
  3227  			return err
  3228  		}
  3229  
  3230  		reason := "flushing"
  3231  		if c.flushing == nil {
  3232  			reason = "compacting"
  3233  		}
  3234  		d.opts.EventListener.TableCreated(TableCreateInfo{
  3235  			JobID:   jobID,
  3236  			Reason:  reason,
  3237  			Path:    d.objProvider.Path(objMeta),
  3238  			FileNum: fileNum,
  3239  		})
  3240  		if c.kind != compactionKindFlush {
  3241  			writable = &compactionWritable{
  3242  				Writable: writable,
  3243  				versions: d.mu.versions,
  3244  				written:  &c.bytesWritten,
  3245  			}
  3246  		}
  3247  		createdFiles = append(createdFiles, fileNum.DiskFileNum())
  3248  		cacheOpts := private.SSTableCacheOpts(d.cacheID, fileNum.DiskFileNum()).(sstable.WriterOption)
  3249  
  3250  		const MaxFileWriteAdditionalCPUTime = time.Millisecond * 100
  3251  		cpuWorkHandle = d.opts.Experimental.CPUWorkPermissionGranter.GetPermission(
  3252  			MaxFileWriteAdditionalCPUTime,
  3253  		)
  3254  		writerOpts.Parallelism =
  3255  			d.opts.Experimental.MaxWriterConcurrency > 0 &&
  3256  				(cpuWorkHandle.Permitted() || d.opts.Experimental.ForceWriterParallelism)
  3257  
  3258  		tw = sstable.NewWriter(writable, writerOpts, cacheOpts, &prevPointKey)
  3259  
  3260  		fileMeta.CreationTime = time.Now().Unix()
  3261  		ve.NewFiles = append(ve.NewFiles, newFileEntry{
  3262  			Level: c.outputLevel.level,
  3263  			Meta:  fileMeta,
  3264  		})
  3265  		return nil
  3266  	}
  3267  
  3268  	// splitL0Outputs is true during flushes and intra-L0 compactions with flush
  3269  	// splits enabled.
  3270  	splitL0Outputs := c.outputLevel.level == 0 && d.opts.FlushSplitBytes > 0
  3271  
  3272  	// finishOutput is called with a user key up to which all tombstones
  3273  	// should be flushed. Typically, this is the first key of the next
  3274  	// sstable or an empty key if this output is the final sstable.
  3275  	finishOutput := func(splitKey []byte) error {
  3276  		// If we haven't output any point records to the sstable (tw == nil) then the
  3277  		// sstable will only contain range tombstones and/or range keys. The smallest
  3278  		// key in the sstable will be the start key of the first range tombstone or
  3279  		// range key added. We need to ensure that this start key is distinct from
  3280  		// the splitKey passed to finishOutput (if set), otherwise we would generate
  3281  		// an sstable where the largest key is smaller than the smallest key due to
  3282  		// how the largest key boundary is set below. NB: It is permissible for the
  3283  		// range tombstone / range key start key to be the empty string.
  3284  		//
  3285  		// TODO: It is unfortunate that we have to do this check here rather than
  3286  		// when we decide to finish the sstable in the runCompaction loop. A better
  3287  		// structure currently eludes us.
  3288  		if tw == nil {
  3289  			startKey := c.rangeDelFrag.Start()
  3290  			if len(iter.tombstones) > 0 {
  3291  				startKey = iter.tombstones[0].Start
  3292  			}
  3293  			if startKey == nil {
  3294  				startKey = c.rangeKeyFrag.Start()
  3295  				if len(iter.rangeKeys) > 0 {
  3296  					startKey = iter.rangeKeys[0].Start
  3297  				}
  3298  			}
  3299  			if splitKey != nil && d.cmp(startKey, splitKey) == 0 {
  3300  				return nil
  3301  			}
  3302  		}
  3303  
  3304  		// NB: clone the key because the data can be held on to by the call to
  3305  		// compactionIter.Tombstones via keyspan.Fragmenter.FlushTo, and by the
  3306  		// WriterMetadata.LargestRangeDel.UserKey.
  3307  		splitKey = append([]byte(nil), splitKey...)
  3308  		for _, v := range iter.Tombstones(splitKey) {
  3309  			if tw == nil {
  3310  				if err := newOutput(); err != nil {
  3311  					return err
  3312  				}
  3313  			}
  3314  			// The tombstone being added could be completely outside the
  3315  			// eventual bounds of the sstable. Consider this example (bounds
  3316  			// in square brackets next to table filename):
  3317  			//
  3318  			// ./000240.sst   [tmgc#391,MERGE-tmgc#391,MERGE]
  3319  			// tmgc#391,MERGE [786e627a]
  3320  			// tmgc-udkatvs#331,RANGEDEL
  3321  			//
  3322  			// ./000241.sst   [tmgc#384,MERGE-tmgc#384,MERGE]
  3323  			// tmgc#384,MERGE [666c7070]
  3324  			// tmgc-tvsalezade#383,RANGEDEL
  3325  			// tmgc-tvsalezade#331,RANGEDEL
  3326  			//
  3327  			// ./000242.sst   [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL]
  3328  			// tmgc-tvsalezade#383,RANGEDEL
  3329  			// tmgc#375,SET [72646c78766965616c72776865676e79]
  3330  			// tmgc-tvsalezade#356,RANGEDEL
  3331  			//
  3332  			// Note that both of the top two SSTables have range tombstones
  3333  			// that start after the file's end keys. Since the file bound
  3334  			// computation happens well after all range tombstones have been
  3335  			// added to the writer, eliding out-of-file range tombstones based
  3336  			// on sequence number at this stage is difficult, and necessitates
  3337  			// read-time logic to ignore range tombstones outside file bounds.
  3338  			if err := rangedel.Encode(&v, tw.Add); err != nil {
  3339  				return err
  3340  			}
  3341  		}
  3342  		for _, v := range iter.RangeKeys(splitKey) {
  3343  			// Same logic as for range tombstones, except added using tw.AddRangeKey.
  3344  			if tw == nil {
  3345  				if err := newOutput(); err != nil {
  3346  					return err
  3347  				}
  3348  			}
  3349  			if err := rangekey.Encode(&v, tw.AddRangeKey); err != nil {
  3350  				return err
  3351  			}
  3352  		}
  3353  
  3354  		if tw == nil {
  3355  			return nil
  3356  		}
  3357  		{
  3358  			// Set internal sstable properties.
  3359  			p := getInternalWriterProperties(tw)
  3360  			// Set the external sst version to 0. This is what RocksDB expects for
  3361  			// db-internal sstables; otherwise, it could apply a global sequence number.
  3362  			p.ExternalFormatVersion = 0
  3363  			// Set the snapshot pinned totals.
  3364  			p.SnapshotPinnedKeys = pinnedCount
  3365  			p.SnapshotPinnedKeySize = pinnedKeySize
  3366  			p.SnapshotPinnedValueSize = pinnedValueSize
  3367  			stats.cumulativePinnedKeys += pinnedCount
  3368  			stats.cumulativePinnedSize += pinnedKeySize + pinnedValueSize
  3369  			pinnedCount = 0
  3370  			pinnedKeySize = 0
  3371  			pinnedValueSize = 0
  3372  		}
  3373  		if err := tw.Close(); err != nil {
  3374  			tw = nil
  3375  			return err
  3376  		}
  3377  		d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle)
  3378  		cpuWorkHandle = nil
  3379  		writerMeta, err := tw.Metadata()
  3380  		if err != nil {
  3381  			tw = nil
  3382  			return err
  3383  		}
  3384  		tw = nil
  3385  		meta := ve.NewFiles[len(ve.NewFiles)-1].Meta
  3386  		meta.Size = writerMeta.Size
  3387  		meta.SmallestSeqNum = writerMeta.SmallestSeqNum
  3388  		meta.LargestSeqNum = writerMeta.LargestSeqNum
  3389  		meta.InitPhysicalBacking()
  3390  
  3391  		// If the file didn't contain any range deletions, we can fill its
  3392  		// table stats now, avoiding unnecessarily loading the table later.
  3393  		maybeSetStatsFromProperties(
  3394  			meta.PhysicalMeta(), &writerMeta.Properties,
  3395  		)
  3396  
  3397  		if c.flushing == nil {
  3398  			outputMetrics.TablesCompacted++
  3399  			outputMetrics.BytesCompacted += meta.Size
  3400  		} else {
  3401  			outputMetrics.TablesFlushed++
  3402  			outputMetrics.BytesFlushed += meta.Size
  3403  		}
  3404  		outputMetrics.Size += int64(meta.Size)
  3405  		outputMetrics.NumFiles++
  3406  		outputMetrics.Additional.BytesWrittenDataBlocks += writerMeta.Properties.DataSize
  3407  		outputMetrics.Additional.BytesWrittenValueBlocks += writerMeta.Properties.ValueBlocksSize
  3408  
  3409  		if n := len(ve.NewFiles); n > 1 {
  3410  			// This is not the first output file. Ensure the sstable boundaries
  3411  			// are nonoverlapping.
  3412  			prevMeta := ve.NewFiles[n-2].Meta
  3413  			if writerMeta.SmallestRangeDel.UserKey != nil {
  3414  				c := d.cmp(writerMeta.SmallestRangeDel.UserKey, prevMeta.Largest.UserKey)
  3415  				if c < 0 {
  3416  					return errors.Errorf(
  3417  						"pebble: smallest range tombstone start key is less than previous sstable largest key: %s < %s",
  3418  						writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey),
  3419  						prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey))
  3420  				} else if c == 0 && !prevMeta.Largest.IsExclusiveSentinel() {
  3421  					// The user key portion of the range boundary start key is
  3422  					// equal to the previous table's largest key user key, and
  3423  					// the previous table's largest key is not exclusive. This
  3424  					// violates the invariant that tables are key-space
  3425  					// partitioned.
  3426  					return errors.Errorf(
  3427  						"pebble: invariant violation: previous sstable largest key %s, current sstable smallest rangedel: %s",
  3428  						prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey),
  3429  						writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey),
  3430  					)
  3431  				}
  3432  			}
  3433  		}
  3434  
  3435  		// Verify that all range deletions output to the sstable are
  3436  		// truncated to the split key.
  3437  		if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil &&
  3438  			d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) > 0 {
  3439  			return errors.Errorf(
  3440  				"pebble: invariant violation: rangedel largest key %q extends beyond split key %q",
  3441  				writerMeta.LargestRangeDel.Pretty(d.opts.Comparer.FormatKey),
  3442  				d.opts.Comparer.FormatKey(splitKey),
  3443  			)
  3444  		}
  3445  
  3446  		if writerMeta.HasPointKeys {
  3447  			meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestPoint, writerMeta.LargestPoint)
  3448  		}
  3449  		if writerMeta.HasRangeDelKeys {
  3450  			meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestRangeDel, writerMeta.LargestRangeDel)
  3451  		}
  3452  		if writerMeta.HasRangeKeys {
  3453  			meta.ExtendRangeKeyBounds(d.cmp, writerMeta.SmallestRangeKey, writerMeta.LargestRangeKey)
  3454  		}
  3455  
  3456  		// Verify that the sstable bounds fall within the compaction input
  3457  		// bounds. This is a sanity check that we don't have a logic error
  3458  		// elsewhere that causes the sstable bounds to accidentally expand past the
  3459  		// compaction input bounds, as doing so could lead to various badness, such
  3460  		// as keys being incorrectly deleted by a range tombstone.
  3461  		if c.smallest.UserKey != nil {
  3462  			switch v := d.cmp(meta.Smallest.UserKey, c.smallest.UserKey); {
  3463  			case v >= 0:
  3464  				// Nothing to do.
  3465  			case v < 0:
  3466  				return errors.Errorf("pebble: compaction output grew beyond bounds of input: %s < %s",
  3467  					meta.Smallest.Pretty(d.opts.Comparer.FormatKey),
  3468  					c.smallest.Pretty(d.opts.Comparer.FormatKey))
  3469  			}
  3470  		}
  3471  		if c.largest.UserKey != nil {
  3472  			switch v := d.cmp(meta.Largest.UserKey, c.largest.UserKey); {
  3473  			case v <= 0:
  3474  				// Nothing to do.
  3475  			case v > 0:
  3476  				return errors.Errorf("pebble: compaction output grew beyond bounds of input: %s > %s",
  3477  					meta.Largest.Pretty(d.opts.Comparer.FormatKey),
  3478  					c.largest.Pretty(d.opts.Comparer.FormatKey))
  3479  			}
  3480  		}
  3481  		// Verify that we never split different revisions of the same user key
  3482  		// across two different sstables.
  3483  		if err := c.errorOnUserKeyOverlap(ve); err != nil {
  3484  			return err
  3485  		}
  3486  		if err := meta.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
  3487  			return err
  3488  		}
  3489  		return nil
  3490  	}
  3491  
  3492  	// Build a compactionOutputSplitter that contains all logic to determine
  3493  	// whether the compaction loop should stop writing to one output sstable and
  3494  	// switch to a new one. Some splitters can wrap other splitters, and the
  3495  	// splitterGroup can be composed of multiple splitters. In this case, we
  3496  	// start off with splitters for file sizes, grandparent limits, and (for L0
  3497  	// splits) L0 limits, before wrapping them in a splitterGroup.
  3498  	sizeSplitter := newFileSizeSplitter(&iter.frontiers, c.maxOutputFileSize, c.grandparents.Iter())
  3499  	unsafePrevUserKey := func() []byte {
  3500  		// Return the largest point key written to tw or the start of
  3501  		// the current range deletion in the fragmenter, whichever is
  3502  		// greater.
  3503  		prevPoint := prevPointKey.UnsafeKey()
  3504  		if c.cmp(prevPoint.UserKey, c.rangeDelFrag.Start()) > 0 {
  3505  			return prevPoint.UserKey
  3506  		}
  3507  		return c.rangeDelFrag.Start()
  3508  	}
  3509  	outputSplitters := []compactionOutputSplitter{
  3510  		// We do not split the same user key across different sstables within
  3511  		// one flush or compaction. The fileSizeSplitter may request a split in
  3512  		// the middle of a user key, so the userKeyChangeSplitter ensures we are
  3513  		// at a user key change boundary when doing a split.
  3514  		&userKeyChangeSplitter{
  3515  			cmp:               c.cmp,
  3516  			splitter:          sizeSplitter,
  3517  			unsafePrevUserKey: unsafePrevUserKey,
  3518  		},
  3519  		newLimitFuncSplitter(&iter.frontiers, c.findGrandparentLimit),
  3520  	}
  3521  	if splitL0Outputs {
  3522  		outputSplitters = append(outputSplitters, newLimitFuncSplitter(&iter.frontiers, c.findL0Limit))
  3523  	}
  3524  	splitter := &splitterGroup{cmp: c.cmp, splitters: outputSplitters}
  3525  
  3526  	// Each outer loop iteration produces one output file. An iteration that
  3527  	// produces a file containing point keys (and optionally range tombstones)
  3528  	// guarantees that the input iterator advanced. An iteration that produces
  3529  	// a file containing only range tombstones guarantees the limit passed to
  3530  	// `finishOutput()` advanced to a strictly greater user key corresponding
  3531  	// to a grandparent file largest key, or nil. Taken together, these
  3532  	// progress guarantees ensure that eventually the input iterator will be
  3533  	// exhausted and the range tombstone fragments will all be flushed.
  3534  	for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty() || !c.rangeKeyFrag.Empty(); {
  3535  		var firstKey []byte
  3536  		if key != nil {
  3537  			firstKey = key.UserKey
  3538  		} else if startKey := c.rangeDelFrag.Start(); startKey != nil {
  3539  			// Pass the start key of the first pending tombstone to find the
  3540  			// next limit. All pending tombstones have the same start key. We
  3541  			// use this as opposed to the end key of the last written sstable to
  3542  			// effectively handle cases like these:
  3543  			//
  3544  			// a.SET.3
  3545  			// (lf.limit at b)
  3546  			// d.RANGEDEL.4:f
  3547  			//
  3548  			// In this case, the partition after b has only range deletions, so
  3549  			// if we were to find the limit after the last written key at the
  3550  			// split point (key a), we'd get the limit b again, and
  3551  			// finishOutput() would not advance any further because the next
  3552  			// range tombstone to write does not start until after the L0 split
  3553  			// point.
  3554  			firstKey = startKey
  3555  		}
  3556  		splitterSuggestion := splitter.onNewOutput(firstKey)
  3557  
  3558  		// Each inner loop iteration processes one key from the input iterator.
  3559  		for ; key != nil; key, val = iter.Next() {
  3560  			if split := splitter.shouldSplitBefore(key, tw); split == splitNow {
  3561  				break
  3562  			}
  3563  
  3564  			switch key.Kind() {
  3565  			case InternalKeyKindRangeDelete:
  3566  				// Range tombstones are handled specially. They are fragmented,
  3567  				// and they're not written until later during `finishOutput()`.
  3568  				// We add them to the `Fragmenter` now to make them visible to
  3569  				// `compactionIter` so covered keys in the same snapshot stripe
  3570  				// can be elided.
  3571  
  3572  				// The interleaved range deletion might only be one of many with
  3573  				// these bounds. Some fragmenting is performed ahead of time by
  3574  				// keyspan.MergingIter.
  3575  				if s := c.rangeDelIter.Span(); !s.Empty() {
  3576  					// The memory management here is subtle. Range deletions
  3577  					// blocks do NOT use prefix compression, which ensures that
  3578  					// range deletion spans' memory is available as long we keep
  3579  					// the iterator open. However, the keyspan.MergingIter that
  3580  					// merges spans across levels only guarantees the lifetime
  3581  					// of the [start, end) bounds until the next positioning
  3582  					// method is called.
  3583  					//
  3584  					// Additionally, the Span.Keys slice is owned by the
  3585  					// range deletion iterator stack, and it may be overwritten
  3586  					// when we advance.
  3587  					//
  3588  					// Clone the Keys slice and the start and end keys.
  3589  					//
  3590  					// TODO(jackson): Avoid the clone by removing c.rangeDelFrag
  3591  					// and performing explicit truncation of the pending
  3592  					// rangedel span as necessary.
  3593  					clone := keyspan.Span{
  3594  						Start: iter.cloneKey(s.Start),
  3595  						End:   iter.cloneKey(s.End),
  3596  						Keys:  make([]keyspan.Key, len(s.Keys)),
  3597  					}
  3598  					copy(clone.Keys, s.Keys)
  3599  					c.rangeDelFrag.Add(clone)
  3600  				}
  3601  				continue
  3602  			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  3603  				// Range keys are handled in the same way as range tombstones, except
  3604  				// with a dedicated fragmenter.
  3605  				if s := c.rangeKeyInterleaving.Span(); !s.Empty() {
  3606  					clone := keyspan.Span{
  3607  						Start: iter.cloneKey(s.Start),
  3608  						End:   iter.cloneKey(s.End),
  3609  						Keys:  make([]keyspan.Key, len(s.Keys)),
  3610  					}
  3611  					// Since the keys' Suffix and Value fields are not deep cloned, the
  3612  					// underlying blockIter must be kept open for the lifetime of the
  3613  					// compaction.
  3614  					copy(clone.Keys, s.Keys)
  3615  					c.rangeKeyFrag.Add(clone)
  3616  				}
  3617  				continue
  3618  			}
  3619  			if tw == nil {
  3620  				if err := newOutput(); err != nil {
  3621  					return nil, pendingOutputs, stats, err
  3622  				}
  3623  			}
  3624  			if err := tw.AddWithForceObsolete(*key, val, iter.forceObsoleteDueToRangeDel); err != nil {
  3625  				return nil, pendingOutputs, stats, err
  3626  			}
  3627  			if iter.snapshotPinned {
  3628  				// The kv pair we just added to the sstable was only surfaced by
  3629  				// the compaction iterator because an open snapshot prevented
  3630  				// its elision. Increment the stats.
  3631  				pinnedCount++
  3632  				pinnedKeySize += uint64(len(key.UserKey)) + base.InternalTrailerLen
  3633  				pinnedValueSize += uint64(len(val))
  3634  			}
  3635  		}
  3636  
  3637  		// A splitter requested a split, and we're ready to finish the output.
  3638  		// We need to choose the key at which to split any pending range
  3639  		// tombstones. There are two options:
  3640  		// 1. splitterSuggestion — The key suggested by the splitter. This key
  3641  		//    is guaranteed to be greater than the last key written to the
  3642  		//    current output.
  3643  		// 2. key.UserKey — the first key of the next sstable output. This user
  3644  		//     key is also guaranteed to be greater than the last user key
  3645  		//     written to the current output (see userKeyChangeSplitter).
  3646  		//
  3647  		// Use whichever is smaller. Using the smaller of the two limits
  3648  		// overlap with grandparents. Consider the case where the
  3649  		// grandparent limit is calculated to be 'b', key is 'x', and
  3650  		// there exist many sstables between 'b' and 'x'. If the range
  3651  		// deletion fragmenter has a pending tombstone [a,x), splitting
  3652  		// at 'x' would cause the output table to overlap many
  3653  		// grandparents well beyond the calculated grandparent limit
  3654  		// 'b'. Splitting at the smaller `splitterSuggestion` avoids
  3655  		// this unbounded overlap with grandparent tables.
  3656  		splitKey := splitterSuggestion
  3657  		if key != nil && (splitKey == nil || c.cmp(splitKey, key.UserKey) > 0) {
  3658  			splitKey = key.UserKey
  3659  		}
  3660  		if err := finishOutput(splitKey); err != nil {
  3661  			return nil, pendingOutputs, stats, err
  3662  		}
  3663  	}
  3664  
  3665  	for _, cl := range c.inputs {
  3666  		iter := cl.files.Iter()
  3667  		for f := iter.First(); f != nil; f = iter.Next() {
  3668  			ve.DeletedFiles[deletedFileEntry{
  3669  				Level:   cl.level,
  3670  				FileNum: f.FileNum,
  3671  			}] = f
  3672  		}
  3673  	}
  3674  
  3675  	// The compaction iterator keeps track of a count of the number of DELSIZED
  3676  	// keys that encoded an incorrect size. Propagate it up as a part of
  3677  	// compactStats.
  3678  	stats.countMissizedDels = iter.stats.countMissizedDels
  3679  
  3680  	if err := d.objProvider.Sync(); err != nil {
  3681  		return nil, pendingOutputs, stats, err
  3682  	}
  3683  
  3684  	// Refresh the disk available statistic whenever a compaction/flush
  3685  	// completes, before re-acquiring the mutex.
  3686  	_ = d.calculateDiskAvailableBytes()
  3687  
  3688  	return ve, pendingOutputs, stats, nil
  3689  }
  3690  
  3691  // validateVersionEdit validates that start and end keys across new and deleted
  3692  // files in a versionEdit pass the given validation function.
  3693  func validateVersionEdit(
  3694  	ve *versionEdit, validateFn func([]byte) error, format base.FormatKey,
  3695  ) error {
  3696  	validateMetaFn := func(f *manifest.FileMetadata) error {
  3697  		for _, key := range []InternalKey{f.Smallest, f.Largest} {
  3698  			if err := validateFn(key.UserKey); err != nil {
  3699  				return errors.Wrapf(err, "key=%q; file=%s", format(key.UserKey), f)
  3700  			}
  3701  		}
  3702  		return nil
  3703  	}
  3704  
  3705  	// Validate both new and deleted files.
  3706  	for _, f := range ve.NewFiles {
  3707  		if err := validateMetaFn(f.Meta); err != nil {
  3708  			return err
  3709  		}
  3710  	}
  3711  	for _, m := range ve.DeletedFiles {
  3712  		if err := validateMetaFn(m); err != nil {
  3713  			return err
  3714  		}
  3715  	}
  3716  
  3717  	return nil
  3718  }
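        
        // A minimal sketch (assumed, not part of the original source) of a
        // validation function of the shape validateVersionEdit expects, for example
        // one supplied via Options.Experimental.KeyValidationFunc, rejecting empty
        // user keys:
        //
        //	validateFn := func(userKey []byte) error {
        //		if len(userKey) == 0 {
        //			return errors.New("pebble: empty user key")
        //		}
        //		return nil
        //	}
        //
        // validateVersionEdit(ve, validateFn, formatKey) would then fail if any new
        // or deleted file's bounds contained an empty user key.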
  3719  
  3720  // scanObsoleteFiles scans the filesystem for files that are no longer needed
  3721  // and adds those to the internal lists of obsolete files. Note that the files
  3722  // are not actually deleted by this method. A subsequent call to
  3723  // deleteObsoleteFiles must be performed. Must not be called concurrently
  3724  // with compactions and flushes. db.mu must be held when calling this function.
  3725  func (d *DB) scanObsoleteFiles(list []string) {
  3726  	// Disable automatic compactions temporarily to prevent concurrent compactions /
  3727  	// flushes from interfering. The original value is restored on completion.
  3728  	disabledPrev := d.opts.DisableAutomaticCompactions
  3729  	defer func() {
  3730  		d.opts.DisableAutomaticCompactions = disabledPrev
  3731  	}()
  3732  	d.opts.DisableAutomaticCompactions = true
  3733  
  3734  	// Wait for any ongoing compaction to complete before continuing.
  3735  	for d.mu.compact.compactingCount > 0 || d.mu.compact.flushing {
  3736  		d.mu.compact.cond.Wait()
  3737  	}
  3738  
  3739  	liveFileNums := make(map[base.DiskFileNum]struct{})
  3740  	d.mu.versions.addLiveFileNums(liveFileNums)
  3741  	// Prevent files that are only referred to by the ingestedFlushable
  3742  	// from being deleted. These are added to the flushable queue on WAL replay
  3743  	// during read only mode and aren't part of the Version. Note that if
  3744  	// !d.opts.ReadOnly, then all flushables of type ingestedFlushable have
  3745  	// already been flushed.
  3746  	for _, fEntry := range d.mu.mem.queue {
  3747  		if f, ok := fEntry.flushable.(*ingestedFlushable); ok {
  3748  			for _, file := range f.files {
  3749  				liveFileNums[file.FileBacking.DiskFileNum] = struct{}{}
  3750  			}
  3751  		}
  3752  	}
  3753  
  3754  	minUnflushedLogNum := d.mu.versions.minUnflushedLogNum
  3755  	manifestFileNum := d.mu.versions.manifestFileNum
  3756  
  3757  	var obsoleteLogs []fileInfo
  3758  	var obsoleteTables []fileInfo
  3759  	var obsoleteManifests []fileInfo
  3760  	var obsoleteOptions []fileInfo
  3761  
  3762  	for _, filename := range list {
  3763  		fileType, diskFileNum, ok := base.ParseFilename(d.opts.FS, filename)
  3764  		if !ok {
  3765  			continue
  3766  		}
  3767  		switch fileType {
  3768  		case fileTypeLog:
  3769  			if diskFileNum >= minUnflushedLogNum {
  3770  				continue
  3771  			}
  3772  			fi := fileInfo{fileNum: diskFileNum}
  3773  			if stat, err := d.opts.FS.Stat(filename); err == nil {
  3774  				fi.fileSize = uint64(stat.Size())
  3775  			}
  3776  			obsoleteLogs = append(obsoleteLogs, fi)
  3777  		case fileTypeManifest:
  3778  			if diskFileNum >= manifestFileNum {
  3779  				continue
  3780  			}
  3781  			fi := fileInfo{fileNum: diskFileNum}
  3782  			if stat, err := d.opts.FS.Stat(filename); err == nil {
  3783  				fi.fileSize = uint64(stat.Size())
  3784  			}
  3785  			obsoleteManifests = append(obsoleteManifests, fi)
  3786  		case fileTypeOptions:
  3787  			if diskFileNum.FileNum() >= d.optionsFileNum.FileNum() {
  3788  				continue
  3789  			}
  3790  			fi := fileInfo{fileNum: diskFileNum}
  3791  			if stat, err := d.opts.FS.Stat(filename); err == nil {
  3792  				fi.fileSize = uint64(stat.Size())
  3793  			}
  3794  			obsoleteOptions = append(obsoleteOptions, fi)
  3795  		case fileTypeTable:
  3796  			// Objects are handled through the objstorage provider below.
  3797  		default:
  3798  			// Don't delete files we don't know about.
  3799  		}
  3800  	}
  3801  
  3802  	objects := d.objProvider.List()
  3803  	for _, obj := range objects {
  3804  		switch obj.FileType {
  3805  		case fileTypeTable:
  3806  			if _, ok := liveFileNums[obj.DiskFileNum]; ok {
  3807  				continue
  3808  			}
  3809  			fileInfo := fileInfo{
  3810  				fileNum: obj.DiskFileNum,
  3811  			}
  3812  			if size, err := d.objProvider.Size(obj); err == nil {
  3813  				fileInfo.fileSize = uint64(size)
  3814  			}
  3815  			obsoleteTables = append(obsoleteTables, fileInfo)
  3816  
  3817  		default:
  3818  			// Ignore object types we don't know about.
  3819  		}
  3820  	}
  3821  
  3822  	d.mu.log.queue = merge(d.mu.log.queue, obsoleteLogs)
  3823  	d.mu.versions.metrics.WAL.Files = int64(len(d.mu.log.queue))
  3824  	d.mu.versions.obsoleteTables = merge(d.mu.versions.obsoleteTables, obsoleteTables)
  3825  	d.mu.versions.updateObsoleteTableMetricsLocked()
  3826  	d.mu.versions.obsoleteManifests = merge(d.mu.versions.obsoleteManifests, obsoleteManifests)
  3827  	d.mu.versions.obsoleteOptions = merge(d.mu.versions.obsoleteOptions, obsoleteOptions)
  3828  }
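// Illustrative note (added commentary, not part of the original file): the
// classification in scanObsoleteFiles relies on base.ParseFilename, which maps
// directory entries to a file type and disk file number, roughly:
//
//	"000123.log"      -> fileTypeLog, DiskFileNum(123)
//	"000123.sst"      -> fileTypeTable, DiskFileNum(123)
//	"MANIFEST-000007" -> fileTypeManifest, DiskFileNum(7)
//	"OPTIONS-000003"  -> fileTypeOptions, DiskFileNum(3)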
  3829  
  3830  // disableFileDeletions disables file deletions and then waits for any
  3831  // in-progress deletion to finish. The caller is required to call
  3832  // enableFileDeletions in order to enable file deletions again. It is ok for
  3833  // multiple callers to disable file deletions simultaneously, though they must
  3834  // all invoke enableFileDeletions in order for file deletions to be re-enabled
  3835  // (there is an internal reference count on file deletion disablement).
  3836  //
  3837  // d.mu must be held when calling this method.
  3838  func (d *DB) disableFileDeletions() {
  3839  	d.mu.disableFileDeletions++
  3840  	d.mu.Unlock()
  3841  	defer d.mu.Lock()
  3842  	d.cleanupManager.Wait()
  3843  }
  3844  
  3845  // enableFileDeletions enables previously disabled file deletions. A cleanup job
  3846  // is queued if necessary.
  3847  //
  3848  // d.mu must be held when calling this method.
  3849  func (d *DB) enableFileDeletions() {
  3850  	if d.mu.disableFileDeletions <= 0 {
  3851  		panic("pebble: file deletion disablement invariant violated")
  3852  	}
  3853  	d.mu.disableFileDeletions--
  3854  	if d.mu.disableFileDeletions > 0 {
  3855  		return
  3856  	}
  3857  	jobID := d.mu.nextJobID
  3858  	d.mu.nextJobID++
  3859  	d.deleteObsoleteFiles(jobID)
  3860  }
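// Illustrative sketch (added commentary, not part of the original file): a
// hypothetical caller that needs file deletions paused pairs the two calls
// while holding d.mu, for example:
//
//	d.mu.Lock()
//	d.disableFileDeletions()
//	d.mu.Unlock()
//	defer func() {
//		d.mu.Lock()
//		d.enableFileDeletions()
//		d.mu.Unlock()
//	}()
//	// ... work that must not race with file deletions ...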
  3861  
  3862  type fileInfo struct {
  3863  	fileNum  base.DiskFileNum
  3864  	fileSize uint64
  3865  }
  3866  
  3867  // deleteObsoleteFiles enqueues a cleanup job with the cleanup manager, if necessary.
  3868  //
  3869  // d.mu must be held when calling this. The function will release and re-acquire the mutex.
  3870  //
  3871  // Does nothing if file deletions are disabled (see disableFileDeletions). A
  3872  // cleanup job will be scheduled when file deletions are re-enabled.
  3873  func (d *DB) deleteObsoleteFiles(jobID int) {
  3874  	if d.mu.disableFileDeletions > 0 {
  3875  		return
  3876  	}
  3877  
  3878  	var obsoleteLogs []fileInfo
  3879  	for i := range d.mu.log.queue {
  3880  		// NB: d.mu.versions.minUnflushedLogNum is the log number of the earliest
  3881  		// log that has not had its contents flushed to an sstable. We can recycle
  3882  		// the prefix of d.mu.log.queue with log numbers less than
  3883  		// minUnflushedLogNum.
  3884  		if d.mu.log.queue[i].fileNum >= d.mu.versions.minUnflushedLogNum {
  3885  			obsoleteLogs = d.mu.log.queue[:i]
  3886  			d.mu.log.queue = d.mu.log.queue[i:]
  3887  			d.mu.versions.metrics.WAL.Files -= int64(len(obsoleteLogs))
  3888  			break
  3889  		}
  3890  	}
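	// Illustrative note (added commentary, not part of the original file): with
	// a hypothetical minUnflushedLogNum of 12 and a log queue holding fileNums
	// [9, 10, 12, 13], the loop above splits off [9, 10] as obsolete and keeps
	// [12, 13] in the queue.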
  3891  
  3892  	obsoleteTables := append([]fileInfo(nil), d.mu.versions.obsoleteTables...)
  3893  	d.mu.versions.obsoleteTables = nil
  3894  
  3895  	for _, tbl := range obsoleteTables {
  3896  		delete(d.mu.versions.zombieTables, tbl.fileNum)
  3897  	}
  3898  
  3899  	// Sort the manifests because we want to delete some contiguous prefix
  3900  	// of the older manifests.
  3901  	slices.SortFunc(d.mu.versions.obsoleteManifests, func(a, b fileInfo) int {
  3902  		return cmp.Compare(a.fileNum, b.fileNum)
  3903  	})
  3904  
  3905  	var obsoleteManifests []fileInfo
  3906  	manifestsToDelete := len(d.mu.versions.obsoleteManifests) - d.opts.NumPrevManifest
  3907  	if manifestsToDelete > 0 {
  3908  		obsoleteManifests = d.mu.versions.obsoleteManifests[:manifestsToDelete]
  3909  		d.mu.versions.obsoleteManifests = d.mu.versions.obsoleteManifests[manifestsToDelete:]
  3910  		if len(d.mu.versions.obsoleteManifests) == 0 {
  3911  			d.mu.versions.obsoleteManifests = nil
  3912  		}
  3913  	}
  3914  
  3915  	obsoleteOptions := d.mu.versions.obsoleteOptions
  3916  	d.mu.versions.obsoleteOptions = nil
  3917  
  3918  	// Release d.mu while preparing the cleanup job and possibly waiting.
  3919  	// Note the unusual order: Unlock and then Lock.
  3920  	d.mu.Unlock()
  3921  	defer d.mu.Lock()
  3922  
  3923  	files := [4]struct {
  3924  		fileType fileType
  3925  		obsolete []fileInfo
  3926  	}{
  3927  		{fileTypeLog, obsoleteLogs},
  3928  		{fileTypeTable, obsoleteTables},
  3929  		{fileTypeManifest, obsoleteManifests},
  3930  		{fileTypeOptions, obsoleteOptions},
  3931  	}
  3932  	_, noRecycle := d.opts.Cleaner.(base.NeedsFileContents)
  3933  	filesToDelete := make([]obsoleteFile, 0, len(obsoleteLogs)+len(obsoleteTables)+len(obsoleteManifests)+len(obsoleteOptions))
  3934  	for _, f := range files {
  3935  		// We sort to make the order of deletions deterministic, which is nice for
  3936  		// tests.
  3937  		slices.SortFunc(f.obsolete, func(a, b fileInfo) int {
  3938  			return cmp.Compare(a.fileNum, b.fileNum)
  3939  		})
  3940  		for _, fi := range f.obsolete {
  3941  			dir := d.dirname
  3942  			switch f.fileType {
  3943  			case fileTypeLog:
  3944  				if !noRecycle && d.logRecycler.add(fi) {
  3945  					continue
  3946  				}
  3947  				dir = d.walDirname
  3948  			case fileTypeTable:
  3949  				d.tableCache.evict(fi.fileNum)
  3950  			}
  3951  
  3952  			filesToDelete = append(filesToDelete, obsoleteFile{
  3953  				dir:      dir,
  3954  				fileNum:  fi.fileNum,
  3955  				fileType: f.fileType,
  3956  				fileSize: fi.fileSize,
  3957  			})
  3958  		}
  3959  	}
  3960  	if len(filesToDelete) > 0 {
  3961  		d.cleanupManager.EnqueueJob(jobID, filesToDelete)
  3962  	}
  3963  	if d.opts.private.testingAlwaysWaitForCleanup {
  3964  		d.cleanupManager.Wait()
  3965  	}
  3966  }
  3967  
  3968  func (d *DB) maybeScheduleObsoleteTableDeletion() {
  3969  	d.mu.Lock()
  3970  	defer d.mu.Unlock()
  3971  	d.maybeScheduleObsoleteTableDeletionLocked()
  3972  }
  3973  
  3974  func (d *DB) maybeScheduleObsoleteTableDeletionLocked() {
  3975  	if len(d.mu.versions.obsoleteTables) > 0 {
  3976  		jobID := d.mu.nextJobID
  3977  		d.mu.nextJobID++
  3978  		d.deleteObsoleteFiles(jobID)
  3979  	}
  3980  }
  3981  
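// merge appends b to a and returns the result sorted by fileNum, with
// duplicate fileNums removed.
//
// Illustrative example (added commentary, not part of the original file):
//
//	a := []fileInfo{{fileNum: 7}, {fileNum: 3}}
//	b := []fileInfo{{fileNum: 3}, {fileNum: 9}}
//	merged := merge(a, b) // merged contains fileNums 3, 7, 9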
  3982  func merge(a, b []fileInfo) []fileInfo {
  3983  	if len(b) == 0 {
  3984  		return a
  3985  	}
  3986  
  3987  	a = append(a, b...)
  3988  	slices.SortFunc(a, func(a, b fileInfo) int {
  3989  		return cmp.Compare(a.fileNum, b.fileNum)
  3990  	})
  3991  	return slices.CompactFunc(a, func(a, b fileInfo) bool {
  3992  		return a.fileNum == b.fileNum
  3993  	})
  3994  }