github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/compaction_picker.go

     1  // Copyright 2018 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"math"
    11  	"sort"
    12  
    13  	"github.com/zuoyebang/bitalostable/internal/base"
    14  	"github.com/zuoyebang/bitalostable/internal/humanize"
    15  	"github.com/zuoyebang/bitalostable/internal/manifest"
    16  )
    17  
    18  // The minimum count for an intra-L0 compaction. This matches the RocksDB
    19  // heuristic.
    20  const minIntraL0Count = 4
    21  
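        // levelMultiplier is the target size ratio between adjacent levels; it is
        // used by initLevelMaxBytes when deriving per-level target sizes.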
    22  const levelMultiplier = 10
    23  
    24  type compactionEnv struct {
    25  	earliestUnflushedSeqNum uint64
    26  	earliestSnapshotSeqNum  uint64
    27  	inProgressCompactions   []compactionInfo
    28  	readCompactionEnv       readCompactionEnv
    29  }
    30  
    31  type compactionPicker interface {
    32  	getScores([]compactionInfo) [numLevels]float64
    33  	getBaseLevel() int
    34  	getEstimatedMaxWAmp() float64
    35  	estimatedCompactionDebt(l0ExtraSize uint64) uint64
    36  	pickAuto(env compactionEnv) (pc *pickedCompaction)
    37  	pickManual(env compactionEnv, manual *manualCompaction) (c *pickedCompaction, retryLater bool)
    38  	pickElisionOnlyCompaction(env compactionEnv) (pc *pickedCompaction)
    39  	pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction)
    40  	pickReadTriggeredCompaction(env compactionEnv) (pc *pickedCompaction)
    41  	forceBaseLevel1()
    42  }
    43  
    44  // readCompactionEnv is used to hold data required to perform read compactions
    45  type readCompactionEnv struct {
    46  	rescheduleReadCompaction *bool
    47  	readCompactions          *readCompactionQueue
    48  	flushing                 bool
    49  }
    50  
    51  // Information about in-progress compactions provided to the compaction picker. These are used to
    52  // constrain the new compactions that will be picked.
    53  type compactionInfo struct {
    54  	inputs      []compactionLevel
    55  	outputLevel int
    56  	smallest    InternalKey
    57  	largest     InternalKey
    58  }
    59  
    60  func (info compactionInfo) String() string {
    61  	var buf bytes.Buffer
    62  	var largest int
    63  	for i, in := range info.inputs {
    64  		if i > 0 {
    65  			fmt.Fprintf(&buf, " -> ")
    66  		}
    67  		fmt.Fprintf(&buf, "L%d", in.level)
    68  		in.files.Each(func(m *fileMetadata) {
    69  			fmt.Fprintf(&buf, " %s", m.FileNum)
    70  		})
    71  		if largest < in.level {
    72  			largest = in.level
    73  		}
    74  	}
    75  	if largest != info.outputLevel || len(info.inputs) == 1 {
    76  		fmt.Fprintf(&buf, " -> L%d", info.outputLevel)
    77  	}
    78  	return buf.String()
    79  }
    80  
    81  type sortCompactionLevelsDecreasingScore []candidateLevelInfo
    82  
    83  func (s sortCompactionLevelsDecreasingScore) Len() int {
    84  	return len(s)
    85  }
    86  func (s sortCompactionLevelsDecreasingScore) Less(i, j int) bool {
    87  	if s[i].score != s[j].score {
    88  		return s[i].score > s[j].score
    89  	}
    90  	return s[i].level < s[j].level
    91  }
    92  func (s sortCompactionLevelsDecreasingScore) Swap(i, j int) {
    93  	s[i], s[j] = s[j], s[i]
    94  }
    95  
     96  // sublevelInfo pairs the LevelSlice of files in an L0 sublevel with that
     97  // sublevel's identifier.
    98  type sublevelInfo struct {
    99  	manifest.LevelSlice
   100  	sublevel manifest.Level
   101  }
   102  
   103  // generateSublevelInfo will generate the level slices for each of the sublevels
   104  // from the level slice for all of L0.
   105  func generateSublevelInfo(cmp base.Compare, levelFiles manifest.LevelSlice) []sublevelInfo {
   106  	sublevelMap := make(map[uint64][]*fileMetadata)
   107  	it := levelFiles.Iter()
   108  	for f := it.First(); f != nil; f = it.Next() {
   109  		sublevelMap[uint64(f.SubLevel)] = append(sublevelMap[uint64(f.SubLevel)], f)
   110  	}
   111  
   112  	var sublevels []int
   113  	for level := range sublevelMap {
   114  		sublevels = append(sublevels, int(level))
   115  	}
   116  	sort.Ints(sublevels)
   117  
   118  	var levelSlices []sublevelInfo
   119  	for _, sublevel := range sublevels {
   120  		metas := sublevelMap[uint64(sublevel)]
   121  		levelSlices = append(
   122  			levelSlices,
   123  			sublevelInfo{
   124  				manifest.NewLevelSliceKeySorted(cmp, metas),
   125  				manifest.L0Sublevel(sublevel),
   126  			},
   127  		)
   128  	}
   129  	return levelSlices
   130  }
   131  
   132  // pickedCompaction contains information about a compaction that has already
   133  // been chosen, and is being constructed. Compaction construction info lives in
   134  // this struct, and is copied over into the compaction struct when that's
   135  // created.
   136  type pickedCompaction struct {
   137  	cmp Compare
   138  
   139  	// score of the chosen compaction. Taken from candidateLevelInfo.
   140  	score float64
   141  
   142  	// kind indicates the kind of compaction.
   143  	kind compactionKind
   144  
   145  	// startLevel is the level that is being compacted. Inputs from startLevel
   146  	// and outputLevel will be merged to produce a set of outputLevel files.
   147  	startLevel *compactionLevel
   148  
   149  	// outputLevel is the level that files are being produced in. outputLevel is
   150  	// equal to startLevel+1 except when:
   151  	//    - if startLevel is 0, the output level equals compactionPicker.baseLevel().
   152  	//    - in multilevel compaction, the output level is the lowest level involved in
   153  	//      the compaction
   154  	outputLevel *compactionLevel
   155  
   156  	// extraLevels contain additional levels in between the input and output
   157  	// levels that get compacted in multi level compactions
   158  	extraLevels []*compactionLevel
   159  
   160  	// adjustedOutputLevel is the output level used for the purpose of
   161  	// determining the target output file size, overlap bytes, and expanded
   162  	// bytes, taking into account the base level.
   163  	adjustedOutputLevel int
   164  
   165  	inputs []compactionLevel
   166  
   167  	// L0-specific compaction info. Set to a non-nil value for all compactions
   168  	// where startLevel == 0 that were generated by L0Sublevels.
   169  	lcf *manifest.L0CompactionFiles
   170  
    171  	// l0SublevelInfo is used for compactions out of L0. It is nil for all
    172  	// other compactions.
   173  	l0SublevelInfo []sublevelInfo
   174  
   175  	// maxOutputFileSize is the maximum size of an individual table created
   176  	// during compaction.
   177  	maxOutputFileSize uint64
   178  	// maxOverlapBytes is the maximum number of bytes of overlap allowed for a
   179  	// single output table with the tables in the grandparent level.
   180  	maxOverlapBytes uint64
    181  	// maxReadCompactionBytes is the maximum number of bytes a read compaction
    182  	// is allowed to overlap with in its output level. If the overlap is greater
    183  	// than maxReadCompactionBytes, then we don't proceed with the compaction.
   184  	maxReadCompactionBytes uint64
   185  
   186  	// The boundaries of the input data.
   187  	smallest InternalKey
   188  	largest  InternalKey
   189  
   190  	version *version
   191  }
   192  
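        // defaultOutputLevel returns the level that a compaction out of startLevel
        // writes to: baseLevel for L0 compactions, otherwise startLevel+1, clamped
        // to the last level. For example, with a base level of 4, a start level of
        // 0 maps to 4 and a start level of 5 maps to 6.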
   193  func defaultOutputLevel(startLevel, baseLevel int) int {
   194  	outputLevel := startLevel + 1
   195  	if startLevel == 0 {
   196  		outputLevel = baseLevel
   197  	}
   198  	if outputLevel >= numLevels-1 {
   199  		outputLevel = numLevels - 1
   200  	}
   201  	return outputLevel
   202  }
   203  
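        // newPickedCompaction constructs a pickedCompaction from startLevel into
        // outputLevel, deriving the target output file size, grandparent overlap
        // limit, and read compaction limit from the adjusted output level.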
   204  func newPickedCompaction(
   205  	opts *Options, cur *version, startLevel, outputLevel, baseLevel int,
   206  ) *pickedCompaction {
   207  	if startLevel > 0 && startLevel < baseLevel {
   208  		panic(fmt.Sprintf("invalid compaction: start level %d should not be empty (base level %d)",
   209  			startLevel, baseLevel))
   210  	}
   211  
   212  	adjustedOutputLevel := outputLevel
   213  	if adjustedOutputLevel > 0 {
   214  		// Output level is in the range [baseLevel,numLevels]. For the purpose of
   215  		// determining the target output file size, overlap bytes, and expanded
   216  		// bytes, we want to adjust the range to [1,numLevels].
   217  		adjustedOutputLevel = 1 + outputLevel - baseLevel
   218  	}
   219  
   220  	pc := &pickedCompaction{
   221  		cmp:                    opts.Comparer.Compare,
   222  		version:                cur,
   223  		inputs:                 []compactionLevel{{level: startLevel}, {level: outputLevel}},
   224  		adjustedOutputLevel:    adjustedOutputLevel,
   225  		maxOutputFileSize:      uint64(opts.Level(adjustedOutputLevel).TargetFileSize),
   226  		maxOverlapBytes:        maxGrandparentOverlapBytes(opts, adjustedOutputLevel),
   227  		maxReadCompactionBytes: maxReadCompactionBytes(opts, adjustedOutputLevel),
   228  	}
   229  	pc.startLevel = &pc.inputs[0]
   230  	pc.outputLevel = &pc.inputs[1]
   231  	return pc
   232  }
   233  
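        // newPickedCompactionFromL0 constructs a pickedCompaction from the L0 files
        // chosen by L0Sublevels, targeting either Lbase (isBase) or L0 itself for an
        // intra-L0 compaction.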
   234  func newPickedCompactionFromL0(
   235  	lcf *manifest.L0CompactionFiles, opts *Options, vers *version, baseLevel int, isBase bool,
   236  ) *pickedCompaction {
   237  	outputLevel := baseLevel
   238  	if !isBase {
   239  		outputLevel = 0 // Intra L0
   240  	}
   241  
   242  	pc := newPickedCompaction(opts, vers, 0, outputLevel, baseLevel)
   243  	pc.lcf = lcf
   244  	pc.outputLevel.level = outputLevel
   245  
   246  	// Manually build the compaction as opposed to calling
   247  	// pickAutoHelper. This is because L0Sublevels has already added
   248  	// any overlapping L0 SSTables that need to be added, and
    249  	// because compactions built by L0Sublevels do not necessarily
   250  	// pick contiguous sequences of files in pc.version.Levels[0].
   251  	files := make([]*manifest.FileMetadata, 0, len(lcf.Files))
   252  	iter := vers.Levels[0].Iter()
   253  	for f := iter.First(); f != nil; f = iter.Next() {
   254  		if lcf.FilesIncluded[f.L0Index] {
   255  			files = append(files, f)
   256  		}
   257  	}
   258  	pc.startLevel.files = manifest.NewLevelSliceSeqSorted(files)
   259  	return pc
   260  }
   261  
    262  // maybeExpandBounds is a helper function for setupInputs which ensures the
   263  // pickedCompaction's smallest and largest internal keys are updated iff
   264  // the candidate keys expand the key span. This avoids a bug for multi-level
   265  // compactions: during the second call to setupInputs, the picked compaction's
   266  // smallest and largest keys should not decrease the key span.
   267  func (pc *pickedCompaction) maybeExpandBounds(smallest InternalKey, largest InternalKey) {
   268  	emptyKey := InternalKey{}
   269  	if base.InternalCompare(pc.cmp, smallest, emptyKey) == 0 {
   270  		if base.InternalCompare(pc.cmp, largest, emptyKey) != 0 {
   271  			panic("either both candidate keys are empty or neither are empty")
   272  		}
   273  		return
   274  	}
   275  	if base.InternalCompare(pc.cmp, pc.smallest, emptyKey) == 0 {
   276  		if base.InternalCompare(pc.cmp, pc.largest, emptyKey) != 0 {
   277  			panic("either both pc keys are empty or neither are empty")
   278  		}
   279  		pc.smallest = smallest
   280  		pc.largest = largest
   281  		return
   282  	}
   283  	if base.InternalCompare(pc.cmp, pc.smallest, smallest) >= 0 {
   284  		pc.smallest = smallest
   285  	}
   286  	if base.InternalCompare(pc.cmp, pc.largest, largest) <= 0 {
   287  		pc.largest = largest
   288  	}
   289  }
   290  
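        // setupInputs expands the compaction's start level inputs to a clean atomic
        // unit, pulls in the overlapping output level tables, and optionally grows
        // the compaction, returning false if any required input is already
        // compacting.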
   291  func (pc *pickedCompaction) setupInputs(
   292  	opts *Options, diskAvailBytes uint64, startLevel *compactionLevel,
   293  ) bool {
   294  	// maxExpandedBytes is the maximum size of an expanded compaction. If
   295  	// growing a compaction results in a larger size, the original compaction
   296  	// is used instead.
   297  	maxExpandedBytes := expandedCompactionByteSizeLimit(
   298  		opts, pc.adjustedOutputLevel, diskAvailBytes,
   299  	)
   300  
   301  	// Expand the initial inputs to a clean cut.
   302  	var isCompacting bool
   303  	startLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, startLevel.files, false /* disableIsCompacting */)
   304  	if isCompacting {
   305  		return false
   306  	}
   307  	pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, startLevel.files.Iter()))
   308  
   309  	// Determine the sstables in the output level which overlap with the input
   310  	// sstables, and then expand those tables to a clean cut. No need to do
   311  	// this for intra-L0 compactions; outputLevel.files is left empty for those.
   312  	if startLevel.level != pc.outputLevel.level {
   313  		pc.outputLevel.files = pc.version.Overlaps(pc.outputLevel.level, pc.cmp, pc.smallest.UserKey,
   314  			pc.largest.UserKey, pc.largest.IsExclusiveSentinel())
   315  		pc.outputLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, pc.outputLevel.files,
   316  			false /* disableIsCompacting */)
   317  		if isCompacting {
   318  			return false
   319  		}
   320  		pc.maybeExpandBounds(manifest.KeyRange(pc.cmp,
   321  			startLevel.files.Iter(), pc.outputLevel.files.Iter()))
   322  	}
   323  
   324  	// Grow the sstables in startLevel.level as long as it doesn't affect the number
   325  	// of sstables included from pc.outputLevel.level.
   326  	if pc.lcf != nil && startLevel.level == 0 && pc.outputLevel.level != 0 {
   327  		// Call the L0-specific compaction extension method. Similar logic as
   328  		// pc.grow. Additional L0 files are optionally added to the compaction at
   329  		// this step. Note that the bounds passed in are not the bounds of the
   330  		// compaction, but rather the smallest and largest internal keys that
   331  		// the compaction cannot include from L0 without pulling in more Lbase
   332  		// files. Consider this example:
   333  		//
   334  		// L0:        c-d e+f g-h
   335  		// Lbase: a-b     e+f     i-j
   336  		//        a b c d e f g h i j
   337  		//
   338  		// The e-f files have already been chosen in the compaction. As pulling
    339  		// in more Lbase files is undesirable, the logic below will pass in
   340  		// smallest = b and largest = i to ExtendL0ForBaseCompactionTo, which
   341  		// will expand the compaction to include c-d and g-h from L0. The
   342  		// bounds passed in are exclusive; the compaction cannot be expanded
   343  		// to include files that "touch" it.
   344  		smallestBaseKey := base.InvalidInternalKey
   345  		largestBaseKey := base.InvalidInternalKey
   346  		if pc.outputLevel.files.Empty() {
   347  			baseIter := pc.version.Levels[pc.outputLevel.level].Iter()
   348  			if sm := baseIter.SeekLT(pc.cmp, pc.smallest.UserKey); sm != nil {
   349  				smallestBaseKey = sm.Largest
   350  			}
   351  			if la := baseIter.SeekGE(pc.cmp, pc.largest.UserKey); la != nil {
   352  				largestBaseKey = la.Smallest
   353  			}
   354  		} else {
   355  			// NB: We use Reslice to access the underlying level's files, but
   356  			// we discard the returned slice. The pc.outputLevel.files slice
   357  			// is not modified.
   358  			_ = pc.outputLevel.files.Reslice(func(start, end *manifest.LevelIterator) {
   359  				if sm := start.Prev(); sm != nil {
   360  					smallestBaseKey = sm.Largest
   361  				}
   362  				if la := end.Next(); la != nil {
   363  					largestBaseKey = la.Smallest
   364  				}
   365  			})
   366  		}
   367  
   368  		oldLcf := *pc.lcf
   369  		if pc.version.L0Sublevels.ExtendL0ForBaseCompactionTo(smallestBaseKey, largestBaseKey, pc.lcf) {
   370  			var newStartLevelFiles []*fileMetadata
   371  			iter := pc.version.Levels[0].Iter()
   372  			var sizeSum uint64
   373  			for j, f := 0, iter.First(); f != nil; j, f = j+1, iter.Next() {
   374  				if pc.lcf.FilesIncluded[f.L0Index] {
   375  					newStartLevelFiles = append(newStartLevelFiles, f)
   376  					sizeSum += f.Size
   377  				}
   378  			}
   379  			if sizeSum+pc.outputLevel.files.SizeSum() < maxExpandedBytes {
   380  				startLevel.files = manifest.NewLevelSliceSeqSorted(newStartLevelFiles)
   381  				pc.smallest, pc.largest = manifest.KeyRange(pc.cmp,
   382  					startLevel.files.Iter(), pc.outputLevel.files.Iter())
   383  			} else {
   384  				*pc.lcf = oldLcf
   385  			}
   386  		}
   387  	} else if pc.grow(pc.smallest, pc.largest, maxExpandedBytes, startLevel) {
   388  		pc.maybeExpandBounds(manifest.KeyRange(pc.cmp,
   389  			startLevel.files.Iter(), pc.outputLevel.files.Iter()))
   390  	}
   391  
   392  	if pc.startLevel.level == 0 {
   393  		// We don't change the input files for the compaction beyond this point.
   394  		pc.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
   395  	}
   396  
   397  	return true
   398  }
   399  
    400  // grow grows the number of inputs at startLevel without changing the number of
    401  // output-level files in the compaction, and returns whether the inputs grew. sm
    402  // and la are the smallest and largest InternalKeys in all of the inputs.
   403  func (pc *pickedCompaction) grow(
   404  	sm, la InternalKey, maxExpandedBytes uint64, startLevel *compactionLevel,
   405  ) bool {
   406  	if pc.outputLevel.files.Empty() {
   407  		return false
   408  	}
   409  	grow0 := pc.version.Overlaps(startLevel.level, pc.cmp, sm.UserKey,
   410  		la.UserKey, la.IsExclusiveSentinel())
   411  	grow0, isCompacting := expandToAtomicUnit(pc.cmp, grow0, false /* disableIsCompacting */)
   412  	if isCompacting {
   413  		return false
   414  	}
   415  	if grow0.Len() <= startLevel.files.Len() {
   416  		return false
   417  	}
   418  	if grow0.SizeSum()+pc.outputLevel.files.SizeSum() >= maxExpandedBytes {
   419  		return false
   420  	}
   421  	// We need to include the outputLevel iter because without it, in a multiLevel scenario,
   422  	// sm1 and la1 could shift the output level keyspace when pc.outputLevel.files is set to grow1.
   423  	sm1, la1 := manifest.KeyRange(pc.cmp, grow0.Iter(), pc.outputLevel.files.Iter())
   424  	grow1 := pc.version.Overlaps(pc.outputLevel.level, pc.cmp, sm1.UserKey,
   425  		la1.UserKey, la1.IsExclusiveSentinel())
   426  	grow1, isCompacting = expandToAtomicUnit(pc.cmp, grow1, false /* disableIsCompacting */)
   427  	if isCompacting {
   428  		return false
   429  	}
   430  	if grow1.Len() != pc.outputLevel.files.Len() {
   431  		return false
   432  	}
   433  	startLevel.files = grow0
   434  	pc.outputLevel.files = grow1
   435  	return true
   436  }
   437  
   438  // initMultiLevelCompaction returns true if it initiated a multilevel input
    439  // compaction. The current implementation never initiates a multilevel compaction.
   440  func (pc *pickedCompaction) initMultiLevelCompaction(
   441  	opts *Options, vers *version, levelMaxBytes [7]int64, diskAvailBytes uint64,
   442  ) bool {
   443  	return false
   444  }
   445  
   446  // expandToAtomicUnit expands the provided level slice within its level both
   447  // forwards and backwards to its "atomic compaction unit" boundaries, if
   448  // necessary.
   449  //
   450  // While picking compaction inputs, this is required to maintain the invariant
   451  // that the versions of keys at level+1 are older than the versions of keys at
   452  // level. Tables are added to the right of the current slice tables such that
   453  // the rightmost table has a "clean cut". A clean cut is either a change in
   454  // user keys, or when the largest key in the left sstable is a range tombstone
   455  // sentinel key (InternalKeyRangeDeleteSentinel).
   456  //
   457  // In addition to maintaining the seqnum invariant, expandToAtomicUnit is used
   458  // to provide clean boundaries for range tombstone truncation during
   459  // compaction. In order to achieve these clean boundaries, expandToAtomicUnit
   460  // needs to find a "clean cut" on the left edge of the compaction as well.
   461  // This is necessary in order for "atomic compaction units" to always be
   462  // compacted as a unit. Failure to do this leads to a subtle bug with
   463  // truncation of range tombstones to atomic compaction unit boundaries.
   464  // Consider the scenario:
   465  //
   466  //	L3:
   467  //	  12:[a#2,15-b#1,1]
   468  //	  13:[b#0,15-d#72057594037927935,15]
   469  //
   470  // These sstables contain a range tombstone [a-d)#2 which spans the two
   471  // sstables. The two sstables need to always be kept together. Compacting
   472  // sstable 13 independently of sstable 12 would result in:
   473  //
   474  //	L3:
   475  //	  12:[a#2,15-b#1,1]
   476  //	L4:
   477  //	  14:[b#0,15-d#72057594037927935,15]
   478  //
   479  // This state is still ok, but when sstable 12 is next compacted, its range
   480  // tombstones will be truncated at "b" (the largest key in its atomic
   481  // compaction unit). In the scenario here, that could result in b#1 becoming
   482  // visible when it should be deleted.
   483  //
   484  // isCompacting is returned true for any atomic units that contain files that
   485  // have in-progress compactions, i.e. FileMetadata.Compacting == true. If
   486  // disableIsCompacting is true, isCompacting always returns false. This helps
   487  // avoid spurious races from being detected when this method is used outside
   488  // of compaction picking code.
   489  //
   490  // TODO(jackson): Compactions and flushes no longer split a user key between two
   491  // sstables. We could perform a migration, re-compacting any sstables with split
   492  // user keys, which would allow us to remove atomic compaction unit expansion
   493  // code.
   494  func expandToAtomicUnit(
   495  	cmp Compare, inputs manifest.LevelSlice, disableIsCompacting bool,
   496  ) (slice manifest.LevelSlice, isCompacting bool) {
   497  	// NB: Inputs for L0 can't be expanded and *version.Overlaps guarantees
   498  	// that we get a 'clean cut.' For L0, Overlaps will return a slice without
   499  	// access to the rest of the L0 files, so it's OK to try to reslice.
   500  	if inputs.Empty() {
   501  		// Nothing to expand.
   502  		return inputs, false
   503  	}
   504  
   505  	inputs = inputs.Reslice(func(start, end *manifest.LevelIterator) {
   506  		iter := start.Clone()
   507  		iter.Prev()
   508  		for cur, prev := start.Current(), iter.Current(); prev != nil; cur, prev = start.Prev(), iter.Prev() {
   509  			if cur.IsCompacting() {
   510  				isCompacting = true
   511  			}
   512  			if cmp(prev.Largest.UserKey, cur.Smallest.UserKey) < 0 {
   513  				break
   514  			}
   515  			if prev.Largest.IsExclusiveSentinel() {
   516  				// The table prev has a largest key indicating that the user key
   517  				// prev.largest.UserKey doesn't actually exist in the table.
   518  				break
   519  			}
   520  			// prev.Largest.UserKey == cur.Smallest.UserKey, so we need to
   521  			// include prev in the compaction.
   522  		}
   523  
   524  		iter = end.Clone()
   525  		iter.Next()
   526  		for cur, next := end.Current(), iter.Current(); next != nil; cur, next = end.Next(), iter.Next() {
   527  			if cur.IsCompacting() {
   528  				isCompacting = true
   529  			}
   530  			if cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 {
   531  				break
   532  			}
   533  			if cur.Largest.IsExclusiveSentinel() {
   534  				// The table cur has a largest key indicating that the user key
   535  				// cur.largest.UserKey doesn't actually exist in the table.
   536  				break
   537  			}
   538  			// cur.Largest.UserKey == next.Smallest.UserKey, so we need to
   539  			// include next in the compaction.
   540  		}
   541  	})
   542  	inputIter := inputs.Iter()
   543  	isCompacting = !disableIsCompacting &&
   544  		(isCompacting || inputIter.First().IsCompacting() || inputIter.Last().IsCompacting())
   545  	return inputs, isCompacting
   546  }
   547  
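        // newCompactionPicker returns a compactionPickerByScore for the given
        // version and initializes its dynamic per-level max-bytes targets from the
        // current level sizes and in-progress compactions.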
   548  func newCompactionPicker(
   549  	v *version,
   550  	opts *Options,
   551  	inProgressCompactions []compactionInfo,
   552  	levelSizes [numLevels]int64,
   553  	diskAvailBytes func() uint64,
   554  ) compactionPicker {
   555  	p := &compactionPickerByScore{
   556  		opts:           opts,
   557  		vers:           v,
   558  		levelSizes:     levelSizes,
   559  		diskAvailBytes: diskAvailBytes,
   560  	}
   561  	p.initLevelMaxBytes(inProgressCompactions)
   562  	return p
   563  }
   564  
   565  // Information about a candidate compaction level that has been identified by
   566  // the compaction picker.
   567  type candidateLevelInfo struct {
   568  	// The score of the level to be compacted.
   569  	score     float64
   570  	origScore float64
   571  	level     int
   572  	// The level to compact to.
   573  	outputLevel int
   574  	// The file in level that will be compacted. Additional files may be
   575  	// picked by the compaction, and a pickedCompaction created for the
   576  	// compaction.
   577  	file manifest.LevelFile
   578  }
   579  
   580  // compensatedSize returns f's file size, inflated according to compaction
   581  // priorities.
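        //
        // For example (hypothetical numbers): a table whose Size is 10 MB and whose
        // point and range deletion tombstones are estimated to reclaim a further
        // 4 MB reports a compensated size of 14 MB, making it a more attractive
        // compaction candidate than its on-disk size alone suggests.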
   582  func compensatedSize(f *fileMetadata) uint64 {
   583  	sz := f.Size
   584  	// Add in the estimate of disk space that may be reclaimed by compacting
   585  	// the file's tombstones.
   586  	sz += f.Stats.PointDeletionsBytesEstimate
   587  	sz += f.Stats.RangeDeletionsBytesEstimate
   588  	return sz
   589  }
   590  
   591  // compensatedSizeAnnotator implements manifest.Annotator, annotating B-Tree
   592  // nodes with the sum of the files' compensated sizes. Its annotation type is
   593  // a *uint64. Compensated sizes may change once a table's stats are loaded
   594  // asynchronously, so its values are marked as cacheable only if a file's
   595  // stats have been loaded.
   596  type compensatedSizeAnnotator struct{}
   597  
   598  var _ manifest.Annotator = compensatedSizeAnnotator{}
   599  
   600  func (a compensatedSizeAnnotator) Zero(dst interface{}) interface{} {
   601  	if dst == nil {
   602  		return new(uint64)
   603  	}
   604  	v := dst.(*uint64)
   605  	*v = 0
   606  	return v
   607  }
   608  
   609  func (a compensatedSizeAnnotator) Accumulate(
   610  	f *fileMetadata, dst interface{},
   611  ) (v interface{}, cacheOK bool) {
   612  	vptr := dst.(*uint64)
   613  	*vptr = *vptr + compensatedSize(f)
   614  	return vptr, f.StatsValidLocked()
   615  }
   616  
   617  func (a compensatedSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
   618  	srcV := src.(*uint64)
   619  	dstV := dst.(*uint64)
   620  	*dstV = *dstV + *srcV
   621  	return dstV
   622  }
   623  
   624  // totalCompensatedSize computes the compensated size over a file metadata
   625  // iterator. Note that this function is linear in the files available to the
   626  // iterator. Use the compensatedSizeAnnotator if querying the total
   627  // compensated size of a level.
   628  func totalCompensatedSize(iter manifest.LevelIterator) uint64 {
   629  	var sz uint64
   630  	for f := iter.First(); f != nil; f = iter.Next() {
   631  		sz += compensatedSize(f)
   632  	}
   633  	return sz
   634  }
   635  
   636  // compactionPickerByScore holds the state and logic for picking a compaction. A
   637  // compaction picker is associated with a single version. A new compaction
   638  // picker is created and initialized every time a new version is installed.
   639  type compactionPickerByScore struct {
   640  	opts *Options
   641  	vers *version
   642  
    643  	// The level to target for L0 compactions. Levels L1 through baseLevel-1
    644  	// must be empty.
   645  	baseLevel int
   646  
   647  	// estimatedMaxWAmp is the estimated maximum write amp per byte that is
   648  	// added to L0.
   649  	estimatedMaxWAmp float64
   650  
   651  	// levelMaxBytes holds the dynamically adjusted max bytes setting for each
   652  	// level.
   653  	levelMaxBytes [numLevels]int64
   654  
   655  	// levelSizes holds the current size of each level.
   656  	levelSizes [numLevels]int64
   657  
   658  	// diskAvailBytes returns a cached statistic on the number of bytes
   659  	// available on disk, as reported by the filesystem. It's used to be more
   660  	// restrictive in expanding compactions if available disk space is
   661  	// limited.
   662  	//
   663  	// The cached value is updated whenever a file is deleted and
   664  	// whenever a compaction or flush completes. Since file removal is
   665  	// the primary means of reclaiming space, there is a rough bound on
   666  	// the statistic's staleness when available bytes is growing.
   667  	// Compactions and flushes are longer, slower operations and provide
   668  	// a much looser bound when available bytes is decreasing.
   669  	diskAvailBytes func() uint64
   670  }
   671  
   672  var _ compactionPicker = &compactionPickerByScore{}
   673  
   674  func (p *compactionPickerByScore) getScores(inProgress []compactionInfo) [numLevels]float64 {
   675  	var scores [numLevels]float64
   676  	for _, info := range p.calculateScores(inProgress) {
   677  		scores[info.level] = info.score
   678  	}
   679  	return scores
   680  }
   681  
   682  func (p *compactionPickerByScore) getBaseLevel() int {
   683  	if p == nil {
   684  		return 1
   685  	}
   686  	return p.baseLevel
   687  }
   688  
   689  func (p *compactionPickerByScore) getEstimatedMaxWAmp() float64 {
   690  	return p.estimatedMaxWAmp
   691  }
   692  
   693  // estimatedCompactionDebt estimates the number of bytes which need to be
   694  // compacted before the LSM tree becomes stable.
   695  func (p *compactionPickerByScore) estimatedCompactionDebt(l0ExtraSize uint64) uint64 {
   696  	if p == nil {
   697  		return 0
   698  	}
   699  
   700  	// We assume that all the bytes in L0 need to be compacted to Lbase. This is
   701  	// unlike the RocksDB logic that figures out whether L0 needs compaction.
   702  	bytesAddedToNextLevel := l0ExtraSize + uint64(p.levelSizes[0])
   703  	nextLevelSize := uint64(p.levelSizes[p.baseLevel])
   704  
   705  	var compactionDebt uint64
   706  	if bytesAddedToNextLevel > 0 && nextLevelSize > 0 {
   707  		// We only incur compaction debt if both L0 and Lbase contain data. If L0
   708  		// is empty, no compaction is necessary. If Lbase is empty, a move-based
   709  		// compaction from L0 would occur.
   710  		compactionDebt += bytesAddedToNextLevel + nextLevelSize
   711  	}
   712  
   713  	for level := p.baseLevel; level < numLevels-1; level++ {
   714  		levelSize := nextLevelSize + bytesAddedToNextLevel
   715  		nextLevelSize = uint64(p.levelSizes[level+1])
   716  		if levelSize > uint64(p.levelMaxBytes[level]) {
   717  			bytesAddedToNextLevel = levelSize - uint64(p.levelMaxBytes[level])
   718  			if nextLevelSize > 0 {
   719  				// We only incur compaction debt if the next level contains data. If the
   720  				// next level is empty, a move-based compaction would be used.
   721  				levelRatio := float64(nextLevelSize) / float64(levelSize)
   722  				// The current level contributes bytesAddedToNextLevel to compactions.
   723  				// The next level contributes levelRatio * bytesAddedToNextLevel.
   724  				compactionDebt += uint64(float64(bytesAddedToNextLevel) * (levelRatio + 1))
   725  			}
   726  		}
   727  	}
   728  
   729  	return compactionDebt
   730  }
   731  
   732  func (p *compactionPickerByScore) initLevelMaxBytes(inProgressCompactions []compactionInfo) {
   733  	// The levelMaxBytes calculations here differ from RocksDB in two ways:
   734  	//
   735  	// 1. The use of dbSize vs maxLevelSize. RocksDB uses the size of the maximum
   736  	//    level in L1-L6, rather than determining the size of the bottom level
    737  	//    based on the total amount of data in the DB. The RocksDB calculation is
   738  	//    problematic if L0 contains a significant fraction of data, or if the
   739  	//    level sizes are roughly equal and thus there is a significant fraction
   740  	//    of data outside of the largest level.
   741  	//
   742  	// 2. Not adjusting the size of Lbase based on L0. RocksDB computes
   743  	//    baseBytesMax as the maximum of the configured LBaseMaxBytes and the
   744  	//    size of L0. This is problematic because baseBytesMax is used to compute
   745  	//    the max size of lower levels. A very large baseBytesMax will result in
    746  	//    an overly large value for the size of lower levels which will cause
   747  	//    those levels not to be compacted even when they should be
   748  	//    compacted. This often results in "inverted" LSM shapes where Ln is
   749  	//    larger than Ln+1.
   750  
   751  	// Determine the first non-empty level and the total DB size.
   752  	firstNonEmptyLevel := -1
   753  	var dbSize int64
   754  	for level := 1; level < numLevels; level++ {
   755  		if p.levelSizes[level] > 0 {
   756  			if firstNonEmptyLevel == -1 {
   757  				firstNonEmptyLevel = level
   758  			}
   759  			dbSize += p.levelSizes[level]
   760  		}
   761  	}
   762  	for _, c := range inProgressCompactions {
   763  		if c.outputLevel == 0 || c.outputLevel == -1 {
   764  			continue
   765  		}
   766  		if c.inputs[0].level == 0 && (firstNonEmptyLevel == -1 || c.outputLevel < firstNonEmptyLevel) {
   767  			firstNonEmptyLevel = c.outputLevel
   768  		}
   769  	}
   770  
   771  	// Initialize the max-bytes setting for each level to "infinity" which will
   772  	// disallow compaction for that level. We'll fill in the actual value below
   773  	// for levels we want to allow compactions from.
   774  	for level := 0; level < numLevels; level++ {
   775  		p.levelMaxBytes[level] = math.MaxInt64
   776  	}
   777  
   778  	if dbSize == 0 {
    779  		// No levels from L1 and up contain any data. Target L0 compactions at the
    780  		// last level, or at the level to which there is an ongoing L0 compaction.
   781  		p.baseLevel = numLevels - 1
   782  		if firstNonEmptyLevel >= 0 {
   783  			p.baseLevel = firstNonEmptyLevel
   784  		}
   785  		return
   786  	}
   787  
   788  	dbSize += p.levelSizes[0]
   789  	bottomLevelSize := dbSize - dbSize/levelMultiplier
   790  
   791  	curLevelSize := bottomLevelSize
   792  	for level := numLevels - 2; level >= firstNonEmptyLevel; level-- {
   793  		curLevelSize = int64(float64(curLevelSize) / levelMultiplier)
   794  	}
   795  
   796  	// Compute base level (where L0 data is compacted to).
   797  	baseBytesMax := p.opts.LBaseMaxBytes
   798  	p.baseLevel = firstNonEmptyLevel
   799  	for p.baseLevel > 1 && curLevelSize > baseBytesMax {
   800  		p.baseLevel--
   801  		curLevelSize = int64(float64(curLevelSize) / levelMultiplier)
   802  	}
   803  
   804  	smoothedLevelMultiplier := 1.0
   805  	if p.baseLevel < numLevels-1 {
   806  		smoothedLevelMultiplier = math.Pow(
   807  			float64(bottomLevelSize)/float64(baseBytesMax),
   808  			1.0/float64(numLevels-p.baseLevel-1))
   809  	}
   810  
   811  	p.estimatedMaxWAmp = float64(numLevels-p.baseLevel) * (smoothedLevelMultiplier + 1)
   812  
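        	// As a worked example with assumed sizes: if baseBytesMax is 64 MB, the
        	// computed bottom level size is 6.4 GB, and baseLevel is 4, then the
        	// smoothed multiplier is (6.4 GB / 64 MB)^(1/(7-4-1)) = 10, yielding
        	// level targets of roughly 64 MB, 640 MB, and 6.4 GB for L4, L5, and L6.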
   813  	levelSize := float64(baseBytesMax)
   814  	for level := p.baseLevel; level < numLevels; level++ {
   815  		if level > p.baseLevel && levelSize > 0 {
   816  			levelSize *= smoothedLevelMultiplier
   817  		}
   818  		// Round the result since test cases use small target level sizes, which
   819  		// can be impacted by floating-point imprecision + integer truncation.
   820  		roundedLevelSize := math.Round(levelSize)
   821  		if roundedLevelSize > float64(math.MaxInt64) {
   822  			p.levelMaxBytes[level] = math.MaxInt64
   823  		} else {
   824  			p.levelMaxBytes[level] = int64(roundedLevelSize)
   825  		}
   826  	}
   827  }
   828  
   829  func calculateSizeAdjust(inProgressCompactions []compactionInfo) [numLevels]int64 {
   830  	// Compute a size adjustment for each level based on the in-progress
   831  	// compactions. We subtract the compensated size of start level inputs.
   832  	// Since compensated file sizes may be compensated because they reclaim
   833  	// space from the output level's files, we add the real file size to the
   834  	// output level. This is slightly different from RocksDB's behavior, which
   835  	// simply elides compacting files from the level size calculation.
   836  	var sizeAdjust [numLevels]int64
   837  	for i := range inProgressCompactions {
   838  		c := &inProgressCompactions[i]
   839  
   840  		for _, input := range c.inputs {
   841  			real := int64(input.files.SizeSum())
   842  			compensated := int64(totalCompensatedSize(input.files.Iter()))
   843  
   844  			if input.level != c.outputLevel {
   845  				sizeAdjust[input.level] -= compensated
   846  				if c.outputLevel != -1 {
   847  					sizeAdjust[c.outputLevel] += real
   848  				}
   849  			}
   850  		}
   851  	}
   852  	return sizeAdjust
   853  }
   854  
   855  func levelCompensatedSize(lm manifest.LevelMetadata) uint64 {
   856  	return *lm.Annotation(compensatedSizeAnnotator{}).(*uint64)
   857  }
   858  
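        // calculateScores computes a compaction score for every level, adjusts each
        // level's score by the score of the next level down, and returns the
        // candidates sorted by decreasing score.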
   859  func (p *compactionPickerByScore) calculateScores(
   860  	inProgressCompactions []compactionInfo,
   861  ) [numLevels]candidateLevelInfo {
   862  	var scores [numLevels]candidateLevelInfo
   863  	for i := range scores {
   864  		scores[i].level = i
   865  		scores[i].outputLevel = i + 1
   866  	}
   867  	scores[0] = p.calculateL0Score(inProgressCompactions)
   868  
   869  	sizeAdjust := calculateSizeAdjust(inProgressCompactions)
   870  	for level := 1; level < numLevels; level++ {
   871  		levelSize := int64(levelCompensatedSize(p.vers.Levels[level])) + sizeAdjust[level]
   872  		scores[level].score = float64(levelSize) / float64(p.levelMaxBytes[level])
   873  		scores[level].origScore = scores[level].score
   874  	}
   875  
   876  	// Adjust each level's score by the score of the next level. If the next
   877  	// level has a high score, and is thus a priority for compaction, this
   878  	// reduces the priority for compacting the current level. If the next level
   879  	// has a low score (i.e. it is below its target size), this increases the
   880  	// priority for compacting the current level.
   881  	//
   882  	// The effect of this adjustment is to help prioritize compactions in lower
   883  	// levels. The following shows the new score and original score. In this
   884  	// scenario, L0 has 68 sublevels. L3 (a.k.a. Lbase) is significantly above
   885  	// its target size. The original score prioritizes compactions from those two
   886  	// levels, but doing so ends up causing a future problem: data piles up in
   887  	// the higher levels, starving L5->L6 compactions, and to a lesser degree
   888  	// starving L4->L5 compactions.
   889  	//
   890  	//        adjusted   original
   891  	//           score      score       size   max-size
   892  	//   L0        3.2       68.0      2.2 G          -
   893  	//   L3        3.2       21.1      1.3 G       64 M
   894  	//   L4        3.4        6.7      3.1 G      467 M
   895  	//   L5        3.4        2.0      6.6 G      3.3 G
   896  	//   L6        0.6        0.6       14 G       24 G
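        	//
        	// In the table above, for instance, L5's adjusted score of 3.4 is its
        	// original score of 2.0 divided by L6's (unadjusted) score of 0.6, with
        	// the displayed values rounded.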
   897  	var prevLevel int
   898  	for level := p.baseLevel; level < numLevels; level++ {
   899  		if scores[prevLevel].score >= 1 {
   900  			// Avoid absurdly large scores by placing a floor on the score that we'll
   901  			// adjust a level by. The value of 0.01 was chosen somewhat arbitrarily
   902  			const minScore = 0.01
   903  			if scores[level].score >= minScore {
   904  				scores[prevLevel].score /= scores[level].score
   905  			} else {
   906  				scores[prevLevel].score /= minScore
   907  			}
   908  		}
   909  		prevLevel = level
   910  	}
   911  
   912  	sort.Sort(sortCompactionLevelsDecreasingScore(scores[:]))
   913  	return scores
   914  }
   915  
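        // calculateL0Score computes the candidateLevelInfo for L0, taking the
        // maximum of a sublevel-count-based score and a file-count-based score.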
   916  func (p *compactionPickerByScore) calculateL0Score(
   917  	inProgressCompactions []compactionInfo,
   918  ) candidateLevelInfo {
   919  	var info candidateLevelInfo
   920  	info.outputLevel = p.baseLevel
   921  
   922  	// If L0Sublevels are present, use the sublevel count to calculate the
   923  	// score. The base vs intra-L0 compaction determination happens in pickAuto,
   924  	// not here.
   925  	info.score = float64(2*p.vers.L0Sublevels.MaxDepthAfterOngoingCompactions()) /
   926  		float64(p.opts.L0CompactionThreshold)
   927  
   928  	// Also calculate a score based on the file count but use it only if it
   929  	// produces a higher score than the sublevel-based one. This heuristic is
   930  	// designed to accommodate cases where L0 is accumulating non-overlapping
    931  	// files. Letting too many non-overlapping files accumulate in few
   932  	// sublevels is undesirable, because:
   933  	// 1) we can produce a massive backlog to compact once files do overlap.
   934  	// 2) constructing L0 sublevels has a runtime that grows superlinearly with
   935  	//    the number of files in L0 and must be done while holding D.mu.
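        	//
        	// For example, if L0CompactionFileThreshold were 500, then 1000
        	// non-compacting L0 files would yield a file-count score of 2.0 even if
        	// those files stacked into only a handful of sublevels.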
   936  	noncompactingFiles := p.vers.Levels[0].Len()
   937  	for _, c := range inProgressCompactions {
   938  		for _, cl := range c.inputs {
   939  			if cl.level == 0 {
   940  				noncompactingFiles -= cl.files.Len()
   941  			}
   942  		}
   943  	}
   944  	fileScore := float64(noncompactingFiles) / float64(p.opts.L0CompactionFileThreshold)
   945  	if info.score < fileScore {
   946  		info.score = fileScore
   947  	}
   948  	return info
   949  }
   950  
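        // pickFile selects the seed file within level to compact into outputLevel,
        // preferring the file with the smallest ratio of overlapping output-level
        // bytes to its own compensated size. It returns false if no eligible
        // (non-compacting) file is found.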
   951  func (p *compactionPickerByScore) pickFile(
   952  	level, outputLevel int, earliestSnapshotSeqNum uint64,
   953  ) (manifest.LevelFile, bool) {
   954  	// Select the file within the level to compact. We want to minimize write
   955  	// amplification, but also ensure that deletes are propagated to the
   956  	// bottom level in a timely fashion so as to reclaim disk space. A table's
   957  	// smallest sequence number provides a measure of its age. The ratio of
   958  	// overlapping-bytes / table-size gives an indication of write
    959  	// amplification (a smaller ratio is preferable).
   960  	//
    961  	// The current heuristic is based off the RocksDB kMinOverlappingRatio
   962  	// heuristic. It chooses the file with the minimum overlapping ratio with
   963  	// the target level, which minimizes write amplification.
   964  	//
   965  	// It uses a "compensated size" for the denominator, which is the file
   966  	// size but artificially inflated by an estimate of the space that may be
    967  	// reclaimed through compaction. Currently, we compensate for point and range
    968  	// deletions, but only with rough estimates of the reclaimable bytes. This
    969  	// differs from RocksDB, which only compensates for point tombstones and
    970  	// only if they exceed the number of non-deletion entries in the table.
   971  	//
   972  	// TODO(peter): For concurrent compactions, we may want to try harder to
   973  	// pick a seed file whose resulting compaction bounds do not overlap with
   974  	// an in-progress compaction.
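        	//
        	// As an illustration with made-up sizes: a candidate file with a 10 MB
        	// compensated size that overlaps 20 MB of output-level tables gets a
        	// scaled ratio of 20 MB * 1024 / 10 MB = 2048, while a sibling
        	// overlapping only 5 MB gets 512 and is preferred as the seed file.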
   975  
   976  	cmp := p.opts.Comparer.Compare
   977  	startIter := p.vers.Levels[level].Iter()
   978  	outputIter := p.vers.Levels[outputLevel].Iter()
   979  
   980  	var file manifest.LevelFile
   981  	smallestRatio := uint64(math.MaxUint64)
   982  
   983  	outputFile := outputIter.First()
   984  
   985  	for f := startIter.First(); f != nil; f = startIter.Next() {
   986  		var overlappingBytes uint64
   987  
   988  		// Trim any output-level files smaller than f.
   989  		for outputFile != nil && base.InternalCompare(cmp, outputFile.Largest, f.Smallest) < 0 {
   990  			outputFile = outputIter.Next()
   991  		}
   992  
   993  		compacting := f.IsCompacting()
   994  		for outputFile != nil && base.InternalCompare(cmp, outputFile.Smallest, f.Largest) < 0 {
   995  			overlappingBytes += outputFile.Size
   996  			compacting = compacting || outputFile.IsCompacting()
   997  
   998  			// For files in the bottommost level of the LSM, the
   999  			// Stats.RangeDeletionsBytesEstimate field is set to the estimate
  1000  			// of bytes /within/ the file itself that may be dropped by
  1001  			// recompacting the file. These bytes from obsolete keys would not
  1002  			// need to be rewritten if we compacted `f` into `outputFile`, so
  1003  			// they don't contribute to write amplification. Subtracting them
  1004  			// out of the overlapping bytes helps prioritize these compactions
  1005  			// that are cheaper than their file sizes suggest.
  1006  			if outputLevel == numLevels-1 && outputFile.LargestSeqNum < earliestSnapshotSeqNum {
  1007  				overlappingBytes -= outputFile.Stats.RangeDeletionsBytesEstimate
  1008  			}
  1009  
  1010  			// If the file in the next level extends beyond f's largest key,
  1011  			// break out and don't advance outputIter because f's successor
  1012  			// might also overlap.
  1013  			if base.InternalCompare(cmp, outputFile.Largest, f.Largest) > 0 {
  1014  				break
  1015  			}
  1016  			outputFile = outputIter.Next()
  1017  		}
  1018  
  1019  		// If the input level file or one of the overlapping files is
  1020  		// compacting, we're not going to be able to compact this file
  1021  		// anyways, so skip it.
  1022  		if compacting {
  1023  			continue
  1024  		}
  1025  
  1026  		scaledRatio := overlappingBytes * 1024 / compensatedSize(f)
  1027  		if scaledRatio < smallestRatio && !f.IsCompacting() {
  1028  			smallestRatio = scaledRatio
  1029  			file = startIter.Take()
  1030  		}
  1031  	}
  1032  	return file, file.FileMetadata != nil
  1033  }
  1034  
  1035  // pickAuto picks the best compaction, if any.
  1036  //
  1037  // On each call, pickAuto computes per-level size adjustments based on
  1038  // in-progress compactions, and computes a per-level score. The levels are
  1039  // iterated over in decreasing score order trying to find a valid compaction
  1040  // anchored at that level.
  1041  //
  1042  // If a score-based compaction cannot be found, pickAuto falls back to looking
  1043  // for an elision-only compaction to remove obsolete keys.
  1044  func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompaction) {
  1045  	// Compaction concurrency is controlled by L0 read-amp. We allow one
  1046  	// additional compaction per L0CompactionConcurrency sublevels, as well as
  1047  	// one additional compaction per CompactionDebtConcurrency bytes of
  1048  	// compaction debt. Compaction concurrency is tied to L0 sublevels as that
  1049  	// signal is independent of the database size. We tack on the compaction
  1050  	// debt as a second signal to prevent compaction concurrency from dropping
  1051  	// significantly right after a base compaction finishes, and before those
  1052  	// bytes have been compacted further down the LSM.
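        	//
        	// For instance, assuming L0CompactionConcurrency is 10 and
        	// CompactionDebtConcurrency is 1 GB, with one compaction already running
        	// a second one is only picked once the L0 read-amp reaches 10 or the
        	// estimated compaction debt reaches 1 GB.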
  1053  	if n := len(env.inProgressCompactions); n > 0 {
  1054  		l0ReadAmp := p.vers.L0Sublevels.MaxDepthAfterOngoingCompactions()
  1055  		compactionDebt := int(p.estimatedCompactionDebt(0))
  1056  		ccSignal1 := n * p.opts.Experimental.L0CompactionConcurrency
  1057  		ccSignal2 := n * p.opts.Experimental.CompactionDebtConcurrency
  1058  		if l0ReadAmp < ccSignal1 && compactionDebt < ccSignal2 {
  1059  			return nil
  1060  		}
  1061  	}
  1062  
  1063  	scores := p.calculateScores(env.inProgressCompactions)
  1064  
  1065  	// TODO(peter): Either remove, or change this into an event sent to the
  1066  	// EventListener.
  1067  	logCompaction := func(pc *pickedCompaction) {
  1068  		var buf bytes.Buffer
  1069  		for i := 0; i < numLevels; i++ {
  1070  			if i != 0 && i < p.baseLevel {
  1071  				continue
  1072  			}
  1073  
  1074  			var info *candidateLevelInfo
  1075  			for j := range scores {
  1076  				if scores[j].level == i {
  1077  					info = &scores[j]
  1078  					break
  1079  				}
  1080  			}
  1081  
  1082  			marker := " "
  1083  			if pc.startLevel.level == info.level {
  1084  				marker = "*"
  1085  			}
  1086  			fmt.Fprintf(&buf, "  %sL%d: %5.1f  %5.1f  %8s  %8s",
  1087  				marker, info.level, info.score, info.origScore,
  1088  				humanize.Int64(int64(totalCompensatedSize(p.vers.Levels[info.level].Iter()))),
  1089  				humanize.Int64(p.levelMaxBytes[info.level]),
  1090  			)
  1091  
  1092  			count := 0
  1093  			for i := range env.inProgressCompactions {
  1094  				c := &env.inProgressCompactions[i]
  1095  				if c.inputs[0].level != info.level {
  1096  					continue
  1097  				}
  1098  				count++
  1099  				if count == 1 {
  1100  					fmt.Fprintf(&buf, "  [")
  1101  				} else {
  1102  					fmt.Fprintf(&buf, " ")
  1103  				}
  1104  				fmt.Fprintf(&buf, "L%d->L%d", c.inputs[0].level, c.outputLevel)
  1105  			}
  1106  			if count > 0 {
  1107  				fmt.Fprintf(&buf, "]")
  1108  			}
  1109  			fmt.Fprintf(&buf, "\n")
  1110  		}
  1111  		p.opts.Logger.Infof("pickAuto: L%d->L%d\n%s",
  1112  			pc.startLevel.level, pc.outputLevel.level, buf.String())
  1113  	}
  1114  
  1115  	// Check for a score-based compaction. "scores" has been sorted in order of
  1116  	// decreasing score. For each level with a score >= 1, we attempt to find a
   1117  	// compaction anchored at that level.
  1118  	for i := range scores {
  1119  		info := &scores[i]
  1120  		if info.score < 1 {
  1121  			break
  1122  		}
  1123  		if info.level == numLevels-1 {
  1124  			continue
  1125  		}
  1126  
  1127  		if info.level == 0 {
  1128  			pc = pickL0(env, p.opts, p.vers, p.baseLevel, p.diskAvailBytes)
  1129  			// Fail-safe to protect against compacting the same sstable
  1130  			// concurrently.
  1131  			if pc != nil && !inputRangeAlreadyCompacting(env, pc) {
  1132  				pc.score = info.score
  1133  				// TODO(peter): remove
  1134  				if false {
  1135  					logCompaction(pc)
  1136  				}
  1137  				return pc
  1138  			}
  1139  			continue
  1140  		}
  1141  
  1142  		// info.level > 0
  1143  		var ok bool
  1144  		info.file, ok = p.pickFile(info.level, info.outputLevel, env.earliestSnapshotSeqNum)
  1145  		if !ok {
  1146  			continue
  1147  		}
  1148  
  1149  		pc := pickAutoLPositive(env, p.opts, p.vers, *info, p.baseLevel, p.diskAvailBytes, p.levelMaxBytes)
  1150  		// Fail-safe to protect against compacting the same sstable concurrently.
  1151  		if pc != nil && !inputRangeAlreadyCompacting(env, pc) {
  1152  			pc.score = info.score
  1153  			// TODO(peter): remove
  1154  			if false {
  1155  				logCompaction(pc)
  1156  			}
  1157  			return pc
  1158  		}
  1159  	}
  1160  
  1161  	// Check for L6 files with tombstones that may be elided. These files may
  1162  	// exist if a snapshot prevented the elision of a tombstone or because of
  1163  	// a move compaction. These are low-priority compactions because they
  1164  	// don't help us keep up with writes, just reclaim disk space.
  1165  	if pc := p.pickElisionOnlyCompaction(env); pc != nil {
  1166  		return pc
  1167  	}
  1168  
  1169  	if pc := p.pickReadTriggeredCompaction(env); pc != nil {
  1170  		return pc
  1171  	}
  1172  
  1173  	// NB: This should only be run if a read compaction wasn't
  1174  	// scheduled.
  1175  	//
  1176  	// We won't be scheduling a read compaction right now, and in
  1177  	// read heavy workloads, compactions won't be scheduled frequently
  1178  	// because flushes aren't frequent. So we need to signal to the
  1179  	// iterator to schedule a compaction when it adds compactions to
  1180  	// the read compaction queue.
  1181  	//
   1182  	// We need the nil check here because without it, some tests which
   1183  	// don't set that variable would fail. Since there's a chance that
   1184  	// one of those tests wouldn't want extra compactions to be
   1185  	// scheduled, the check is added here instead of setting
   1186  	// rescheduleReadCompaction in those tests.
  1187  	if env.readCompactionEnv.rescheduleReadCompaction != nil {
  1188  		*env.readCompactionEnv.rescheduleReadCompaction = true
  1189  	}
  1190  
  1191  	// At the lowest possible compaction-picking priority, look for files marked
  1192  	// for compaction. Pebble will mark files for compaction if they have atomic
  1193  	// compaction units that span multiple files. While current Pebble code does
  1194  	// not construct such sstables, RocksDB and earlier versions of Pebble may
  1195  	// have created them. These split user keys form sets of files that must be
  1196  	// compacted together for correctness (referred to as "atomic compaction
  1197  	// units" within the code). Rewrite them in-place.
  1198  	//
  1199  	// It's also possible that a file may have been marked for compaction by
  1200  	// even earlier versions of Pebble code, since FileMetadata's
  1201  	// MarkedForCompaction field is persisted in the manifest. That's okay. We
  1202  	// previously would've ignored the designation, whereas now we'll re-compact
  1203  	// the file in place.
  1204  	if p.vers.Stats.MarkedForCompaction > 0 {
  1205  		if pc := p.pickRewriteCompaction(env); pc != nil {
  1206  			return pc
  1207  		}
  1208  	}
  1209  
  1210  	return nil
  1211  }
  1212  
  1213  // elisionOnlyAnnotator implements the manifest.Annotator interface,
  1214  // annotating B-Tree nodes with the *fileMetadata of a file meeting the
  1215  // obsolete keys criteria for an elision-only compaction within the subtree.
  1216  // If multiple files meet the criteria, it chooses whichever file has the
  1217  // lowest LargestSeqNum. The lowest LargestSeqNum file will be the first
  1218  // eligible for an elision-only compaction once snapshots less than or equal
  1219  // to its LargestSeqNum are closed.
  1220  type elisionOnlyAnnotator struct{}
  1221  
  1222  var _ manifest.Annotator = elisionOnlyAnnotator{}
  1223  
  1224  func (a elisionOnlyAnnotator) Zero(interface{}) interface{} {
  1225  	return nil
  1226  }
  1227  
  1228  func (a elisionOnlyAnnotator) Accumulate(f *fileMetadata, dst interface{}) (interface{}, bool) {
  1229  	if f.IsCompacting() {
  1230  		return dst, true
  1231  	}
  1232  	if !f.StatsValidLocked() {
  1233  		return dst, false
  1234  	}
  1235  	// Bottommost files are large and not worthwhile to compact just
  1236  	// to remove a few tombstones. Consider a file ineligible if its
  1237  	// own range deletions delete less than 10% of its data and its
  1238  	// deletion tombstones make up less than 10% of its entries.
  1239  	//
  1240  	// TODO(jackson): This does not account for duplicate user keys
  1241  	// which may be collapsed. Ideally, we would have 'obsolete keys'
  1242  	// statistics that would include tombstones, the keys that are
  1243  	// dropped by tombstones and duplicated user keys. See #847.
  1244  	//
  1245  	// Note that tables that contain exclusively range keys (i.e. no point keys,
  1246  	// `NumEntries` and `RangeDeletionsBytesEstimate` are both zero) are excluded
  1247  	// from elision-only compactions.
  1248  	// TODO(travers): Consider an alternative heuristic for elision of range-keys.
  1249  	if f.Stats.RangeDeletionsBytesEstimate*10 < f.Size &&
  1250  		f.Stats.NumDeletions*10 <= f.Stats.NumEntries {
  1251  		return dst, true
  1252  	}
  1253  	if dst == nil {
  1254  		return f, true
  1255  	} else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum {
  1256  		return f, true
  1257  	}
  1258  	return dst, true
  1259  }
  1260  
  1261  func (a elisionOnlyAnnotator) Merge(v interface{}, accum interface{}) interface{} {
  1262  	if v == nil {
  1263  		return accum
  1264  	}
  1265  	// If we haven't accumulated an eligible file yet, or f's LargestSeqNum is
  1266  	// less than the accumulated file's, use f.
  1267  	if accum == nil {
  1268  		return v
  1269  	}
  1270  	f := v.(*fileMetadata)
  1271  	accumV := accum.(*fileMetadata)
  1272  	if accumV == nil || accumV.LargestSeqNum > f.LargestSeqNum {
  1273  		return f
  1274  	}
  1275  	return accumV
  1276  }
  1277  
  1278  // markedForCompactionAnnotator implements the manifest.Annotator interface,
  1279  // annotating B-Tree nodes with the *fileMetadata of a file that is marked for
  1280  // compaction within the subtree. If multiple files meet the criteria, it
  1281  // chooses whichever file has the lowest LargestSeqNum.
  1282  type markedForCompactionAnnotator struct{}
  1283  
  1284  var _ manifest.Annotator = markedForCompactionAnnotator{}
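
        // pickRewriteCompaction consults this annotation level by level via
        // p.vers.Levels[l].Annotation(markedForCompactionAnnotator{}); see below.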
  1285  
  1286  func (a markedForCompactionAnnotator) Zero(interface{}) interface{} {
  1287  	return nil
  1288  }
  1289  
  1290  func (a markedForCompactionAnnotator) Accumulate(
  1291  	f *fileMetadata, dst interface{},
  1292  ) (interface{}, bool) {
  1293  	if !f.MarkedForCompaction {
  1294  		// Not marked for compaction; return dst.
  1295  		return dst, true
  1296  	}
  1297  	return markedMergeHelper(f, dst)
  1298  }
  1299  
  1300  func (a markedForCompactionAnnotator) Merge(v interface{}, accum interface{}) interface{} {
  1301  	if v == nil {
  1302  		return accum
  1303  	}
  1304  	accum, _ = markedMergeHelper(v.(*fileMetadata), accum)
  1305  	return accum
  1306  }
  1307  
  1308  // REQUIRES: f is non-nil, and f.MarkedForCompaction=true.
  1309  func markedMergeHelper(f *fileMetadata, dst interface{}) (interface{}, bool) {
  1310  	if dst == nil {
  1311  		return f, true
  1312  	} else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum {
  1313  		return f, true
  1314  	}
  1315  	return dst, true
  1316  }
  1317  
  1318  // pickElisionOnlyCompaction looks for compactions of sstables in the
  1319  // bottommost level containing obsolete records that may now be dropped.
  1320  func (p *compactionPickerByScore) pickElisionOnlyCompaction(
  1321  	env compactionEnv,
  1322  ) (pc *pickedCompaction) {
  1323  	v := p.vers.Levels[numLevels-1].Annotation(elisionOnlyAnnotator{})
  1324  	if v == nil {
  1325  		return nil
  1326  	}
  1327  	candidate := v.(*fileMetadata)
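        	// The annotation may have been cached before this file started compacting,
        	// so re-check IsCompacting here. Elision also requires that no open snapshot
        	// can still observe the obsolete keys: the candidate's LargestSeqNum must be
        	// below the earliest snapshot's sequence number.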
  1328  	if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
  1329  		return nil
  1330  	}
  1331  	lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate)
  1332  	if lf == nil {
  1333  		panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
  1334  	}
  1335  
  1336  	// Construct a picked compaction of the elision candidate's atomic
  1337  	// compaction unit.
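        	// The atomic unit pulls in any adjacent files whose boundaries share a user
        	// key with the candidate, so that all versions of a user key are compacted
        	// together (see the expandToAtomicUnit call below).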
  1338  	pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel)
  1339  	pc.kind = compactionKindElisionOnly
  1340  	var isCompacting bool
  1341  	pc.startLevel.files, isCompacting = expandToAtomicUnit(p.opts.Comparer.Compare, lf.Slice(), false /* disableIsCompacting */)
  1342  	if isCompacting {
  1343  		return nil
  1344  	}
  1345  	pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
  1346  	// Fail-safe to protect against compacting the same sstable concurrently.
  1347  	if !inputRangeAlreadyCompacting(env, pc) {
  1348  		return pc
  1349  	}
  1350  	return nil
  1351  }
  1352  
  1353  // pickRewriteCompaction attempts to construct a compaction that
  1354  // rewrites a file marked for compaction. pickRewriteCompaction will
  1355  // pull in adjacent files in the file's atomic compaction unit if
  1356  // necessary. A rewrite compaction outputs files to the same level as
  1357  // the input level.
  1358  func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) {
  1359  	for l := numLevels - 1; l >= 0; l-- {
  1360  		v := p.vers.Levels[l].Annotation(markedForCompactionAnnotator{})
  1361  		if v == nil {
  1362  			// Try the next level.
  1363  			continue
  1364  		}
  1365  		candidate := v.(*fileMetadata)
  1366  		if candidate.IsCompacting() {
  1367  			// Try the next level.
  1368  			continue
  1369  		}
  1370  		lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate)
  1371  		if lf == nil {
  1372  			panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, l))
  1373  		}
  1374  
  1375  		inputs := lf.Slice()
  1376  		// L0 files generated by a flush have never been split such that
  1377  		// adjacent files can contain the same user key. So we do not need to
  1378  		// rewrite an atomic compaction unit for L0. Note that there is nothing
  1379  		// preventing two different flushes from producing files that are
  1380  		// non-overlapping from an InternalKey perspective, but span the same
  1381  		// user key. However, such files cannot be in the same L0 sublevel,
  1382  		// since each sublevel requires non-overlapping user keys (unlike other
  1383  		// levels).
  1384  		if l > 0 {
  1385  			// Find this file's atomic compaction unit. This is only relevant
  1386  			// for levels L1+.
  1387  			var isCompacting bool
  1388  			inputs, isCompacting = expandToAtomicUnit(
  1389  				p.opts.Comparer.Compare,
  1390  				inputs,
  1391  				false, /* disableIsCompacting */
  1392  			)
  1393  			if isCompacting {
  1394  				// Try the next level.
  1395  				continue
  1396  			}
  1397  		}
  1398  
  1399  		pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel)
  1400  		pc.outputLevel.level = l
  1401  		pc.kind = compactionKindRewrite
  1402  		pc.startLevel.files = inputs
  1403  		pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
  1404  
  1405  		// Fail-safe to protect against compacting the same sstable concurrently.
  1406  		if !inputRangeAlreadyCompacting(env, pc) {
  1407  			if pc.startLevel.level == 0 {
  1408  				pc.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
  1409  			}
  1410  			return pc
  1411  		}
  1412  	}
  1413  	return nil
  1414  }
  1415  
  1416  // pickAutoLPositive picks an automatic compaction for the candidate
  1417  // file in a positive-numbered level. This function must not be used for
  1418  // L0.
  1419  func pickAutoLPositive(
  1420  	env compactionEnv,
  1421  	opts *Options,
  1422  	vers *version,
  1423  	cInfo candidateLevelInfo,
  1424  	baseLevel int,
  1425  	diskAvailBytes func() uint64,
  1426  	levelMaxBytes [7]int64,
  1427  ) (pc *pickedCompaction) {
  1428  	if cInfo.level == 0 {
  1429  		panic("bitalostable: pickAutoLPositive called for L0")
  1430  	}
  1431  
  1432  	pc = newPickedCompaction(opts, vers, cInfo.level, defaultOutputLevel(cInfo.level, baseLevel), baseLevel)
  1433  	if pc.outputLevel.level != cInfo.outputLevel {
  1434  		panic("bitalostable: compaction picked unexpected output level")
  1435  	}
  1436  	pc.startLevel.files = cInfo.file.Slice()
  1437  	// Files in level 0 may overlap each other, so pick up all overlapping ones.
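        	// (Given the cInfo.level == 0 panic above, this branch appears to be
        	// unreachable in practice and is retained defensively.)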
  1438  	if pc.startLevel.level == 0 {
  1439  		cmp := opts.Comparer.Compare
  1440  		smallest, largest := manifest.KeyRange(cmp, pc.startLevel.files.Iter())
  1441  		pc.startLevel.files = vers.Overlaps(0, cmp, smallest.UserKey,
  1442  			largest.UserKey, largest.IsExclusiveSentinel())
  1443  		if pc.startLevel.files.Empty() {
  1444  			panic("bitalostable: empty compaction")
  1445  		}
  1446  	}
  1447  
  1448  	if !pc.setupInputs(opts, diskAvailBytes(), pc.startLevel) {
  1449  		return nil
  1450  	}
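        	// When multi-level compactions are enabled, optionally extend the compaction
        	// with an additional output level; the inputs for the deepest extra level
        	// must also be set up successfully, or the compaction is abandoned.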
  1451  	if opts.Experimental.MultiLevelCompaction &&
  1452  		pc.initMultiLevelCompaction(opts, vers, levelMaxBytes, diskAvailBytes()) {
  1453  		if !pc.setupInputs(opts, diskAvailBytes(), pc.extraLevels[len(pc.extraLevels)-1]) {
  1454  			return nil
  1455  		}
  1456  	}
  1457  	return pc
  1458  }
  1459  
  1460  // pickL0 is a helper that picks compactions originating from L0, using
  1461  // information about L0 sublevels to generate the compaction.
  1462  func pickL0(
  1463  	env compactionEnv, opts *Options, vers *version, baseLevel int, diskAvailBytes func() uint64,
  1464  ) (pc *pickedCompaction) {
  1465  	// It is important to pass information about Lbase files to L0Sublevels
  1466  	// so it can pick a compaction that does not conflict with an Lbase => Lbase+1
  1467  	// compaction. Without this, we observed reduced concurrency of L0=>Lbase
  1468  	// compactions, and increased read amplification in L0.
  1469  	//
  1470  	// TODO(bilal) Remove the minCompactionDepth parameter once fixing it at 1
  1471  	// has been shown to not cause a performance regression.
  1472  	lcf, err := vers.L0Sublevels.PickBaseCompaction(1, vers.Levels[baseLevel].Slice())
  1473  	if err != nil {
  1474  		opts.Logger.Infof("error when picking base compaction: %s", err)
  1475  		return
  1476  	}
  1477  	if lcf != nil {
  1478  		pc = newPickedCompactionFromL0(lcf, opts, vers, baseLevel, true)
  1479  		pc.setupInputs(opts, diskAvailBytes(), pc.startLevel)
  1480  		if pc.startLevel.files.Empty() {
  1481  			opts.Logger.Fatalf("empty compaction chosen")
  1482  		}
  1483  		return pc
  1484  	}
  1485  
  1486  	// Couldn't choose a base compaction. Try choosing an intra-L0
  1487  	// compaction. Note that we pass in minIntraL0Count here as opposed to 1,
  1488  	// since choosing a single-sublevel intra-L0 compaction is
  1489  	// counterproductive.
  1490  	lcf, err = vers.L0Sublevels.PickIntraL0Compaction(env.earliestUnflushedSeqNum, minIntraL0Count)
  1491  	if err != nil {
  1492  		opts.Logger.Infof("error when picking intra-L0 compaction: %s", err)
  1493  		return
  1494  	}
  1495  	if lcf != nil {
  1496  		pc = newPickedCompactionFromL0(lcf, opts, vers, 0, false)
  1497  		if !pc.setupInputs(opts, diskAvailBytes(), pc.startLevel) {
  1498  			return nil
  1499  		}
  1500  		if pc.startLevel.files.Empty() {
  1501  			opts.Logger.Fatalf("empty compaction chosen")
  1502  		}
  1503  		{
  1504  			iter := pc.startLevel.files.Iter()
  1505  			if iter.First() == nil || iter.Next() == nil {
  1506  				// A single-file intra-L0 compaction is unproductive.
  1507  				return nil
  1508  			}
  1509  		}
  1510  
  1511  		pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
  1512  	}
  1513  	return pc
  1514  }
  1515  
  1516  func (p *compactionPickerByScore) pickManual(
  1517  	env compactionEnv, manual *manualCompaction,
  1518  ) (pc *pickedCompaction, retryLater bool) {
  1519  	if p == nil {
  1520  		return nil, false
  1521  	}
  1522  
  1523  	outputLevel := manual.level + 1
  1524  	if manual.level == 0 {
  1525  		outputLevel = p.baseLevel
  1526  	} else if manual.level < p.baseLevel {
  1527  		// The start level for a compaction must be >= Lbase. A manual
  1528  		// compaction could have been created adhering to that condition, and
  1529  		// then an automatic compaction came in and compacted all of the
  1530  		// sstables in Lbase to Lbase+1 which caused Lbase to change. Simply
  1531  		// ignore this manual compaction as there is nothing to do (manual.level
  1532  		// points to an empty level).
  1533  		return nil, false
  1534  	}
  1535  	// This conflictsWithInProgress call is necessary for the manual compaction to
  1536  	// be retried when it conflicts with an ongoing automatic compaction. Without
  1537  	// it, the compaction is dropped due to pc.setupInputs returning false since
  1538  	// the input/output range is already being compacted, and the manual
  1539  	// compaction ends with a non-compacted LSM.
  1540  	if conflictsWithInProgress(manual, outputLevel, env.inProgressCompactions, p.opts.Comparer.Compare) {
  1541  		return nil, true
  1542  	}
  1543  	pc = pickManualHelper(p.opts, manual, p.vers, p.baseLevel, p.diskAvailBytes, p.levelMaxBytes)
  1544  	if pc == nil {
  1545  		return nil, false
  1546  	}
  1547  	if pc.outputLevel.level != outputLevel {
  1548  		if len(pc.extraLevels) > 0 {
  1549  			// multilevel compactions relax this invariant
  1550  		} else {
  1551  			panic("bitalostable: compaction picked unexpected output level")
  1552  		}
  1553  	}
  1554  	// Fail-safe to protect against compacting the same sstable concurrently.
  1555  	if inputRangeAlreadyCompacting(env, pc) {
  1556  		return nil, true
  1557  	}
  1558  	return pc, false
  1559  }
  1560  
  1561  func pickManualHelper(
  1562  	opts *Options,
  1563  	manual *manualCompaction,
  1564  	vers *version,
  1565  	baseLevel int,
  1566  	diskAvailBytes func() uint64,
  1567  	levelMaxBytes [7]int64,
  1568  ) (pc *pickedCompaction) {
  1569  	pc = newPickedCompaction(opts, vers, manual.level, defaultOutputLevel(manual.level, baseLevel), baseLevel)
  1570  	manual.outputLevel = pc.outputLevel.level
  1571  	cmp := opts.Comparer.Compare
  1572  	pc.startLevel.files = vers.Overlaps(manual.level, cmp, manual.start, manual.end, false)
  1573  	if pc.startLevel.files.Empty() {
  1574  		// Nothing to do
  1575  		return nil
  1576  	}
  1577  	if !pc.setupInputs(opts, diskAvailBytes(), pc.startLevel) {
  1578  		return nil
  1579  	}
  1580  	if opts.Experimental.MultiLevelCompaction && pc.startLevel.level > 0 &&
  1581  		pc.initMultiLevelCompaction(opts, vers, levelMaxBytes, diskAvailBytes()) {
  1582  		if !pc.setupInputs(opts, diskAvailBytes(), pc.extraLevels[len(pc.extraLevels)-1]) {
  1583  			return nil
  1584  		}
  1585  	}
  1586  	return pc
  1587  }
  1588  
  1589  func (p *compactionPickerByScore) pickReadTriggeredCompaction(
  1590  	env compactionEnv,
  1591  ) (pc *pickedCompaction) {
  1592  	// If a flush is in progress or expected soon, more writes are on the way and
  1593  	// we will shortly be scheduling more write-focused compactions. In that case,
  1594  	// skip read-triggered compactions, as they are lower priority.
  1595  	if env.readCompactionEnv.flushing || env.readCompactionEnv.readCompactions == nil {
  1596  		return nil
  1597  	}
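        	// Drain queued read-compaction suggestions until one can be turned into a
        	// picked compaction; suggestions that are stale or that conflict with
        	// in-progress work are discarded.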
  1598  	for env.readCompactionEnv.readCompactions.size > 0 {
  1599  		rc := env.readCompactionEnv.readCompactions.remove()
  1600  		if pc = pickReadTriggeredCompactionHelper(p, rc, env); pc != nil {
  1601  			break
  1602  		}
  1603  	}
  1604  	return pc
  1605  }
  1606  
  1607  func pickReadTriggeredCompactionHelper(
  1608  	p *compactionPickerByScore, rc *readCompaction, env compactionEnv,
  1609  ) (pc *pickedCompaction) {
  1610  	cmp := p.opts.Comparer.Compare
  1611  	overlapSlice := p.vers.Overlaps(rc.level, cmp, rc.start, rc.end, false /* exclusiveEnd */)
  1612  	if overlapSlice.Empty() {
  1613  		// If there is no overlap, the file that generated this suggestion must
  1614  		// have been compacted away, so we don't proceed to compact the same key
  1615  		// range again.
  1616  		return nil
  1617  	}
  1618  
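        	// Verify that the file that generated this suggestion still exists in the
        	// overlapping slice; if it has since been compacted away, the suggestion is
        	// stale.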
  1619  	iter := overlapSlice.Iter()
  1620  	var fileMatches bool
  1621  	for f := iter.First(); f != nil; f = iter.Next() {
  1622  		if f.FileNum == rc.fileNum {
  1623  			fileMatches = true
  1624  			break
  1625  		}
  1626  	}
  1627  	if !fileMatches {
  1628  		return nil
  1629  	}
  1630  
  1631  	pc = newPickedCompaction(p.opts, p.vers, rc.level, defaultOutputLevel(rc.level, p.baseLevel), p.baseLevel)
  1632  
  1633  	pc.startLevel.files = overlapSlice
  1634  	if !pc.setupInputs(p.opts, p.diskAvailBytes(), pc.startLevel) {
  1635  		return nil
  1636  	}
  1637  	if inputRangeAlreadyCompacting(env, pc) {
  1638  		return nil
  1639  	}
  1640  	pc.kind = compactionKindRead
  1641  
  1642  	// Prevent read compactions which are too wide.
  1643  	outputOverlaps := pc.version.Overlaps(
  1644  		pc.outputLevel.level, pc.cmp, pc.smallest.UserKey,
  1645  		pc.largest.UserKey, pc.largest.IsExclusiveSentinel())
  1646  	if outputOverlaps.SizeSum() > pc.maxReadCompactionBytes {
  1647  		return nil
  1648  	}
  1649  
  1650  	// Prevent compactions which start with a small seed file X, but overlap
  1651  	// with over allowedCompactionWidth * X file sizes in the output layer.
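        	// For example (illustrative numbers): a compaction seeded by ~1 MB of input
        	// files is rejected if it would overlap more than ~35 MB in the output level.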
  1652  	const allowedCompactionWidth = 35
  1653  	if outputOverlaps.SizeSum() > overlapSlice.SizeSum()*allowedCompactionWidth {
  1654  		return nil
  1655  	}
  1656  
  1657  	return pc
  1658  }
  1659  
  1660  func (p *compactionPickerByScore) forceBaseLevel1() {
  1661  	p.baseLevel = 1
  1662  }
  1663  
  1664  func inputRangeAlreadyCompacting(env compactionEnv, pc *pickedCompaction) bool {
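        	// Reject the compaction outright if any of its input files is already being
        	// compacted.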
  1665  	for _, cl := range pc.inputs {
  1666  		iter := cl.files.Iter()
  1667  		for f := iter.First(); f != nil; f = iter.Next() {
  1668  			if f.IsCompacting() {
  1669  				return true
  1670  			}
  1671  		}
  1672  	}
  1673  
  1674  	// Look for active compactions outputting to the same region of the key
  1675  	// space in the same output level. Two potential compactions may conflict
  1676  	// without sharing input files if there are no files in the output level
  1677  	// that overlap with the intersection of the compactions' key spaces.
  1678  	//
  1679  	// Consider an active L0->Lbase compaction compacting two L0 files one
  1680  	// [a-f] and the other [t-z] into Lbase.
  1681  	//
  1682  	// L0
  1683  	//     ↦ 000100  ↤                           ↦  000101   ↤
  1684  	// L1
  1685  	//     ↦ 000004  ↤
  1686  	//     a b c d e f g h i j k l m n o p q r s t u v w x y z
  1687  	//
  1688  	// If a new file 000102 [j-p] is flushed while the existing compaction is
  1689  	// still ongoing, the new file would not be in any compacting sublevel
  1690  	// intervals and would not overlap with any Lbase files that are also
  1691  	// compacting. However, this compaction cannot be picked because the
  1692  	// compaction's output key space [j-p] would overlap the existing
  1693  	// compaction's output key space [a-z].
  1694  	//
  1695  	// L0
  1696  	//     ↦ 000100* ↤       ↦   000102  ↤       ↦  000101*  ↤
  1697  	// L1
  1698  	//     ↦ 000004* ↤
  1699  	//     a b c d e f g h i j k l m n o p q r s t u v w x y z
  1700  	//
  1701  	// * - currently compacting
  1702  	if pc.outputLevel != nil && pc.outputLevel.level != 0 {
  1703  		for _, c := range env.inProgressCompactions {
  1704  			if pc.outputLevel.level != c.outputLevel {
  1705  				continue
  1706  			}
  1707  			if base.InternalCompare(pc.cmp, c.largest, pc.smallest) < 0 ||
  1708  				base.InternalCompare(pc.cmp, c.smallest, pc.largest) > 0 {
  1709  				continue
  1710  			}
  1711  
  1712  			// The picked compaction and the in-progress compaction c are
  1713  			// outputting to the same region of the key space of the same
  1714  			// level.
  1715  			return true
  1716  		}
  1717  	}
  1718  	return false
  1719  }
  1720  
  1721  // conflictsWithInProgress checks if there are any in-progress compactions with overlapping keyspace.
  1722  func conflictsWithInProgress(
  1723  	manual *manualCompaction, outputLevel int, inProgressCompactions []compactionInfo, cmp Compare,
  1724  ) bool {
  1725  	for _, c := range inProgressCompactions {
  1726  		if (c.outputLevel == manual.level || c.outputLevel == outputLevel) &&
  1727  			isUserKeysOverlapping(manual.start, manual.end, c.smallest.UserKey, c.largest.UserKey, cmp) {
  1728  			return true
  1729  		}
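        		// The output keyspaces did not conflict; also check each of c's input
        		// levels, using the user-key range spanned by the first and last files
        		// in the level slice.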
  1730  		for _, in := range c.inputs {
  1731  			if in.files.Empty() {
  1732  				continue
  1733  			}
  1734  			iter := in.files.Iter()
  1735  			smallest := iter.First().Smallest.UserKey
  1736  			largest := iter.Last().Largest.UserKey
  1737  			if (in.level == manual.level || in.level == outputLevel) &&
  1738  				isUserKeysOverlapping(manual.start, manual.end, smallest, largest, cmp) {
  1739  				return true
  1740  			}
  1741  		}
  1742  	}
  1743  	return false
  1744  }
  1745  
  1746  func isUserKeysOverlapping(x1, x2, y1, y2 []byte, cmp Compare) bool {
  1747  	return cmp(x1, y2) <= 0 && cmp(y1, x2) <= 0
  1748  }
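
        // isUserKeysOverlappingExample is an illustrative sketch (not used by the
        // picker) showing the closed-interval semantics of isUserKeysOverlapping:
        // ranges that merely share a boundary user key are reported as overlapping.
        // The function name and the use of bytes.Compare as the comparator are
        // examples only, not part of the picker's API.
        func isUserKeysOverlappingExample() bool {
        	// [a, f] and [f, z] touch at user key "f", so they overlap.
        	return isUserKeysOverlapping([]byte("a"), []byte("f"), []byte("f"), []byte("z"), bytes.Compare)
        }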