github.com/cockroachdb/pebble@v1.1.5/internal/manifest/l0_sublevels.go (about)

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"math"
    11  	"sort"
    12  	"strings"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/pebble/internal/base"
    16  	"github.com/cockroachdb/pebble/internal/invariants"
    17  )
    18  
    19  // errInvalidL0SublevelsOpt is for use in AddL0Files when the incremental
    20  // sublevel generation optimization failed, and NewL0Sublevels must be called.
    21  var errInvalidL0SublevelsOpt = errors.New("pebble: L0 sublevel generation optimization cannot be used")
    22  
    23  // Intervals are of the form [start, end) with no gap between intervals. Each
    24  // file overlaps perfectly with a sequence of intervals. This perfect overlap
    25  // occurs because the union of file boundary keys is used to pick intervals.
    26  // However the largest key in a file is inclusive, so when it is used as
    27  // an interval, the actual key is ImmediateSuccessor(key). We don't have the
    28  // ImmediateSuccessor function to do this computation, so we instead keep an
    29  // isLargest bool to remind the code about this fact. This is used for
    30  // comparisons in the following manner:
    31  // - intervalKey{k, false} < intervalKey{k, true}
    32  // - k1 < k2 -> intervalKey{k1, _} < intervalKey{k2, _}.
    33  //
    34  // Note that the file's largest key is exclusive if the internal key
    35  // has a trailer matching the rangedel sentinel key. In this case, we set
    36  // isLargest to false for end interval computation.
    37  //
    38  // For example, consider three files with bounds [a,e], [b,g], and [e,j]. The
    39  // interval keys produced would be intervalKey{a, false}, intervalKey{b, false},
    40  // intervalKey{e, false}, intervalKey{e, true}, intervalKey{g, true} and
    41  // intervalKey{j, true}, resulting in intervals
    42  // [a, b), [b, (e, false)), [(e,false), (e, true)), [(e, true), (g, true)) and
    43  // [(g, true), (j, true)). The first file overlaps with the first three
    44  // perfectly, the second file overlaps with the second through to fourth
    45  // intervals, and the third file overlaps with the last three.
    46  //
    47  // The intervals are indexed starting from 0, with the index of the interval
    48  // being the index of the start key of the interval.
    49  //
    50  // In addition to helping with compaction picking, we use interval indices
    51  // to assign each file an interval range once. Subsequent operations, say
    52  // picking overlapping files for a compaction, only need to use the index
    53  // numbers and so avoid expensive byte slice comparisons.
type intervalKey struct {
	// key is the user key at this interval boundary. It is taken from a
	// file's smallest or largest user key.
	key       []byte
	// isLargest is true when key came from a file's inclusive largest key,
	// meaning the boundary conceptually sits just after key. It is false for
	// smallest keys and for largest keys that are exclusive sentinels.
	isLargest bool
}
    58  
    59  // intervalKeyTemp is used in the sortAndSweep step. It contains additional metadata
    60  // which is used to generate the {min,max}IntervalIndex for files.
type intervalKeyTemp struct {
	// intervalKey is the boundary key contributed by fileMeta.
	intervalKey intervalKey
	// fileMeta is the file this boundary was generated from; its
	// {min,max}IntervalIndex is written by setFileIntervalIndex.
	fileMeta    *FileMetadata
	// isEndKey is true when this boundary is the file's largest (right) bound
	// and false when it is the file's smallest (left) bound.
	isEndKey    bool
}
    66  
    67  func (i *intervalKeyTemp) setFileIntervalIndex(idx int) {
    68  	if i.isEndKey {
    69  		// This is the right endpoint of some file interval, so the
    70  		// file.maxIntervalIndex must be j - 1 as maxIntervalIndex is
    71  		// inclusive.
    72  		i.fileMeta.maxIntervalIndex = idx - 1
    73  		return
    74  	}
    75  	// This is the left endpoint for some file interval, so the
    76  	// file.minIntervalIndex must be j.
    77  	i.fileMeta.minIntervalIndex = idx
    78  }
    79  
    80  func intervalKeyCompare(cmp Compare, a, b intervalKey) int {
    81  	rv := cmp(a.key, b.key)
    82  	if rv == 0 {
    83  		if a.isLargest && !b.isLargest {
    84  			return +1
    85  		}
    86  		if !a.isLargest && b.isLargest {
    87  			return -1
    88  		}
    89  	}
    90  	return rv
    91  }
    92  
    93  type intervalKeySorter struct {
    94  	keys []intervalKeyTemp
    95  	cmp  Compare
    96  }
    97  
    98  func (s intervalKeySorter) Len() int { return len(s.keys) }
    99  func (s intervalKeySorter) Less(i, j int) bool {
   100  	return intervalKeyCompare(s.cmp, s.keys[i].intervalKey, s.keys[j].intervalKey) < 0
   101  }
   102  func (s intervalKeySorter) Swap(i, j int) {
   103  	s.keys[i], s.keys[j] = s.keys[j], s.keys[i]
   104  }
   105  
   106  // sortAndSweep will sort the intervalKeys using intervalKeySorter, remove the
   107  // duplicate fileIntervals, and set the {min, max}IntervalIndex for the files.
   108  func sortAndSweep(keys []intervalKeyTemp, cmp Compare) []intervalKeyTemp {
   109  	if len(keys) == 0 {
   110  		return nil
   111  	}
   112  	sorter := intervalKeySorter{keys: keys, cmp: cmp}
   113  	sort.Sort(sorter)
   114  
   115  	// intervalKeys are generated using the file bounds. Specifically, there are
   116  	// 2 intervalKeys for each file, and len(keys) = 2 * number of files. Each
   117  	// `intervalKeyTemp` stores information about which file it was generated
   118  	// from, and whether the key represents the end key of the file. So, as
   119  	// we're deduplicating the `keys` slice, we're guaranteed to iterate over
   120  	// the interval keys belonging to each of the files. Since the
   121  	// file.{min,max}IntervalIndex points to the position of the files bounds in
   122  	// the deduplicated `keys` slice, we can determine
   123  	// file.{min,max}IntervalIndex during the iteration.
   124  	i := 0
   125  	j := 0
   126  	for i < len(keys) {
   127  		// loop invariant: j <= i
   128  		currKey := keys[i]
   129  		keys[j] = keys[i]
   130  
   131  		for {
   132  			keys[i].setFileIntervalIndex(j)
   133  			i++
   134  			if i >= len(keys) || intervalKeyCompare(cmp, currKey.intervalKey, keys[i].intervalKey) != 0 {
   135  				break
   136  			}
   137  		}
   138  		j++
   139  	}
   140  	return keys[:j]
   141  }
   142  
   143  // A key interval of the form [start, end). The end is not represented here
   144  // since it is implicit in the start of the next interval. The last interval is
   145  // an exception but we don't need to ever lookup the end of that interval; the
   146  // last fileInterval will only act as an end key marker. The set of intervals
   147  // is const after initialization.
type fileInterval struct {
	// index is the position of this interval within the ordered slice of
	// intervals it belongs to.
	index    int
	// startKey is the (inclusive) start of the interval; the end is implied
	// by the startKey of the following interval.
	startKey intervalKey

	// True iff some file in this interval is compacting to base. Such intervals
	// cannot have any files participate in L0 -> Lbase compactions.
	isBaseCompacting bool

	// The min and max intervals index across all the files that overlap with
	// this interval. Inclusive on both sides.
	filesMinIntervalIndex int
	filesMaxIntervalIndex int

	// True if another interval that has a file extending into this interval is
	// undergoing a compaction into Lbase. In other words, this bool is true if
	// any interval in [filesMinIntervalIndex, filesMaxIntervalIndex] has
	// isBaseCompacting set to true. This lets the compaction picker
	// de-prioritize this interval for picking compactions, since there's a high
	// chance that a base compaction with a sufficient height of sublevels
	// rooted at this interval could not be chosen due to the ongoing base
	// compaction in the other interval. If the file straddling the two
	// intervals is at a sufficiently high sublevel (with enough compactible
	// files below it to satisfy minCompactionDepth), this is not an issue, but
	// to optimize for quickly picking base compactions far away from other base
	// compactions, this bool is used as a heuristic (but not as a complete
	// disqualifier).
	intervalRangeIsBaseCompacting bool

	// All files in this interval, in increasing sublevel order.
	files []*FileMetadata

	// len(files) - compactingFileCount is the stack depth that requires
	// starting new compactions. This metric is not precise since the
	// compactingFileCount can include files that are part of N (where N > 1)
	// intra-L0 compactions, so the stack depth after those complete will be
	// len(files) - compactingFileCount + N. We ignore this imprecision since we
	// don't want to track which files are part of which intra-L0 compaction.
	compactingFileCount int

	// Interpolated from files in this interval. For files spanning multiple
	// intervals, we assume an equal distribution of bytes across all those
	// intervals.
	estimatedBytes uint64
}
   192  
   193  // Helper type for any cases requiring a bool slice.
   194  type bitSet []bool
   195  
   196  func newBitSet(n int) bitSet {
   197  	return make([]bool, n)
   198  }
   199  
   200  func (b *bitSet) markBit(i int) {
   201  	(*b)[i] = true
   202  }
   203  
   204  func (b *bitSet) markBits(start, end int) {
   205  	for i := start; i < end; i++ {
   206  		(*b)[i] = true
   207  	}
   208  }
   209  
   210  func (b *bitSet) clearAllBits() {
   211  	for i := range *b {
   212  		(*b)[i] = false
   213  	}
   214  }
   215  
   216  // L0Compaction describes an active compaction with inputs from L0.
type L0Compaction struct {
	// Smallest and Largest are the key bounds of the compaction.
	// NOTE(review): presumably the bounds span all of the compaction's
	// inputs; the consumers are outside this part of the file — confirm.
	Smallest  InternalKey
	Largest   InternalKey
	// IsIntraL0 marks the compaction as an intra-L0 compaction.
	// NOTE(review): presumably false means the compaction outputs to a lower
	// level — confirm against the callers.
	IsIntraL0 bool
}
   222  
   223  // L0Sublevels represents a sublevel view of SSTables in L0. Tables in one
   224  // sublevel are non-overlapping in key ranges, and keys in higher-indexed
   225  // sublevels shadow older versions in lower-indexed sublevels. These invariants
   226  // are similar to the regular level invariants, except with higher indexed
   227  // sublevels having newer keys as opposed to lower indexed levels.
   228  //
   229  // There is no limit to the number of sublevels that can exist in L0 at any
   230  // time, however read and compaction performance is best when there are as few
   231  // sublevels as possible.
type L0Sublevels struct {
	// Levels are ordered from oldest sublevel to youngest sublevel in the
	// outer slice, and the inner slice contains non-overlapping files for
	// that sublevel in increasing key order. Levels is constructed from
	// levelFiles and is used by callers that require a LevelSlice. The below two
	// fields are treated as immutable once created in NewL0Sublevels.
	Levels     []LevelSlice
	levelFiles [][]*FileMetadata

	// cmp compares user keys; it is used when sorting and comparing interval
	// keys.
	cmp       Compare
	// formatKey is the key formatter supplied to NewL0Sublevels.
	formatKey base.FormatKey

	// fileBytes is the running sum of the sizes of all files added to the
	// sublevels.
	fileBytes uint64
	// All the L0 files, ordered from oldest to youngest.
	levelMetadata *LevelMetadata

	// The file intervals in increasing key order.
	orderedIntervals []fileInterval

	// Keys to break flushes at.
	flushSplitUserKeys [][]byte

	// Only used to check invariants. Set by AddL0Files on its receiver, which
	// panics (when invariants are enabled) if it is called a second time on
	// the same receiver.
	addL0FilesCalled bool
}
   257  
   258  type sublevelSorter []*FileMetadata
   259  
   260  // Len implements sort.Interface.
   261  func (sl sublevelSorter) Len() int {
   262  	return len(sl)
   263  }
   264  
   265  // Less implements sort.Interface.
   266  func (sl sublevelSorter) Less(i, j int) bool {
   267  	return sl[i].minIntervalIndex < sl[j].minIntervalIndex
   268  }
   269  
   270  // Swap implements sort.Interface.
   271  func (sl sublevelSorter) Swap(i, j int) {
   272  	sl[i], sl[j] = sl[j], sl[i]
   273  }
   274  
   275  // NewL0Sublevels creates an L0Sublevels instance for a given set of L0 files.
   276  // These files must all be in L0 and must be sorted by seqnum (see
   277  // SortBySeqNum). During interval iteration, when flushSplitMaxBytes bytes are
   278  // exceeded in the range of intervals since the last flush split key, a flush
   279  // split key is added.
   280  //
   281  // This method can be called without DB.mu being held, so any DB.mu protected
   282  // fields in FileMetadata cannot be accessed here, such as Compacting and
   283  // IsIntraL0Compacting. Those fields are accessed in InitCompactingFileInfo
   284  // instead.
   285  func NewL0Sublevels(
   286  	levelMetadata *LevelMetadata, cmp Compare, formatKey base.FormatKey, flushSplitMaxBytes int64,
   287  ) (*L0Sublevels, error) {
   288  	s := &L0Sublevels{cmp: cmp, formatKey: formatKey}
   289  	s.levelMetadata = levelMetadata
   290  	keys := make([]intervalKeyTemp, 0, 2*s.levelMetadata.Len())
   291  	iter := levelMetadata.Iter()
   292  	for i, f := 0, iter.First(); f != nil; i, f = i+1, iter.Next() {
   293  		f.L0Index = i
   294  		keys = append(keys, intervalKeyTemp{
   295  			intervalKey: intervalKey{key: f.Smallest.UserKey},
   296  			fileMeta:    f,
   297  			isEndKey:    false,
   298  		})
   299  		keys = append(keys, intervalKeyTemp{
   300  			intervalKey: intervalKey{
   301  				key:       f.Largest.UserKey,
   302  				isLargest: !f.Largest.IsExclusiveSentinel(),
   303  			},
   304  			fileMeta: f,
   305  			isEndKey: true,
   306  		})
   307  	}
   308  	keys = sortAndSweep(keys, cmp)
   309  	// All interval indices reference s.orderedIntervals.
   310  	s.orderedIntervals = make([]fileInterval, len(keys))
   311  	for i := range keys {
   312  		s.orderedIntervals[i] = fileInterval{
   313  			index:                 i,
   314  			startKey:              keys[i].intervalKey,
   315  			filesMinIntervalIndex: i,
   316  			filesMaxIntervalIndex: i,
   317  		}
   318  	}
   319  	// Initialize minIntervalIndex and maxIntervalIndex for each file, and use that
   320  	// to update intervals.
   321  	for f := iter.First(); f != nil; f = iter.Next() {
   322  		if err := s.addFileToSublevels(f, false /* checkInvariant */); err != nil {
   323  			return nil, err
   324  		}
   325  	}
   326  	// Sort each sublevel in increasing key order.
   327  	for i := range s.levelFiles {
   328  		sort.Sort(sublevelSorter(s.levelFiles[i]))
   329  	}
   330  
   331  	// Construct a parallel slice of sublevel B-Trees.
   332  	// TODO(jackson): Consolidate and only use the B-Trees.
   333  	for _, sublevelFiles := range s.levelFiles {
   334  		tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles)
   335  		s.Levels = append(s.Levels, ls)
   336  		tr.Release()
   337  	}
   338  
   339  	s.calculateFlushSplitKeys(flushSplitMaxBytes)
   340  	return s, nil
   341  }
   342  
   343  // Helper function to merge new intervalKeys into an existing slice of old
   344  // fileIntervals, into result. Returns the new result and a slice of ints
   345  // mapping old interval indices to new ones. The added intervalKeys do not need
   346  // to be sorted; they get sorted and deduped in this function.
   347  func mergeIntervals(
   348  	old, result []fileInterval, added []intervalKeyTemp, compare Compare,
   349  ) ([]fileInterval, []int) {
   350  	sorter := intervalKeySorter{keys: added, cmp: compare}
   351  	sort.Sort(sorter)
   352  
   353  	oldToNewMap := make([]int, len(old))
   354  	i := 0
   355  	j := 0
   356  
   357  	for i < len(old) || j < len(added) {
   358  		for j > 0 && j < len(added) && intervalKeyCompare(compare, added[j-1].intervalKey, added[j].intervalKey) == 0 {
   359  			added[j].setFileIntervalIndex(len(result) - 1)
   360  			j++
   361  		}
   362  		if i >= len(old) && j >= len(added) {
   363  			break
   364  		}
   365  		var cmp int
   366  		if i >= len(old) {
   367  			cmp = +1
   368  		}
   369  		if j >= len(added) {
   370  			cmp = -1
   371  		}
   372  		if cmp == 0 {
   373  			cmp = intervalKeyCompare(compare, old[i].startKey, added[j].intervalKey)
   374  		}
   375  		switch {
   376  		case cmp <= 0:
   377  			// Shallow-copy the existing interval.
   378  			newInterval := old[i]
   379  			result = append(result, newInterval)
   380  			oldToNewMap[i] = len(result) - 1
   381  			i++
   382  			if cmp == 0 {
   383  				added[j].setFileIntervalIndex(len(result) - 1)
   384  				j++
   385  			}
   386  		case cmp > 0:
   387  			var prevInterval fileInterval
   388  			// Insert a new interval for a newly-added file. prevInterval, if
   389  			// non-zero, will be "inherited"; we copy its files as those extend
   390  			// into this interval.
   391  			if len(result) > 0 {
   392  				prevInterval = result[len(result)-1]
   393  			}
   394  			newInterval := fileInterval{
   395  				index:                 len(result),
   396  				startKey:              added[j].intervalKey,
   397  				filesMinIntervalIndex: len(result),
   398  				filesMaxIntervalIndex: len(result),
   399  
   400  				// estimatedBytes gets recalculated later on, as the number of intervals
   401  				// the file bytes are interpolated over has changed.
   402  				estimatedBytes: 0,
   403  				// Copy the below attributes from prevInterval.
   404  				files:                         append([]*FileMetadata(nil), prevInterval.files...),
   405  				isBaseCompacting:              prevInterval.isBaseCompacting,
   406  				intervalRangeIsBaseCompacting: prevInterval.intervalRangeIsBaseCompacting,
   407  				compactingFileCount:           prevInterval.compactingFileCount,
   408  			}
   409  			result = append(result, newInterval)
   410  			added[j].setFileIntervalIndex(len(result) - 1)
   411  			j++
   412  		}
   413  	}
   414  	return result, oldToNewMap
   415  }
   416  
   417  // AddL0Files incrementally builds a new L0Sublevels for when the only change
   418  // since the receiver L0Sublevels was an addition of the specified files, with
   419  // no L0 deletions. The common case of this is an ingestion or a flush. These
   420  // files can "sit on top" of existing sublevels, creating at most one new
   421  // sublevel for a flush (and possibly multiple for an ingestion), and at most
   422  // 2*len(files) additions to s.orderedIntervals. No files must have been deleted
   423  // from L0, and the added files must all be newer in sequence numbers than
   424  // existing files in L0Sublevels. The files parameter must be sorted in seqnum
   425  // order. The levelMetadata parameter corresponds to the new L0 post addition of
   426  // files. This method is meant to be significantly more performant than
   427  // NewL0Sublevels.
   428  //
   429  // Note that this function can only be called once on a given receiver; it
   430  // appends to some slices in s which is only safe when done once. This is okay,
   431  // as the common case (generating a new L0Sublevels after a flush/ingestion) is
   432  // only going to necessitate one call of this method on a given receiver. The
   433  // returned value, if non-nil, can then have [*L0Sublevels.AddL0Files] called on
   434  // it again, and so on. If [errInvalidL0SublevelsOpt] is returned as an error,
   435  // it likely means the optimization could not be applied (i.e. files added were
   436  // older than files already in the sublevels, which is possible around
   437  // ingestions and in tests). Eg. it can happen when an ingested file was
   438  // ingested without queueing a flush since it did not actually overlap with any
   439  // keys in the memtable. Later on the memtable was flushed, and the memtable had
   440  // keys spanning around the ingested file, producing a flushed file that
   441  // overlapped with the ingested file in file bounds but not in keys. It's
   442  // possible for that flushed file to have a lower LargestSeqNum than the
   443  // ingested file if all the additions after the ingestion were to another
   444  // flushed file that was split into a separate sstable during flush. Any other
   445  // non-nil error means [L0Sublevels] generation failed in the same way as
   446  // [NewL0Sublevels] would likely fail.
   447  func (s *L0Sublevels) AddL0Files(
   448  	files []*FileMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata,
   449  ) (*L0Sublevels, error) {
   450  	if invariants.Enabled && s.addL0FilesCalled {
   451  		panic("AddL0Files called twice on the same receiver")
   452  	}
   453  	s.addL0FilesCalled = true
   454  
   455  	// Start with a shallow copy of s.
   456  	newVal := &L0Sublevels{}
   457  	*newVal = *s
   458  
   459  	newVal.addL0FilesCalled = false
   460  	newVal.levelMetadata = levelMetadata
   461  	// Deep copy levelFiles and Levels, as they are mutated and sorted below.
   462  	// Shallow copies of slices that we just append to, are okay.
   463  	newVal.levelFiles = make([][]*FileMetadata, len(s.levelFiles))
   464  	for i := range s.levelFiles {
   465  		newVal.levelFiles[i] = make([]*FileMetadata, len(s.levelFiles[i]))
   466  		copy(newVal.levelFiles[i], s.levelFiles[i])
   467  	}
   468  	newVal.Levels = make([]LevelSlice, len(s.Levels))
   469  	copy(newVal.Levels, s.Levels)
   470  
   471  	fileKeys := make([]intervalKeyTemp, 0, 2*len(files))
   472  	for _, f := range files {
   473  		left := intervalKeyTemp{
   474  			intervalKey: intervalKey{key: f.Smallest.UserKey},
   475  			fileMeta:    f,
   476  		}
   477  		right := intervalKeyTemp{
   478  			intervalKey: intervalKey{
   479  				key:       f.Largest.UserKey,
   480  				isLargest: !f.Largest.IsExclusiveSentinel(),
   481  			},
   482  			fileMeta: f,
   483  			isEndKey: true,
   484  		}
   485  		fileKeys = append(fileKeys, left, right)
   486  	}
   487  	keys := make([]fileInterval, 0, 2*levelMetadata.Len())
   488  	var oldToNewMap []int
   489  	// We can avoid the sortAndSweep step on the combined length of
   490  	// s.orderedIntervals and fileKeys by treating this as a merge of two sorted
   491  	// runs, fileKeys and s.orderedIntervals, into `keys` which will form
   492  	// newVal.orderedIntervals.
   493  	keys, oldToNewMap = mergeIntervals(s.orderedIntervals, keys, fileKeys, s.cmp)
   494  	if invariants.Enabled {
   495  		for i := 1; i < len(keys); i++ {
   496  			if intervalKeyCompare(newVal.cmp, keys[i-1].startKey, keys[i].startKey) >= 0 {
   497  				panic("keys not sorted correctly")
   498  			}
   499  		}
   500  	}
   501  	newVal.orderedIntervals = keys
   502  	// Update indices in s.orderedIntervals for fileIntervals we retained.
   503  	for _, newIdx := range oldToNewMap {
   504  		newInterval := &keys[newIdx]
   505  		newInterval.index = newIdx
   506  		// This code, and related code in the for loop below, adjusts
   507  		// files{Min,Max}IntervalIndex just for interval indices shifting due to
   508  		// new intervals, and not for any of the new files being added to the
   509  		// same intervals. The goal is to produce a state of the system that's
   510  		// accurate for all existing files, and has all the new intervals to
   511  		// support new files. Once that's done, we can just call
   512  		// addFileToSublevel to adjust all relevant intervals for new files.
   513  		newInterval.filesMinIntervalIndex = oldToNewMap[newInterval.filesMinIntervalIndex]
   514  		// maxIntervalIndexes are special. Since it's an inclusive end bound, we
   515  		// actually have to map it to the _next_ old interval's new previous
   516  		// interval. This logic is easier to understand if you see
   517  		// [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex,
   518  		// f.maxIntervalIndex+1). The other case to remember is when the
   519  		// interval is completely empty (i.e. len(newInterval.files) == 0); in
   520  		// that case we want to refer back to ourselves regardless of additions
   521  		// to the right of us.
   522  		if newInterval.filesMaxIntervalIndex < len(oldToNewMap)-1 && len(newInterval.files) > 0 {
   523  			newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex+1] - 1
   524  		} else {
   525  			// newInterval.filesMaxIntervalIndex == len(oldToNewMap)-1.
   526  			newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex]
   527  		}
   528  	}
   529  	// Loop through all instances of new intervals added between two old
   530  	// intervals and expand [filesMinIntervalIndex, filesMaxIntervalIndex] of
   531  	// new intervals to reflect that of adjacent old intervals.
   532  	{
   533  		// We can skip cases where new intervals were added to the left of all
   534  		// existing intervals (eg. if the first entry in oldToNewMap is
   535  		// oldToNewMap[0] >= 1). Those intervals will only contain newly added
   536  		// files and will have their parameters adjusted down in
   537  		// addFileToSublevels. The same can also be said about new intervals
   538  		// that are to the right of all existing intervals.
   539  		lastIdx := 0
   540  		for _, newIdx := range oldToNewMap {
   541  			for i := lastIdx + 1; i < newIdx; i++ {
   542  				minIntervalIndex := i
   543  				maxIntervalIndex := i
   544  				if keys[lastIdx].filesMaxIntervalIndex != lastIdx {
   545  					// Last old interval has files extending into keys[i].
   546  					minIntervalIndex = keys[lastIdx].filesMinIntervalIndex
   547  					maxIntervalIndex = keys[lastIdx].filesMaxIntervalIndex
   548  				}
   549  
   550  				keys[i].filesMinIntervalIndex = minIntervalIndex
   551  				keys[i].filesMaxIntervalIndex = maxIntervalIndex
   552  			}
   553  			lastIdx = newIdx
   554  		}
   555  	}
   556  	// Go through old files and update interval indices.
   557  	//
   558  	// TODO(bilal): This is the only place in this method where we loop through
   559  	// all existing files, which could be much more in number than newly added
   560  	// files. See if we can avoid the need for this, either by getting rid of
   561  	// f.minIntervalIndex and f.maxIntervalIndex and calculating them on the fly
   562  	// with a binary search, or by only looping through files to the right of
   563  	// the first interval touched by this method.
   564  	for sublevel := range s.Levels {
   565  		s.Levels[sublevel].Each(func(f *FileMetadata) {
   566  			oldIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1
   567  			oldMinIntervalIndex := f.minIntervalIndex
   568  			f.minIntervalIndex = oldToNewMap[f.minIntervalIndex]
   569  			// maxIntervalIndex is special. Since it's an inclusive end bound,
   570  			// we actually have to map it to the _next_ old interval's new
   571  			// previous interval. This logic is easier to understand if you see
   572  			// [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex,
   573  			// f.maxIntervalIndex+1).
   574  			f.maxIntervalIndex = oldToNewMap[f.maxIntervalIndex+1] - 1
   575  			newIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1
   576  			// Recalculate estimatedBytes for all old files across new
   577  			// intervals, but only if new intervals were added in between.
   578  			if oldIntervalDelta != newIntervalDelta {
   579  				// j is incremented so that oldToNewMap[j] points to the next
   580  				// old interval. This is used to distinguish between old
   581  				// intervals (i.e. ones where we need to subtract
   582  				// f.Size/oldIntervalDelta) from new ones (where we don't need
   583  				// to subtract). In both cases we need to add
   584  				// f.Size/newIntervalDelta.
   585  				j := oldMinIntervalIndex
   586  				for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   587  					if oldToNewMap[j] == i {
   588  						newVal.orderedIntervals[i].estimatedBytes -= f.Size / uint64(oldIntervalDelta)
   589  						j++
   590  					}
   591  					newVal.orderedIntervals[i].estimatedBytes += f.Size / uint64(newIntervalDelta)
   592  				}
   593  			}
   594  		})
   595  	}
   596  	updatedSublevels := make([]int, 0)
   597  	// Update interval indices for new files.
   598  	for i, f := range files {
   599  		f.L0Index = s.levelMetadata.Len() + i
   600  		if err := newVal.addFileToSublevels(f, true /* checkInvariant */); err != nil {
   601  			return nil, err
   602  		}
   603  		updatedSublevels = append(updatedSublevels, f.SubLevel)
   604  	}
   605  
   606  	// Sort and deduplicate updatedSublevels.
   607  	sort.Ints(updatedSublevels)
   608  	{
   609  		j := 0
   610  		for i := 1; i < len(updatedSublevels); i++ {
   611  			if updatedSublevels[i] != updatedSublevels[j] {
   612  				j++
   613  				updatedSublevels[j] = updatedSublevels[i]
   614  			}
   615  		}
   616  		updatedSublevels = updatedSublevels[:j+1]
   617  	}
   618  
   619  	// Sort each updated sublevel in increasing key order.
   620  	for _, sublevel := range updatedSublevels {
   621  		sort.Sort(sublevelSorter(newVal.levelFiles[sublevel]))
   622  	}
   623  
   624  	// Construct a parallel slice of sublevel B-Trees.
   625  	// TODO(jackson): Consolidate and only use the B-Trees.
   626  	for _, sublevel := range updatedSublevels {
   627  		tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel])
   628  		if sublevel == len(newVal.Levels) {
   629  			newVal.Levels = append(newVal.Levels, ls)
   630  		} else {
   631  			// sublevel < len(s.Levels). If this panics, updatedSublevels was not
   632  			// populated correctly.
   633  			newVal.Levels[sublevel] = ls
   634  		}
   635  		tr.Release()
   636  	}
   637  
   638  	newVal.flushSplitUserKeys = nil
   639  	newVal.calculateFlushSplitKeys(flushSplitMaxBytes)
   640  	return newVal, nil
   641  }
   642  
   643  // addFileToSublevels is called during L0Sublevels generation, and adds f to the
   644  // correct sublevel's levelFiles, the relevant intervals' files slices, and sets
   645  // interval indices on f. This method, if called successively on multiple files,
   646  // _must_ be called on successively newer files (by seqnum). If checkInvariant
   647  // is true, it could check for this in some cases and return
   648  // [errInvalidL0SublevelsOpt] if that invariant isn't held.
   649  func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) error {
   650  	// This is a simple and not very accurate estimate of the number of
   651  	// bytes this SSTable contributes to the intervals it is a part of.
   652  	//
   653  	// TODO(bilal): Call EstimateDiskUsage in sstable.Reader with interval
   654  	// bounds to get a better estimate for each interval.
   655  	interpolatedBytes := f.Size / uint64(f.maxIntervalIndex-f.minIntervalIndex+1)
   656  	s.fileBytes += f.Size
   657  	subLevel := 0
   658  	// Update state in every fileInterval for this file.
   659  	for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   660  		interval := &s.orderedIntervals[i]
   661  		if len(interval.files) > 0 {
   662  			if checkInvariant && interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum {
   663  				// We are sliding this file "underneath" an existing file. Throw away
   664  				// and start over in NewL0Sublevels.
   665  				return errInvalidL0SublevelsOpt
   666  			}
   667  			// interval.files is sorted by sublevels, from lowest to highest.
   668  			// AddL0Files can only add files at sublevels higher than existing files
   669  			// in the same key intervals.
   670  			if maxSublevel := interval.files[len(interval.files)-1].SubLevel; subLevel <= maxSublevel {
   671  				subLevel = maxSublevel + 1
   672  			}
   673  		}
   674  		interval.estimatedBytes += interpolatedBytes
   675  		if f.minIntervalIndex < interval.filesMinIntervalIndex {
   676  			interval.filesMinIntervalIndex = f.minIntervalIndex
   677  		}
   678  		if f.maxIntervalIndex > interval.filesMaxIntervalIndex {
   679  			interval.filesMaxIntervalIndex = f.maxIntervalIndex
   680  		}
   681  		interval.files = append(interval.files, f)
   682  	}
   683  	f.SubLevel = subLevel
   684  	if subLevel > len(s.levelFiles) {
   685  		return errors.Errorf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles))
   686  	}
   687  	if subLevel == len(s.levelFiles) {
   688  		s.levelFiles = append(s.levelFiles, []*FileMetadata{f})
   689  	} else {
   690  		s.levelFiles[subLevel] = append(s.levelFiles[subLevel], f)
   691  	}
   692  	return nil
   693  }
   694  
   695  func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) {
   696  	var cumulativeBytes uint64
   697  	// Multiply flushSplitMaxBytes by the number of sublevels. This prevents
   698  	// excessive flush splitting when the number of sublevels increases.
   699  	flushSplitMaxBytes *= int64(len(s.levelFiles))
   700  	for i := 0; i < len(s.orderedIntervals); i++ {
   701  		interval := &s.orderedIntervals[i]
   702  		if flushSplitMaxBytes > 0 && cumulativeBytes > uint64(flushSplitMaxBytes) &&
   703  			(len(s.flushSplitUserKeys) == 0 ||
   704  				!bytes.Equal(interval.startKey.key, s.flushSplitUserKeys[len(s.flushSplitUserKeys)-1])) {
   705  			s.flushSplitUserKeys = append(s.flushSplitUserKeys, interval.startKey.key)
   706  			cumulativeBytes = 0
   707  		}
   708  		cumulativeBytes += s.orderedIntervals[i].estimatedBytes
   709  	}
   710  }
   711  
   712  // InitCompactingFileInfo initializes internal flags relating to compacting
   713  // files. Must be called after sublevel initialization.
   714  //
   715  // Requires DB.mu *and* the manifest lock to be held.
   716  func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) {
   717  	for i := range s.orderedIntervals {
   718  		s.orderedIntervals[i].compactingFileCount = 0
   719  		s.orderedIntervals[i].isBaseCompacting = false
   720  		s.orderedIntervals[i].intervalRangeIsBaseCompacting = false
   721  	}
   722  
   723  	iter := s.levelMetadata.Iter()
   724  	for f := iter.First(); f != nil; f = iter.Next() {
   725  		if invariants.Enabled {
   726  			if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) {
   727  				panic(fmt.Sprintf("f.minIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s",
   728  					s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(f.Smallest.UserKey)))
   729  			}
   730  			if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) {
   731  				panic(fmt.Sprintf("f.maxIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s",
   732  					s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(f.Smallest.UserKey)))
   733  			}
   734  		}
   735  		if !f.IsCompacting() {
   736  			continue
   737  		}
   738  		if invariants.Enabled {
   739  			if s.cmp(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) != 0 || s.cmp(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) != 0 {
   740  				panic(fmt.Sprintf("file %s has inconsistent L0 Sublevel interval bounds: %s-%s, %s-%s", f.FileNum,
   741  					s.orderedIntervals[f.minIntervalIndex].startKey.key, s.orderedIntervals[f.maxIntervalIndex+1].startKey.key,
   742  					f.Smallest.UserKey, f.Largest.UserKey))
   743  			}
   744  		}
   745  		for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   746  			interval := &s.orderedIntervals[i]
   747  			interval.compactingFileCount++
   748  			if !f.IsIntraL0Compacting {
   749  				// If f.Compacting && !f.IsIntraL0Compacting, this file is
   750  				// being compacted to Lbase.
   751  				interval.isBaseCompacting = true
   752  			}
   753  		}
   754  	}
   755  
   756  	// Some intervals may be base compacting without the files contained within
   757  	// those intervals being marked as compacting. This is possible if the files
   758  	// were added after the compaction initiated, and the active compaction
   759  	// files straddle the input file. Mark these intervals as base compacting.
   760  	for _, c := range inProgress {
   761  		startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false}
   762  		endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()}
   763  		start := sort.Search(len(s.orderedIntervals), func(i int) bool {
   764  			return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) >= 0
   765  		})
   766  		end := sort.Search(len(s.orderedIntervals), func(i int) bool {
   767  			return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) >= 0
   768  		})
   769  		for i := start; i < end && i < len(s.orderedIntervals); i++ {
   770  			interval := &s.orderedIntervals[i]
   771  			if !c.IsIntraL0 {
   772  				interval.isBaseCompacting = true
   773  			}
   774  		}
   775  	}
   776  
   777  	min := 0
   778  	for i := range s.orderedIntervals {
   779  		interval := &s.orderedIntervals[i]
   780  		if interval.isBaseCompacting {
   781  			minIndex := interval.filesMinIntervalIndex
   782  			if minIndex < min {
   783  				minIndex = min
   784  			}
   785  			for j := minIndex; j <= interval.filesMaxIntervalIndex; j++ {
   786  				min = j
   787  				s.orderedIntervals[j].intervalRangeIsBaseCompacting = true
   788  			}
   789  		}
   790  	}
   791  }
   792  
   793  // String produces a string containing useful debug information. Useful in test
   794  // code and debugging.
   795  func (s *L0Sublevels) String() string {
   796  	return s.describe(false)
   797  }
   798  
   799  func (s *L0Sublevels) describe(verbose bool) string {
   800  	var buf strings.Builder
   801  	fmt.Fprintf(&buf, "file count: %d, sublevels: %d, intervals: %d\nflush split keys(%d): [",
   802  		s.levelMetadata.Len(), len(s.levelFiles), len(s.orderedIntervals), len(s.flushSplitUserKeys))
   803  	for i := range s.flushSplitUserKeys {
   804  		fmt.Fprintf(&buf, "%s", s.formatKey(s.flushSplitUserKeys[i]))
   805  		if i < len(s.flushSplitUserKeys)-1 {
   806  			fmt.Fprintf(&buf, ", ")
   807  		}
   808  	}
   809  	fmt.Fprintln(&buf, "]")
   810  	numCompactingFiles := 0
   811  	for i := len(s.levelFiles) - 1; i >= 0; i-- {
   812  		maxIntervals := 0
   813  		sumIntervals := 0
   814  		var totalBytes uint64
   815  		for _, f := range s.levelFiles[i] {
   816  			intervals := f.maxIntervalIndex - f.minIntervalIndex + 1
   817  			if intervals > maxIntervals {
   818  				maxIntervals = intervals
   819  			}
   820  			sumIntervals += intervals
   821  			totalBytes += f.Size
   822  			if f.IsCompacting() {
   823  				numCompactingFiles++
   824  			}
   825  		}
   826  		fmt.Fprintf(&buf, "0.%d: file count: %d, bytes: %d, width (mean, max): %0.1f, %d, interval range: [%d, %d]\n",
   827  			i, len(s.levelFiles[i]), totalBytes, float64(sumIntervals)/float64(len(s.levelFiles[i])), maxIntervals, s.levelFiles[i][0].minIntervalIndex,
   828  			s.levelFiles[i][len(s.levelFiles[i])-1].maxIntervalIndex)
   829  		for _, f := range s.levelFiles[i] {
   830  			intervals := f.maxIntervalIndex - f.minIntervalIndex + 1
   831  			if verbose {
   832  				fmt.Fprintf(&buf, "\t%s\n", f)
   833  			}
   834  			if s.levelMetadata.Len() > 50 && intervals*3 > len(s.orderedIntervals) {
   835  				var intervalsBytes uint64
   836  				for k := f.minIntervalIndex; k <= f.maxIntervalIndex; k++ {
   837  					intervalsBytes += s.orderedIntervals[k].estimatedBytes
   838  				}
   839  				fmt.Fprintf(&buf, "wide file: %d, [%d, %d], byte fraction: %f\n",
   840  					f.FileNum, f.minIntervalIndex, f.maxIntervalIndex,
   841  					float64(intervalsBytes)/float64(s.fileBytes))
   842  			}
   843  		}
   844  	}
   845  
   846  	lastCompactingIntervalStart := -1
   847  	fmt.Fprintf(&buf, "compacting file count: %d, base compacting intervals: ", numCompactingFiles)
   848  	i := 0
   849  	foundBaseCompactingIntervals := false
   850  	for ; i < len(s.orderedIntervals); i++ {
   851  		interval := &s.orderedIntervals[i]
   852  		if len(interval.files) == 0 {
   853  			continue
   854  		}
   855  		if !interval.isBaseCompacting {
   856  			if lastCompactingIntervalStart != -1 {
   857  				if foundBaseCompactingIntervals {
   858  					buf.WriteString(", ")
   859  				}
   860  				fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1)
   861  				foundBaseCompactingIntervals = true
   862  			}
   863  			lastCompactingIntervalStart = -1
   864  		} else {
   865  			if lastCompactingIntervalStart == -1 {
   866  				lastCompactingIntervalStart = i
   867  			}
   868  		}
   869  	}
   870  	if lastCompactingIntervalStart != -1 {
   871  		if foundBaseCompactingIntervals {
   872  			buf.WriteString(", ")
   873  		}
   874  		fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1)
   875  	} else if !foundBaseCompactingIntervals {
   876  		fmt.Fprintf(&buf, "none")
   877  	}
   878  	fmt.Fprintln(&buf, "")
   879  	return buf.String()
   880  }
   881  
   882  // ReadAmplification returns the contribution of L0Sublevels to the read
   883  // amplification for any particular point key. It is the maximum height of any
   884  // tracked fileInterval. This is always less than or equal to the number of
   885  // sublevels.
   886  func (s *L0Sublevels) ReadAmplification() int {
   887  	amp := 0
   888  	for i := range s.orderedIntervals {
   889  		interval := &s.orderedIntervals[i]
   890  		fileCount := len(interval.files)
   891  		if amp < fileCount {
   892  			amp = fileCount
   893  		}
   894  	}
   895  	return amp
   896  }
   897  
   898  // UserKeyRange encodes a key range in user key space. A UserKeyRange's Start
   899  // and End boundaries are both inclusive.
   900  type UserKeyRange struct {
   901  	Start, End []byte
   902  }
   903  
   904  // InUseKeyRanges returns the merged table bounds of L0 files overlapping the
   905  // provided user key range. The returned key ranges are sorted and
   906  // nonoverlapping.
   907  func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange {
   908  	// Binary search to find the provided keys within the intervals.
   909  	startIK := intervalKey{key: smallest, isLargest: false}
   910  	endIK := intervalKey{key: largest, isLargest: true}
   911  	start := sort.Search(len(s.orderedIntervals), func(i int) bool {
   912  		return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) > 0
   913  	})
   914  	if start > 0 {
   915  		// Back up to the first interval with a start key <= startIK.
   916  		start--
   917  	}
   918  	end := sort.Search(len(s.orderedIntervals), func(i int) bool {
   919  		return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) > 0
   920  	})
   921  
   922  	var keyRanges []UserKeyRange
   923  	var curr *UserKeyRange
   924  	for i := start; i < end; {
   925  		// Intervals with no files are not in use and can be skipped, once we
   926  		// end the current UserKeyRange.
   927  		if len(s.orderedIntervals[i].files) == 0 {
   928  			curr = nil
   929  			i++
   930  			continue
   931  		}
   932  
   933  		// If curr is nil, start a new in-use key range.
   934  		if curr == nil {
   935  			keyRanges = append(keyRanges, UserKeyRange{
   936  				Start: s.orderedIntervals[i].startKey.key,
   937  			})
   938  			curr = &keyRanges[len(keyRanges)-1]
   939  		}
   940  
   941  		// If the filesMaxIntervalIndex is not the current index, we can jump to
   942  		// the max index, knowing that all intermediary intervals are overlapped
   943  		// by some file.
   944  		if maxIdx := s.orderedIntervals[i].filesMaxIntervalIndex; maxIdx != i {
   945  			// Note that end may be less than or equal to maxIdx if we're
   946  			// concerned with a key range that ends before the interval at
   947  			// maxIdx starts. We must set curr.End now, before making that leap,
   948  			// because this iteration may be the last.
   949  			i = maxIdx
   950  			curr.End = s.orderedIntervals[i+1].startKey.key
   951  			continue
   952  		}
   953  
   954  		// No files overlapping with this interval overlap with the next
   955  		// interval. Update the current end to be the next interval's start key.
   956  		// Note that curr is not necessarily finished, because there may be an
   957  		// abutting non-empty interval.
   958  		curr.End = s.orderedIntervals[i+1].startKey.key
   959  		i++
   960  	}
   961  	return keyRanges
   962  }
   963  
   964  // FlushSplitKeys returns a slice of user keys to split flushes at. Used by
   965  // flushes to avoid writing sstables that straddle these split keys. These
   966  // should be interpreted as the keys to start the next sstable (not the last key
   967  // to include in the prev sstable). These are user keys so that range tombstones
   968  // can be properly truncated (untruncated range tombstones are not permitted for
   969  // L0 files).
   970  func (s *L0Sublevels) FlushSplitKeys() [][]byte {
   971  	return s.flushSplitUserKeys
   972  }
   973  
   974  // MaxDepthAfterOngoingCompactions returns an estimate of maximum depth of
   975  // sublevels after all ongoing compactions run to completion. Used by compaction
   976  // picker to decide compaction score for L0. There is no scoring for intra-L0
   977  // compactions -- they only run if L0 score is high but we're unable to pick an
   978  // L0 -> Lbase compaction.
   979  func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int {
   980  	depth := 0
   981  	for i := range s.orderedIntervals {
   982  		interval := &s.orderedIntervals[i]
   983  		intervalDepth := len(interval.files) - interval.compactingFileCount
   984  		if depth < intervalDepth {
   985  			depth = intervalDepth
   986  		}
   987  	}
   988  	return depth
   989  }
   990  
// checkCompaction sanity-checks a compaction candidate c against the current
// sublevel layout. Starting at the candidate's innermost sublevel and walking
// towards the end of the stack it grows from (upwards for intra-L0
// candidates, downwards to sublevel 0 for base candidates), it verifies that
// every file overlapping the accumulated interval range is part of c.Files.
// For intra-L0 candidates it also rejects any such file whose LargestSeqNum
// is >= c.earliestUnflushedSeqNum. It returns a descriptive error on the
// first violation and nil otherwise.
//
// Only for temporary debugging in the absence of proper tests.
//
// TODO(bilal): Simplify away the debugging statements in this method, and make
// this a pure sanity checker.
//
//lint:ignore U1000 - useful for debugging
func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error {
	// Bit set, indexed by L0Index, of the files that belong to the candidate.
	includedFiles := newBitSet(s.levelMetadata.Len())
	// For each sublevel, the [min, max] interval index range covered by the
	// candidate's files in that sublevel. The initial values are chosen so
	// that a sublevel without candidate files never widens the range below.
	fileIntervalsByLevel := make([]struct {
		min int
		max int
	}, len(s.levelFiles))
	for i := range fileIntervalsByLevel {
		fileIntervalsByLevel[i].min = math.MaxInt32
		fileIntervalsByLevel[i].max = 0
	}
	// topLevel is the sublevel the walk starts from, increment the direction
	// of the walk, and limitReached reports when the walk has left the stack.
	var topLevel int
	var increment int
	var limitReached func(int) bool
	if c.isIntraL0 {
		// Intra-L0: start from the lowest sublevel in the candidate and walk
		// up to the highest sublevel.
		topLevel = len(s.levelFiles) - 1
		increment = +1
		limitReached = func(level int) bool {
			return level == len(s.levelFiles)
		}
	} else {
		// Base: start from the highest sublevel in the candidate and walk
		// down to sublevel 0.
		topLevel = 0
		increment = -1
		limitReached = func(level int) bool {
			return level < 0
		}
	}
	// Record every candidate file and determine the starting sublevel.
	for _, f := range c.Files {
		if fileIntervalsByLevel[f.SubLevel].min > f.minIntervalIndex {
			fileIntervalsByLevel[f.SubLevel].min = f.minIntervalIndex
		}
		if fileIntervalsByLevel[f.SubLevel].max < f.maxIntervalIndex {
			fileIntervalsByLevel[f.SubLevel].max = f.maxIntervalIndex
		}
		includedFiles.markBit(f.L0Index)
		if c.isIntraL0 {
			if topLevel > f.SubLevel {
				topLevel = f.SubLevel
			}
		} else {
			if topLevel < f.SubLevel {
				topLevel = f.SubLevel
			}
		}
	}
	// [min, max] is the interval range accumulated so far; it only ever grows
	// as the walk proceeds through the sublevels.
	min := fileIntervalsByLevel[topLevel].min
	max := fileIntervalsByLevel[topLevel].max
	for level := topLevel; !limitReached(level); level += increment {
		if fileIntervalsByLevel[level].min < min {
			min = fileIntervalsByLevel[level].min
		}
		if fileIntervalsByLevel[level].max > max {
			max = fileIntervalsByLevel[level].max
		}
		// Find the first file in this sublevel that reaches into [min, max].
		index := sort.Search(len(s.levelFiles[level]), func(i int) bool {
			return s.levelFiles[level][i].maxIntervalIndex >= min
		})
		// start := index
		for ; index < len(s.levelFiles[level]); index++ {
			f := s.levelFiles[level][index]
			if f.minIntervalIndex > max {
				// Past the accumulated range; nothing further in this
				// sublevel is examined.
				break
			}
			if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum {
				return errors.Errorf(
					"sstable %s in compaction has sequence numbers higher than the earliest unflushed seqnum %d: %d-%d",
					f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum,
					f.LargestSeqNum)
			}
			if !includedFiles[f.L0Index] {
				// An overlapping file is missing from the candidate. Dump the
				// full state into the error to aid debugging.
				var buf strings.Builder
				fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n",
					c.isIntraL0, c.seedInterval, level, index, f.L0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval,
					f.minIntervalIndex, f.maxIntervalIndex,
					f.FileNum, f.IsCompacting(), s)
				fmt.Fprintf(&buf, "files included:\n")
				for _, f := range c.Files {
					fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n",
						f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex)
				}
				fmt.Fprintf(&buf, "files added:\n")
				for _, f := range c.filesAdded {
					fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n",
						f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex)
				}
				return errors.New(buf.String())
			}
		}
	}
	return nil
}
  1087  
  1088  // UpdateStateForStartedCompaction updates internal L0Sublevels state for a
  1089  // recently started compaction. isBase specifies if this is a base compaction;
  1090  // if false, this is assumed to be an intra-L0 compaction. The specified
  1091  // compaction must be involving L0 SSTables. It's assumed that the Compacting
  1092  // and IsIntraL0Compacting fields are already set on all [FileMetadata]s passed
  1093  // in.
  1094  func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error {
  1095  	minIntervalIndex := -1
  1096  	maxIntervalIndex := 0
  1097  	for i := range inputs {
  1098  		iter := inputs[i].Iter()
  1099  		for f := iter.First(); f != nil; f = iter.Next() {
  1100  			for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
  1101  				interval := &s.orderedIntervals[i]
  1102  				interval.compactingFileCount++
  1103  			}
  1104  			if f.minIntervalIndex < minIntervalIndex || minIntervalIndex == -1 {
  1105  				minIntervalIndex = f.minIntervalIndex
  1106  			}
  1107  			if f.maxIntervalIndex > maxIntervalIndex {
  1108  				maxIntervalIndex = f.maxIntervalIndex
  1109  			}
  1110  		}
  1111  	}
  1112  	if isBase {
  1113  		for i := minIntervalIndex; i <= maxIntervalIndex; i++ {
  1114  			interval := &s.orderedIntervals[i]
  1115  			interval.isBaseCompacting = isBase
  1116  			for j := interval.filesMinIntervalIndex; j <= interval.filesMaxIntervalIndex; j++ {
  1117  				s.orderedIntervals[j].intervalRangeIsBaseCompacting = true
  1118  			}
  1119  		}
  1120  	}
  1121  	return nil
  1122  }
  1123  
  1124  // L0CompactionFiles represents a candidate set of L0 files for compaction. Also
  1125  // referred to as "lcf". Contains state information useful for generating the
  1126  // compaction (such as Files), as well as for picking between candidate
  1127  // compactions (eg. fileBytes and seedIntervalStackDepthReduction).
  1128  type L0CompactionFiles struct {
  1129  	Files []*FileMetadata
  1130  
  1131  	FilesIncluded bitSet
  1132  	// A "seed interval" is an interval with a high stack depth that was chosen
  1133  	// to bootstrap this compaction candidate. seedIntervalStackDepthReduction
  1134  	// is the number of sublevels that have a file in the seed interval that is
  1135  	// a part of this compaction.
  1136  	seedIntervalStackDepthReduction int
  1137  	// For base compactions, seedIntervalMinLevel is 0, and for intra-L0
  1138  	// compactions, seedIntervalMaxLevel is len(s.Files)-1 i.e. the highest
  1139  	// sublevel.
  1140  	seedIntervalMinLevel int
  1141  	seedIntervalMaxLevel int
  1142  	// Index of the seed interval.
  1143  	seedInterval int
  1144  	// Sum of file sizes for all files in this compaction.
  1145  	fileBytes uint64
  1146  	// Intervals with index [minIntervalIndex, maxIntervalIndex] are
  1147  	// participating in this compaction; it's the union set of all intervals
  1148  	// overlapped by participating files.
  1149  	minIntervalIndex int
  1150  	maxIntervalIndex int
  1151  
  1152  	// Set for intra-L0 compactions. SSTables with sequence numbers greater
  1153  	// than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions.
  1154  	isIntraL0               bool
  1155  	earliestUnflushedSeqNum uint64
  1156  
  1157  	// For debugging purposes only. Used in checkCompaction().
  1158  	preExtensionMinInterval int
  1159  	preExtensionMaxInterval int
  1160  	filesAdded              []*FileMetadata
  1161  }
  1162  
  1163  // Clone allocates a new L0CompactionFiles, with the same underlying data. Note
  1164  // that the two fileMetadata slices contain values that point to the same
  1165  // underlying fileMetadata object. This is safe because these objects are read
  1166  // only.
  1167  func (l *L0CompactionFiles) Clone() *L0CompactionFiles {
  1168  	oldLcf := *l
  1169  	return &oldLcf
  1170  }
  1171  
  1172  // String merely prints the starting address of the first file, if it exists.
  1173  func (l *L0CompactionFiles) String() string {
  1174  	if len(l.Files) > 0 {
  1175  		return fmt.Sprintf("First File Address: %p", &l.Files[0])
  1176  	}
  1177  	return ""
  1178  }
  1179  
  1180  // addFile adds the specified file to the LCF.
  1181  func (l *L0CompactionFiles) addFile(f *FileMetadata) {
  1182  	if l.FilesIncluded[f.L0Index] {
  1183  		return
  1184  	}
  1185  	l.FilesIncluded.markBit(f.L0Index)
  1186  	l.Files = append(l.Files, f)
  1187  	l.filesAdded = append(l.filesAdded, f)
  1188  	l.fileBytes += f.Size
  1189  	if f.minIntervalIndex < l.minIntervalIndex {
  1190  		l.minIntervalIndex = f.minIntervalIndex
  1191  	}
  1192  	if f.maxIntervalIndex > l.maxIntervalIndex {
  1193  		l.maxIntervalIndex = f.maxIntervalIndex
  1194  	}
  1195  }
  1196  
  1197  // Helper to order intervals being considered for compaction.
  1198  type intervalAndScore struct {
  1199  	interval int
  1200  	score    int
  1201  }
  1202  type intervalSorterByDecreasingScore []intervalAndScore
  1203  
  1204  func (is intervalSorterByDecreasingScore) Len() int { return len(is) }
  1205  func (is intervalSorterByDecreasingScore) Less(i, j int) bool {
  1206  	return is[i].score > is[j].score
  1207  }
  1208  func (is intervalSorterByDecreasingScore) Swap(i, j int) {
  1209  	is[i], is[j] = is[j], is[i]
  1210  }
  1211  
  1212  // Compactions:
  1213  //
  1214  // The sub-levels and intervals can be visualized in 2 dimensions as the X axis
  1215  // containing intervals in increasing order and the Y axis containing sub-levels
  1216  // (older to younger). The intervals can be sparse wrt sub-levels. We observe
  1217  // that the system is typically under severe pressure in L0 during large numbers
  1218  // of ingestions where most files added to L0 are narrow and non-overlapping.
  1219  //
  1220  //    L0.1    d---g
  1221  //    L0.0  c--e  g--j o--s u--x
  1222  //
  1223  // As opposed to a case with a lot of wide, overlapping L0 files:
  1224  //
  1225  //    L0.3     d-----------r
  1226  //    L0.2    c--------o
  1227  //    L0.1   b-----------q
  1228  //    L0.0  a----------------x
  1229  //
// In the ingestion case (the first visualization above) we expect the
// rectangle spanned by the files to be wide and short, and not too sparse (most
// intervals will have fileCount close to the sub-level count), which would make
// it amenable to concurrent L0 -> Lbase compactions.
  1234  //
  1235  // L0 -> Lbase: The high-level goal of a L0 -> Lbase compaction is to reduce
  1236  // stack depth, by compacting files in the intervals with the highest (fileCount
  1237  // - compactingCount). Additionally, we would like compactions to not involve a
  1238  // huge number of files, so that they finish quickly, and to allow for
  1239  // concurrent L0 -> Lbase compactions when needed. In order to achieve these
  1240  // goals we would like compactions to visualize as capturing thin and tall
  1241  // rectangles. The approach below is to consider intervals in some order and
  1242  // then try to construct a compaction using the interval. The first interval we
  1243  // can construct a compaction for is the compaction that is started. There can
  1244  // be multiple heuristics in choosing the ordering of the intervals -- the code
  1245  // uses one heuristic that worked well for a large ingestion stemming from a
  1246  // cockroachdb import, but additional experimentation is necessary to pick a
  1247  // general heuristic. Additionally, the compaction that gets picked may be not
  1248  // as desirable as one that could be constructed later in terms of reducing
  1249  // stack depth (since adding more files to the compaction can get blocked by
  1250  // needing to encompass files that are already being compacted). So an
  1251  // alternative would be to try to construct more than one compaction and pick
  1252  // the best one.
  1253  //
  1254  // Here's a visualization of an ideal L0->LBase compaction selection:
  1255  //
  1256  //    L0.3  a--d    g-j
  1257  //    L0.2         f--j          r-t
  1258  //    L0.1   b-d  e---j
  1259  //    L0.0  a--d   f--j  l--o  p-----x
  1260  //
  1261  //    Lbase a--------i    m---------w
  1262  //
  1263  // The [g,j] interval has the highest stack depth, so it would have the highest
  1264  // priority for selecting a base compaction candidate. Assuming none of the
  1265  // files are already compacting, this is the compaction that will be chosen:
  1266  //
  1267  //               _______
  1268  //    L0.3  a--d |  g-j|
  1269  //    L0.2       | f--j|         r-t
  1270  //    L0.1   b-d |e---j|
  1271  //    L0.0  a--d | f--j| l--o  p-----x
  1272  //
  1273  //    Lbase a--------i    m---------w
  1274  //
  1275  // Note that running this compaction will mark the a--i file in Lbase as
  1276  // compacting, and when ExtendL0ForBaseCompactionTo is called with the bounds of
  1277  // that base file, it'll expand the compaction to also include all L0 files in
  1278  // the a-d interval. The resultant compaction would then be:
  1279  //
  1280  //         _____________
  1281  //    L0.3 |a--d    g-j|
  1282  //    L0.2 |       f--j|         r-t
  1283  //    L0.1 | b-d  e---j|
  1284  //    L0.0 |a--d   f--j| l--o  p-----x
  1285  //
  1286  //    Lbase a--------i    m---------w
  1287  //
// The next best interval for base compaction would therefore be the one
// including r--t in L0.2 and p--x in L0.0, and both this compaction and the one
// picked earlier can run in parallel. This is assuming minCompactionDepth <= 2,
// otherwise the second compaction has too little depth to pick.
  1292  //
  1293  //         _____________
  1294  //    L0.3 |a--d    g-j|      _________
  1295  //    L0.2 |       f--j|      |  r-t  |
  1296  //    L0.1 | b-d  e---j|      |       |
  1297  //    L0.0 |a--d   f--j| l--o |p-----x|
  1298  //
  1299  //    Lbase a--------i    m---------w
  1300  //
  1301  // Note that when ExtendL0ForBaseCompactionTo is called, the compaction expands
  1302  // to the following, given that the [l,o] file can be added without including
  1303  // additional files in Lbase:
  1304  //
  1305  //         _____________
  1306  //    L0.3 |a--d    g-j|      _________
  1307  //    L0.2 |       f--j|      |  r-t  |
  1308  //    L0.1 | b-d  e---j|______|       |
  1309  //    L0.0 |a--d   f--j||l--o  p-----x|
  1310  //
  1311  //    Lbase a--------i    m---------w
  1312  //
// If an additional file existed in LBase that overlapped with [l,o], the [l,o]
// L0 file would be excluded from the compaction. Concretely:
  1315  //
  1316  //         _____________
  1317  //    L0.3 |a--d    g-j|      _________
  1318  //    L0.2 |       f--j|      |  r-t  |
  1319  //    L0.1 | b-d  e---j|      |       |
  1320  //    L0.0 |a--d   f--j| l--o |p-----x|
  1321  //
  1322  //    Lbase a--------ij--lm---------w
  1323  //
  1324  // Intra-L0: If the L0 score is high, but PickBaseCompaction() is unable to pick
  1325  // a compaction, PickIntraL0Compaction will be used to pick an intra-L0
  1326  // compaction. Similar to L0 -> Lbase compactions, we want to allow for multiple
  1327  // intra-L0 compactions and not generate wide output files that hinder later
  1328  // concurrency of L0 -> Lbase compactions. Also compactions that produce wide
  1329  // files don't reduce stack depth -- they represent wide rectangles in our
  1330  // visualization, which means many intervals have their depth reduced by a small
  1331  // amount. Typically, L0 files have non-overlapping sequence numbers, and
  1332  // sticking to that invariant would require us to consider intra-L0 compactions
  1333  // that proceed from youngest to oldest files, which could result in the
  1334  // aforementioned undesirable wide rectangle shape. But this non-overlapping
  1335  // sequence number is already relaxed in RocksDB -- sstables are primarily
  1336  // ordered by their largest sequence number. So we can arrange for intra-L0
  1337  // compactions to capture thin and tall rectangles starting with the top of the
  1338  // stack (youngest files). Like the L0 -> Lbase case we order the intervals
  1339  // using a heuristic and consider each in turn. The same comment about better L0
  1340  // -> Lbase heuristics and not being greedy applies here.
  1341  //
  1342  // Going back to a modified version of our example from earlier, let's say these
  1343  // are the base compactions in progress:
  1344  //                _______
  1345  //    L0.3  a--d  |  g-j|      _________
  1346  //    L0.2        | f--j|      |  r-t  |
  1347  //    L0.1   b-d  |e---j|      |       |
  1348  //    L0.0  a--d  | f--j| l--o |p-----x|
  1349  //
  1350  //    Lbase a---------i    m---------w
  1351  //
  1352  // Since both LBase files are compacting, the only L0 compaction that can be
  1353  // picked is an intra-L0 compaction. For this, the b--d interval has the highest
  1354  // stack depth (3), and starting with a--d in L0.3 as the seed file, we can
  1355  // iterate downward and build this compaction, assuming all files in that
  1356  // interval are not compacting and have a highest sequence number less than
  1357  // earliestUnflushedSeqNum:
  1358  //
  1359  //                _______
  1360  //    L0.3 |a--d| |  g-j|      _________
  1361  //    L0.2 |    | | f--j|      |  r-t  |
  1362  //    L0.1 | b-d| |e---j|      |       |
  1363  //    L0.0 |a--d| | f--j| l--o |p-----x|
  1364  //         ------
  1365  //    Lbase a---------i    m---------w
  1366  //
  1367  
  1368  // PickBaseCompaction picks a base compaction based on the above specified
  1369  // heuristics, for the specified Lbase files and a minimum depth of overlapping
  1370  // files that can be selected for compaction. Returns nil if no compaction is
  1371  // possible.
  1372  func (s *L0Sublevels) PickBaseCompaction(
  1373  	minCompactionDepth int, baseFiles LevelSlice,
  1374  ) (*L0CompactionFiles, error) {
  1375  	// For LBase compactions, we consider intervals in a greedy manner in the
  1376  	// following order:
  1377  	// - Intervals that are unlikely to be blocked due
  1378  	//   to ongoing L0 -> Lbase compactions. These are the ones with
  1379  	//   !isBaseCompacting && !intervalRangeIsBaseCompacting.
  1380  	// - Intervals that are !isBaseCompacting && intervalRangeIsBaseCompacting.
  1381  	//
  1382  	// The ordering heuristic exists just to avoid wasted work. Ideally,
  1383  	// we would consider all intervals with isBaseCompacting = false and
  1384  	// construct a compaction for it and compare the constructed compactions
  1385  	// and pick the best one. If microbenchmarks show that we can afford
  1386  	// this cost we can eliminate this heuristic.
  1387  	scoredIntervals := make([]intervalAndScore, 0, len(s.orderedIntervals))
  1388  	sublevelCount := len(s.levelFiles)
  1389  	for i := range s.orderedIntervals {
  1390  		interval := &s.orderedIntervals[i]
  1391  		depth := len(interval.files) - interval.compactingFileCount
  1392  		if interval.isBaseCompacting || minCompactionDepth > depth {
  1393  			continue
  1394  		}
  1395  		if interval.intervalRangeIsBaseCompacting {
  1396  			scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth})
  1397  		} else {
  1398  			// Prioritize this interval by incrementing the score by the number
  1399  			// of sublevels.
  1400  			scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth + sublevelCount})
  1401  		}
  1402  	}
  1403  	sort.Sort(intervalSorterByDecreasingScore(scoredIntervals))
  1404  
  1405  	// Optimization to avoid considering different intervals that
  1406  	// are likely to choose the same seed file. Again this is just
  1407  	// to reduce wasted work.
  1408  	consideredIntervals := newBitSet(len(s.orderedIntervals))
  1409  	for _, scoredInterval := range scoredIntervals {
  1410  		interval := &s.orderedIntervals[scoredInterval.interval]
  1411  		if consideredIntervals[interval.index] {
  1412  			continue
  1413  		}
  1414  
  1415  		// Pick the seed file for the interval as the file
  1416  		// in the lowest sub-level.
  1417  		f := interval.files[0]
  1418  		// Don't bother considering the intervals that are covered by the seed
  1419  		// file since they are likely nearby. Note that it is possible that
  1420  		// those intervals have seed files at lower sub-levels so could be
  1421  		// viable for compaction.
  1422  		if f == nil {
  1423  			return nil, errors.New("no seed file found in sublevel intervals")
  1424  		}
  1425  		consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1)
  1426  		if f.IsCompacting() {
  1427  			if f.IsIntraL0Compacting {
  1428  				// If we're picking a base compaction and we came across a seed
  1429  				// file candidate that's being intra-L0 compacted, skip the
  1430  				// interval instead of erroring out.
  1431  				continue
  1432  			}
  1433  			// We chose a compaction seed file that should not be compacting.
  1434  			// Usually means the score is not accurately accounting for files
  1435  			// already compacting, or internal state is inconsistent.
  1436  			return nil, errors.Errorf("file %s chosen as seed file for compaction should not be compacting", f.FileNum)
  1437  		}
  1438  
  1439  		c := s.baseCompactionUsingSeed(f, interval.index, minCompactionDepth)
  1440  		if c != nil {
  1441  			// Check if the chosen compaction overlaps with any files in Lbase
  1442  			// that have Compacting = true. If that's the case, this compaction
  1443  			// cannot be chosen.
  1444  			baseIter := baseFiles.Iter()
  1445  			// An interval starting at ImmediateSuccessor(key) can never be the
  1446  			// first interval of a compaction since no file can start at that
  1447  			// interval.
  1448  			m := baseIter.SeekGE(s.cmp, s.orderedIntervals[c.minIntervalIndex].startKey.key)
  1449  
  1450  			var baseCompacting bool
  1451  			for ; m != nil && !baseCompacting; m = baseIter.Next() {
  1452  				cmp := s.cmp(m.Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key)
  1453  				// Compaction is ending at exclusive bound of c.maxIntervalIndex+1
  1454  				if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) {
  1455  					break
  1456  				}
  1457  				baseCompacting = baseCompacting || m.IsCompacting()
  1458  			}
  1459  			if baseCompacting {
  1460  				continue
  1461  			}
  1462  			return c, nil
  1463  		}
  1464  	}
  1465  	return nil, nil
  1466  }
  1467  
  1468  // Helper function for building an L0 -> Lbase compaction using a seed interval
  1469  // and seed file in that seed interval.
  1470  func (s *L0Sublevels) baseCompactionUsingSeed(
  1471  	f *FileMetadata, intervalIndex int, minCompactionDepth int,
  1472  ) *L0CompactionFiles {
  1473  	c := &L0CompactionFiles{
  1474  		FilesIncluded:        newBitSet(s.levelMetadata.Len()),
  1475  		seedInterval:         intervalIndex,
  1476  		seedIntervalMinLevel: 0,
  1477  		minIntervalIndex:     f.minIntervalIndex,
  1478  		maxIntervalIndex:     f.maxIntervalIndex,
  1479  	}
  1480  	c.addFile(f)
  1481  
  1482  	// The first iteration of this loop builds the compaction at the seed file's
  1483  	// sublevel. Future iterations expand on this compaction by stacking more
  1484  	// files from intervalIndex and repeating. This is an optional activity so
  1485  	// when it fails we can fallback to the last successful candidate.
  1486  	var lastCandidate *L0CompactionFiles
  1487  	interval := &s.orderedIntervals[intervalIndex]
  1488  
  1489  	for i := 0; i < len(interval.files); i++ {
  1490  		f2 := interval.files[i]
  1491  		sl := f2.SubLevel
  1492  		c.seedIntervalStackDepthReduction++
  1493  		c.seedIntervalMaxLevel = sl
  1494  		c.addFile(f2)
  1495  		// The seed file is in the lowest sublevel in the seed interval, but it
  1496  		// may overlap with other files in even lower sublevels. For correctness
  1497  		// we need to grow our interval to include those files, and capture all
  1498  		// files in the next level that fall in this extended interval and so
  1499  		// on. This can result in a triangular shape like the following where
  1500  		// again the X axis is the key intervals and the Y axis is oldest to
  1501  		// youngest. Note that it is not necessary for correctness to fill out
  1502  		// the shape at the higher sub-levels to make it more rectangular since
  1503  		// the invariant only requires that younger versions of a key not be
  1504  		// moved to Lbase while leaving behind older versions.
  1505  		//                     -
  1506  		//                    ---
  1507  		//                   -----
  1508  		// It may be better for performance to have a more rectangular shape
  1509  		// since the files being left behind will overlap with the same Lbase
  1510  		// key range as that of this compaction. But there is also the danger
  1511  		// that in trying to construct a more rectangular shape we will be
  1512  		// forced to pull in a file that is already compacting. We expect
  1513  		// extendCandidateToRectangle to eventually be called on this compaction
  1514  		// if it's chosen, at which point we would iterate backward and choose
  1515  		// those files. This logic is similar to compaction.grow for non-L0
  1516  		// compactions.
  1517  		done := false
  1518  		for currLevel := sl - 1; currLevel >= 0; currLevel-- {
  1519  			if !s.extendFiles(currLevel, math.MaxUint64, c) {
  1520  				// Failed to extend due to ongoing compaction.
  1521  				done = true
  1522  				break
  1523  			}
  1524  		}
  1525  		if done {
  1526  			break
  1527  		}
  1528  		// Observed some compactions using > 1GB from L0 in an import
  1529  		// experiment. Very long running compactions are not great as they
  1530  		// reduce concurrency while they run, and take a while to produce
  1531  		// results, though they're sometimes unavoidable. There is a tradeoff
  1532  		// here in that adding more depth is more efficient in reducing stack
  1533  		// depth, but long running compactions reduce flexibility in what can
  1534  		// run concurrently in L0 and even Lbase -> Lbase+1. An increase more
  1535  		// than 150% in bytes since the last candidate compaction (along with a
  1536  		// total compaction size in excess of 100mb), or a total compaction size
  1537  		// beyond a hard limit of 500mb, is criteria for rejecting this
  1538  		// candidate. This lets us prefer slow growths as we add files, while
  1539  		// still having a hard limit. Note that if this is the first compaction
  1540  		// candidate to reach a stack depth reduction of minCompactionDepth or
  1541  		// higher, this candidate will be chosen regardless.
  1542  		if lastCandidate == nil {
  1543  			lastCandidate = &L0CompactionFiles{}
  1544  		} else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth &&
  1545  			c.fileBytes > 100<<20 &&
  1546  			(float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) {
  1547  			break
  1548  		}
  1549  		*lastCandidate = *c
  1550  	}
  1551  	if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth {
  1552  		lastCandidate.FilesIncluded.clearAllBits()
  1553  		for _, f := range lastCandidate.Files {
  1554  			lastCandidate.FilesIncluded.markBit(f.L0Index)
  1555  		}
  1556  		return lastCandidate
  1557  	}
  1558  	return nil
  1559  }
  1560  
  1561  // Expands fields in the provided L0CompactionFiles instance (cFiles) to
  1562  // include overlapping files in the specified sublevel. Returns true if the
  1563  // compaction is possible (i.e. does not conflict with any base/intra-L0
  1564  // compacting files).
  1565  func (s *L0Sublevels) extendFiles(
  1566  	sl int, earliestUnflushedSeqNum uint64, cFiles *L0CompactionFiles,
  1567  ) bool {
  1568  	index := sort.Search(len(s.levelFiles[sl]), func(i int) bool {
  1569  		return s.levelFiles[sl][i].maxIntervalIndex >= cFiles.minIntervalIndex
  1570  	})
  1571  	for ; index < len(s.levelFiles[sl]); index++ {
  1572  		f := s.levelFiles[sl][index]
  1573  		if f.minIntervalIndex > cFiles.maxIntervalIndex {
  1574  			break
  1575  		}
  1576  		if f.IsCompacting() {
  1577  			return false
  1578  		}
  1579  		// Skip over files that are newer than earliestUnflushedSeqNum. This is
  1580  		// okay because this compaction can just pretend these files are not in
  1581  		// L0 yet. These files must be in higher sublevels than any overlapping
  1582  		// files with f.LargestSeqNum < earliestUnflushedSeqNum, and the output
  1583  		// of the compaction will also go in a lower (older) sublevel than this
  1584  		// file by definition.
  1585  		if f.LargestSeqNum >= earliestUnflushedSeqNum {
  1586  			continue
  1587  		}
  1588  		cFiles.addFile(f)
  1589  	}
  1590  	return true
  1591  }
  1592  
  1593  // PickIntraL0Compaction picks an intra-L0 compaction for files in this
  1594  // sublevel. This method is only called when a base compaction cannot be chosen.
  1595  // See comment above [PickBaseCompaction] for heuristics involved in this
  1596  // selection.
  1597  func (s *L0Sublevels) PickIntraL0Compaction(
  1598  	earliestUnflushedSeqNum uint64, minCompactionDepth int,
  1599  ) (*L0CompactionFiles, error) {
  1600  	scoredIntervals := make([]intervalAndScore, len(s.orderedIntervals))
  1601  	for i := range s.orderedIntervals {
  1602  		interval := &s.orderedIntervals[i]
  1603  		depth := len(interval.files) - interval.compactingFileCount
  1604  		if minCompactionDepth > depth {
  1605  			continue
  1606  		}
  1607  		scoredIntervals[i] = intervalAndScore{interval: i, score: depth}
  1608  	}
  1609  	sort.Sort(intervalSorterByDecreasingScore(scoredIntervals))
  1610  
  1611  	// Optimization to avoid considering different intervals that are likely to
  1612  	// choose the same seed file. Again this is just to reduce wasted work.
  1613  	consideredIntervals := newBitSet(len(s.orderedIntervals))
  1614  	for _, scoredInterval := range scoredIntervals {
  1615  		interval := &s.orderedIntervals[scoredInterval.interval]
  1616  		if consideredIntervals[interval.index] {
  1617  			continue
  1618  		}
  1619  
  1620  		var f *FileMetadata
  1621  		// Pick the seed file for the interval as the file in the highest
  1622  		// sub-level.
  1623  		stackDepthReduction := scoredInterval.score
  1624  		for i := len(interval.files) - 1; i >= 0; i-- {
  1625  			f = interval.files[i]
  1626  			if f.IsCompacting() {
  1627  				break
  1628  			}
  1629  			consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1)
  1630  			// Can this be the seed file? Files with newer sequence numbers than
  1631  			// earliestUnflushedSeqNum cannot be in the compaction.
  1632  			if f.LargestSeqNum >= earliestUnflushedSeqNum {
  1633  				stackDepthReduction--
  1634  				if stackDepthReduction == 0 {
  1635  					break
  1636  				}
  1637  			} else {
  1638  				break
  1639  			}
  1640  		}
  1641  		if stackDepthReduction < minCompactionDepth {
  1642  			// Can't use this interval.
  1643  			continue
  1644  		}
  1645  
  1646  		if f == nil {
  1647  			return nil, errors.New("no seed file found in sublevel intervals")
  1648  		}
  1649  		if f.IsCompacting() {
  1650  			// This file could be in a concurrent intra-L0 or base compaction.
  1651  			// Try another interval.
  1652  			continue
  1653  		}
  1654  
  1655  		// We have a seed file. Build a compaction off of that seed.
  1656  		c := s.intraL0CompactionUsingSeed(
  1657  			f, interval.index, earliestUnflushedSeqNum, minCompactionDepth)
  1658  		if c != nil {
  1659  			return c, nil
  1660  		}
  1661  	}
  1662  	return nil, nil
  1663  }
  1664  
  1665  func (s *L0Sublevels) intraL0CompactionUsingSeed(
  1666  	f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int,
  1667  ) *L0CompactionFiles {
  1668  	// We know that all the files that overlap with intervalIndex have
  1669  	// LargestSeqNum < earliestUnflushedSeqNum, but for other intervals
  1670  	// we need to exclude files >= earliestUnflushedSeqNum
  1671  
  1672  	c := &L0CompactionFiles{
  1673  		FilesIncluded:           newBitSet(s.levelMetadata.Len()),
  1674  		seedInterval:            intervalIndex,
  1675  		seedIntervalMaxLevel:    len(s.levelFiles) - 1,
  1676  		minIntervalIndex:        f.minIntervalIndex,
  1677  		maxIntervalIndex:        f.maxIntervalIndex,
  1678  		isIntraL0:               true,
  1679  		earliestUnflushedSeqNum: earliestUnflushedSeqNum,
  1680  	}
  1681  	c.addFile(f)
  1682  
  1683  	var lastCandidate *L0CompactionFiles
  1684  	interval := &s.orderedIntervals[intervalIndex]
  1685  	slIndex := len(interval.files) - 1
  1686  	for {
  1687  		if interval.files[slIndex] == f {
  1688  			break
  1689  		}
  1690  		slIndex--
  1691  	}
  1692  	// The first iteration of this loop produces an intra-L0 compaction at the
  1693  	// seed level. Iterations after that optionally add to the compaction by
  1694  	// stacking more files from intervalIndex and repeating. This is an optional
  1695  	// activity so when it fails we can fallback to the last successful
  1696  	// candidate. The code stops adding when it can't add more, or when
  1697  	// fileBytes grows too large.
  1698  	for ; slIndex >= 0; slIndex-- {
  1699  		f2 := interval.files[slIndex]
  1700  		sl := f2.SubLevel
  1701  		if f2.IsCompacting() {
  1702  			break
  1703  		}
  1704  		c.seedIntervalStackDepthReduction++
  1705  		c.seedIntervalMinLevel = sl
  1706  		c.addFile(f2)
  1707  		// The seed file captures all files in the higher level that fall in the
  1708  		// range of intervals. That may extend the range of intervals so for
  1709  		// correctness we need to capture all files in the next higher level
  1710  		// that fall in this extended interval and so on. This can result in an
  1711  		// inverted triangular shape like the following where again the X axis
  1712  		// is the key intervals and the Y axis is oldest to youngest. Note that
  1713  		// it is not necessary for correctness to fill out the shape at lower
  1714  		// sub-levels to make it more rectangular since the invariant only
  1715  		// requires that if we move an older seqnum for key k into a file that
  1716  		// has a higher seqnum, we also move all younger seqnums for that key k
  1717  		// into that file.
  1718  		//                  -----
  1719  		//                   ---
  1720  		//                    -
  1721  		// It may be better for performance to have a more rectangular shape
  1722  		// since it will reduce the stack depth for more intervals. But there is
  1723  		// also the danger that in explicitly trying to construct a more
  1724  		// rectangular shape we will be forced to pull in a file that is already
  1725  		// compacting. We assume that the performance concern is not a practical
  1726  		// issue.
  1727  		done := false
  1728  		for currLevel := sl + 1; currLevel < len(s.levelFiles); currLevel++ {
  1729  			if !s.extendFiles(currLevel, earliestUnflushedSeqNum, c) {
  1730  				// Failed to extend due to ongoing compaction.
  1731  				done = true
  1732  				break
  1733  			}
  1734  		}
  1735  		if done {
  1736  			break
  1737  		}
  1738  		if lastCandidate == nil {
  1739  			lastCandidate = &L0CompactionFiles{}
  1740  		} else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth &&
  1741  			c.fileBytes > 100<<20 &&
  1742  			(float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) {
  1743  			break
  1744  		}
  1745  		*lastCandidate = *c
  1746  	}
  1747  	if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth {
  1748  		lastCandidate.FilesIncluded.clearAllBits()
  1749  		for _, f := range lastCandidate.Files {
  1750  			lastCandidate.FilesIncluded.markBit(f.L0Index)
  1751  		}
  1752  		s.extendCandidateToRectangle(
  1753  			lastCandidate.minIntervalIndex, lastCandidate.maxIntervalIndex, lastCandidate, false)
  1754  		return lastCandidate
  1755  	}
  1756  	return nil
  1757  }
  1758  
  1759  // ExtendL0ForBaseCompactionTo extends the specified base compaction candidate
  1760  // L0CompactionFiles to optionally cover more files in L0 without "touching" any
  1761  // of the passed-in keys (i.e. the smallest/largest bounds are exclusive), as
  1762  // including any user keys for those internal keys could require choosing more
  1763  // files in LBase which is undesirable. Unbounded start/end keys are indicated
  1764  // by passing in the InvalidInternalKey.
  1765  func (s *L0Sublevels) ExtendL0ForBaseCompactionTo(
  1766  	smallest, largest InternalKey, candidate *L0CompactionFiles,
  1767  ) bool {
  1768  	firstIntervalIndex := 0
  1769  	lastIntervalIndex := len(s.orderedIntervals) - 1
  1770  	if smallest.Kind() != base.InternalKeyKindInvalid {
  1771  		if smallest.Trailer == base.InternalKeyRangeDeleteSentinel {
  1772  			// Starting at smallest.UserKey == interval.startKey is okay.
  1773  			firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1774  				return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) <= 0
  1775  			})
  1776  		} else {
  1777  			firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1778  				// Need to start at >= smallest since if we widen too much we may miss
  1779  				// an Lbase file that overlaps with an L0 file that will get picked in
  1780  				// this widening, which would be bad. This interval will not start with
  1781  				// an immediate successor key.
  1782  				return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) < 0
  1783  			})
  1784  		}
  1785  	}
  1786  	if largest.Kind() != base.InternalKeyKindInvalid {
  1787  		// First interval that starts at or beyond the largest. This interval will not
  1788  		// start with an immediate successor key.
  1789  		lastIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1790  			return s.cmp(largest.UserKey, s.orderedIntervals[i].startKey.key) <= 0
  1791  		})
  1792  		// Right now, lastIntervalIndex has a startKey that extends beyond largest.
  1793  		// The previous interval, by definition, has an end key higher than largest.
  1794  		// Iterate back twice to get the last interval that's completely within
  1795  		// (smallest, largest). Except in the case where we went past the end of the
  1796  		// list; in that case, the last interval to include is the very last
  1797  		// interval in the list.
  1798  		if lastIntervalIndex < len(s.orderedIntervals) {
  1799  			lastIntervalIndex--
  1800  		}
  1801  		lastIntervalIndex--
  1802  	}
  1803  	if lastIntervalIndex < firstIntervalIndex {
  1804  		return false
  1805  	}
  1806  	return s.extendCandidateToRectangle(firstIntervalIndex, lastIntervalIndex, candidate, true)
  1807  }
  1808  
  1809  // Best-effort attempt to make the compaction include more files in the
  1810  // rectangle defined by [minIntervalIndex, maxIntervalIndex] on the X axis and
  1811  // bounded on the Y axis by seedIntervalMinLevel and seedIntervalMaxLevel.
  1812  //
  1813  // This is strictly an optional extension; at any point where we can't feasibly
  1814  // add more files, the sublevel iteration can be halted early and candidate will
  1815  // still be a correct compaction candidate.
  1816  //
  1817  // Consider this scenario (original candidate is inside the rectangle), with
  1818  // isBase = true and interval bounds a-j (from the union of base file bounds and
  1819  // that of compaction candidate):
  1820  //
  1821  //	           _______
  1822  //	L0.3  a--d |  g-j|
  1823  //	L0.2       | f--j|         r-t
  1824  //	L0.1   b-d |e---j|
  1825  //	L0.0  a--d | f--j| l--o  p-----x
  1826  //
  1827  //	Lbase a--------i    m---------w
  1828  //
  1829  // This method will iterate from the bottom up. At L0.0, it will add a--d since
  1830  // it's in the bounds, then add b-d, then a--d, and so on, to produce this:
  1831  //
  1832  //	     _____________
  1833  //	L0.3 |a--d    g-j|
  1834  //	L0.2 |       f--j|         r-t
  1835  //	L0.1 | b-d  e---j|
  1836  //	L0.0 |a--d   f--j| l--o  p-----x
  1837  //
  1838  //	Lbase a-------i     m---------w
  1839  //
  1840  // Let's assume that, instead of a--d in the top sublevel, we had 3 files, a-b,
  1841  // bb-c, and cc-d, of which bb-c is compacting. Let's also add another sublevel
  1842  // L0.4 with some files, all of which aren't compacting:
  1843  //
  1844  //	L0.4  a------c ca--d _______
  1845  //	L0.3  a-b bb-c  cc-d |  g-j|
  1846  //	L0.2                 | f--j|         r-t
  1847  //	L0.1    b----------d |e---j|
  1848  //	L0.0  a------------d | f--j| l--o  p-----x
  1849  //
  1850  //	Lbase a------------------i    m---------w
  1851  //
  1852  // This method then needs to choose between the left side of L0.3 bb-c (i.e.
  1853  // a-b), or the right side (i.e. cc-d and g-j) for inclusion in this compaction.
  1854  // Since the right side has more files as well as one file that has already been
  1855  // picked, it gets chosen at that sublevel, resulting in this intermediate
  1856  // compaction:
  1857  //
  1858  //	L0.4  a------c ca--d
  1859  //	              ______________
  1860  //	L0.3  a-b bb-c| cc-d    g-j|
  1861  //	L0.2 _________|        f--j|         r-t
  1862  //	L0.1 |  b----------d  e---j|
  1863  //	L0.0 |a------------d   f--j| l--o  p-----x
  1864  //
  1865  //	Lbase a------------------i    m---------w
  1866  //
  1867  // Since bb-c had to be excluded at L0.3, the interval bounds for L0.4 are
  1868  // actually ca-j, since ca is the next interval start key after the end interval
  1869  // of bb-c. This would result in only ca-d being chosen at that sublevel, even
  1870  // though a--c is also not compacting. This is the final result:
  1871  //
  1872  //	              ______________
  1873  //	L0.4  a------c|ca--d       |
  1874  //	L0.3  a-b bb-c| cc-d    g-j|
  1875  //	L0.2 _________|        f--j|         r-t
  1876  //	L0.1 |  b----------d  e---j|
  1877  //	L0.0 |a------------d   f--j| l--o  p-----x
  1878  //
  1879  //	Lbase a------------------i    m---------w
  1880  //
  1881  // TODO(bilal): Add more targeted tests for this method, through
  1882  // ExtendL0ForBaseCompactionTo and intraL0CompactionUsingSeed.
func (s *L0Sublevels) extendCandidateToRectangle(
	minIntervalIndex int, maxIntervalIndex int, candidate *L0CompactionFiles, isBase bool,
) bool {
	// Record the candidate's bounds as they were before this extension.
	candidate.preExtensionMinInterval = candidate.minIntervalIndex
	candidate.preExtensionMaxInterval = candidate.maxIntervalIndex
	// Extend {min,max}IntervalIndex to include all of the candidate's current
	// bounds.
	if minIntervalIndex > candidate.minIntervalIndex {
		minIntervalIndex = candidate.minIntervalIndex
	}
	if maxIntervalIndex < candidate.maxIntervalIndex {
		maxIntervalIndex = candidate.maxIntervalIndex
	}
	// Sublevels are visited from startLevel towards endLevel (exclusive) in
	// steps of increment: upward from sublevel 0 when isBase is true, downward
	// from the top sublevel otherwise.
	var startLevel, increment, endLevel int
	if isBase {
		startLevel = 0
		increment = +1
		// seedIntervalMaxLevel is inclusive, while endLevel is exclusive.
		endLevel = candidate.seedIntervalMaxLevel + 1
	} else {
		startLevel = len(s.levelFiles) - 1
		increment = -1
		// seedIntervalMinLevel is inclusive, while endLevel is exclusive.
		endLevel = candidate.seedIntervalMinLevel - 1
	}
	// Stats for files: the number of files newly added to candidate by this
	// call. It determines the return value.
	addedCount := 0
	// Iterate from the oldest sub-level for L0 -> Lbase and youngest sub-level
	// for intra-L0. The idea here is that anything that can't be included from
	// that level constrains what can be included from the next level. This
	// change in constraint is directly incorporated into minIntervalIndex,
	// maxIntervalIndex.
	for sl := startLevel; sl != endLevel; sl += increment {
		files := s.levelFiles[sl]
		// Find the first file that overlaps with minIntervalIndex.
		index := sort.Search(len(files), func(i int) bool {
			return minIntervalIndex <= files[i].maxIntervalIndex
		})
		// Track the files that are fully within the current constraint of
		// [minIntervalIndex, maxIntervalIndex]. Both stay at -1 when no file
		// of this sublevel qualifies.
		firstIndex := -1
		lastIndex := -1
		for ; index < len(files); index++ {
			f := files[index]
			if f.minIntervalIndex > maxIntervalIndex {
				break
			}
			include := true
			// Extends out on the left so can't be included. This narrows what
			// we can include in the next level.
			if f.minIntervalIndex < minIntervalIndex {
				include = false
				minIntervalIndex = f.maxIntervalIndex + 1
			}
			// Extends out on the right so can't be included. This narrows
			// the constraint from the right in the same way.
			if f.maxIntervalIndex > maxIntervalIndex {
				include = false
				maxIntervalIndex = f.minIntervalIndex - 1
			}
			if !include {
				continue
			}
			if firstIndex == -1 {
				firstIndex = index
			}
			lastIndex = index
		}
		if minIntervalIndex > maxIntervalIndex {
			// We excluded files that prevent continuation.
			break
		}
		if firstIndex < 0 {
			// No files to add in this sub-level.
			continue
		}
		// We have the files in [firstIndex, lastIndex] as potential for
		// inclusion. Some of these may already have been picked. Some of them
		// may be already compacting. The latter is tricky since we have to
		// decide whether to contract minIntervalIndex or maxIntervalIndex when
		// we encounter an already compacting file. We pick the longest sequence
		// between firstIndex and lastIndex of non-compacting files -- this is
		// represented by [candidateNonCompactingFirst,
		// candidateNonCompactingLast].
		//
		// nonCompactingFirst is the start of the run of non-compacting files
		// currently being scanned, or -1 when the scan is not inside a run.
		nonCompactingFirst := -1
		currentRunHasAlreadyPickedFiles := false
		candidateNonCompactingFirst := -1
		candidateNonCompactingLast := -1
		candidateHasAlreadyPickedFiles := false
		for index = firstIndex; index <= lastIndex; index++ {
			f := files[index]
			if f.IsCompacting() {
				// A compacting file ends the current run, if any.
				if nonCompactingFirst != -1 {
					last := index - 1
					// Prioritize runs of consecutive non-compacting files that
					// have files that have already been picked. That is to say,
					// if candidateHasAlreadyPickedFiles == true, we stick with
					// it, and if currentRunHasAlreadyPickedFiles == true, we
					// pick that run even if it contains fewer files than the
					// previous candidate.
					if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 ||
						currentRunHasAlreadyPickedFiles ||
						(last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) {
						candidateNonCompactingFirst = nonCompactingFirst
						candidateNonCompactingLast = last
						candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles
					}
				}
				nonCompactingFirst = -1
				currentRunHasAlreadyPickedFiles = false
				continue
			}
			if nonCompactingFirst == -1 {
				nonCompactingFirst = index
			}
			if candidate.FilesIncluded[f.L0Index] {
				currentRunHasAlreadyPickedFiles = true
			}
		}
		// Logic duplicated from inside the for loop above, to account for a
		// run that extends up to lastIndex. candidateHasAlreadyPickedFiles is
		// not updated here since it is not read again for this sublevel.
		if nonCompactingFirst != -1 {
			last := index - 1
			if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 ||
				currentRunHasAlreadyPickedFiles ||
				(last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) {
				candidateNonCompactingFirst = nonCompactingFirst
				candidateNonCompactingLast = last
			}
		}
		if candidateNonCompactingFirst == -1 {
			// All files are compacting. There will be gaps that we could
			// exploit to continue, but don't bother.
			break
		}
		// May need to shrink [minIntervalIndex, maxIntervalIndex] for the next
		// level, so that it stays clear of the files adjacent to the chosen
		// run.
		if candidateNonCompactingFirst > firstIndex {
			minIntervalIndex = files[candidateNonCompactingFirst-1].maxIntervalIndex + 1
		}
		if candidateNonCompactingLast < lastIndex {
			maxIntervalIndex = files[candidateNonCompactingLast+1].minIntervalIndex - 1
		}
		// Add the files of the chosen run that are not in the candidate yet.
		for index := candidateNonCompactingFirst; index <= candidateNonCompactingLast; index++ {
			f := files[index]
			if f.IsCompacting() {
				// TODO(bilal): Do a logger.Fatalf instead of a panic, for
				// cleaner unwinding and error messages.
				panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum))
			}
			// Intra-L0 compactions leave out files that are not older than
			// the candidate's earliestUnflushedSeqNum.
			if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum {
				continue
			}
			if !candidate.FilesIncluded[f.L0Index] {
				addedCount++
				candidate.addFile(f)
			}
		}
	}
	// Report whether the extension added any file at all.
	return addedCount > 0
}