github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/manifest/l0_sublevels.go (about)

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	stdcmp "cmp"
    10  	"fmt"
    11  	"math"
    12  	"slices"
    13  	"sort"
    14  	"strings"
    15  
    16  	"github.com/cockroachdb/errors"
    17  	"github.com/cockroachdb/pebble/internal/base"
    18  	"github.com/cockroachdb/pebble/internal/invariants"
    19  )
    20  
// errInvalidL0SublevelsOpt is returned by AddL0Files (by way of
// addFileToSublevels) when the incremental sublevel generation optimization
// cannot be applied. On seeing it, the caller must fall back to rebuilding
// the sublevels from scratch with NewL0Sublevels.
var errInvalidL0SublevelsOpt = errors.New("pebble: L0 sublevel generation optimization cannot be used")
    24  
// intervalKey is a boundary between two adjacent key intervals.
//
// Intervals are of the form [start, end) with no gap between intervals. Each
// file overlaps perfectly with a sequence of intervals. This perfect overlap
// occurs because the union of file boundary keys is used to pick intervals.
// However the largest key in a file is inclusive, so when it is used as
// an interval, the actual key is ImmediateSuccessor(key). We don't have the
// ImmediateSuccessor function to do this computation, so we instead keep an
// isLargest bool to remind the code about this fact. This is used for
// comparisons (see intervalKeyCompare) in the following manner:
//   - intervalKey{k, false} < intervalKey{k, true}
//   - k1 < k2 -> intervalKey{k1, _} < intervalKey{k2, _}.
//
// Note that the file's largest key is exclusive if the internal key
// has a trailer matching the rangedel sentinel key. In this case, we set
// isLargest to false for end interval computation.
//
// For example, consider three files with bounds [a,e], [b,g], and [e,j]. The
// interval keys produced would be intervalKey{a, false}, intervalKey{b, false},
// intervalKey{e, false}, intervalKey{e, true}, intervalKey{g, true} and
// intervalKey{j, true}, resulting in intervals
// [a, b), [b, (e, false)), [(e,false), (e, true)), [(e, true), (g, true)) and
// [(g, true), (j, true)). The first file overlaps with the first three
// perfectly, the second file overlaps with the second through to fourth
// intervals, and the third file overlaps with the last three.
//
// The intervals are indexed starting from 0, with the index of the interval
// being the index of the start key of the interval.
//
// In addition to helping with compaction picking, we use interval indices
// to assign each file an interval range once. Subsequent operations, say
// picking overlapping files for a compaction, only need to use the index
// numbers and so avoid expensive byte slice comparisons.
type intervalKey struct {
	// key is the user key of the boundary.
	key []byte
	// isLargest is true when key was taken from a file's inclusive largest
	// key, in which case the boundary sorts immediately after the same key
	// with isLargest == false.
	isLargest bool
}
    60  
    61  // intervalKeyTemp is used in the sortAndSweep step. It contains additional metadata
    62  // which is used to generate the {min,max}IntervalIndex for files.
    63  type intervalKeyTemp struct {
    64  	intervalKey intervalKey
    65  	fileMeta    *FileMetadata
    66  	isEndKey    bool
    67  }
    68  
    69  func (i *intervalKeyTemp) setFileIntervalIndex(idx int) {
    70  	if i.isEndKey {
    71  		// This is the right endpoint of some file interval, so the
    72  		// file.maxIntervalIndex must be j - 1 as maxIntervalIndex is
    73  		// inclusive.
    74  		i.fileMeta.maxIntervalIndex = idx - 1
    75  		return
    76  	}
    77  	// This is the left endpoint for some file interval, so the
    78  	// file.minIntervalIndex must be j.
    79  	i.fileMeta.minIntervalIndex = idx
    80  }
    81  
    82  func intervalKeyCompare(cmp Compare, a, b intervalKey) int {
    83  	rv := cmp(a.key, b.key)
    84  	if rv == 0 {
    85  		if a.isLargest && !b.isLargest {
    86  			return +1
    87  		}
    88  		if !a.isLargest && b.isLargest {
    89  			return -1
    90  		}
    91  	}
    92  	return rv
    93  }
    94  
    95  type intervalKeySorter struct {
    96  	keys []intervalKeyTemp
    97  	cmp  Compare
    98  }
    99  
   100  func (s intervalKeySorter) Len() int { return len(s.keys) }
   101  func (s intervalKeySorter) Less(i, j int) bool {
   102  	return intervalKeyCompare(s.cmp, s.keys[i].intervalKey, s.keys[j].intervalKey) < 0
   103  }
   104  func (s intervalKeySorter) Swap(i, j int) {
   105  	s.keys[i], s.keys[j] = s.keys[j], s.keys[i]
   106  }
   107  
   108  // sortAndSweep will sort the intervalKeys using intervalKeySorter, remove the
   109  // duplicate fileIntervals, and set the {min, max}IntervalIndex for the files.
   110  func sortAndSweep(keys []intervalKeyTemp, cmp Compare) []intervalKeyTemp {
   111  	if len(keys) == 0 {
   112  		return nil
   113  	}
   114  	sorter := intervalKeySorter{keys: keys, cmp: cmp}
   115  	sort.Sort(sorter)
   116  
   117  	// intervalKeys are generated using the file bounds. Specifically, there are
   118  	// 2 intervalKeys for each file, and len(keys) = 2 * number of files. Each
   119  	// `intervalKeyTemp` stores information about which file it was generated
   120  	// from, and whether the key represents the end key of the file. So, as
   121  	// we're deduplicating the `keys` slice, we're guaranteed to iterate over
   122  	// the interval keys belonging to each of the files. Since the
   123  	// file.{min,max}IntervalIndex points to the position of the files bounds in
   124  	// the deduplicated `keys` slice, we can determine
   125  	// file.{min,max}IntervalIndex during the iteration.
   126  	i := 0
   127  	j := 0
   128  	for i < len(keys) {
   129  		// loop invariant: j <= i
   130  		currKey := keys[i]
   131  		keys[j] = keys[i]
   132  
   133  		for {
   134  			keys[i].setFileIntervalIndex(j)
   135  			i++
   136  			if i >= len(keys) || intervalKeyCompare(cmp, currKey.intervalKey, keys[i].intervalKey) != 0 {
   137  				break
   138  			}
   139  		}
   140  		j++
   141  	}
   142  	return keys[:j]
   143  }
   144  
// A key interval of the form [start, end). The end is not represented here
// since it is implicit in the start of the next interval. The last interval is
// an exception but we don't need to ever lookup the end of that interval; the
// last fileInterval will only act as an end key marker. The set of intervals
// is const after initialization.
type fileInterval struct {
	// The position of this interval in the ordered slice of intervals it
	// belongs to (see L0Sublevels.orderedIntervals).
	index int
	// The inclusive start of the interval.
	startKey intervalKey

	// True iff some file in this interval is compacting to base. Such intervals
	// cannot have any files participate in L0 -> Lbase compactions.
	isBaseCompacting bool

	// The min and max intervals index across all the files that overlap with
	// this interval. Inclusive on both sides. An interval with no files refers
	// to itself in both fields.
	filesMinIntervalIndex int
	filesMaxIntervalIndex int

	// True if another interval that has a file extending into this interval is
	// undergoing a compaction into Lbase. In other words, this bool is true if
	// any interval in [filesMinIntervalIndex, filesMaxIntervalIndex] has
	// isBaseCompacting set to true. This lets the compaction picker
	// de-prioritize this interval for picking compactions, since there's a high
	// chance that a base compaction with a sufficient height of sublevels
	// rooted at this interval could not be chosen due to the ongoing base
	// compaction in the other interval. If the file straddling the two
	// intervals is at a sufficiently high sublevel (with enough compactible
	// files below it to satisfy minCompactionDepth), this is not an issue, but
	// to optimize for quickly picking base compactions far away from other base
	// compactions, this bool is used as a heuristic (but not as a complete
	// disqualifier).
	intervalRangeIsBaseCompacting bool

	// All files in this interval, in increasing sublevel order.
	files []*FileMetadata

	// len(files) - compactingFileCount is the stack depth that requires
	// starting new compactions. This metric is not precise since the
	// compactingFileCount can include files that are part of N (where N > 1)
	// intra-L0 compactions, so the stack depth after those complete will be
	// len(files) - compactingFileCount + N. We ignore this imprecision since we
	// don't want to track which files are part of which intra-L0 compaction.
	compactingFileCount int

	// Interpolated from files in this interval. For files spanning multiple
	// intervals, we assume an equal distribution of bytes across all those
	// intervals.
	estimatedBytes uint64
}
   194  
   195  // Helper type for any cases requiring a bool slice.
   196  type bitSet []bool
   197  
   198  func newBitSet(n int) bitSet {
   199  	return make([]bool, n)
   200  }
   201  
   202  func (b *bitSet) markBit(i int) {
   203  	(*b)[i] = true
   204  }
   205  
   206  func (b *bitSet) markBits(start, end int) {
   207  	for i := start; i < end; i++ {
   208  		(*b)[i] = true
   209  	}
   210  }
   211  
   212  func (b *bitSet) clearAllBits() {
   213  	for i := range *b {
   214  		(*b)[i] = false
   215  	}
   216  }
   217  
// L0Compaction describes an active compaction with inputs from L0.
type L0Compaction struct {
	// Smallest and Largest are presumably the internal-key bounds of the
	// compaction's inputs -- NOTE(review): confirm against the code that
	// constructs L0Compaction values.
	Smallest InternalKey
	Largest  InternalKey
	// IsIntraL0 presumably marks an intra-L0 compaction, as opposed to one
	// into the base level -- NOTE(review): confirm against the consumer.
	IsIntraL0 bool
}
   224  
// L0Sublevels represents a sublevel view of SSTables in L0. Tables in one
// sublevel are non-overlapping in key ranges, and keys in higher-indexed
// sublevels shadow older versions in lower-indexed sublevels. These invariants
// are similar to the regular level invariants, except with higher indexed
// sublevels having newer keys as opposed to lower indexed levels.
//
// There is no limit to the number of sublevels that can exist in L0 at any
// time, however read and compaction performance is best when there are as few
// sublevels as possible.
type L0Sublevels struct {
	// Levels are ordered from oldest sublevel to youngest sublevel in the
	// outer slice, and the inner slice contains non-overlapping files for
	// that sublevel in increasing key order. Levels is constructed from
	// levelFiles and is used by callers that require a LevelSlice. The below two
	// fields are treated as immutable once created in NewL0Sublevels.
	Levels     []LevelSlice
	levelFiles [][]*FileMetadata

	// cmp compares user keys; formatKey is stored alongside it at
	// construction time.
	cmp       Compare
	formatKey base.FormatKey

	// The sum of the sizes of all files added to the sublevels.
	fileBytes uint64
	// All the L0 files, ordered from oldest to youngest.
	levelMetadata *LevelMetadata

	// The file intervals in increasing key order.
	orderedIntervals []fileInterval

	// Keys to break flushes at.
	flushSplitUserKeys [][]byte

	// Only used to check invariants: set once AddL0Files has been called on
	// this value, which must happen at most once per receiver.
	addL0FilesCalled bool
}
   259  
   260  type sublevelSorter []*FileMetadata
   261  
   262  // Len implements sort.Interface.
   263  func (sl sublevelSorter) Len() int {
   264  	return len(sl)
   265  }
   266  
   267  // Less implements sort.Interface.
   268  func (sl sublevelSorter) Less(i, j int) bool {
   269  	return sl[i].minIntervalIndex < sl[j].minIntervalIndex
   270  }
   271  
   272  // Swap implements sort.Interface.
   273  func (sl sublevelSorter) Swap(i, j int) {
   274  	sl[i], sl[j] = sl[j], sl[i]
   275  }
   276  
   277  // NewL0Sublevels creates an L0Sublevels instance for a given set of L0 files.
   278  // These files must all be in L0 and must be sorted by seqnum (see
   279  // SortBySeqNum). During interval iteration, when flushSplitMaxBytes bytes are
   280  // exceeded in the range of intervals since the last flush split key, a flush
   281  // split key is added.
   282  //
   283  // This method can be called without DB.mu being held, so any DB.mu protected
   284  // fields in FileMetadata cannot be accessed here, such as Compacting and
   285  // IsIntraL0Compacting. Those fields are accessed in InitCompactingFileInfo
   286  // instead.
   287  func NewL0Sublevels(
   288  	levelMetadata *LevelMetadata, cmp Compare, formatKey base.FormatKey, flushSplitMaxBytes int64,
   289  ) (*L0Sublevels, error) {
   290  	s := &L0Sublevels{cmp: cmp, formatKey: formatKey}
   291  	s.levelMetadata = levelMetadata
   292  	keys := make([]intervalKeyTemp, 0, 2*s.levelMetadata.Len())
   293  	iter := levelMetadata.Iter()
   294  	for i, f := 0, iter.First(); f != nil; i, f = i+1, iter.Next() {
   295  		f.L0Index = i
   296  		keys = append(keys, intervalKeyTemp{
   297  			intervalKey: intervalKey{key: f.Smallest.UserKey},
   298  			fileMeta:    f,
   299  			isEndKey:    false,
   300  		})
   301  		keys = append(keys, intervalKeyTemp{
   302  			intervalKey: intervalKey{
   303  				key:       f.Largest.UserKey,
   304  				isLargest: !f.Largest.IsExclusiveSentinel(),
   305  			},
   306  			fileMeta: f,
   307  			isEndKey: true,
   308  		})
   309  	}
   310  	keys = sortAndSweep(keys, cmp)
   311  	// All interval indices reference s.orderedIntervals.
   312  	s.orderedIntervals = make([]fileInterval, len(keys))
   313  	for i := range keys {
   314  		s.orderedIntervals[i] = fileInterval{
   315  			index:                 i,
   316  			startKey:              keys[i].intervalKey,
   317  			filesMinIntervalIndex: i,
   318  			filesMaxIntervalIndex: i,
   319  		}
   320  	}
   321  	// Initialize minIntervalIndex and maxIntervalIndex for each file, and use that
   322  	// to update intervals.
   323  	for f := iter.First(); f != nil; f = iter.Next() {
   324  		if err := s.addFileToSublevels(f, false /* checkInvariant */); err != nil {
   325  			return nil, err
   326  		}
   327  	}
   328  	// Sort each sublevel in increasing key order.
   329  	for i := range s.levelFiles {
   330  		sort.Sort(sublevelSorter(s.levelFiles[i]))
   331  	}
   332  
   333  	// Construct a parallel slice of sublevel B-Trees.
   334  	// TODO(jackson): Consolidate and only use the B-Trees.
   335  	for _, sublevelFiles := range s.levelFiles {
   336  		tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles)
   337  		s.Levels = append(s.Levels, ls)
   338  		tr.Release()
   339  	}
   340  
   341  	s.calculateFlushSplitKeys(flushSplitMaxBytes)
   342  	return s, nil
   343  }
   344  
   345  // Helper function to merge new intervalKeys into an existing slice of old
   346  // fileIntervals, into result. Returns the new result and a slice of ints
   347  // mapping old interval indices to new ones. The added intervalKeys do not need
   348  // to be sorted; they get sorted and deduped in this function.
   349  func mergeIntervals(
   350  	old, result []fileInterval, added []intervalKeyTemp, compare Compare,
   351  ) ([]fileInterval, []int) {
   352  	sorter := intervalKeySorter{keys: added, cmp: compare}
   353  	sort.Sort(sorter)
   354  
   355  	oldToNewMap := make([]int, len(old))
   356  	i := 0
   357  	j := 0
   358  
   359  	for i < len(old) || j < len(added) {
   360  		for j > 0 && j < len(added) && intervalKeyCompare(compare, added[j-1].intervalKey, added[j].intervalKey) == 0 {
   361  			added[j].setFileIntervalIndex(len(result) - 1)
   362  			j++
   363  		}
   364  		if i >= len(old) && j >= len(added) {
   365  			break
   366  		}
   367  		var cmp int
   368  		if i >= len(old) {
   369  			cmp = +1
   370  		}
   371  		if j >= len(added) {
   372  			cmp = -1
   373  		}
   374  		if cmp == 0 {
   375  			cmp = intervalKeyCompare(compare, old[i].startKey, added[j].intervalKey)
   376  		}
   377  		switch {
   378  		case cmp <= 0:
   379  			// Shallow-copy the existing interval.
   380  			newInterval := old[i]
   381  			result = append(result, newInterval)
   382  			oldToNewMap[i] = len(result) - 1
   383  			i++
   384  			if cmp == 0 {
   385  				added[j].setFileIntervalIndex(len(result) - 1)
   386  				j++
   387  			}
   388  		case cmp > 0:
   389  			var prevInterval fileInterval
   390  			// Insert a new interval for a newly-added file. prevInterval, if
   391  			// non-zero, will be "inherited"; we copy its files as those extend
   392  			// into this interval.
   393  			if len(result) > 0 {
   394  				prevInterval = result[len(result)-1]
   395  			}
   396  			newInterval := fileInterval{
   397  				index:                 len(result),
   398  				startKey:              added[j].intervalKey,
   399  				filesMinIntervalIndex: len(result),
   400  				filesMaxIntervalIndex: len(result),
   401  
   402  				// estimatedBytes gets recalculated later on, as the number of intervals
   403  				// the file bytes are interpolated over has changed.
   404  				estimatedBytes: 0,
   405  				// Copy the below attributes from prevInterval.
   406  				files:                         append([]*FileMetadata(nil), prevInterval.files...),
   407  				isBaseCompacting:              prevInterval.isBaseCompacting,
   408  				intervalRangeIsBaseCompacting: prevInterval.intervalRangeIsBaseCompacting,
   409  				compactingFileCount:           prevInterval.compactingFileCount,
   410  			}
   411  			result = append(result, newInterval)
   412  			added[j].setFileIntervalIndex(len(result) - 1)
   413  			j++
   414  		}
   415  	}
   416  	return result, oldToNewMap
   417  }
   418  
   419  // AddL0Files incrementally builds a new L0Sublevels for when the only change
   420  // since the receiver L0Sublevels was an addition of the specified files, with
   421  // no L0 deletions. The common case of this is an ingestion or a flush. These
   422  // files can "sit on top" of existing sublevels, creating at most one new
   423  // sublevel for a flush (and possibly multiple for an ingestion), and at most
   424  // 2*len(files) additions to s.orderedIntervals. No files must have been deleted
   425  // from L0, and the added files must all be newer in sequence numbers than
   426  // existing files in L0Sublevels. The files parameter must be sorted in seqnum
   427  // order. The levelMetadata parameter corresponds to the new L0 post addition of
   428  // files. This method is meant to be significantly more performant than
   429  // NewL0Sublevels.
   430  //
   431  // Note that this function can only be called once on a given receiver; it
   432  // appends to some slices in s which is only safe when done once. This is okay,
   433  // as the common case (generating a new L0Sublevels after a flush/ingestion) is
   434  // only going to necessitate one call of this method on a given receiver. The
   435  // returned value, if non-nil, can then have [*L0Sublevels.AddL0Files] called on
   436  // it again, and so on. If [errInvalidL0SublevelsOpt] is returned as an error,
   437  // it likely means the optimization could not be applied (i.e. files added were
   438  // older than files already in the sublevels, which is possible around
   439  // ingestions and in tests). Eg. it can happen when an ingested file was
   440  // ingested without queueing a flush since it did not actually overlap with any
   441  // keys in the memtable. Later on the memtable was flushed, and the memtable had
   442  // keys spanning around the ingested file, producing a flushed file that
   443  // overlapped with the ingested file in file bounds but not in keys. It's
   444  // possible for that flushed file to have a lower LargestSeqNum than the
   445  // ingested file if all the additions after the ingestion were to another
   446  // flushed file that was split into a separate sstable during flush. Any other
   447  // non-nil error means [L0Sublevels] generation failed in the same way as
   448  // [NewL0Sublevels] would likely fail.
   449  func (s *L0Sublevels) AddL0Files(
   450  	files []*FileMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata,
   451  ) (*L0Sublevels, error) {
   452  	if invariants.Enabled && s.addL0FilesCalled {
   453  		panic("AddL0Files called twice on the same receiver")
   454  	}
   455  	s.addL0FilesCalled = true
   456  
   457  	// Start with a shallow copy of s.
   458  	newVal := &L0Sublevels{}
   459  	*newVal = *s
   460  
   461  	newVal.addL0FilesCalled = false
   462  	newVal.levelMetadata = levelMetadata
   463  	// Deep copy levelFiles and Levels, as they are mutated and sorted below.
   464  	// Shallow copies of slices that we just append to, are okay.
   465  	newVal.levelFiles = make([][]*FileMetadata, len(s.levelFiles))
   466  	for i := range s.levelFiles {
   467  		newVal.levelFiles[i] = make([]*FileMetadata, len(s.levelFiles[i]))
   468  		copy(newVal.levelFiles[i], s.levelFiles[i])
   469  	}
   470  	newVal.Levels = make([]LevelSlice, len(s.Levels))
   471  	copy(newVal.Levels, s.Levels)
   472  
   473  	fileKeys := make([]intervalKeyTemp, 0, 2*len(files))
   474  	for _, f := range files {
   475  		left := intervalKeyTemp{
   476  			intervalKey: intervalKey{key: f.Smallest.UserKey},
   477  			fileMeta:    f,
   478  		}
   479  		right := intervalKeyTemp{
   480  			intervalKey: intervalKey{
   481  				key:       f.Largest.UserKey,
   482  				isLargest: !f.Largest.IsExclusiveSentinel(),
   483  			},
   484  			fileMeta: f,
   485  			isEndKey: true,
   486  		}
   487  		fileKeys = append(fileKeys, left, right)
   488  	}
   489  	keys := make([]fileInterval, 0, 2*levelMetadata.Len())
   490  	var oldToNewMap []int
   491  	// We can avoid the sortAndSweep step on the combined length of
   492  	// s.orderedIntervals and fileKeys by treating this as a merge of two sorted
   493  	// runs, fileKeys and s.orderedIntervals, into `keys` which will form
   494  	// newVal.orderedIntervals.
   495  	keys, oldToNewMap = mergeIntervals(s.orderedIntervals, keys, fileKeys, s.cmp)
   496  	if invariants.Enabled {
   497  		for i := 1; i < len(keys); i++ {
   498  			if intervalKeyCompare(newVal.cmp, keys[i-1].startKey, keys[i].startKey) >= 0 {
   499  				panic("keys not sorted correctly")
   500  			}
   501  		}
   502  	}
   503  	newVal.orderedIntervals = keys
   504  	// Update indices in s.orderedIntervals for fileIntervals we retained.
   505  	for _, newIdx := range oldToNewMap {
   506  		newInterval := &keys[newIdx]
   507  		newInterval.index = newIdx
   508  		// This code, and related code in the for loop below, adjusts
   509  		// files{Min,Max}IntervalIndex just for interval indices shifting due to
   510  		// new intervals, and not for any of the new files being added to the
   511  		// same intervals. The goal is to produce a state of the system that's
   512  		// accurate for all existing files, and has all the new intervals to
   513  		// support new files. Once that's done, we can just call
   514  		// addFileToSublevel to adjust all relevant intervals for new files.
   515  		newInterval.filesMinIntervalIndex = oldToNewMap[newInterval.filesMinIntervalIndex]
   516  		// maxIntervalIndexes are special. Since it's an inclusive end bound, we
   517  		// actually have to map it to the _next_ old interval's new previous
   518  		// interval. This logic is easier to understand if you see
   519  		// [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex,
   520  		// f.maxIntervalIndex+1). The other case to remember is when the
   521  		// interval is completely empty (i.e. len(newInterval.files) == 0); in
   522  		// that case we want to refer back to ourselves regardless of additions
   523  		// to the right of us.
   524  		if newInterval.filesMaxIntervalIndex < len(oldToNewMap)-1 && len(newInterval.files) > 0 {
   525  			newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex+1] - 1
   526  		} else {
   527  			// newInterval.filesMaxIntervalIndex == len(oldToNewMap)-1.
   528  			newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex]
   529  		}
   530  	}
   531  	// Loop through all instances of new intervals added between two old
   532  	// intervals and expand [filesMinIntervalIndex, filesMaxIntervalIndex] of
   533  	// new intervals to reflect that of adjacent old intervals.
   534  	{
   535  		// We can skip cases where new intervals were added to the left of all
   536  		// existing intervals (eg. if the first entry in oldToNewMap is
   537  		// oldToNewMap[0] >= 1). Those intervals will only contain newly added
   538  		// files and will have their parameters adjusted down in
   539  		// addFileToSublevels. The same can also be said about new intervals
   540  		// that are to the right of all existing intervals.
   541  		lastIdx := 0
   542  		for _, newIdx := range oldToNewMap {
   543  			for i := lastIdx + 1; i < newIdx; i++ {
   544  				minIntervalIndex := i
   545  				maxIntervalIndex := i
   546  				if keys[lastIdx].filesMaxIntervalIndex != lastIdx {
   547  					// Last old interval has files extending into keys[i].
   548  					minIntervalIndex = keys[lastIdx].filesMinIntervalIndex
   549  					maxIntervalIndex = keys[lastIdx].filesMaxIntervalIndex
   550  				}
   551  
   552  				keys[i].filesMinIntervalIndex = minIntervalIndex
   553  				keys[i].filesMaxIntervalIndex = maxIntervalIndex
   554  			}
   555  			lastIdx = newIdx
   556  		}
   557  	}
   558  	// Go through old files and update interval indices.
   559  	//
   560  	// TODO(bilal): This is the only place in this method where we loop through
   561  	// all existing files, which could be much more in number than newly added
   562  	// files. See if we can avoid the need for this, either by getting rid of
   563  	// f.minIntervalIndex and f.maxIntervalIndex and calculating them on the fly
   564  	// with a binary search, or by only looping through files to the right of
   565  	// the first interval touched by this method.
   566  	for sublevel := range s.Levels {
   567  		s.Levels[sublevel].Each(func(f *FileMetadata) {
   568  			oldIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1
   569  			oldMinIntervalIndex := f.minIntervalIndex
   570  			f.minIntervalIndex = oldToNewMap[f.minIntervalIndex]
   571  			// maxIntervalIndex is special. Since it's an inclusive end bound,
   572  			// we actually have to map it to the _next_ old interval's new
   573  			// previous interval. This logic is easier to understand if you see
   574  			// [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex,
   575  			// f.maxIntervalIndex+1).
   576  			f.maxIntervalIndex = oldToNewMap[f.maxIntervalIndex+1] - 1
   577  			newIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1
   578  			// Recalculate estimatedBytes for all old files across new
   579  			// intervals, but only if new intervals were added in between.
   580  			if oldIntervalDelta != newIntervalDelta {
   581  				// j is incremented so that oldToNewMap[j] points to the next
   582  				// old interval. This is used to distinguish between old
   583  				// intervals (i.e. ones where we need to subtract
   584  				// f.Size/oldIntervalDelta) from new ones (where we don't need
   585  				// to subtract). In both cases we need to add
   586  				// f.Size/newIntervalDelta.
   587  				j := oldMinIntervalIndex
   588  				for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   589  					if oldToNewMap[j] == i {
   590  						newVal.orderedIntervals[i].estimatedBytes -= f.Size / uint64(oldIntervalDelta)
   591  						j++
   592  					}
   593  					newVal.orderedIntervals[i].estimatedBytes += f.Size / uint64(newIntervalDelta)
   594  				}
   595  			}
   596  		})
   597  	}
   598  	updatedSublevels := make([]int, 0)
   599  	// Update interval indices for new files.
   600  	for i, f := range files {
   601  		f.L0Index = s.levelMetadata.Len() + i
   602  		if err := newVal.addFileToSublevels(f, true /* checkInvariant */); err != nil {
   603  			return nil, err
   604  		}
   605  		updatedSublevels = append(updatedSublevels, f.SubLevel)
   606  	}
   607  
   608  	// Sort and deduplicate updatedSublevels.
   609  	sort.Ints(updatedSublevels)
   610  	{
   611  		j := 0
   612  		for i := 1; i < len(updatedSublevels); i++ {
   613  			if updatedSublevels[i] != updatedSublevels[j] {
   614  				j++
   615  				updatedSublevels[j] = updatedSublevels[i]
   616  			}
   617  		}
   618  		updatedSublevels = updatedSublevels[:j+1]
   619  	}
   620  
   621  	// Sort each updated sublevel in increasing key order.
   622  	for _, sublevel := range updatedSublevels {
   623  		sort.Sort(sublevelSorter(newVal.levelFiles[sublevel]))
   624  	}
   625  
   626  	// Construct a parallel slice of sublevel B-Trees.
   627  	// TODO(jackson): Consolidate and only use the B-Trees.
   628  	for _, sublevel := range updatedSublevels {
   629  		tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel])
   630  		if sublevel == len(newVal.Levels) {
   631  			newVal.Levels = append(newVal.Levels, ls)
   632  		} else {
   633  			// sublevel < len(s.Levels). If this panics, updatedSublevels was not
   634  			// populated correctly.
   635  			newVal.Levels[sublevel] = ls
   636  		}
   637  		tr.Release()
   638  	}
   639  
   640  	newVal.flushSplitUserKeys = nil
   641  	newVal.calculateFlushSplitKeys(flushSplitMaxBytes)
   642  	return newVal, nil
   643  }
   644  
   645  // addFileToSublevels is called during L0Sublevels generation, and adds f to the
   646  // correct sublevel's levelFiles, the relevant intervals' files slices, and sets
   647  // interval indices on f. This method, if called successively on multiple files,
   648  // _must_ be called on successively newer files (by seqnum). If checkInvariant
   649  // is true, it could check for this in some cases and return
   650  // [errInvalidL0SublevelsOpt] if that invariant isn't held.
   651  func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) error {
   652  	// This is a simple and not very accurate estimate of the number of
   653  	// bytes this SSTable contributes to the intervals it is a part of.
   654  	//
   655  	// TODO(bilal): Call EstimateDiskUsage in sstable.Reader with interval
   656  	// bounds to get a better estimate for each interval.
   657  	interpolatedBytes := f.Size / uint64(f.maxIntervalIndex-f.minIntervalIndex+1)
   658  	s.fileBytes += f.Size
   659  	subLevel := 0
   660  	// Update state in every fileInterval for this file.
   661  	for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   662  		interval := &s.orderedIntervals[i]
   663  		if len(interval.files) > 0 {
   664  			if checkInvariant && interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum {
   665  				// We are sliding this file "underneath" an existing file. Throw away
   666  				// and start over in NewL0Sublevels.
   667  				return errInvalidL0SublevelsOpt
   668  			}
   669  			// interval.files is sorted by sublevels, from lowest to highest.
   670  			// AddL0Files can only add files at sublevels higher than existing files
   671  			// in the same key intervals.
   672  			if maxSublevel := interval.files[len(interval.files)-1].SubLevel; subLevel <= maxSublevel {
   673  				subLevel = maxSublevel + 1
   674  			}
   675  		}
   676  		interval.estimatedBytes += interpolatedBytes
   677  		if f.minIntervalIndex < interval.filesMinIntervalIndex {
   678  			interval.filesMinIntervalIndex = f.minIntervalIndex
   679  		}
   680  		if f.maxIntervalIndex > interval.filesMaxIntervalIndex {
   681  			interval.filesMaxIntervalIndex = f.maxIntervalIndex
   682  		}
   683  		interval.files = append(interval.files, f)
   684  	}
   685  	f.SubLevel = subLevel
   686  	if subLevel > len(s.levelFiles) {
   687  		return errors.Errorf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles))
   688  	}
   689  	if subLevel == len(s.levelFiles) {
   690  		s.levelFiles = append(s.levelFiles, []*FileMetadata{f})
   691  	} else {
   692  		s.levelFiles[subLevel] = append(s.levelFiles[subLevel], f)
   693  	}
   694  	return nil
   695  }
   696  
   697  func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) {
   698  	var cumulativeBytes uint64
   699  	// Multiply flushSplitMaxBytes by the number of sublevels. This prevents
   700  	// excessive flush splitting when the number of sublevels increases.
   701  	flushSplitMaxBytes *= int64(len(s.levelFiles))
   702  	for i := 0; i < len(s.orderedIntervals); i++ {
   703  		interval := &s.orderedIntervals[i]
   704  		if flushSplitMaxBytes > 0 && cumulativeBytes > uint64(flushSplitMaxBytes) &&
   705  			(len(s.flushSplitUserKeys) == 0 ||
   706  				!bytes.Equal(interval.startKey.key, s.flushSplitUserKeys[len(s.flushSplitUserKeys)-1])) {
   707  			s.flushSplitUserKeys = append(s.flushSplitUserKeys, interval.startKey.key)
   708  			cumulativeBytes = 0
   709  		}
   710  		cumulativeBytes += s.orderedIntervals[i].estimatedBytes
   711  	}
   712  }
   713  
   714  // InitCompactingFileInfo initializes internal flags relating to compacting
   715  // files. Must be called after sublevel initialization.
   716  //
   717  // Requires DB.mu *and* the manifest lock to be held.
   718  func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) {
   719  	for i := range s.orderedIntervals {
   720  		s.orderedIntervals[i].compactingFileCount = 0
   721  		s.orderedIntervals[i].isBaseCompacting = false
   722  		s.orderedIntervals[i].intervalRangeIsBaseCompacting = false
   723  	}
   724  
   725  	iter := s.levelMetadata.Iter()
   726  	for f := iter.First(); f != nil; f = iter.Next() {
   727  		if invariants.Enabled {
   728  			if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) {
   729  				panic(fmt.Sprintf("f.minIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s",
   730  					s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(f.Smallest.UserKey)))
   731  			}
   732  			if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) {
   733  				panic(fmt.Sprintf("f.maxIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s",
   734  					s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(f.Smallest.UserKey)))
   735  			}
   736  		}
   737  		if !f.IsCompacting() {
   738  			continue
   739  		}
   740  		if invariants.Enabled {
   741  			if s.cmp(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) != 0 || s.cmp(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) != 0 {
   742  				panic(fmt.Sprintf("file %s has inconsistent L0 Sublevel interval bounds: %s-%s, %s-%s", f.FileNum,
   743  					s.orderedIntervals[f.minIntervalIndex].startKey.key, s.orderedIntervals[f.maxIntervalIndex+1].startKey.key,
   744  					f.Smallest.UserKey, f.Largest.UserKey))
   745  			}
   746  		}
   747  		for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   748  			interval := &s.orderedIntervals[i]
   749  			interval.compactingFileCount++
   750  			if !f.IsIntraL0Compacting {
   751  				// If f.Compacting && !f.IsIntraL0Compacting, this file is
   752  				// being compacted to Lbase.
   753  				interval.isBaseCompacting = true
   754  			}
   755  		}
   756  	}
   757  
   758  	// Some intervals may be base compacting without the files contained within
   759  	// those intervals being marked as compacting. This is possible if the files
   760  	// were added after the compaction initiated, and the active compaction
   761  	// files straddle the input file. Mark these intervals as base compacting.
   762  	for _, c := range inProgress {
   763  		startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false}
   764  		endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()}
   765  		start, _ := slices.BinarySearchFunc(s.orderedIntervals, startIK, func(a fileInterval, b intervalKey) int {
   766  			return intervalKeyCompare(s.cmp, a.startKey, b)
   767  		})
   768  		end, _ := slices.BinarySearchFunc(s.orderedIntervals, endIK, func(a fileInterval, b intervalKey) int {
   769  			return intervalKeyCompare(s.cmp, a.startKey, b)
   770  		})
   771  		for i := start; i < end && i < len(s.orderedIntervals); i++ {
   772  			interval := &s.orderedIntervals[i]
   773  			if !c.IsIntraL0 {
   774  				interval.isBaseCompacting = true
   775  			}
   776  		}
   777  	}
   778  
   779  	min := 0
   780  	for i := range s.orderedIntervals {
   781  		interval := &s.orderedIntervals[i]
   782  		if interval.isBaseCompacting {
   783  			minIndex := interval.filesMinIntervalIndex
   784  			if minIndex < min {
   785  				minIndex = min
   786  			}
   787  			for j := minIndex; j <= interval.filesMaxIntervalIndex; j++ {
   788  				min = j
   789  				s.orderedIntervals[j].intervalRangeIsBaseCompacting = true
   790  			}
   791  		}
   792  	}
   793  }
   794  
   795  // String produces a string containing useful debug information. Useful in test
   796  // code and debugging.
   797  func (s *L0Sublevels) String() string {
   798  	return s.describe(false)
   799  }
   800  
   801  func (s *L0Sublevels) describe(verbose bool) string {
   802  	var buf strings.Builder
   803  	fmt.Fprintf(&buf, "file count: %d, sublevels: %d, intervals: %d\nflush split keys(%d): [",
   804  		s.levelMetadata.Len(), len(s.levelFiles), len(s.orderedIntervals), len(s.flushSplitUserKeys))
   805  	for i := range s.flushSplitUserKeys {
   806  		fmt.Fprintf(&buf, "%s", s.formatKey(s.flushSplitUserKeys[i]))
   807  		if i < len(s.flushSplitUserKeys)-1 {
   808  			fmt.Fprintf(&buf, ", ")
   809  		}
   810  	}
   811  	fmt.Fprintln(&buf, "]")
   812  	numCompactingFiles := 0
   813  	for i := len(s.levelFiles) - 1; i >= 0; i-- {
   814  		maxIntervals := 0
   815  		sumIntervals := 0
   816  		var totalBytes uint64
   817  		for _, f := range s.levelFiles[i] {
   818  			intervals := f.maxIntervalIndex - f.minIntervalIndex + 1
   819  			if intervals > maxIntervals {
   820  				maxIntervals = intervals
   821  			}
   822  			sumIntervals += intervals
   823  			totalBytes += f.Size
   824  			if f.IsCompacting() {
   825  				numCompactingFiles++
   826  			}
   827  		}
   828  		fmt.Fprintf(&buf, "0.%d: file count: %d, bytes: %d, width (mean, max): %0.1f, %d, interval range: [%d, %d]\n",
   829  			i, len(s.levelFiles[i]), totalBytes, float64(sumIntervals)/float64(len(s.levelFiles[i])), maxIntervals, s.levelFiles[i][0].minIntervalIndex,
   830  			s.levelFiles[i][len(s.levelFiles[i])-1].maxIntervalIndex)
   831  		for _, f := range s.levelFiles[i] {
   832  			intervals := f.maxIntervalIndex - f.minIntervalIndex + 1
   833  			if verbose {
   834  				fmt.Fprintf(&buf, "\t%s\n", f)
   835  			}
   836  			if s.levelMetadata.Len() > 50 && intervals*3 > len(s.orderedIntervals) {
   837  				var intervalsBytes uint64
   838  				for k := f.minIntervalIndex; k <= f.maxIntervalIndex; k++ {
   839  					intervalsBytes += s.orderedIntervals[k].estimatedBytes
   840  				}
   841  				fmt.Fprintf(&buf, "wide file: %d, [%d, %d], byte fraction: %f\n",
   842  					f.FileNum, f.minIntervalIndex, f.maxIntervalIndex,
   843  					float64(intervalsBytes)/float64(s.fileBytes))
   844  			}
   845  		}
   846  	}
   847  
   848  	lastCompactingIntervalStart := -1
   849  	fmt.Fprintf(&buf, "compacting file count: %d, base compacting intervals: ", numCompactingFiles)
   850  	i := 0
   851  	foundBaseCompactingIntervals := false
   852  	for ; i < len(s.orderedIntervals); i++ {
   853  		interval := &s.orderedIntervals[i]
   854  		if len(interval.files) == 0 {
   855  			continue
   856  		}
   857  		if !interval.isBaseCompacting {
   858  			if lastCompactingIntervalStart != -1 {
   859  				if foundBaseCompactingIntervals {
   860  					buf.WriteString(", ")
   861  				}
   862  				fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1)
   863  				foundBaseCompactingIntervals = true
   864  			}
   865  			lastCompactingIntervalStart = -1
   866  		} else {
   867  			if lastCompactingIntervalStart == -1 {
   868  				lastCompactingIntervalStart = i
   869  			}
   870  		}
   871  	}
   872  	if lastCompactingIntervalStart != -1 {
   873  		if foundBaseCompactingIntervals {
   874  			buf.WriteString(", ")
   875  		}
   876  		fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1)
   877  	} else if !foundBaseCompactingIntervals {
   878  		fmt.Fprintf(&buf, "none")
   879  	}
   880  	fmt.Fprintln(&buf, "")
   881  	return buf.String()
   882  }
   883  
   884  // ReadAmplification returns the contribution of L0Sublevels to the read
   885  // amplification for any particular point key. It is the maximum height of any
   886  // tracked fileInterval. This is always less than or equal to the number of
   887  // sublevels.
   888  func (s *L0Sublevels) ReadAmplification() int {
   889  	amp := 0
   890  	for i := range s.orderedIntervals {
   891  		interval := &s.orderedIntervals[i]
   892  		fileCount := len(interval.files)
   893  		if amp < fileCount {
   894  			amp = fileCount
   895  		}
   896  	}
   897  	return amp
   898  }
   899  
   900  // UserKeyRange encodes a key range in user key space. A UserKeyRange's Start
   901  // and End boundaries are both inclusive.
   902  type UserKeyRange struct {
   903  	Start, End []byte
   904  }
   905  
   906  // InUseKeyRanges returns the merged table bounds of L0 files overlapping the
   907  // provided user key range. The returned key ranges are sorted and
   908  // nonoverlapping.
   909  func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange {
   910  	// Binary search to find the provided keys within the intervals.
   911  	startIK := intervalKey{key: smallest, isLargest: false}
   912  	endIK := intervalKey{key: largest, isLargest: true}
   913  	start := sort.Search(len(s.orderedIntervals), func(i int) bool {
   914  		return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) > 0
   915  	})
   916  	if start > 0 {
   917  		// Back up to the first interval with a start key <= startIK.
   918  		start--
   919  	}
   920  	end := sort.Search(len(s.orderedIntervals), func(i int) bool {
   921  		return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) > 0
   922  	})
   923  
   924  	var keyRanges []UserKeyRange
   925  	var curr *UserKeyRange
   926  	for i := start; i < end; {
   927  		// Intervals with no files are not in use and can be skipped, once we
   928  		// end the current UserKeyRange.
   929  		if len(s.orderedIntervals[i].files) == 0 {
   930  			curr = nil
   931  			i++
   932  			continue
   933  		}
   934  
   935  		// If curr is nil, start a new in-use key range.
   936  		if curr == nil {
   937  			keyRanges = append(keyRanges, UserKeyRange{
   938  				Start: s.orderedIntervals[i].startKey.key,
   939  			})
   940  			curr = &keyRanges[len(keyRanges)-1]
   941  		}
   942  
   943  		// If the filesMaxIntervalIndex is not the current index, we can jump to
   944  		// the max index, knowing that all intermediary intervals are overlapped
   945  		// by some file.
   946  		if maxIdx := s.orderedIntervals[i].filesMaxIntervalIndex; maxIdx != i {
   947  			// Note that end may be less than or equal to maxIdx if we're
   948  			// concerned with a key range that ends before the interval at
   949  			// maxIdx starts. We must set curr.End now, before making that leap,
   950  			// because this iteration may be the last.
   951  			i = maxIdx
   952  			curr.End = s.orderedIntervals[i+1].startKey.key
   953  			continue
   954  		}
   955  
   956  		// No files overlapping with this interval overlap with the next
   957  		// interval. Update the current end to be the next interval's start key.
   958  		// Note that curr is not necessarily finished, because there may be an
   959  		// abutting non-empty interval.
   960  		curr.End = s.orderedIntervals[i+1].startKey.key
   961  		i++
   962  	}
   963  	return keyRanges
   964  }
   965  
   966  // FlushSplitKeys returns a slice of user keys to split flushes at. Used by
   967  // flushes to avoid writing sstables that straddle these split keys. These
   968  // should be interpreted as the keys to start the next sstable (not the last key
   969  // to include in the prev sstable). These are user keys so that range tombstones
   970  // can be properly truncated (untruncated range tombstones are not permitted for
   971  // L0 files).
   972  func (s *L0Sublevels) FlushSplitKeys() [][]byte {
   973  	return s.flushSplitUserKeys
   974  }
   975  
   976  // MaxDepthAfterOngoingCompactions returns an estimate of maximum depth of
   977  // sublevels after all ongoing compactions run to completion. Used by compaction
   978  // picker to decide compaction score for L0. There is no scoring for intra-L0
   979  // compactions -- they only run if L0 score is high but we're unable to pick an
   980  // L0 -> Lbase compaction.
   981  func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int {
   982  	depth := 0
   983  	for i := range s.orderedIntervals {
   984  		interval := &s.orderedIntervals[i]
   985  		intervalDepth := len(interval.files) - interval.compactingFileCount
   986  		if depth < intervalDepth {
   987  			depth = intervalDepth
   988  		}
   989  	}
   990  	return depth
   991  }
   992  
   993  // Only for temporary debugging in the absence of proper tests.
   994  //
   995  // TODO(bilal): Simplify away the debugging statements in this method, and make
   996  // this a pure sanity checker.
   997  //
   998  //lint:ignore U1000 - useful for debugging
   999  func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error {
  1000  	includedFiles := newBitSet(s.levelMetadata.Len())
  1001  	fileIntervalsByLevel := make([]struct {
  1002  		min int
  1003  		max int
  1004  	}, len(s.levelFiles))
  1005  	for i := range fileIntervalsByLevel {
  1006  		fileIntervalsByLevel[i].min = math.MaxInt32
  1007  		fileIntervalsByLevel[i].max = 0
  1008  	}
  1009  	var topLevel int
  1010  	var increment int
  1011  	var limitReached func(int) bool
  1012  	if c.isIntraL0 {
  1013  		topLevel = len(s.levelFiles) - 1
  1014  		increment = +1
  1015  		limitReached = func(level int) bool {
  1016  			return level == len(s.levelFiles)
  1017  		}
  1018  	} else {
  1019  		topLevel = 0
  1020  		increment = -1
  1021  		limitReached = func(level int) bool {
  1022  			return level < 0
  1023  		}
  1024  	}
  1025  	for _, f := range c.Files {
  1026  		if fileIntervalsByLevel[f.SubLevel].min > f.minIntervalIndex {
  1027  			fileIntervalsByLevel[f.SubLevel].min = f.minIntervalIndex
  1028  		}
  1029  		if fileIntervalsByLevel[f.SubLevel].max < f.maxIntervalIndex {
  1030  			fileIntervalsByLevel[f.SubLevel].max = f.maxIntervalIndex
  1031  		}
  1032  		includedFiles.markBit(f.L0Index)
  1033  		if c.isIntraL0 {
  1034  			if topLevel > f.SubLevel {
  1035  				topLevel = f.SubLevel
  1036  			}
  1037  		} else {
  1038  			if topLevel < f.SubLevel {
  1039  				topLevel = f.SubLevel
  1040  			}
  1041  		}
  1042  	}
  1043  	min := fileIntervalsByLevel[topLevel].min
  1044  	max := fileIntervalsByLevel[topLevel].max
  1045  	for level := topLevel; !limitReached(level); level += increment {
  1046  		if fileIntervalsByLevel[level].min < min {
  1047  			min = fileIntervalsByLevel[level].min
  1048  		}
  1049  		if fileIntervalsByLevel[level].max > max {
  1050  			max = fileIntervalsByLevel[level].max
  1051  		}
  1052  		index, _ := slices.BinarySearchFunc(s.levelFiles[level], min, func(a *FileMetadata, b int) int {
  1053  			return stdcmp.Compare(a.maxIntervalIndex, b)
  1054  		})
  1055  		// start := index
  1056  		for ; index < len(s.levelFiles[level]); index++ {
  1057  			f := s.levelFiles[level][index]
  1058  			if f.minIntervalIndex > max {
  1059  				break
  1060  			}
  1061  			if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum {
  1062  				return errors.Errorf(
  1063  					"sstable %s in compaction has sequence numbers higher than the earliest unflushed seqnum %d: %d-%d",
  1064  					f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum,
  1065  					f.LargestSeqNum)
  1066  			}
  1067  			if !includedFiles[f.L0Index] {
  1068  				var buf strings.Builder
  1069  				fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n",
  1070  					c.isIntraL0, c.seedInterval, level, index, f.L0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval,
  1071  					f.minIntervalIndex, f.maxIntervalIndex,
  1072  					f.FileNum, f.IsCompacting(), s)
  1073  				fmt.Fprintf(&buf, "files included:\n")
  1074  				for _, f := range c.Files {
  1075  					fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n",
  1076  						f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex)
  1077  				}
  1078  				fmt.Fprintf(&buf, "files added:\n")
  1079  				for _, f := range c.filesAdded {
  1080  					fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n",
  1081  						f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex)
  1082  				}
  1083  				return errors.New(buf.String())
  1084  			}
  1085  		}
  1086  	}
  1087  	return nil
  1088  }
  1089  
  1090  // UpdateStateForStartedCompaction updates internal L0Sublevels state for a
  1091  // recently started compaction. isBase specifies if this is a base compaction;
  1092  // if false, this is assumed to be an intra-L0 compaction. The specified
  1093  // compaction must be involving L0 SSTables. It's assumed that the Compacting
  1094  // and IsIntraL0Compacting fields are already set on all [FileMetadata]s passed
  1095  // in.
  1096  func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error {
  1097  	minIntervalIndex := -1
  1098  	maxIntervalIndex := 0
  1099  	for i := range inputs {
  1100  		iter := inputs[i].Iter()
  1101  		for f := iter.First(); f != nil; f = iter.Next() {
  1102  			for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
  1103  				interval := &s.orderedIntervals[i]
  1104  				interval.compactingFileCount++
  1105  			}
  1106  			if f.minIntervalIndex < minIntervalIndex || minIntervalIndex == -1 {
  1107  				minIntervalIndex = f.minIntervalIndex
  1108  			}
  1109  			if f.maxIntervalIndex > maxIntervalIndex {
  1110  				maxIntervalIndex = f.maxIntervalIndex
  1111  			}
  1112  		}
  1113  	}
  1114  	if isBase {
  1115  		for i := minIntervalIndex; i <= maxIntervalIndex; i++ {
  1116  			interval := &s.orderedIntervals[i]
  1117  			interval.isBaseCompacting = isBase
  1118  			for j := interval.filesMinIntervalIndex; j <= interval.filesMaxIntervalIndex; j++ {
  1119  				s.orderedIntervals[j].intervalRangeIsBaseCompacting = true
  1120  			}
  1121  		}
  1122  	}
  1123  	return nil
  1124  }
  1125  
  1126  // L0CompactionFiles represents a candidate set of L0 files for compaction. Also
  1127  // referred to as "lcf". Contains state information useful for generating the
  1128  // compaction (such as Files), as well as for picking between candidate
  1129  // compactions (eg. fileBytes and seedIntervalStackDepthReduction).
  1130  type L0CompactionFiles struct {
  1131  	Files []*FileMetadata
  1132  
  1133  	FilesIncluded bitSet
  1134  	// A "seed interval" is an interval with a high stack depth that was chosen
  1135  	// to bootstrap this compaction candidate. seedIntervalStackDepthReduction
  1136  	// is the number of sublevels that have a file in the seed interval that is
  1137  	// a part of this compaction.
  1138  	seedIntervalStackDepthReduction int
  1139  	// For base compactions, seedIntervalMinLevel is 0, and for intra-L0
  1140  	// compactions, seedIntervalMaxLevel is len(s.Files)-1 i.e. the highest
  1141  	// sublevel.
  1142  	seedIntervalMinLevel int
  1143  	seedIntervalMaxLevel int
  1144  	// Index of the seed interval.
  1145  	seedInterval int
  1146  	// Sum of file sizes for all files in this compaction.
  1147  	fileBytes uint64
  1148  	// Intervals with index [minIntervalIndex, maxIntervalIndex] are
  1149  	// participating in this compaction; it's the union set of all intervals
  1150  	// overlapped by participating files.
  1151  	minIntervalIndex int
  1152  	maxIntervalIndex int
  1153  
  1154  	// Set for intra-L0 compactions. SSTables with sequence numbers greater
  1155  	// than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions.
  1156  	isIntraL0               bool
  1157  	earliestUnflushedSeqNum uint64
  1158  
  1159  	// For debugging purposes only. Used in checkCompaction().
  1160  	preExtensionMinInterval int
  1161  	preExtensionMaxInterval int
  1162  	filesAdded              []*FileMetadata
  1163  }
  1164  
  1165  // Clone allocates a new L0CompactionFiles, with the same underlying data. Note
  1166  // that the two fileMetadata slices contain values that point to the same
  1167  // underlying fileMetadata object. This is safe because these objects are read
  1168  // only.
  1169  func (l *L0CompactionFiles) Clone() *L0CompactionFiles {
  1170  	oldLcf := *l
  1171  	return &oldLcf
  1172  }
  1173  
  1174  // String merely prints the starting address of the first file, if it exists.
  1175  func (l *L0CompactionFiles) String() string {
  1176  	if len(l.Files) > 0 {
  1177  		return fmt.Sprintf("First File Address: %p", &l.Files[0])
  1178  	}
  1179  	return ""
  1180  }
  1181  
  1182  // addFile adds the specified file to the LCF.
  1183  func (l *L0CompactionFiles) addFile(f *FileMetadata) {
  1184  	if l.FilesIncluded[f.L0Index] {
  1185  		return
  1186  	}
  1187  	l.FilesIncluded.markBit(f.L0Index)
  1188  	l.Files = append(l.Files, f)
  1189  	l.filesAdded = append(l.filesAdded, f)
  1190  	l.fileBytes += f.Size
  1191  	if f.minIntervalIndex < l.minIntervalIndex {
  1192  		l.minIntervalIndex = f.minIntervalIndex
  1193  	}
  1194  	if f.maxIntervalIndex > l.maxIntervalIndex {
  1195  		l.maxIntervalIndex = f.maxIntervalIndex
  1196  	}
  1197  }
  1198  
  1199  // Helper to order intervals being considered for compaction.
  1200  type intervalAndScore struct {
  1201  	interval int
  1202  	score    int
  1203  }
  1204  type intervalSorterByDecreasingScore []intervalAndScore
  1205  
  1206  func (is intervalSorterByDecreasingScore) Len() int { return len(is) }
  1207  func (is intervalSorterByDecreasingScore) Less(i, j int) bool {
  1208  	return is[i].score > is[j].score
  1209  }
  1210  func (is intervalSorterByDecreasingScore) Swap(i, j int) {
  1211  	is[i], is[j] = is[j], is[i]
  1212  }
  1213  
  1214  // Compactions:
  1215  //
  1216  // The sub-levels and intervals can be visualized in 2 dimensions as the X axis
  1217  // containing intervals in increasing order and the Y axis containing sub-levels
  1218  // (older to younger). The intervals can be sparse wrt sub-levels. We observe
  1219  // that the system is typically under severe pressure in L0 during large numbers
  1220  // of ingestions where most files added to L0 are narrow and non-overlapping.
  1221  //
  1222  //    L0.1    d---g
  1223  //    L0.0  c--e  g--j o--s u--x
  1224  //
  1225  // As opposed to a case with a lot of wide, overlapping L0 files:
  1226  //
  1227  //    L0.3     d-----------r
  1228  //    L0.2    c--------o
  1229  //    L0.1   b-----------q
  1230  //    L0.0  a----------------x
  1231  //
  1232  // In the former case we expect the rectangle represented in the good
  1233  // visualization above (i.e. the first one) to be wide and short, and not too
  1234  // sparse (most intervals will have fileCount close to the sub-level count),
  1235  // which would make it amenable to concurrent L0 -> Lbase compactions.
  1236  //
  1237  // L0 -> Lbase: The high-level goal of a L0 -> Lbase compaction is to reduce
  1238  // stack depth, by compacting files in the intervals with the highest (fileCount
  1239  // - compactingCount). Additionally, we would like compactions to not involve a
  1240  // huge number of files, so that they finish quickly, and to allow for
  1241  // concurrent L0 -> Lbase compactions when needed. In order to achieve these
  1242  // goals we would like compactions to visualize as capturing thin and tall
  1243  // rectangles. The approach below is to consider intervals in some order and
  1244  // then try to construct a compaction using the interval. The first interval we
  1245  // can construct a compaction for is the compaction that is started. There can
  1246  // be multiple heuristics in choosing the ordering of the intervals -- the code
  1247  // uses one heuristic that worked well for a large ingestion stemming from a
  1248  // cockroachdb import, but additional experimentation is necessary to pick a
  1249  // general heuristic. Additionally, the compaction that gets picked may be not
  1250  // as desirable as one that could be constructed later in terms of reducing
  1251  // stack depth (since adding more files to the compaction can get blocked by
  1252  // needing to encompass files that are already being compacted). So an
  1253  // alternative would be to try to construct more than one compaction and pick
  1254  // the best one.
  1255  //
  1256  // Here's a visualization of an ideal L0->LBase compaction selection:
  1257  //
  1258  //    L0.3  a--d    g-j
  1259  //    L0.2         f--j          r-t
  1260  //    L0.1   b-d  e---j
  1261  //    L0.0  a--d   f--j  l--o  p-----x
  1262  //
  1263  //    Lbase a--------i    m---------w
  1264  //
  1265  // The [g,j] interval has the highest stack depth, so it would have the highest
  1266  // priority for selecting a base compaction candidate. Assuming none of the
  1267  // files are already compacting, this is the compaction that will be chosen:
  1268  //
  1269  //               _______
  1270  //    L0.3  a--d |  g-j|
  1271  //    L0.2       | f--j|         r-t
  1272  //    L0.1   b-d |e---j|
  1273  //    L0.0  a--d | f--j| l--o  p-----x
  1274  //
  1275  //    Lbase a--------i    m---------w
  1276  //
  1277  // Note that running this compaction will mark the a--i file in Lbase as
  1278  // compacting, and when ExtendL0ForBaseCompactionTo is called with the bounds of
  1279  // that base file, it'll expand the compaction to also include all L0 files in
  1280  // the a-d interval. The resultant compaction would then be:
  1281  //
  1282  //         _____________
  1283  //    L0.3 |a--d    g-j|
  1284  //    L0.2 |       f--j|         r-t
  1285  //    L0.1 | b-d  e---j|
  1286  //    L0.0 |a--d   f--j| l--o  p-----x
  1287  //
  1288  //    Lbase a--------i    m---------w
  1289  //
  1290  // The next best interval for base compaction would therefore be the one
  1291  // including r--t in L0.2 and p--x in L0.0, and both this compaction and the one
  1292  // picked earlier can run in parallel. This is assuming minCompactionDepth >= 2,
  1293  // otherwise the second compaction has too little depth to pick.
  1294  //
  1295  //         _____________
  1296  //    L0.3 |a--d    g-j|      _________
  1297  //    L0.2 |       f--j|      |  r-t  |
  1298  //    L0.1 | b-d  e---j|      |       |
  1299  //    L0.0 |a--d   f--j| l--o |p-----x|
  1300  //
  1301  //    Lbase a--------i    m---------w
  1302  //
  1303  // Note that when ExtendL0ForBaseCompactionTo is called, the compaction expands
  1304  // to the following, given that the [l,o] file can be added without including
  1305  // additional files in Lbase:
  1306  //
  1307  //         _____________
  1308  //    L0.3 |a--d    g-j|      _________
  1309  //    L0.2 |       f--j|      |  r-t  |
  1310  //    L0.1 | b-d  e---j|______|       |
  1311  //    L0.0 |a--d   f--j||l--o  p-----x|
  1312  //
  1313  //    Lbase a--------i    m---------w
  1314  //
  1315  // If an additional file existed in LBase that overlapped with [l,o], it would
  1316  // be excluded from the compaction. Concretely:
  1317  //
  1318  //         _____________
  1319  //    L0.3 |a--d    g-j|      _________
  1320  //    L0.2 |       f--j|      |  r-t  |
  1321  //    L0.1 | b-d  e---j|      |       |
  1322  //    L0.0 |a--d   f--j| l--o |p-----x|
  1323  //
  1324  //    Lbase a--------ij--lm---------w
  1325  //
  1326  // Intra-L0: If the L0 score is high, but PickBaseCompaction() is unable to pick
  1327  // a compaction, PickIntraL0Compaction will be used to pick an intra-L0
  1328  // compaction. Similar to L0 -> Lbase compactions, we want to allow for multiple
  1329  // intra-L0 compactions and not generate wide output files that hinder later
  1330  // concurrency of L0 -> Lbase compactions. Also compactions that produce wide
  1331  // files don't reduce stack depth -- they represent wide rectangles in our
  1332  // visualization, which means many intervals have their depth reduced by a small
  1333  // amount. Typically, L0 files have non-overlapping sequence numbers, and
  1334  // sticking to that invariant would require us to consider intra-L0 compactions
  1335  // that proceed from youngest to oldest files, which could result in the
  1336  // aforementioned undesirable wide rectangle shape. But this non-overlapping
  1337  // sequence number invariant is already relaxed in RocksDB -- sstables are primarily
  1338  // ordered by their largest sequence number. So we can arrange for intra-L0
  1339  // compactions to capture thin and tall rectangles starting with the top of the
  1340  // stack (youngest files). Like the L0 -> Lbase case we order the intervals
  1341  // using a heuristic and consider each in turn. The same comment about better L0
  1342  // -> Lbase heuristics and not being greedy applies here.
  1343  //
  1344  // Going back to a modified version of our example from earlier, let's say these
  1345  // are the base compactions in progress:
  1346  //                _______
  1347  //    L0.3  a--d  |  g-j|      _________
  1348  //    L0.2        | f--j|      |  r-t  |
  1349  //    L0.1   b-d  |e---j|      |       |
  1350  //    L0.0  a--d  | f--j| l--o |p-----x|
  1351  //
  1352  //    Lbase a---------i    m---------w
  1353  //
  1354  // Since both LBase files are compacting, the only L0 compaction that can be
  1355  // picked is an intra-L0 compaction. For this, the b--d interval has the highest
  1356  // stack depth (3), and starting with a--d in L0.3 as the seed file, we can
  1357  // iterate downward and build this compaction, assuming all files in that
  1358  // interval are not compacting and have a highest sequence number less than
  1359  // earliestUnflushedSeqNum:
  1360  //
  1361  //                _______
  1362  //    L0.3 |a--d| |  g-j|      _________
  1363  //    L0.2 |    | | f--j|      |  r-t  |
  1364  //    L0.1 | b-d| |e---j|      |       |
  1365  //    L0.0 |a--d| | f--j| l--o |p-----x|
  1366  //         ------
  1367  //    Lbase a---------i    m---------w
  1368  //
  1369  
  1370  // PickBaseCompaction picks a base compaction based on the above specified
  1371  // heuristics, for the specified Lbase files and a minimum depth of overlapping
  1372  // files that can be selected for compaction. Returns nil if no compaction is
  1373  // possible.
  1374  func (s *L0Sublevels) PickBaseCompaction(
  1375  	minCompactionDepth int, baseFiles LevelSlice,
  1376  ) (*L0CompactionFiles, error) {
  1377  	// For LBase compactions, we consider intervals in a greedy manner in the
  1378  	// following order:
  1379  	// - Intervals that are unlikely to be blocked due
  1380  	//   to ongoing L0 -> Lbase compactions. These are the ones with
  1381  	//   !isBaseCompacting && !intervalRangeIsBaseCompacting.
  1382  	// - Intervals that are !isBaseCompacting && intervalRangeIsBaseCompacting.
  1383  	//
  1384  	// The ordering heuristic exists just to avoid wasted work. Ideally,
  1385  	// we would consider all intervals with isBaseCompacting = false and
  1386  	// construct a compaction for it and compare the constructed compactions
  1387  	// and pick the best one. If microbenchmarks show that we can afford
  1388  	// this cost we can eliminate this heuristic.
  1389  	scoredIntervals := make([]intervalAndScore, 0, len(s.orderedIntervals))
  1390  	sublevelCount := len(s.levelFiles)
  1391  	for i := range s.orderedIntervals {
  1392  		interval := &s.orderedIntervals[i]
  1393  		depth := len(interval.files) - interval.compactingFileCount
  1394  		if interval.isBaseCompacting || minCompactionDepth > depth {
  1395  			continue
  1396  		}
  1397  		if interval.intervalRangeIsBaseCompacting {
  1398  			scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth})
  1399  		} else {
  1400  			// Prioritize this interval by incrementing the score by the number
  1401  			// of sublevels.
  1402  			scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth + sublevelCount})
  1403  		}
  1404  	}
  1405  	sort.Sort(intervalSorterByDecreasingScore(scoredIntervals))
  1406  
  1407  	// Optimization to avoid considering different intervals that
  1408  	// are likely to choose the same seed file. Again this is just
  1409  	// to reduce wasted work.
  1410  	consideredIntervals := newBitSet(len(s.orderedIntervals))
  1411  	for _, scoredInterval := range scoredIntervals {
  1412  		interval := &s.orderedIntervals[scoredInterval.interval]
  1413  		if consideredIntervals[interval.index] {
  1414  			continue
  1415  		}
  1416  
  1417  		// Pick the seed file for the interval as the file
  1418  		// in the lowest sub-level.
  1419  		f := interval.files[0]
  1420  		// Don't bother considering the intervals that are covered by the seed
  1421  		// file since they are likely nearby. Note that it is possible that
  1422  		// those intervals have seed files at lower sub-levels so could be
  1423  		// viable for compaction.
  1424  		if f == nil {
  1425  			return nil, errors.New("no seed file found in sublevel intervals")
  1426  		}
  1427  		consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1)
  1428  		if f.IsCompacting() {
  1429  			if f.IsIntraL0Compacting {
  1430  				// If we're picking a base compaction and we came across a seed
  1431  				// file candidate that's being intra-L0 compacted, skip the
  1432  				// interval instead of erroring out.
  1433  				continue
  1434  			}
  1435  			// We chose a compaction seed file that should not be compacting.
  1436  			// Usually means the score is not accurately accounting for files
  1437  			// already compacting, or internal state is inconsistent.
  1438  			return nil, errors.Errorf("file %s chosen as seed file for compaction should not be compacting", f.FileNum)
  1439  		}
  1440  
  1441  		c := s.baseCompactionUsingSeed(f, interval.index, minCompactionDepth)
  1442  		if c != nil {
  1443  			// Check if the chosen compaction overlaps with any files in Lbase
  1444  			// that have Compacting = true. If that's the case, this compaction
  1445  			// cannot be chosen.
  1446  			baseIter := baseFiles.Iter()
  1447  			// An interval starting at ImmediateSuccessor(key) can never be the
  1448  			// first interval of a compaction since no file can start at that
  1449  			// interval.
  1450  			m := baseIter.SeekGE(s.cmp, s.orderedIntervals[c.minIntervalIndex].startKey.key)
  1451  
  1452  			var baseCompacting bool
  1453  			for ; m != nil && !baseCompacting; m = baseIter.Next() {
  1454  				cmp := s.cmp(m.Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key)
  1455  				// Compaction is ending at exclusive bound of c.maxIntervalIndex+1
  1456  				if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) {
  1457  					break
  1458  				}
  1459  				baseCompacting = baseCompacting || m.IsCompacting()
  1460  			}
  1461  			if baseCompacting {
  1462  				continue
  1463  			}
  1464  			return c, nil
  1465  		}
  1466  	}
  1467  	return nil, nil
  1468  }
  1469  
  1470  // Helper function for building an L0 -> Lbase compaction using a seed interval
  1471  // and seed file in that seed interval.
  1472  func (s *L0Sublevels) baseCompactionUsingSeed(
  1473  	f *FileMetadata, intervalIndex int, minCompactionDepth int,
  1474  ) *L0CompactionFiles {
  1475  	c := &L0CompactionFiles{
  1476  		FilesIncluded:        newBitSet(s.levelMetadata.Len()),
  1477  		seedInterval:         intervalIndex,
  1478  		seedIntervalMinLevel: 0,
  1479  		minIntervalIndex:     f.minIntervalIndex,
  1480  		maxIntervalIndex:     f.maxIntervalIndex,
  1481  	}
  1482  	c.addFile(f)
  1483  
  1484  	// The first iteration of this loop builds the compaction at the seed file's
  1485  	// sublevel. Future iterations expand on this compaction by stacking more
  1486  	// files from intervalIndex and repeating. This is an optional activity so
  1487  	// when it fails we can fallback to the last successful candidate.
  1488  	var lastCandidate *L0CompactionFiles
  1489  	interval := &s.orderedIntervals[intervalIndex]
  1490  
  1491  	for i := 0; i < len(interval.files); i++ {
  1492  		f2 := interval.files[i]
  1493  		sl := f2.SubLevel
  1494  		c.seedIntervalStackDepthReduction++
  1495  		c.seedIntervalMaxLevel = sl
  1496  		c.addFile(f2)
  1497  		// The seed file is in the lowest sublevel in the seed interval, but it
  1498  		// may overlap with other files in even lower sublevels. For correctness
  1499  		// we need to grow our interval to include those files, and capture all
  1500  		// files in the next level that fall in this extended interval and so
  1501  		// on. This can result in a triangular shape like the following where
  1502  		// again the X axis is the key intervals and the Y axis is oldest to
  1503  		// youngest. Note that it is not necessary for correctness to fill out
  1504  		// the shape at the higher sub-levels to make it more rectangular since
  1505  		// the invariant only requires that younger versions of a key not be
  1506  		// moved to Lbase while leaving behind older versions.
  1507  		//                     -
  1508  		//                    ---
  1509  		//                   -----
  1510  		// It may be better for performance to have a more rectangular shape
  1511  		// since the files being left behind will overlap with the same Lbase
  1512  		// key range as that of this compaction. But there is also the danger
  1513  		// that in trying to construct a more rectangular shape we will be
  1514  		// forced to pull in a file that is already compacting. We expect
  1515  		// extendCandidateToRectangle to eventually be called on this compaction
  1516  		// if it's chosen, at which point we would iterate backward and choose
  1517  		// those files. This logic is similar to compaction.grow for non-L0
  1518  		// compactions.
  1519  		done := false
  1520  		for currLevel := sl - 1; currLevel >= 0; currLevel-- {
  1521  			if !s.extendFiles(currLevel, math.MaxUint64, c) {
  1522  				// Failed to extend due to ongoing compaction.
  1523  				done = true
  1524  				break
  1525  			}
  1526  		}
  1527  		if done {
  1528  			break
  1529  		}
  1530  		// Observed some compactions using > 1GB from L0 in an import
  1531  		// experiment. Very long running compactions are not great as they
  1532  		// reduce concurrency while they run, and take a while to produce
  1533  		// results, though they're sometimes unavoidable. There is a tradeoff
  1534  		// here in that adding more depth is more efficient in reducing stack
  1535  		// depth, but long running compactions reduce flexibility in what can
  1536  		// run concurrently in L0 and even Lbase -> Lbase+1. An increase more
  1537  		// than 150% in bytes since the last candidate compaction (along with a
  1538  		// total compaction size in excess of 100mb), or a total compaction size
  1539  		// beyond a hard limit of 500mb, is criteria for rejecting this
  1540  		// candidate. This lets us prefer slow growths as we add files, while
  1541  		// still having a hard limit. Note that if this is the first compaction
  1542  		// candidate to reach a stack depth reduction of minCompactionDepth or
  1543  		// higher, this candidate will be chosen regardless.
  1544  		if lastCandidate == nil {
  1545  			lastCandidate = &L0CompactionFiles{}
  1546  		} else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth &&
  1547  			c.fileBytes > 100<<20 &&
  1548  			(float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) {
  1549  			break
  1550  		}
  1551  		*lastCandidate = *c
  1552  	}
  1553  	if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth {
  1554  		lastCandidate.FilesIncluded.clearAllBits()
  1555  		for _, f := range lastCandidate.Files {
  1556  			lastCandidate.FilesIncluded.markBit(f.L0Index)
  1557  		}
  1558  		return lastCandidate
  1559  	}
  1560  	return nil
  1561  }
  1562  
  1563  // Expands fields in the provided L0CompactionFiles instance (cFiles) to
  1564  // include overlapping files in the specified sublevel. Returns true if the
  1565  // compaction is possible (i.e. does not conflict with any base/intra-L0
  1566  // compacting files).
  1567  func (s *L0Sublevels) extendFiles(
  1568  	sl int, earliestUnflushedSeqNum uint64, cFiles *L0CompactionFiles,
  1569  ) bool {
  1570  	index, _ := slices.BinarySearchFunc(s.levelFiles[sl], cFiles.minIntervalIndex, func(a *FileMetadata, b int) int {
  1571  		return stdcmp.Compare(a.maxIntervalIndex, b)
  1572  	})
  1573  	for ; index < len(s.levelFiles[sl]); index++ {
  1574  		f := s.levelFiles[sl][index]
  1575  		if f.minIntervalIndex > cFiles.maxIntervalIndex {
  1576  			break
  1577  		}
  1578  		if f.IsCompacting() {
  1579  			return false
  1580  		}
  1581  		// Skip over files that are newer than earliestUnflushedSeqNum. This is
  1582  		// okay because this compaction can just pretend these files are not in
  1583  		// L0 yet. These files must be in higher sublevels than any overlapping
  1584  		// files with f.LargestSeqNum < earliestUnflushedSeqNum, and the output
  1585  		// of the compaction will also go in a lower (older) sublevel than this
  1586  		// file by definition.
  1587  		if f.LargestSeqNum >= earliestUnflushedSeqNum {
  1588  			continue
  1589  		}
  1590  		cFiles.addFile(f)
  1591  	}
  1592  	return true
  1593  }
  1594  
  1595  // PickIntraL0Compaction picks an intra-L0 compaction for files in this
  1596  // sublevel. This method is only called when a base compaction cannot be chosen.
  1597  // See comment above [PickBaseCompaction] for heuristics involved in this
  1598  // selection.
  1599  func (s *L0Sublevels) PickIntraL0Compaction(
  1600  	earliestUnflushedSeqNum uint64, minCompactionDepth int,
  1601  ) (*L0CompactionFiles, error) {
  1602  	scoredIntervals := make([]intervalAndScore, len(s.orderedIntervals))
  1603  	for i := range s.orderedIntervals {
  1604  		interval := &s.orderedIntervals[i]
  1605  		depth := len(interval.files) - interval.compactingFileCount
  1606  		if minCompactionDepth > depth {
  1607  			continue
  1608  		}
  1609  		scoredIntervals[i] = intervalAndScore{interval: i, score: depth}
  1610  	}
  1611  	sort.Sort(intervalSorterByDecreasingScore(scoredIntervals))
  1612  
  1613  	// Optimization to avoid considering different intervals that are likely to
  1614  	// choose the same seed file. Again this is just to reduce wasted work.
  1615  	consideredIntervals := newBitSet(len(s.orderedIntervals))
  1616  	for _, scoredInterval := range scoredIntervals {
  1617  		interval := &s.orderedIntervals[scoredInterval.interval]
  1618  		if consideredIntervals[interval.index] {
  1619  			continue
  1620  		}
  1621  
  1622  		var f *FileMetadata
  1623  		// Pick the seed file for the interval as the file in the highest
  1624  		// sub-level.
  1625  		stackDepthReduction := scoredInterval.score
  1626  		for i := len(interval.files) - 1; i >= 0; i-- {
  1627  			f = interval.files[i]
  1628  			if f.IsCompacting() {
  1629  				break
  1630  			}
  1631  			consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1)
  1632  			// Can this be the seed file? Files with newer sequence numbers than
  1633  			// earliestUnflushedSeqNum cannot be in the compaction.
  1634  			if f.LargestSeqNum >= earliestUnflushedSeqNum {
  1635  				stackDepthReduction--
  1636  				if stackDepthReduction == 0 {
  1637  					break
  1638  				}
  1639  			} else {
  1640  				break
  1641  			}
  1642  		}
  1643  		if stackDepthReduction < minCompactionDepth {
  1644  			// Can't use this interval.
  1645  			continue
  1646  		}
  1647  
  1648  		if f == nil {
  1649  			return nil, errors.New("no seed file found in sublevel intervals")
  1650  		}
  1651  		if f.IsCompacting() {
  1652  			// This file could be in a concurrent intra-L0 or base compaction.
  1653  			// Try another interval.
  1654  			continue
  1655  		}
  1656  
  1657  		// We have a seed file. Build a compaction off of that seed.
  1658  		c := s.intraL0CompactionUsingSeed(
  1659  			f, interval.index, earliestUnflushedSeqNum, minCompactionDepth)
  1660  		if c != nil {
  1661  			return c, nil
  1662  		}
  1663  	}
  1664  	return nil, nil
  1665  }
  1666  
  1667  func (s *L0Sublevels) intraL0CompactionUsingSeed(
  1668  	f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int,
  1669  ) *L0CompactionFiles {
  1670  	// We know that all the files that overlap with intervalIndex have
  1671  	// LargestSeqNum < earliestUnflushedSeqNum, but for other intervals
  1672  	// we need to exclude files >= earliestUnflushedSeqNum
  1673  
  1674  	c := &L0CompactionFiles{
  1675  		FilesIncluded:           newBitSet(s.levelMetadata.Len()),
  1676  		seedInterval:            intervalIndex,
  1677  		seedIntervalMaxLevel:    len(s.levelFiles) - 1,
  1678  		minIntervalIndex:        f.minIntervalIndex,
  1679  		maxIntervalIndex:        f.maxIntervalIndex,
  1680  		isIntraL0:               true,
  1681  		earliestUnflushedSeqNum: earliestUnflushedSeqNum,
  1682  	}
  1683  	c.addFile(f)
  1684  
  1685  	var lastCandidate *L0CompactionFiles
  1686  	interval := &s.orderedIntervals[intervalIndex]
  1687  	slIndex := len(interval.files) - 1
  1688  	for {
  1689  		if interval.files[slIndex] == f {
  1690  			break
  1691  		}
  1692  		slIndex--
  1693  	}
  1694  	// The first iteration of this loop produces an intra-L0 compaction at the
  1695  	// seed level. Iterations after that optionally add to the compaction by
  1696  	// stacking more files from intervalIndex and repeating. This is an optional
  1697  	// activity so when it fails we can fallback to the last successful
  1698  	// candidate. The code stops adding when it can't add more, or when
  1699  	// fileBytes grows too large.
  1700  	for ; slIndex >= 0; slIndex-- {
  1701  		f2 := interval.files[slIndex]
  1702  		sl := f2.SubLevel
  1703  		if f2.IsCompacting() {
  1704  			break
  1705  		}
  1706  		c.seedIntervalStackDepthReduction++
  1707  		c.seedIntervalMinLevel = sl
  1708  		c.addFile(f2)
  1709  		// The seed file captures all files in the higher level that fall in the
  1710  		// range of intervals. That may extend the range of intervals so for
  1711  		// correctness we need to capture all files in the next higher level
  1712  		// that fall in this extended interval and so on. This can result in an
  1713  		// inverted triangular shape like the following where again the X axis
  1714  		// is the key intervals and the Y axis is oldest to youngest. Note that
  1715  		// it is not necessary for correctness to fill out the shape at lower
  1716  		// sub-levels to make it more rectangular since the invariant only
  1717  		// requires that if we move an older seqnum for key k into a file that
  1718  		// has a higher seqnum, we also move all younger seqnums for that key k
  1719  		// into that file.
  1720  		//                  -----
  1721  		//                   ---
  1722  		//                    -
  1723  		// It may be better for performance to have a more rectangular shape
  1724  		// since it will reduce the stack depth for more intervals. But there is
  1725  		// also the danger that in explicitly trying to construct a more
  1726  		// rectangular shape we will be forced to pull in a file that is already
  1727  		// compacting. We assume that the performance concern is not a practical
  1728  		// issue.
  1729  		done := false
  1730  		for currLevel := sl + 1; currLevel < len(s.levelFiles); currLevel++ {
  1731  			if !s.extendFiles(currLevel, earliestUnflushedSeqNum, c) {
  1732  				// Failed to extend due to ongoing compaction.
  1733  				done = true
  1734  				break
  1735  			}
  1736  		}
  1737  		if done {
  1738  			break
  1739  		}
  1740  		if lastCandidate == nil {
  1741  			lastCandidate = &L0CompactionFiles{}
  1742  		} else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth &&
  1743  			c.fileBytes > 100<<20 &&
  1744  			(float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) {
  1745  			break
  1746  		}
  1747  		*lastCandidate = *c
  1748  	}
  1749  	if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth {
  1750  		lastCandidate.FilesIncluded.clearAllBits()
  1751  		for _, f := range lastCandidate.Files {
  1752  			lastCandidate.FilesIncluded.markBit(f.L0Index)
  1753  		}
  1754  		s.extendCandidateToRectangle(
  1755  			lastCandidate.minIntervalIndex, lastCandidate.maxIntervalIndex, lastCandidate, false)
  1756  		return lastCandidate
  1757  	}
  1758  	return nil
  1759  }
  1760  
  1761  // ExtendL0ForBaseCompactionTo extends the specified base compaction candidate
  1762  // L0CompactionFiles to optionally cover more files in L0 without "touching" any
  1763  // of the passed-in keys (i.e. the smallest/largest bounds are exclusive), as
  1764  // including any user keys for those internal keys could require choosing more
  1765  // files in LBase which is undesirable. Unbounded start/end keys are indicated
  1766  // by passing in the InvalidInternalKey.
  1767  func (s *L0Sublevels) ExtendL0ForBaseCompactionTo(
  1768  	smallest, largest InternalKey, candidate *L0CompactionFiles,
  1769  ) bool {
  1770  	firstIntervalIndex := 0
  1771  	lastIntervalIndex := len(s.orderedIntervals) - 1
  1772  	if smallest.Kind() != base.InternalKeyKindInvalid {
  1773  		if smallest.Trailer == base.InternalKeyRangeDeleteSentinel {
  1774  			// Starting at smallest.UserKey == interval.startKey is okay.
  1775  			firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1776  				return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) <= 0
  1777  			})
  1778  		} else {
  1779  			firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1780  				// Need to start at >= smallest since if we widen too much we may miss
  1781  				// an Lbase file that overlaps with an L0 file that will get picked in
  1782  				// this widening, which would be bad. This interval will not start with
  1783  				// an immediate successor key.
  1784  				return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) < 0
  1785  			})
  1786  		}
  1787  	}
  1788  	if largest.Kind() != base.InternalKeyKindInvalid {
  1789  		// First interval that starts at or beyond the largest. This interval will not
  1790  		// start with an immediate successor key.
  1791  		lastIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1792  			return s.cmp(largest.UserKey, s.orderedIntervals[i].startKey.key) <= 0
  1793  		})
  1794  		// Right now, lastIntervalIndex has a startKey that extends beyond largest.
  1795  		// The previous interval, by definition, has an end key higher than largest.
  1796  		// Iterate back twice to get the last interval that's completely within
  1797  		// (smallest, largest). Except in the case where we went past the end of the
  1798  		// list; in that case, the last interval to include is the very last
  1799  		// interval in the list.
  1800  		if lastIntervalIndex < len(s.orderedIntervals) {
  1801  			lastIntervalIndex--
  1802  		}
  1803  		lastIntervalIndex--
  1804  	}
  1805  	if lastIntervalIndex < firstIntervalIndex {
  1806  		return false
  1807  	}
  1808  	return s.extendCandidateToRectangle(firstIntervalIndex, lastIntervalIndex, candidate, true)
  1809  }
  1810  
  1811  // Best-effort attempt to make the compaction include more files in the
  1812  // rectangle defined by [minIntervalIndex, maxIntervalIndex] on the X axis and
  1813  // bounded on the Y axis by seedIntervalMinLevel and seedIntervalMaxLevel.
  1814  //
  1815  // This is strictly an optional extension; at any point where we can't feasibly
  1816  // add more files, the sublevel iteration can be halted early and candidate will
  1817  // still be a correct compaction candidate.
  1818  //
  1819  // Consider this scenario (original candidate is inside the rectangle), with
  1820  // isBase = true and interval bounds a-j (from the union of base file bounds and
  1821  // that of compaction candidate):
  1822  //
  1823  //	           _______
  1824  //	L0.3  a--d |  g-j|
  1825  //	L0.2       | f--j|         r-t
  1826  //	L0.1   b-d |e---j|
  1827  //	L0.0  a--d | f--j| l--o  p-----x
  1828  //
  1829  //	Lbase a--------i    m---------w
  1830  //
  1831  // This method will iterate from the bottom up. At L0.0, it will add a--d since
  1832  // it's in the bounds, then add b-d, then a--d, and so on, to produce this:
  1833  //
  1834  //	     _____________
  1835  //	L0.3 |a--d    g-j|
  1836  //	L0.2 |       f--j|         r-t
  1837  //	L0.1 | b-d  e---j|
  1838  //	L0.0 |a--d   f--j| l--o  p-----x
  1839  //
  1840  //	Lbase a-------i     m---------w
  1841  //
  1842  // Let's assume that, instead of a--d in the top sublevel, we had 3 files, a-b,
  1843  // bb-c, and cc-d, of which bb-c is compacting. Let's also add another sublevel
  1844  // L0.4 with some files, all of which aren't compacting:
  1845  //
  1846  //	L0.4  a------c ca--d _______
  1847  //	L0.3  a-b bb-c  cc-d |  g-j|
  1848  //	L0.2                 | f--j|         r-t
  1849  //	L0.1    b----------d |e---j|
  1850  //	L0.0  a------------d | f--j| l--o  p-----x
  1851  //
  1852  //	Lbase a------------------i    m---------w
  1853  //
  1854  // This method then needs to choose between the left side of L0.3 bb-c (i.e.
  1855  // a-b), or the right side (i.e. cc-d and g-j) for inclusion in this compaction.
  1856  // Since the right side has more files as well as one file that has already been
  1857  // picked, it gets chosen at that sublevel, resulting in this intermediate
  1858  // compaction:
  1859  //
  1860  //	L0.4  a------c ca--d
  1861  //	              ______________
  1862  //	L0.3  a-b bb-c| cc-d    g-j|
  1863  //	L0.2 _________|        f--j|         r-t
  1864  //	L0.1 |  b----------d  e---j|
  1865  //	L0.0 |a------------d   f--j| l--o  p-----x
  1866  //
  1867  //	Lbase a------------------i    m---------w
  1868  //
  1869  // Since bb-c had to be excluded at L0.3, the interval bounds for L0.4 are
  1870  // actually ca-j, since ca is the next interval start key after the end interval
  1871  // of bb-c. This would result in only ca-d being chosen at that sublevel, even
  1872  // though a--c is also not compacting. This is the final result:
  1873  //
  1874  //	              ______________
  1875  //	L0.4  a------c|ca--d       |
  1876  //	L0.3  a-b bb-c| cc-d    g-j|
  1877  //	L0.2 _________|        f--j|         r-t
  1878  //	L0.1 |  b----------d  e---j|
  1879  //	L0.0 |a------------d   f--j| l--o  p-----x
  1880  //
  1881  //	Lbase a------------------i    m---------w
  1882  //
  1883  // TODO(bilal): Add more targeted tests for this method, through
  1884  // ExtendL0ForBaseCompactionTo and intraL0CompactionUsingSeed.
  1885  func (s *L0Sublevels) extendCandidateToRectangle(
  1886  	minIntervalIndex int, maxIntervalIndex int, candidate *L0CompactionFiles, isBase bool,
  1887  ) bool {
  1888  	candidate.preExtensionMinInterval = candidate.minIntervalIndex
  1889  	candidate.preExtensionMaxInterval = candidate.maxIntervalIndex
  1890  	// Extend {min,max}IntervalIndex to include all of the candidate's current
  1891  	// bounds.
  1892  	if minIntervalIndex > candidate.minIntervalIndex {
  1893  		minIntervalIndex = candidate.minIntervalIndex
  1894  	}
  1895  	if maxIntervalIndex < candidate.maxIntervalIndex {
  1896  		maxIntervalIndex = candidate.maxIntervalIndex
  1897  	}
  1898  	var startLevel, increment, endLevel int
  1899  	if isBase {
  1900  		startLevel = 0
  1901  		increment = +1
  1902  		// seedIntervalMaxLevel is inclusive, while endLevel is exclusive.
  1903  		endLevel = candidate.seedIntervalMaxLevel + 1
  1904  	} else {
  1905  		startLevel = len(s.levelFiles) - 1
  1906  		increment = -1
  1907  		// seedIntervalMinLevel is inclusive, while endLevel is exclusive.
  1908  		endLevel = candidate.seedIntervalMinLevel - 1
  1909  	}
  1910  	// Stats for files.
  1911  	addedCount := 0
  1912  	// Iterate from the oldest sub-level for L0 -> Lbase and youngest sub-level
  1913  	// for intra-L0. The idea here is that anything that can't be included from
  1914  	// that level constrains what can be included from the next level. This
  1915  	// change in constraint is directly incorporated into minIntervalIndex,
  1916  	// maxIntervalIndex.
  1917  	for sl := startLevel; sl != endLevel; sl += increment {
  1918  		files := s.levelFiles[sl]
  1919  		// Find the first file that overlaps with minIntervalIndex.
  1920  		index := sort.Search(len(files), func(i int) bool {
  1921  			return minIntervalIndex <= files[i].maxIntervalIndex
  1922  		})
  1923  		// Track the files that are fully within the current constraint of
  1924  		// [minIntervalIndex, maxIntervalIndex].
  1925  		firstIndex := -1
  1926  		lastIndex := -1
  1927  		for ; index < len(files); index++ {
  1928  			f := files[index]
  1929  			if f.minIntervalIndex > maxIntervalIndex {
  1930  				break
  1931  			}
  1932  			include := true
  1933  			// Extends out on the left so can't be included. This narrows what
  1934  			// we can included in the next level.
  1935  			if f.minIntervalIndex < minIntervalIndex {
  1936  				include = false
  1937  				minIntervalIndex = f.maxIntervalIndex + 1
  1938  			}
  1939  			// Extends out on the right so can't be included.
  1940  			if f.maxIntervalIndex > maxIntervalIndex {
  1941  				include = false
  1942  				maxIntervalIndex = f.minIntervalIndex - 1
  1943  			}
  1944  			if !include {
  1945  				continue
  1946  			}
  1947  			if firstIndex == -1 {
  1948  				firstIndex = index
  1949  			}
  1950  			lastIndex = index
  1951  		}
  1952  		if minIntervalIndex > maxIntervalIndex {
  1953  			// We excluded files that prevent continuation.
  1954  			break
  1955  		}
  1956  		if firstIndex < 0 {
  1957  			// No files to add in this sub-level.
  1958  			continue
  1959  		}
  1960  		// We have the files in [firstIndex, lastIndex] as potential for
  1961  		// inclusion. Some of these may already have been picked. Some of them
  1962  		// may be already compacting. The latter is tricky since we have to
  1963  		// decide whether to contract minIntervalIndex or maxIntervalIndex when
  1964  		// we encounter an already compacting file. We pick the longest sequence
  1965  		// between firstIndex and lastIndex of non-compacting files -- this is
  1966  		// represented by [candidateNonCompactingFirst,
  1967  		// candidateNonCompactingLast].
  1968  		nonCompactingFirst := -1
  1969  		currentRunHasAlreadyPickedFiles := false
  1970  		candidateNonCompactingFirst := -1
  1971  		candidateNonCompactingLast := -1
  1972  		candidateHasAlreadyPickedFiles := false
  1973  		for index = firstIndex; index <= lastIndex; index++ {
  1974  			f := files[index]
  1975  			if f.IsCompacting() {
  1976  				if nonCompactingFirst != -1 {
  1977  					last := index - 1
  1978  					// Prioritize runs of consecutive non-compacting files that
  1979  					// have files that have already been picked. That is to say,
  1980  					// if candidateHasAlreadyPickedFiles == true, we stick with
  1981  					// it, and if currentRunHasAlreadyPickedfiles == true, we
  1982  					// pick that run even if it contains fewer files than the
  1983  					// previous candidate.
  1984  					if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 ||
  1985  						currentRunHasAlreadyPickedFiles ||
  1986  						(last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) {
  1987  						candidateNonCompactingFirst = nonCompactingFirst
  1988  						candidateNonCompactingLast = last
  1989  						candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles
  1990  					}
  1991  				}
  1992  				nonCompactingFirst = -1
  1993  				currentRunHasAlreadyPickedFiles = false
  1994  				continue
  1995  			}
  1996  			if nonCompactingFirst == -1 {
  1997  				nonCompactingFirst = index
  1998  			}
  1999  			if candidate.FilesIncluded[f.L0Index] {
  2000  				currentRunHasAlreadyPickedFiles = true
  2001  			}
  2002  		}
  2003  		// Logic duplicated from inside the for loop above.
  2004  		if nonCompactingFirst != -1 {
  2005  			last := index - 1
  2006  			if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 ||
  2007  				currentRunHasAlreadyPickedFiles ||
  2008  				(last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) {
  2009  				candidateNonCompactingFirst = nonCompactingFirst
  2010  				candidateNonCompactingLast = last
  2011  			}
  2012  		}
  2013  		if candidateNonCompactingFirst == -1 {
  2014  			// All files are compacting. There will be gaps that we could
  2015  			// exploit to continue, but don't bother.
  2016  			break
  2017  		}
  2018  		// May need to shrink [minIntervalIndex, maxIntervalIndex] for the next level.
  2019  		if candidateNonCompactingFirst > firstIndex {
  2020  			minIntervalIndex = files[candidateNonCompactingFirst-1].maxIntervalIndex + 1
  2021  		}
  2022  		if candidateNonCompactingLast < lastIndex {
  2023  			maxIntervalIndex = files[candidateNonCompactingLast+1].minIntervalIndex - 1
  2024  		}
  2025  		for index := candidateNonCompactingFirst; index <= candidateNonCompactingLast; index++ {
  2026  			f := files[index]
  2027  			if f.IsCompacting() {
  2028  				// TODO(bilal): Do a logger.Fatalf instead of a panic, for
  2029  				// cleaner unwinding and error messages.
  2030  				panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum))
  2031  			}
  2032  			if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum {
  2033  				continue
  2034  			}
  2035  			if !candidate.FilesIncluded[f.L0Index] {
  2036  				addedCount++
  2037  				candidate.addFile(f)
  2038  			}
  2039  		}
  2040  	}
  2041  	return addedCount > 0
  2042  }