github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/manifest/l0_sublevels.go

github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/manifest/l0_sublevels.go (about)

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"math"
    11  	"sort"
    12  	"strings"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/zuoyebang/bitalostable/internal/base"
    16  	"github.com/zuoyebang/bitalostable/internal/invariants"
    17  )
    18  
    19  // errInvalidL0SublevelsOpt is for use in AddL0Files when the incremental
    20  // sublevel generation optimization failed, and NewL0Sublevels must be called.
    21  var errInvalidL0SublevelsOpt = errors.New("bitalostable: L0 sublevel generation optimization cannot be used")
    22  
    23  // Intervals are of the form [start, end) with no gap between intervals. Each
    24  // file overlaps perfectly with a sequence of intervals. This perfect overlap
    25  // occurs because the union of file boundary keys is used to pick intervals.
    26  // However the largest key in a file is inclusive, so when it is used as
    27  // an interval, the actual key is ImmediateSuccessor(key). We don't have the
    28  // ImmediateSuccessor function to do this computation, so we instead keep an
    29  // isLargest bool to remind the code about this fact. This is used for
    30  // comparisons in the following manner:
    31  // - intervalKey{k, false} < intervalKey{k, true}
    32  // - k1 < k2 -> intervalKey{k1, _} < intervalKey{k2, _}.
    33  //
    34  // Note that the file's largest key is exclusive if the internal key
    35  // has a trailer matching the rangedel sentinel key. In this case, we set
    36  // isLargest to false for end interval computation.
    37  //
    38  // For example, consider three files with bounds [a,e], [b,g], and [e,j]. The
    39  // interval keys produced would be intervalKey{a, false}, intervalKey{b, false},
    40  // intervalKey{e, false}, intervalKey{e, true}, intervalKey{g, true} and
    41  // intervalKey{j, true}, resulting in intervals
    42  // [a, b), [b, (e, false)), [(e,false), (e, true)), [(e, true), (g, true)) and
    43  // [(g, true), (j, true)). The first file overlaps with the first three
    44  // perfectly, the second file overlaps with the second through to fourth
    45  // intervals, and the third file overlaps with the last three.
    46  //
    47  // The intervals are indexed starting from 0, with the index of the interval
    48  // being the index of the start key of the interval.
    49  //
    50  // In addition to helping with compaction picking, we use interval indices
    51  // to assign each file an interval range once. Subsequent operations, say
    52  // picking overlapping files for a compaction, only need to use the index
    53  // numbers and so avoid expensive byte slice comparisons.
    54  type intervalKey struct {
    55  	key       []byte
    56  	isLargest bool
    57  }
    58  
    59  // intervalKeyTemp is used in the sortAndSweep step. It contains additional metadata
    60  // which is used to generate the {min,max}IntervalIndex for files.
    61  type intervalKeyTemp struct {
    62  	intervalKey intervalKey
    63  	fileMeta    *FileMetadata
    64  	isEndKey    bool
    65  }
    66  
    67  func (i *intervalKeyTemp) setFileIntervalIndex(idx int) {
    68  	if i.isEndKey {
    69  		// This is the right endpoint of some file interval, so the
    70  		// file.maxIntervalIndex must be j - 1 as maxIntervalIndex is
    71  		// inclusive.
    72  		i.fileMeta.maxIntervalIndex = idx - 1
    73  		return
    74  	}
    75  	// This is the left endpoint for some file interval, so the
    76  	// file.minIntervalIndex must be j.
    77  	i.fileMeta.minIntervalIndex = idx
    78  }
    79  
    80  func intervalKeyCompare(cmp Compare, a, b intervalKey) int {
    81  	rv := cmp(a.key, b.key)
    82  	if rv == 0 {
    83  		if a.isLargest && !b.isLargest {
    84  			return +1
    85  		}
    86  		if !a.isLargest && b.isLargest {
    87  			return -1
    88  		}
    89  	}
    90  	return rv
    91  }
    92  
    93  type intervalKeySorter struct {
    94  	keys []intervalKeyTemp
    95  	cmp  Compare
    96  }
    97  
    98  func (s intervalKeySorter) Len() int { return len(s.keys) }
    99  func (s intervalKeySorter) Less(i, j int) bool {
   100  	return intervalKeyCompare(s.cmp, s.keys[i].intervalKey, s.keys[j].intervalKey) < 0
   101  }
   102  func (s intervalKeySorter) Swap(i, j int) {
   103  	s.keys[i], s.keys[j] = s.keys[j], s.keys[i]
   104  }
   105  
   106  // sortAndSweep will sort the intervalKeys using intervalKeySorter, remove the
   107  // duplicate fileIntervals, and set the {min, max}IntervalIndex for the files.
   108  func sortAndSweep(keys []intervalKeyTemp, cmp Compare) []intervalKeyTemp {
   109  	if len(keys) == 0 {
   110  		return nil
   111  	}
   112  	sorter := intervalKeySorter{keys: keys, cmp: cmp}
   113  	sort.Sort(sorter)
   114  
   115  	// intervalKeys are generated using the file bounds. Specifically, there are 2 intervalKeys
   116  	// for each file, and len(keys) = 2 * number of files. Each intervalKeyTemp stores information
   117  	// about which file it was generated from, and whether the key represents the end key of the file.
   118  	// So, as we're deduplicating the `keys` slice, we're guaranteed to iterate over the interval
   119  	// keys belonging to each of the files. Since the file.{min,max}IntervalIndex points to the position
   120  	// of the files bounds in the deduplicated `keys` slice, we can determine file.{min,max}IntervalIndex
   121  	// during the iteration.
   122  	i := 0
   123  	j := 0
   124  	for i < len(keys) {
   125  		// loop invariant: j <= i
   126  		currKey := keys[i]
   127  		keys[j] = keys[i]
   128  
   129  		for {
   130  			keys[i].setFileIntervalIndex(j)
   131  			i++
   132  			if i >= len(keys) || intervalKeyCompare(cmp, currKey.intervalKey, keys[i].intervalKey) != 0 {
   133  				break
   134  			}
   135  		}
   136  		j++
   137  	}
   138  	return keys[:j]
   139  }
   140  
   141  // A key interval of the form [start, end). The end is not represented here
   142  // since it is implicit in the start of the next interval. The last interval is
   143  // an exception but we don't need to ever lookup the end of that interval; the
   144  // last fileInterval will only act as an end key marker. The set of intervals
   145  // is const after initialization.
   146  type fileInterval struct {
   147  	index    int
   148  	startKey intervalKey
   149  
   150  	// True iff some file in this interval is compacting to base. Such intervals
   151  	// cannot have any files participate in L0 -> Lbase compactions.
   152  	isBaseCompacting bool
   153  
   154  	// The min and max intervals index across all the files that overlap with this
   155  	// interval. Inclusive on both sides.
   156  	filesMinIntervalIndex int
   157  	filesMaxIntervalIndex int
   158  
   159  	// True if another interval that has a file extending into this interval is
   160  	// undergoing a compaction into Lbase. In other words, this bool is true
   161  	// if any interval in [filesMinIntervalIndex,
   162  	// filesMaxIntervalIndex] has isBaseCompacting set to true. This
   163  	// lets the compaction picker de-prioritize this interval for picking
   164  	// compactions, since there's a high chance that a base compaction with a
   165  	// sufficient height of sublevels rooted at this interval could not be
   166  	// chosen due to the ongoing base compaction in the
   167  	// other interval. If the file straddling the two intervals is at a
   168  	// sufficiently high sublevel (with enough compactible files below it to
   169  	// satisfy minCompactionDepth), this is not an issue, but to optimize for
   170  	// quickly picking base compactions far away from other base compactions,
   171  	// this bool is used as a heuristic (but not as a complete disqualifier).
   172  	intervalRangeIsBaseCompacting bool
   173  
   174  	// All files in this interval, in increasing sublevel order.
   175  	files []*FileMetadata
   176  
   177  	// len(files) - compactingFileCount is the stack depth that requires
   178  	// starting new compactions. This metric is not precise since the
   179  	// compactingFileCount can include files that are part of N (where N > 1)
   180  	// intra-L0 compactions, so the stack depth after those complete will be
   181  	// len(files) - compactingFileCount + N. We ignore this imprecision since
   182  	// we don't want to track which files are part of which intra-L0
   183  	// compaction.
   184  	compactingFileCount int
   185  
   186  	// Interpolated from files in this interval. For files spanning multiple
   187  	// intervals, we assume an equal distribution of bytes across all those
   188  	// intervals.
   189  	estimatedBytes uint64
   190  }
   191  
   192  // Helper type for any cases requiring a bool slice.
   193  type bitSet []bool
   194  
   195  func newBitSet(n int) bitSet {
   196  	return make([]bool, n)
   197  }
   198  
   199  func (b *bitSet) markBit(i int) {
   200  	(*b)[i] = true
   201  }
   202  
   203  func (b *bitSet) markBits(start, end int) {
   204  	for i := start; i < end; i++ {
   205  		(*b)[i] = true
   206  	}
   207  }
   208  
   209  func (b *bitSet) clearAllBits() {
   210  	for i := range *b {
   211  		(*b)[i] = false
   212  	}
   213  }
   214  
   215  // L0Compaction describes an active compaction with inputs from L0.
   216  type L0Compaction struct {
   217  	Smallest  InternalKey
   218  	Largest   InternalKey
   219  	IsIntraL0 bool
   220  }
   221  
   222  // L0Sublevels represents a sublevel view of SSTables in L0. Tables in one
   223  // sublevel are non-overlapping in key ranges, and keys in higher-indexed
   224  // sublevels shadow older versions in lower-indexed sublevels. These invariants
   225  // are similar to the regular level invariants, except with higher indexed
   226  // sublevels having newer keys as opposed to lower indexed levels.
   227  //
   228  // There is no limit to the number of sublevels that can exist in L0 at any
   229  // time, however read and compaction performance is best when there are as few
   230  // sublevels as possible.
   231  type L0Sublevels struct {
   232  	// Levels are ordered from oldest sublevel to youngest sublevel in the
   233  	// outer slice, and the inner slice contains non-overlapping files for
   234  	// that sublevel in increasing key order. Levels is constructed from
   235  	// levelFiles and is used by callers that require a LevelSlice. The below two
   236  	// fields are treated as immutable once created in NewL0Sublevels.
   237  	Levels     []LevelSlice
   238  	levelFiles [][]*FileMetadata
   239  
   240  	cmp       Compare
   241  	formatKey base.FormatKey
   242  
   243  	fileBytes uint64
   244  	// All the L0 files, ordered from oldest to youngest.
   245  	levelMetadata *LevelMetadata
   246  
   247  	// The file intervals in increasing key order.
   248  	orderedIntervals []fileInterval
   249  
   250  	// Keys to break flushes at.
   251  	flushSplitUserKeys [][]byte
   252  
   253  	// Only used to check invariants.
   254  	addL0FilesCalled bool
   255  }
   256  
   257  type sublevelSorter []*FileMetadata
   258  
   259  // Len implements sort.Interface.
   260  func (sl sublevelSorter) Len() int {
   261  	return len(sl)
   262  }
   263  
   264  // Less implements sort.Interface.
   265  func (sl sublevelSorter) Less(i, j int) bool {
   266  	return sl[i].minIntervalIndex < sl[j].minIntervalIndex
   267  }
   268  
   269  // Swap implements sort.Interface.
   270  func (sl sublevelSorter) Swap(i, j int) {
   271  	sl[i], sl[j] = sl[j], sl[i]
   272  }
   273  
   274  // NewL0Sublevels creates an L0Sublevels instance for a given set of L0 files.
   275  // These files must all be in L0 and must be sorted by seqnum (see
   276  // SortBySeqNum). During interval iteration, when flushSplitMaxBytes bytes are
   277  // exceeded in the range of intervals since the last flush split key, a flush
   278  // split key is added.
   279  //
   280  // This method can be called without DB.mu being held, so any DB.mu protected
   281  // fields in FileMetadata cannot be accessed here, such as Compacting and
   282  // IsIntraL0Compacting. Those fields are accessed in InitCompactingFileInfo
   283  // instead.
   284  func NewL0Sublevels(
   285  	levelMetadata *LevelMetadata, cmp Compare, formatKey base.FormatKey, flushSplitMaxBytes int64,
   286  ) (*L0Sublevels, error) {
   287  	s := &L0Sublevels{cmp: cmp, formatKey: formatKey}
   288  	s.levelMetadata = levelMetadata
   289  	keys := make([]intervalKeyTemp, 0, 2*s.levelMetadata.Len())
   290  	iter := levelMetadata.Iter()
   291  	for i, f := 0, iter.First(); f != nil; i, f = i+1, iter.Next() {
   292  		f.L0Index = i
   293  		keys = append(keys, intervalKeyTemp{
   294  			intervalKey: intervalKey{key: f.Smallest.UserKey},
   295  			fileMeta:    f,
   296  			isEndKey:    false,
   297  		})
   298  		keys = append(keys, intervalKeyTemp{
   299  			intervalKey: intervalKey{
   300  				key:       f.Largest.UserKey,
   301  				isLargest: !f.Largest.IsExclusiveSentinel(),
   302  			},
   303  			fileMeta: f,
   304  			isEndKey: true,
   305  		})
   306  	}
   307  	keys = sortAndSweep(keys, cmp)
   308  	// All interval indices reference s.orderedIntervals.
   309  	s.orderedIntervals = make([]fileInterval, len(keys))
   310  	for i := range keys {
   311  		s.orderedIntervals[i] = fileInterval{
   312  			index:                 i,
   313  			startKey:              keys[i].intervalKey,
   314  			filesMinIntervalIndex: i,
   315  			filesMaxIntervalIndex: i,
   316  		}
   317  	}
   318  	// Initialize minIntervalIndex and maxIntervalIndex for each file, and use that
   319  	// to update intervals.
   320  	for f := iter.First(); f != nil; f = iter.Next() {
   321  		if err := s.addFileToSublevels(f, false /* checkInvariant */); err != nil {
   322  			return nil, err
   323  		}
   324  	}
   325  	// Sort each sublevel in increasing key order.
   326  	for i := range s.levelFiles {
   327  		sort.Sort(sublevelSorter(s.levelFiles[i]))
   328  	}
   329  
   330  	// Construct a parallel slice of sublevel B-Trees.
   331  	// TODO(jackson): Consolidate and only use the B-Trees.
   332  	for _, sublevelFiles := range s.levelFiles {
   333  		tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles)
   334  		s.Levels = append(s.Levels, ls)
   335  		tr.release()
   336  	}
   337  
   338  	s.calculateFlushSplitKeys(flushSplitMaxBytes)
   339  	return s, nil
   340  }
   341  
   342  // Helper function to merge new intervalKeys into an existing slice
   343  // of old fileIntervals, into result. Returns the new result and a slice of ints
   344  // mapping old interval indices to new ones. The added intervalKeys do not
   345  // need to be sorted; they get sorted and deduped in this function.
   346  func mergeIntervals(
   347  	old, result []fileInterval, added []intervalKeyTemp, compare Compare,
   348  ) ([]fileInterval, []int) {
   349  	sorter := intervalKeySorter{keys: added, cmp: compare}
   350  	sort.Sort(sorter)
   351  
   352  	oldToNewMap := make([]int, len(old))
   353  	i := 0
   354  	j := 0
   355  
   356  	for i < len(old) || j < len(added) {
   357  		for j > 0 && j < len(added) && intervalKeyCompare(compare, added[j-1].intervalKey, added[j].intervalKey) == 0 {
   358  			added[j].setFileIntervalIndex(len(result) - 1)
   359  			j++
   360  		}
   361  		if i >= len(old) && j >= len(added) {
   362  			break
   363  		}
   364  		var cmp int
   365  		if i >= len(old) {
   366  			cmp = +1
   367  		}
   368  		if j >= len(added) {
   369  			cmp = -1
   370  		}
   371  		if cmp == 0 {
   372  			cmp = intervalKeyCompare(compare, old[i].startKey, added[j].intervalKey)
   373  		}
   374  		switch {
   375  		case cmp <= 0:
   376  			// Shallow-copy the existing interval.
   377  			newInterval := old[i]
   378  			result = append(result, newInterval)
   379  			oldToNewMap[i] = len(result) - 1
   380  			i++
   381  			if cmp == 0 {
   382  				added[j].setFileIntervalIndex(len(result) - 1)
   383  				j++
   384  			}
   385  		case cmp > 0:
   386  			var prevInterval fileInterval
   387  			// Insert a new interval for a newly-added file. prevInterval, if
   388  			// non-zero, will be "inherited"; we copy its files as those extend
   389  			// into this interval.
   390  			if len(result) > 0 {
   391  				prevInterval = result[len(result)-1]
   392  			}
   393  			newInterval := fileInterval{
   394  				index:                 len(result),
   395  				startKey:              added[j].intervalKey,
   396  				filesMinIntervalIndex: len(result),
   397  				filesMaxIntervalIndex: len(result),
   398  
   399  				// estimatedBytes gets recalculated later on, as the number of intervals
   400  				// the file bytes are interpolated over has changed.
   401  				estimatedBytes: 0,
   402  				// Copy the below attributes from prevInterval.
   403  				files:                         append([]*FileMetadata(nil), prevInterval.files...),
   404  				isBaseCompacting:              prevInterval.isBaseCompacting,
   405  				intervalRangeIsBaseCompacting: prevInterval.intervalRangeIsBaseCompacting,
   406  				compactingFileCount:           prevInterval.compactingFileCount,
   407  			}
   408  			result = append(result, newInterval)
   409  			added[j].setFileIntervalIndex(len(result) - 1)
   410  			j++
   411  		}
   412  	}
   413  	return result, oldToNewMap
   414  }
   415  
   416  // AddL0Files incrementally builds a new L0Sublevels for when the only
   417  // change since the receiver L0Sublevels was an addition of the specified files,
   418  // with no L0 deletions. The common case of this is an ingestion or a flush.
   419  // These files can "sit on top" of existing sublevels, creating at most one
   420  // new sublevel for a flush (and possibly multiple for an ingestion), and at
   421  // most 2*len(files) additions to s.orderedIntervals. No files must have been
   422  // deleted from L0, and the added files must all be newer in sequence numbers
   423  // than existing files in L0Sublevels. The files parameter must be sorted in
   424  // seqnum order. The levelMetadata parameter corresponds to the new L0 post
   425  // addition of files. This method is meant to be significantly more performant
   426  // than NewL0Sublevels.
   427  //
   428  // Note that this function can only be called once on a given receiver; it
   429  // appends to some slices in s which is only safe when done once. This is okay,
   430  // as the common case (generating a new L0Sublevels after a flush/ingestion) is
   431  // only going to necessitate one call of this method on a given receiver. The
   432  // returned value, if non-nil, can then have AddL0Files called on it again, and
   433  // so on. If errInvalidL0SublevelsOpt is returned as an error, it likely means
   434  // the optimization could not be applied (i.e. files added were older than files
   435  // already in the sublevels, which is possible around ingestions and in tests).
   436  // Eg. it can happen when an ingested file was ingested without queueing a flush
   437  // since it did not actually overlap with any keys in the memtable. Later on the
   438  // memtable was flushed, and the memtable had keys spanning around the ingested
   439  // file, producing a flushed file that overlapped with the ingested file in file
   440  // bounds but not in keys. It's possible for that flushed file to have a lower
   441  // LargestSeqNum than the ingested file if all the additions after the ingestion
   442  // were to another flushed file that was split into a separate sstable during
   443  // flush. Any other non-nil error means L0Sublevels generation failed in the same
   444  // way as NewL0Sublevels would likely fail.
   445  func (s *L0Sublevels) AddL0Files(
   446  	files []*FileMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata,
   447  ) (*L0Sublevels, error) {
   448  	if invariants.Enabled && s.addL0FilesCalled {
   449  		panic("AddL0Files called twice on the same receiver")
   450  	}
   451  	s.addL0FilesCalled = true
   452  
   453  	// Start with a shallow copy of s.
   454  	newVal := &L0Sublevels{}
   455  	*newVal = *s
   456  
   457  	newVal.addL0FilesCalled = false
   458  	newVal.levelMetadata = levelMetadata
   459  	// Deep copy levelFiles and Levels, as they are mutated and sorted below.
   460  	// Shallow copies of slices that we just append to, are okay.
   461  	newVal.levelFiles = make([][]*FileMetadata, len(s.levelFiles))
   462  	for i := range s.levelFiles {
   463  		newVal.levelFiles[i] = make([]*FileMetadata, len(s.levelFiles[i]))
   464  		copy(newVal.levelFiles[i], s.levelFiles[i])
   465  	}
   466  	newVal.Levels = make([]LevelSlice, len(s.Levels))
   467  	copy(newVal.Levels, s.Levels)
   468  
   469  	fileKeys := make([]intervalKeyTemp, 0, 2*len(files))
   470  	for _, f := range files {
   471  		left := intervalKeyTemp{
   472  			intervalKey: intervalKey{key: f.Smallest.UserKey},
   473  			fileMeta:    f,
   474  		}
   475  		right := intervalKeyTemp{
   476  			intervalKey: intervalKey{
   477  				key:       f.Largest.UserKey,
   478  				isLargest: !f.Largest.IsExclusiveSentinel(),
   479  			},
   480  			fileMeta: f,
   481  			isEndKey: true,
   482  		}
   483  		fileKeys = append(fileKeys, left, right)
   484  	}
   485  	keys := make([]fileInterval, 0, 2*levelMetadata.Len())
   486  	var oldToNewMap []int
   487  	// We can avoid the sortAndSweep step on the combined length of
   488  	// s.orderedIntervals and fileKeys by treating this as a merge of two
   489  	// sorted runs, fileKeys and s.orderedIntervals, into `keys` which will form
   490  	// newVal.orderedIntervals.
   491  	keys, oldToNewMap = mergeIntervals(s.orderedIntervals, keys, fileKeys, s.cmp)
   492  	if invariants.Enabled {
   493  		for i := 1; i < len(keys); i++ {
   494  			if intervalKeyCompare(newVal.cmp, keys[i-1].startKey, keys[i].startKey) >= 0 {
   495  				panic("keys not sorted correctly")
   496  			}
   497  		}
   498  	}
   499  	newVal.orderedIntervals = keys
   500  	// Update indices in s.orderedIntervals for fileIntervals we retained.
   501  	for _, newIdx := range oldToNewMap {
   502  		newInterval := &keys[newIdx]
   503  		newInterval.index = newIdx
   504  		// This code, and related code in the for loop below, adjusts
   505  		// files{Min,Max}IntervalIndex just for interval indices shifting due to new
   506  		// intervals, and not for any of the new files being added to the same
   507  		// intervals. The goal is to produce a state of the system that's accurate
   508  		// for all existing files, and has all the new intervals to support new
   509  		// files. Once that's done, we can just call addFileToSublevel to adjust
   510  		// all relevant intervals for new files.
   511  		newInterval.filesMinIntervalIndex = oldToNewMap[newInterval.filesMinIntervalIndex]
   512  		// maxIntervalIndexes are special. Since it's an inclusive end bound, we
   513  		// actually have to map it to the _next_ old interval's new previous
   514  		// interval. This logic is easier to understand if you see
   515  		// [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex,
   516  		// f.maxIntervalIndex+1). The other case to remember is when the interval is
   517  		// completely empty (i.e. len(newInterval.files) == 0); in that case we want
   518  		// to refer back to ourselves regardless of additions to the right of us.
   519  		if newInterval.filesMaxIntervalIndex < len(oldToNewMap)-1 && len(newInterval.files) > 0 {
   520  			newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex+1] - 1
   521  		} else {
   522  			// newInterval.filesMaxIntervalIndex == len(oldToNewMap)-1.
   523  			newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex]
   524  		}
   525  	}
   526  	// Loop through all instances of new intervals added between two old intervals
   527  	// and expand [filesMinIntervalIndex, filesMaxIntervalIndex] of new intervals
   528  	// to reflect that of adjacent old intervals.
   529  	{
   530  		// We can skip cases where new intervals were added to the left of all
   531  		// existing intervals (eg. if the first entry in oldToNewMap is
   532  		// oldToNewMap[0] >= 1). Those intervals will only contain newly added files
   533  		// and will have their parameters adjusted down in addFileToSublevels. The
   534  		// same can also be said about new intervals that are to the right of all
   535  		// existing intervals.
   536  		lastIdx := 0
   537  		for _, newIdx := range oldToNewMap {
   538  			for i := lastIdx + 1; i < newIdx; i++ {
   539  				minIntervalIndex := i
   540  				maxIntervalIndex := i
   541  				if keys[lastIdx].filesMaxIntervalIndex != lastIdx {
   542  					// Last old interval has files extending into keys[i].
   543  					minIntervalIndex = keys[lastIdx].filesMinIntervalIndex
   544  					maxIntervalIndex = keys[lastIdx].filesMaxIntervalIndex
   545  				}
   546  
   547  				keys[i].filesMinIntervalIndex = minIntervalIndex
   548  				keys[i].filesMaxIntervalIndex = maxIntervalIndex
   549  			}
   550  			lastIdx = newIdx
   551  		}
   552  	}
   553  	// Go through old files and update interval indices.
   554  	//
   555  	// TODO(bilal): This is the only place in this method where we loop through
   556  	// all existing files, which could be much more in number than newly added
   557  	// files. See if we can avoid the need for this, either by getting rid of
   558  	// f.minIntervalIndex and f.maxIntervalIndex and calculating them on the
   559  	// fly with a binary search, or by only looping through files to the right
   560  	// of the first interval touched by this method.
   561  	for sublevel := range s.Levels {
   562  		s.Levels[sublevel].Each(func(f *FileMetadata) {
   563  			oldIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1
   564  			oldMinIntervalIndex := f.minIntervalIndex
   565  			f.minIntervalIndex = oldToNewMap[f.minIntervalIndex]
   566  			// maxIntervalIndex is special. Since it's an inclusive end bound, we
   567  			// actually have to map it to the _next_ old interval's new previous
   568  			// interval. This logic is easier to understand if you see
   569  			// [f.minIntervalIndex, f.maxIntervalIndex] as
   570  			// [f.minIntervalIndex, f.maxIntervalIndex+1).
   571  			f.maxIntervalIndex = oldToNewMap[f.maxIntervalIndex+1] - 1
   572  			newIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1
   573  			// Recalculate estimatedBytes for all old files across new intervals, but
   574  			// only if new intervals were added in between.
   575  			if oldIntervalDelta != newIntervalDelta {
   576  				// j is incremented so that oldToNewMap[j] points to the next old
   577  				// interval. This is used to distinguish between old intervals (i.e.
   578  				// ones where we need to subtract f.Size/oldIntervalDelta) from new
   579  				// ones (where we don't need to subtract). In both cases we need to add
   580  				// f.Size/newIntervalDelta.
   581  				j := oldMinIntervalIndex
   582  				for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   583  					if oldToNewMap[j] == i {
   584  						newVal.orderedIntervals[i].estimatedBytes -= f.Size / uint64(oldIntervalDelta)
   585  						j++
   586  					}
   587  					newVal.orderedIntervals[i].estimatedBytes += f.Size / uint64(newIntervalDelta)
   588  				}
   589  			}
   590  		})
   591  	}
   592  	updatedSublevels := make([]int, 0)
   593  	// Update interval indices for new files.
   594  	for i, f := range files {
   595  		f.L0Index = s.levelMetadata.Len() + i
   596  		if err := newVal.addFileToSublevels(f, true /* checkInvariant */); err != nil {
   597  			return nil, err
   598  		}
   599  		updatedSublevels = append(updatedSublevels, f.SubLevel)
   600  	}
   601  
   602  	// Sort and deduplicate updatedSublevels.
   603  	sort.Ints(updatedSublevels)
   604  	{
   605  		j := 0
   606  		for i := 1; i < len(updatedSublevels); i++ {
   607  			if updatedSublevels[i] != updatedSublevels[j] {
   608  				j++
   609  				updatedSublevels[j] = updatedSublevels[i]
   610  			}
   611  		}
   612  		updatedSublevels = updatedSublevels[:j+1]
   613  	}
   614  
   615  	// Sort each updated sublevel in increasing key order.
   616  	for _, sublevel := range updatedSublevels {
   617  		sort.Sort(sublevelSorter(newVal.levelFiles[sublevel]))
   618  	}
   619  
   620  	// Construct a parallel slice of sublevel B-Trees.
   621  	// TODO(jackson): Consolidate and only use the B-Trees.
   622  	for _, sublevel := range updatedSublevels {
   623  		tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel])
   624  		if sublevel == len(newVal.Levels) {
   625  			newVal.Levels = append(newVal.Levels, ls)
   626  		} else {
   627  			// sublevel < len(s.Levels). If this panics, updatedSublevels was not
   628  			// populated correctly.
   629  			newVal.Levels[sublevel] = ls
   630  		}
   631  		tr.release()
   632  	}
   633  
   634  	newVal.flushSplitUserKeys = nil
   635  	newVal.calculateFlushSplitKeys(flushSplitMaxBytes)
   636  	return newVal, nil
   637  }
   638  
   639  // addFileToSublevels is called during L0Sublevels generation, and adds f to
   640  // the correct sublevel's levelFiles, the relevant intervals' files slices, and
   641  // sets interval indices on f. This method, if called successively on multiple
   642  // files, _must_ be called on successively newer files (by seqnum). If
   643  // checkInvariant is true, it could check for this in some cases and return
   644  // errInvalidL0SublevelsOpt if that invariant isn't held.
   645  func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) error {
   646  	// This is a simple and not very accurate estimate of the number of
   647  	// bytes this SSTable contributes to the intervals it is a part of.
   648  	//
   649  	// TODO(bilal): Call EstimateDiskUsage in sstable.Reader with interval
   650  	// bounds to get a better estimate for each interval.
   651  	interpolatedBytes := f.Size / uint64(f.maxIntervalIndex-f.minIntervalIndex+1)
   652  	s.fileBytes += f.Size
   653  	subLevel := 0
   654  	// Update state in every fileInterval for this file.
   655  	for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   656  		interval := &s.orderedIntervals[i]
   657  		if len(interval.files) > 0 &&
   658  			subLevel <= interval.files[len(interval.files)-1].SubLevel {
   659  			if checkInvariant && interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum {
   660  				// We are sliding this file "underneath" an existing file. Throw away
   661  				// and start over in NewL0Sublevels.
   662  				return errInvalidL0SublevelsOpt
   663  			}
   664  			subLevel = interval.files[len(interval.files)-1].SubLevel + 1
   665  		}
   666  		interval.estimatedBytes += interpolatedBytes
   667  		if f.minIntervalIndex < interval.filesMinIntervalIndex {
   668  			interval.filesMinIntervalIndex = f.minIntervalIndex
   669  		}
   670  		if f.maxIntervalIndex > interval.filesMaxIntervalIndex {
   671  			interval.filesMaxIntervalIndex = f.maxIntervalIndex
   672  		}
   673  		interval.files = append(interval.files, f)
   674  	}
   675  	f.SubLevel = subLevel
   676  	if subLevel > len(s.levelFiles) {
   677  		return errors.Errorf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles))
   678  	}
   679  	if subLevel == len(s.levelFiles) {
   680  		s.levelFiles = append(s.levelFiles, []*FileMetadata{f})
   681  	} else {
   682  		s.levelFiles[subLevel] = append(s.levelFiles[subLevel], f)
   683  	}
   684  	return nil
   685  }
   686  
   687  func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) {
   688  	var cumulativeBytes uint64
   689  	// Multiply flushSplitMaxBytes by the number of sublevels. This prevents
   690  	// excessive flush splitting when the number of sublevels increases.
   691  	flushSplitMaxBytes *= int64(len(s.levelFiles))
   692  	for i := 0; i < len(s.orderedIntervals); i++ {
   693  		interval := &s.orderedIntervals[i]
   694  		if flushSplitMaxBytes > 0 && cumulativeBytes > uint64(flushSplitMaxBytes) &&
   695  			(len(s.flushSplitUserKeys) == 0 ||
   696  				!bytes.Equal(interval.startKey.key, s.flushSplitUserKeys[len(s.flushSplitUserKeys)-1])) {
   697  			s.flushSplitUserKeys = append(s.flushSplitUserKeys, interval.startKey.key)
   698  			cumulativeBytes = 0
   699  		}
   700  		cumulativeBytes += s.orderedIntervals[i].estimatedBytes
   701  	}
   702  }
   703  
   704  // InitCompactingFileInfo initializes internal flags relating to compacting
   705  // files. Must be called after sublevel initialization.
   706  //
   707  // Requires DB.mu to be held.
   708  func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) {
   709  	for i := range s.orderedIntervals {
   710  		s.orderedIntervals[i].compactingFileCount = 0
   711  		s.orderedIntervals[i].isBaseCompacting = false
   712  		s.orderedIntervals[i].intervalRangeIsBaseCompacting = false
   713  	}
   714  
   715  	iter := s.levelMetadata.Iter()
   716  	for f := iter.First(); f != nil; f = iter.Next() {
   717  		if invariants.Enabled {
   718  			if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) {
   719  				panic(fmt.Sprintf("f.minIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s",
   720  					s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(f.Smallest.UserKey)))
   721  			}
   722  			if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) {
   723  				panic(fmt.Sprintf("f.maxIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s",
   724  					s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(f.Smallest.UserKey)))
   725  			}
   726  		}
   727  		if !f.IsCompacting() {
   728  			continue
   729  		}
   730  		for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
   731  			interval := &s.orderedIntervals[i]
   732  			interval.compactingFileCount++
   733  			if !f.IsIntraL0Compacting {
   734  				// If f.Compacting && !f.IsIntraL0Compacting, this file is
   735  				// being compacted to Lbase.
   736  				interval.isBaseCompacting = true
   737  			}
   738  		}
   739  	}
   740  
   741  	// Some intervals may be base compacting without the files contained
   742  	// within those intervals being marked as compacting. This is possible if
   743  	// the files were added after the compaction initiated, and the active
   744  	// compaction files straddle the input file. Mark these intervals as base
   745  	// compacting.
   746  	for _, c := range inProgress {
   747  		startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false}
   748  		endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()}
   749  		start := sort.Search(len(s.orderedIntervals), func(i int) bool {
   750  			return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) >= 0
   751  		})
   752  		end := sort.Search(len(s.orderedIntervals), func(i int) bool {
   753  			return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) >= 0
   754  		})
   755  		for i := start; i < end && i < len(s.orderedIntervals); i++ {
   756  			interval := &s.orderedIntervals[i]
   757  			if !c.IsIntraL0 {
   758  				interval.isBaseCompacting = true
   759  			}
   760  		}
   761  	}
   762  
   763  	min := 0
   764  	for i := range s.orderedIntervals {
   765  		interval := &s.orderedIntervals[i]
   766  		if interval.isBaseCompacting {
   767  			minIndex := interval.filesMinIntervalIndex
   768  			if minIndex < min {
   769  				minIndex = min
   770  			}
   771  			for j := minIndex; j <= interval.filesMaxIntervalIndex; j++ {
   772  				min = j
   773  				s.orderedIntervals[j].intervalRangeIsBaseCompacting = true
   774  			}
   775  		}
   776  	}
   777  }
   778  
   779  // String produces a string containing useful debug information. Useful in test
   780  // code and debugging.
   781  func (s *L0Sublevels) String() string {
   782  	return s.describe(false)
   783  }
   784  
   785  func (s *L0Sublevels) describe(verbose bool) string {
   786  	var buf strings.Builder
   787  	fmt.Fprintf(&buf, "file count: %d, sublevels: %d, intervals: %d\nflush split keys(%d): [",
   788  		s.levelMetadata.Len(), len(s.levelFiles), len(s.orderedIntervals), len(s.flushSplitUserKeys))
   789  	for i := range s.flushSplitUserKeys {
   790  		fmt.Fprintf(&buf, "%s", s.formatKey(s.flushSplitUserKeys[i]))
   791  		if i < len(s.flushSplitUserKeys)-1 {
   792  			fmt.Fprintf(&buf, ", ")
   793  		}
   794  	}
   795  	fmt.Fprintln(&buf, "]")
   796  	numCompactingFiles := 0
   797  	for i := len(s.levelFiles) - 1; i >= 0; i-- {
   798  		maxIntervals := 0
   799  		sumIntervals := 0
   800  		var totalBytes uint64
   801  		for _, f := range s.levelFiles[i] {
   802  			intervals := f.maxIntervalIndex - f.minIntervalIndex + 1
   803  			if intervals > maxIntervals {
   804  				maxIntervals = intervals
   805  			}
   806  			sumIntervals += intervals
   807  			totalBytes += f.Size
   808  			if f.IsCompacting() {
   809  				numCompactingFiles++
   810  			}
   811  		}
   812  		fmt.Fprintf(&buf, "0.%d: file count: %d, bytes: %d, width (mean, max): %0.1f, %d, interval range: [%d, %d]\n",
   813  			i, len(s.levelFiles[i]), totalBytes, float64(sumIntervals)/float64(len(s.levelFiles[i])), maxIntervals, s.levelFiles[i][0].minIntervalIndex,
   814  			s.levelFiles[i][len(s.levelFiles[i])-1].maxIntervalIndex)
   815  		for _, f := range s.levelFiles[i] {
   816  			intervals := f.maxIntervalIndex - f.minIntervalIndex + 1
   817  			if verbose {
   818  				fmt.Fprintf(&buf, "\t%s\n", f)
   819  			}
   820  			if s.levelMetadata.Len() > 50 && intervals*3 > len(s.orderedIntervals) {
   821  				var intervalsBytes uint64
   822  				for k := f.minIntervalIndex; k <= f.maxIntervalIndex; k++ {
   823  					intervalsBytes += s.orderedIntervals[k].estimatedBytes
   824  				}
   825  				fmt.Fprintf(&buf, "wide file: %d, [%d, %d], byte fraction: %f\n",
   826  					f.FileNum, f.minIntervalIndex, f.maxIntervalIndex,
   827  					float64(intervalsBytes)/float64(s.fileBytes))
   828  			}
   829  		}
   830  	}
   831  
   832  	lastCompactingIntervalStart := -1
   833  	fmt.Fprintf(&buf, "compacting file count: %d, base compacting intervals: ", numCompactingFiles)
   834  	i := 0
   835  	foundBaseCompactingIntervals := false
   836  	for ; i < len(s.orderedIntervals); i++ {
   837  		interval := &s.orderedIntervals[i]
   838  		if len(interval.files) == 0 {
   839  			continue
   840  		}
   841  		if !interval.isBaseCompacting {
   842  			if lastCompactingIntervalStart != -1 {
   843  				if foundBaseCompactingIntervals {
   844  					buf.WriteString(", ")
   845  				}
   846  				fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1)
   847  				foundBaseCompactingIntervals = true
   848  			}
   849  			lastCompactingIntervalStart = -1
   850  		} else {
   851  			if lastCompactingIntervalStart == -1 {
   852  				lastCompactingIntervalStart = i
   853  			}
   854  		}
   855  	}
   856  	if lastCompactingIntervalStart != -1 {
   857  		if foundBaseCompactingIntervals {
   858  			buf.WriteString(", ")
   859  		}
   860  		fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1)
   861  	} else if !foundBaseCompactingIntervals {
   862  		fmt.Fprintf(&buf, "none")
   863  	}
   864  	fmt.Fprintln(&buf, "")
   865  	return buf.String()
   866  }
   867  
   868  // ReadAmplification returns the contribution of L0Sublevels to the read
   869  // amplification for any particular point key. It is the maximum height of any
   870  // tracked fileInterval. This is always less than or equal to the number of
   871  // sublevels.
   872  func (s *L0Sublevels) ReadAmplification() int {
   873  	amp := 0
   874  	for i := range s.orderedIntervals {
   875  		interval := &s.orderedIntervals[i]
   876  		fileCount := len(interval.files)
   877  		if amp < fileCount {
   878  			amp = fileCount
   879  		}
   880  	}
   881  	return amp
   882  }
   883  
   884  // UserKeyRange encodes a key range in user key space. A UserKeyRange's Start
   885  // and End boundaries are both inclusive.
   886  type UserKeyRange struct {
   887  	Start, End []byte
   888  }
   889  
   890  // InUseKeyRanges returns the merged table bounds of L0 files overlapping the
   891  // provided user key range. The returned key ranges are sorted and
   892  // nonoverlapping.
   893  func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange {
   894  	// Binary search to find the provided keys within the intervals.
   895  	startIK := intervalKey{key: smallest, isLargest: false}
   896  	endIK := intervalKey{key: largest, isLargest: true}
   897  	start := sort.Search(len(s.orderedIntervals), func(i int) bool {
   898  		return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) > 0
   899  	})
   900  	if start > 0 {
   901  		// Back up to the first interval with a start key <= startIK.
   902  		start--
   903  	}
   904  	end := sort.Search(len(s.orderedIntervals), func(i int) bool {
   905  		return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) > 0
   906  	})
   907  
   908  	var keyRanges []UserKeyRange
   909  	var curr *UserKeyRange
   910  	for i := start; i < end; {
   911  		// Intervals with no files are not in use and can be skipped, once we
   912  		// end the current UserKeyRange.
   913  		if len(s.orderedIntervals[i].files) == 0 {
   914  			curr = nil
   915  			i++
   916  			continue
   917  		}
   918  
   919  		// If curr is nil, start a new in-use key range.
   920  		if curr == nil {
   921  			keyRanges = append(keyRanges, UserKeyRange{
   922  				Start: s.orderedIntervals[i].startKey.key,
   923  			})
   924  			curr = &keyRanges[len(keyRanges)-1]
   925  		}
   926  
   927  		// If the filesMaxIntervalIndex is not the current index, we can jump
   928  		// to the max index, knowing that all intermediary intervals are
   929  		// overlapped by some file.
   930  		if maxIdx := s.orderedIntervals[i].filesMaxIntervalIndex; maxIdx != i {
   931  			// Note that end may be less than or equal to maxIdx if we're
   932  			// concerned with a key range that ends before the interval at
   933  			// maxIdx starts. We must set curr.End now, before making that
   934  			// leap, because this iteration may be the last.
   935  			i = maxIdx
   936  			curr.End = s.orderedIntervals[i+1].startKey.key
   937  			continue
   938  		}
   939  
   940  		// No files overlapping with this interval overlap with the next
   941  		// interval. Update the current end to be the next interval's start
   942  		// key. Note that curr is not necessarily finished, because there may
   943  		// be an abutting non-empty interval.
   944  		curr.End = s.orderedIntervals[i+1].startKey.key
   945  		i++
   946  	}
   947  	return keyRanges
   948  }
   949  
   950  // FlushSplitKeys returns a slice of user keys to split flushes at.
   951  // Used by flushes to avoid writing sstables that straddle these split keys.
   952  // These should be interpreted as the keys to start the next sstable (not the
   953  // last key to include in the prev sstable). These are user keys so that
   954  // range tombstones can be properly truncated (untruncated range tombstones
   955  // are not permitted for L0 files).
   956  func (s *L0Sublevels) FlushSplitKeys() [][]byte {
   957  	return s.flushSplitUserKeys
   958  }
   959  
   960  // MaxDepthAfterOngoingCompactions returns an estimate of maximum depth of
   961  // sublevels after all ongoing compactions run to completion. Used by compaction
   962  // picker to decide compaction score for L0. There is no scoring for intra-L0
   963  // compactions -- they only run if L0 score is high but we're unable to pick an
   964  // L0 -> Lbase compaction.
   965  func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int {
   966  	depth := 0
   967  	for i := range s.orderedIntervals {
   968  		interval := &s.orderedIntervals[i]
   969  		intervalDepth := len(interval.files) - interval.compactingFileCount
   970  		if depth < intervalDepth {
   971  			depth = intervalDepth
   972  		}
   973  	}
   974  	return depth
   975  }
   976  
   977  // Only for temporary debugging in the absence of proper tests.
   978  //
   979  // TODO(bilal): Simplify away the debugging statements in this method, and make
   980  // this a pure sanity checker.
   981  //
   982  //lint:ignore U1000 - useful for debugging
   983  func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error {
   984  	includedFiles := newBitSet(s.levelMetadata.Len())
   985  	fileIntervalsByLevel := make([]struct {
   986  		min int
   987  		max int
   988  	}, len(s.levelFiles))
   989  	for i := range fileIntervalsByLevel {
   990  		fileIntervalsByLevel[i].min = math.MaxInt32
   991  		fileIntervalsByLevel[i].max = 0
   992  	}
   993  	var topLevel int
   994  	var increment int
   995  	var limitReached func(int) bool
   996  	if c.isIntraL0 {
   997  		topLevel = len(s.levelFiles) - 1
   998  		increment = +1
   999  		limitReached = func(level int) bool {
  1000  			return level == len(s.levelFiles)
  1001  		}
  1002  	} else {
  1003  		topLevel = 0
  1004  		increment = -1
  1005  		limitReached = func(level int) bool {
  1006  			return level < 0
  1007  		}
  1008  	}
  1009  	for _, f := range c.Files {
  1010  		if fileIntervalsByLevel[f.SubLevel].min > f.minIntervalIndex {
  1011  			fileIntervalsByLevel[f.SubLevel].min = f.minIntervalIndex
  1012  		}
  1013  		if fileIntervalsByLevel[f.SubLevel].max < f.maxIntervalIndex {
  1014  			fileIntervalsByLevel[f.SubLevel].max = f.maxIntervalIndex
  1015  		}
  1016  		includedFiles.markBit(f.L0Index)
  1017  		if c.isIntraL0 {
  1018  			if topLevel > f.SubLevel {
  1019  				topLevel = f.SubLevel
  1020  			}
  1021  		} else {
  1022  			if topLevel < f.SubLevel {
  1023  				topLevel = f.SubLevel
  1024  			}
  1025  		}
  1026  	}
  1027  	min := fileIntervalsByLevel[topLevel].min
  1028  	max := fileIntervalsByLevel[topLevel].max
  1029  	for level := topLevel; !limitReached(level); level += increment {
  1030  		if fileIntervalsByLevel[level].min < min {
  1031  			min = fileIntervalsByLevel[level].min
  1032  		}
  1033  		if fileIntervalsByLevel[level].max > max {
  1034  			max = fileIntervalsByLevel[level].max
  1035  		}
  1036  		index := sort.Search(len(s.levelFiles[level]), func(i int) bool {
  1037  			return s.levelFiles[level][i].maxIntervalIndex >= min
  1038  		})
  1039  		// start := index
  1040  		for ; index < len(s.levelFiles[level]); index++ {
  1041  			f := s.levelFiles[level][index]
  1042  			if f.minIntervalIndex > max {
  1043  				break
  1044  			}
  1045  			if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum {
  1046  				return errors.Errorf(
  1047  					"sstable %s in compaction has sequence numbers higher than the earliest unflushed seqnum %d: %d-%d",
  1048  					f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum,
  1049  					f.LargestSeqNum)
  1050  			}
  1051  			if !includedFiles[f.L0Index] {
  1052  				var buf strings.Builder
  1053  				fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n",
  1054  					c.isIntraL0, c.seedInterval, level, index, f.L0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval,
  1055  					f.minIntervalIndex, f.maxIntervalIndex,
  1056  					f.FileNum, f.IsCompacting(), s)
  1057  				fmt.Fprintf(&buf, "files included:\n")
  1058  				for _, f := range c.Files {
  1059  					fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n",
  1060  						f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex)
  1061  				}
  1062  				fmt.Fprintf(&buf, "files added:\n")
  1063  				for _, f := range c.filesAdded {
  1064  					fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n",
  1065  						f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex)
  1066  				}
  1067  				return errors.New(buf.String())
  1068  			}
  1069  		}
  1070  	}
  1071  	return nil
  1072  }
  1073  
  1074  // UpdateStateForStartedCompaction updates internal L0Sublevels state for a
  1075  // recently started compaction. isBase specifies if this is a base compaction;
  1076  // if false, this is assumed to be an intra-L0 compaction. The specified
  1077  // compaction must be involving L0 SSTables. It's assumed that the Compacting
  1078  // and IsIntraL0Compacting fields are already set on all FileMetadatas passed
  1079  // in.
  1080  func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error {
  1081  	minIntervalIndex := -1
  1082  	maxIntervalIndex := 0
  1083  	for i := range inputs {
  1084  		iter := inputs[i].Iter()
  1085  		for f := iter.First(); f != nil; f = iter.Next() {
  1086  			for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
  1087  				interval := &s.orderedIntervals[i]
  1088  				interval.compactingFileCount++
  1089  			}
  1090  			if f.minIntervalIndex < minIntervalIndex || minIntervalIndex == -1 {
  1091  				minIntervalIndex = f.minIntervalIndex
  1092  			}
  1093  			if f.maxIntervalIndex > maxIntervalIndex {
  1094  				maxIntervalIndex = f.maxIntervalIndex
  1095  			}
  1096  		}
  1097  	}
  1098  	if isBase {
  1099  		for i := minIntervalIndex; i <= maxIntervalIndex; i++ {
  1100  			interval := &s.orderedIntervals[i]
  1101  			interval.isBaseCompacting = isBase
  1102  			for j := interval.filesMinIntervalIndex; j <= interval.filesMaxIntervalIndex; j++ {
  1103  				s.orderedIntervals[j].intervalRangeIsBaseCompacting = true
  1104  			}
  1105  		}
  1106  	}
  1107  	return nil
  1108  }
  1109  
  1110  // L0CompactionFiles represents a candidate set of L0 files for compaction.
  1111  // Also referred to as "lcf". Contains state information useful
  1112  // for generating the compaction (such as Files), as well as for picking
  1113  // between candidate compactions (eg. fileBytes and
  1114  // seedIntervalStackDepthReduction).
  1115  type L0CompactionFiles struct {
  1116  	Files []*FileMetadata
  1117  
  1118  	FilesIncluded bitSet
  1119  	// A "seed interval" is an interval with a high stack depth that was chosen
  1120  	// to bootstrap this compaction candidate. seedIntervalStackDepthReduction
  1121  	// is the number of sublevels that have a file in the seed interval that is
  1122  	// a part of this compaction.
  1123  	seedIntervalStackDepthReduction int
  1124  	// For base compactions, seedIntervalMinLevel is 0, and for intra-L0
  1125  	// compactions, seedIntervalMaxLevel is len(s.Files)-1 i.e. the highest
  1126  	// sublevel.
  1127  	seedIntervalMinLevel int
  1128  	seedIntervalMaxLevel int
  1129  	// Index of the seed interval.
  1130  	seedInterval int
  1131  	// Sum of file sizes for all files in this compaction.
  1132  	fileBytes uint64
  1133  	// Intervals with index [minIntervalIndex, maxIntervalIndex] are
  1134  	// participating in this compaction; it's the union set of all intervals
  1135  	// overlapped by participating files.
  1136  	minIntervalIndex int
  1137  	maxIntervalIndex int
  1138  
  1139  	// Set for intra-L0 compactions. SSTables with sequence numbers greater
  1140  	// than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions.
  1141  	isIntraL0               bool
  1142  	earliestUnflushedSeqNum uint64
  1143  
  1144  	// For debugging purposes only. Used in checkCompaction().
  1145  	preExtensionMinInterval int
  1146  	preExtensionMaxInterval int
  1147  	filesAdded              []*FileMetadata
  1148  }
  1149  
  1150  // addFile adds the specified file to the LCF.
  1151  func (l *L0CompactionFiles) addFile(f *FileMetadata) {
  1152  	if l.FilesIncluded[f.L0Index] {
  1153  		return
  1154  	}
  1155  	l.FilesIncluded.markBit(f.L0Index)
  1156  	l.Files = append(l.Files, f)
  1157  	l.filesAdded = append(l.filesAdded, f)
  1158  	l.fileBytes += f.Size
  1159  	if f.minIntervalIndex < l.minIntervalIndex {
  1160  		l.minIntervalIndex = f.minIntervalIndex
  1161  	}
  1162  	if f.maxIntervalIndex > l.maxIntervalIndex {
  1163  		l.maxIntervalIndex = f.maxIntervalIndex
  1164  	}
  1165  }
  1166  
  1167  // Helper to order intervals being considered for compaction.
  1168  type intervalAndScore struct {
  1169  	interval int
  1170  	score    int
  1171  }
  1172  type intervalSorterByDecreasingScore []intervalAndScore
  1173  
  1174  func (is intervalSorterByDecreasingScore) Len() int { return len(is) }
  1175  func (is intervalSorterByDecreasingScore) Less(i, j int) bool {
  1176  	return is[i].score > is[j].score
  1177  }
  1178  func (is intervalSorterByDecreasingScore) Swap(i, j int) {
  1179  	is[i], is[j] = is[j], is[i]
  1180  }
  1181  
  1182  // Compactions:
  1183  //
  1184  // The sub-levels and intervals can be visualized in 2 dimensions as the X
  1185  // axis containing intervals in increasing order and the Y axis containing
  1186  // sub-levels (older to younger). The intervals can be sparse wrt sub-levels.
  1187  // We observe that the system is typically under severe pressure in L0 during
  1188  // large numbers of ingestions where most files added to L0 are narrow and
  1189  // non-overlapping.
  1190  //
  1191  //    L0.1    d---g
  1192  //    L0.0  c--e  g--j o--s u--x
  1193  //
  1194  // As opposed to a case with a lot of wide, overlapping L0 files:
  1195  //
  1196  //    L0.3     d-----------r
  1197  //    L0.2    c--------o
  1198  //    L0.1   b-----------q
  1199  //    L0.0  a----------------x
  1200  //
  1201  // In that case we expect the rectangle represented in the good visualization
  1202  // above (i.e. the first one) to be wide and short, and not too sparse (most
  1203  // intervals will have fileCount close to the sub-level count), which would make
  1204  // it amenable to concurrent L0 -> Lbase compactions.
  1205  //
  1206  // L0 -> Lbase: The high-level goal of a L0 -> Lbase compaction is to reduce
  1207  // stack depth, by compacting files in the intervals with the highest
  1208  // (fileCount - compactingCount). Additionally, we would like compactions to
  1209  // not involve a huge number of files, so that they finish quickly, and to
  1210  // allow for concurrent L0 -> Lbase compactions when needed. In order to
  1211  // achieve these goals we would like compactions to visualize as capturing
  1212  // thin and tall rectangles. The approach below is to consider intervals in
  1213  // some order and then try to construct a compaction using the interval. The
  1214  // first interval we can construct a compaction for is the compaction that is
  1215  // started. There can be multiple heuristics in choosing the ordering of the
  1216  // intervals -- the code uses one heuristic that worked well for a large
  1217  // ingestion stemming from a cockroachdb import, but additional experimentation
  1218  // is necessary to pick a general heuristic. Additionally, the compaction that
  1219  // gets picked may be not as desirable as one that could be constructed later
  1220  // in terms of reducing stack depth (since adding more files to the compaction
  1221  // can get blocked by needing to encompass files that are already being
  1222  // compacted). So an alternative would be to try to construct more than one
  1223  // compaction and pick the best one.
  1224  //
  1225  // Here's a visualization of an ideal L0->LBase compaction selection:
  1226  //
  1227  //    L0.3  a--d    g-j
  1228  //    L0.2         f--j          r-t
  1229  //    L0.1   b-d  e---j
  1230  //    L0.0  a--d   f--j  l--o  p-----x
  1231  //
  1232  //    Lbase a--------i    m---------w
  1233  //
  1234  // The [g,j] interval has the highest stack depth, so it would have the highest
  1235  // priority for selecting a base compaction candidate. Assuming none of the
  1236  // files are already compacting, this is the compaction that will be chosen:
  1237  //
  1238  //               _______
  1239  //    L0.3  a--d |  g-j|
  1240  //    L0.2       | f--j|         r-t
  1241  //    L0.1   b-d |e---j|
  1242  //    L0.0  a--d | f--j| l--o  p-----x
  1243  //
  1244  //    Lbase a--------i    m---------w
  1245  //
  1246  // Note that running this compaction will mark the a--i file in Lbase as
  1247  // compacting, and when ExtendL0ForBaseCompactionTo is called with the bounds
  1248  // of that base file, it'll expand the compaction to also include all L0 files
  1249  // in the a-d interval. The resultant compaction would then be:
  1250  //
  1251  //         _____________
  1252  //    L0.3 |a--d    g-j|
  1253  //    L0.2 |       f--j|         r-t
  1254  //    L0.1 | b-d  e---j|
  1255  //    L0.0 |a--d   f--j| l--o  p-----x
  1256  //
  1257  //    Lbase a--------i    m---------w
  1258  //
  1259  // The next best interval for base compaction would therefore
  1260  // be the one including r--t in L0.2 and p--x in L0.0, and both this compaction
  1261  // and the one picked earlier can run in parallel. This is assuming
  1262  // minCompactionDepth >= 2, otherwise the second compaction has too little
  1263  // depth to pick.
  1264  //
  1265  //         _____________
  1266  //    L0.3 |a--d    g-j|      _________
  1267  //    L0.2 |       f--j|      |  r-t  |
  1268  //    L0.1 | b-d  e---j|      |       |
  1269  //    L0.0 |a--d   f--j| l--o |p-----x|
  1270  //
  1271  //    Lbase a--------i    m---------w
  1272  //
  1273  // Note that when ExtendL0ForBaseCompactionTo is called, the compaction expands
  1274  // to the following, given that the [l,o] file can be added without including
  1275  // additional files in Lbase:
  1276  //
  1277  //         _____________
  1278  //    L0.3 |a--d    g-j|      _________
  1279  //    L0.2 |       f--j|      |  r-t  |
  1280  //    L0.1 | b-d  e---j|______|       |
  1281  //    L0.0 |a--d   f--j||l--o  p-----x|
  1282  //
  1283  //    Lbase a--------i    m---------w
  1284  //
  1285  // If an additional file existed in LBase that overlapped with [l,o], it would
  1286  // be excluded from the compaction. Concretely:
  1287  //
  1288  //         _____________
  1289  //    L0.3 |a--d    g-j|      _________
  1290  //    L0.2 |       f--j|      |  r-t  |
  1291  //    L0.1 | b-d  e---j|      |       |
  1292  //    L0.0 |a--d   f--j| l--o |p-----x|
  1293  //
  1294  //    Lbase a--------ij--lm---------w
  1295  //
  1296  // Intra-L0: If the L0 score is high, but PickBaseCompaction() is unable to
  1297  // pick a compaction, PickIntraL0Compaction will be used to pick an intra-L0
  1298  // compaction. Similar to L0 -> Lbase compactions, we want to allow for
  1299  // multiple intra-L0 compactions and not generate wide output files that
  1300  // hinder later concurrency of L0 -> Lbase compactions. Also compactions
  1301  // that produce wide files don't reduce stack depth -- they represent wide
  1302  // rectangles in our visualization, which means many intervals have their
  1303  // depth reduced by a small amount. Typically, L0 files have non-overlapping
  1304  // sequence numbers, and sticking to that invariant would require us to
  1305  // consider intra-L0 compactions that proceed from youngest to oldest files,
  1306  // which could result in the aforementioned undesirable wide rectangle
  1307  // shape. But this non-overlapping sequence number is already relaxed in
  1308  // RocksDB -- sstables are primarily ordered by their largest sequence
  1309  // number. So we can arrange for intra-L0 compactions to capture thin and
  1310  // tall rectangles starting with the top of the stack (youngest files).
  1311  // Like the L0 -> Lbase case we order the intervals using a heuristic and
  1312  // consider each in turn. The same comment about better L0 -> Lbase heuristics
  1313  // and not being greedy applies here.
  1314  //
  1315  // Going back to a modified version of our example from earlier, let's say these
  1316  // are the base compactions in progress:
  1317  //                _______
  1318  //    L0.3  a--d  |  g-j|      _________
  1319  //    L0.2        | f--j|      |  r-t  |
  1320  //    L0.1   b-d  |e---j|      |       |
  1321  //    L0.0  a--d  | f--j| l--o |p-----x|
  1322  //
  1323  //    Lbase a---------i    m---------w
  1324  //
  1325  // Since both LBase files are compacting, the only L0 compaction that can be
  1326  // picked is an intra-L0 compaction. For this, the b--d interval has the highest
  1327  // stack depth (3), and starting with a--d in L0.3 as the seed file, we can
  1328  // iterate downward and build this compaction, assuming all files in that
  1329  // interval are not compacting and have a highest sequence number less than
  1330  // earliestUnflushedSeqNum:
  1331  //
  1332  //                _______
  1333  //    L0.3 |a--d| |  g-j|      _________
  1334  //    L0.2 |    | | f--j|      |  r-t  |
  1335  //    L0.1 | b-d| |e---j|      |       |
  1336  //    L0.0 |a--d| | f--j| l--o |p-----x|
  1337  //         ------
  1338  //    Lbase a---------i    m---------w
  1339  //
  1340  
  1341  // PickBaseCompaction picks a base compaction based on the above specified
  1342  // heuristics, for the specified Lbase files and a minimum depth of overlapping
  1343  // files that can be selected for compaction. Returns nil if no compaction is
  1344  // possible.
  1345  func (s *L0Sublevels) PickBaseCompaction(
  1346  	minCompactionDepth int, baseFiles LevelSlice,
  1347  ) (*L0CompactionFiles, error) {
  1348  	// For LBase compactions, we consider intervals in a greedy manner in the
  1349  	// following order:
  1350  	// - Intervals that are unlikely to be blocked due
  1351  	//   to ongoing L0 -> Lbase compactions. These are the ones with
  1352  	//   !isBaseCompacting && !intervalRangeIsBaseCompacting.
  1353  	// - Intervals that are !isBaseCompacting && intervalRangeIsBaseCompacting.
  1354  	//
  1355  	// The ordering heuristic exists just to avoid wasted work. Ideally,
  1356  	// we would consider all intervals with isBaseCompacting = false and
  1357  	// construct a compaction for it and compare the constructed compactions
  1358  	// and pick the best one. If microbenchmarks show that we can afford
  1359  	// this cost we can eliminate this heuristic.
  1360  	scoredIntervals := make([]intervalAndScore, 0, len(s.orderedIntervals))
  1361  	sublevelCount := len(s.levelFiles)
  1362  	for i := range s.orderedIntervals {
  1363  		interval := &s.orderedIntervals[i]
  1364  		depth := len(interval.files) - interval.compactingFileCount
  1365  		if interval.isBaseCompacting || minCompactionDepth > depth {
  1366  			continue
  1367  		}
  1368  		if interval.intervalRangeIsBaseCompacting {
  1369  			scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth})
  1370  		} else {
  1371  			// Prioritize this interval by incrementing the score by the number
  1372  			// of sublevels.
  1373  			scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth + sublevelCount})
  1374  		}
  1375  	}
  1376  	sort.Sort(intervalSorterByDecreasingScore(scoredIntervals))
  1377  
  1378  	// Optimization to avoid considering different intervals that
  1379  	// are likely to choose the same seed file. Again this is just
  1380  	// to reduce wasted work.
  1381  	consideredIntervals := newBitSet(len(s.orderedIntervals))
  1382  	for _, scoredInterval := range scoredIntervals {
  1383  		interval := &s.orderedIntervals[scoredInterval.interval]
  1384  		if consideredIntervals[interval.index] {
  1385  			continue
  1386  		}
  1387  
  1388  		// Pick the seed file for the interval as the file
  1389  		// in the lowest sub-level.
  1390  		f := interval.files[0]
  1391  		// Don't bother considering the intervals that are
  1392  		// covered by the seed file since they are likely
  1393  		// nearby. Note that it is possible that those intervals
  1394  		// have seed files at lower sub-levels so could be
  1395  		// viable for compaction.
  1396  		if f == nil {
  1397  			return nil, errors.New("no seed file found in sublevel intervals")
  1398  		}
  1399  		consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1)
  1400  		if f.IsCompacting() {
  1401  			if f.IsIntraL0Compacting {
  1402  				// If we're picking a base compaction and we came across a
  1403  				// seed file candidate that's being intra-L0 compacted, skip
  1404  				// the interval instead of erroring out.
  1405  				continue
  1406  			}
  1407  			// We chose a compaction seed file that should not be
  1408  			// compacting. Usually means the score is not accurately
  1409  			// accounting for files already compacting, or internal state is
  1410  			// inconsistent.
  1411  			return nil, errors.Errorf("file %s chosen as seed file for compaction should not be compacting", f.FileNum)
  1412  		}
  1413  
  1414  		c := s.baseCompactionUsingSeed(f, interval.index, minCompactionDepth)
  1415  		if c != nil {
  1416  			// Check if the chosen compaction overlaps with any files
  1417  			// in Lbase that have Compacting = true. If that's the case,
  1418  			// this compaction cannot be chosen.
  1419  			baseIter := baseFiles.Iter()
  1420  			// An interval starting at ImmediateSuccessor(key) can never be the
  1421  			// first interval of a compaction since no file can start at that
  1422  			// interval.
  1423  			m := baseIter.SeekGE(s.cmp, s.orderedIntervals[c.minIntervalIndex].startKey.key)
  1424  
  1425  			var baseCompacting bool
  1426  			for ; m != nil && !baseCompacting; m = baseIter.Next() {
  1427  				cmp := s.cmp(m.Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key)
  1428  				// Compaction is ending at exclusive bound of c.maxIntervalIndex+1
  1429  				if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) {
  1430  					break
  1431  				}
  1432  				baseCompacting = baseCompacting || m.IsCompacting()
  1433  			}
  1434  			if baseCompacting {
  1435  				continue
  1436  			}
  1437  			return c, nil
  1438  		}
  1439  	}
  1440  	return nil, nil
  1441  }
  1442  
  1443  // Helper function for building an L0 -> Lbase compaction using a seed interval
  1444  // and seed file in that seed interval.
  1445  func (s *L0Sublevels) baseCompactionUsingSeed(
  1446  	f *FileMetadata, intervalIndex int, minCompactionDepth int,
  1447  ) *L0CompactionFiles {
  1448  	c := &L0CompactionFiles{
  1449  		FilesIncluded:        newBitSet(s.levelMetadata.Len()),
  1450  		seedInterval:         intervalIndex,
  1451  		seedIntervalMinLevel: 0,
  1452  		minIntervalIndex:     f.minIntervalIndex,
  1453  		maxIntervalIndex:     f.maxIntervalIndex,
  1454  	}
  1455  	c.addFile(f)
  1456  
  1457  	// The first iteration of this loop builds the compaction at the seed file's
  1458  	// sublevel. Future iterations expand on this compaction by stacking
  1459  	// more files from intervalIndex and repeating. This is an
  1460  	// optional activity so when it fails we can fallback to the last
  1461  	// successful candidate.
  1462  	var lastCandidate *L0CompactionFiles
  1463  	interval := &s.orderedIntervals[intervalIndex]
  1464  
  1465  	for i := 0; i < len(interval.files); i++ {
  1466  		f2 := interval.files[i]
  1467  		sl := f2.SubLevel
  1468  		c.seedIntervalStackDepthReduction++
  1469  		c.seedIntervalMaxLevel = sl
  1470  		c.addFile(f2)
  1471  		// The seed file is in the lowest sublevel in the seed interval, but it may
  1472  		// overlap with other files in even lower sublevels. For
  1473  		// correctness we need to grow our interval to include those files, and
  1474  		// capture all files in the next level that fall in this extended interval
  1475  		// and so on. This can result in a triangular shape like the following
  1476  		// where again the X axis is the key intervals and the Y axis
  1477  		// is oldest to youngest. Note that it is not necessary for
  1478  		// correctness to fill out the shape at the higher sub-levels
  1479  		// to make it more rectangular since the invariant only requires
  1480  		// that younger versions of a key not be moved to Lbase while
  1481  		// leaving behind older versions.
  1482  		//                     -
  1483  		//                    ---
  1484  		//                   -----
  1485  		// It may be better for performance to have a more rectangular
  1486  		// shape since the files being left behind will overlap with the
  1487  		// same Lbase key range as that of this compaction. But there is
  1488  		// also the danger that in trying to construct a more rectangular
  1489  		// shape we will be forced to pull in a file that is already
  1490  		// compacting. We expect extendCandidateToRectangle to eventually be called
  1491  		// on this compaction if it's chosen, at which point we would iterate
  1492  		// backward and choose those files. This logic is similar to compaction.grow
  1493  		// for non-L0 compactions.
  1494  		done := false
  1495  		for currLevel := sl - 1; currLevel >= 0; currLevel-- {
  1496  			if !s.extendFiles(currLevel, math.MaxUint64, c) {
  1497  				// Failed to extend due to ongoing compaction.
  1498  				done = true
  1499  				break
  1500  			}
  1501  		}
  1502  		if done {
  1503  			break
  1504  		}
  1505  		// Observed some compactions using > 1GB from L0 in an import
  1506  		// experiment. Very long running compactions are not great as they
  1507  		// reduce concurrency while they run, and take a while to produce
  1508  		// results, though they're sometimes unavoidable. There is a tradeoff
  1509  		// here in that adding more depth is more efficient in reducing stack
  1510  		// depth, but long running compactions reduce flexibility in what can
  1511  		// run concurrently in L0 and even Lbase -> Lbase+1. An increase more
  1512  		// than 150% in bytes since the last candidate compaction (along with a
  1513  		// total compaction size in excess of 100mb), or a total compaction
  1514  		// size beyond a hard limit of 500mb, is criteria for rejecting this
  1515  		// candidate. This lets us prefer slow growths as we add files, while
  1516  		// still having a hard limit. Note that if this is the first compaction
  1517  		// candidate to reach a stack depth reduction of minCompactionDepth or
  1518  		// higher, this candidate will be chosen regardless.
  1519  		if lastCandidate == nil {
  1520  			lastCandidate = &L0CompactionFiles{}
  1521  		} else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth &&
  1522  			c.fileBytes > 100<<20 &&
  1523  			(float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) {
  1524  			break
  1525  		}
  1526  		*lastCandidate = *c
  1527  	}
  1528  	if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth {
  1529  		lastCandidate.FilesIncluded.clearAllBits()
  1530  		for _, f := range lastCandidate.Files {
  1531  			lastCandidate.FilesIncluded.markBit(f.L0Index)
  1532  		}
  1533  		return lastCandidate
  1534  	}
  1535  	return nil
  1536  }
  1537  
  1538  // Expands fields in the provided L0CompactionFiles instance (cFiles) to
  1539  // include overlapping files in the specified sublevel. Returns true if the
  1540  // compaction is possible (i.e. does not conflict with any base/intra-L0
  1541  // compacting files).
  1542  func (s *L0Sublevels) extendFiles(
  1543  	sl int, earliestUnflushedSeqNum uint64, cFiles *L0CompactionFiles,
  1544  ) bool {
  1545  	index := sort.Search(len(s.levelFiles[sl]), func(i int) bool {
  1546  		return s.levelFiles[sl][i].maxIntervalIndex >= cFiles.minIntervalIndex
  1547  	})
  1548  	for ; index < len(s.levelFiles[sl]); index++ {
  1549  		f := s.levelFiles[sl][index]
  1550  		if f.minIntervalIndex > cFiles.maxIntervalIndex {
  1551  			break
  1552  		}
  1553  		if f.IsCompacting() {
  1554  			return false
  1555  		}
  1556  		// Skip over files that are newer than earliestUnflushedSeqNum. This is
  1557  		// okay because this compaction can just pretend these files are not in
  1558  		// L0 yet. These files must be in higher sublevels than any overlapping
  1559  		// files with f.LargestSeqNum < earliestUnflushedSeqNum, and the output
  1560  		// of the compaction will also go in a lower (older) sublevel than this
  1561  		// file by definition.
  1562  		if f.LargestSeqNum >= earliestUnflushedSeqNum {
  1563  			continue
  1564  		}
  1565  		cFiles.addFile(f)
  1566  	}
  1567  	return true
  1568  }
  1569  
  1570  // PickIntraL0Compaction picks an intra-L0 compaction for files in this
  1571  // sublevel. This method is only called when a base compaction cannot be chosen.
  1572  // See comment above PickBaseCompaction for heuristics involved in this
  1573  // selection.
  1574  func (s *L0Sublevels) PickIntraL0Compaction(
  1575  	earliestUnflushedSeqNum uint64, minCompactionDepth int,
  1576  ) (*L0CompactionFiles, error) {
  1577  	scoredIntervals := make([]intervalAndScore, len(s.orderedIntervals))
  1578  	for i := range s.orderedIntervals {
  1579  		interval := &s.orderedIntervals[i]
  1580  		depth := len(interval.files) - interval.compactingFileCount
  1581  		if minCompactionDepth > depth {
  1582  			continue
  1583  		}
  1584  		scoredIntervals[i] = intervalAndScore{interval: i, score: depth}
  1585  	}
  1586  	sort.Sort(intervalSorterByDecreasingScore(scoredIntervals))
  1587  
  1588  	// Optimization to avoid considering different intervals that
  1589  	// are likely to choose the same seed file. Again this is just
  1590  	// to reduce wasted work.
  1591  	consideredIntervals := newBitSet(len(s.orderedIntervals))
  1592  	for _, scoredInterval := range scoredIntervals {
  1593  		interval := &s.orderedIntervals[scoredInterval.interval]
  1594  		if consideredIntervals[interval.index] {
  1595  			continue
  1596  		}
  1597  
  1598  		var f *FileMetadata
  1599  		// Pick the seed file for the interval as the file
  1600  		// in the highest sub-level.
  1601  		stackDepthReduction := scoredInterval.score
  1602  		for i := len(interval.files) - 1; i >= 0; i-- {
  1603  			f = interval.files[i]
  1604  			if f.IsCompacting() {
  1605  				break
  1606  			}
  1607  			consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1)
  1608  			// Can this be the seed file? Files with newer sequence
  1609  			// numbers than earliestUnflushedSeqNum cannot be in
  1610  			// the compaction.
  1611  			if f.LargestSeqNum >= earliestUnflushedSeqNum {
  1612  				stackDepthReduction--
  1613  				if stackDepthReduction == 0 {
  1614  					break
  1615  				}
  1616  			} else {
  1617  				break
  1618  			}
  1619  		}
  1620  		if stackDepthReduction < minCompactionDepth {
  1621  			// Can't use this interval.
  1622  			continue
  1623  		}
  1624  
  1625  		if f == nil {
  1626  			return nil, errors.New("no seed file found in sublevel intervals")
  1627  		}
  1628  		if f.IsCompacting() {
  1629  			// This file could be in a concurrent intra-L0 or base compaction.
  1630  			// Try another interval.
  1631  			continue
  1632  		}
  1633  
  1634  		// We have a seed file. Build a compaction off of that seed.
  1635  		c := s.intraL0CompactionUsingSeed(
  1636  			f, interval.index, earliestUnflushedSeqNum, minCompactionDepth)
  1637  		if c != nil {
  1638  			return c, nil
  1639  		}
  1640  	}
  1641  	return nil, nil
  1642  }
  1643  
  1644  func (s *L0Sublevels) intraL0CompactionUsingSeed(
  1645  	f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int,
  1646  ) *L0CompactionFiles {
  1647  	// We know that all the files that overlap with intervalIndex have
  1648  	// LargestSeqNum < earliestUnflushedSeqNum, but for other intervals
  1649  	// we need to exclude files >= earliestUnflushedSeqNum
  1650  
  1651  	c := &L0CompactionFiles{
  1652  		FilesIncluded:           newBitSet(s.levelMetadata.Len()),
  1653  		seedInterval:            intervalIndex,
  1654  		seedIntervalMaxLevel:    len(s.levelFiles) - 1,
  1655  		minIntervalIndex:        f.minIntervalIndex,
  1656  		maxIntervalIndex:        f.maxIntervalIndex,
  1657  		isIntraL0:               true,
  1658  		earliestUnflushedSeqNum: earliestUnflushedSeqNum,
  1659  	}
  1660  	c.addFile(f)
  1661  
  1662  	var lastCandidate *L0CompactionFiles
  1663  	interval := &s.orderedIntervals[intervalIndex]
  1664  	slIndex := len(interval.files) - 1
  1665  	for {
  1666  		if interval.files[slIndex] == f {
  1667  			break
  1668  		}
  1669  		slIndex--
  1670  	}
  1671  	// The first iteration of this loop produces an intra-L0 compaction at the
  1672  	// seed level. Iterations after that optionally add to the compaction by
  1673  	// stacking more files from intervalIndex and repeating. This is an
  1674  	// optional activity so when it fails we can fallback to the last
  1675  	// successful candidate. The code stops adding when it can't add more, or
  1676  	// when fileBytes grows too large.
  1677  	for ; slIndex >= 0; slIndex-- {
  1678  		f2 := interval.files[slIndex]
  1679  		sl := f2.SubLevel
  1680  		if f2.IsCompacting() {
  1681  			break
  1682  		}
  1683  		c.seedIntervalStackDepthReduction++
  1684  		c.seedIntervalMinLevel = sl
  1685  		c.addFile(f2)
  1686  		// The seed file captures all files in the higher level that fall in the
  1687  		// range of intervals. That may extend the range of intervals so for
  1688  		// correctness we need to capture all files in the next higher level that
  1689  		// fall in this extended interval and so on. This can result in an
  1690  		// inverted triangular shape like the following where again the X axis is the
  1691  		// key intervals and the Y axis is oldest to youngest. Note that it is not
  1692  		// necessary for correctness to fill out the shape at lower sub-levels to
  1693  		// make it more rectangular since the invariant only requires that if we
  1694  		// move an older seqnum for key k into a file that has a higher seqnum, we
  1695  		// also move all younger seqnums for that key k into that file.
  1696  		//                  -----
  1697  		//                   ---
  1698  		//                    -
  1699  		//
  1700  		// It may be better for performance to have a more rectangular shape since
  1701  		// it will reduce the stack depth for more intervals. But there is also
  1702  		// the danger that in explicitly trying to construct a more rectangular
  1703  		// shape we will be forced to pull in a file that is already compacting.
  1704  		// We assume that the performance concern is not a practical issue.
  1705  		done := false
  1706  		for currLevel := sl + 1; currLevel < len(s.levelFiles); currLevel++ {
  1707  			if !s.extendFiles(currLevel, earliestUnflushedSeqNum, c) {
  1708  				// Failed to extend due to ongoing compaction.
  1709  				done = true
  1710  				break
  1711  			}
  1712  		}
  1713  		if done {
  1714  			break
  1715  		}
  1716  		if lastCandidate == nil {
  1717  			lastCandidate = &L0CompactionFiles{}
  1718  		} else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth &&
  1719  			c.fileBytes > 100<<20 &&
  1720  			(float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) {
  1721  			break
  1722  		}
  1723  		*lastCandidate = *c
  1724  	}
  1725  	if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth {
  1726  		lastCandidate.FilesIncluded.clearAllBits()
  1727  		for _, f := range lastCandidate.Files {
  1728  			lastCandidate.FilesIncluded.markBit(f.L0Index)
  1729  		}
  1730  		s.extendCandidateToRectangle(
  1731  			lastCandidate.minIntervalIndex, lastCandidate.maxIntervalIndex, lastCandidate, false)
  1732  		return lastCandidate
  1733  	}
  1734  	return nil
  1735  }
  1736  
  1737  // ExtendL0ForBaseCompactionTo extends the specified base compaction candidate
  1738  // L0CompactionFiles to optionally cover more files in L0 without "touching"
  1739  // any of the passed-in keys (i.e. the smallest/largest bounds are exclusive),
  1740  // as including any user keys for those internal keys
  1741  // could require choosing more files in LBase which is undesirable. Unbounded
  1742  // start/end keys are indicated by passing in the InvalidInternalKey.
  1743  func (s *L0Sublevels) ExtendL0ForBaseCompactionTo(
  1744  	smallest, largest InternalKey, candidate *L0CompactionFiles,
  1745  ) bool {
  1746  	firstIntervalIndex := 0
  1747  	lastIntervalIndex := len(s.orderedIntervals) - 1
  1748  	if smallest.Kind() != base.InternalKeyKindInvalid {
  1749  		if smallest.Trailer == base.InternalKeyRangeDeleteSentinel {
  1750  			// Starting at smallest.UserKey == interval.startKey is okay.
  1751  			firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1752  				return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) <= 0
  1753  			})
  1754  		} else {
  1755  			firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1756  				// Need to start at >= smallest since if we widen too much we may miss
  1757  				// an Lbase file that overlaps with an L0 file that will get picked in
  1758  				// this widening, which would be bad. This interval will not start with
  1759  				// an immediate successor key.
  1760  				return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) < 0
  1761  			})
  1762  		}
  1763  	}
  1764  	if largest.Kind() != base.InternalKeyKindInvalid {
  1765  		// First interval that starts at or beyond the largest. This interval will not
  1766  		// start with an immediate successor key.
  1767  		lastIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
  1768  			return s.cmp(largest.UserKey, s.orderedIntervals[i].startKey.key) <= 0
  1769  		})
  1770  		// Right now, lastIntervalIndex has a startKey that extends beyond largest.
  1771  		// The previous interval, by definition, has an end key higher than largest.
  1772  		// Iterate back twice to get the last interval that's completely within
  1773  		// (smallest, largest). Except in the case where we went past the end of the
  1774  		// list; in that case, the last interval to include is the very last
  1775  		// interval in the list.
  1776  		if lastIntervalIndex < len(s.orderedIntervals) {
  1777  			lastIntervalIndex--
  1778  		}
  1779  		lastIntervalIndex--
  1780  	}
  1781  	if lastIntervalIndex < firstIntervalIndex {
  1782  		return false
  1783  	}
  1784  	return s.extendCandidateToRectangle(firstIntervalIndex, lastIntervalIndex, candidate, true)
  1785  }
  1786  
  1787  // Best-effort attempt to make the compaction include more files in the
  1788  // rectangle defined by [minIntervalIndex, maxIntervalIndex] on the X axis and
  1789  // bounded on the Y axis by seedIntervalMinLevel and seedIntervalMaxLevel.
  1790  //
  1791  // This is strictly an optional extension; at any point where we can't feasibly
  1792  // add more files, the sublevel iteration can be halted early and candidate will
  1793  // still be a correct compaction candidate.
  1794  //
  1795  // Consider this scenario (original candidate is inside the rectangle), with
  1796  // isBase = true and interval bounds a-j (from the union of base file bounds and
  1797  // that of compaction candidate):
  1798  //
  1799  //	           _______
  1800  //	L0.3  a--d |  g-j|
  1801  //	L0.2       | f--j|         r-t
  1802  //	L0.1   b-d |e---j|
  1803  //	L0.0  a--d | f--j| l--o  p-----x
  1804  //
  1805  //	Lbase a--------i    m---------w
  1806  //
  1807  // This method will iterate from the bottom up. At L0.0, it will add a--d since
  1808  // it's in the bounds, then add b-d, then a--d, and so on, to produce this:
  1809  //
  1810  //	     _____________
  1811  //	L0.3 |a--d    g-j|
  1812  //	L0.2 |       f--j|         r-t
  1813  //	L0.1 | b-d  e---j|
  1814  //	L0.0 |a--d   f--j| l--o  p-----x
  1815  //
  1816  //	Lbase a-------i     m---------w
  1817  //
  1818  // Let's assume that, instead of a--d in the top sublevel, we had 3 files, a-b,
  1819  // bb-c, and cc-d, of which bb-c is compacting. Let's also add another sublevel
  1820  // L0.4 with some files, all of which aren't compacting:
  1821  //
  1822  //	L0.4  a------c ca--d _______
  1823  //	L0.3  a-b bb-c  cc-d |  g-j|
  1824  //	L0.2                 | f--j|         r-t
  1825  //	L0.1    b----------d |e---j|
  1826  //	L0.0  a------------d | f--j| l--o  p-----x
  1827  //
  1828  //	Lbase a------------------i    m---------w
  1829  //
  1830  // This method then needs to choose between the left side of L0.3 bb-c
  1831  // (i.e. a-b), or the right side (i.e. cc-d and g-j) for inclusion in this
  1832  // compaction. Since the right side has more files as well as one file that has
  1833  // already been picked, it gets chosen at that sublevel, resulting in this
  1834  // intermediate compaction:
  1835  //
  1836  //	L0.4  a------c ca--d
  1837  //	              ______________
  1838  //	L0.3  a-b bb-c| cc-d    g-j|
  1839  //	L0.2 _________|        f--j|         r-t
  1840  //	L0.1 |  b----------d  e---j|
  1841  //	L0.0 |a------------d   f--j| l--o  p-----x
  1842  //
  1843  //	Lbase a------------------i    m---------w
  1844  //
  1845  // Since bb-c had to be excluded at L0.3, the interval bounds for L0.4 are
  1846  // actually ca-j, since ca is the next interval start key after the end interval
  1847  // of bb-c. This would result in only ca-d being chosen at that sublevel, even
  1848  // though a--c is also not compacting. This is the final result:
  1849  //
  1850  //	              ______________
  1851  //	L0.4  a------c|ca--d       |
  1852  //	L0.3  a-b bb-c| cc-d    g-j|
  1853  //	L0.2 _________|        f--j|         r-t
  1854  //	L0.1 |  b----------d  e---j|
  1855  //	L0.0 |a------------d   f--j| l--o  p-----x
  1856  //
  1857  //	Lbase a------------------i    m---------w
  1858  //
  1859  // TODO(bilal): Add more targeted tests for this method, through
  1860  // ExtendL0ForBaseCompactionTo and intraL0CompactionUsingSeed.
  1861  func (s *L0Sublevels) extendCandidateToRectangle(
  1862  	minIntervalIndex int, maxIntervalIndex int, candidate *L0CompactionFiles, isBase bool,
  1863  ) bool {
  1864  	candidate.preExtensionMinInterval = candidate.minIntervalIndex
  1865  	candidate.preExtensionMaxInterval = candidate.maxIntervalIndex
  1866  	// Extend {min,max}IntervalIndex to include all of the candidate's current
  1867  	// bounds.
  1868  	if minIntervalIndex > candidate.minIntervalIndex {
  1869  		minIntervalIndex = candidate.minIntervalIndex
  1870  	}
  1871  	if maxIntervalIndex < candidate.maxIntervalIndex {
  1872  		maxIntervalIndex = candidate.maxIntervalIndex
  1873  	}
  1874  	var startLevel, increment, endLevel int
  1875  	if isBase {
  1876  		startLevel = 0
  1877  		increment = +1
  1878  		// seedIntervalMaxLevel is inclusive, while endLevel is exclusive.
  1879  		endLevel = candidate.seedIntervalMaxLevel + 1
  1880  	} else {
  1881  		startLevel = len(s.levelFiles) - 1
  1882  		increment = -1
  1883  		// seedIntervalMinLevel is inclusive, while endLevel is exclusive.
  1884  		endLevel = candidate.seedIntervalMinLevel - 1
  1885  	}
  1886  	// Stats for files.
  1887  	addedCount := 0
  1888  	// Iterate from the oldest sub-level for L0 -> Lbase and youngest
  1889  	// sub-level for intra-L0. The idea here is that anything that can't
  1890  	// be included from that level constrains what can be included from
  1891  	// the next level. This change in constraint is directly incorporated
  1892  	// into minIntervalIndex, maxIntervalIndex.
  1893  	for sl := startLevel; sl != endLevel; sl += increment {
  1894  		files := s.levelFiles[sl]
  1895  		// Find the first file that overlaps with minIntervalIndex.
  1896  		index := sort.Search(len(files), func(i int) bool {
  1897  			return minIntervalIndex <= files[i].maxIntervalIndex
  1898  		})
  1899  		// Track the files that are fully within the current constraint
  1900  		// of [minIntervalIndex, maxIntervalIndex].
  1901  		firstIndex := -1
  1902  		lastIndex := -1
  1903  		for ; index < len(files); index++ {
  1904  			f := files[index]
  1905  			if f.minIntervalIndex > maxIntervalIndex {
  1906  				break
  1907  			}
  1908  			include := true
  1909  			// Extends out on the left so can't be included. This narrows
  1910  			// what we can included in the next level.
  1911  			if f.minIntervalIndex < minIntervalIndex {
  1912  				include = false
  1913  				minIntervalIndex = f.maxIntervalIndex + 1
  1914  			}
  1915  			// Extends out on the right so can't be included.
  1916  			if f.maxIntervalIndex > maxIntervalIndex {
  1917  				include = false
  1918  				maxIntervalIndex = f.minIntervalIndex - 1
  1919  			}
  1920  			if !include {
  1921  				continue
  1922  			}
  1923  			if firstIndex == -1 {
  1924  				firstIndex = index
  1925  			}
  1926  			lastIndex = index
  1927  		}
  1928  		if minIntervalIndex > maxIntervalIndex {
  1929  			// We excluded files that prevent continuation.
  1930  			break
  1931  		}
  1932  		if firstIndex < 0 {
  1933  			// No files to add in this sub-level.
  1934  			continue
  1935  		}
  1936  		// We have the files in [firstIndex, lastIndex] as potential for
  1937  		// inclusion. Some of these may already have been picked. Some
  1938  		// of them may be already compacting. The latter is tricky since
  1939  		// we have to decide whether to contract minIntervalIndex or
  1940  		// maxIntervalIndex when we encounter an already compacting file.
  1941  		// We pick the longest sequence between firstIndex
  1942  		// and lastIndex of non-compacting files -- this is represented by
  1943  		// [candidateNonCompactingFirst, candidateNonCompactingLast].
  1944  		nonCompactingFirst := -1
  1945  		currentRunHasAlreadyPickedFiles := false
  1946  		candidateNonCompactingFirst := -1
  1947  		candidateNonCompactingLast := -1
  1948  		candidateHasAlreadyPickedFiles := false
  1949  		for index = firstIndex; index <= lastIndex; index++ {
  1950  			f := files[index]
  1951  			if f.IsCompacting() {
  1952  				if nonCompactingFirst != -1 {
  1953  					last := index - 1
  1954  					// Prioritize runs of consecutive non-compacting files that
  1955  					// have files that have already been picked. That is to say,
  1956  					// if candidateHasAlreadyPickedFiles == true, we stick with
  1957  					// it, and if currentRunHasAlreadyPickedfiles == true, we
  1958  					// pick that run even if it contains fewer files than the
  1959  					// previous candidate.
  1960  					if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 ||
  1961  						currentRunHasAlreadyPickedFiles ||
  1962  						(last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) {
  1963  						candidateNonCompactingFirst = nonCompactingFirst
  1964  						candidateNonCompactingLast = last
  1965  						candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles
  1966  					}
  1967  				}
  1968  				nonCompactingFirst = -1
  1969  				currentRunHasAlreadyPickedFiles = false
  1970  				continue
  1971  			}
  1972  			if nonCompactingFirst == -1 {
  1973  				nonCompactingFirst = index
  1974  			}
  1975  			if candidate.FilesIncluded[f.L0Index] {
  1976  				currentRunHasAlreadyPickedFiles = true
  1977  			}
  1978  		}
  1979  		// Logic duplicated from inside the for loop above.
  1980  		if nonCompactingFirst != -1 {
  1981  			last := index - 1
  1982  			if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 ||
  1983  				currentRunHasAlreadyPickedFiles ||
  1984  				(last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) {
  1985  				candidateNonCompactingFirst = nonCompactingFirst
  1986  				candidateNonCompactingLast = last
  1987  			}
  1988  		}
  1989  		if candidateNonCompactingFirst == -1 {
  1990  			// All files are compacting. There will be gaps that we could exploit
  1991  			// to continue, but don't bother.
  1992  			break
  1993  		}
  1994  		// May need to shrink [minIntervalIndex, maxIntervalIndex] for the next level.
  1995  		if candidateNonCompactingFirst > firstIndex {
  1996  			minIntervalIndex = files[candidateNonCompactingFirst-1].maxIntervalIndex + 1
  1997  		}
  1998  		if candidateNonCompactingLast < lastIndex {
  1999  			maxIntervalIndex = files[candidateNonCompactingLast+1].minIntervalIndex - 1
  2000  		}
  2001  		for index := candidateNonCompactingFirst; index <= candidateNonCompactingLast; index++ {
  2002  			f := files[index]
  2003  			if f.IsCompacting() {
  2004  				// TODO(bilal): Do a logger.Fatalf instead of a panic, for
  2005  				// cleaner unwinding and error messages.
  2006  				panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum))
  2007  			}
  2008  			if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum {
  2009  				continue
  2010  			}
  2011  			if !candidate.FilesIncluded[f.L0Index] {
  2012  				addedCount++
  2013  				candidate.addFile(f)
  2014  			}
  2015  		}
  2016  	}
  2017  	return addedCount > 0
  2018  }