github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/compaction.go (about) 1 // Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "errors" 10 "fmt" 11 "math" 12 "os" 13 "sort" 14 "sync/atomic" 15 "unsafe" 16 17 "github.com/petermattis/pebble/internal/base" 18 "github.com/petermattis/pebble/internal/manifest" 19 "github.com/petermattis/pebble/internal/rangedel" 20 "github.com/petermattis/pebble/sstable" 21 "github.com/petermattis/pebble/vfs" 22 ) 23 24 var errEmptyTable = errors.New("pebble: empty table") 25 26 // expandedCompactionByteSizeLimit is the maximum number of bytes in all 27 // compacted files. We avoid expanding the lower level file set of a compaction 28 // if it would make the total compaction cover more than this many bytes. 29 func expandedCompactionByteSizeLimit(opts *Options, level int) uint64 { 30 return uint64(25 * opts.Level(level).TargetFileSize) 31 } 32 33 // maxGrandparentOverlapBytes is the maximum bytes of overlap with level+2 34 // before we stop building a single file in a level to level+1 compaction. 35 func maxGrandparentOverlapBytes(opts *Options, level int) uint64 { 36 return uint64(10 * opts.Level(level).TargetFileSize) 37 } 38 39 // totalSize returns the total size of all the files in f. 40 func totalSize(f []fileMetadata) (size uint64) { 41 for _, x := range f { 42 size += x.Size 43 } 44 return size 45 } 46 47 // compaction is a table compaction from one level to the next, starting from a 48 // given version. 49 type compaction struct { 50 cmp Compare 51 version *version 52 53 // startLevel is the level that is being compacted. Inputs from startLevel 54 // and outputLevel will be merged to produce a set of outputLevel files. 55 startLevel int 56 // outputLevel is the level that files are being produced in. outputLevel is 57 // equal to startLevel+1 except when startLevel is 0 in which case it is 58 // equal to compactionPicker.baseLevel. 59 outputLevel int 60 61 // maxOutputFileSize is the maximum size of an individual table created 62 // during compaction. 63 maxOutputFileSize uint64 64 // maxOverlapBytes is the maximum number of bytes of overlap allowed for a 65 // single output table with the tables in the grandparent level. 66 maxOverlapBytes uint64 67 // maxExpandedBytes is the maximum size of an expanded compaction. If growing 68 // a compaction results in a larger size, the original compaction is used 69 // instead. 70 maxExpandedBytes uint64 71 // disableRangeTombstoneElision disables elision of range tombstones. Used by 72 // tests to allow range tombstones to be added to tables where they would 73 // otherwise be elided. 74 disableRangeTombstoneElision bool 75 76 // flushing contains the flushables (aka memtables) that are being flushed. 77 flushing []flushable 78 // bytesIterated contains the number of bytes that have been flushed/compacted. 79 bytesIterated uint64 80 // atomicBytesIterated points to the variable to increment during iteration. 81 // atomicBytesIterated must be read/written atomically. Flushing will increment 82 // the shared variable which compaction will read. This allows for the 83 // compaction routine to know how many bytes have been flushed before the flush 84 // is applied. 85 atomicBytesIterated *uint64 86 // inputs are the tables to be compacted. 
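// inputs[0] holds the tables from startLevel and inputs[1] the overlapping
// tables from outputLevel (populated by setupOtherInputs).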
87 inputs [2][]fileMetadata 88 89 // grandparents are the tables in level+2 that overlap with the files being 90 // compacted. Used to determine output table boundaries. 91 grandparents []fileMetadata 92 overlappedBytes uint64 // bytes of overlap with grandparent tables 93 seenKey bool // some output key has been seen 94 95 metrics map[int]*LevelMetrics 96 } 97 98 func newCompaction( 99 opts *Options, 100 cur *version, 101 startLevel, 102 baseLevel int, 103 bytesCompacted *uint64, 104 ) *compaction { 105 if startLevel > 0 && startLevel < baseLevel { 106 panic(fmt.Sprintf("invalid compaction: start level %d should be empty (base level %d)", 107 startLevel, baseLevel)) 108 } 109 110 outputLevel := startLevel + 1 111 if startLevel == 0 { 112 outputLevel = baseLevel 113 } 114 if outputLevel >= numLevels-1 { 115 outputLevel = numLevels - 1 116 } 117 // Output level is in the range [baseLevel,numLevels]. For the purpose of 118 // determining the target output file size, overlap bytes, and expanded 119 // bytes, we want to adjust the range to [1,numLevels]. 120 adjustedOutputLevel := 1 + outputLevel - baseLevel 121 122 return &compaction{ 123 cmp: opts.Comparer.Compare, 124 version: cur, 125 startLevel: startLevel, 126 outputLevel: outputLevel, 127 maxOutputFileSize: uint64(opts.Level(adjustedOutputLevel).TargetFileSize), 128 maxOverlapBytes: maxGrandparentOverlapBytes(opts, adjustedOutputLevel), 129 maxExpandedBytes: expandedCompactionByteSizeLimit(opts, adjustedOutputLevel), 130 atomicBytesIterated: bytesCompacted, 131 } 132 } 133 134 func newFlush( 135 opts *Options, 136 cur *version, 137 baseLevel int, 138 flushing []flushable, 139 bytesFlushed *uint64, 140 ) *compaction { 141 c := &compaction{ 142 cmp: opts.Comparer.Compare, 143 version: cur, 144 startLevel: -1, 145 outputLevel: 0, 146 maxOutputFileSize: math.MaxUint64, 147 maxOverlapBytes: math.MaxUint64, 148 maxExpandedBytes: math.MaxUint64, 149 flushing: flushing, 150 atomicBytesIterated: bytesFlushed, 151 } 152 153 // TODO(peter): When we allow flushing to create multiple tables we'll want 154 // to choose sstable boundaries based on the grandparents. But for now we 155 // want to create a single table during flushing so this is all commented 156 // out. 
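// The disabled block below sketches that logic: it derives the flush's key
// bounds from the memtable point and range-deletion iterators and uses them
// to look up the overlapping grandparent tables.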
157 if false { 158 c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize) 159 c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0) 160 c.maxExpandedBytes = expandedCompactionByteSizeLimit(opts, 0) 161 162 var smallest InternalKey 163 var largest InternalKey 164 smallestSet, largestSet := false, false 165 166 updatePointBounds := func(iter internalIterator) { 167 if key, _ := iter.First(); key != nil { 168 if !smallestSet || 169 base.InternalCompare(c.cmp, smallest, *key) > 0 { 170 smallestSet = true 171 smallest = key.Clone() 172 } 173 } 174 if key, _ := iter.Last(); key != nil { 175 if !largestSet || 176 base.InternalCompare(c.cmp, largest, *key) < 0 { 177 largestSet = true 178 largest = key.Clone() 179 } 180 } 181 } 182 183 updateRangeBounds := func(iter internalIterator) { 184 if key, _ := iter.First(); key != nil { 185 if !smallestSet || 186 base.InternalCompare(c.cmp, smallest, *key) > 0 { 187 smallestSet = true 188 smallest = key.Clone() 189 } 190 } 191 } 192 193 for i := range flushing { 194 f := flushing[i] 195 updatePointBounds(f.newIter(nil)) 196 if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { 197 updateRangeBounds(rangeDelIter) 198 } 199 } 200 201 c.grandparents = c.version.Overlaps(baseLevel, c.cmp, smallest.UserKey, largest.UserKey) 202 } 203 return c 204 } 205 206 // setupOtherInputs fills in the rest of the compaction inputs, regardless of 207 // whether the compaction was automatically scheduled or user initiated. 208 func (c *compaction) setupOtherInputs() { 209 c.inputs[0] = c.expandInputs(c.inputs[0]) 210 smallest0, largest0 := manifest.KeyRange(c.cmp, c.inputs[0], nil) 211 c.inputs[1] = c.version.Overlaps(c.outputLevel, c.cmp, smallest0.UserKey, largest0.UserKey) 212 smallest01, largest01 := manifest.KeyRange(c.cmp, c.inputs[0], c.inputs[1]) 213 214 // Grow the inputs if it doesn't affect the number of outputLevel files. 215 if c.grow(smallest01, largest01) { 216 smallest01, largest01 = manifest.KeyRange(c.cmp, c.inputs[0], c.inputs[1]) 217 } 218 219 // Compute the set of outputLevel+1 files that overlap this compaction. 220 if c.outputLevel+1 < numLevels { 221 c.grandparents = c.version.Overlaps(c.outputLevel+1, c.cmp, smallest01.UserKey, largest01.UserKey) 222 } 223 } 224 225 // expandInputs expands the files in inputs[0] in order to maintain the 226 // invariant that the versions of keys at level+1 are older than the versions 227 // of keys at level. This is achieved by adding tables to the right of the 228 // current input tables such that the rightmost table has a "clean cut". A 229 // clean cut is either a change in user keys, or a table whose largest key is a range deletion sentinel (such a key does not actually exist in the table). 230 func (c *compaction) expandInputs(inputs []fileMetadata) []fileMetadata { 231 if c.startLevel == 0 { 232 // We already call version.Overlaps for L0 and that call guarantees that we 233 // get a "clean cut". 234 return inputs 235 } 236 files := c.version.Files[c.startLevel] 237 // Pointer arithmetic to figure out the index of inputs[0] within 238 // files. This requires that the inputs slice is a sub-slice of 239 // files. This is true for non-L0 files returned from version.Overlaps.
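// The unsafe pointer arithmetic below is simply a constant-time way of
// locating inputs[0] within files. An equivalent, illustrative (linear-time)
// formulation would be:
//
//   start := -1
//   for i := range files {
//       if &files[i] == &inputs[0] {
//           start = i
//           break
//       }
//   }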
240 if uintptr(unsafe.Pointer(&inputs[0])) < uintptr(unsafe.Pointer(&files[0])) { 241 panic("pebble: invalid input slice") 242 } 243 start := int((uintptr(unsafe.Pointer(&inputs[0])) - 244 uintptr(unsafe.Pointer(&files[0]))) / unsafe.Sizeof(inputs[0])) 245 if start >= len(files) { 246 panic("pebble: invalid input slice") 247 } 248 end := start + len(inputs) 249 for ; end < len(files); end++ { 250 cur := &files[end-1] 251 next := &files[end] 252 if c.cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 { 253 break 254 } 255 if cur.Largest.Trailer == InternalKeyRangeDeleteSentinel { 256 // The range deletion sentinel key is set for the largest key in a table 257 // when a range deletion tombstone straddles a table. It isn't necessary 258 // to include the next table in the compaction as cur.largest.UserKey 259 // does not actually exist in the table. 260 break 261 } 262 // cur.largest.UserKey == next.smallest.UserKey, so we need to include next 263 // in the compaction. 264 } 265 return files[start:end] 266 } 267 268 // grow grows the number of inputs at c.startLevel without changing the number of 269 // c.outputLevel files in the compaction, and returns whether the inputs grew. sm 270 // and la are the smallest and largest InternalKeys in all of the inputs. 271 func (c *compaction) grow(sm, la InternalKey) bool { 272 if len(c.inputs[1]) == 0 { 273 return false 274 } 275 grow0 := c.version.Overlaps(c.startLevel, c.cmp, sm.UserKey, la.UserKey) 276 grow0 = c.expandInputs(grow0) 277 if len(grow0) <= len(c.inputs[0]) { 278 return false 279 } 280 if totalSize(grow0)+totalSize(c.inputs[1]) >= c.maxExpandedBytes { 281 return false 282 } 283 sm1, la1 := manifest.KeyRange(c.cmp, grow0, nil) 284 grow1 := c.version.Overlaps(c.outputLevel, c.cmp, sm1.UserKey, la1.UserKey) 285 if len(grow1) != len(c.inputs[1]) { 286 return false 287 } 288 c.inputs[0] = grow0 289 c.inputs[1] = grow1 290 return true 291 } 292 293 func (c *compaction) trivialMove() bool { 294 if len(c.flushing) != 0 { 295 return false 296 } 297 // Check for a trivial move of one table from one level to the next. We avoid 298 // such a move if there is lots of overlapping grandparent data. Otherwise, 299 // the move could create a parent file that will require a very expensive 300 // merge later on. 301 if len(c.inputs[0]) == 1 && len(c.inputs[1]) == 0 && 302 totalSize(c.grandparents) <= c.maxOverlapBytes { 303 return true 304 } 305 return false 306 } 307 308 // shouldStopBefore returns true if the output to the current table should be 309 // finished and a new table started before adding the specified key. This is 310 // done in order to prevent a table at level N from overlapping too much data 311 // at level N+1. We want to avoid such large overlaps because they translate 312 // into large compactions. The current heuristic stops output of a table if the 313 // addition of another key would cause the table to overlap more than 10x the 314 // target file size at level N. See maxGrandparentOverlapBytes. 315 // 316 // TODO(peter): Stopping compaction output in the middle of a user-key creates 317 // 2 sstables that need to be compacted together as an "atomic compaction 318 // unit". This is unfortunate as it removes the benefit of stopping output to 319 // an sstable in order to prevent a large compaction with the next level. Seems 320 // better to adjust shouldStopBefore to not stop output in the middle of a 321 // user-key.
Perhaps this isn't a problem if the compaction picking heuristics 322 // always pick the right (older) sibling for compaction first. 323 func (c *compaction) shouldStopBefore(key InternalKey) bool { 324 for len(c.grandparents) > 0 { 325 g := &c.grandparents[0] 326 if base.InternalCompare(c.cmp, key, g.Largest) <= 0 { 327 break 328 } 329 if c.seenKey { 330 c.overlappedBytes += g.Size 331 } 332 c.grandparents = c.grandparents[1:] 333 } 334 c.seenKey = true 335 if c.overlappedBytes > c.maxOverlapBytes { 336 c.overlappedBytes = 0 337 return true 338 } 339 return false 340 } 341 342 // allowZeroSeqNum returns true if seqnums can be zeroed when there are no 343 // snapshots requiring them to be kept. It performs this determination by 344 // looking for an sstable which overlaps the bounds of the compaction at a 345 // lower level in the LSM. 346 func (c *compaction) allowZeroSeqNum(iter internalIterator) bool { 347 if len(c.flushing) != 0 { 348 if len(c.version.Files[0]) > 0 { 349 // We can only allow zeroing of seqnum for L0 tables if no other L0 tables 350 // exist. Otherwise we may violate the invariant that L0 tables are ordered 351 // by increasing seqnum. This could be relaxed with a bit more intelligence 352 // in how a new L0 table is merged into the existing set of L0 tables. 353 return false 354 } 355 lower, _ := iter.First() 356 upper, _ := iter.Last() 357 if lower == nil || upper == nil { 358 return false 359 } 360 return c.elideRangeTombstone(lower.UserKey, upper.UserKey) 361 } 362 363 var lower, upper []byte 364 for i := range c.inputs { 365 files := c.inputs[i] 366 for j := range files { 367 f := &files[j] 368 if lower == nil || c.cmp(lower, f.Smallest.UserKey) > 0 { 369 lower = f.Smallest.UserKey 370 } 371 if upper == nil || c.cmp(upper, f.Largest.UserKey) < 0 { 372 upper = f.Largest.UserKey 373 } 374 } 375 } 376 // [lower,upper] now cover the bounds of the compaction inputs. Check to see 377 // if those bounds overlap an sstable at a lower level. 378 return c.elideRangeTombstone(lower, upper) 379 } 380 381 // elideTombstone returns true if it is ok to elide a tombstone for the 382 // specified key. A return value of true guarantees that there are no key/value 383 // pairs at c.outputLevel+1 or higher that possibly contain the specified user key. 384 func (c *compaction) elideTombstone(key []byte) bool { 385 if len(c.flushing) != 0 { 386 return false 387 } 388 389 level := c.outputLevel + 1 390 if c.outputLevel == 0 { 391 // Level 0 can contain overlapping sstables so we need to check it for 392 // overlaps. 393 level = 0 394 } 395 396 // TODO(peter): this can be faster if key is always increasing between 397 // successive elideTombstone calls and we can keep some state in between 398 // calls. 399 for ; level < numLevels; level++ { 400 for _, f := range c.version.Files[level] { 401 if c.cmp(key, f.Largest.UserKey) <= 0 { 402 if c.cmp(key, f.Smallest.UserKey) >= 0 { 403 return false 404 } 405 // For levels other than level 0, the files within a level are in 406 // increasing ikey order, so we can break early. 407 break 408 } 409 } 410 } 411 return true 412 } 413 414 // elideRangeTombstone returns true if it is ok to elide the specified range 415 // tombstone. A return value of true guarantees that there are no key/value 416 // pairs at c.outputLevel+1 or higher that possibly overlap the specified 417 // tombstone.
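//
// For example, if this compaction outputs into L3 and no sstable in L4
// through the bottommost level overlaps [start, end), the tombstone does not
// overlap anything in a deeper level and is therefore safe to elide.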
418 func (c *compaction) elideRangeTombstone(start, end []byte) bool { 419 if c.disableRangeTombstoneElision { 420 return false 421 } 422 423 level := c.outputLevel + 1 424 if c.outputLevel == 0 { 425 // Level 0 can contain overlapping sstables so we need to check it for 426 // overlaps. 427 level = 0 428 } 429 430 for ; level < numLevels; level++ { 431 overlaps := c.version.Overlaps(level, c.cmp, start, end) 432 if len(overlaps) > 0 { 433 return false 434 } 435 } 436 return true 437 } 438 439 // atomicUnitBounds returns the bounds of the atomic compaction unit containing 440 // the specified sstable (identified by a pointer to its fileMetadata). 441 func (c *compaction) atomicUnitBounds(f *fileMetadata) (lower, upper []byte) { 442 for i := range c.inputs { 443 files := c.inputs[i] 444 for j := range files { 445 if f == &files[j] { 446 lowerBound := f.Smallest.UserKey 447 for k := j; k > 0; k-- { 448 cur := &files[k] 449 prev := &files[k-1] 450 if c.cmp(prev.Largest.UserKey, cur.Smallest.UserKey) < 0 { 451 break 452 } 453 if prev.Largest.Trailer == InternalKeyRangeDeleteSentinel { 454 // The range deletion sentinel key is set for the largest key in a 455 // table when a range deletion tombstone straddles a table. It 456 // isn't necessary to include the next table in the atomic 457 // compaction unit as cur.largest.UserKey does not actually exist 458 // in the table. 459 break 460 } 461 lowerBound = prev.Smallest.UserKey 462 } 463 464 upperBound := f.Largest.UserKey 465 for k := j + 1; k < len(files); k++ { 466 cur := &files[k-1] 467 next := &files[k] 468 if c.cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 { 469 break 470 } 471 if cur.Largest.Trailer == InternalKeyRangeDeleteSentinel { 472 // The range deletion sentinel key is set for the largest key in a 473 // table when a range deletion tombstone straddles a table. It 474 // isn't necessary to include the next table in the atomic 475 // compaction unit as cur.largest.UserKey does not actually exist 476 // in the table. 477 break 478 } 479 // cur.largest.UserKey == next.largest.UserKey, so next is part of 480 // the atomic compaction unit. 481 upperBound = next.Largest.UserKey 482 } 483 return lowerBound, upperBound 484 } 485 } 486 } 487 return nil, nil 488 } 489 490 // newInputIter returns an iterator over all the input tables in a compaction. 491 func (c *compaction) newInputIter( 492 newIters tableNewIters, 493 ) (_ internalIterator, retErr error) { 494 if len(c.flushing) != 0 { 495 if len(c.flushing) == 1 { 496 f := c.flushing[0] 497 iter := f.newFlushIter(nil, &c.bytesIterated) 498 if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { 499 return newMergingIter(c.cmp, iter, rangeDelIter), nil 500 } 501 return iter, nil 502 } 503 iters := make([]internalIterator, 0, 2*len(c.flushing)) 504 for i := range c.flushing { 505 f := c.flushing[i] 506 iters = append(iters, f.newFlushIter(nil, &c.bytesIterated)) 507 rangeDelIter := f.newRangeDelIter(nil) 508 if rangeDelIter != nil { 509 iters = append(iters, rangeDelIter) 510 } 511 } 512 return newMergingIter(c.cmp, iters...), nil 513 } 514 515 iters := make([]internalIterator, 0, 2*len(c.inputs[0])+1) 516 defer func() { 517 if retErr != nil { 518 for _, iter := range iters { 519 if iter != nil { 520 iter.Close() 521 } 522 } 523 } 524 }() 525 526 // In normal operation, levelIter iterates over the point operations in a 527 // level, and initializes a rangeDelIter pointer for the range deletions in 528 // each table. 
During compaction, we want to iterate over the merged view of 529 // point operations and range deletions. In order to do this we create two 530 // levelIters per level, one which iterates over the point operations, and 531 // one which iterates over the range deletions. These two iterators are 532 // combined with a mergingIter. 533 newRangeDelIter := func( 534 f *fileMetadata, _ *IterOptions, bytesIterated *uint64, 535 ) (internalIterator, internalIterator, error) { 536 iter, rangeDelIter, err := newIters(f, nil /* iter options */, &c.bytesIterated) 537 if err == nil { 538 // TODO(peter): It is mildly wasteful to open the point iterator only to 539 // immediately close it. One way to solve this would be to add new 540 // methods to tableCache for creating point and range-deletion iterators 541 // independently. We'd only want to use those methods here, 542 // though. Doesn't seem worth the hassle in the near term. 543 if err = iter.Close(); err != nil { 544 rangeDelIter.Close() 545 rangeDelIter = nil 546 } 547 } 548 if rangeDelIter != nil { 549 // Truncate the range tombstones returned by the iterator to the upper 550 // bound of the atomic compaction unit. 551 lowerBound, upperBound := c.atomicUnitBounds(f) 552 if lowerBound != nil || upperBound != nil { 553 rangeDelIter = rangedel.Truncate(c.cmp, rangeDelIter, lowerBound, upperBound) 554 } 555 } 556 return rangeDelIter, nil, err 557 } 558 559 if c.startLevel != 0 { 560 iters = append(iters, newLevelIter(nil, c.cmp, newIters, c.inputs[0], &c.bytesIterated)) 561 iters = append(iters, newLevelIter(nil, c.cmp, newRangeDelIter, c.inputs[0], &c.bytesIterated)) 562 } else { 563 for i := range c.inputs[0] { 564 f := &c.inputs[0][i] 565 iter, rangeDelIter, err := newIters(f, nil /* iter options */, &c.bytesIterated) 566 if err != nil { 567 return nil, fmt.Errorf("pebble: could not open table %d: %v", f.FileNum, err) 568 } 569 iters = append(iters, iter) 570 if rangeDelIter != nil { 571 iters = append(iters, rangeDelIter) 572 } 573 } 574 } 575 576 iters = append(iters, newLevelIter(nil, c.cmp, newIters, c.inputs[1], &c.bytesIterated)) 577 iters = append(iters, newLevelIter(nil, c.cmp, newRangeDelIter, c.inputs[1], &c.bytesIterated)) 578 return newMergingIter(c.cmp, iters...), nil 579 } 580 581 func (c *compaction) String() string { 582 if len(c.flushing) != 0 { 583 return "flush\n" 584 } 585 586 var buf bytes.Buffer 587 for i := range c.inputs { 588 level := c.startLevel 589 if i == 1 { 590 level = c.outputLevel 591 } 592 fmt.Fprintf(&buf, "%d:", level) 593 for _, f := range c.inputs[i] { 594 fmt.Fprintf(&buf, " %d:%s-%s", f.FileNum, f.Smallest, f.Largest) 595 } 596 fmt.Fprintf(&buf, "\n") 597 } 598 return buf.String() 599 } 600 601 type manualCompaction struct { 602 level int 603 outputLevel int 604 done chan error 605 start InternalKey 606 end InternalKey 607 } 608 609 func (d *DB) getCompactionPacerInfo() compactionPacerInfo { 610 bytesFlushed := atomic.LoadUint64(&d.bytesFlushed) 611 612 d.mu.Lock() 613 estimatedMaxWAmp := d.mu.versions.picker.estimatedMaxWAmp 614 pacerInfo := compactionPacerInfo{ 615 slowdownThreshold: uint64(estimatedMaxWAmp * float64(d.opts.MemTableSize)), 616 totalCompactionDebt: d.mu.versions.picker.estimatedCompactionDebt(bytesFlushed), 617 } 618 for _, m := range d.mu.mem.queue { 619 pacerInfo.totalDirtyBytes += m.totalBytes() 620 } 621 d.mu.Unlock() 622 623 return pacerInfo 624 } 625 626 func (d *DB) getFlushPacerInfo() flushPacerInfo { 627 var pacerInfo flushPacerInfo 628 d.mu.Lock() 629 for _, m := range 
d.mu.mem.queue { 630 pacerInfo.totalBytes += m.totalBytes() 631 } 632 d.mu.Unlock() 633 return pacerInfo 634 } 635 636 // maybeScheduleFlush schedules a flush if necessary. 637 // 638 // d.mu must be held when calling this. 639 func (d *DB) maybeScheduleFlush() { 640 if d.mu.compact.flushing || atomic.LoadInt32(&d.closed) != 0 || d.opts.ReadOnly { 641 return 642 } 643 if len(d.mu.mem.queue) <= 1 { 644 return 645 } 646 if !d.mu.mem.queue[0].readyForFlush() { 647 return 648 } 649 650 d.mu.compact.flushing = true 651 go d.flush() 652 } 653 654 func (d *DB) flush() { 655 d.mu.Lock() 656 defer d.mu.Unlock() 657 if err := d.flush1(); err != nil { 658 // TODO(peter): count consecutive flush errors and backoff. 659 if d.opts.EventListener.BackgroundError != nil { 660 d.opts.EventListener.BackgroundError(err) 661 } 662 } 663 d.mu.compact.flushing = false 664 // More flush work may have arrived while we were flushing, so schedule 665 // another flush if needed. 666 d.maybeScheduleFlush() 667 // The flush may have produced too many files in a level, so schedule a 668 // compaction if needed. 669 d.maybeScheduleCompaction() 670 d.mu.compact.cond.Broadcast() 671 } 672 673 // flush runs a compaction that copies the immutable memtables from memory to 674 // disk. 675 // 676 // d.mu must be held when calling this, but the mutex may be dropped and 677 // re-acquired during the course of this method. 678 func (d *DB) flush1() error { 679 var n int 680 for ; n < len(d.mu.mem.queue)-1; n++ { 681 if !d.mu.mem.queue[n].readyForFlush() { 682 break 683 } 684 } 685 if n == 0 { 686 // None of the immutable memtables are ready for flushing. 687 return nil 688 } 689 690 c := newFlush(d.opts, d.mu.versions.currentVersion(), 691 d.mu.versions.picker.baseLevel, d.mu.mem.queue[:n], &d.bytesFlushed) 692 693 jobID := d.mu.nextJobID 694 d.mu.nextJobID++ 695 if d.opts.EventListener.FlushBegin != nil { 696 d.opts.EventListener.FlushBegin(FlushInfo{ 697 JobID: jobID, 698 }) 699 } 700 701 flushPacer := newFlushPacer(flushPacerEnv{ 702 limiter: d.flushLimiter, 703 memTableSize: uint64(d.opts.MemTableSize), 704 getInfo: d.getFlushPacerInfo, 705 }) 706 ve, pendingOutputs, err := d.runCompaction(jobID, c, flushPacer) 707 708 if d.opts.EventListener.FlushEnd != nil { 709 info := FlushInfo{ 710 JobID: jobID, 711 Err: err, 712 } 713 if err == nil { 714 for i := range ve.NewFiles { 715 e := &ve.NewFiles[i] 716 info.Output = append(info.Output, e.Meta.TableInfo(d.dirname)) 717 } 718 if len(ve.NewFiles) == 0 { 719 info.Err = errEmptyTable 720 } 721 } 722 d.opts.EventListener.FlushEnd(info) 723 } 724 725 if err != nil { 726 return err 727 } 728 729 // The flush succeeded or it produced an empty sstable. In either case we 730 // want to bump the log number. 731 ve.LogNum, _ = d.mu.mem.queue[n].logInfo() 732 metrics := c.metrics[0] 733 for i := 0; i < n; i++ { 734 _, size := d.mu.mem.queue[i].logInfo() 735 metrics.BytesIn += size 736 } 737 738 err = d.mu.versions.logAndApply(jobID, ve, c.metrics, d.dataDir) 739 for _, fileNum := range pendingOutputs { 740 if _, ok := d.mu.compact.pendingOutputs[fileNum]; !ok { 741 panic("pebble: expected pending output not present") 742 } 743 delete(d.mu.compact.pendingOutputs, fileNum) 744 } 745 if err != nil { 746 return err 747 } 748 749 // Refresh bytes flushed count. 
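// d.bytesFlushed is also read by getCompactionPacerInfo to estimate the
// current compaction debt, so reset it now that this flush has been applied.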
750 atomic.StoreUint64(&d.bytesFlushed, 0) 751 752 flushed := d.mu.mem.queue[:n] 753 d.mu.mem.queue = d.mu.mem.queue[n:] 754 d.updateReadStateLocked() 755 d.deleteObsoleteFiles(jobID) 756 757 // Mark all the memtables we flushed as flushed. Note that we do this last so 758 // that a synchronous call to DB.Flush() will not return until the deletion 759 // of obsolete files from this job have completed. This makes testing easier 760 // and provides similar behavior to manual compactions where the compaction 761 // is not marked as completed until the deletion of obsolete files job has 762 // completed. 763 for i := range flushed { 764 close(flushed[i].flushed()) 765 } 766 return nil 767 } 768 769 // maybeScheduleCompaction schedules a compaction if necessary. 770 // 771 // d.mu must be held when calling this. 772 func (d *DB) maybeScheduleCompaction() { 773 if d.mu.compact.compacting || atomic.LoadInt32(&d.closed) != 0 || d.opts.ReadOnly { 774 return 775 } 776 777 if len(d.mu.compact.manual) > 0 { 778 d.mu.compact.compacting = true 779 go d.compact() 780 return 781 } 782 783 if !d.mu.versions.picker.compactionNeeded() { 784 // There is no work to be done. 785 return 786 } 787 788 d.mu.compact.compacting = true 789 go d.compact() 790 } 791 792 // compact runs one compaction and maybe schedules another call to compact. 793 func (d *DB) compact() { 794 d.mu.Lock() 795 defer d.mu.Unlock() 796 if err := d.compact1(); err != nil { 797 // TODO(peter): count consecutive compaction errors and backoff. 798 if d.opts.EventListener.BackgroundError != nil { 799 d.opts.EventListener.BackgroundError(err) 800 } 801 } 802 d.mu.compact.compacting = false 803 // The previous compaction may have produced too many files in a 804 // level, so reschedule another compaction if needed. 805 d.maybeScheduleCompaction() 806 d.mu.compact.cond.Broadcast() 807 } 808 809 // compact1 runs one compaction. 810 // 811 // d.mu must be held when calling this, but the mutex may be dropped and 812 // re-acquired during the course of this method. 
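//
// A queued manual compaction takes precedence over an automatically picked
// one; its done channel receives the resulting error when the compaction
// finishes.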
813 func (d *DB) compact1() (err error) { 814 var c *compaction 815 if len(d.mu.compact.manual) > 0 { 816 manual := d.mu.compact.manual[0] 817 d.mu.compact.manual = d.mu.compact.manual[1:] 818 c = d.mu.versions.picker.pickManual(d.opts, manual, &d.bytesCompacted) 819 defer func() { 820 manual.done <- err 821 }() 822 } else { 823 c = d.mu.versions.picker.pickAuto(d.opts, &d.bytesCompacted) 824 } 825 if c == nil { 826 return nil 827 } 828 829 jobID := d.mu.nextJobID 830 d.mu.nextJobID++ 831 info := CompactionInfo{ 832 JobID: jobID, 833 } 834 if d.opts.EventListener.CompactionBegin != nil || d.opts.EventListener.CompactionEnd != nil { 835 info.Input.Level = c.startLevel 836 info.Output.Level = c.outputLevel 837 for i := range c.inputs { 838 for j := range c.inputs[i] { 839 m := &c.inputs[i][j] 840 info.Input.Tables[i] = append(info.Input.Tables[i], m.TableInfo(d.dirname)) 841 } 842 } 843 } 844 if d.opts.EventListener.CompactionBegin != nil { 845 d.opts.EventListener.CompactionBegin(info) 846 } 847 848 compactionPacer := newCompactionPacer(compactionPacerEnv{ 849 limiter: d.compactionLimiter, 850 memTableSize: uint64(d.opts.MemTableSize), 851 getInfo: d.getCompactionPacerInfo, 852 }) 853 ve, pendingOutputs, err := d.runCompaction(jobID, c, compactionPacer) 854 855 if d.opts.EventListener.CompactionEnd != nil { 856 info.Err = err 857 if err == nil { 858 for i := range ve.NewFiles { 859 e := &ve.NewFiles[i] 860 info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo(d.dirname)) 861 } 862 } 863 d.opts.EventListener.CompactionEnd(info) 864 } 865 866 if err != nil { 867 return err 868 } 869 err = d.mu.versions.logAndApply(jobID, ve, c.metrics, d.dataDir) 870 for _, fileNum := range pendingOutputs { 871 if _, ok := d.mu.compact.pendingOutputs[fileNum]; !ok { 872 panic("pebble: expected pending output not present") 873 } 874 delete(d.mu.compact.pendingOutputs, fileNum) 875 } 876 if err != nil { 877 return err 878 } 879 880 d.updateReadStateLocked() 881 d.deleteObsoleteFiles(jobID) 882 return nil 883 } 884 885 // runCompactions runs a compaction that produces new on-disk tables from 886 // memtables or old on-disk tables. 887 // 888 // d.mu must be held when calling this, but the mutex may be dropped and 889 // re-acquired during the course of this method. 890 func (d *DB) runCompaction(jobID int, c *compaction, pacer pacer) ( 891 ve *versionEdit, pendingOutputs []uint64, retErr error, 892 ) { 893 // Check for a trivial move of one table from one level to the next. We avoid 894 // such a move if there is lots of overlapping grandparent data. Otherwise, 895 // the move could create a parent file that will require a very expensive 896 // merge later on. 897 if c.trivialMove() { 898 meta := &c.inputs[0][0] 899 c.metrics = map[int]*LevelMetrics{ 900 c.outputLevel: &LevelMetrics{ 901 BytesMoved: meta.Size, 902 }, 903 } 904 ve := &versionEdit{ 905 DeletedFiles: map[deletedFileEntry]bool{ 906 deletedFileEntry{Level: c.startLevel, FileNum: meta.FileNum}: true, 907 }, 908 NewFiles: []newFileEntry{ 909 {Level: c.outputLevel, Meta: *meta}, 910 }, 911 } 912 return ve, nil, nil 913 } 914 915 defer func() { 916 if retErr != nil { 917 for _, fileNum := range pendingOutputs { 918 delete(d.mu.compact.pendingOutputs, fileNum) 919 } 920 pendingOutputs = nil 921 } 922 }() 923 924 snapshots := d.mu.snapshots.toSlice() 925 926 // Release the d.mu lock while doing I/O. 927 // Note the unusual order: Unlock and then Lock. 
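// The deferred Lock re-acquires d.mu before runCompaction returns, so the
// caller still holds the mutex on return as documented above.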
928 d.mu.Unlock() 929 defer d.mu.Lock() 930 931 iiter, err := c.newInputIter(d.newIters) 932 if err != nil { 933 return nil, pendingOutputs, err 934 } 935 iter := newCompactionIter(c.cmp, d.merge, iiter, snapshots, 936 c.allowZeroSeqNum(iiter), c.elideTombstone, c.elideRangeTombstone) 937 938 var ( 939 filenames []string 940 tw *sstable.Writer 941 ) 942 defer func() { 943 if iter != nil { 944 retErr = firstError(retErr, iter.Close()) 945 } 946 if tw != nil { 947 retErr = firstError(retErr, tw.Close()) 948 } 949 if retErr != nil { 950 for _, filename := range filenames { 951 d.opts.FS.Remove(filename) 952 } 953 } 954 }() 955 956 ve = &versionEdit{ 957 DeletedFiles: map[deletedFileEntry]bool{}, 958 } 959 960 metrics := &LevelMetrics{ 961 BytesIn: totalSize(c.inputs[0]), 962 BytesRead: totalSize(c.inputs[1]), 963 } 964 metrics.BytesRead += metrics.BytesIn 965 c.metrics = map[int]*LevelMetrics{ 966 c.outputLevel: metrics, 967 } 968 969 newOutput := func() error { 970 d.mu.Lock() 971 fileNum := d.mu.versions.getNextFileNum() 972 d.mu.compact.pendingOutputs[fileNum] = struct{}{} 973 pendingOutputs = append(pendingOutputs, fileNum) 974 d.mu.Unlock() 975 976 filename := base.MakeFilename(d.dirname, fileTypeTable, fileNum) 977 file, err := d.opts.FS.Create(filename) 978 if err != nil { 979 return err 980 } 981 if d.opts.EventListener.TableCreated != nil { 982 reason := "flushing" 983 if c.flushing == nil { 984 reason = "compacting" 985 } 986 d.opts.EventListener.TableCreated(TableCreateInfo{ 987 JobID: jobID, 988 Reason: reason, 989 Path: filename, 990 FileNum: fileNum, 991 }) 992 } 993 file = vfs.NewSyncingFile(file, vfs.SyncingFileOptions{ 994 BytesPerSync: d.opts.BytesPerSync, 995 }) 996 filenames = append(filenames, filename) 997 tw = sstable.NewWriter(file, d.opts, d.opts.Level(c.outputLevel)) 998 999 ve.NewFiles = append(ve.NewFiles, newFileEntry{ 1000 Level: c.outputLevel, 1001 Meta: fileMetadata{ 1002 FileNum: fileNum, 1003 }, 1004 }) 1005 return nil 1006 } 1007 1008 finishOutput := func(key InternalKey) error { 1009 // NB: clone the key because the data can be held on to by the call to 1010 // compactionIter.Tombstones via rangedel.Fragmenter.FlushTo. 1011 key = key.Clone() 1012 for _, v := range iter.Tombstones(key.UserKey) { 1013 if tw == nil { 1014 if err := newOutput(); err != nil { 1015 return err 1016 } 1017 } 1018 if err := tw.Add(v.Start, v.End); err != nil { 1019 return err 1020 } 1021 } 1022 1023 if tw == nil { 1024 return nil 1025 } 1026 1027 if err := tw.Close(); err != nil { 1028 tw = nil 1029 return err 1030 } 1031 writerMeta, err := tw.Metadata() 1032 if err != nil { 1033 tw = nil 1034 return err 1035 } 1036 tw = nil 1037 meta := &ve.NewFiles[len(ve.NewFiles)-1].Meta 1038 meta.Size = writerMeta.Size 1039 meta.SmallestSeqNum = writerMeta.SmallestSeqNum 1040 meta.LargestSeqNum = writerMeta.LargestSeqNum 1041 1042 metrics.BytesWritten += meta.Size 1043 1044 // The handling of range boundaries is a bit complicated. 1045 if n := len(ve.NewFiles); n > 1 { 1046 // This is not the first output. Bound the smallest range key by the 1047 // previous tables largest key. 1048 prevMeta := &ve.NewFiles[n-2].Meta 1049 if writerMeta.SmallestRange.UserKey != nil && 1050 d.cmp(writerMeta.SmallestRange.UserKey, prevMeta.Largest.UserKey) <= 0 { 1051 // The range boundary user key is less than or equal to the previous 1052 // table's largest key. We need the tables to be key-space partitioned, 1053 // so force the boundary to a key that we know is larger than the 1054 // previous key. 
1055 // 1056 // We use seqnum zero since seqnums are in descending order, and our 1057 // goal is to ensure this forged key does not overlap with the previous 1058 // file. `InternalKeyRangeDeleteSentinel` is actually the first key 1059 // kind as key kinds are also in descending order. But, this is OK 1060 // because choosing seqnum zero is already enough to prevent overlap 1061 // (the previous file could not end with a key at seqnum zero if this 1062 // file had a tombstone extending into it). 1063 writerMeta.SmallestRange = base.MakeInternalKey( 1064 prevMeta.Largest.UserKey, 0, InternalKeyKindRangeDelete) 1065 } 1066 } 1067 1068 if key.UserKey != nil && writerMeta.LargestRange.UserKey != nil { 1069 if d.cmp(writerMeta.LargestRange.UserKey, key.UserKey) >= 0 { 1070 writerMeta.LargestRange = key 1071 writerMeta.LargestRange.Trailer = InternalKeyRangeDeleteSentinel 1072 } 1073 } 1074 1075 meta.Smallest = writerMeta.Smallest(d.cmp) 1076 meta.Largest = writerMeta.Largest(d.cmp) 1077 return nil 1078 } 1079 1080 for key, val := iter.First(); key != nil; key, val = iter.Next() { 1081 atomic.StoreUint64(c.atomicBytesIterated, c.bytesIterated) 1082 1083 if err := pacer.maybeThrottle(c.bytesIterated); err != nil { 1084 return nil, pendingOutputs, err 1085 } 1086 1087 // TODO(peter,rangedel): Need to incorporate the range tombstones in the 1088 // shouldStopBefore decision. 1089 if tw != nil && (tw.EstimatedSize() >= c.maxOutputFileSize || c.shouldStopBefore(*key)) { 1090 if err := finishOutput(*key); err != nil { 1091 return nil, pendingOutputs, err 1092 } 1093 } 1094 1095 if tw == nil { 1096 if err := newOutput(); err != nil { 1097 return nil, pendingOutputs, err 1098 } 1099 } 1100 1101 if err := tw.Add(*key, val); err != nil { 1102 return nil, pendingOutputs, err 1103 } 1104 } 1105 1106 if err := finishOutput(InternalKey{}); err != nil { 1107 return nil, pendingOutputs, err 1108 } 1109 1110 for i := range c.inputs { 1111 level := c.startLevel 1112 if i == 1 { 1113 level = c.outputLevel 1114 } 1115 for _, f := range c.inputs[i] { 1116 ve.DeletedFiles[deletedFileEntry{ 1117 Level: level, 1118 FileNum: f.FileNum, 1119 }] = true 1120 } 1121 } 1122 1123 if err := d.dataDir.Sync(); err != nil { 1124 return nil, pendingOutputs, err 1125 } 1126 return ve, pendingOutputs, nil 1127 } 1128 1129 // scanObsoleteFiles scans the filesystem for files that are no longer needed 1130 // and adds those to the internal lists of obsolete files. Note that he files 1131 // are not actually deleted by this method. A subsequent call to 1132 // deleteObsoleteFiles must be performed. 1133 func (d *DB) scanObsoleteFiles(list []string) { 1134 liveFileNums := make(map[uint64]struct{}, len(d.mu.compact.pendingOutputs)) 1135 for fileNum := range d.mu.compact.pendingOutputs { 1136 liveFileNums[fileNum] = struct{}{} 1137 } 1138 d.mu.versions.addLiveFileNums(liveFileNums) 1139 logNumber := d.mu.versions.logNum 1140 manifestFileNumber := d.mu.versions.manifestFileNum 1141 1142 var obsoleteLogs []uint64 1143 var obsoleteTables []uint64 1144 var obsoleteManifests []uint64 1145 var obsoleteOptions []uint64 1146 1147 for _, filename := range list { 1148 fileType, fileNum, ok := base.ParseFilename(filename) 1149 if !ok { 1150 continue 1151 } 1152 switch fileType { 1153 case fileTypeLog: 1154 // TODO(peter): also look at prevLogNumber? 
1155 if fileNum >= logNumber { 1156 continue 1157 } 1158 obsoleteLogs = append(obsoleteLogs, fileNum) 1159 case fileTypeManifest: 1160 if fileNum >= manifestFileNumber { 1161 continue 1162 } 1163 obsoleteManifests = append(obsoleteManifests, fileNum) 1164 case fileTypeOptions: 1165 if fileNum >= d.optionsFileNum { 1166 continue 1167 } 1168 obsoleteOptions = append(obsoleteOptions, fileNum) 1169 case fileTypeTable: 1170 if _, ok := liveFileNums[fileNum]; ok { 1171 continue 1172 } 1173 obsoleteTables = append(obsoleteTables, fileNum) 1174 default: 1175 // Don't delete files we don't know about. 1176 continue 1177 } 1178 } 1179 1180 d.mu.log.queue = merge(d.mu.log.queue, obsoleteLogs) 1181 d.mu.versions.metrics.WAL.Files += int64(len(obsoleteLogs)) 1182 d.mu.versions.obsoleteTables = merge(d.mu.versions.obsoleteTables, obsoleteTables) 1183 d.mu.versions.obsoleteManifests = merge(d.mu.versions.obsoleteManifests, obsoleteManifests) 1184 d.mu.versions.obsoleteOptions = merge(d.mu.versions.obsoleteOptions, obsoleteOptions) 1185 } 1186 1187 // deleteObsoleteFiles deletes those files that are no longer needed. 1188 // 1189 // d.mu must be held when calling this, but the mutex may be dropped and 1190 // re-acquired during the course of this method. 1191 func (d *DB) deleteObsoleteFiles(jobID int) { 1192 // Only allow a single delete obsolete files job to run at a time. 1193 for d.mu.cleaner.cleaning { 1194 d.mu.cleaner.cond.Wait() 1195 } 1196 d.mu.cleaner.cleaning = true 1197 defer func() { 1198 d.mu.cleaner.cleaning = false 1199 d.mu.cleaner.cond.Signal() 1200 }() 1201 1202 var obsoleteLogs []uint64 1203 for i := range d.mu.log.queue { 1204 // NB: d.mu.versions.logNum is the file number of the latest log that 1205 // has had its contents persisted to the LSM. 1206 if d.mu.log.queue[i] >= d.mu.versions.logNum { 1207 obsoleteLogs = d.mu.log.queue[:i] 1208 d.mu.log.queue = d.mu.log.queue[i:] 1209 d.mu.versions.metrics.WAL.Files -= int64(len(obsoleteLogs)) 1210 break 1211 } 1212 } 1213 1214 obsoleteTables := d.mu.versions.obsoleteTables 1215 d.mu.versions.obsoleteTables = nil 1216 1217 obsoleteManifests := d.mu.versions.obsoleteManifests 1218 d.mu.versions.obsoleteManifests = nil 1219 1220 obsoleteOptions := d.mu.versions.obsoleteOptions 1221 d.mu.versions.obsoleteOptions = nil 1222 1223 // Release d.mu while doing I/O. 1224 // Note the unusual order: Unlock and then Lock. 1225 d.mu.Unlock() 1226 defer d.mu.Lock() 1227 1228 files := [4]struct { 1229 fileType fileType 1230 obsolete []uint64 1231 }{ 1232 {fileTypeLog, obsoleteLogs}, 1233 {fileTypeTable, obsoleteTables}, 1234 {fileTypeManifest, obsoleteManifests}, 1235 {fileTypeOptions, obsoleteOptions}, 1236 } 1237 for _, f := range files { 1238 // We sort to make the order of deletions deterministic, which is nice for 1239 // tests. 1240 sort.Slice(f.obsolete, func(i, j int) bool { 1241 return f.obsolete[i] < f.obsolete[j] 1242 }) 1243 for _, fileNum := range f.obsolete { 1244 switch f.fileType { 1245 case fileTypeLog: 1246 if d.logRecycler.add(fileNum) { 1247 continue 1248 } 1249 case fileTypeTable: 1250 d.tableCache.evict(fileNum) 1251 } 1252 1253 path := base.MakeFilename(d.dirname, f.fileType, fileNum) 1254 err := d.opts.FS.Remove(path) 1255 if err == os.ErrNotExist { 1256 continue 1257 } 1258 1259 // TODO(peter): need to handle this error, probably by re-adding the 1260 // file that couldn't be deleted to one of the obsolete file slices.
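// Notify any configured event listener of the deletion. An error from
// FS.Remove (other than os.ErrNotExist, which was skipped above) is surfaced
// via the Err field of the info struct.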
1261 1262 switch f.fileType { 1263 case fileTypeLog: 1264 if d.opts.EventListener.WALDeleted != nil { 1265 d.opts.EventListener.WALDeleted(WALDeleteInfo{ 1266 JobID: jobID, 1267 Path: path, 1268 FileNum: fileNum, 1269 Err: err, 1270 }) 1271 } 1272 case fileTypeManifest: 1273 if d.opts.EventListener.ManifestDeleted != nil { 1274 d.opts.EventListener.ManifestDeleted(ManifestDeleteInfo{ 1275 JobID: jobID, 1276 Path: path, 1277 FileNum: fileNum, 1278 Err: err, 1279 }) 1280 } 1281 case fileTypeTable: 1282 if d.opts.EventListener.TableDeleted != nil { 1283 d.opts.EventListener.TableDeleted(TableDeleteInfo{ 1284 JobID: jobID, 1285 Path: path, 1286 FileNum: fileNum, 1287 Err: err, 1288 }) 1289 } 1290 } 1291 } 1292 } 1293 } 1294 1295 func merge(a, b []uint64) []uint64 { 1296 if len(b) == 0 { 1297 return a 1298 } 1299 1300 a = append(a, b...) 1301 sort.Slice(a, func(i, j int) bool { 1302 return a[i] < a[j] 1303 }) 1304 1305 n := 0 1306 for i := 0; i < len(a); i++ { 1307 if n == 0 || a[i] != a[n-1] { 1308 a[n] = a[i] 1309 n++ 1310 } 1311 } 1312 return a[:n] 1313 }
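// For instance (illustrative): merge([]uint64{3, 7}, []uint64{5, 7}) returns
// []uint64{3, 5, 7}; the two slices are concatenated, sorted, and
// de-duplicated in place.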