github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/compaction_picker.go (about)

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"math"

	"github.com/petermattis/pebble/internal/manifest"
)

// compactionPicker holds the state and logic for picking a compaction. A
// compaction picker is associated with a single version. A new compaction
// picker is created and initialized every time a new version is installed.
type compactionPicker struct {
	opts *Options
	vers *version

	// The level to target for L0 compactions. Levels L1 through baseLevel-1
	// must be empty.
	baseLevel int

	// estimatedMaxWAmp is the estimated maximum write amplification per byte
	// that is added to L0.
	estimatedMaxWAmp float64

	// smoothedLevelMultiplier is the size ratio between one level and the next.
	smoothedLevelMultiplier float64

	// levelMaxBytes holds the dynamically adjusted max bytes setting for each
	// level.
	levelMaxBytes [numLevels]int64

	// These fields hold the level that should be compacted next, its
	// compaction score, and the file within that level to use as the seed of
	// the compaction. A score < 1 means that compaction is not strictly
	// needed.
	score float64
	level int
	file  int
}

func newCompactionPicker(v *version, opts *Options) *compactionPicker {
	p := &compactionPicker{
		opts: opts,
		vers: v,
	}
	p.initLevelMaxBytes(v, opts)
	p.initTarget(v, opts)
	return p
}

func (p *compactionPicker) compactionNeeded() bool {
	if p == nil {
		return false
	}
	return p.score >= 1
}

// estimatedCompactionDebt estimates the number of bytes which need to be
// compacted before the LSM tree becomes stable.
func (p *compactionPicker) estimatedCompactionDebt(l0ExtraSize uint64) uint64 {
	if p == nil {
		return 0
	}

	compactionDebt := totalSize(p.vers.Files[0]) + l0ExtraSize
	bytesAddedToNextLevel := compactionDebt

	levelSize := totalSize(p.vers.Files[p.baseLevel])
	// estimatedL0CompactionSize is the estimated size of the L0 component in the
	// current or next L0->LBase compaction. This is needed to estimate the number
	// of L0->LBase compactions which will need to occur for the LSM tree to
	// become stable.
	estimatedL0CompactionSize := uint64(p.opts.L0CompactionThreshold * p.opts.MemTableSize)
	// The ratio bytesAddedToNextLevel (the L0 size, including l0ExtraSize) /
	// estimatedL0CompactionSize is the estimated number of L0->LBase compactions
	// which will need to occur for the LSM tree to become stable. We multiply
	// this by levelSize (the LBase size) to estimate the compaction debt incurred
	// by LBase in the L0->LBase compactions.
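	//
	// As a rough worked example (hypothetical numbers, not defaults taken from
	// Options): with MemTableSize = 64 MB and L0CompactionThreshold = 4,
	// estimatedL0CompactionSize is 256 MB. If L0 plus l0ExtraSize amounts to
	// 512 MB and LBase holds 1 GB, that is roughly 512/256 = 2 L0->LBase
	// compactions, each rewriting LBase, so roughly 2 GB is added to the debt
	// here.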
	compactionDebt += (levelSize * bytesAddedToNextLevel) / estimatedL0CompactionSize

	var nextLevelSize uint64
	for level := p.baseLevel; level < numLevels-1; level++ {
		levelSize += bytesAddedToNextLevel
		bytesAddedToNextLevel = 0
		nextLevelSize = totalSize(p.vers.Files[level+1])
		if levelSize > uint64(p.levelMaxBytes[level]) {
			bytesAddedToNextLevel = levelSize - uint64(p.levelMaxBytes[level])
			levelRatio := float64(nextLevelSize) / float64(levelSize)
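			// The spilled bytes must be written into level+1, and an estimated
			// levelRatio*bytesAddedToNextLevel bytes of level+1 overlap them and
			// are rewritten alongside, hence the (levelRatio + 1) factor below.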
			compactionDebt += uint64(float64(bytesAddedToNextLevel) * (levelRatio + 1))
		}
		levelSize = nextLevelSize
	}

	return compactionDebt
}

func (p *compactionPicker) initLevelMaxBytes(v *version, opts *Options) {
	// Determine the first non-empty level and the size of the last (bottom-most)
	// non-empty level.
	firstNonEmptyLevel := -1
	var bottomLevelSize int64
	for level := 1; level < numLevels; level++ {
		levelSize := int64(totalSize(v.Files[level]))
		if levelSize > 0 {
			if firstNonEmptyLevel == -1 {
				firstNonEmptyLevel = level
			}
			bottomLevelSize = levelSize
		}
	}

	// Initialize the max-bytes setting for each level to "infinity" which will
	// disallow compaction for that level. We'll fill in the actual value below
	// for levels we want to allow compactions from.
	for level := 0; level < numLevels; level++ {
		p.levelMaxBytes[level] = math.MaxInt64
	}

	if bottomLevelSize == 0 {
		// No level other than L0 contains any data. Target L0 compactions at the
		// last level.
		p.baseLevel = numLevels - 1
		return
	}

	levelMultiplier := 10.0

	baseBytesMax := opts.LBaseMaxBytes
	baseBytesMin := int64(float64(baseBytesMax) / levelMultiplier)

	curLevelSize := bottomLevelSize
	for level := numLevels - 2; level >= firstNonEmptyLevel; level-- {
		curLevelSize = int64(float64(curLevelSize) / levelMultiplier)
	}

	if curLevelSize <= baseBytesMin {
		// If we were to make the target size of the last level bottomLevelSize, the
		// target size of the first non-empty level would be smaller than
		// baseBytesMin. We set it to baseBytesMin instead.
		p.baseLevel = firstNonEmptyLevel
	} else {
		// Compute the base level (the level that L0 data is compacted into).
		p.baseLevel = firstNonEmptyLevel
		for p.baseLevel > 1 && curLevelSize > baseBytesMax {
			p.baseLevel--
			curLevelSize = int64(float64(curLevelSize) / levelMultiplier)
		}
	}
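	// For illustration (hypothetical sizes): with LBaseMaxBytes = 64 MB, a
	// 640 GB L6 as the bottom level, and firstNonEmptyLevel = 4, curLevelSize
	// works out to 6.4 GB above, so the else branch walks the base level down
	// to L2, where the projected level size first drops to 64 MB.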

	if p.baseLevel < numLevels-1 {
		p.smoothedLevelMultiplier = math.Pow(
			float64(bottomLevelSize)/float64(baseBytesMax),
			1.0/float64(numLevels-p.baseLevel-1))
	} else {
		p.smoothedLevelMultiplier = 1.0
	}

	p.estimatedMaxWAmp = float64(numLevels-p.baseLevel) * (p.smoothedLevelMultiplier + 1)
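	// Continuing the hypothetical example above (640 GB bottom level,
	// LBaseMaxBytes = 64 MB, baseLevel = 2, numLevels = 7): the multiplier is
	// (640 GB / 64 MB)^(1/4) = 10 and estimatedMaxWAmp = 5 * 11 = 55.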

	levelSize := float64(baseBytesMax)
	for level := p.baseLevel; level < numLevels; level++ {
		if level > p.baseLevel && levelSize > 0 {
			levelSize *= p.smoothedLevelMultiplier
		}
		// Round the result since test cases use small target level sizes, which
		// can be impacted by floating-point imprecision + integer truncation.
		roundedLevelSize := math.Round(levelSize)
		if roundedLevelSize > float64(math.MaxInt64) {
			p.levelMaxBytes[level] = math.MaxInt64
		} else {
			p.levelMaxBytes[level] = int64(roundedLevelSize)
		}
	}
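	// Under the same hypothetical numbers this yields level targets of 64 MB,
	// 640 MB, 6.4 GB, 64 GB and 640 GB for L2 through L6.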
}

// initTarget initializes the compaction score and level. If the compaction
// score indicates compaction is needed, a target table within the target level
// is selected for compaction.
func (p *compactionPicker) initTarget(v *version, opts *Options) {
	// We treat level-0 specially by bounding the number of files instead of
	// number of bytes for two reasons:
	//
	// (1) With larger write-buffer sizes, it is nice not to do too many
	// level-0 compactions.
	//
	// (2) The files in level-0 are merged on every read and therefore we
	// wish to avoid too many files when the individual file size is small
	// (perhaps because of a small write-buffer setting, or very high
	// compression ratios, or lots of overwrites/deletions).
	p.score = float64(len(v.Files[0])) / float64(opts.L0CompactionThreshold)
	p.level = 0

	for level := 1; level < numLevels-1; level++ {
		score := float64(totalSize(v.Files[level])) / float64(p.levelMaxBytes[level])
		if p.score < score {
			p.score = score
			p.level = level
		}
	}
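	// For illustration (hypothetical numbers): with L0CompactionThreshold = 4,
	// five L0 files give L0 a score of 1.25, while a level holding 96 MB
	// against a 64 MB levelMaxBytes target scores 1.5 and is picked instead.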

	if p.score >= 1 {
		// TODO(peter): Select the file within the level to compact. See the
		// kMinOverlappingRatio heuristic in RocksDB which chooses the file with the
		// minimum overlapping ratio with the next level. This minimizes write
		// amplification. We also want to compute a "compensated size" which adjusts
		// the size of a table based on the number of deletions it contains.
		//
		// We want to minimize write amplification, but also ensure that deletes
		// are propagated to the bottom level in a timely fashion so as to reclaim
		// disk space. A table's smallest sequence number provides a measure of its
		// age. The ratio of overlapping-bytes / table-size gives an indication of
		// write amplification (a smaller ratio is preferable).
		//
		// Simulate various workloads:
		// - Uniform random write
		// - Uniform random write+delete
		// - Skewed random write
		// - Skewed random write+delete
		// - Sequential write
		// - Sequential write+delete (queue)

		// The current heuristic matches the RocksDB kOldestSmallestSeqFirst
		// heuristic.
		smallestSeqNum := uint64(math.MaxUint64)
		files := v.Files[p.level]
		for i := range files {
			f := &files[i]
			if smallestSeqNum > f.SmallestSeqNum {
				smallestSeqNum = f.SmallestSeqNum
				p.file = i
			}
		}
		return
	}

	// No levels exceeded their size threshold. Check for forced compactions.
	for level := 0; level < numLevels-1; level++ {
		files := v.Files[level]
		for i := range files {
			f := &files[i]
			if f.MarkedForCompaction {
				p.score = 1.0
				p.level = level
				p.file = i
				return
			}
		}
	}

	// TODO(peter): When a snapshot is released, we may need to compact tables at
	// the bottom level in order to free up entries that were pinned by the
	// snapshot.
}

// pickAuto picks the best compaction, if any.
func (p *compactionPicker) pickAuto(
	opts *Options,
	bytesCompacted *uint64,
) (c *compaction) {
	if !p.compactionNeeded() {
		return nil
	}

	vers := p.vers
	c = newCompaction(opts, vers, p.level, p.baseLevel, bytesCompacted)
	c.inputs[0] = vers.Files[c.startLevel][p.file : p.file+1]

	// Files in level 0 may overlap each other, so pick up all overlapping ones.
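	// For illustration: if the seed table spans keys [c,f] and another L0 table
	// spans [a,d], the two key ranges overlap, so both tables become inputs to
	// the compaction.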
	if c.startLevel == 0 {
		cmp := opts.Comparer.Compare
		smallest, largest := manifest.KeyRange(cmp, c.inputs[0], nil)
		c.inputs[0] = vers.Overlaps(0, cmp, smallest.UserKey, largest.UserKey)
		if len(c.inputs[0]) == 0 {
			panic("pebble: empty compaction")
		}
	}

	c.setupOtherInputs()
	return c
}

func (p *compactionPicker) pickManual(
	opts *Options,
	manual *manualCompaction,
	bytesCompacted *uint64,
) (c *compaction) {
	if p == nil {
		return nil
	}

	// TODO(peter): The logic here is untested and possibly incomplete.
	cur := p.vers
	c = newCompaction(opts, cur, manual.level, p.baseLevel, bytesCompacted)
	manual.outputLevel = c.outputLevel
	cmp := opts.Comparer.Compare
	c.inputs[0] = cur.Overlaps(manual.level, cmp, manual.start.UserKey, manual.end.UserKey)
	if len(c.inputs[0]) == 0 {
		return nil
	}
	c.setupOtherInputs()
	return c
}