github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/compaction_picker.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"math"

	"github.com/petermattis/pebble/internal/manifest"
)

// compactionPicker holds the state and logic for picking a compaction. A
// compaction picker is associated with a single version. A new compaction
// picker is created and initialized every time a new version is installed.
type compactionPicker struct {
	opts *Options
	vers *version

	// The level to target for L0 compactions. Levels L1 to baseLevel must be
	// empty.
	baseLevel int

	// estimatedMaxWAmp is the estimated maximum write amp per byte that is
	// added to L0.
	estimatedMaxWAmp float64

	// smoothedLevelMultiplier is the size ratio between one level and the next.
	smoothedLevelMultiplier float64

	// levelMaxBytes holds the dynamically adjusted max bytes setting for each
	// level.
	levelMaxBytes [numLevels]int64

	// These fields are the level that should be compacted next and its
	// compaction score. A score < 1 means that compaction is not strictly
	// needed.
	score float64
	level int
	file  int
}

func newCompactionPicker(v *version, opts *Options) *compactionPicker {
	p := &compactionPicker{
		opts: opts,
		vers: v,
	}
	p.initLevelMaxBytes(v, opts)
	p.initTarget(v, opts)
	return p
}

func (p *compactionPicker) compactionNeeded() bool {
	if p == nil {
		return false
	}
	return p.score >= 1
}

// estimatedCompactionDebt estimates the number of bytes which need to be
// compacted before the LSM tree becomes stable.
func (p *compactionPicker) estimatedCompactionDebt(l0ExtraSize uint64) uint64 {
	if p == nil {
		return 0
	}

	compactionDebt := totalSize(p.vers.Files[0]) + l0ExtraSize
	bytesAddedToNextLevel := compactionDebt

	levelSize := totalSize(p.vers.Files[p.baseLevel])
	// estimatedL0CompactionSize is the estimated size of the L0 component in the
	// current or next L0->LBase compaction. This is needed to estimate the number
	// of L0->LBase compactions which will need to occur for the LSM tree to
	// become stable.
	estimatedL0CompactionSize := uint64(p.opts.L0CompactionThreshold * p.opts.MemTableSize)
	// The ratio bytesAddedToNextLevel (L0 size) / estimatedL0CompactionSize is
	// the estimated number of L0->LBase compactions which will need to occur for
	// the LSM tree to become stable. We multiply this by levelSize (LBase size)
	// to estimate the compaction debt incurred by LBase in the L0->LBase
	// compactions.
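	//
	// A worked example (hypothetical sizes, not taken from the source): with a
	// 64 MB memtable, L0CompactionThreshold=4, 256 MB in L0 and a 1 GB LBase,
	// estimatedL0CompactionSize = 4 * 64 MB = 256 MB, predicting 256/256 = 1
	// L0->LBase compaction and therefore roughly 1 GB of additional debt for
	// rewriting LBase.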
	compactionDebt += (levelSize * bytesAddedToNextLevel) / estimatedL0CompactionSize

	var nextLevelSize uint64
	for level := p.baseLevel; level < numLevels-1; level++ {
		levelSize += bytesAddedToNextLevel
		bytesAddedToNextLevel = 0
		nextLevelSize = totalSize(p.vers.Files[level+1])
		if levelSize > uint64(p.levelMaxBytes[level]) {
			bytesAddedToNextLevel = levelSize - uint64(p.levelMaxBytes[level])
			levelRatio := float64(nextLevelSize) / float64(levelSize)
			compactionDebt += uint64(float64(bytesAddedToNextLevel) * (levelRatio + 1))
		}
		levelSize = nextLevelSize
	}

	return compactionDebt
}

func (p *compactionPicker) initLevelMaxBytes(v *version, opts *Options) {
	// Determine the first non-empty level and the maximum size of any level.
	firstNonEmptyLevel := -1
	var bottomLevelSize int64
	for level := 1; level < numLevels; level++ {
		levelSize := int64(totalSize(v.Files[level]))
		if levelSize > 0 {
			if firstNonEmptyLevel == -1 {
				firstNonEmptyLevel = level
			}
			bottomLevelSize = levelSize
		}
	}

	// Initialize the max-bytes setting for each level to "infinity" which will
	// disallow compaction for that level. We'll fill in the actual value below
	// for levels we want to allow compactions from.
	for level := 0; level < numLevels; level++ {
		p.levelMaxBytes[level] = math.MaxInt64
	}

	if bottomLevelSize == 0 {
		// No level from L1 and up contains any data. Target L0 compactions for
		// the last level.
		p.baseLevel = numLevels - 1
		return
	}

	levelMultiplier := 10.0

	baseBytesMax := opts.LBaseMaxBytes
	baseBytesMin := int64(float64(baseBytesMax) / levelMultiplier)

	curLevelSize := bottomLevelSize
	for level := numLevels - 2; level >= firstNonEmptyLevel; level-- {
		curLevelSize = int64(float64(curLevelSize) / levelMultiplier)
	}

	if curLevelSize <= baseBytesMin {
		// If we made the target size of the last level equal to bottomLevelSize,
		// the target size of the first non-empty level would be smaller than
		// baseBytesMin. We set it to baseBytesMin instead.
		p.baseLevel = firstNonEmptyLevel
	} else {
		// Compute base level (where L0 data is compacted to).
		p.baseLevel = firstNonEmptyLevel
		for p.baseLevel > 1 && curLevelSize > baseBytesMax {
			p.baseLevel--
			curLevelSize = int64(float64(curLevelSize) / levelMultiplier)
		}
	}

	if p.baseLevel < numLevels-1 {
		p.smoothedLevelMultiplier = math.Pow(
			float64(bottomLevelSize)/float64(baseBytesMax),
			1.0/float64(numLevels-p.baseLevel-1))
	} else {
		p.smoothedLevelMultiplier = 1.0
	}

	p.estimatedMaxWAmp = float64(numLevels-p.baseLevel) * (p.smoothedLevelMultiplier + 1)

	levelSize := float64(baseBytesMax)
	for level := p.baseLevel; level < numLevels; level++ {
		if level > p.baseLevel && levelSize > 0 {
			levelSize *= p.smoothedLevelMultiplier
		}
		// Round the result since test cases use small target level sizes, which
		// can be impacted by floating-point imprecision + integer truncation.
		roundedLevelSize := math.Round(levelSize)
		if roundedLevelSize > float64(math.MaxInt64) {
			p.levelMaxBytes[level] = math.MaxInt64
		} else {
			p.levelMaxBytes[level] = int64(roundedLevelSize)
		}
	}
}
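
// A worked example of the sizing above (hypothetical values, not taken from
// the source): with numLevels=7, LBaseMaxBytes=64 MB, baseLevel=1 and a 64 GB
// bottom level, smoothedLevelMultiplier = (64 GB / 64 MB)^(1/5) = 1024^(1/5)
// = 4, yielding level targets of 64 MB, 256 MB, 1 GB, 4 GB, 16 GB and 64 GB,
// and estimatedMaxWAmp = 6 * (4 + 1) = 30.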

// initTarget initializes the compaction score and level. If the compaction
// score indicates compaction is needed, a target table within the target level
// is selected for compaction.
func (p *compactionPicker) initTarget(v *version, opts *Options) {
	// We treat level-0 specially by bounding the number of files instead of
	// number of bytes for two reasons:
	//
	// (1) With larger write-buffer sizes, it is nice not to do too many
	// level-0 compactions.
	//
	// (2) The files in level-0 are merged on every read and therefore we
	// wish to avoid too many files when the individual file size is small
	// (perhaps because of a small write-buffer setting, or very high
	// compression ratios, or lots of overwrites/deletions).
	p.score = float64(len(v.Files[0])) / float64(opts.L0CompactionThreshold)
	p.level = 0

	for level := 1; level < numLevels-1; level++ {
		score := float64(totalSize(v.Files[level])) / float64(p.levelMaxBytes[level])
		if p.score < score {
			p.score = score
			p.level = level
		}
	}

	if p.score >= 1 {
		// TODO(peter): Select the file within the level to compact. See the
		// kMinOverlappingRatio heuristic in RocksDB which chooses the file with
		// the minimum overlapping ratio with the next level. This minimizes
		// write amplification. We also want to compute a "compensated size"
		// which adjusts the size of a table based on the number of deletions it
		// contains.
		//
		// We want to minimize write amplification, but also ensure that deletes
		// are propagated to the bottom level in a timely fashion so as to reclaim
		// disk space. A table's smallest sequence number provides a measure of its
		// age. The ratio of overlapping-bytes / table-size gives an indication of
		// write amplification (a smaller ratio is preferable).
		//
		// Simulate various workloads:
		// - Uniform random write
		// - Uniform random write+delete
		// - Skewed random write
		// - Skewed random write+delete
		// - Sequential write
		// - Sequential write+delete (queue)

		// The current heuristic matches the RocksDB kOldestSmallestSeqFirst
		// heuristic.
		smallestSeqNum := uint64(math.MaxUint64)
		files := v.Files[p.level]
		for i := range files {
			f := &files[i]
			if smallestSeqNum > f.SmallestSeqNum {
				smallestSeqNum = f.SmallestSeqNum
				p.file = i
			}
		}
		return
	}

	// No levels exceeded their size threshold. Check for forced compactions.
	for level := 0; level < numLevels-1; level++ {
		files := v.Files[level]
		for i := range files {
			f := &files[i]
			if f.MarkedForCompaction {
				p.score = 1.0
				p.level = level
				p.file = i
				return
			}
		}
	}

	// TODO(peter): When a snapshot is released, we may need to compact tables at
	// the bottom level in order to free up entries that were pinned by the
	// snapshot.
}

// pickAuto picks the best compaction, if any.
func (p *compactionPicker) pickAuto(
	opts *Options,
	bytesCompacted *uint64,
) (c *compaction) {
	if !p.compactionNeeded() {
		return nil
	}

	vers := p.vers
	c = newCompaction(opts, vers, p.level, p.baseLevel, bytesCompacted)
	c.inputs[0] = vers.Files[c.startLevel][p.file : p.file+1]

	// Files in level 0 may overlap each other, so pick up all overlapping ones.
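	// For example (hypothetical key ranges): if the seed file spans [c, f]
	// while L0 also contains files spanning [a, d] and [e, h], all three files
	// must be compacted together, since L0 files may contain overlapping keys
	// and the newer files shadow the older ones.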
	if c.startLevel == 0 {
		cmp := opts.Comparer.Compare
		smallest, largest := manifest.KeyRange(cmp, c.inputs[0], nil)
		c.inputs[0] = vers.Overlaps(0, cmp, smallest.UserKey, largest.UserKey)
		if len(c.inputs[0]) == 0 {
			panic("pebble: empty compaction")
		}
	}

	c.setupOtherInputs()
	return c
}

func (p *compactionPicker) pickManual(
	opts *Options,
	manual *manualCompaction,
	bytesCompacted *uint64,
) (c *compaction) {
	if p == nil {
		return nil
	}

	// TODO(peter): The logic here is untested and possibly incomplete.
	cur := p.vers
	c = newCompaction(opts, cur, manual.level, p.baseLevel, bytesCompacted)
	manual.outputLevel = c.outputLevel
	cmp := opts.Comparer.Compare
	c.inputs[0] = cur.Overlaps(manual.level, cmp, manual.start.UserKey, manual.end.UserKey)
	if len(c.inputs[0]) == 0 {
		return nil
	}
	c.setupOtherInputs()
	return c
}
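
// A minimal sketch (hypothetical driver code, not part of this file) of how
// the picker is consumed: a new picker is built for each installed version and
// then queried for work.
//
//	picker := newCompactionPicker(vers, opts)
//	if picker.compactionNeeded() {
//		var bytesCompacted uint64
//		if c := picker.pickAuto(opts, &bytesCompacted); c != nil {
//			// Run the compaction described by c.
//		}
//	}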