github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/compaction_picker.go

// Copyright 2018 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"bytes"
	"fmt"
	"math"
	"sort"

	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/humanize"
	"github.com/zuoyebang/bitalostable/internal/manifest"
)

// The minimum count for an intra-L0 compaction. This matches the RocksDB
// heuristic.
const minIntraL0Count = 4

const levelMultiplier = 10

type compactionEnv struct {
	earliestUnflushedSeqNum uint64
	earliestSnapshotSeqNum  uint64
	inProgressCompactions   []compactionInfo
	readCompactionEnv       readCompactionEnv
}

type compactionPicker interface {
	getScores([]compactionInfo) [numLevels]float64
	getBaseLevel() int
	getEstimatedMaxWAmp() float64
	estimatedCompactionDebt(l0ExtraSize uint64) uint64
	pickAuto(env compactionEnv) (pc *pickedCompaction)
	pickManual(env compactionEnv, manual *manualCompaction) (c *pickedCompaction, retryLater bool)
	pickElisionOnlyCompaction(env compactionEnv) (pc *pickedCompaction)
	pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction)
	pickReadTriggeredCompaction(env compactionEnv) (pc *pickedCompaction)
	forceBaseLevel1()
}

// readCompactionEnv is used to hold data required to perform read compactions.
type readCompactionEnv struct {
	rescheduleReadCompaction *bool
	readCompactions          *readCompactionQueue
	flushing                 bool
}

// Information about in-progress compactions provided to the compaction picker.
// These are used to constrain the new compactions that will be picked.
type compactionInfo struct {
	inputs      []compactionLevel
	outputLevel int
	smallest    InternalKey
	largest     InternalKey
}

func (info compactionInfo) String() string {
	var buf bytes.Buffer
	var largest int
	for i, in := range info.inputs {
		if i > 0 {
			fmt.Fprintf(&buf, " -> ")
		}
		fmt.Fprintf(&buf, "L%d", in.level)
		in.files.Each(func(m *fileMetadata) {
			fmt.Fprintf(&buf, " %s", m.FileNum)
		})
		if largest < in.level {
			largest = in.level
		}
	}
	if largest != info.outputLevel || len(info.inputs) == 1 {
		fmt.Fprintf(&buf, " -> L%d", info.outputLevel)
	}
	return buf.String()
}

type sortCompactionLevelsDecreasingScore []candidateLevelInfo

func (s sortCompactionLevelsDecreasingScore) Len() int {
	return len(s)
}
func (s sortCompactionLevelsDecreasingScore) Less(i, j int) bool {
	if s[i].score != s[j].score {
		return s[i].score > s[j].score
	}
	return s[i].level < s[j].level
}
func (s sortCompactionLevelsDecreasingScore) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}
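
// Illustrative usage (not part of the original source): the picker sorts its
// per-level candidates with the helper above, e.g.
//
//	candidates := []candidateLevelInfo{{level: 4, score: 2.1}, {level: 2, score: 2.1}, {level: 5, score: 0.7}}
//	sort.Sort(sortCompactionLevelsDecreasingScore(candidates))
//
// candidates is now ordered L2, L4, L5: ties on score break toward the smaller
// level number, i.e. the level closer to the top of the LSM.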

// sublevelInfo is used to tag a LevelSlice for an L0 sublevel with the
// sublevel.
type sublevelInfo struct {
	manifest.LevelSlice
	sublevel manifest.Level
}

// generateSublevelInfo will generate the level slices for each of the sublevels
// from the level slice for all of L0.
func generateSublevelInfo(cmp base.Compare, levelFiles manifest.LevelSlice) []sublevelInfo {
	sublevelMap := make(map[uint64][]*fileMetadata)
	it := levelFiles.Iter()
	for f := it.First(); f != nil; f = it.Next() {
		sublevelMap[uint64(f.SubLevel)] = append(sublevelMap[uint64(f.SubLevel)], f)
	}

	var sublevels []int
	for level := range sublevelMap {
		sublevels = append(sublevels, int(level))
	}
	sort.Ints(sublevels)

	var levelSlices []sublevelInfo
	for _, sublevel := range sublevels {
		metas := sublevelMap[uint64(sublevel)]
		levelSlices = append(
			levelSlices,
			sublevelInfo{
				manifest.NewLevelSliceKeySorted(cmp, metas),
				manifest.L0Sublevel(sublevel),
			},
		)
	}
	return levelSlices
}

// pickedCompaction contains information about a compaction that has already
// been chosen, and is being constructed. Compaction construction info lives in
// this struct, and is copied over into the compaction struct when that's
// created.
type pickedCompaction struct {
	cmp Compare

	// score of the chosen compaction. Taken from candidateLevelInfo.
	score float64

	// kind indicates the kind of compaction.
	kind compactionKind

	// startLevel is the level that is being compacted. Inputs from startLevel
	// and outputLevel will be merged to produce a set of outputLevel files.
	startLevel *compactionLevel

	// outputLevel is the level that files are being produced in. outputLevel is
	// equal to startLevel+1, except when:
	//   - startLevel is 0, in which case the output level equals
	//     compactionPicker.baseLevel().
	//   - in multilevel compaction, the output level is the lowest level
	//     involved in the compaction.
	outputLevel *compactionLevel

	// extraLevels contains additional levels in between the input and output
	// levels that get compacted in multilevel compactions.
	extraLevels []*compactionLevel

	// adjustedOutputLevel is the output level used for the purpose of
	// determining the target output file size, overlap bytes, and expanded
	// bytes, taking into account the base level.
	adjustedOutputLevel int

	inputs []compactionLevel

	// L0-specific compaction info. Set to a non-nil value for all compactions
	// where startLevel == 0 that were generated by L0Sublevels.
	lcf *manifest.L0CompactionFiles

	// l0SublevelInfo is used for compactions out of L0. It is nil for all
	// other compactions.
	l0SublevelInfo []sublevelInfo

	// maxOutputFileSize is the maximum size of an individual table created
	// during compaction.
	maxOutputFileSize uint64
	// maxOverlapBytes is the maximum number of bytes of overlap allowed for a
	// single output table with the tables in the grandparent level.
	maxOverlapBytes uint64
	// maxReadCompactionBytes is the maximum number of bytes a read compaction
	// is allowed to overlap in its output level with. If the overlap is
	// greater than maxReadCompactionBytes, then we don't proceed with the
	// compaction.
	maxReadCompactionBytes uint64

	// The boundaries of the input data.
	smallest InternalKey
	largest  InternalKey

	version *version
}

func defaultOutputLevel(startLevel, baseLevel int) int {
	outputLevel := startLevel + 1
	if startLevel == 0 {
		outputLevel = baseLevel
	}
	if outputLevel >= numLevels-1 {
		outputLevel = numLevels - 1
	}
	return outputLevel
}

func newPickedCompaction(
	opts *Options, cur *version, startLevel, outputLevel, baseLevel int,
) *pickedCompaction {
	if startLevel > 0 && startLevel < baseLevel {
		panic(fmt.Sprintf("invalid compaction: start level %d should not be empty (base level %d)",
			startLevel, baseLevel))
	}

	adjustedOutputLevel := outputLevel
	if adjustedOutputLevel > 0 {
		// Output level is in the range [baseLevel,numLevels]. For the purpose of
		// determining the target output file size, overlap bytes, and expanded
		// bytes, we want to adjust the range to [1,numLevels].
		adjustedOutputLevel = 1 + outputLevel - baseLevel
	}

	pc := &pickedCompaction{
		cmp:                    opts.Comparer.Compare,
		version:                cur,
		inputs:                 []compactionLevel{{level: startLevel}, {level: outputLevel}},
		adjustedOutputLevel:    adjustedOutputLevel,
		maxOutputFileSize:      uint64(opts.Level(adjustedOutputLevel).TargetFileSize),
		maxOverlapBytes:        maxGrandparentOverlapBytes(opts, adjustedOutputLevel),
		maxReadCompactionBytes: maxReadCompactionBytes(opts, adjustedOutputLevel),
	}
	pc.startLevel = &pc.inputs[0]
	pc.outputLevel = &pc.inputs[1]
	return pc
}

func newPickedCompactionFromL0(
	lcf *manifest.L0CompactionFiles, opts *Options, vers *version, baseLevel int, isBase bool,
) *pickedCompaction {
	outputLevel := baseLevel
	if !isBase {
		outputLevel = 0 // Intra L0
	}

	pc := newPickedCompaction(opts, vers, 0, outputLevel, baseLevel)
	pc.lcf = lcf
	pc.outputLevel.level = outputLevel

	// Manually build the compaction as opposed to calling
	// pickAutoHelper. This is because L0Sublevels has already added
	// any overlapping L0 SSTables that need to be added, and
	// because compactions built by L0SSTables do not necessarily
	// pick contiguous sequences of files in pc.version.Levels[0].
	files := make([]*manifest.FileMetadata, 0, len(lcf.Files))
	iter := vers.Levels[0].Iter()
	for f := iter.First(); f != nil; f = iter.Next() {
		if lcf.FilesIncluded[f.L0Index] {
			files = append(files, f)
		}
	}
	pc.startLevel.files = manifest.NewLevelSliceSeqSorted(files)
	return pc
}

// maybeExpandBounds is a helper function for setupInputs which ensures the
// pickedCompaction's smallest and largest internal keys are updated iff
// the candidate keys expand the key span. This avoids a bug for multi-level
// compactions: during the second call to setupInputs, the picked compaction's
// smallest and largest keys should not decrease the key span.
func (pc *pickedCompaction) maybeExpandBounds(smallest InternalKey, largest InternalKey) {
	emptyKey := InternalKey{}
	if base.InternalCompare(pc.cmp, smallest, emptyKey) == 0 {
		if base.InternalCompare(pc.cmp, largest, emptyKey) != 0 {
			panic("either both candidate keys are empty or neither are empty")
		}
		return
	}
	if base.InternalCompare(pc.cmp, pc.smallest, emptyKey) == 0 {
		if base.InternalCompare(pc.cmp, pc.largest, emptyKey) != 0 {
			panic("either both pc keys are empty or neither are empty")
		}
		pc.smallest = smallest
		pc.largest = largest
		return
	}
	if base.InternalCompare(pc.cmp, pc.smallest, smallest) >= 0 {
		pc.smallest = smallest
	}
	if base.InternalCompare(pc.cmp, pc.largest, largest) <= 0 {
		pc.largest = largest
	}
}

func (pc *pickedCompaction) setupInputs(
	opts *Options, diskAvailBytes uint64, startLevel *compactionLevel,
) bool {
	// maxExpandedBytes is the maximum size of an expanded compaction. If
	// growing a compaction results in a larger size, the original compaction
	// is used instead.
	maxExpandedBytes := expandedCompactionByteSizeLimit(
		opts, pc.adjustedOutputLevel, diskAvailBytes,
	)

	// Expand the initial inputs to a clean cut.
	var isCompacting bool
	startLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, startLevel.files, false /* disableIsCompacting */)
	if isCompacting {
		return false
	}
	pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, startLevel.files.Iter()))

	// Determine the sstables in the output level which overlap with the input
	// sstables, and then expand those tables to a clean cut. No need to do
	// this for intra-L0 compactions; outputLevel.files is left empty for those.
	if startLevel.level != pc.outputLevel.level {
		pc.outputLevel.files = pc.version.Overlaps(pc.outputLevel.level, pc.cmp, pc.smallest.UserKey,
			pc.largest.UserKey, pc.largest.IsExclusiveSentinel())
		pc.outputLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, pc.outputLevel.files,
			false /* disableIsCompacting */)
		if isCompacting {
			return false
		}
		pc.maybeExpandBounds(manifest.KeyRange(pc.cmp,
			startLevel.files.Iter(), pc.outputLevel.files.Iter()))
	}

	// Grow the sstables in startLevel.level as long as it doesn't affect the number
	// of sstables included from pc.outputLevel.level.
	if pc.lcf != nil && startLevel.level == 0 && pc.outputLevel.level != 0 {
		// Call the L0-specific compaction extension method. Similar logic as
		// pc.grow. Additional L0 files are optionally added to the compaction at
		// this step. Note that the bounds passed in are not the bounds of the
		// compaction, but rather the smallest and largest internal keys that
		// the compaction cannot include from L0 without pulling in more Lbase
		// files. Consider this example:
		//
		// L0:            c-d e+f g-h
		// Lbase:   a-b       e+f       i-j
		//          a b c d e f g h i j
		//
		// The e-f files have already been chosen in the compaction. As pulling
		// in more Lbase files is undesirable, the logic below will pass in
		// smallest = b and largest = i to ExtendL0ForBaseCompactionTo, which
		// will expand the compaction to include c-d and g-h from L0. The
		// bounds passed in are exclusive; the compaction cannot be expanded
		// to include files that "touch" it.
		smallestBaseKey := base.InvalidInternalKey
		largestBaseKey := base.InvalidInternalKey
		if pc.outputLevel.files.Empty() {
			baseIter := pc.version.Levels[pc.outputLevel.level].Iter()
			if sm := baseIter.SeekLT(pc.cmp, pc.smallest.UserKey); sm != nil {
				smallestBaseKey = sm.Largest
			}
			if la := baseIter.SeekGE(pc.cmp, pc.largest.UserKey); la != nil {
				largestBaseKey = la.Smallest
			}
		} else {
			// NB: We use Reslice to access the underlying level's files, but
			// we discard the returned slice. The pc.outputLevel.files slice
			// is not modified.
			_ = pc.outputLevel.files.Reslice(func(start, end *manifest.LevelIterator) {
				if sm := start.Prev(); sm != nil {
					smallestBaseKey = sm.Largest
				}
				if la := end.Next(); la != nil {
					largestBaseKey = la.Smallest
				}
			})
		}

		oldLcf := *pc.lcf
		if pc.version.L0Sublevels.ExtendL0ForBaseCompactionTo(smallestBaseKey, largestBaseKey, pc.lcf) {
			var newStartLevelFiles []*fileMetadata
			iter := pc.version.Levels[0].Iter()
			var sizeSum uint64
			for j, f := 0, iter.First(); f != nil; j, f = j+1, iter.Next() {
				if pc.lcf.FilesIncluded[f.L0Index] {
					newStartLevelFiles = append(newStartLevelFiles, f)
					sizeSum += f.Size
				}
			}
			if sizeSum+pc.outputLevel.files.SizeSum() < maxExpandedBytes {
				startLevel.files = manifest.NewLevelSliceSeqSorted(newStartLevelFiles)
				pc.smallest, pc.largest = manifest.KeyRange(pc.cmp,
					startLevel.files.Iter(), pc.outputLevel.files.Iter())
			} else {
				*pc.lcf = oldLcf
			}
		}
	} else if pc.grow(pc.smallest, pc.largest, maxExpandedBytes, startLevel) {
		pc.maybeExpandBounds(manifest.KeyRange(pc.cmp,
			startLevel.files.Iter(), pc.outputLevel.files.Iter()))
	}

	if pc.startLevel.level == 0 {
		// We don't change the input files for the compaction beyond this point.
		pc.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
	}

	return true
}

// grow grows the number of inputs at c.level without changing the number of
// c.level+1 files in the compaction, and returns whether the inputs grew. sm
// and la are the smallest and largest InternalKeys in all of the inputs.
func (pc *pickedCompaction) grow(
	sm, la InternalKey, maxExpandedBytes uint64, startLevel *compactionLevel,
) bool {
	if pc.outputLevel.files.Empty() {
		return false
	}
	grow0 := pc.version.Overlaps(startLevel.level, pc.cmp, sm.UserKey,
		la.UserKey, la.IsExclusiveSentinel())
	grow0, isCompacting := expandToAtomicUnit(pc.cmp, grow0, false /* disableIsCompacting */)
	if isCompacting {
		return false
	}
	if grow0.Len() <= startLevel.files.Len() {
		return false
	}
	if grow0.SizeSum()+pc.outputLevel.files.SizeSum() >= maxExpandedBytes {
		return false
	}
	// We need to include the outputLevel iter because without it, in a multiLevel scenario,
	// sm1 and la1 could shift the output level keyspace when pc.outputLevel.files is set to grow1.
	sm1, la1 := manifest.KeyRange(pc.cmp, grow0.Iter(), pc.outputLevel.files.Iter())
	grow1 := pc.version.Overlaps(pc.outputLevel.level, pc.cmp, sm1.UserKey,
		la1.UserKey, la1.IsExclusiveSentinel())
	grow1, isCompacting = expandToAtomicUnit(pc.cmp, grow1, false /* disableIsCompacting */)
	if isCompacting {
		return false
	}
	if grow1.Len() != pc.outputLevel.files.Len() {
		return false
	}
	startLevel.files = grow0
	pc.outputLevel.files = grow1
	return true
}

// initMultiLevelCompaction returns true if it initiated a multilevel input
// compaction. This currently never inits a multiLevel compaction.
func (pc *pickedCompaction) initMultiLevelCompaction(
	opts *Options, vers *version, levelMaxBytes [7]int64, diskAvailBytes uint64,
) bool {
	return false
}

// expandToAtomicUnit expands the provided level slice within its level both
// forwards and backwards to its "atomic compaction unit" boundaries, if
// necessary.
//
// While picking compaction inputs, this is required to maintain the invariant
// that the versions of keys at level+1 are older than the versions of keys at
// level. Tables are added to the right of the current slice tables such that
// the rightmost table has a "clean cut". A clean cut is either a change in
// user keys, or when the largest key in the left sstable is a range tombstone
// sentinel key (InternalKeyRangeDeleteSentinel).
//
// In addition to maintaining the seqnum invariant, expandToAtomicUnit is used
// to provide clean boundaries for range tombstone truncation during
// compaction. In order to achieve these clean boundaries, expandToAtomicUnit
// needs to find a "clean cut" on the left edge of the compaction as well.
// This is necessary in order for "atomic compaction units" to always be
// compacted as a unit. Failure to do this leads to a subtle bug with
// truncation of range tombstones to atomic compaction unit boundaries.
// Consider the scenario:
//
//	L3:
//	  12:[a#2,15-b#1,1]
//	  13:[b#0,15-d#72057594037927935,15]
//
// These sstables contain a range tombstone [a-d)#2 which spans the two
// sstables. The two sstables need to always be kept together. Compacting
// sstable 13 independently of sstable 12 would result in:
//
//	L3:
//	  12:[a#2,15-b#1,1]
//	L4:
//	  14:[b#0,15-d#72057594037927935,15]
//
// This state is still ok, but when sstable 12 is next compacted, its range
// tombstones will be truncated at "b" (the largest key in its atomic
// compaction unit). In the scenario here, that could result in b#1 becoming
// visible when it should be deleted.
//
// isCompacting is returned true for any atomic units that contain files that
// have in-progress compactions, i.e. FileMetadata.Compacting == true. If
// disableIsCompacting is true, isCompacting always returns false. This helps
// avoid spurious races from being detected when this method is used outside
// of compaction picking code.
//
// TODO(jackson): Compactions and flushes no longer split a user key between two
// sstables. We could perform a migration, re-compacting any sstables with split
// user keys, which would allow us to remove atomic compaction unit expansion
// code.
func expandToAtomicUnit(
	cmp Compare, inputs manifest.LevelSlice, disableIsCompacting bool,
) (slice manifest.LevelSlice, isCompacting bool) {
	// NB: Inputs for L0 can't be expanded and *version.Overlaps guarantees
	// that we get a 'clean cut.' For L0, Overlaps will return a slice without
	// access to the rest of the L0 files, so it's OK to try to reslice.
	if inputs.Empty() {
		// Nothing to expand.
		return inputs, false
	}

	inputs = inputs.Reslice(func(start, end *manifest.LevelIterator) {
		iter := start.Clone()
		iter.Prev()
		for cur, prev := start.Current(), iter.Current(); prev != nil; cur, prev = start.Prev(), iter.Prev() {
			if cur.IsCompacting() {
				isCompacting = true
			}
			if cmp(prev.Largest.UserKey, cur.Smallest.UserKey) < 0 {
				break
			}
			if prev.Largest.IsExclusiveSentinel() {
				// The table prev has a largest key indicating that the user key
				// prev.largest.UserKey doesn't actually exist in the table.
				break
			}
			// prev.Largest.UserKey == cur.Smallest.UserKey, so we need to
			// include prev in the compaction.
		}

		iter = end.Clone()
		iter.Next()
		for cur, next := end.Current(), iter.Current(); next != nil; cur, next = end.Next(), iter.Next() {
			if cur.IsCompacting() {
				isCompacting = true
			}
			if cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 {
				break
			}
			if cur.Largest.IsExclusiveSentinel() {
				// The table cur has a largest key indicating that the user key
				// cur.largest.UserKey doesn't actually exist in the table.
				break
			}
			// cur.Largest.UserKey == next.Smallest.UserKey, so we need to
			// include next in the compaction.
		}
	})
	inputIter := inputs.Iter()
	isCompacting = !disableIsCompacting &&
		(isCompacting || inputIter.First().IsCompacting() || inputIter.Last().IsCompacting())
	return inputs, isCompacting
}

func newCompactionPicker(
	v *version,
	opts *Options,
	inProgressCompactions []compactionInfo,
	levelSizes [numLevels]int64,
	diskAvailBytes func() uint64,
) compactionPicker {
	p := &compactionPickerByScore{
		opts:           opts,
		vers:           v,
		levelSizes:     levelSizes,
		diskAvailBytes: diskAvailBytes,
	}
	p.initLevelMaxBytes(inProgressCompactions)
	return p
}

// Information about a candidate compaction level that has been identified by
// the compaction picker.
type candidateLevelInfo struct {
	// The score of the level to be compacted.
	score     float64
	origScore float64
	level     int
	// The level to compact to.
	outputLevel int
	// The file in level that will be compacted. Additional files may be
	// picked by the compaction, and a pickedCompaction created for the
	// compaction.
	file manifest.LevelFile
}

// compensatedSize returns f's file size, inflated according to compaction
// priorities.
func compensatedSize(f *fileMetadata) uint64 {
	sz := f.Size
	// Add in the estimate of disk space that may be reclaimed by compacting
	// the file's tombstones.
	sz += f.Stats.PointDeletionsBytesEstimate
	sz += f.Stats.RangeDeletionsBytesEstimate
	return sz
}
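
// Illustrative example (not from the original source): a 64 MB table whose
// tombstones are estimated to reclaim 6 MB of point-deleted data and 10 MB of
// range-deleted data has a compensated size of 80 MB, so it is treated as
// larger, and therefore more attractive to compact, than its on-disk size
// alone would suggest.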

// compensatedSizeAnnotator implements manifest.Annotator, annotating B-Tree
// nodes with the sum of the files' compensated sizes. Its annotation type is
// a *uint64. Compensated sizes may change once a table's stats are loaded
// asynchronously, so its values are marked as cacheable only if a file's
// stats have been loaded.
type compensatedSizeAnnotator struct{}

var _ manifest.Annotator = compensatedSizeAnnotator{}

func (a compensatedSizeAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a compensatedSizeAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + compensatedSize(f)
	return vptr, f.StatsValidLocked()
}

func (a compensatedSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// totalCompensatedSize computes the compensated size over a file metadata
// iterator. Note that this function is linear in the files available to the
// iterator. Use the compensatedSizeAnnotator if querying the total
// compensated size of a level.
func totalCompensatedSize(iter manifest.LevelIterator) uint64 {
	var sz uint64
	for f := iter.First(); f != nil; f = iter.Next() {
		sz += compensatedSize(f)
	}
	return sz
}

// compactionPickerByScore holds the state and logic for picking a compaction. A
// compaction picker is associated with a single version. A new compaction
// picker is created and initialized every time a new version is installed.
type compactionPickerByScore struct {
	opts *Options
	vers *version

	// The level to target for L0 compactions. Levels L1 to baseLevel must be
	// empty.
	baseLevel int

	// estimatedMaxWAmp is the estimated maximum write amp per byte that is
	// added to L0.
	estimatedMaxWAmp float64

	// levelMaxBytes holds the dynamically adjusted max bytes setting for each
	// level.
	levelMaxBytes [numLevels]int64

	// levelSizes holds the current size of each level.
	levelSizes [numLevels]int64

	// diskAvailBytes returns a cached statistic on the number of bytes
	// available on disk, as reported by the filesystem. It's used to be more
	// restrictive in expanding compactions if available disk space is
	// limited.
	//
	// The cached value is updated whenever a file is deleted and
	// whenever a compaction or flush completes. Since file removal is
	// the primary means of reclaiming space, there is a rough bound on
	// the statistic's staleness when available bytes is growing.
	// Compactions and flushes are longer, slower operations and provide
	// a much looser bound when available bytes is decreasing.
	diskAvailBytes func() uint64
}

var _ compactionPicker = &compactionPickerByScore{}

func (p *compactionPickerByScore) getScores(inProgress []compactionInfo) [numLevels]float64 {
	var scores [numLevels]float64
	for _, info := range p.calculateScores(inProgress) {
		scores[info.level] = info.score
	}
	return scores
}

func (p *compactionPickerByScore) getBaseLevel() int {
	if p == nil {
		return 1
	}
	return p.baseLevel
}

func (p *compactionPickerByScore) getEstimatedMaxWAmp() float64 {
	return p.estimatedMaxWAmp
}
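
// Illustrative walkthrough of the debt estimate below (not from the original
// source), assuming baseLevel = L3 with levelMaxBytes[L3] = 64 MB: with 32 MB
// in L0 and 80 MB in L3, the L0->L3 step contributes 32 MB + 80 MB = 112 MB of
// debt. L3 then notionally holds 112 MB, which is 48 MB over its target; if L4
// holds 480 MB, the L3->L4 step adds roughly 48 MB * (480/112 + 1), about
// 254 MB, and so on down the LSM until a level is within its target size or
// the next level is empty.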

// estimatedCompactionDebt estimates the number of bytes which need to be
// compacted before the LSM tree becomes stable.
func (p *compactionPickerByScore) estimatedCompactionDebt(l0ExtraSize uint64) uint64 {
	if p == nil {
		return 0
	}

	// We assume that all the bytes in L0 need to be compacted to Lbase. This is
	// unlike the RocksDB logic that figures out whether L0 needs compaction.
	bytesAddedToNextLevel := l0ExtraSize + uint64(p.levelSizes[0])
	nextLevelSize := uint64(p.levelSizes[p.baseLevel])

	var compactionDebt uint64
	if bytesAddedToNextLevel > 0 && nextLevelSize > 0 {
		// We only incur compaction debt if both L0 and Lbase contain data. If L0
		// is empty, no compaction is necessary. If Lbase is empty, a move-based
		// compaction from L0 would occur.
		compactionDebt += bytesAddedToNextLevel + nextLevelSize
	}

	for level := p.baseLevel; level < numLevels-1; level++ {
		levelSize := nextLevelSize + bytesAddedToNextLevel
		nextLevelSize = uint64(p.levelSizes[level+1])
		if levelSize > uint64(p.levelMaxBytes[level]) {
			bytesAddedToNextLevel = levelSize - uint64(p.levelMaxBytes[level])
			if nextLevelSize > 0 {
				// We only incur compaction debt if the next level contains data. If the
				// next level is empty, a move-based compaction would be used.
				levelRatio := float64(nextLevelSize) / float64(levelSize)
				// The current level contributes bytesAddedToNextLevel to compactions.
				// The next level contributes levelRatio * bytesAddedToNextLevel.
				compactionDebt += uint64(float64(bytesAddedToNextLevel) * (levelRatio + 1))
			}
		}
	}

	return compactionDebt
}

func (p *compactionPickerByScore) initLevelMaxBytes(inProgressCompactions []compactionInfo) {
	// The levelMaxBytes calculations here differ from RocksDB in two ways:
	//
	// 1. The use of dbSize vs maxLevelSize. RocksDB uses the size of the maximum
	//    level in L1-L6, rather than determining the size of the bottom level
	//    based on the total amount of data in the DB. The RocksDB calculation is
	//    problematic if L0 contains a significant fraction of data, or if the
	//    level sizes are roughly equal and thus there is a significant fraction
	//    of data outside of the largest level.
	//
	// 2. Not adjusting the size of Lbase based on L0. RocksDB computes
	//    baseBytesMax as the maximum of the configured LBaseMaxBytes and the
	//    size of L0. This is problematic because baseBytesMax is used to compute
	//    the max size of lower levels. A very large baseBytesMax will result in
	//    an overly large value for the size of lower levels, which will cause
	//    those levels not to be compacted even when they should be
	//    compacted. This often results in "inverted" LSM shapes where Ln is
	//    larger than Ln+1.

	// Determine the first non-empty level and the total DB size.
	firstNonEmptyLevel := -1
	var dbSize int64
	for level := 1; level < numLevels; level++ {
		if p.levelSizes[level] > 0 {
			if firstNonEmptyLevel == -1 {
				firstNonEmptyLevel = level
			}
			dbSize += p.levelSizes[level]
		}
	}
	for _, c := range inProgressCompactions {
		if c.outputLevel == 0 || c.outputLevel == -1 {
			continue
		}
		if c.inputs[0].level == 0 && (firstNonEmptyLevel == -1 || c.outputLevel < firstNonEmptyLevel) {
			firstNonEmptyLevel = c.outputLevel
		}
	}

	// Initialize the max-bytes setting for each level to "infinity" which will
	// disallow compaction for that level. We'll fill in the actual value below
	// for levels we want to allow compactions from.
	for level := 0; level < numLevels; level++ {
		p.levelMaxBytes[level] = math.MaxInt64
	}

	if dbSize == 0 {
		// No level from L1 and up contains any data. Target L0 compactions at the
		// last level, or at the level to which there is an ongoing L0 compaction.
		p.baseLevel = numLevels - 1
		if firstNonEmptyLevel >= 0 {
			p.baseLevel = firstNonEmptyLevel
		}
		return
	}

	dbSize += p.levelSizes[0]
	bottomLevelSize := dbSize - dbSize/levelMultiplier

	curLevelSize := bottomLevelSize
	for level := numLevels - 2; level >= firstNonEmptyLevel; level-- {
		curLevelSize = int64(float64(curLevelSize) / levelMultiplier)
	}

	// Compute base level (where L0 data is compacted to).
	baseBytesMax := p.opts.LBaseMaxBytes
	p.baseLevel = firstNonEmptyLevel
	for p.baseLevel > 1 && curLevelSize > baseBytesMax {
		p.baseLevel--
		curLevelSize = int64(float64(curLevelSize) / levelMultiplier)
	}

	smoothedLevelMultiplier := 1.0
	if p.baseLevel < numLevels-1 {
		smoothedLevelMultiplier = math.Pow(
			float64(bottomLevelSize)/float64(baseBytesMax),
			1.0/float64(numLevels-p.baseLevel-1))
	}

	p.estimatedMaxWAmp = float64(numLevels-p.baseLevel) * (smoothedLevelMultiplier + 1)

	levelSize := float64(baseBytesMax)
	for level := p.baseLevel; level < numLevels; level++ {
		if level > p.baseLevel && levelSize > 0 {
			levelSize *= smoothedLevelMultiplier
		}
		// Round the result since test cases use small target level sizes, which
		// can be impacted by floating-point imprecision + integer truncation.
		roundedLevelSize := math.Round(levelSize)
		if roundedLevelSize > float64(math.MaxInt64) {
			p.levelMaxBytes[level] = math.MaxInt64
		} else {
			p.levelMaxBytes[level] = int64(roundedLevelSize)
		}
	}
}

func calculateSizeAdjust(inProgressCompactions []compactionInfo) [numLevels]int64 {
	// Compute a size adjustment for each level based on the in-progress
	// compactions. We subtract the compensated size of start level inputs.
	// Since compensated file sizes may be compensated because they reclaim
	// space from the output level's files, we add the real file size to the
	// output level. This is slightly different from RocksDB's behavior, which
	// simply elides compacting files from the level size calculation.
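	//
	// Illustrative example (not from the original source): an in-progress
	// L3->L4 compaction whose L3 inputs have a real size of 100 MB and a
	// compensated size of 120 MB subtracts 120 MB from L3's adjusted size and
	// adds 100 MB to L4's, so the scores computed below already reflect where
	// those bytes are headed.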
	var sizeAdjust [numLevels]int64
	for i := range inProgressCompactions {
		c := &inProgressCompactions[i]

		for _, input := range c.inputs {
			real := int64(input.files.SizeSum())
			compensated := int64(totalCompensatedSize(input.files.Iter()))

			if input.level != c.outputLevel {
				sizeAdjust[input.level] -= compensated
				if c.outputLevel != -1 {
					sizeAdjust[c.outputLevel] += real
				}
			}
		}
	}
	return sizeAdjust
}

func levelCompensatedSize(lm manifest.LevelMetadata) uint64 {
	return *lm.Annotation(compensatedSizeAnnotator{}).(*uint64)
}

func (p *compactionPickerByScore) calculateScores(
	inProgressCompactions []compactionInfo,
) [numLevels]candidateLevelInfo {
	var scores [numLevels]candidateLevelInfo
	for i := range scores {
		scores[i].level = i
		scores[i].outputLevel = i + 1
	}
	scores[0] = p.calculateL0Score(inProgressCompactions)

	sizeAdjust := calculateSizeAdjust(inProgressCompactions)
	for level := 1; level < numLevels; level++ {
		levelSize := int64(levelCompensatedSize(p.vers.Levels[level])) + sizeAdjust[level]
		scores[level].score = float64(levelSize) / float64(p.levelMaxBytes[level])
		scores[level].origScore = scores[level].score
	}

	// Adjust each level's score by the score of the next level. If the next
	// level has a high score, and is thus a priority for compaction, this
	// reduces the priority for compacting the current level. If the next level
	// has a low score (i.e. it is below its target size), this increases the
	// priority for compacting the current level.
	//
	// The effect of this adjustment is to help prioritize compactions in lower
	// levels. The following shows the new score and original score. In this
	// scenario, L0 has 68 sublevels. L3 (a.k.a. Lbase) is significantly above
	// its target size. The original score prioritizes compactions from those two
	// levels, but doing so ends up causing a future problem: data piles up in
	// the higher levels, starving L5->L6 compactions, and to a lesser degree
	// starving L4->L5 compactions.
	//
	//        adjusted   original
	//        score      score     size     max-size
	//   L0     3.2       68.0     2.2 G       -
	//   L3     3.2       21.1     1.3 G      64 M
	//   L4     3.4        6.7     3.1 G     467 M
	//   L5     3.4        2.0     6.6 G     3.3 G
	//   L6     0.6        0.6      14 G      24 G
	var prevLevel int
	for level := p.baseLevel; level < numLevels; level++ {
		if scores[prevLevel].score >= 1 {
			// Avoid absurdly large scores by placing a floor on the score that we'll
			// adjust a level by. The value of 0.01 was chosen somewhat arbitrarily.
			const minScore = 0.01
			if scores[level].score >= minScore {
				scores[prevLevel].score /= scores[level].score
			} else {
				scores[prevLevel].score /= minScore
			}
		}
		prevLevel = level
	}

	sort.Sort(sortCompactionLevelsDecreasingScore(scores[:]))
	return scores
}

func (p *compactionPickerByScore) calculateL0Score(
	inProgressCompactions []compactionInfo,
) candidateLevelInfo {
	var info candidateLevelInfo
	info.outputLevel = p.baseLevel

	// If L0Sublevels are present, use the sublevel count to calculate the
	// score. The base vs intra-L0 compaction determination happens in pickAuto,
	// not here.
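	//
	// Illustrative example (not from the original source): with
	// L0CompactionThreshold = 4 and a maximum sublevel depth of 6 after
	// accounting for ongoing compactions, the sublevel-based score below is
	// 2*6/4 = 3.0.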
	info.score = float64(2*p.vers.L0Sublevels.MaxDepthAfterOngoingCompactions()) /
		float64(p.opts.L0CompactionThreshold)

	// Also calculate a score based on the file count but use it only if it
	// produces a higher score than the sublevel-based one. This heuristic is
	// designed to accommodate cases where L0 is accumulating non-overlapping
	// files. Letting too many non-overlapping files accumulate in few
	// sublevels is undesirable, because:
	// 1) we can produce a massive backlog to compact once files do overlap.
	// 2) constructing L0 sublevels has a runtime that grows superlinearly with
	//    the number of files in L0 and must be done while holding D.mu.
	noncompactingFiles := p.vers.Levels[0].Len()
	for _, c := range inProgressCompactions {
		for _, cl := range c.inputs {
			if cl.level == 0 {
				noncompactingFiles -= cl.files.Len()
			}
		}
	}
	fileScore := float64(noncompactingFiles) / float64(p.opts.L0CompactionFileThreshold)
	if info.score < fileScore {
		info.score = fileScore
	}
	return info
}

func (p *compactionPickerByScore) pickFile(
	level, outputLevel int, earliestSnapshotSeqNum uint64,
) (manifest.LevelFile, bool) {
	// Select the file within the level to compact. We want to minimize write
	// amplification, but also ensure that deletes are propagated to the
	// bottom level in a timely fashion so as to reclaim disk space. A table's
	// smallest sequence number provides a measure of its age. The ratio of
	// overlapping-bytes / table-size gives an indication of write
	// amplification (a smaller ratio is preferable).
	//
	// The current heuristic is based on the RocksDB kMinOverlappingRatio
	// heuristic. It chooses the file with the minimum overlapping ratio with
	// the target level, which minimizes write amplification.
	//
	// It uses a "compensated size" for the denominator, which is the file
	// size but artificially inflated by an estimate of the space that may be
	// reclaimed through compaction. Currently, we only compensate for range
	// deletions and only with a rough estimate of the reclaimable bytes. This
	// differs from RocksDB which only compensates for point tombstones and
	// only if they exceed the number of non-deletion entries in the table.
	//
	// TODO(peter): For concurrent compactions, we may want to try harder to
	// pick a seed file whose resulting compaction bounds do not overlap with
	// an in-progress compaction.

	cmp := p.opts.Comparer.Compare
	startIter := p.vers.Levels[level].Iter()
	outputIter := p.vers.Levels[outputLevel].Iter()

	var file manifest.LevelFile
	smallestRatio := uint64(math.MaxUint64)

	outputFile := outputIter.First()
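
	// The loop below is effectively a merge over the two sorted levels: for
	// each candidate file f in the input level it accumulates the sizes of
	// the output-level files overlapping f, then keeps the candidate with the
	// smallest overlapping-bytes to compensated-size ratio (scaled by 1024 to
	// stay in integer arithmetic). For illustration (not from the original
	// source): a 32 MB file overlapping 64 MB of the output level scores
	// 64*1024/32 = 2048 and is preferred over a 16 MB file overlapping 96 MB,
	// which scores 6144.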
	for f := startIter.First(); f != nil; f = startIter.Next() {
		var overlappingBytes uint64

		// Trim any output-level files smaller than f.
		for outputFile != nil && base.InternalCompare(cmp, outputFile.Largest, f.Smallest) < 0 {
			outputFile = outputIter.Next()
		}

		compacting := f.IsCompacting()
		for outputFile != nil && base.InternalCompare(cmp, outputFile.Smallest, f.Largest) < 0 {
			overlappingBytes += outputFile.Size
			compacting = compacting || outputFile.IsCompacting()

			// For files in the bottommost level of the LSM, the
			// Stats.RangeDeletionsBytesEstimate field is set to the estimate
			// of bytes /within/ the file itself that may be dropped by
			// recompacting the file. These bytes from obsolete keys would not
			// need to be rewritten if we compacted `f` into `outputFile`, so
			// they don't contribute to write amplification. Subtracting them
			// out of the overlapping bytes helps prioritize these compactions
			// that are cheaper than their file sizes suggest.
			if outputLevel == numLevels-1 && outputFile.LargestSeqNum < earliestSnapshotSeqNum {
				overlappingBytes -= outputFile.Stats.RangeDeletionsBytesEstimate
			}

			// If the file in the next level extends beyond f's largest key,
			// break out and don't advance outputIter because f's successor
			// might also overlap.
			if base.InternalCompare(cmp, outputFile.Largest, f.Largest) > 0 {
				break
			}
			outputFile = outputIter.Next()
		}

		// If the input level file or one of the overlapping files is
		// compacting, we're not going to be able to compact this file
		// anyways, so skip it.
		if compacting {
			continue
		}

		scaledRatio := overlappingBytes * 1024 / compensatedSize(f)
		if scaledRatio < smallestRatio && !f.IsCompacting() {
			smallestRatio = scaledRatio
			file = startIter.Take()
		}
	}
	return file, file.FileMetadata != nil
}

// pickAuto picks the best compaction, if any.
//
// On each call, pickAuto computes per-level size adjustments based on
// in-progress compactions, and computes a per-level score. The levels are
// iterated over in decreasing score order trying to find a valid compaction
// anchored at that level.
//
// If a score-based compaction cannot be found, pickAuto falls back to looking
// for an elision-only compaction to remove obsolete keys.
func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompaction) {
	// Compaction concurrency is controlled by L0 read-amp. We allow one
	// additional compaction per L0CompactionConcurrency sublevels, as well as
	// one additional compaction per CompactionDebtConcurrency bytes of
	// compaction debt. Compaction concurrency is tied to L0 sublevels as that
	// signal is independent of the database size. We tack on the compaction
	// debt as a second signal to prevent compaction concurrency from dropping
	// significantly right after a base compaction finishes, and before those
	// bytes have been compacted further down the LSM.
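	//
	// Illustrative example (not from the original source): with two
	// compactions already in progress, L0CompactionConcurrency = 10 and
	// CompactionDebtConcurrency = 1 GB, a third compaction is only picked if
	// the L0 read-amp is at least 20 sublevels or the estimated compaction
	// debt is at least 2 GB; otherwise the check below returns nil.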
	if n := len(env.inProgressCompactions); n > 0 {
		l0ReadAmp := p.vers.L0Sublevels.MaxDepthAfterOngoingCompactions()
		compactionDebt := int(p.estimatedCompactionDebt(0))
		ccSignal1 := n * p.opts.Experimental.L0CompactionConcurrency
		ccSignal2 := n * p.opts.Experimental.CompactionDebtConcurrency
		if l0ReadAmp < ccSignal1 && compactionDebt < ccSignal2 {
			return nil
		}
	}

	scores := p.calculateScores(env.inProgressCompactions)

	// TODO(peter): Either remove, or change this into an event sent to the
	// EventListener.
	logCompaction := func(pc *pickedCompaction) {
		var buf bytes.Buffer
		for i := 0; i < numLevels; i++ {
			if i != 0 && i < p.baseLevel {
				continue
			}

			var info *candidateLevelInfo
			for j := range scores {
				if scores[j].level == i {
					info = &scores[j]
					break
				}
			}

			marker := " "
			if pc.startLevel.level == info.level {
				marker = "*"
			}
			fmt.Fprintf(&buf, " %sL%d: %5.1f %5.1f %8s %8s",
				marker, info.level, info.score, info.origScore,
				humanize.Int64(int64(totalCompensatedSize(p.vers.Levels[info.level].Iter()))),
				humanize.Int64(p.levelMaxBytes[info.level]),
			)

			count := 0
			for i := range env.inProgressCompactions {
				c := &env.inProgressCompactions[i]
				if c.inputs[0].level != info.level {
					continue
				}
				count++
				if count == 1 {
					fmt.Fprintf(&buf, " [")
				} else {
					fmt.Fprintf(&buf, " ")
				}
				fmt.Fprintf(&buf, "L%d->L%d", c.inputs[0].level, c.outputLevel)
			}
			if count > 0 {
				fmt.Fprintf(&buf, "]")
			}
			fmt.Fprintf(&buf, "\n")
		}
		p.opts.Logger.Infof("pickAuto: L%d->L%d\n%s",
			pc.startLevel.level, pc.outputLevel.level, buf.String())
	}

	// Check for a score-based compaction. "scores" has been sorted in order of
	// decreasing score. For each level with a score >= 1, we attempt to find a
	// compaction anchored at that level.
	for i := range scores {
		info := &scores[i]
		if info.score < 1 {
			break
		}
		if info.level == numLevels-1 {
			continue
		}

		if info.level == 0 {
			pc = pickL0(env, p.opts, p.vers, p.baseLevel, p.diskAvailBytes)
			// Fail-safe to protect against compacting the same sstable
			// concurrently.
			if pc != nil && !inputRangeAlreadyCompacting(env, pc) {
				pc.score = info.score
				// TODO(peter): remove
				if false {
					logCompaction(pc)
				}
				return pc
			}
			continue
		}

		// info.level > 0
		var ok bool
		info.file, ok = p.pickFile(info.level, info.outputLevel, env.earliestSnapshotSeqNum)
		if !ok {
			continue
		}

		pc := pickAutoLPositive(env, p.opts, p.vers, *info, p.baseLevel, p.diskAvailBytes, p.levelMaxBytes)
		// Fail-safe to protect against compacting the same sstable concurrently.
		if pc != nil && !inputRangeAlreadyCompacting(env, pc) {
			pc.score = info.score
			// TODO(peter): remove
			if false {
				logCompaction(pc)
			}
			return pc
		}
	}

	// Check for L6 files with tombstones that may be elided. These files may
	// exist if a snapshot prevented the elision of a tombstone or because of
	// a move compaction. These are low-priority compactions because they
	// don't help us keep up with writes, just reclaim disk space.
	if pc := p.pickElisionOnlyCompaction(env); pc != nil {
		return pc
	}

	if pc := p.pickReadTriggeredCompaction(env); pc != nil {
		return pc
	}

	// NB: This should only be run if a read compaction wasn't
	// scheduled.
	//
	// We won't be scheduling a read compaction right now, and in
	// read heavy workloads, compactions won't be scheduled frequently
	// because flushes aren't frequent. So we need to signal to the
	// iterator to schedule a compaction when it adds compactions to
	// the read compaction queue.
	//
	// We need the nil check here because, without it, some tests that
	// don't set this variable would fail. Since one of those tests
	// might not want extra compactions to be scheduled, the check
	// lives here instead of setting rescheduleReadCompaction in those
	// tests.
	if env.readCompactionEnv.rescheduleReadCompaction != nil {
		*env.readCompactionEnv.rescheduleReadCompaction = true
	}

	// At the lowest possible compaction-picking priority, look for files marked
	// for compaction. Pebble will mark files for compaction if they have atomic
	// compaction units that span multiple files. While current Pebble code does
	// not construct such sstables, RocksDB and earlier versions of Pebble may
	// have created them. These split user keys form sets of files that must be
	// compacted together for correctness (referred to as "atomic compaction
	// units" within the code). Rewrite them in-place.
	//
	// It's also possible that a file may have been marked for compaction by
	// even earlier versions of Pebble code, since FileMetadata's
	// MarkedForCompaction field is persisted in the manifest. That's okay. We
	// previously would've ignored the designation, whereas now we'll re-compact
	// the file in place.
	if p.vers.Stats.MarkedForCompaction > 0 {
		if pc := p.pickRewriteCompaction(env); pc != nil {
			return pc
		}
	}

	return nil
}

// elisionOnlyAnnotator implements the manifest.Annotator interface,
// annotating B-Tree nodes with the *fileMetadata of a file meeting the
// obsolete keys criteria for an elision-only compaction within the subtree.
// If multiple files meet the criteria, it chooses whichever file has the
// lowest LargestSeqNum. The lowest LargestSeqNum file will be the first
// eligible for an elision-only compaction once snapshots less than or equal
// to its LargestSeqNum are closed.
type elisionOnlyAnnotator struct{}

var _ manifest.Annotator = elisionOnlyAnnotator{}

func (a elisionOnlyAnnotator) Zero(interface{}) interface{} {
	return nil
}

func (a elisionOnlyAnnotator) Accumulate(f *fileMetadata, dst interface{}) (interface{}, bool) {
	if f.IsCompacting() {
		return dst, true
	}
	if !f.StatsValidLocked() {
		return dst, false
	}
	// Bottommost files are large and not worthwhile to compact just
	// to remove a few tombstones. Consider a file ineligible if its
	// own range deletions delete less than 10% of its data and its
	// deletion tombstones make up less than 10% of its entries.
	//
	// TODO(jackson): This does not account for duplicate user keys
	// which may be collapsed. Ideally, we would have 'obsolete keys'
	// statistics that would include tombstones, the keys that are
	// dropped by tombstones and duplicated user keys. See #847.
	//
	// Note that tables that contain exclusively range keys (i.e. no point keys,
	// `NumEntries` and `RangeDeletionsBytesEstimate` are both zero) are excluded
	// from elision-only compactions.
	// TODO(travers): Consider an alternative heuristic for elision of range-keys.
	if f.Stats.RangeDeletionsBytesEstimate*10 < f.Size &&
		f.Stats.NumDeletions*10 <= f.Stats.NumEntries {
		return dst, true
	}
	if dst == nil {
		return f, true
	} else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum {
		return f, true
	}
	return dst, true
}

func (a elisionOnlyAnnotator) Merge(v interface{}, accum interface{}) interface{} {
	if v == nil {
		return accum
	}
	// If we haven't accumulated an eligible file yet, or f's LargestSeqNum is
	// less than the accumulated file's, use f.
	if accum == nil {
		return v
	}
	f := v.(*fileMetadata)
	accumV := accum.(*fileMetadata)
	if accumV == nil || accumV.LargestSeqNum > f.LargestSeqNum {
		return f
	}
	return accumV
}

// markedForCompactionAnnotator implements the manifest.Annotator interface,
// annotating B-Tree nodes with the *fileMetadata of a file that is marked for
// compaction within the subtree. If multiple files meet the criteria, it
// chooses whichever file has the lowest LargestSeqNum.
type markedForCompactionAnnotator struct{}

var _ manifest.Annotator = markedForCompactionAnnotator{}

func (a markedForCompactionAnnotator) Zero(interface{}) interface{} {
	return nil
}

func (a markedForCompactionAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (interface{}, bool) {
	if !f.MarkedForCompaction {
		// Not marked for compaction; return dst.
		return dst, true
	}
	return markedMergeHelper(f, dst)
}

func (a markedForCompactionAnnotator) Merge(v interface{}, accum interface{}) interface{} {
	if v == nil {
		return accum
	}
	accum, _ = markedMergeHelper(v.(*fileMetadata), accum)
	return accum
}

// REQUIRES: f is non-nil, and f.MarkedForCompaction=true.
func markedMergeHelper(f *fileMetadata, dst interface{}) (interface{}, bool) {
	if dst == nil {
		return f, true
	} else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum {
		return f, true
	}
	return dst, true
}

// pickElisionOnlyCompaction looks for compactions of sstables in the
// bottommost level containing obsolete records that may now be dropped.
func (p *compactionPickerByScore) pickElisionOnlyCompaction(
	env compactionEnv,
) (pc *pickedCompaction) {
	v := p.vers.Levels[numLevels-1].Annotation(elisionOnlyAnnotator{})
	if v == nil {
		return nil
	}
	candidate := v.(*fileMetadata)
	if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
		return nil
	}
	lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate)
	if lf == nil {
		panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
	}

	// Construct a picked compaction of the elision candidate's atomic
	// compaction unit.
	pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel)
	pc.kind = compactionKindElisionOnly
	var isCompacting bool
	pc.startLevel.files, isCompacting = expandToAtomicUnit(p.opts.Comparer.Compare, lf.Slice(), false /* disableIsCompacting */)
	if isCompacting {
		return nil
	}
	pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
	// Fail-safe to protect against compacting the same sstable concurrently.
	if !inputRangeAlreadyCompacting(env, pc) {
		return pc
	}
	return nil
}

// pickRewriteCompaction attempts to construct a compaction that
// rewrites a file marked for compaction. pickRewriteCompaction will
// pull in adjacent files in the file's atomic compaction unit if
// necessary. A rewrite compaction outputs files to the same level as
// the input level.
func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) {
	for l := numLevels - 1; l >= 0; l-- {
		v := p.vers.Levels[l].Annotation(markedForCompactionAnnotator{})
		if v == nil {
			// Try the next level.
			continue
		}
		candidate := v.(*fileMetadata)
		if candidate.IsCompacting() {
			// Try the next level.
			continue
		}
		lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate)
		if lf == nil {
			panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, l))
		}

		inputs := lf.Slice()
		// L0 files generated by a flush have never been split such that
		// adjacent files can contain the same user key. So we do not need to
		// rewrite an atomic compaction unit for L0. Note that there is nothing
		// preventing two different flushes from producing files that are
		// non-overlapping from an InternalKey perspective, but span the same
		// user key. However, such files cannot be in the same L0 sublevel,
		// since each sublevel requires non-overlapping user keys (unlike other
		// levels).
		if l > 0 {
			// Find this file's atomic compaction unit. This is only relevant
			// for levels L1+.
			var isCompacting bool
			inputs, isCompacting = expandToAtomicUnit(
				p.opts.Comparer.Compare,
				inputs,
				false, /* disableIsCompacting */
			)
			if isCompacting {
				// Try the next level.
				continue
			}
		}

		pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel)
		pc.outputLevel.level = l
		pc.kind = compactionKindRewrite
		pc.startLevel.files = inputs
		pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())

		// Fail-safe to protect against compacting the same sstable concurrently.
		if !inputRangeAlreadyCompacting(env, pc) {
			if pc.startLevel.level == 0 {
				pc.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
			}
			return pc
		}
	}
	return nil
}

// pickAutoLPositive picks an automatic compaction for the candidate
// file in a positive-numbered level. This function must not be used for
// L0.
func pickAutoLPositive(
	env compactionEnv,
	opts *Options,
	vers *version,
	cInfo candidateLevelInfo,
	baseLevel int,
	diskAvailBytes func() uint64,
	levelMaxBytes [7]int64,
) (pc *pickedCompaction) {
	if cInfo.level == 0 {
		panic("bitalostable: pickAutoLPositive called for L0")
	}

	pc = newPickedCompaction(opts, vers, cInfo.level, defaultOutputLevel(cInfo.level, baseLevel), baseLevel)
	if pc.outputLevel.level != cInfo.outputLevel {
		panic("bitalostable: compaction picked unexpected output level")
	}
	pc.startLevel.files = cInfo.file.Slice()
	// Files in level 0 may overlap each other, so pick up all overlapping ones.
	if pc.startLevel.level == 0 {
		cmp := opts.Comparer.Compare
		smallest, largest := manifest.KeyRange(cmp, pc.startLevel.files.Iter())
		pc.startLevel.files = vers.Overlaps(0, cmp, smallest.UserKey,
			largest.UserKey, largest.IsExclusiveSentinel())
		if pc.startLevel.files.Empty() {
			panic("bitalostable: empty compaction")
		}
	}

	if !pc.setupInputs(opts, diskAvailBytes(), pc.startLevel) {
		return nil
	}
	if opts.Experimental.MultiLevelCompaction &&
		pc.initMultiLevelCompaction(opts, vers, levelMaxBytes, diskAvailBytes()) {
		if !pc.setupInputs(opts, diskAvailBytes(), pc.extraLevels[len(pc.extraLevels)-1]) {
			return nil
		}
	}
	return pc
}

// Helper method to pick compactions originating from L0. Uses information about
// sublevels to generate a compaction.
func pickL0(
	env compactionEnv, opts *Options, vers *version, baseLevel int, diskAvailBytes func() uint64,
) (pc *pickedCompaction) {
	// It is important to pass information about Lbase files to L0Sublevels
	// so it can pick a compaction that does not conflict with an Lbase => Lbase+1
	// compaction. Without this, we observed reduced concurrency of L0=>Lbase
	// compactions, and increasing read amplification in L0.
	//
	// TODO(bilal) Remove the minCompactionDepth parameter once fixing it at 1
	// has been shown to not cause a performance regression.
	lcf, err := vers.L0Sublevels.PickBaseCompaction(1, vers.Levels[baseLevel].Slice())
	if err != nil {
		opts.Logger.Infof("error when picking base compaction: %s", err)
		return
	}
	if lcf != nil {
		pc = newPickedCompactionFromL0(lcf, opts, vers, baseLevel, true)
		pc.setupInputs(opts, diskAvailBytes(), pc.startLevel)
		if pc.startLevel.files.Empty() {
			opts.Logger.Fatalf("empty compaction chosen")
		}
		return pc
	}

	// Couldn't choose a base compaction. Try choosing an intra-L0
	// compaction. Note that we pass in minIntraL0Count here as opposed to
	// 1, since choosing a single sublevel intra-L0 compaction is
	// counterproductive.
	lcf, err = vers.L0Sublevels.PickIntraL0Compaction(env.earliestUnflushedSeqNum, minIntraL0Count)
	if err != nil {
		opts.Logger.Infof("error when picking intra-L0 compaction: %s", err)
		return
	}
	if lcf != nil {
		pc = newPickedCompactionFromL0(lcf, opts, vers, 0, false)
		if !pc.setupInputs(opts, diskAvailBytes(), pc.startLevel) {
			return nil
		}
		if pc.startLevel.files.Empty() {
			opts.Logger.Fatalf("empty compaction chosen")
		}
		{
			iter := pc.startLevel.files.Iter()
			if iter.First() == nil || iter.Next() == nil {
				// A single-file intra-L0 compaction is unproductive.
1507                 return nil
1508             }
1509         }
1510 
1511         pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
1512     }
1513     return pc
1514 }
1515 
1516 func (p *compactionPickerByScore) pickManual(
1517     env compactionEnv, manual *manualCompaction,
1518 ) (pc *pickedCompaction, retryLater bool) {
1519     if p == nil {
1520         return nil, false
1521     }
1522 
1523     outputLevel := manual.level + 1
1524     if manual.level == 0 {
1525         outputLevel = p.baseLevel
1526     } else if manual.level < p.baseLevel {
1527         // The start level for a compaction must be >= Lbase. A manual
1528         // compaction could have been created adhering to that condition, and
1529         // then an automatic compaction came in and compacted all of the
1530         // sstables in Lbase to Lbase+1, which caused Lbase to change. Simply
1531         // ignore this manual compaction as there is nothing to do (manual.level
1532         // points to an empty level).
1533         return nil, false
1534     }
1535     // This conflictsWithInProgress call is necessary for the manual compaction to
1536     // be retried when it conflicts with an ongoing automatic compaction. Without
1537     // it, the compaction is dropped due to pc.setupInputs returning false since
1538     // the input/output range is already being compacted, and the manual
1539     // compaction ends with a non-compacted LSM.
1540     if conflictsWithInProgress(manual, outputLevel, env.inProgressCompactions, p.opts.Comparer.Compare) {
1541         return nil, true
1542     }
1543     pc = pickManualHelper(p.opts, manual, p.vers, p.baseLevel, p.diskAvailBytes, p.levelMaxBytes)
1544     if pc == nil {
1545         return nil, false
1546     }
1547     if pc.outputLevel.level != outputLevel {
1548         if len(pc.extraLevels) > 0 {
1549             // Multilevel compactions relax this invariant.
1550         } else {
1551             panic("bitalostable: compaction picked unexpected output level")
1552         }
1553     }
1554     // Fail-safe to protect against compacting the same sstable concurrently.
1555     if inputRangeAlreadyCompacting(env, pc) {
1556         return nil, true
1557     }
1558     return pc, false
1559 }
1560 
1561 func pickManualHelper(
1562     opts *Options,
1563     manual *manualCompaction,
1564     vers *version,
1565     baseLevel int,
1566     diskAvailBytes func() uint64,
1567     levelMaxBytes [7]int64,
1568 ) (pc *pickedCompaction) {
1569     pc = newPickedCompaction(opts, vers, manual.level, defaultOutputLevel(manual.level, baseLevel), baseLevel)
1570     manual.outputLevel = pc.outputLevel.level
1571     cmp := opts.Comparer.Compare
1572     pc.startLevel.files = vers.Overlaps(manual.level, cmp, manual.start, manual.end, false)
1573     if pc.startLevel.files.Empty() {
1574         // Nothing to do.
1575         return nil
1576     }
1577     if !pc.setupInputs(opts, diskAvailBytes(), pc.startLevel) {
1578         return nil
1579     }
1580     if opts.Experimental.MultiLevelCompaction && pc.startLevel.level > 0 &&
1581         pc.initMultiLevelCompaction(opts, vers, levelMaxBytes, diskAvailBytes()) {
1582         if !pc.setupInputs(opts, diskAvailBytes(), pc.extraLevels[len(pc.extraLevels)-1]) {
1583             return nil
1584         }
1585     }
1586     return pc
1587 }
1588 
1589 func (p *compactionPickerByScore) pickReadTriggeredCompaction(
1590     env compactionEnv,
1591 ) (pc *pickedCompaction) {
1592     // If a flush is in progress or expected to happen soon, more writes are taking
1593     // place and we will soon be scheduling more write-focused compactions. In this
1594     // case, skip read compactions, as they are lower priority.
1595     if env.readCompactionEnv.flushing || env.readCompactionEnv.readCompactions == nil {
1596         return nil
1597     }
1598     for env.readCompactionEnv.readCompactions.size > 0 {
1599         rc := env.readCompactionEnv.readCompactions.remove()
1600         if pc = pickReadTriggeredCompactionHelper(p, rc, env); pc != nil {
1601             break
1602         }
1603     }
1604     return pc
1605 }
1606 
1607 func pickReadTriggeredCompactionHelper(
1608     p *compactionPickerByScore, rc *readCompaction, env compactionEnv,
1609 ) (pc *pickedCompaction) {
1610     cmp := p.opts.Comparer.Compare
1611     overlapSlice := p.vers.Overlaps(rc.level, cmp, rc.start, rc.end, false /* exclusiveEnd */)
1612     if overlapSlice.Empty() {
1613         // If there is no overlap, then the file with the key range
1614         // must have been compacted away. So, we don't proceed to
1615         // compact the same key range again.
1616         return nil
1617     }
1618 
1619     iter := overlapSlice.Iter()
1620     var fileMatches bool
1621     for f := iter.First(); f != nil; f = iter.Next() {
1622         if f.FileNum == rc.fileNum {
1623             fileMatches = true
1624             break
1625         }
1626     }
1627     if !fileMatches {
1628         return nil
1629     }
1630 
1631     pc = newPickedCompaction(p.opts, p.vers, rc.level, defaultOutputLevel(rc.level, p.baseLevel), p.baseLevel)
1632 
1633     pc.startLevel.files = overlapSlice
1634     if !pc.setupInputs(p.opts, p.diskAvailBytes(), pc.startLevel) {
1635         return nil
1636     }
1637     if inputRangeAlreadyCompacting(env, pc) {
1638         return nil
1639     }
1640     pc.kind = compactionKindRead
1641 
1642     // Prevent read compactions which are too wide.
1643     outputOverlaps := pc.version.Overlaps(
1644         pc.outputLevel.level, pc.cmp, pc.smallest.UserKey,
1645         pc.largest.UserKey, pc.largest.IsExclusiveSentinel())
1646     if outputOverlaps.SizeSum() > pc.maxReadCompactionBytes {
1647         return nil
1648     }
1649 
1650     // Prevent compactions which start with a small seed file X, but overlap
1651     // with more than allowedCompactionWidth times X's size in the output level.
1652     const allowedCompactionWidth = 35
1653     if outputOverlaps.SizeSum() > overlapSlice.SizeSum()*allowedCompactionWidth {
1654         return nil
1655     }
1656 
1657     return pc
1658 }
1659 
1660 func (p *compactionPickerByScore) forceBaseLevel1() {
1661     p.baseLevel = 1
1662 }
1663 
1664 func inputRangeAlreadyCompacting(env compactionEnv, pc *pickedCompaction) bool {
1665     for _, cl := range pc.inputs {
1666         iter := cl.files.Iter()
1667         for f := iter.First(); f != nil; f = iter.Next() {
1668             if f.IsCompacting() {
1669                 return true
1670             }
1671         }
1672     }
1673 
1674     // Look for active compactions outputting to the same region of the key
1675     // space in the same output level. Two potential compactions may conflict
1676     // without sharing input files if there are no files in the output level
1677     // that overlap with the intersection of the compactions' key spaces.
1678     //
1679     // Consider an active L0->Lbase compaction compacting two L0 files, one
1680     // [a-f] and the other [t-z], into Lbase.
1681     //
1682     // L0
1683     //     ↦  000100 ↤                           ↦  000101   ↤
1684     // L1
1685     //     ↦                     000004                      ↤
1686     //     a b c d e f g h i j k l m n o p q r s t u v w x y z
1687     //
1688     // If a new file 000102 [j-p] is flushed while the existing compaction is
1689     // still ongoing, the new file would not be in any compacting sublevel
1690     // intervals and would not overlap with any Lbase files that are also
1691     // compacting. However, this compaction cannot be picked because the
1692     // compaction's output key space [j-p] would overlap the existing
1693     // compaction's output key space [a-z].
1694     //
1695     // L0
1696     //     ↦ 000100* ↤       ↦  000102   ↤       ↦  000101*  ↤
1697     // L1
1698     //     ↦                     000004*                     ↤
1699     //     a b c d e f g h i j k l m n o p q r s t u v w x y z
1700     //
1701     // * - currently compacting
1702     if pc.outputLevel != nil && pc.outputLevel.level != 0 {
1703         for _, c := range env.inProgressCompactions {
1704             if pc.outputLevel.level != c.outputLevel {
1705                 continue
1706             }
1707             if base.InternalCompare(pc.cmp, c.largest, pc.smallest) < 0 ||
1708                 base.InternalCompare(pc.cmp, c.smallest, pc.largest) > 0 {
1709                 continue
1710             }
1711 
1712             // The picked compaction and the in-progress compaction c are
1713             // outputting to the same region of the key space of the same
1714             // level.
1715             return true
1716         }
1717     }
1718     return false
1719 }
1720 
1721 // conflictsWithInProgress checks if there are any in-progress compactions with overlapping key space.
1722 func conflictsWithInProgress(
1723     manual *manualCompaction, outputLevel int, inProgressCompactions []compactionInfo, cmp Compare,
1724 ) bool {
1725     for _, c := range inProgressCompactions {
1726         if (c.outputLevel == manual.level || c.outputLevel == outputLevel) &&
1727             isUserKeysOverlapping(manual.start, manual.end, c.smallest.UserKey, c.largest.UserKey, cmp) {
1728             return true
1729         }
1730         for _, in := range c.inputs {
1731             if in.files.Empty() {
1732                 continue
1733             }
1734             iter := in.files.Iter()
1735             smallest := iter.First().Smallest.UserKey
1736             largest := iter.Last().Largest.UserKey
1737             if (in.level == manual.level || in.level == outputLevel) &&
1738                 isUserKeysOverlapping(manual.start, manual.end, smallest, largest, cmp) {
1739                 return true
1740             }
1741         }
1742     }
1743     return false
1744 }
1745 
1746 func isUserKeysOverlapping(x1, x2, y1, y2 []byte, cmp Compare) bool {
1747     return cmp(x1, y2) <= 0 && cmp(y1, x2) <= 0
1748 }
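
The closed-interval test in isUserKeysOverlapping is the primitive that conflictsWithInProgress builds on, and the same disjointness check (applied to internal keys) is what inputRangeAlreadyCompacting uses for output key spaces: two ranges conflict whenever neither ends strictly before the other begins, so shared endpoints count as overlap. Below is a minimal standalone sketch of that semantics; the package name, the use of bytes.Compare as the comparer, the helper name, and the sample keys (echoing the [j-p] flush from the diagram above) are illustrative assumptions, not part of this package.

package main

import (
    "bytes"
    "fmt"
)

// userKeysOverlap mirrors isUserKeysOverlapping: both ranges are treated as
// inclusive on both ends, so touching endpoints are reported as a conflict.
func userKeysOverlap(x1, x2, y1, y2 []byte, cmp func(a, b []byte) int) bool {
    return cmp(x1, y2) <= 0 && cmp(y1, x2) <= 0
}

func main() {
    cmp := bytes.Compare

    // A hypothetical manual compaction over [c, f] conflicts with an
    // in-progress compaction whose key space is [f, m]: the shared endpoint
    // f counts as overlap, so the manual compaction would be retried later.
    fmt.Println(userKeysOverlap([]byte("c"), []byte("f"), []byte("f"), []byte("m"), cmp)) // true

    // The flushed file [j, p] from the diagram does not overlap the L0 input
    // [a, f], but it does overlap the in-progress compaction's output key
    // space [a, z]; the analogous internal-key check is why
    // inputRangeAlreadyCompacting would reject a compaction of it.
    fmt.Println(userKeysOverlap([]byte("j"), []byte("p"), []byte("a"), []byte("f"), cmp)) // false
    fmt.Println(userKeysOverlap([]byte("j"), []byte("p"), []byte("a"), []byte("z"), cmp)) // true
}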