github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/compaction.go (about) 1 // Copyright 2013 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package bitalostable 6 7 import ( 8 "bytes" 9 "context" 10 "fmt" 11 "io" 12 "math" 13 "runtime/pprof" 14 "sort" 15 "strings" 16 "sync/atomic" 17 "time" 18 19 "github.com/cockroachdb/errors" 20 "github.com/cockroachdb/errors/oserror" 21 "github.com/zuoyebang/bitalostable/internal/base" 22 "github.com/zuoyebang/bitalostable/internal/keyspan" 23 "github.com/zuoyebang/bitalostable/internal/manifest" 24 "github.com/zuoyebang/bitalostable/internal/private" 25 "github.com/zuoyebang/bitalostable/internal/rangedel" 26 "github.com/zuoyebang/bitalostable/internal/rangekey" 27 "github.com/zuoyebang/bitalostable/sstable" 28 "github.com/zuoyebang/bitalostable/vfs" 29 ) 30 31 var errEmptyTable = errors.New("bitalostable: empty table") 32 var errFlushInvariant = errors.New("bitalostable: flush next log number is unset") 33 34 var compactLabels = pprof.Labels("bitalostable", "compact") 35 var flushLabels = pprof.Labels("bitalostable", "flush") 36 var gcLabels = pprof.Labels("bitalostable", "gc") 37 38 // expandedCompactionByteSizeLimit is the maximum number of bytes in all 39 // compacted files. We avoid expanding the lower level file set of a compaction 40 // if it would make the total compaction cover more than this many bytes. 41 func expandedCompactionByteSizeLimit(opts *Options, level int, availBytes uint64) uint64 { 42 v := uint64(25 * opts.Level(level).TargetFileSize) 43 44 // Never expand a compaction beyond half the available capacity, divided 45 // by the maximum number of concurrent compactions. Each of the concurrent 46 // compactions may expand up to this limit, so this attempts to limit 47 // compactions to half of available disk space. Note that this will not 48 // prevent compaction picking from pursuing compactions that are larger 49 // than this threshold before expansion. 50 diskMax := (availBytes / 2) / uint64(opts.MaxConcurrentCompactions()) 51 if v > diskMax { 52 v = diskMax 53 } 54 return v 55 } 56 57 // maxGrandparentOverlapBytes is the maximum bytes of overlap with level+1 58 // before we stop building a single file in a level-1 to level compaction. 59 func maxGrandparentOverlapBytes(opts *Options, level int) uint64 { 60 return uint64(10 * opts.Level(level).TargetFileSize) 61 } 62 63 // maxReadCompactionBytes is used to prevent read compactions which 64 // are too wide. 65 func maxReadCompactionBytes(opts *Options, level int) uint64 { 66 return uint64(10 * opts.Level(level).TargetFileSize) 67 } 68 69 // noCloseIter wraps around a FragmentIterator, intercepting and eliding 70 // calls to Close. It is used during compaction to ensure that rangeDelIters 71 // are not closed prematurely. 72 type noCloseIter struct { 73 keyspan.FragmentIterator 74 } 75 76 func (i noCloseIter) Close() error { 77 return nil 78 } 79 80 type compactionLevel struct { 81 level int 82 files manifest.LevelSlice 83 } 84 85 // Return output from compactionOutputSplitters. See comment on 86 // compactionOutputSplitter.shouldSplitBefore() on how this value is used. 87 type compactionSplitSuggestion int 88 89 const ( 90 noSplit compactionSplitSuggestion = iota 91 splitNow 92 ) 93 94 // String implements the Stringer interface. 95 func (c compactionSplitSuggestion) String() string { 96 if c == noSplit { 97 return "no-split" 98 } 99 return "split-now" 100 } 101 102 // compactionOutputSplitter is an interface for encapsulating logic around 103 // switching the output of a compaction to a new output file. Additional 104 // constraints around switching compaction outputs that are specific to that 105 // compaction type (eg. flush splits) are implemented in 106 // compactionOutputSplitters that compose other child compactionOutputSplitters. 107 type compactionOutputSplitter interface { 108 // shouldSplitBefore returns whether we should split outputs before the 109 // specified "current key". The return value is splitNow or noSplit. 110 // splitNow means a split is advised before the specified key, and noSplit 111 // means no split is advised. If shouldSplitBefore(a) advises a split then 112 // shouldSplitBefore(b) should also advise a split given b >= a, until 113 // onNewOutput is called. 114 shouldSplitBefore(key *InternalKey, tw *sstable.Writer) compactionSplitSuggestion 115 // onNewOutput updates internal splitter state when the compaction switches 116 // to a new sstable, and returns the next limit for the new output which 117 // would get used to truncate range tombstones if the compaction iterator 118 // runs out of keys. The limit returned MUST be > key according to the 119 // compaction's comparator. The specified key is the first key in the new 120 // output, or nil if this sstable will only contain range tombstones already 121 // in the fragmenter. 122 onNewOutput(key *InternalKey) []byte 123 } 124 125 // fileSizeSplitter is a compactionOutputSplitter that makes a determination 126 // to split outputs based on the estimated file size of the current output. 127 // Note that, unlike most other splitters, this splitter does not guarantee 128 // that it will advise splits only at user key change boundaries. 129 type fileSizeSplitter struct { 130 maxFileSize uint64 131 } 132 133 func (f *fileSizeSplitter) shouldSplitBefore( 134 key *InternalKey, tw *sstable.Writer, 135 ) compactionSplitSuggestion { 136 // The Kind != RangeDelete part exists because EstimatedSize doesn't grow 137 // rightaway when a range tombstone is added to the fragmenter. It's always 138 // better to make a sequence of range tombstones visible to the fragmenter. 139 if key.Kind() != InternalKeyKindRangeDelete && tw != nil && 140 tw.EstimatedSize() >= f.maxFileSize { 141 return splitNow 142 } 143 return noSplit 144 } 145 146 func (f *fileSizeSplitter) onNewOutput(key *InternalKey) []byte { 147 return nil 148 } 149 150 type limitFuncSplitter struct { 151 c *compaction 152 limitFunc func(userKey []byte) []byte 153 limit []byte 154 } 155 156 func (lf *limitFuncSplitter) shouldSplitBefore( 157 key *InternalKey, tw *sstable.Writer, 158 ) compactionSplitSuggestion { 159 // NB: The limit must be applied using >= since lf.limit may be used as the 160 // `splitterSuggestion` ultimately passed to `compactionIter.Tombstones` to 161 // serve as an *exclusive* end boundary truncation point. If we used > then, 162 // we may have already added a key with the user key `lf.limit` to the 163 // previous sstable. 164 if lf.limit != nil && lf.c.cmp(key.UserKey, lf.limit) >= 0 { 165 return splitNow 166 } 167 return noSplit 168 } 169 170 func (lf *limitFuncSplitter) onNewOutput(key *InternalKey) []byte { 171 lf.limit = nil 172 if key != nil { 173 lf.limit = lf.limitFunc(key.UserKey) 174 } else { 175 // Use the start key of the first pending tombstone to find the 176 // next limit. All pending tombstones have the same start key. 177 // We use this as opposed to the end key of the 178 // last written sstable to effectively handle cases like these: 179 // 180 // a.SET.3 181 // (lf.limit at b) 182 // d.RANGEDEL.4:f 183 // 184 // In this case, the partition after b has only range deletions, 185 // so if we were to find the limit after the last written key at 186 // the split point (key a), we'd get the limit b again, and 187 // finishOutput() would not advance any further because the next 188 // range tombstone to write does not start until after the L0 189 // split point. 190 if startKey := lf.c.rangeDelFrag.Start(); startKey != nil { 191 lf.limit = lf.limitFunc(startKey) 192 } 193 } 194 return lf.limit 195 } 196 197 // splitterGroup is a compactionOutputSplitter that splits whenever one of its 198 // child splitters advises a compaction split. 199 type splitterGroup struct { 200 cmp Compare 201 splitters []compactionOutputSplitter 202 } 203 204 func (a *splitterGroup) shouldSplitBefore( 205 key *InternalKey, tw *sstable.Writer, 206 ) (suggestion compactionSplitSuggestion) { 207 for _, splitter := range a.splitters { 208 if splitter.shouldSplitBefore(key, tw) == splitNow { 209 return splitNow 210 } 211 } 212 return noSplit 213 } 214 215 func (a *splitterGroup) onNewOutput(key *InternalKey) []byte { 216 var earliestLimit []byte 217 for _, splitter := range a.splitters { 218 limit := splitter.onNewOutput(key) 219 if limit == nil { 220 continue 221 } 222 if earliestLimit == nil || a.cmp(limit, earliestLimit) < 0 { 223 earliestLimit = limit 224 } 225 } 226 return earliestLimit 227 } 228 229 // userKeyChangeSplitter is a compactionOutputSplitter that takes in a child 230 // splitter, and splits when 1) that child splitter has advised a split, and 2) 231 // the compaction output is at the boundary between two user keys (also 232 // the boundary between atomic compaction units). Use this splitter to wrap 233 // any splitters that don't guarantee user key splits (i.e. splitters that make 234 // their determination in ways other than comparing the current key against a 235 // limit key.) If a wrapped splitter advises a split, it must continue 236 // to advise a split until a new output. 237 type userKeyChangeSplitter struct { 238 cmp Compare 239 splitter compactionOutputSplitter 240 unsafePrevUserKey func() []byte 241 } 242 243 func (u *userKeyChangeSplitter) shouldSplitBefore( 244 key *InternalKey, tw *sstable.Writer, 245 ) compactionSplitSuggestion { 246 if split := u.splitter.shouldSplitBefore(key, tw); split != splitNow { 247 return split 248 } 249 if u.cmp(key.UserKey, u.unsafePrevUserKey()) > 0 { 250 return splitNow 251 } 252 return noSplit 253 } 254 255 func (u *userKeyChangeSplitter) onNewOutput(key *InternalKey) []byte { 256 return u.splitter.onNewOutput(key) 257 } 258 259 // compactionFile is a vfs.File wrapper that, on every write, updates a metric 260 // in `versions` on bytes written by in-progress compactions so far. It also 261 // increments a per-compaction `written` int. 262 type compactionFile struct { 263 vfs.File 264 265 versions *versionSet 266 written *int64 267 } 268 269 // Write implements the io.Writer interface. 270 func (c *compactionFile) Write(p []byte) (n int, err error) { 271 n, err = c.File.Write(p) 272 if err != nil { 273 return n, err 274 } 275 276 *c.written += int64(n) 277 c.versions.incrementCompactionBytes(int64(n)) 278 return n, err 279 } 280 281 type compactionKind int 282 283 const ( 284 compactionKindDefault compactionKind = iota 285 compactionKindFlush 286 compactionKindMove 287 compactionKindDeleteOnly 288 compactionKindElisionOnly 289 compactionKindRead 290 compactionKindRewrite 291 ) 292 293 func (k compactionKind) String() string { 294 switch k { 295 case compactionKindDefault: 296 return "default" 297 case compactionKindFlush: 298 return "flush" 299 case compactionKindMove: 300 return "move" 301 case compactionKindDeleteOnly: 302 return "delete-only" 303 case compactionKindElisionOnly: 304 return "elision-only" 305 case compactionKindRead: 306 return "read" 307 case compactionKindRewrite: 308 return "rewrite" 309 } 310 return "?" 311 } 312 313 // rangeKeyCompactionTransform is used to transform range key spans as part of the 314 // keyspan.MergingIter. As part of this transformation step, we can elide range 315 // keys in the last snapshot stripe, as well as coalesce range keys within 316 // snapshot stripes. 317 func rangeKeyCompactionTransform( 318 snapshots []uint64, elideRangeKey func(start, end []byte) bool, 319 ) keyspan.Transformer { 320 return keyspan.TransformerFunc(func(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { 321 elideInLastStripe := func(keys []keyspan.Key) []keyspan.Key { 322 // Unsets and deletes in the last snapshot stripe can be elided. 323 k := 0 324 for j := range keys { 325 if elideRangeKey(s.Start, s.End) && 326 (keys[j].Kind() == InternalKeyKindRangeKeyUnset || keys[j].Kind() == InternalKeyKindRangeKeyDelete) { 327 continue 328 } 329 keys[k] = keys[j] 330 k++ 331 } 332 keys = keys[:k] 333 return keys 334 } 335 // snapshots are in ascending order, while s.keys are in descending seqnum 336 // order. Partition s.keys by snapshot stripes, and call rangekey.Coalesce 337 // on each partition. 338 dst.Start = s.Start 339 dst.End = s.End 340 dst.Keys = dst.Keys[:0] 341 i, j := len(snapshots)-1, 0 342 usedLen := 0 343 for i >= 0 { 344 start := j 345 for j < len(s.Keys) && !base.Visible(s.Keys[j].SeqNum(), snapshots[i]) { 346 // Include j in current partition. 347 j++ 348 } 349 if j > start { 350 keysDst := dst.Keys[usedLen:cap(dst.Keys)] 351 if err := rangekey.Coalesce(cmp, s.Keys[start:j], &keysDst); err != nil { 352 return err 353 } 354 if j == len(s.Keys) { 355 // This is the last snapshot stripe. Unsets and deletes can be elided. 356 keysDst = elideInLastStripe(keysDst) 357 } 358 usedLen += len(keysDst) 359 dst.Keys = append(dst.Keys, keysDst...) 360 } 361 i-- 362 } 363 if j < len(s.Keys) { 364 keysDst := dst.Keys[usedLen:cap(dst.Keys)] 365 if err := rangekey.Coalesce(cmp, s.Keys[j:], &keysDst); err != nil { 366 return err 367 } 368 keysDst = elideInLastStripe(keysDst) 369 usedLen += len(keysDst) 370 dst.Keys = append(dst.Keys, keysDst...) 371 } 372 return nil 373 }) 374 } 375 376 // compaction is a table compaction from one level to the next, starting from a 377 // given version. 378 type compaction struct { 379 kind compactionKind 380 cmp Compare 381 equal Equal 382 comparer *base.Comparer 383 formatKey base.FormatKey 384 logger Logger 385 version *version 386 stats base.InternalIteratorStats 387 388 score float64 389 390 // startLevel is the level that is being compacted. Inputs from startLevel 391 // and outputLevel will be merged to produce a set of outputLevel files. 392 startLevel *compactionLevel 393 394 // outputLevel is the level that files are being produced in. outputLevel is 395 // equal to startLevel+1 except when: 396 // - if startLevel is 0, the output level equals compactionPicker.baseLevel(). 397 // - in multilevel compaction, the output level is the lowest level involved in 398 // the compaction 399 outputLevel *compactionLevel 400 401 // extraLevels point to additional levels in between the input and output 402 // levels that get compacted in multilevel compactions 403 extraLevels []*compactionLevel 404 405 inputs []compactionLevel 406 407 // maxOutputFileSize is the maximum size of an individual table created 408 // during compaction. 409 maxOutputFileSize uint64 410 // maxOverlapBytes is the maximum number of bytes of overlap allowed for a 411 // single output table with the tables in the grandparent level. 412 maxOverlapBytes uint64 413 // disableSpanElision disables elision of range tombstones and range keys. Used 414 // by tests to allow range tombstones or range keys to be added to tables where 415 // they would otherwise be elided. 416 disableSpanElision bool 417 418 // flushing contains the flushables (aka memtables) that are being flushed. 419 flushing flushableList 420 // bytesIterated contains the number of bytes that have been flushed/compacted. 421 bytesIterated uint64 422 // bytesWritten contains the number of bytes that have been written to outputs. 423 bytesWritten int64 424 425 // The boundaries of the input data. 426 smallest InternalKey 427 largest InternalKey 428 429 // The range deletion tombstone fragmenter. Adds range tombstones as they are 430 // returned from `compactionIter` and fragments them for output to files. 431 // Referenced by `compactionIter` which uses it to check whether keys are deleted. 432 rangeDelFrag keyspan.Fragmenter 433 // The range key fragmenter. Similar to rangeDelFrag in that it gets range 434 // keys from the compaction iter and fragments them for output to files. 435 rangeKeyFrag keyspan.Fragmenter 436 // The range deletion tombstone iterator, that merges and fragments 437 // tombstones across levels. This iterator is included within the compaction 438 // input iterator as a single level. 439 // TODO(jackson): Remove this when the refactor of FragmentIterator, 440 // InterleavingIterator, etc is complete. 441 rangeDelIter keyspan.InternalIteratorShim 442 // rangeKeyInterleaving is the interleaving iter for range keys. 443 rangeKeyInterleaving keyspan.InterleavingIter 444 445 // A list of objects to close when the compaction finishes. Used by input 446 // iteration to keep rangeDelIters open for the lifetime of the compaction, 447 // and only close them when the compaction finishes. 448 closers []io.Closer 449 450 // grandparents are the tables in level+2 that overlap with the files being 451 // compacted. Used to determine output table boundaries. Do not assume that the actual files 452 // in the grandparent when this compaction finishes will be the same. 453 grandparents manifest.LevelSlice 454 455 // Boundaries at which flushes to L0 should be split. Determined by 456 // L0Sublevels. If nil, flushes aren't split. 457 l0Limits [][]byte 458 459 // L0 sublevel info is used for compactions out of L0. It is nil for all 460 // other compactions. 461 l0SublevelInfo []sublevelInfo 462 463 // List of disjoint inuse key ranges the compaction overlaps with in 464 // grandparent and lower levels. See setupInuseKeyRanges() for the 465 // construction. Used by elideTombstone() and elideRangeTombstone() to 466 // determine if keys affected by a tombstone possibly exist at a lower level. 467 inuseKeyRanges []manifest.UserKeyRange 468 // inuseEntireRange is set if the above inuse key ranges wholly contain the 469 // compaction's key range. This allows compactions in higher levels to often 470 // elide key comparisons. 471 inuseEntireRange bool 472 elideTombstoneIndex int 473 474 // allowedZeroSeqNum is true if seqnums can be zeroed if there are no 475 // snapshots requiring them to be kept. This determination is made by 476 // looking for an sstable which overlaps the bounds of the compaction at a 477 // lower level in the LSM during runCompaction. 478 allowedZeroSeqNum bool 479 480 metrics map[int]*LevelMetrics 481 } 482 483 func (c *compaction) makeInfo(jobID int) CompactionInfo { 484 info := CompactionInfo{ 485 JobID: jobID, 486 Reason: c.kind.String(), 487 Input: make([]LevelInfo, 0, len(c.inputs)), 488 } 489 for _, cl := range c.inputs { 490 inputInfo := LevelInfo{Level: cl.level, Tables: nil} 491 iter := cl.files.Iter() 492 for m := iter.First(); m != nil; m = iter.Next() { 493 inputInfo.Tables = append(inputInfo.Tables, m.TableInfo()) 494 } 495 info.Input = append(info.Input, inputInfo) 496 } 497 if c.outputLevel != nil { 498 info.Output.Level = c.outputLevel.level 499 500 // If there are no inputs from the output level (eg, a move 501 // compaction), add an empty LevelInfo to info.Input. 502 if len(c.inputs) > 0 && c.inputs[len(c.inputs)-1].level != c.outputLevel.level { 503 info.Input = append(info.Input, LevelInfo{Level: c.outputLevel.level}) 504 } 505 } else { 506 // For a delete-only compaction, set the output level to L6. The 507 // output level is not meaningful here, but complicating the 508 // info.Output interface with a pointer doesn't seem worth the 509 // semantic distinction. 510 info.Output.Level = numLevels - 1 511 } 512 return info 513 } 514 515 func newCompaction(pc *pickedCompaction, opts *Options) *compaction { 516 c := &compaction{ 517 kind: compactionKindDefault, 518 cmp: pc.cmp, 519 equal: opts.equal(), 520 comparer: opts.Comparer, 521 formatKey: opts.Comparer.FormatKey, 522 score: pc.score, 523 inputs: pc.inputs, 524 smallest: pc.smallest, 525 largest: pc.largest, 526 logger: opts.Logger, 527 version: pc.version, 528 maxOutputFileSize: pc.maxOutputFileSize, 529 maxOverlapBytes: pc.maxOverlapBytes, 530 l0SublevelInfo: pc.l0SublevelInfo, 531 } 532 c.startLevel = &c.inputs[0] 533 c.outputLevel = &c.inputs[1] 534 535 if len(pc.extraLevels) > 0 { 536 c.extraLevels = pc.extraLevels 537 c.outputLevel = &c.inputs[len(c.inputs)-1] 538 } 539 // Compute the set of outputLevel+1 files that overlap this compaction (these 540 // are the grandparent sstables). 541 if c.outputLevel.level+1 < numLevels { 542 c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.cmp, 543 c.smallest.UserKey, c.largest.UserKey, c.largest.IsExclusiveSentinel()) 544 } 545 c.setupInuseKeyRanges() 546 547 c.kind = pc.kind 548 if c.kind == compactionKindDefault && c.outputLevel.files.Empty() && !c.hasExtraLevelData() && 549 c.startLevel.files.Len() == 1 && c.grandparents.SizeSum() <= c.maxOverlapBytes { 550 // This compaction can be converted into a trivial move from one level 551 // to the next. We avoid such a move if there is lots of overlapping 552 // grandparent data. Otherwise, the move could create a parent file 553 // that will require a very expensive merge later on. 554 c.kind = compactionKindMove 555 } 556 return c 557 } 558 559 func newDeleteOnlyCompaction(opts *Options, cur *version, inputs []compactionLevel) *compaction { 560 c := &compaction{ 561 kind: compactionKindDeleteOnly, 562 cmp: opts.Comparer.Compare, 563 equal: opts.equal(), 564 comparer: opts.Comparer, 565 formatKey: opts.Comparer.FormatKey, 566 logger: opts.Logger, 567 version: cur, 568 inputs: inputs, 569 } 570 571 // Set c.smallest, c.largest. 572 files := make([]manifest.LevelIterator, 0, len(inputs)) 573 for _, in := range inputs { 574 files = append(files, in.files.Iter()) 575 } 576 c.smallest, c.largest = manifest.KeyRange(opts.Comparer.Compare, files...) 577 return c 578 } 579 580 func adjustGrandparentOverlapBytesForFlush(c *compaction, flushingBytes uint64) { 581 // Heuristic to place a lower bound on compaction output file size 582 // caused by Lbase. Prior to this heuristic we have observed an L0 in 583 // production with 310K files of which 290K files were < 10KB in size. 584 // Our hypothesis is that it was caused by L1 having 2600 files and 585 // ~10GB, such that each flush got split into many tiny files due to 586 // overlapping with most of the files in Lbase. 587 // 588 // The computation below is general in that it accounts 589 // for flushing different volumes of data (e.g. we may be flushing 590 // many memtables). For illustration, we consider the typical 591 // example of flushing a 64MB memtable. So 12.8MB output, 592 // based on the compression guess below. If the compressed bytes 593 // guess is an over-estimate we will end up with smaller files, 594 // and if an under-estimate we will end up with larger files. 595 // With a 2MB target file size, 7 files. We are willing to accept 596 // 4x the number of files, if it results in better write amplification 597 // when later compacting to Lbase, i.e., ~450KB files (target file 598 // size / 4). 599 // 600 // Note that this is a pessimistic heuristic in that 601 // fileCountUpperBoundDueToGrandparents could be far from the actual 602 // number of files produced due to the grandparent limits. For 603 // example, in the extreme, consider a flush that overlaps with 1000 604 // files in Lbase f0...f999, and the initially calculated value of 605 // maxOverlapBytes will cause splits at f10, f20,..., f990, which 606 // means an upper bound file count of 100 files. Say the input bytes 607 // in the flush are such that acceptableFileCount=10. We will fatten 608 // up maxOverlapBytes by 10x to ensure that the upper bound file count 609 // drops to 10. However, it is possible that in practice, even without 610 // this change, we would have produced no more than 10 files, and that 611 // this change makes the files unnecessarily wide. Say the input bytes 612 // are distributed such that 10% are in f0...f9, 10% in f10...f19, ... 613 // 10% in f80...f89 and 10% in f990...f999. The original value of 614 // maxOverlapBytes would have actually produced only 10 sstables. But 615 // by increasing maxOverlapBytes by 10x, we may produce 1 sstable that 616 // spans f0...f89, i.e., a much wider sstable than necessary. 617 // 618 // We could produce a tighter estimate of 619 // fileCountUpperBoundDueToGrandparents if we had knowledge of the key 620 // distribution of the flush. The 4x multiplier mentioned earlier is 621 // a way to try to compensate for this pessimism. 622 // 623 // TODO(sumeer): we don't have compression info for the data being 624 // flushed, but it is likely that existing files that overlap with 625 // this flush in Lbase are representative wrt compression ratio. We 626 // could store the uncompressed size in FileMetadata and estimate 627 // the compression ratio. 628 const approxCompressionRatio = 0.2 629 approxOutputBytes := approxCompressionRatio * float64(flushingBytes) 630 approxNumFilesBasedOnTargetSize := 631 int(math.Ceil(approxOutputBytes / float64(c.maxOutputFileSize))) 632 acceptableFileCount := float64(4 * approxNumFilesBasedOnTargetSize) 633 // The byte calculation is linear in numGrandparentFiles, but we will 634 // incur this linear cost in findGrandparentLimit too, so we are also 635 // willing to pay it now. We could approximate this cheaply by using 636 // the mean file size of Lbase. 637 grandparentFileBytes := c.grandparents.SizeSum() 638 fileCountUpperBoundDueToGrandparents := 639 float64(grandparentFileBytes) / float64(c.maxOverlapBytes) 640 if fileCountUpperBoundDueToGrandparents > acceptableFileCount { 641 c.maxOverlapBytes = uint64( 642 float64(c.maxOverlapBytes) * 643 (fileCountUpperBoundDueToGrandparents / acceptableFileCount)) 644 } 645 } 646 647 func newFlush(opts *Options, cur *version, baseLevel int, flushing flushableList) *compaction { 648 c := &compaction{ 649 kind: compactionKindFlush, 650 cmp: opts.Comparer.Compare, 651 equal: opts.equal(), 652 comparer: opts.Comparer, 653 formatKey: opts.Comparer.FormatKey, 654 logger: opts.Logger, 655 version: cur, 656 inputs: []compactionLevel{{level: -1}, {level: 0}}, 657 maxOutputFileSize: math.MaxUint64, 658 maxOverlapBytes: math.MaxUint64, 659 flushing: flushing, 660 } 661 c.startLevel = &c.inputs[0] 662 c.outputLevel = &c.inputs[1] 663 if cur.L0Sublevels != nil { 664 c.l0Limits = cur.L0Sublevels.FlushSplitKeys() 665 } 666 667 smallestSet, largestSet := false, false 668 updatePointBounds := func(iter internalIterator) { 669 if key, _ := iter.First(); key != nil { 670 if !smallestSet || 671 base.InternalCompare(c.cmp, c.smallest, *key) > 0 { 672 smallestSet = true 673 c.smallest = key.Clone() 674 } 675 } 676 if key, _ := iter.Last(); key != nil { 677 if !largestSet || 678 base.InternalCompare(c.cmp, c.largest, *key) < 0 { 679 largestSet = true 680 c.largest = key.Clone() 681 } 682 } 683 } 684 685 updateRangeBounds := func(iter keyspan.FragmentIterator) { 686 // File bounds require s != nil && !s.Empty(). We only need to check for 687 // s != nil here, as the memtable's FragmentIterator would never surface 688 // empty spans. 689 if s := iter.First(); s != nil { 690 if key := s.SmallestKey(); !smallestSet || 691 base.InternalCompare(c.cmp, c.smallest, key) > 0 { 692 smallestSet = true 693 c.smallest = key.Clone() 694 } 695 } 696 if s := iter.Last(); s != nil { 697 if key := s.LargestKey(); !largestSet || 698 base.InternalCompare(c.cmp, c.largest, key) < 0 { 699 largestSet = true 700 c.largest = key.Clone() 701 } 702 } 703 } 704 705 var flushingBytes uint64 706 for i := range flushing { 707 f := flushing[i] 708 updatePointBounds(f.newIter(nil)) 709 if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { 710 updateRangeBounds(rangeDelIter) 711 } 712 if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { 713 updateRangeBounds(rangeKeyIter) 714 } 715 flushingBytes += f.inuseBytes() 716 } 717 718 if opts.FlushSplitBytes > 0 { 719 c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize) 720 c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0) 721 c.grandparents = c.version.Overlaps(baseLevel, c.cmp, c.smallest.UserKey, 722 c.largest.UserKey, c.largest.IsExclusiveSentinel()) 723 adjustGrandparentOverlapBytesForFlush(c, flushingBytes) 724 } 725 726 c.setupInuseKeyRanges() 727 return c 728 } 729 730 func (c *compaction) hasExtraLevelData() bool { 731 if len(c.extraLevels) == 0 { 732 // not a multi level compaction 733 return false 734 } else if c.extraLevels[0].files.Empty() { 735 // a multi level compaction without data in the intermediate input level; 736 // e.g. for a multi level compaction with levels 4,5, and 6, this could 737 // occur if there is no files to compact in 5, or in 5 and 6 (i.e. a move). 738 return false 739 } 740 return true 741 } 742 743 func (c *compaction) setupInuseKeyRanges() { 744 level := c.outputLevel.level + 1 745 if c.outputLevel.level == 0 { 746 level = 0 747 } 748 // calculateInuseKeyRanges will return a series of sorted spans. Overlapping 749 // or abutting spans have already been merged. 750 c.inuseKeyRanges = calculateInuseKeyRanges( 751 c.version, c.cmp, level, numLevels-1, c.smallest.UserKey, c.largest.UserKey, 752 ) 753 // Check if there's a single in-use span that encompasses the entire key 754 // range of the compaction. This is an optimization to avoid key comparisons 755 // against inuseKeyRanges during the compaction when every key within the 756 // compaction overlaps with an in-use span. 757 if len(c.inuseKeyRanges) > 0 { 758 c.inuseEntireRange = c.cmp(c.inuseKeyRanges[0].Start, c.smallest.UserKey) <= 0 && 759 c.cmp(c.inuseKeyRanges[0].End, c.largest.UserKey) >= 0 760 } 761 } 762 763 func calculateInuseKeyRanges( 764 v *version, cmp base.Compare, level, maxLevel int, smallest, largest []byte, 765 ) []manifest.UserKeyRange { 766 // Use two slices, alternating which one is input and which one is output 767 // as we descend the LSM. 768 var input, output []manifest.UserKeyRange 769 770 // L0 requires special treatment, since sstables within L0 may overlap. 771 // We use the L0 Sublevels structure to efficiently calculate the merged 772 // in-use key ranges. 773 if level == 0 { 774 output = v.L0Sublevels.InUseKeyRanges(smallest, largest) 775 level++ 776 } 777 778 for ; level <= maxLevel; level++ { 779 // NB: We always treat `largest` as inclusive for simplicity, because 780 // there's little consequence to calculating slightly broader in-use key 781 // ranges. 782 overlaps := v.Overlaps(level, cmp, smallest, largest, false /* exclusiveEnd */) 783 iter := overlaps.Iter() 784 785 // We may already have in-use key ranges from higher levels. Iterate 786 // through both our accumulated in-use key ranges and this level's 787 // files, merging the two. 788 // 789 // Tables higher within the LSM have broader key spaces. We use this 790 // when possible to seek past a level's files that are contained by 791 // our current accumulated in-use key ranges. This helps avoid 792 // per-sstable work during flushes or compactions in high levels which 793 // overlap the majority of the LSM's sstables. 794 input, output = output, input 795 output = output[:0] 796 797 var currFile *fileMetadata 798 var currAccum *manifest.UserKeyRange 799 if len(input) > 0 { 800 currAccum, input = &input[0], input[1:] 801 } 802 803 // If we have an accumulated key range and its start is ≤ smallest, 804 // we can seek to the accumulated range's end. Otherwise, we need to 805 // start at the first overlapping file within the level. 806 if currAccum != nil && cmp(currAccum.Start, smallest) <= 0 { 807 currFile = seekGT(&iter, cmp, currAccum.End) 808 } else { 809 currFile = iter.First() 810 } 811 812 for currFile != nil || currAccum != nil { 813 // If we've exhausted either the files in the level or the 814 // accumulated key ranges, we just need to append the one we have. 815 // If we have both a currFile and a currAccum, they either overlap 816 // or they're disjoint. If they're disjoint, we append whichever 817 // one sorts first and move on to the next file or range. If they 818 // overlap, we merge them into currAccum and proceed to the next 819 // file. 820 switch { 821 case currAccum == nil || (currFile != nil && cmp(currFile.Largest.UserKey, currAccum.Start) < 0): 822 // This file is strictly before the current accumulated range, 823 // or there are no more accumulated ranges. 824 output = append(output, manifest.UserKeyRange{ 825 Start: currFile.Smallest.UserKey, 826 End: currFile.Largest.UserKey, 827 }) 828 currFile = iter.Next() 829 case currFile == nil || (currAccum != nil && cmp(currAccum.End, currFile.Smallest.UserKey) < 0): 830 // The current accumulated key range is strictly before the 831 // current file, or there are no more files. 832 output = append(output, *currAccum) 833 currAccum = nil 834 if len(input) > 0 { 835 currAccum, input = &input[0], input[1:] 836 } 837 default: 838 // The current accumulated range and the current file overlap. 839 // Adjust the accumulated range to be the union. 840 if cmp(currFile.Smallest.UserKey, currAccum.Start) < 0 { 841 currAccum.Start = currFile.Smallest.UserKey 842 } 843 if cmp(currFile.Largest.UserKey, currAccum.End) > 0 { 844 currAccum.End = currFile.Largest.UserKey 845 } 846 847 // Extending `currAccum`'s end boundary may have caused it to 848 // overlap with `input` key ranges that we haven't processed 849 // yet. Merge any such key ranges. 850 for len(input) > 0 && cmp(input[0].Start, currAccum.End) <= 0 { 851 if cmp(input[0].End, currAccum.End) > 0 { 852 currAccum.End = input[0].End 853 } 854 input = input[1:] 855 } 856 // Seek the level iterator past our current accumulated end. 857 currFile = seekGT(&iter, cmp, currAccum.End) 858 } 859 } 860 } 861 return output 862 } 863 864 func seekGT(iter *manifest.LevelIterator, cmp base.Compare, key []byte) *manifest.FileMetadata { 865 f := iter.SeekGE(cmp, key) 866 for f != nil && cmp(f.Largest.UserKey, key) == 0 { 867 f = iter.Next() 868 } 869 return f 870 } 871 872 // findGrandparentLimit takes the start user key for a table and returns the 873 // user key to which that table can extend without excessively overlapping 874 // the grandparent level. If no limit is needed considering the grandparent 875 // files, this function returns nil. This is done in order to prevent a table 876 // at level N from overlapping too much data at level N+1. We want to avoid 877 // such large overlaps because they translate into large compactions. The 878 // current heuristic stops output of a table if the addition of another key 879 // would cause the table to overlap more than 10x the target file size at 880 // level N. See maxGrandparentOverlapBytes. 881 func (c *compaction) findGrandparentLimit(start []byte) []byte { 882 iter := c.grandparents.Iter() 883 var overlappedBytes uint64 884 var greater bool 885 for f := iter.SeekGE(c.cmp, start); f != nil; f = iter.Next() { 886 overlappedBytes += f.Size 887 // To ensure forward progress we always return a larger user 888 // key than where we started. See comments above clients of 889 // this function for how this is used. 890 greater = greater || c.cmp(f.Smallest.UserKey, start) > 0 891 if !greater { 892 continue 893 } 894 895 // We return the smallest bound of a sstable rather than the 896 // largest because the smallest is always inclusive, and limits 897 // are used exlusively when truncating range tombstones. If we 898 // truncated an output to the largest key while there's a 899 // pending tombstone, the next output file would also overlap 900 // the same grandparent f. 901 if overlappedBytes > c.maxOverlapBytes { 902 return f.Smallest.UserKey 903 } 904 } 905 return nil 906 } 907 908 // findL0Limit takes the start key for a table and returns the user key to which 909 // that table can be extended without hitting the next l0Limit. Having flushed 910 // sstables "bridging across" an l0Limit could lead to increased L0 -> LBase 911 // compaction sizes as well as elevated read amplification. 912 func (c *compaction) findL0Limit(start []byte) []byte { 913 if c.startLevel.level > -1 || c.outputLevel.level != 0 || len(c.l0Limits) == 0 { 914 return nil 915 } 916 index := sort.Search(len(c.l0Limits), func(i int) bool { 917 return c.cmp(c.l0Limits[i], start) > 0 918 }) 919 if index < len(c.l0Limits) { 920 return c.l0Limits[index] 921 } 922 return nil 923 } 924 925 // errorOnUserKeyOverlap returns an error if the last two written sstables in 926 // this compaction have revisions of the same user key present in both sstables, 927 // when it shouldn't (eg. when splitting flushes). 928 func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error { 929 if n := len(ve.NewFiles); n > 1 { 930 meta := ve.NewFiles[n-1].Meta 931 prevMeta := ve.NewFiles[n-2].Meta 932 if !prevMeta.Largest.IsExclusiveSentinel() && 933 c.cmp(prevMeta.Largest.UserKey, meta.Smallest.UserKey) >= 0 { 934 return errors.Errorf("bitalostable: compaction split user key across two sstables: %s in %s and %s", 935 prevMeta.Largest.Pretty(c.formatKey), 936 prevMeta.FileNum, 937 meta.FileNum) 938 } 939 } 940 return nil 941 } 942 943 // allowZeroSeqNum returns true if seqnum's can be zeroed if there are no 944 // snapshots requiring them to be kept. It performs this determination by 945 // looking for an sstable which overlaps the bounds of the compaction at a 946 // lower level in the LSM. 947 func (c *compaction) allowZeroSeqNum() bool { 948 return c.elideRangeTombstone(c.smallest.UserKey, c.largest.UserKey) 949 } 950 951 // elideTombstone returns true if it is ok to elide a tombstone for the 952 // specified key. A return value of true guarantees that there are no key/value 953 // pairs at c.level+2 or higher that possibly contain the specified user 954 // key. The keys in multiple invocations to elideTombstone must be supplied in 955 // order. 956 func (c *compaction) elideTombstone(key []byte) bool { 957 if c.inuseEntireRange || len(c.flushing) != 0 { 958 return false 959 } 960 961 for ; c.elideTombstoneIndex < len(c.inuseKeyRanges); c.elideTombstoneIndex++ { 962 r := &c.inuseKeyRanges[c.elideTombstoneIndex] 963 if c.cmp(key, r.End) <= 0 { 964 if c.cmp(key, r.Start) >= 0 { 965 return false 966 } 967 break 968 } 969 } 970 return true 971 } 972 973 // elideRangeTombstone returns true if it is ok to elide the specified range 974 // tombstone. A return value of true guarantees that there are no key/value 975 // pairs at c.outputLevel.level+1 or higher that possibly overlap the specified 976 // tombstone. 977 func (c *compaction) elideRangeTombstone(start, end []byte) bool { 978 // Disable range tombstone elision if the testing knob for that is enabled, 979 // or if we are flushing memtables. The latter requirement is due to 980 // inuseKeyRanges not accounting for key ranges in other memtables that are 981 // being flushed in the same compaction. It's possible for a range tombstone 982 // in one memtable to overlap keys in a preceding memtable in c.flushing. 983 // 984 // This function is also used in setting allowZeroSeqNum, so disabling 985 // elision of range tombstones also disables zeroing of SeqNums. 986 // 987 // TODO(peter): we disable zeroing of seqnums during flushing to match 988 // RocksDB behavior and to avoid generating overlapping sstables during 989 // DB.replayWAL. When replaying WAL files at startup, we flush after each 990 // WAL is replayed building up a single version edit that is 991 // applied. Because we don't apply the version edit after each flush, this 992 // code doesn't know that L0 contains files and zeroing of seqnums should 993 // be disabled. That is fixable, but it seems safer to just match the 994 // RocksDB behavior for now. 995 if c.disableSpanElision || len(c.flushing) != 0 { 996 return false 997 } 998 999 lower := sort.Search(len(c.inuseKeyRanges), func(i int) bool { 1000 return c.cmp(c.inuseKeyRanges[i].End, start) >= 0 1001 }) 1002 upper := sort.Search(len(c.inuseKeyRanges), func(i int) bool { 1003 return c.cmp(c.inuseKeyRanges[i].Start, end) > 0 1004 }) 1005 return lower >= upper 1006 } 1007 1008 // elideRangeKey returns true if it is ok to elide the specified range key. A 1009 // return value of true guarantees that there are no key/value pairs at 1010 // c.outputLevel.level+1 or higher that possibly overlap the specified range key. 1011 func (c *compaction) elideRangeKey(start, end []byte) bool { 1012 // TODO(bilal): Track inuseKeyRanges separately for the range keyspace as 1013 // opposed to the point keyspace. Once that is done, elideRangeTombstone 1014 // can just check in the point keyspace, and this function can check for 1015 // inuseKeyRanges in the range keyspace. 1016 return c.elideRangeTombstone(start, end) 1017 } 1018 1019 // newInputIter returns an iterator over all the input tables in a compaction. 1020 func (c *compaction) newInputIter( 1021 newIters tableNewIters, newRangeKeyIter keyspan.TableNewSpanIter, snapshots []uint64, 1022 ) (_ internalIterator, retErr error) { 1023 var rangeDelIters []keyspan.FragmentIterator 1024 var rangeKeyIters []keyspan.FragmentIterator 1025 1026 if len(c.flushing) != 0 { 1027 if len(c.flushing) == 1 { 1028 f := c.flushing[0] 1029 iter := f.newFlushIter(nil, &c.bytesIterated) 1030 if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { 1031 c.rangeDelIter.Init(c.cmp, rangeDelIter) 1032 iter = newMergingIter(c.logger, &c.stats, c.cmp, nil, iter, &c.rangeDelIter) 1033 } 1034 if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { 1035 mi := &keyspan.MergingIter{} 1036 mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIter) 1037 c.rangeKeyInterleaving.Init(c.comparer, iter, mi, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */) 1038 iter = &c.rangeKeyInterleaving 1039 } 1040 return iter, nil 1041 } 1042 iters := make([]internalIterator, 0, len(c.flushing)+1) 1043 rangeDelIters = make([]keyspan.FragmentIterator, 0, len(c.flushing)) 1044 rangeKeyIters = make([]keyspan.FragmentIterator, 0, len(c.flushing)) 1045 for i := range c.flushing { 1046 f := c.flushing[i] 1047 iters = append(iters, f.newFlushIter(nil, &c.bytesIterated)) 1048 rangeDelIter := f.newRangeDelIter(nil) 1049 if rangeDelIter != nil { 1050 rangeDelIters = append(rangeDelIters, rangeDelIter) 1051 } 1052 if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { 1053 rangeKeyIters = append(rangeKeyIters, rangeKeyIter) 1054 } 1055 } 1056 if len(rangeDelIters) > 0 { 1057 c.rangeDelIter.Init(c.cmp, rangeDelIters...) 1058 iters = append(iters, &c.rangeDelIter) 1059 } 1060 var iter internalIterator = newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...) 1061 if len(rangeKeyIters) > 0 { 1062 mi := &keyspan.MergingIter{} 1063 mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIters...) 1064 c.rangeKeyInterleaving.Init(c.comparer, iter, mi, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */) 1065 iter = &c.rangeKeyInterleaving 1066 } 1067 return iter, nil 1068 } 1069 1070 if c.startLevel.level >= 0 { 1071 err := manifest.CheckOrdering(c.cmp, c.formatKey, 1072 manifest.Level(c.startLevel.level), c.startLevel.files.Iter()) 1073 if err != nil { 1074 return nil, err 1075 } 1076 } 1077 err := manifest.CheckOrdering(c.cmp, c.formatKey, 1078 manifest.Level(c.outputLevel.level), c.outputLevel.files.Iter()) 1079 if err != nil { 1080 return nil, err 1081 } 1082 1083 if c.startLevel.level == 0 { 1084 if c.l0SublevelInfo == nil { 1085 panic("l0SublevelInfo not created for compaction out of L0") 1086 } 1087 1088 for _, info := range c.l0SublevelInfo { 1089 err := manifest.CheckOrdering(c.cmp, c.formatKey, 1090 info.sublevel, info.Iter()) 1091 if err != nil { 1092 return nil, err 1093 } 1094 } 1095 } 1096 1097 if len(c.extraLevels) > 0 { 1098 if len(c.extraLevels) > 1 { 1099 panic("n>2 multi level compaction not implemented yet") 1100 } 1101 interLevel := c.extraLevels[0] 1102 err := manifest.CheckOrdering(c.cmp, c.formatKey, 1103 manifest.Level(interLevel.level), interLevel.files.Iter()) 1104 if err != nil { 1105 return nil, err 1106 } 1107 } 1108 iters := make([]internalIterator, 0, len(c.inputs)*c.startLevel.files.Len()+1) 1109 defer func() { 1110 if retErr != nil { 1111 for _, iter := range iters { 1112 if iter != nil { 1113 iter.Close() 1114 } 1115 } 1116 for _, rangeDelIter := range rangeDelIters { 1117 rangeDelIter.Close() 1118 } 1119 } 1120 }() 1121 1122 // In normal operation, levelIter iterates over the point operations in a 1123 // level, and initializes a rangeDelIter pointer for the range deletions in 1124 // each table. During compaction, we want to iterate over the merged view of 1125 // point operations and range deletions. In order to do this we create one 1126 // levelIter per level to iterate over the point operations, and collect up 1127 // all the range deletion files. 1128 // 1129 // The range deletion levels are first combined with a keyspan.MergingIter 1130 // (currently wrapped by a keyspan.InternalIteratorShim to satisfy the 1131 // internal iterator interface). The resulting merged rangedel iterator is 1132 // then included with the point levels in a single mergingIter. 1133 newRangeDelIter := func( 1134 f *manifest.FileMetadata, slice manifest.LevelSlice, _ *IterOptions, bytesIterated *uint64, 1135 ) (keyspan.FragmentIterator, error) { 1136 iter, rangeDelIter, err := newIters(f, nil, /* iter options */ 1137 internalIterOpts{bytesIterated: &c.bytesIterated}) 1138 if err == nil { 1139 // TODO(peter): It is mildly wasteful to open the point iterator only to 1140 // immediately close it. One way to solve this would be to add new 1141 // methods to tableCache for creating point and range-deletion iterators 1142 // independently. We'd only want to use those methods here, 1143 // though. Doesn't seem worth the hassle in the near term. 1144 if err = iter.Close(); err != nil { 1145 rangeDelIter.Close() 1146 rangeDelIter = nil 1147 } 1148 } 1149 if rangeDelIter != nil { 1150 // Ensure that rangeDelIter is not closed until the compaction is 1151 // finished. This is necessary because range tombstone processing 1152 // requires the range tombstones to be held in memory for up to the 1153 // lifetime of the compaction. 1154 c.closers = append(c.closers, rangeDelIter) 1155 rangeDelIter = noCloseIter{rangeDelIter} 1156 1157 // Truncate the range tombstones returned by the iterator to the 1158 // upper bound of the atomic compaction unit. Note that we need do 1159 // this truncation at read time in order to handle sstables 1160 // generated by RocksDB and earlier versions of Pebble which do not 1161 // truncate range tombstones to atomic compaction unit boundaries at 1162 // write time. 1163 // 1164 // The current Pebble compaction logic DOES truncate tombstones to 1165 // atomic unit boundaries at compaction time too. 1166 atomicUnit, _ := expandToAtomicUnit(c.cmp, slice, true /* disableIsCompacting */) 1167 lowerBound, upperBound := manifest.KeyRange(c.cmp, atomicUnit.Iter()) 1168 // Range deletion tombstones are often written to sstables 1169 // untruncated on the end key side. However, they are still only 1170 // valid within a given file's bounds. The logic for writing range 1171 // tombstones to an output file sometimes has an incomplete view 1172 // of range tombstones outside the file's internal key bounds. Skip 1173 // any range tombstones completely outside file bounds. 1174 rangeDelIter = keyspan.Truncate( 1175 c.cmp, rangeDelIter, lowerBound.UserKey, upperBound.UserKey, &f.Smallest, &f.Largest) 1176 } 1177 if rangeDelIter == nil { 1178 rangeDelIter = emptyKeyspanIter 1179 } 1180 return rangeDelIter, err 1181 } 1182 1183 iterOpts := IterOptions{logger: c.logger} 1184 // TODO(bananabrick): Get rid of the extra manifest.Level parameter and fold it into 1185 // compactionLevel. 1186 addItersForLevel := func(level *compactionLevel, l manifest.Level) error { 1187 iters = append(iters, newLevelIter(iterOpts, c.cmp, nil /* split */, newIters, 1188 level.files.Iter(), l, &c.bytesIterated)) 1189 // Create a wrapping closure to turn newRangeDelIter into a 1190 // keyspan.TableNewSpanIter, and return a LevelIter that lazily creates 1191 // rangedel iterators. This is safe now that range deletions are truncated 1192 // at file bounds; the merging iterator no longer needs to see all range 1193 // deletes for correctness. 1194 wrapper := func(file *manifest.FileMetadata, iterOptions *keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { 1195 return newRangeDelIter(file, level.files, nil, &c.bytesIterated) 1196 } 1197 li := &keyspan.LevelIter{} 1198 li.Init(keyspan.SpanIterOptions{}, c.cmp, wrapper, level.files.Iter(), l, c.logger, manifest.KeyTypePoint) 1199 rangeDelIters = append(rangeDelIters, li) 1200 // Check if this level has any range keys. 1201 hasRangeKeys := false 1202 iter := level.files.Iter() 1203 for f := iter.First(); f != nil; f = iter.Next() { 1204 if f.HasRangeKeys { 1205 hasRangeKeys = true 1206 break 1207 } 1208 } 1209 if hasRangeKeys { 1210 li := &keyspan.LevelIter{} 1211 newRangeKeyIterWrapper := func(file *manifest.FileMetadata, iterOptions *keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { 1212 iter, err := newRangeKeyIter(file, iterOptions) 1213 if iter != nil { 1214 // Ensure that the range key iter is not closed until the compaction is 1215 // finished. This is necessary because range key processing 1216 // requires the range keys to be held in memory for up to the 1217 // lifetime of the compaction. 1218 c.closers = append(c.closers, iter) 1219 iter = noCloseIter{iter} 1220 1221 // We do not need to truncate range keys to sstable boundaries, or 1222 // only read within the file's atomic compaction units, unlike with 1223 // range tombstones. This is because range keys were added after we 1224 // stopped splitting user keys across sstables, so all the range keys 1225 // in this sstable must wholly lie within the file's bounds. 1226 } 1227 if iter == nil { 1228 iter = emptyKeyspanIter 1229 } 1230 return iter, err 1231 } 1232 li.Init(keyspan.SpanIterOptions{}, c.cmp, newRangeKeyIterWrapper, level.files.Iter(), l, c.logger, manifest.KeyTypeRange) 1233 rangeKeyIters = append(rangeKeyIters, li) 1234 } 1235 return nil 1236 } 1237 1238 if c.startLevel.level != 0 { 1239 if err = addItersForLevel(c.startLevel, manifest.Level(c.startLevel.level)); err != nil { 1240 return nil, err 1241 } 1242 } else { 1243 for _, info := range c.l0SublevelInfo { 1244 if err = addItersForLevel( 1245 &compactionLevel{0, info.LevelSlice}, info.sublevel); err != nil { 1246 return nil, err 1247 } 1248 } 1249 } 1250 if len(c.extraLevels) > 0 { 1251 if err = addItersForLevel(c.extraLevels[0], manifest.Level(c.extraLevels[0].level)); err != nil { 1252 return nil, err 1253 } 1254 } 1255 if err = addItersForLevel(c.outputLevel, manifest.Level(c.outputLevel.level)); err != nil { 1256 return nil, err 1257 } 1258 1259 // Combine all the rangedel iterators using a keyspan.MergingIterator and a 1260 // InternalIteratorShim so that the range deletions may be interleaved in 1261 // the compaction input. 1262 // TODO(jackson): Replace the InternalIteratorShim with an interleaving 1263 // iterator. 1264 if len(rangeDelIters) > 0 { 1265 c.rangeDelIter.Init(c.cmp, rangeDelIters...) 1266 iters = append(iters, &c.rangeDelIter) 1267 } 1268 pointKeyIter := newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...) 1269 if len(rangeKeyIters) > 0 { 1270 mi := &keyspan.MergingIter{} 1271 mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIters...) 1272 di := &keyspan.DefragmentingIter{} 1273 di.Init(c.comparer, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer) 1274 c.rangeKeyInterleaving.Init(c.comparer, pointKeyIter, di, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */) 1275 return &c.rangeKeyInterleaving, nil 1276 } 1277 1278 return pointKeyIter, nil 1279 } 1280 1281 func (c *compaction) String() string { 1282 if len(c.flushing) != 0 { 1283 return "flush\n" 1284 } 1285 1286 var buf bytes.Buffer 1287 for level := c.startLevel.level; level <= c.outputLevel.level; level++ { 1288 i := level - c.startLevel.level 1289 fmt.Fprintf(&buf, "%d:", level) 1290 iter := c.inputs[i].files.Iter() 1291 for f := iter.First(); f != nil; f = iter.Next() { 1292 fmt.Fprintf(&buf, " %s:%s-%s", f.FileNum, f.Smallest, f.Largest) 1293 } 1294 fmt.Fprintf(&buf, "\n") 1295 } 1296 return buf.String() 1297 } 1298 1299 type manualCompaction struct { 1300 // Count of the retries either due to too many concurrent compactions, or a 1301 // concurrent compaction to overlapping levels. 1302 retries int 1303 level int 1304 outputLevel int 1305 done chan error 1306 start []byte 1307 end []byte 1308 split bool 1309 } 1310 1311 type readCompaction struct { 1312 level int 1313 // [start, end] key ranges are used for de-duping. 1314 start []byte 1315 end []byte 1316 1317 // The file associated with the compaction. 1318 // If the file no longer belongs in the same 1319 // level, then we skip the compaction. 1320 fileNum base.FileNum 1321 } 1322 1323 func (d *DB) addInProgressCompaction(c *compaction) { 1324 d.mu.compact.inProgress[c] = struct{}{} 1325 var isBase, isIntraL0 bool 1326 for _, cl := range c.inputs { 1327 iter := cl.files.Iter() 1328 for f := iter.First(); f != nil; f = iter.Next() { 1329 if f.IsCompacting() { 1330 d.opts.Logger.Fatalf("L%d->L%d: %s already being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum) 1331 } 1332 f.SetCompactionState(manifest.CompactionStateCompacting) 1333 if c.startLevel != nil && c.outputLevel != nil && c.startLevel.level == 0 { 1334 if c.outputLevel.level == 0 { 1335 f.IsIntraL0Compacting = true 1336 isIntraL0 = true 1337 } else { 1338 isBase = true 1339 } 1340 } 1341 } 1342 } 1343 1344 if (isIntraL0 || isBase) && c.version.L0Sublevels != nil { 1345 l0Inputs := []manifest.LevelSlice{c.startLevel.files} 1346 if isIntraL0 { 1347 l0Inputs = append(l0Inputs, c.outputLevel.files) 1348 } 1349 if err := c.version.L0Sublevels.UpdateStateForStartedCompaction(l0Inputs, isBase); err != nil { 1350 d.opts.Logger.Fatalf("could not update state for compaction: %s", err) 1351 } 1352 } 1353 1354 if false { 1355 // TODO(peter): Do we want to keep this? It is useful for seeing the 1356 // concurrent compactions/flushes that are taking place. Right now, this 1357 // spams the logs and output to tests. Figure out a way to useful expose 1358 // it. 1359 strs := make([]string, 0, len(d.mu.compact.inProgress)) 1360 for c := range d.mu.compact.inProgress { 1361 var s string 1362 if c.startLevel.level == -1 { 1363 s = fmt.Sprintf("mem->L%d", c.outputLevel.level) 1364 } else { 1365 s = fmt.Sprintf("L%d->L%d:%.1f", c.startLevel.level, c.outputLevel.level, c.score) 1366 } 1367 strs = append(strs, s) 1368 } 1369 // This odd sorting function is intended to sort "mem" before "L*". 1370 sort.Slice(strs, func(i, j int) bool { 1371 if strs[i][0] == strs[j][0] { 1372 return strs[i] < strs[j] 1373 } 1374 return strs[i] > strs[j] 1375 }) 1376 d.opts.Logger.Infof("compactions: %s", strings.Join(strs, " ")) 1377 } 1378 } 1379 1380 // Removes compaction markers from files in a compaction. The rollback parameter 1381 // indicates whether the compaction state should be rolled back to its original 1382 // state in the case of an unsuccessful compaction. 1383 // 1384 // DB.mu must be held when calling this method. All writes to the manifest for 1385 // this compaction should have completed by this point. 1386 func (d *DB) removeInProgressCompaction(c *compaction, rollback bool) { 1387 for _, cl := range c.inputs { 1388 iter := cl.files.Iter() 1389 for f := iter.First(); f != nil; f = iter.Next() { 1390 if !f.IsCompacting() { 1391 d.opts.Logger.Fatalf("L%d->L%d: %s not being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum) 1392 } 1393 if !rollback { 1394 // On success all compactions other than move-compactions transition the 1395 // file into the Compacted state. Move-compacted files become eligible 1396 // for compaction again and transition back to NotCompacting. 1397 if c.kind != compactionKindMove { 1398 f.SetCompactionState(manifest.CompactionStateCompacted) 1399 } else { 1400 f.SetCompactionState(manifest.CompactionStateNotCompacting) 1401 } 1402 } else { 1403 // Else, on rollback, all input files unconditionally transition back to 1404 // NotCompacting. 1405 f.SetCompactionState(manifest.CompactionStateNotCompacting) 1406 } 1407 f.IsIntraL0Compacting = false 1408 } 1409 } 1410 delete(d.mu.compact.inProgress, c) 1411 1412 l0InProgress := inProgressL0Compactions(d.getInProgressCompactionInfoLocked(c)) 1413 d.mu.versions.currentVersion().L0Sublevels.InitCompactingFileInfo(l0InProgress) 1414 } 1415 1416 func (d *DB) calculateDiskAvailableBytes() uint64 { 1417 if space, err := d.opts.FS.GetDiskUsage(d.dirname); err == nil { 1418 atomic.StoreUint64(&d.atomic.diskAvailBytes, space.AvailBytes) 1419 return space.AvailBytes 1420 } else if !errors.Is(err, vfs.ErrUnsupported) { 1421 d.opts.EventListener.BackgroundError(err) 1422 } 1423 return atomic.LoadUint64(&d.atomic.diskAvailBytes) 1424 } 1425 1426 func (d *DB) getDiskAvailableBytesCached() uint64 { 1427 return atomic.LoadUint64(&d.atomic.diskAvailBytes) 1428 } 1429 1430 func (d *DB) getDeletionPacerInfo() deletionPacerInfo { 1431 var pacerInfo deletionPacerInfo 1432 // Call GetDiskUsage after every file deletion. This may seem inefficient, 1433 // but in practice this was observed to take constant time, regardless of 1434 // volume size used, at least on linux with ext4 and zfs. All invocations 1435 // take 10 microseconds or less. 1436 pacerInfo.freeBytes = d.calculateDiskAvailableBytes() 1437 d.mu.Lock() 1438 pacerInfo.obsoleteBytes = d.mu.versions.metrics.Table.ObsoleteSize 1439 pacerInfo.liveBytes = uint64(d.mu.versions.metrics.Total().Size) 1440 d.mu.Unlock() 1441 return pacerInfo 1442 } 1443 1444 // maybeScheduleFlush schedules a flush if necessary. 1445 // 1446 // d.mu must be held when calling this. 1447 func (d *DB) maybeScheduleFlush(needReport bool) { 1448 if d.mu.compact.flushing || d.closed.Load() != nil || d.opts.ReadOnly { 1449 return 1450 } 1451 if len(d.mu.mem.queue) <= 1 { 1452 return 1453 } 1454 1455 if !d.passedFlushThreshold() { 1456 return 1457 } 1458 1459 d.mu.compact.flushing = true 1460 go d.flush(needReport) 1461 } 1462 1463 func (d *DB) passedFlushThreshold() bool { 1464 var n int 1465 var size uint64 1466 for ; n < len(d.mu.mem.queue)-1; n++ { 1467 if !d.mu.mem.queue[n].readyForFlush() { 1468 break 1469 } 1470 if d.mu.mem.queue[n].flushForced { 1471 // A flush was forced. Pretend the memtable size is the configured 1472 // size. See minFlushSize below. 1473 size += uint64(d.opts.MemTableSize) 1474 } else { 1475 size += d.mu.mem.queue[n].totalBytes() 1476 } 1477 } 1478 if n == 0 { 1479 // None of the immutable memtables are ready for flushing. 1480 return false 1481 } 1482 1483 // Only flush once the sum of the queued memtable sizes exceeds half the 1484 // configured memtable size. This prevents flushing of memtables at startup 1485 // while we're undergoing the ramp period on the memtable size. See 1486 // DB.newMemTable(). 1487 minFlushSize := uint64(d.opts.MemTableSize) / 2 1488 return size >= minFlushSize 1489 } 1490 1491 func (d *DB) maybeScheduleDelayedFlush(tbl *memTable, dur time.Duration) { 1492 var mem *flushableEntry 1493 for _, m := range d.mu.mem.queue { 1494 if m.flushable == tbl { 1495 mem = m 1496 break 1497 } 1498 } 1499 if mem == nil || mem.flushForced { 1500 return 1501 } 1502 deadline := d.timeNow().Add(dur) 1503 if !mem.delayedFlushForcedAt.IsZero() && deadline.After(mem.delayedFlushForcedAt) { 1504 // Already scheduled to flush sooner than within `dur`. 1505 return 1506 } 1507 mem.delayedFlushForcedAt = deadline 1508 go func() { 1509 timer := time.NewTimer(dur) 1510 defer timer.Stop() 1511 1512 select { 1513 case <-d.closedCh: 1514 return 1515 case <-mem.flushed: 1516 return 1517 case <-timer.C: 1518 d.commit.mu.Lock() 1519 defer d.commit.mu.Unlock() 1520 d.mu.Lock() 1521 defer d.mu.Unlock() 1522 1523 // NB: The timer may fire concurrently with a call to Close. If a 1524 // Close call beat us to acquiring d.mu, d.closed holds ErrClosed, 1525 // and it's too late to flush anything. Otherwise, the Close call 1526 // will block on locking d.mu until we've finished scheduling the 1527 // flush and set `d.mu.compact.flushing` to true. Close will wait 1528 // for the current flush to complete. 1529 if d.closed.Load() != nil { 1530 return 1531 } 1532 1533 if d.mu.mem.mutable == tbl { 1534 d.makeRoomForWrite(nil, true) 1535 } else { 1536 mem.flushForced = true 1537 d.maybeScheduleFlush(true) 1538 } 1539 } 1540 }() 1541 } 1542 1543 func (d *DB) flush(needReport bool) { 1544 pprof.Do(context.Background(), flushLabels, func(context.Context) { 1545 flushingWorkStart := time.Now() 1546 d.mu.Lock() 1547 defer d.mu.Unlock() 1548 idleDuration := flushingWorkStart.Sub(d.mu.compact.noOngoingFlushStartTime) 1549 var bytesFlushed uint64 1550 var err error 1551 if bytesFlushed, err = d.flush1(needReport); err != nil { 1552 // TODO(peter): count consecutive flush errors and backoff. 1553 d.opts.EventListener.BackgroundError(err) 1554 } 1555 d.mu.compact.flushing = false 1556 d.mu.compact.noOngoingFlushStartTime = time.Now() 1557 workDuration := d.mu.compact.noOngoingFlushStartTime.Sub(flushingWorkStart) 1558 d.mu.compact.flushWriteThroughput.Bytes += int64(bytesFlushed) 1559 d.mu.compact.flushWriteThroughput.WorkDuration += workDuration 1560 d.mu.compact.flushWriteThroughput.IdleDuration += idleDuration 1561 // More flush work may have arrived while we were flushing, so schedule 1562 // another flush if needed. 1563 d.maybeScheduleFlush(true) 1564 // The flush may have produced too many files in a level, so schedule a 1565 // compaction if needed. 1566 d.maybeScheduleCompaction() 1567 d.mu.compact.cond.Broadcast() 1568 }) 1569 } 1570 1571 // flush runs a compaction that copies the immutable memtables from memory to 1572 // disk. 1573 // 1574 // d.mu must be held when calling this, but the mutex may be dropped and 1575 // re-acquired during the course of this method. 1576 func (d *DB) flush1(needReport bool) (bytesFlushed uint64, err error) { 1577 var n int 1578 for ; n < len(d.mu.mem.queue)-1; n++ { 1579 if !d.mu.mem.queue[n].readyForFlush() { 1580 break 1581 } 1582 } 1583 if n == 0 { 1584 // None of the immutable memtables are ready for flushing. 1585 return 0, nil 1586 } 1587 1588 // Require that every memtable being flushed has a log number less than the 1589 // new minimum unflushed log number. 1590 minUnflushedLogNum := d.mu.mem.queue[n].logNum 1591 if !d.opts.DisableWAL { 1592 for i := 0; i < n; i++ { 1593 logNum := d.mu.mem.queue[i].logNum 1594 if logNum >= minUnflushedLogNum { 1595 return 0, errFlushInvariant 1596 } 1597 } 1598 } 1599 1600 if needReport && d.opts.FlushReporter != nil { 1601 d.opts.FlushReporter(d.opts.Id) 1602 } 1603 1604 c := newFlush(d.opts, d.mu.versions.currentVersion(), 1605 d.mu.versions.picker.getBaseLevel(), d.mu.mem.queue[:n]) 1606 d.addInProgressCompaction(c) 1607 1608 jobID := d.mu.nextJobID 1609 d.mu.nextJobID++ 1610 d.opts.EventListener.FlushBegin(FlushInfo{ 1611 JobID: jobID, 1612 Input: n, 1613 }) 1614 startTime := d.timeNow() 1615 1616 ve, pendingOutputs, err := d.runCompaction(jobID, c) 1617 1618 info := FlushInfo{ 1619 JobID: jobID, 1620 Input: n, 1621 Duration: d.timeNow().Sub(startTime), 1622 Done: true, 1623 Err: err, 1624 } 1625 if err == nil { 1626 for i := range ve.NewFiles { 1627 e := &ve.NewFiles[i] 1628 info.Output = append(info.Output, e.Meta.TableInfo()) 1629 } 1630 if len(ve.NewFiles) == 0 { 1631 info.Err = errEmptyTable 1632 } 1633 1634 // The flush succeeded or it produced an empty sstable. In either case we 1635 // want to bump the minimum unflushed log number to the log number of the 1636 // oldest unflushed memtable. 1637 ve.MinUnflushedLogNum = minUnflushedLogNum 1638 metrics := c.metrics[0] 1639 for i := 0; i < n; i++ { 1640 metrics.BytesIn += d.mu.mem.queue[i].logSize 1641 } 1642 1643 d.mu.versions.logLock() 1644 err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false, /* forceRotation */ 1645 func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) }) 1646 if err != nil { 1647 info.Err = err 1648 // TODO(peter): untested. 1649 d.mu.versions.obsoleteTables = append(d.mu.versions.obsoleteTables, pendingOutputs...) 1650 d.mu.versions.incrementObsoleteTablesLocked(pendingOutputs) 1651 } 1652 } 1653 1654 bytesFlushed = c.bytesIterated 1655 d.maybeUpdateDeleteCompactionHints(c) 1656 d.removeInProgressCompaction(c, err != nil) 1657 d.mu.versions.incrementCompactions(c.kind, c.extraLevels) 1658 d.mu.versions.incrementCompactionBytes(-c.bytesWritten) 1659 1660 var flushed flushableList 1661 if err == nil { 1662 flushed = d.mu.mem.queue[:n] 1663 d.mu.mem.queue = d.mu.mem.queue[n:] 1664 d.updateReadStateLocked(d.opts.DebugCheck) 1665 d.updateTableStatsLocked(ve.NewFiles) 1666 } 1667 // Signal FlushEnd after installing the new readState. This helps for unit 1668 // tests that use the callback to trigger a read using an iterator with 1669 // IterOptions.OnlyReadGuaranteedDurable. 1670 info.TotalDuration = d.timeNow().Sub(startTime) 1671 d.opts.EventListener.FlushEnd(info) 1672 1673 d.deleteObsoleteFiles(jobID, false /* waitForOngoing */) 1674 1675 // Mark all the memtables we flushed as flushed. Note that we do this last so 1676 // that a synchronous call to DB.Flush() will not return until the deletion 1677 // of obsolete files from this job have completed. This makes testing easier 1678 // and provides similar behavior to manual compactions where the compaction 1679 // is not marked as completed until the deletion of obsolete files job has 1680 // completed. 1681 for i := range flushed { 1682 // The order of these operations matters here for ease of testing. Removing 1683 // the reader reference first allows tests to be guaranteed that the 1684 // memtable reservation has been released by the time a synchronous flush 1685 // returns. 1686 flushed[i].readerUnref() 1687 close(flushed[i].flushed) 1688 } 1689 return bytesFlushed, err 1690 } 1691 1692 // maybeScheduleCompactionAsync should be used when 1693 // we want to possibly schedule a compaction, but don't 1694 // want to eat the cost of running maybeScheduleCompaction. 1695 // This method should be launched in a separate goroutine. 1696 // d.mu must not be held when this is called. 1697 func (d *DB) maybeScheduleCompactionAsync() { 1698 defer d.compactionSchedulers.Done() 1699 1700 d.mu.Lock() 1701 d.maybeScheduleCompaction() 1702 d.mu.Unlock() 1703 } 1704 1705 // maybeScheduleCompaction schedules a compaction if necessary. 1706 // 1707 // d.mu must be held when calling this. 1708 func (d *DB) maybeScheduleCompaction() { 1709 d.maybeScheduleCompactionPicker(pickAuto) 1710 } 1711 1712 func pickAuto(picker compactionPicker, env compactionEnv) *pickedCompaction { 1713 return picker.pickAuto(env) 1714 } 1715 1716 func pickElisionOnly(picker compactionPicker, env compactionEnv) *pickedCompaction { 1717 return picker.pickElisionOnlyCompaction(env) 1718 } 1719 1720 // maybeScheduleCompactionPicker schedules a compaction if necessary, 1721 // calling `pickFunc` to pick automatic compactions. 1722 // 1723 // d.mu must be held when calling this. 1724 func (d *DB) maybeScheduleCompactionPicker( 1725 pickFunc func(compactionPicker, compactionEnv) *pickedCompaction, 1726 ) { 1727 if d.closed.Load() != nil || d.opts.ReadOnly { 1728 return 1729 } 1730 maxConcurrentCompactions := d.opts.MaxConcurrentCompactions() 1731 if d.mu.compact.compactingCount >= maxConcurrentCompactions { 1732 if len(d.mu.compact.manual) > 0 { 1733 // Inability to run head blocks later manual compactions. 1734 d.mu.compact.manual[0].retries++ 1735 } 1736 return 1737 } 1738 1739 // Compaction picking needs a coherent view of a Version. In particular, we 1740 // need to exlude concurrent ingestions from making a decision on which level 1741 // to ingest into that conflicts with our compaction 1742 // decision. versionSet.logLock provides the necessary mutual exclusion. 1743 d.mu.versions.logLock() 1744 defer d.mu.versions.logUnlock() 1745 1746 // Check for the closed flag again, in case the DB was closed while we were 1747 // waiting for logLock(). 1748 if d.closed.Load() != nil { 1749 return 1750 } 1751 1752 env := compactionEnv{ 1753 earliestSnapshotSeqNum: d.mu.snapshots.earliest(), 1754 earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(), 1755 } 1756 1757 // Check for delete-only compactions first, because they're expected to be 1758 // cheap and reduce future compaction work. 1759 if len(d.mu.compact.deletionHints) > 0 && 1760 d.mu.compact.compactingCount < maxConcurrentCompactions && 1761 !d.opts.DisableAutomaticCompactions { 1762 v := d.mu.versions.currentVersion() 1763 snapshots := d.mu.snapshots.toSlice() 1764 inputs, unresolvedHints := checkDeleteCompactionHints(d.cmp, v, d.mu.compact.deletionHints, snapshots) 1765 d.mu.compact.deletionHints = unresolvedHints 1766 1767 if len(inputs) > 0 { 1768 c := newDeleteOnlyCompaction(d.opts, v, inputs) 1769 d.mu.compact.compactingCount++ 1770 d.addInProgressCompaction(c) 1771 go d.compact(c, nil) 1772 } 1773 } 1774 1775 for len(d.mu.compact.manual) > 0 && d.mu.compact.compactingCount < maxConcurrentCompactions { 1776 manual := d.mu.compact.manual[0] 1777 env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) 1778 pc, retryLater := d.mu.versions.picker.pickManual(env, manual) 1779 if pc != nil { 1780 c := newCompaction(pc, d.opts) 1781 d.mu.compact.manual = d.mu.compact.manual[1:] 1782 d.mu.compact.compactingCount++ 1783 d.addInProgressCompaction(c) 1784 go d.compact(c, manual.done) 1785 } else if !retryLater { 1786 // Noop 1787 d.mu.compact.manual = d.mu.compact.manual[1:] 1788 manual.done <- nil 1789 } else { 1790 // Inability to run head blocks later manual compactions. 1791 manual.retries++ 1792 break 1793 } 1794 } 1795 1796 for !d.opts.DisableAutomaticCompactions && d.mu.compact.compactingCount < maxConcurrentCompactions { 1797 env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) 1798 env.readCompactionEnv = readCompactionEnv{ 1799 readCompactions: &d.mu.compact.readCompactions, 1800 flushing: d.mu.compact.flushing || d.passedFlushThreshold(), 1801 rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction, 1802 } 1803 pc := pickFunc(d.mu.versions.picker, env) 1804 if pc == nil { 1805 break 1806 } 1807 c := newCompaction(pc, d.opts) 1808 d.mu.compact.compactingCount++ 1809 d.addInProgressCompaction(c) 1810 go d.compact(c, nil) 1811 } 1812 } 1813 1814 // deleteCompactionHintType indicates whether the deleteCompactionHint was 1815 // generated from a span containing a range del (point key only), a range key 1816 // delete (range key only), or both a point and range key. 1817 type deleteCompactionHintType uint8 1818 1819 const ( 1820 // NOTE: While these are primarily used as enumeration types, they are also 1821 // used for some bitwise operations. Care should be taken when updating. 1822 deleteCompactionHintTypeUnknown deleteCompactionHintType = iota 1823 deleteCompactionHintTypePointKeyOnly 1824 deleteCompactionHintTypeRangeKeyOnly 1825 deleteCompactionHintTypePointAndRangeKey 1826 ) 1827 1828 // String implements fmt.Stringer. 1829 func (h deleteCompactionHintType) String() string { 1830 switch h { 1831 case deleteCompactionHintTypeUnknown: 1832 return "unknown" 1833 case deleteCompactionHintTypePointKeyOnly: 1834 return "point-key-only" 1835 case deleteCompactionHintTypeRangeKeyOnly: 1836 return "range-key-only" 1837 case deleteCompactionHintTypePointAndRangeKey: 1838 return "point-and-range-key" 1839 default: 1840 panic(fmt.Sprintf("unknown hint type: %d", h)) 1841 } 1842 } 1843 1844 // compactionHintFromKeys returns a deleteCompactionHintType given a slice of 1845 // keyspan.Keys. 1846 func compactionHintFromKeys(keys []keyspan.Key) deleteCompactionHintType { 1847 var hintType deleteCompactionHintType 1848 for _, k := range keys { 1849 switch k.Kind() { 1850 case base.InternalKeyKindRangeDelete: 1851 hintType |= deleteCompactionHintTypePointKeyOnly 1852 case base.InternalKeyKindRangeKeyDelete: 1853 hintType |= deleteCompactionHintTypeRangeKeyOnly 1854 default: 1855 panic(fmt.Sprintf("unsupported key kind: %s", k.Kind())) 1856 } 1857 } 1858 return hintType 1859 } 1860 1861 // A deleteCompactionHint records a user key and sequence number span that has been 1862 // deleted by a range tombstone. A hint is recorded if at least one sstable 1863 // falls completely within both the user key and sequence number spans. 1864 // Once the tombstones and the observed completely-contained sstables fall 1865 // into the same snapshot stripe, a delete-only compaction may delete any 1866 // sstables within the range. 1867 type deleteCompactionHint struct { 1868 // The type of key span that generated this hint (point key, range key, or 1869 // both). 1870 hintType deleteCompactionHintType 1871 // start and end are user keys specifying a key range [start, end) of 1872 // deleted keys. 1873 start []byte 1874 end []byte 1875 // The level of the file containing the range tombstone(s) when the hint 1876 // was created. Only lower levels need to be searched for files that may 1877 // be deleted. 1878 tombstoneLevel int 1879 // The file containing the range tombstone(s) that created the hint. 1880 tombstoneFile *fileMetadata 1881 // The smallest and largest sequence numbers of the abutting tombstones 1882 // merged to form this hint. All of a tables' keys must be less than the 1883 // tombstone smallest sequence number to be deleted. All of a tables' 1884 // sequence numbers must fall into the same snapshot stripe as the 1885 // tombstone largest sequence number to be deleted. 1886 tombstoneLargestSeqNum uint64 1887 tombstoneSmallestSeqNum uint64 1888 // The smallest sequence number of a sstable that was found to be covered 1889 // by this hint. The hint cannot be resolved until this sequence number is 1890 // in the same snapshot stripe as the largest tombstone sequence number. 1891 // This is set when a hint is created, so the LSM may look different and 1892 // notably no longer contain the sstable that contained the key at this 1893 // sequence number. 1894 fileSmallestSeqNum uint64 1895 } 1896 1897 func (h deleteCompactionHint) String() string { 1898 return fmt.Sprintf( 1899 "L%d.%s %s-%s seqnums(tombstone=%d-%d, file-smallest=%d, type=%s)", 1900 h.tombstoneLevel, h.tombstoneFile.FileNum, h.start, h.end, 1901 h.tombstoneSmallestSeqNum, h.tombstoneLargestSeqNum, h.fileSmallestSeqNum, 1902 h.hintType, 1903 ) 1904 } 1905 1906 func (h *deleteCompactionHint) canDelete(cmp Compare, m *fileMetadata, snapshots []uint64) bool { 1907 // The file can only be deleted if all of its keys are older than the 1908 // earliest tombstone aggregated into the hint. 1909 if m.LargestSeqNum >= h.tombstoneSmallestSeqNum || m.SmallestSeqNum < h.fileSmallestSeqNum { 1910 return false 1911 } 1912 1913 // The file's oldest key must be in the same snapshot stripe as the 1914 // newest tombstone. NB: We already checked the hint's sequence numbers, 1915 // but this file's oldest sequence number might be lower than the hint's 1916 // smallest sequence number despite the file falling within the key range 1917 // if this file was constructed after the hint by a compaction. 1918 ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots) 1919 fi, _ := snapshotIndex(m.SmallestSeqNum, snapshots) 1920 if ti != fi { 1921 return false 1922 } 1923 1924 switch h.hintType { 1925 case deleteCompactionHintTypePointKeyOnly: 1926 // A hint generated by a range del span cannot delete tables that contain 1927 // range keys. 1928 if m.HasRangeKeys { 1929 return false 1930 } 1931 case deleteCompactionHintTypeRangeKeyOnly: 1932 // A hint generated by a range key del span cannot delete tables that 1933 // contain point keys. 1934 if m.HasPointKeys { 1935 return false 1936 } 1937 case deleteCompactionHintTypePointAndRangeKey: 1938 // A hint from a span that contains both range dels *and* range keys can 1939 // only be deleted if both bounds fall within the hint. The next check takes 1940 // care of this. 1941 default: 1942 panic(fmt.Sprintf("bitalostable: unknown delete compaction hint type: %d", h.hintType)) 1943 } 1944 1945 // The file's keys must be completely contained within the hint range. 1946 return cmp(h.start, m.Smallest.UserKey) <= 0 && cmp(m.Largest.UserKey, h.end) < 0 1947 } 1948 1949 func (d *DB) maybeUpdateDeleteCompactionHints(c *compaction) { 1950 // Compactions that zero sequence numbers can interfere with compaction 1951 // deletion hints. Deletion hints apply to tables containing keys older 1952 // than a threshold. If a key more recent than the threshold is zeroed in 1953 // a compaction, a delete-only compaction may mistake it as meeting the 1954 // threshold and drop a table containing live data. 1955 // 1956 // To avoid this scenario, compactions that zero sequence numbers remove 1957 // any conflicting deletion hints. A deletion hint is conflicting if both 1958 // of the following conditions apply: 1959 // * its key space overlaps with the compaction 1960 // * at least one of its inputs contains a key as recent as one of the 1961 // hint's tombstones. 1962 // 1963 if !c.allowedZeroSeqNum { 1964 return 1965 } 1966 1967 updatedHints := d.mu.compact.deletionHints[:0] 1968 for _, h := range d.mu.compact.deletionHints { 1969 // If the compaction's key space is disjoint from the hint's key 1970 // space, the zeroing of sequence numbers won't affect the hint. Keep 1971 // the hint. 1972 keysDisjoint := d.cmp(h.end, c.smallest.UserKey) < 0 || d.cmp(h.start, c.largest.UserKey) > 0 1973 if keysDisjoint { 1974 updatedHints = append(updatedHints, h) 1975 continue 1976 } 1977 1978 // All of the compaction's inputs must be older than the hint's 1979 // tombstones. 1980 inputsOlder := true 1981 for _, in := range c.inputs { 1982 iter := in.files.Iter() 1983 for f := iter.First(); f != nil; f = iter.Next() { 1984 inputsOlder = inputsOlder && f.LargestSeqNum < h.tombstoneSmallestSeqNum 1985 } 1986 } 1987 if inputsOlder { 1988 updatedHints = append(updatedHints, h) 1989 continue 1990 } 1991 1992 // Drop h, because the compaction c may have zeroed sequence numbers 1993 // of keys more recent than some of h's tombstones. 1994 } 1995 d.mu.compact.deletionHints = updatedHints 1996 } 1997 1998 func checkDeleteCompactionHints( 1999 cmp Compare, v *version, hints []deleteCompactionHint, snapshots []uint64, 2000 ) ([]compactionLevel, []deleteCompactionHint) { 2001 var files map[*fileMetadata]bool 2002 var byLevel [numLevels][]*fileMetadata 2003 2004 unresolvedHints := hints[:0] 2005 for _, h := range hints { 2006 // Check each compaction hint to see if it's resolvable. Resolvable 2007 // hints are removed and trigger a delete-only compaction if any files 2008 // in the current LSM still meet their criteria. Unresolvable hints 2009 // are saved and don't trigger a delete-only compaction. 2010 // 2011 // When a compaction hint is created, the sequence numbers of the 2012 // range tombstones and the covered file with the oldest key are 2013 // recorded. The largest tombstone sequence number and the smallest 2014 // file sequence number must be in the same snapshot stripe for the 2015 // hint to be resolved. The below graphic models a compaction hint 2016 // covering the keyspace [b, r). The hint completely contains two 2017 // files, 000002 and 000003. The file 000003 contains the lowest 2018 // covered sequence number at #90. The tombstone b.RANGEDEL.230:h has 2019 // the highest tombstone sequence number incorporated into the hint. 2020 // The hint may be resolved only once the snapshots at #100, #180 and 2021 // #210 are all closed. File 000001 is not included within the hint 2022 // because it extends beyond the range tombstones in user key space. 2023 // 2024 // 250 2025 // 2026 // |-b...230:h-| 2027 // _____________________________________________________ snapshot #210 2028 // 200 |--h.RANGEDEL.200:r--| 2029 // 2030 // _____________________________________________________ snapshot #180 2031 // 2032 // 150 +--------+ 2033 // +---------+ | 000003 | 2034 // | 000002 | | | 2035 // +_________+ | | 2036 // 100_____________________|________|___________________ snapshot #100 2037 // +--------+ 2038 // _____________________________________________________ snapshot #70 2039 // +---------------+ 2040 // 50 | 000001 | 2041 // | | 2042 // +---------------+ 2043 // ______________________________________________________________ 2044 // a b c d e f g h i j k l m n o p q r s t u v w x y z 2045 2046 ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots) 2047 fi, _ := snapshotIndex(h.fileSmallestSeqNum, snapshots) 2048 if ti != fi { 2049 // Cannot resolve yet. 2050 unresolvedHints = append(unresolvedHints, h) 2051 continue 2052 } 2053 2054 // The hint h will be resolved and dropped, regardless of whether 2055 // there are any tables that can be deleted. 2056 for l := h.tombstoneLevel + 1; l < numLevels; l++ { 2057 overlaps := v.Overlaps(l, cmp, h.start, h.end, true /* exclusiveEnd */) 2058 iter := overlaps.Iter() 2059 for m := iter.First(); m != nil; m = iter.Next() { 2060 if m.IsCompacting() || !h.canDelete(cmp, m, snapshots) || files[m] { 2061 continue 2062 } 2063 if files == nil { 2064 // Construct files lazily, assuming most calls will not 2065 // produce delete-only compactions. 2066 files = make(map[*fileMetadata]bool) 2067 } 2068 files[m] = true 2069 byLevel[l] = append(byLevel[l], m) 2070 } 2071 } 2072 } 2073 2074 var compactLevels []compactionLevel 2075 for l, files := range byLevel { 2076 if len(files) == 0 { 2077 continue 2078 } 2079 compactLevels = append(compactLevels, compactionLevel{ 2080 level: l, 2081 files: manifest.NewLevelSliceKeySorted(cmp, files), 2082 }) 2083 } 2084 return compactLevels, unresolvedHints 2085 } 2086 2087 // compact runs one compaction and maybe schedules another call to compact. 2088 func (d *DB) compact(c *compaction, errChannel chan error) { 2089 pprof.Do(context.Background(), compactLabels, func(context.Context) { 2090 d.mu.Lock() 2091 defer d.mu.Unlock() 2092 if err := d.compact1(c, errChannel); err != nil { 2093 // TODO(peter): count consecutive compaction errors and backoff. 2094 d.opts.EventListener.BackgroundError(err) 2095 } 2096 d.mu.compact.compactingCount-- 2097 // The previous compaction may have produced too many files in a 2098 // level, so reschedule another compaction if needed. 2099 d.maybeScheduleCompaction() 2100 d.mu.compact.cond.Broadcast() 2101 }) 2102 } 2103 2104 // compact1 runs one compaction. 2105 // 2106 // d.mu must be held when calling this, but the mutex may be dropped and 2107 // re-acquired during the course of this method. 2108 func (d *DB) compact1(c *compaction, errChannel chan error) (err error) { 2109 if errChannel != nil { 2110 defer func() { 2111 errChannel <- err 2112 }() 2113 } 2114 2115 jobID := d.mu.nextJobID 2116 d.mu.nextJobID++ 2117 info := c.makeInfo(jobID) 2118 d.opts.EventListener.CompactionBegin(info) 2119 startTime := d.timeNow() 2120 2121 ve, pendingOutputs, err := d.runCompaction(jobID, c) 2122 2123 info.Duration = d.timeNow().Sub(startTime) 2124 if err == nil { 2125 d.mu.versions.logLock() 2126 err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false /* forceRotation */, func() []compactionInfo { 2127 return d.getInProgressCompactionInfoLocked(c) 2128 }) 2129 if err != nil { 2130 // TODO(peter): untested. 2131 d.mu.versions.obsoleteTables = append(d.mu.versions.obsoleteTables, pendingOutputs...) 2132 d.mu.versions.incrementObsoleteTablesLocked(pendingOutputs) 2133 } 2134 } 2135 2136 info.Done = true 2137 info.Err = err 2138 if err == nil { 2139 for i := range ve.NewFiles { 2140 e := &ve.NewFiles[i] 2141 info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo()) 2142 } 2143 } 2144 2145 d.maybeUpdateDeleteCompactionHints(c) 2146 d.removeInProgressCompaction(c, err != nil) 2147 d.mu.versions.incrementCompactions(c.kind, c.extraLevels) 2148 d.mu.versions.incrementCompactionBytes(-c.bytesWritten) 2149 2150 info.TotalDuration = d.timeNow().Sub(startTime) 2151 d.opts.EventListener.CompactionEnd(info) 2152 2153 // Update the read state before deleting obsolete files because the 2154 // read-state update will cause the previous version to be unref'd and if 2155 // there are no references obsolete tables will be added to the obsolete 2156 // table list. 2157 if err == nil { 2158 d.updateReadStateLocked(d.opts.DebugCheck) 2159 d.updateTableStatsLocked(ve.NewFiles) 2160 } 2161 d.deleteObsoleteFiles(jobID, true /* waitForOngoing */) 2162 2163 return err 2164 } 2165 2166 // runCompactions runs a compaction that produces new on-disk tables from 2167 // memtables or old on-disk tables. 2168 // 2169 // d.mu must be held when calling this, but the mutex may be dropped and 2170 // re-acquired during the course of this method. 2171 func (d *DB) runCompaction( 2172 jobID int, c *compaction, 2173 ) (ve *versionEdit, pendingOutputs []*fileMetadata, retErr error) { 2174 // As a sanity check, confirm that the smallest / largest keys for new and 2175 // deleted files in the new versionEdit pass a validation function before 2176 // returning the edit. 2177 defer func() { 2178 if ve != nil { 2179 err := validateVersionEdit(ve, d.opts.Experimental.KeyValidationFunc, d.opts.Comparer.FormatKey) 2180 if err != nil { 2181 d.opts.Logger.Fatalf("bitalostable: version edit validation failed: %s", err) 2182 } 2183 } 2184 }() 2185 2186 // Check for a delete-only compaction. This can occur when wide range 2187 // tombstones completely contain sstables. 2188 if c.kind == compactionKindDeleteOnly { 2189 c.metrics = make(map[int]*LevelMetrics, len(c.inputs)) 2190 ve := &versionEdit{ 2191 DeletedFiles: map[deletedFileEntry]*fileMetadata{}, 2192 } 2193 for _, cl := range c.inputs { 2194 levelMetrics := &LevelMetrics{} 2195 iter := cl.files.Iter() 2196 for f := iter.First(); f != nil; f = iter.Next() { 2197 levelMetrics.NumFiles-- 2198 levelMetrics.Size -= int64(f.Size) 2199 ve.DeletedFiles[deletedFileEntry{ 2200 Level: cl.level, 2201 FileNum: f.FileNum, 2202 }] = f 2203 } 2204 c.metrics[cl.level] = levelMetrics 2205 } 2206 return ve, nil, nil 2207 } 2208 2209 // Check for a trivial move of one table from one level to the next. We avoid 2210 // such a move if there is lots of overlapping grandparent data. Otherwise, 2211 // the move could create a parent file that will require a very expensive 2212 // merge later on. 2213 if c.kind == compactionKindMove { 2214 iter := c.startLevel.files.Iter() 2215 meta := iter.First() 2216 c.metrics = map[int]*LevelMetrics{ 2217 c.startLevel.level: { 2218 NumFiles: -1, 2219 Size: -int64(meta.Size), 2220 }, 2221 c.outputLevel.level: { 2222 NumFiles: 1, 2223 Size: int64(meta.Size), 2224 BytesMoved: meta.Size, 2225 TablesMoved: 1, 2226 }, 2227 } 2228 ve := &versionEdit{ 2229 DeletedFiles: map[deletedFileEntry]*fileMetadata{ 2230 {Level: c.startLevel.level, FileNum: meta.FileNum}: meta, 2231 }, 2232 NewFiles: []newFileEntry{ 2233 {Level: c.outputLevel.level, Meta: meta}, 2234 }, 2235 } 2236 return ve, nil, nil 2237 } 2238 2239 defer func() { 2240 if retErr != nil { 2241 pendingOutputs = nil 2242 } 2243 }() 2244 2245 snapshots := d.mu.snapshots.toSlice() 2246 formatVers := d.mu.formatVers.vers 2247 // The table is written at the maximum allowable format implied by the current 2248 // format major version of the DB. 2249 tableFormat := formatVers.MaxTableFormat() 2250 2251 // Release the d.mu lock while doing I/O. 2252 // Note the unusual order: Unlock and then Lock. 2253 d.mu.Unlock() 2254 defer d.mu.Lock() 2255 2256 iiter, err := c.newInputIter(d.newIters, d.tableNewRangeKeyIter, snapshots) 2257 if err != nil { 2258 return nil, pendingOutputs, err 2259 } 2260 c.allowedZeroSeqNum = c.allowZeroSeqNum() 2261 iter := newCompactionIter(c.cmp, c.equal, c.formatKey, d.merge, iiter, snapshots, 2262 &c.rangeDelFrag, &c.rangeKeyFrag, c.allowedZeroSeqNum, c.elideTombstone, 2263 c.elideRangeTombstone, d.FormatMajorVersion()) 2264 2265 var ( 2266 filenames []string 2267 tw *sstable.Writer 2268 ) 2269 defer func() { 2270 if iter != nil { 2271 retErr = firstError(retErr, iter.Close()) 2272 } 2273 if tw != nil { 2274 retErr = firstError(retErr, tw.Close()) 2275 } 2276 if retErr != nil { 2277 for _, filename := range filenames { 2278 d.opts.FS.Remove(filename) 2279 } 2280 } 2281 for _, closer := range c.closers { 2282 retErr = firstError(retErr, closer.Close()) 2283 } 2284 }() 2285 2286 ve = &versionEdit{ 2287 DeletedFiles: map[deletedFileEntry]*fileMetadata{}, 2288 } 2289 2290 outputMetrics := &LevelMetrics{ 2291 BytesIn: c.startLevel.files.SizeSum(), 2292 BytesRead: c.outputLevel.files.SizeSum(), 2293 } 2294 if len(c.extraLevels) > 0 { 2295 outputMetrics.BytesIn += c.extraLevels[0].files.SizeSum() 2296 } 2297 outputMetrics.BytesRead += outputMetrics.BytesIn 2298 2299 c.metrics = map[int]*LevelMetrics{ 2300 c.outputLevel.level: outputMetrics, 2301 } 2302 if len(c.flushing) == 0 && c.metrics[c.startLevel.level] == nil { 2303 c.metrics[c.startLevel.level] = &LevelMetrics{} 2304 } 2305 if len(c.extraLevels) > 0 { 2306 c.metrics[c.extraLevels[0].level] = &LevelMetrics{} 2307 } 2308 2309 writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat) 2310 if formatVers < FormatBlockPropertyCollector { 2311 // Cannot yet write block properties. 2312 writerOpts.BlockPropertyCollectors = nil 2313 } 2314 2315 // prevPointKey is a sstable.WriterOption that provides access to 2316 // the last point key written to a writer's sstable. When a new 2317 // output begins in newOutput, prevPointKey is updated to point to 2318 // the new output's sstable.Writer. This allows the compaction loop 2319 // to access the last written point key without requiring the 2320 // compaction loop to make a copy of each key ahead of time. Users 2321 // must be careful, because the byte slice returned by UnsafeKey 2322 // points directly into the Writer's block buffer. 2323 var prevPointKey sstable.PreviousPointKeyOpt 2324 var additionalCPUProcs int 2325 defer func() { 2326 if additionalCPUProcs > 0 { 2327 d.opts.Experimental.CPUWorkPermissionGranter.ReturnProcs(additionalCPUProcs) 2328 } 2329 }() 2330 2331 newOutput := func() error { 2332 fileMeta := &fileMetadata{} 2333 d.mu.Lock() 2334 fileNum := d.mu.versions.getNextFileNum() 2335 fileMeta.FileNum = fileNum 2336 pendingOutputs = append(pendingOutputs, fileMeta) 2337 d.mu.Unlock() 2338 2339 filename := base.MakeFilepath(d.opts.FS, d.dirname, fileTypeTable, fileNum) 2340 file, err := d.opts.FS.Create(filename) 2341 if err != nil { 2342 return err 2343 } 2344 reason := "flushing" 2345 if c.flushing == nil { 2346 reason = "compacting" 2347 } 2348 d.opts.EventListener.TableCreated(TableCreateInfo{ 2349 JobID: jobID, 2350 Reason: reason, 2351 Path: filename, 2352 FileNum: fileNum, 2353 }) 2354 file = vfs.NewSyncingFile(file, vfs.SyncingFileOptions{ 2355 NoSyncOnClose: d.opts.NoSyncOnClose, 2356 BytesPerSync: d.opts.BytesPerSync, 2357 }) 2358 file = &compactionFile{ 2359 File: file, 2360 versions: d.mu.versions, 2361 written: &c.bytesWritten, 2362 } 2363 filenames = append(filenames, filename) 2364 cacheOpts := private.SSTableCacheOpts(d.cacheID, fileNum).(sstable.WriterOption) 2365 internalTableOpt := private.SSTableInternalTableOpt.(sstable.WriterOption) 2366 if d.opts.Experimental.CPUWorkPermissionGranter != nil { 2367 additionalCPUProcs = d.opts.Experimental.CPUWorkPermissionGranter.TryGetProcs(1) 2368 } 2369 writerOpts.Parallelism = 2370 d.opts.Experimental.MaxWriterConcurrency > 0 && 2371 (additionalCPUProcs > 0 || d.opts.Experimental.ForceWriterParallelism) 2372 tw = sstable.NewWriter(file, writerOpts, cacheOpts, internalTableOpt, &prevPointKey) 2373 2374 fileMeta.CreationTime = time.Now().Unix() 2375 ve.NewFiles = append(ve.NewFiles, newFileEntry{ 2376 Level: c.outputLevel.level, 2377 Meta: fileMeta, 2378 }) 2379 return nil 2380 } 2381 2382 // splitL0Outputs is true during flushes and intra-L0 compactions with flush 2383 // splits enabled. 2384 splitL0Outputs := c.outputLevel.level == 0 && d.opts.FlushSplitBytes > 0 2385 2386 // finishOutput is called with the a user key up to which all tombstones 2387 // should be flushed. Typically, this is the first key of the next 2388 // sstable or an empty key if this output is the final sstable. 2389 finishOutput := func(splitKey []byte) error { 2390 // If we haven't output any point records to the sstable (tw == nil) then the 2391 // sstable will only contain range tombstones and/or range keys. The smallest 2392 // key in the sstable will be the start key of the first range tombstone or 2393 // range key added. We need to ensure that this start key is distinct from 2394 // the splitKey passed to finishOutput (if set), otherwise we would generate 2395 // an sstable where the largest key is smaller than the smallest key due to 2396 // how the largest key boundary is set below. NB: It is permissible for the 2397 // range tombstone / range key start key to be the empty string. 2398 // 2399 // TODO: It is unfortunate that we have to do this check here rather than 2400 // when we decide to finish the sstable in the runCompaction loop. A better 2401 // structure currently eludes us. 2402 if tw == nil { 2403 startKey := c.rangeDelFrag.Start() 2404 if len(iter.tombstones) > 0 { 2405 startKey = iter.tombstones[0].Start 2406 } 2407 if startKey == nil { 2408 startKey = c.rangeKeyFrag.Start() 2409 if len(iter.rangeKeys) > 0 { 2410 startKey = iter.rangeKeys[0].Start 2411 } 2412 } 2413 if splitKey != nil && d.cmp(startKey, splitKey) == 0 { 2414 return nil 2415 } 2416 } 2417 2418 // NB: clone the key because the data can be held on to by the call to 2419 // compactionIter.Tombstones via keyspan.Fragmenter.FlushTo, and by the 2420 // WriterMetadata.LargestRangeDel.UserKey. 2421 splitKey = append([]byte(nil), splitKey...) 2422 for _, v := range iter.Tombstones(splitKey) { 2423 if tw == nil { 2424 if err := newOutput(); err != nil { 2425 return err 2426 } 2427 } 2428 // The tombstone being added could be completely outside the 2429 // eventual bounds of the sstable. Consider this example (bounds 2430 // in square brackets next to table filename): 2431 // 2432 // ./000240.sst [tmgc#391,MERGE-tmgc#391,MERGE] 2433 // tmgc#391,MERGE [786e627a] 2434 // tmgc-udkatvs#331,RANGEDEL 2435 // 2436 // ./000241.sst [tmgc#384,MERGE-tmgc#384,MERGE] 2437 // tmgc#384,MERGE [666c7070] 2438 // tmgc-tvsalezade#383,RANGEDEL 2439 // tmgc-tvsalezade#331,RANGEDEL 2440 // 2441 // ./000242.sst [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL] 2442 // tmgc-tvsalezade#383,RANGEDEL 2443 // tmgc#375,SET [72646c78766965616c72776865676e79] 2444 // tmgc-tvsalezade#356,RANGEDEL 2445 // 2446 // Note that both of the top two SSTables have range tombstones 2447 // that start after the file's end keys. Since the file bound 2448 // computation happens well after all range tombstones have been 2449 // added to the writer, eliding out-of-file range tombstones based 2450 // on sequence number at this stage is difficult, and necessitates 2451 // read-time logic to ignore range tombstones outside file bounds. 2452 if err := rangedel.Encode(&v, tw.Add); err != nil { 2453 return err 2454 } 2455 } 2456 for _, v := range iter.RangeKeys(splitKey) { 2457 // Same logic as for range tombstones, except added using tw.AddRangeKey. 2458 if tw == nil { 2459 if err := newOutput(); err != nil { 2460 return err 2461 } 2462 } 2463 if err := rangekey.Encode(&v, tw.AddRangeKey); err != nil { 2464 return err 2465 } 2466 } 2467 2468 if tw == nil { 2469 return nil 2470 } 2471 2472 if err := tw.Close(); err != nil { 2473 tw = nil 2474 return err 2475 } 2476 if additionalCPUProcs > 0 { 2477 d.opts.Experimental.CPUWorkPermissionGranter.ReturnProcs(additionalCPUProcs) 2478 additionalCPUProcs = 0 2479 } 2480 writerMeta, err := tw.Metadata() 2481 if err != nil { 2482 tw = nil 2483 return err 2484 } 2485 tw = nil 2486 meta := ve.NewFiles[len(ve.NewFiles)-1].Meta 2487 meta.Size = writerMeta.Size 2488 meta.SmallestSeqNum = writerMeta.SmallestSeqNum 2489 meta.LargestSeqNum = writerMeta.LargestSeqNum 2490 // If the file didn't contain any range deletions, we can fill its 2491 // table stats now, avoiding unnecessarily loading the table later. 2492 maybeSetStatsFromProperties(meta, &writerMeta.Properties) 2493 2494 if c.flushing == nil { 2495 outputMetrics.TablesCompacted++ 2496 outputMetrics.BytesCompacted += meta.Size 2497 } else { 2498 outputMetrics.TablesFlushed++ 2499 outputMetrics.BytesFlushed += meta.Size 2500 } 2501 outputMetrics.Size += int64(meta.Size) 2502 outputMetrics.NumFiles++ 2503 2504 if n := len(ve.NewFiles); n > 1 { 2505 // This is not the first output file. Ensure the sstable boundaries 2506 // are nonoverlapping. 2507 prevMeta := ve.NewFiles[n-2].Meta 2508 if writerMeta.SmallestRangeDel.UserKey != nil { 2509 c := d.cmp(writerMeta.SmallestRangeDel.UserKey, prevMeta.Largest.UserKey) 2510 if c < 0 { 2511 return errors.Errorf( 2512 "bitalostable: smallest range tombstone start key is less than previous sstable largest key: %s < %s", 2513 writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), 2514 prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey)) 2515 } else if c == 0 && !prevMeta.Largest.IsExclusiveSentinel() { 2516 // The user key portion of the range boundary start key is 2517 // equal to the previous table's largest key user key, and 2518 // the previous table's largest key is not exclusive. This 2519 // violates the invariant that tables are key-space 2520 // partitioned. 2521 return errors.Errorf( 2522 "bitalostable: invariant violation: previous sstable largest key %s, current sstable smallest rangedel: %s", 2523 prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey), 2524 writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), 2525 ) 2526 } 2527 } 2528 } 2529 2530 // Verify that all range deletions outputted to the sstable are 2531 // truncated to split key. 2532 if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil && 2533 d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) > 0 { 2534 return errors.Errorf( 2535 "bitalostable: invariant violation: rangedel largest key %q extends beyond split key %q", 2536 writerMeta.LargestRangeDel.Pretty(d.opts.Comparer.FormatKey), 2537 d.opts.Comparer.FormatKey(splitKey), 2538 ) 2539 } 2540 2541 if writerMeta.HasPointKeys { 2542 meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestPoint, writerMeta.LargestPoint) 2543 } 2544 if writerMeta.HasRangeDelKeys { 2545 meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestRangeDel, writerMeta.LargestRangeDel) 2546 } 2547 if writerMeta.HasRangeKeys { 2548 meta.ExtendRangeKeyBounds(d.cmp, writerMeta.SmallestRangeKey, writerMeta.LargestRangeKey) 2549 } 2550 2551 // Verify that the sstable bounds fall within the compaction input 2552 // bounds. This is a sanity check that we don't have a logic error 2553 // elsewhere that causes the sstable bounds to accidentally expand past the 2554 // compaction input bounds as doing so could lead to various badness such 2555 // as keys being deleted by a range tombstone incorrectly. 2556 if c.smallest.UserKey != nil { 2557 switch v := d.cmp(meta.Smallest.UserKey, c.smallest.UserKey); { 2558 case v >= 0: 2559 // Nothing to do. 2560 case v < 0: 2561 return errors.Errorf("bitalostable: compaction output grew beyond bounds of input: %s < %s", 2562 meta.Smallest.Pretty(d.opts.Comparer.FormatKey), 2563 c.smallest.Pretty(d.opts.Comparer.FormatKey)) 2564 } 2565 } 2566 if c.largest.UserKey != nil { 2567 switch v := d.cmp(meta.Largest.UserKey, c.largest.UserKey); { 2568 case v <= 0: 2569 // Nothing to do. 2570 case v > 0: 2571 return errors.Errorf("bitalostable: compaction output grew beyond bounds of input: %s > %s", 2572 meta.Largest.Pretty(d.opts.Comparer.FormatKey), 2573 c.largest.Pretty(d.opts.Comparer.FormatKey)) 2574 } 2575 } 2576 // Verify that we never split different revisions of the same user key 2577 // across two different sstables. 2578 if err := c.errorOnUserKeyOverlap(ve); err != nil { 2579 return err 2580 } 2581 if err := meta.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { 2582 return err 2583 } 2584 return nil 2585 } 2586 2587 // compactionOutputSplitters contain all logic to determine whether the 2588 // compaction loop should stop writing to one output sstable and switch to 2589 // a new one. Some splitters can wrap other splitters, and 2590 // the splitterGroup can be composed of multiple splitters. In this case, 2591 // we start off with splitters for file sizes, grandparent limits, and (for 2592 // L0 splits) L0 limits, before wrapping them in an splitterGroup. 2593 outputSplitters := []compactionOutputSplitter{ 2594 // We do not split the same user key across different sstables within 2595 // one flush or compaction. The fileSizeSplitter may request a split in 2596 // the middle of a user key, so the userKeyChangeSplitter ensures we are 2597 // at a user key change boundary when doing a split. 2598 &userKeyChangeSplitter{ 2599 cmp: c.cmp, 2600 splitter: &fileSizeSplitter{maxFileSize: c.maxOutputFileSize}, 2601 unsafePrevUserKey: func() []byte { 2602 // Return the largest point key written to tw or the start of 2603 // the current range deletion in the fragmenter, whichever is 2604 // greater. 2605 prevPoint := prevPointKey.UnsafeKey() 2606 if c.cmp(prevPoint.UserKey, c.rangeDelFrag.Start()) > 0 { 2607 return prevPoint.UserKey 2608 } 2609 return c.rangeDelFrag.Start() 2610 }, 2611 }, 2612 &limitFuncSplitter{c: c, limitFunc: c.findGrandparentLimit}, 2613 } 2614 if splitL0Outputs { 2615 outputSplitters = append(outputSplitters, &limitFuncSplitter{c: c, limitFunc: c.findL0Limit}) 2616 } 2617 splitter := &splitterGroup{cmp: c.cmp, splitters: outputSplitters} 2618 2619 // Each outer loop iteration produces one output file. An iteration that 2620 // produces a file containing point keys (and optionally range tombstones) 2621 // guarantees that the input iterator advanced. An iteration that produces 2622 // a file containing only range tombstones guarantees the limit passed to 2623 // `finishOutput()` advanced to a strictly greater user key corresponding 2624 // to a grandparent file largest key, or nil. Taken together, these 2625 // progress guarantees ensure that eventually the input iterator will be 2626 // exhausted and the range tombstone fragments will all be flushed. 2627 for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty() || !c.rangeKeyFrag.Empty(); { 2628 splitterSuggestion := splitter.onNewOutput(key) 2629 2630 // Each inner loop iteration processes one key from the input iterator. 2631 for ; key != nil; key, val = iter.Next() { 2632 if split := splitter.shouldSplitBefore(key, tw); split == splitNow { 2633 break 2634 } 2635 2636 switch key.Kind() { 2637 case InternalKeyKindRangeDelete: 2638 // Range tombstones are handled specially. They are fragmented, 2639 // and they're not written until later during `finishOutput()`. 2640 // We add them to the `Fragmenter` now to make them visible to 2641 // `compactionIter` so covered keys in the same snapshot stripe 2642 // can be elided. 2643 2644 // The interleaved range deletion might only be one of many with 2645 // these bounds. Some fragmenting is performed ahead of time by 2646 // keyspan.MergingIter. 2647 if s := c.rangeDelIter.Span(); !s.Empty() { 2648 // The memory management here is subtle. Range deletions 2649 // blocks do NOT use prefix compression, which ensures that 2650 // range deletion spans' memory is available as long we keep 2651 // the iterator open. However, the keyspan.MergingIter that 2652 // merges spans across levels only guarantees the lifetime 2653 // of the [start, end) bounds until the next positioning 2654 // method is called. 2655 // 2656 // Additionally, the Span.Keys slice is owned by the the 2657 // range deletion iterator stack, and it may be overwritten 2658 // when we advance. 2659 // 2660 // Clone the Keys slice and the start and end keys. 2661 // 2662 // TODO(jackson): Avoid the clone by removing c.rangeDelFrag 2663 // and performing explicit truncation of the pending 2664 // rangedel span as necessary. 2665 clone := keyspan.Span{ 2666 Start: iter.cloneKey(s.Start), 2667 End: iter.cloneKey(s.End), 2668 Keys: make([]keyspan.Key, len(s.Keys)), 2669 } 2670 copy(clone.Keys, s.Keys) 2671 c.rangeDelFrag.Add(clone) 2672 } 2673 continue 2674 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 2675 // Range keys are handled in the same way as range tombstones, except 2676 // with a dedicated fragmenter. 2677 if s := c.rangeKeyInterleaving.Span(); !s.Empty() { 2678 clone := keyspan.Span{ 2679 Start: iter.cloneKey(s.Start), 2680 End: iter.cloneKey(s.End), 2681 Keys: make([]keyspan.Key, len(s.Keys)), 2682 } 2683 // Since the keys' Suffix and Value fields are not deep cloned, the 2684 // underlying blockIter must be kept open for the lifetime of the 2685 // compaction. 2686 copy(clone.Keys, s.Keys) 2687 c.rangeKeyFrag.Add(clone) 2688 } 2689 continue 2690 } 2691 if tw == nil { 2692 if err := newOutput(); err != nil { 2693 return nil, pendingOutputs, err 2694 } 2695 } 2696 2697 if key.Kind() == InternalKeyKindSet { 2698 if d.opts.KvCheckExpireFunc(key.UserKey, val) { 2699 key.SetKind(InternalKeyKindDelete) 2700 val = nil 2701 } 2702 } 2703 2704 if err := tw.Add(*key, val); err != nil { 2705 return nil, pendingOutputs, err 2706 } 2707 } 2708 2709 // A splitter requested a split, and we're ready to finish the output. 2710 // We need to choose the key at which to split any pending range 2711 // tombstones. There are two options: 2712 // 1. splitterSuggestion — The key suggested by the splitter. This key 2713 // is guaranteed to be greater than the last key written to the 2714 // current output. 2715 // 2. key.UserKey — the first key of the next sstable output. This user 2716 // key is also guaranteed to be greater than the last user key 2717 // written to the current output (see userKeyChangeSplitter). 2718 // 2719 // Use whichever is smaller. Using the smaller of the two limits 2720 // overlap with grandparents. Consider the case where the 2721 // grandparent limit is calculated to be 'b', key is 'x', and 2722 // there exist many sstables between 'b' and 'x'. If the range 2723 // deletion fragmenter has a pending tombstone [a,x), splitting 2724 // at 'x' would cause the output table to overlap many 2725 // grandparents well beyond the calculated grandparent limit 2726 // 'b'. Splitting at the smaller `splitterSuggestion` avoids 2727 // this unbounded overlap with grandparent tables. 2728 splitKey := splitterSuggestion 2729 if key != nil && (splitKey == nil || c.cmp(splitKey, key.UserKey) > 0) { 2730 splitKey = key.UserKey 2731 } 2732 if err := finishOutput(splitKey); err != nil { 2733 return nil, pendingOutputs, err 2734 } 2735 } 2736 2737 for _, cl := range c.inputs { 2738 iter := cl.files.Iter() 2739 for f := iter.First(); f != nil; f = iter.Next() { 2740 c.metrics[cl.level].NumFiles-- 2741 c.metrics[cl.level].Size -= int64(f.Size) 2742 ve.DeletedFiles[deletedFileEntry{ 2743 Level: cl.level, 2744 FileNum: f.FileNum, 2745 }] = f 2746 } 2747 } 2748 2749 if err := d.dataDir.Sync(); err != nil { 2750 return nil, pendingOutputs, err 2751 } 2752 2753 // Refresh the disk available statistic whenever a compaction/flush 2754 // completes, before re-acquiring the mutex. 2755 _ = d.calculateDiskAvailableBytes() 2756 2757 return ve, pendingOutputs, nil 2758 } 2759 2760 // validateVersionEdit validates that start and end keys across new and deleted 2761 // files in a versionEdit pass the given validation function. 2762 func validateVersionEdit( 2763 ve *versionEdit, validateFn func([]byte) error, format base.FormatKey, 2764 ) error { 2765 validateMetaFn := func(f *manifest.FileMetadata) error { 2766 for _, key := range []InternalKey{f.Smallest, f.Largest} { 2767 if err := validateFn(key.UserKey); err != nil { 2768 return errors.Wrapf(err, "key=%q; file=%s", format(key.UserKey), f) 2769 } 2770 } 2771 return nil 2772 } 2773 2774 // Validate both new and deleted files. 2775 for _, f := range ve.NewFiles { 2776 if err := validateMetaFn(f.Meta); err != nil { 2777 return err 2778 } 2779 } 2780 for _, m := range ve.DeletedFiles { 2781 if err := validateMetaFn(m); err != nil { 2782 return err 2783 } 2784 } 2785 2786 return nil 2787 } 2788 2789 // scanObsoleteFiles scans the filesystem for files that are no longer needed 2790 // and adds those to the internal lists of obsolete files. Note that the files 2791 // are not actually deleted by this method. A subsequent call to 2792 // deleteObsoleteFiles must be performed. Must be not be called concurrently 2793 // with compactions and flushes. db.mu must be held when calling this function. 2794 func (d *DB) scanObsoleteFiles(list []string) { 2795 // Disable automatic compactions temporarily to avoid concurrent compactions / 2796 // flushes from interfering. The original value is restored on completion. 2797 disabledPrev := d.opts.DisableAutomaticCompactions 2798 defer func() { 2799 d.opts.DisableAutomaticCompactions = disabledPrev 2800 }() 2801 d.opts.DisableAutomaticCompactions = true 2802 2803 // Wait for any ongoing compaction to complete before continuing. 2804 if d.mu.compact.compactingCount > 0 || d.mu.compact.flushing { 2805 d.mu.compact.cond.Wait() 2806 } 2807 2808 liveFileNums := make(map[FileNum]struct{}) 2809 d.mu.versions.addLiveFileNums(liveFileNums) 2810 minUnflushedLogNum := d.mu.versions.minUnflushedLogNum 2811 manifestFileNum := d.mu.versions.manifestFileNum 2812 2813 var obsoleteLogs []fileInfo 2814 var obsoleteTables []*fileMetadata 2815 var obsoleteManifests []fileInfo 2816 var obsoleteOptions []fileInfo 2817 2818 for _, filename := range list { 2819 fileType, fileNum, ok := base.ParseFilename(d.opts.FS, filename) 2820 if !ok { 2821 continue 2822 } 2823 switch fileType { 2824 case fileTypeLog: 2825 if fileNum >= minUnflushedLogNum { 2826 continue 2827 } 2828 fi := fileInfo{fileNum: fileNum} 2829 if stat, err := d.opts.FS.Stat(filename); err == nil { 2830 fi.fileSize = uint64(stat.Size()) 2831 } 2832 obsoleteLogs = append(obsoleteLogs, fi) 2833 case fileTypeManifest: 2834 if fileNum >= manifestFileNum { 2835 continue 2836 } 2837 fi := fileInfo{fileNum: fileNum} 2838 if stat, err := d.opts.FS.Stat(filename); err == nil { 2839 fi.fileSize = uint64(stat.Size()) 2840 } 2841 obsoleteManifests = append(obsoleteManifests, fi) 2842 case fileTypeOptions: 2843 if fileNum >= d.optionsFileNum { 2844 continue 2845 } 2846 fi := fileInfo{fileNum: fileNum} 2847 if stat, err := d.opts.FS.Stat(filename); err == nil { 2848 fi.fileSize = uint64(stat.Size()) 2849 } 2850 obsoleteOptions = append(obsoleteOptions, fi) 2851 case fileTypeTable: 2852 if _, ok := liveFileNums[fileNum]; ok { 2853 continue 2854 } 2855 fileMeta := &fileMetadata{ 2856 FileNum: fileNum, 2857 } 2858 if stat, err := d.opts.FS.Stat(filename); err == nil { 2859 fileMeta.Size = uint64(stat.Size()) 2860 } 2861 obsoleteTables = append(obsoleteTables, fileMeta) 2862 default: 2863 // Don't delete files we don't know about. 2864 continue 2865 } 2866 } 2867 2868 d.mu.log.queue = merge(d.mu.log.queue, obsoleteLogs) 2869 d.mu.versions.metrics.WAL.Files += int64(len(obsoleteLogs)) 2870 d.mu.versions.obsoleteTables = mergeFileMetas(d.mu.versions.obsoleteTables, obsoleteTables) 2871 d.mu.versions.incrementObsoleteTablesLocked(obsoleteTables) 2872 d.mu.versions.obsoleteManifests = merge(d.mu.versions.obsoleteManifests, obsoleteManifests) 2873 d.mu.versions.obsoleteOptions = merge(d.mu.versions.obsoleteOptions, obsoleteOptions) 2874 } 2875 2876 // disableFileDeletions disables file deletions and then waits for any 2877 // in-progress deletion to finish. The caller is required to call 2878 // enableFileDeletions in order to enable file deletions again. It is ok for 2879 // multiple callers to disable file deletions simultaneously, though they must 2880 // all invoke enableFileDeletions in order for file deletions to be re-enabled 2881 // (there is an internal reference count on file deletion disablement). 2882 // 2883 // d.mu must be held when calling this method. 2884 func (d *DB) disableFileDeletions() { 2885 d.mu.cleaner.disabled++ 2886 for d.mu.cleaner.cleaning { 2887 d.mu.cleaner.cond.Wait() 2888 } 2889 d.mu.cleaner.cond.Broadcast() 2890 } 2891 2892 // enableFileDeletions enables previously disabled file deletions. Note that if 2893 // file deletions have been re-enabled, the current goroutine will be used to 2894 // perform the queued up deletions. 2895 // 2896 // d.mu must be held when calling this method. 2897 func (d *DB) enableFileDeletions() { 2898 if d.mu.cleaner.disabled <= 0 || d.mu.cleaner.cleaning { 2899 panic("bitalostable: file deletion disablement invariant violated") 2900 } 2901 d.mu.cleaner.disabled-- 2902 if d.mu.cleaner.disabled > 0 { 2903 return 2904 } 2905 jobID := d.mu.nextJobID 2906 d.mu.nextJobID++ 2907 d.deleteObsoleteFiles(jobID, true /* waitForOngoing */) 2908 } 2909 2910 // d.mu must be held when calling this. 2911 func (d *DB) acquireCleaningTurn(waitForOngoing bool) bool { 2912 // Only allow a single delete obsolete files job to run at a time. 2913 for d.mu.cleaner.cleaning && d.mu.cleaner.disabled == 0 && waitForOngoing { 2914 d.mu.cleaner.cond.Wait() 2915 } 2916 if d.mu.cleaner.cleaning { 2917 return false 2918 } 2919 if d.mu.cleaner.disabled > 0 { 2920 // File deletions are currently disabled. When they are re-enabled a new 2921 // job will be created to catch up on file deletions. 2922 return false 2923 } 2924 d.mu.cleaner.cleaning = true 2925 return true 2926 } 2927 2928 // d.mu must be held when calling this. 2929 func (d *DB) releaseCleaningTurn() { 2930 d.mu.cleaner.cleaning = false 2931 d.mu.cleaner.cond.Broadcast() 2932 } 2933 2934 // deleteObsoleteFiles deletes those files that are no longer needed. If 2935 // waitForOngoing is true, it waits for any ongoing cleaning turns to complete, 2936 // and if false, it returns rightaway if a cleaning turn is ongoing. 2937 // 2938 // d.mu must be held when calling this, but the mutex may be dropped and 2939 // re-acquired during the course of this method. 2940 func (d *DB) deleteObsoleteFiles(jobID int, waitForOngoing bool) { 2941 if !d.acquireCleaningTurn(waitForOngoing) { 2942 return 2943 } 2944 d.doDeleteObsoleteFiles(jobID) 2945 d.releaseCleaningTurn() 2946 } 2947 2948 // obsoleteFile holds information about a file that needs to be deleted soon. 2949 type obsoleteFile struct { 2950 dir string 2951 fileNum base.FileNum 2952 fileType fileType 2953 fileSize uint64 2954 } 2955 2956 type fileInfo struct { 2957 fileNum FileNum 2958 fileSize uint64 2959 } 2960 2961 // d.mu must be held when calling this, but the mutex may be dropped and 2962 // re-acquired during the course of this method. 2963 func (d *DB) doDeleteObsoleteFiles(jobID int) { 2964 var obsoleteTables []fileInfo 2965 2966 defer func() { 2967 for _, tbl := range obsoleteTables { 2968 delete(d.mu.versions.zombieTables, tbl.fileNum) 2969 } 2970 }() 2971 2972 var obsoleteLogs []fileInfo 2973 for i := range d.mu.log.queue { 2974 // NB: d.mu.versions.minUnflushedLogNum is the log number of the earliest 2975 // log that has not had its contents flushed to an sstable. We can recycle 2976 // the prefix of d.mu.log.queue with log numbers less than 2977 // minUnflushedLogNum. 2978 if d.mu.log.queue[i].fileNum >= d.mu.versions.minUnflushedLogNum { 2979 obsoleteLogs = d.mu.log.queue[:i] 2980 d.mu.log.queue = d.mu.log.queue[i:] 2981 d.mu.versions.metrics.WAL.Files -= int64(len(obsoleteLogs)) 2982 break 2983 } 2984 } 2985 2986 for _, table := range d.mu.versions.obsoleteTables { 2987 obsoleteTables = append(obsoleteTables, fileInfo{ 2988 fileNum: table.FileNum, 2989 fileSize: table.Size, 2990 }) 2991 } 2992 d.mu.versions.obsoleteTables = nil 2993 2994 // Sort the manifests cause we want to delete some contiguous prefix 2995 // of the older manifests. 2996 sort.Slice(d.mu.versions.obsoleteManifests, func(i, j int) bool { 2997 return d.mu.versions.obsoleteManifests[i].fileNum < 2998 d.mu.versions.obsoleteManifests[j].fileNum 2999 }) 3000 3001 var obsoleteManifests []fileInfo 3002 manifestsToDelete := len(d.mu.versions.obsoleteManifests) - d.opts.NumPrevManifest 3003 if manifestsToDelete > 0 { 3004 obsoleteManifests = d.mu.versions.obsoleteManifests[:manifestsToDelete] 3005 d.mu.versions.obsoleteManifests = d.mu.versions.obsoleteManifests[manifestsToDelete:] 3006 if len(d.mu.versions.obsoleteManifests) == 0 { 3007 d.mu.versions.obsoleteManifests = nil 3008 } 3009 } 3010 3011 obsoleteOptions := d.mu.versions.obsoleteOptions 3012 d.mu.versions.obsoleteOptions = nil 3013 3014 // Release d.mu while doing I/O 3015 // Note the unusual order: Unlock and then Lock. 3016 d.mu.Unlock() 3017 defer d.mu.Lock() 3018 3019 files := [4]struct { 3020 fileType fileType 3021 obsolete []fileInfo 3022 }{ 3023 {fileTypeLog, obsoleteLogs}, 3024 {fileTypeTable, obsoleteTables}, 3025 {fileTypeManifest, obsoleteManifests}, 3026 {fileTypeOptions, obsoleteOptions}, 3027 } 3028 _, noRecycle := d.opts.Cleaner.(base.NeedsFileContents) 3029 filesToDelete := make([]obsoleteFile, 0, len(files)) 3030 for _, f := range files { 3031 // We sort to make the order of deletions deterministic, which is nice for 3032 // tests. 3033 sort.Slice(f.obsolete, func(i, j int) bool { 3034 return f.obsolete[i].fileNum < f.obsolete[j].fileNum 3035 }) 3036 for _, fi := range f.obsolete { 3037 dir := d.dirname 3038 switch f.fileType { 3039 case fileTypeLog: 3040 if !noRecycle && d.logRecycler.add(fi) { 3041 continue 3042 } 3043 dir = d.walDirname 3044 case fileTypeTable: 3045 d.tableCache.evict(fi.fileNum) 3046 } 3047 3048 filesToDelete = append(filesToDelete, obsoleteFile{ 3049 dir: dir, 3050 fileNum: fi.fileNum, 3051 fileType: f.fileType, 3052 fileSize: fi.fileSize, 3053 }) 3054 } 3055 } 3056 if len(filesToDelete) > 0 { 3057 d.deleters.Add(1) 3058 // Delete asynchronously if that could get held up in the pacer. 3059 if d.opts.Experimental.MinDeletionRate > 0 { 3060 go d.paceAndDeleteObsoleteFiles(jobID, filesToDelete) 3061 } else { 3062 d.paceAndDeleteObsoleteFiles(jobID, filesToDelete) 3063 } 3064 } 3065 } 3066 3067 // Paces and eventually deletes the list of obsolete files passed in. db.mu 3068 // must NOT be held when calling this method. 3069 func (d *DB) paceAndDeleteObsoleteFiles(jobID int, files []obsoleteFile) { 3070 defer d.deleters.Done() 3071 pacer := (pacer)(nilPacer) 3072 if d.opts.Experimental.MinDeletionRate > 0 { 3073 pacer = newDeletionPacer(d.deletionLimiter, d.getDeletionPacerInfo) 3074 } 3075 3076 for _, of := range files { 3077 path := base.MakeFilepath(d.opts.FS, of.dir, of.fileType, of.fileNum) 3078 if of.fileType == fileTypeTable { 3079 _ = pacer.maybeThrottle(of.fileSize) 3080 d.mu.Lock() 3081 d.mu.versions.metrics.Table.ObsoleteCount-- 3082 d.mu.versions.metrics.Table.ObsoleteSize -= of.fileSize 3083 d.mu.Unlock() 3084 } 3085 d.deleteObsoleteFile(of.fileType, jobID, path, of.fileNum) 3086 } 3087 } 3088 3089 func (d *DB) maybeScheduleObsoleteTableDeletion() { 3090 d.mu.Lock() 3091 defer d.mu.Unlock() 3092 3093 if len(d.mu.versions.obsoleteTables) == 0 { 3094 return 3095 } 3096 if !d.acquireCleaningTurn(false) { 3097 return 3098 } 3099 3100 go func() { 3101 pprof.Do(context.Background(), gcLabels, func(context.Context) { 3102 d.mu.Lock() 3103 defer d.mu.Unlock() 3104 3105 jobID := d.mu.nextJobID 3106 d.mu.nextJobID++ 3107 d.doDeleteObsoleteFiles(jobID) 3108 d.releaseCleaningTurn() 3109 }) 3110 }() 3111 } 3112 3113 // deleteObsoleteFile deletes file that is no longer needed. 3114 func (d *DB) deleteObsoleteFile(fileType fileType, jobID int, path string, fileNum FileNum) { 3115 // TODO(peter): need to handle this error, probably by re-adding the 3116 // file that couldn't be deleted to one of the obsolete slices map. 3117 err := d.opts.Cleaner.Clean(d.opts.FS, fileType, path) 3118 if oserror.IsNotExist(err) { 3119 return 3120 } 3121 3122 switch fileType { 3123 case fileTypeLog: 3124 d.opts.EventListener.WALDeleted(WALDeleteInfo{ 3125 JobID: jobID, 3126 Path: path, 3127 FileNum: fileNum, 3128 Err: err, 3129 }) 3130 case fileTypeManifest: 3131 d.opts.EventListener.ManifestDeleted(ManifestDeleteInfo{ 3132 JobID: jobID, 3133 Path: path, 3134 FileNum: fileNum, 3135 Err: err, 3136 }) 3137 case fileTypeTable: 3138 d.opts.EventListener.TableDeleted(TableDeleteInfo{ 3139 JobID: jobID, 3140 Path: path, 3141 FileNum: fileNum, 3142 Err: err, 3143 }) 3144 } 3145 } 3146 3147 func merge(a, b []fileInfo) []fileInfo { 3148 if len(b) == 0 { 3149 return a 3150 } 3151 3152 a = append(a, b...) 3153 sort.Slice(a, func(i, j int) bool { 3154 return a[i].fileNum < a[j].fileNum 3155 }) 3156 3157 n := 0 3158 for i := 0; i < len(a); i++ { 3159 if n == 0 || a[i].fileNum != a[n-1].fileNum { 3160 a[n] = a[i] 3161 n++ 3162 } 3163 } 3164 return a[:n] 3165 } 3166 3167 func mergeFileMetas(a, b []*fileMetadata) []*fileMetadata { 3168 if len(b) == 0 { 3169 return a 3170 } 3171 3172 a = append(a, b...) 3173 sort.Slice(a, func(i, j int) bool { 3174 return a[i].FileNum < a[j].FileNum 3175 }) 3176 3177 n := 0 3178 for i := 0; i < len(a); i++ { 3179 if n == 0 || a[i].FileNum != a[n-1].FileNum { 3180 a[n] = a[i] 3181 n++ 3182 } 3183 } 3184 return a[:n] 3185 }