github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/compaction.go (about) 1 // Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "cmp" 10 "context" 11 "fmt" 12 "io" 13 "math" 14 "runtime/pprof" 15 "slices" 16 "sort" 17 "sync/atomic" 18 "time" 19 20 "github.com/cockroachdb/errors" 21 "github.com/cockroachdb/pebble/internal/base" 22 "github.com/cockroachdb/pebble/internal/invalidating" 23 "github.com/cockroachdb/pebble/internal/invariants" 24 "github.com/cockroachdb/pebble/internal/keyspan" 25 "github.com/cockroachdb/pebble/internal/manifest" 26 "github.com/cockroachdb/pebble/internal/private" 27 "github.com/cockroachdb/pebble/internal/rangedel" 28 "github.com/cockroachdb/pebble/internal/rangekey" 29 "github.com/cockroachdb/pebble/objstorage" 30 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" 31 "github.com/cockroachdb/pebble/objstorage/remote" 32 "github.com/cockroachdb/pebble/sstable" 33 "github.com/cockroachdb/pebble/vfs" 34 ) 35 36 var errEmptyTable = errors.New("pebble: empty table") 37 38 // ErrCancelledCompaction is returned if a compaction is cancelled by a 39 // concurrent excise or ingest-split operation. 40 var ErrCancelledCompaction = errors.New("pebble: compaction cancelled by a concurrent operation, will retry compaction") 41 42 var compactLabels = pprof.Labels("pebble", "compact") 43 var flushLabels = pprof.Labels("pebble", "flush") 44 var gcLabels = pprof.Labels("pebble", "gc") 45 46 // getInternalWriterProperties accesses a private variable (in the 47 // internal/private package) initialized by the sstable Writer. This indirection 48 // is necessary to ensure non-Pebble users constructing sstables for ingestion 49 // are unable to set internal-only properties. 50 var getInternalWriterProperties = private.SSTableInternalProperties.(func(*sstable.Writer) *sstable.Properties) 51 52 // expandedCompactionByteSizeLimit is the maximum number of bytes in all 53 // compacted files. We avoid expanding the lower level file set of a compaction 54 // if it would make the total compaction cover more than this many bytes. 55 func expandedCompactionByteSizeLimit(opts *Options, level int, availBytes uint64) uint64 { 56 v := uint64(25 * opts.Level(level).TargetFileSize) 57 58 // Never expand a compaction beyond half the available capacity, divided 59 // by the maximum number of concurrent compactions. Each of the concurrent 60 // compactions may expand up to this limit, so this attempts to limit 61 // compactions to half of available disk space. Note that this will not 62 // prevent compaction picking from pursuing compactions that are larger 63 // than this threshold before expansion. 64 diskMax := (availBytes / 2) / uint64(opts.MaxConcurrentCompactions()) 65 if v > diskMax { 66 v = diskMax 67 } 68 return v 69 } 70 71 // maxGrandparentOverlapBytes is the maximum bytes of overlap with level+1 72 // before we stop building a single file in a level-1 to level compaction. 73 func maxGrandparentOverlapBytes(opts *Options, level int) uint64 { 74 return uint64(10 * opts.Level(level).TargetFileSize) 75 } 76 77 // maxReadCompactionBytes is used to prevent read compactions which 78 // are too wide. 79 func maxReadCompactionBytes(opts *Options, level int) uint64 { 80 return uint64(10 * opts.Level(level).TargetFileSize) 81 } 82 83 // noCloseIter wraps around a FragmentIterator, intercepting and eliding 84 // calls to Close. It is used during compaction to ensure that rangeDelIters 85 // are not closed prematurely. 86 type noCloseIter struct { 87 keyspan.FragmentIterator 88 } 89 90 func (i noCloseIter) Close() error { 91 return nil 92 } 93 94 type compactionLevel struct { 95 level int 96 files manifest.LevelSlice 97 // l0SublevelInfo contains information about L0 sublevels being compacted. 98 // It's only set for the start level of a compaction starting out of L0 and 99 // is nil for all other compactions. 100 l0SublevelInfo []sublevelInfo 101 } 102 103 func (cl compactionLevel) Clone() compactionLevel { 104 newCL := compactionLevel{ 105 level: cl.level, 106 files: cl.files.Reslice(func(start, end *manifest.LevelIterator) {}), 107 } 108 return newCL 109 } 110 func (cl compactionLevel) String() string { 111 return fmt.Sprintf(`Level %d, Files %s`, cl.level, cl.files) 112 } 113 114 // Return output from compactionOutputSplitters. See comment on 115 // compactionOutputSplitter.shouldSplitBefore() on how this value is used. 116 type maybeSplit int 117 118 const ( 119 noSplit maybeSplit = iota 120 splitNow 121 ) 122 123 // String implements the Stringer interface. 124 func (c maybeSplit) String() string { 125 if c == noSplit { 126 return "no-split" 127 } 128 return "split-now" 129 } 130 131 // compactionOutputSplitter is an interface for encapsulating logic around 132 // switching the output of a compaction to a new output file. Additional 133 // constraints around switching compaction outputs that are specific to that 134 // compaction type (eg. flush splits) are implemented in 135 // compactionOutputSplitters that compose other child compactionOutputSplitters. 136 type compactionOutputSplitter interface { 137 // shouldSplitBefore returns whether we should split outputs before the 138 // specified "current key". The return value is splitNow or noSplit. 139 // splitNow means a split is advised before the specified key, and noSplit 140 // means no split is advised. If shouldSplitBefore(a) advises a split then 141 // shouldSplitBefore(b) should also advise a split given b >= a, until 142 // onNewOutput is called. 143 shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit 144 // onNewOutput updates internal splitter state when the compaction switches 145 // to a new sstable, and returns the next limit for the new output which 146 // would get used to truncate range tombstones if the compaction iterator 147 // runs out of keys. The limit returned MUST be > key according to the 148 // compaction's comparator. The specified key is the first key in the new 149 // output, or nil if this sstable will only contain range tombstones already 150 // in the fragmenter. 151 onNewOutput(key []byte) []byte 152 } 153 154 // fileSizeSplitter is a compactionOutputSplitter that enforces target file 155 // sizes. This splitter splits to a new output file when the estimated file size 156 // is 0.5x-2x the target file size. If there are overlapping grandparent files, 157 // this splitter will attempt to split at a grandparent boundary. For example, 158 // consider the example where a compaction wrote 'd' to the current output file, 159 // and the next key has a user key 'g': 160 // 161 // previous key next key 162 // | | 163 // | | 164 // +---------------|----+ +--|----------+ 165 // grandparents: | 000006 | | | | 000007 | 166 // +---------------|----+ +--|----------+ 167 // a b d e f g i 168 // 169 // Splitting the output file F before 'g' will ensure that the current output 170 // file F does not overlap the grandparent file 000007. Aligning sstable 171 // boundaries like this can significantly reduce write amplification, since a 172 // subsequent compaction of F into the grandparent level will avoid needlessly 173 // rewriting any keys within 000007 that do not overlap F's bounds. Consider the 174 // following compaction: 175 // 176 // +----------------------+ 177 // input | | 178 // level +----------------------+ 179 // \/ 180 // +---------------+ +---------------+ 181 // output |XXXXXXX| | | |XXXXXXXX| 182 // level +---------------+ +---------------+ 183 // 184 // The input-level file overlaps two files in the output level, but only 185 // partially. The beginning of the first output-level file and the end of the 186 // second output-level file will be rewritten verbatim. This write I/O is 187 // "wasted" in the sense that no merging is being performed. 188 // 189 // To prevent the above waste, this splitter attempts to split output files 190 // before the start key of grandparent files. It still strives to write output 191 // files of approximately the target file size, by constraining this splitting 192 // at grandparent points to apply only if the current output's file size is 193 // about the right order of magnitude. 194 // 195 // Note that, unlike most other splitters, this splitter does not guarantee that 196 // it will advise splits only at user key change boundaries. 197 type fileSizeSplitter struct { 198 frontier frontier 199 targetFileSize uint64 200 atGrandparentBoundary bool 201 boundariesObserved uint64 202 nextGrandparent *fileMetadata 203 grandparents manifest.LevelIterator 204 } 205 206 func newFileSizeSplitter( 207 f *frontiers, targetFileSize uint64, grandparents manifest.LevelIterator, 208 ) *fileSizeSplitter { 209 s := &fileSizeSplitter{targetFileSize: targetFileSize} 210 s.nextGrandparent = grandparents.First() 211 s.grandparents = grandparents 212 if s.nextGrandparent != nil { 213 s.frontier.Init(f, s.nextGrandparent.Smallest.UserKey, s.reached) 214 } 215 return s 216 } 217 218 func (f *fileSizeSplitter) reached(nextKey []byte) []byte { 219 f.atGrandparentBoundary = true 220 f.boundariesObserved++ 221 // NB: f.grandparents is a bounded iterator, constrained to the compaction 222 // key range. 223 f.nextGrandparent = f.grandparents.Next() 224 if f.nextGrandparent == nil { 225 return nil 226 } 227 // TODO(jackson): Should we also split before or immediately after 228 // grandparents' largest keys? Splitting before the start boundary prevents 229 // overlap with the grandparent. Also splitting after the end boundary may 230 // increase the probability of move compactions. 231 return f.nextGrandparent.Smallest.UserKey 232 } 233 234 func (f *fileSizeSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { 235 atGrandparentBoundary := f.atGrandparentBoundary 236 237 // Clear f.atGrandparentBoundary unconditionally. 238 // 239 // This is a bit subtle. Even if do decide to split, it's possible that a 240 // higher-level splitter will ignore our request (eg, because we're between 241 // two internal keys with the same user key). In this case, the next call to 242 // shouldSplitBefore will find atGrandparentBoundary=false. This is 243 // desirable, because in this case we would've already written the earlier 244 // key with the same user key to the output file. The current output file is 245 // already doomed to overlap the grandparent whose bound triggered 246 // atGrandparentBoundary=true. We should continue on, waiting for the next 247 // grandparent boundary. 248 f.atGrandparentBoundary = false 249 250 // If the key is a range tombstone, the EstimatedSize may not grow right 251 // away when a range tombstone is added to the fragmenter: It's dependent on 252 // whether or not the this new range deletion will start a new fragment. 253 // Range deletions are rare, so we choose to simply not split yet. 254 // TODO(jackson): Reconsider this, and consider range keys too as a part of 255 // #2321. 256 if key.Kind() == InternalKeyKindRangeDelete || tw == nil { 257 return noSplit 258 } 259 260 estSize := tw.EstimatedSize() 261 switch { 262 case estSize < f.targetFileSize/2: 263 // The estimated file size is less than half the target file size. Don't 264 // split it, even if currently aligned with a grandparent file because 265 // it's too small. 266 return noSplit 267 case estSize >= 2*f.targetFileSize: 268 // The estimated file size is double the target file size. Split it even 269 // if we were not aligned with a grandparent file boundary to avoid 270 // excessively exceeding the target file size. 271 return splitNow 272 case !atGrandparentBoundary: 273 // Don't split if we're not at a grandparent, except if we've exhausted 274 // all the grandparents overlapping this compaction's key range. Then we 275 // may want to split purely based on file size. 276 if f.nextGrandparent == nil { 277 // There are no more grandparents. Optimize for the target file size 278 // and split as soon as we hit the target file size. 279 if estSize >= f.targetFileSize { 280 return splitNow 281 } 282 } 283 return noSplit 284 default: 285 // INVARIANT: atGrandparentBoundary 286 // INVARIANT: targetSize/2 < estSize < 2*targetSize 287 // 288 // The estimated file size is close enough to the target file size that 289 // we should consider splitting. 290 // 291 // Determine whether to split now based on how many grandparent 292 // boundaries we have already observed while building this output file. 293 // The intuition here is that if the grandparent level is dense in this 294 // part of the keyspace, we're likely to continue to have more 295 // opportunities to split this file aligned with a grandparent. If this 296 // is the first grandparent boundary observed, we split immediately 297 // (we're already at ≥50% the target file size). Otherwise, each 298 // overlapping grandparent we've observed increases the minimum file 299 // size by 5% of the target file size, up to at most 90% of the target 300 // file size. 301 // 302 // TODO(jackson): The particular thresholds are somewhat unprincipled. 303 // This is the same heuristic as RocksDB implements. Is there are more 304 // principled formulation that can, further reduce w-amp, produce files 305 // closer to the target file size, or is more understandable? 306 307 // NB: Subtract 1 from `boundariesObserved` to account for the current 308 // boundary we're considering splitting at. `reached` will have 309 // incremented it at the same time it set `atGrandparentBoundary`. 310 minimumPctOfTargetSize := 50 + 5*min(f.boundariesObserved-1, 8) 311 if estSize < (minimumPctOfTargetSize*f.targetFileSize)/100 { 312 return noSplit 313 } 314 return splitNow 315 } 316 } 317 318 func (f *fileSizeSplitter) onNewOutput(key []byte) []byte { 319 f.boundariesObserved = 0 320 return nil 321 } 322 323 func newLimitFuncSplitter(f *frontiers, limitFunc func(userKey []byte) []byte) *limitFuncSplitter { 324 s := &limitFuncSplitter{limitFunc: limitFunc} 325 s.frontier.Init(f, nil, s.reached) 326 return s 327 } 328 329 type limitFuncSplitter struct { 330 frontier frontier 331 limitFunc func(userKey []byte) []byte 332 split maybeSplit 333 } 334 335 func (lf *limitFuncSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { 336 return lf.split 337 } 338 339 func (lf *limitFuncSplitter) reached(nextKey []byte) []byte { 340 lf.split = splitNow 341 return nil 342 } 343 344 func (lf *limitFuncSplitter) onNewOutput(key []byte) []byte { 345 lf.split = noSplit 346 if key != nil { 347 // TODO(jackson): For some users, like L0 flush splits, there's no need 348 // to binary search over all the flush splits every time. The next split 349 // point must be ahead of the previous flush split point. 350 limit := lf.limitFunc(key) 351 lf.frontier.Update(limit) 352 return limit 353 } 354 lf.frontier.Update(nil) 355 return nil 356 } 357 358 // splitterGroup is a compactionOutputSplitter that splits whenever one of its 359 // child splitters advises a compaction split. 360 type splitterGroup struct { 361 cmp Compare 362 splitters []compactionOutputSplitter 363 } 364 365 func (a *splitterGroup) shouldSplitBefore( 366 key *InternalKey, tw *sstable.Writer, 367 ) (suggestion maybeSplit) { 368 for _, splitter := range a.splitters { 369 if splitter.shouldSplitBefore(key, tw) == splitNow { 370 return splitNow 371 } 372 } 373 return noSplit 374 } 375 376 func (a *splitterGroup) onNewOutput(key []byte) []byte { 377 var earliestLimit []byte 378 for _, splitter := range a.splitters { 379 limit := splitter.onNewOutput(key) 380 if limit == nil { 381 continue 382 } 383 if earliestLimit == nil || a.cmp(limit, earliestLimit) < 0 { 384 earliestLimit = limit 385 } 386 } 387 return earliestLimit 388 } 389 390 // userKeyChangeSplitter is a compactionOutputSplitter that takes in a child 391 // splitter, and splits when 1) that child splitter has advised a split, and 2) 392 // the compaction output is at the boundary between two user keys (also 393 // the boundary between atomic compaction units). Use this splitter to wrap 394 // any splitters that don't guarantee user key splits (i.e. splitters that make 395 // their determination in ways other than comparing the current key against a 396 // limit key.) If a wrapped splitter advises a split, it must continue 397 // to advise a split until a new output. 398 type userKeyChangeSplitter struct { 399 cmp Compare 400 splitter compactionOutputSplitter 401 unsafePrevUserKey func() []byte 402 } 403 404 func (u *userKeyChangeSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { 405 // NB: The userKeyChangeSplitter only needs to suffer a key comparison if 406 // the wrapped splitter requests a split. 407 // 408 // We could implement this splitter using frontiers: When the inner splitter 409 // requests a split before key `k`, we'd update a frontier to be 410 // ImmediateSuccessor(k). Then on the next key greater than >k, the 411 // frontier's `reached` func would be called and we'd return splitNow. 412 // This doesn't really save work since duplicate user keys are rare, and it 413 // requires us to materialize the ImmediateSuccessor key. It also prevents 414 // us from splitting on the same key that the inner splitter requested a 415 // split for—instead we need to wait until the next key. The current 416 // implementation uses `unsafePrevUserKey` to gain access to the previous 417 // key which allows it to immediately respect the inner splitter if 418 // possible. 419 if split := u.splitter.shouldSplitBefore(key, tw); split != splitNow { 420 return split 421 } 422 if u.cmp(key.UserKey, u.unsafePrevUserKey()) > 0 { 423 return splitNow 424 } 425 return noSplit 426 } 427 428 func (u *userKeyChangeSplitter) onNewOutput(key []byte) []byte { 429 return u.splitter.onNewOutput(key) 430 } 431 432 // compactionWritable is a objstorage.Writable wrapper that, on every write, 433 // updates a metric in `versions` on bytes written by in-progress compactions so 434 // far. It also increments a per-compaction `written` int. 435 type compactionWritable struct { 436 objstorage.Writable 437 438 versions *versionSet 439 written *int64 440 } 441 442 // Write is part of the objstorage.Writable interface. 443 func (c *compactionWritable) Write(p []byte) error { 444 if err := c.Writable.Write(p); err != nil { 445 return err 446 } 447 448 *c.written += int64(len(p)) 449 c.versions.incrementCompactionBytes(int64(len(p))) 450 return nil 451 } 452 453 type compactionKind int 454 455 const ( 456 compactionKindDefault compactionKind = iota 457 compactionKindFlush 458 // compactionKindMove denotes a move compaction where the input file is 459 // retained and linked in a new level without being obsoleted. 460 compactionKindMove 461 // compactionKindCopy denotes a copy compaction where the input file is 462 // copied byte-by-byte into a new file with a new FileNum in the output level. 463 compactionKindCopy 464 compactionKindDeleteOnly 465 compactionKindElisionOnly 466 compactionKindRead 467 compactionKindRewrite 468 compactionKindIngestedFlushable 469 ) 470 471 func (k compactionKind) String() string { 472 switch k { 473 case compactionKindDefault: 474 return "default" 475 case compactionKindFlush: 476 return "flush" 477 case compactionKindMove: 478 return "move" 479 case compactionKindDeleteOnly: 480 return "delete-only" 481 case compactionKindElisionOnly: 482 return "elision-only" 483 case compactionKindRead: 484 return "read" 485 case compactionKindRewrite: 486 return "rewrite" 487 case compactionKindIngestedFlushable: 488 return "ingested-flushable" 489 case compactionKindCopy: 490 return "copy" 491 } 492 return "?" 493 } 494 495 // rangeKeyCompactionTransform is used to transform range key spans as part of the 496 // keyspan.MergingIter. As part of this transformation step, we can elide range 497 // keys in the last snapshot stripe, as well as coalesce range keys within 498 // snapshot stripes. 499 func rangeKeyCompactionTransform( 500 eq base.Equal, snapshots []uint64, elideRangeKey func(start, end []byte) bool, 501 ) keyspan.Transformer { 502 return keyspan.TransformerFunc(func(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { 503 elideInLastStripe := func(keys []keyspan.Key) []keyspan.Key { 504 // Unsets and deletes in the last snapshot stripe can be elided. 505 k := 0 506 for j := range keys { 507 if elideRangeKey(s.Start, s.End) && 508 (keys[j].Kind() == InternalKeyKindRangeKeyUnset || keys[j].Kind() == InternalKeyKindRangeKeyDelete) { 509 continue 510 } 511 keys[k] = keys[j] 512 k++ 513 } 514 keys = keys[:k] 515 return keys 516 } 517 // snapshots are in ascending order, while s.keys are in descending seqnum 518 // order. Partition s.keys by snapshot stripes, and call rangekey.Coalesce 519 // on each partition. 520 dst.Start = s.Start 521 dst.End = s.End 522 dst.Keys = dst.Keys[:0] 523 i, j := len(snapshots)-1, 0 524 usedLen := 0 525 for i >= 0 { 526 start := j 527 for j < len(s.Keys) && !base.Visible(s.Keys[j].SeqNum(), snapshots[i], base.InternalKeySeqNumMax) { 528 // Include j in current partition. 529 j++ 530 } 531 if j > start { 532 keysDst := dst.Keys[usedLen:cap(dst.Keys)] 533 if err := rangekey.Coalesce(cmp, eq, s.Keys[start:j], &keysDst); err != nil { 534 return err 535 } 536 if j == len(s.Keys) { 537 // This is the last snapshot stripe. Unsets and deletes can be elided. 538 keysDst = elideInLastStripe(keysDst) 539 } 540 usedLen += len(keysDst) 541 dst.Keys = append(dst.Keys, keysDst...) 542 } 543 i-- 544 } 545 if j < len(s.Keys) { 546 keysDst := dst.Keys[usedLen:cap(dst.Keys)] 547 if err := rangekey.Coalesce(cmp, eq, s.Keys[j:], &keysDst); err != nil { 548 return err 549 } 550 keysDst = elideInLastStripe(keysDst) 551 usedLen += len(keysDst) 552 dst.Keys = append(dst.Keys, keysDst...) 553 } 554 return nil 555 }) 556 } 557 558 // compaction is a table compaction from one level to the next, starting from a 559 // given version. 560 type compaction struct { 561 // cancel is a bool that can be used by other goroutines to signal a compaction 562 // to cancel, such as if a conflicting excise operation raced it to manifest 563 // application. Only holders of the manifest lock will write to this atomic. 564 cancel atomic.Bool 565 566 kind compactionKind 567 cmp Compare 568 equal Equal 569 comparer *base.Comparer 570 formatKey base.FormatKey 571 logger Logger 572 version *version 573 stats base.InternalIteratorStats 574 beganAt time.Time 575 // versionEditApplied is set to true when a compaction has completed and the 576 // resulting version has been installed (if successful), but the compaction 577 // goroutine is still cleaning up (eg, deleting obsolete files). 578 versionEditApplied bool 579 bufferPool sstable.BufferPool 580 581 // startLevel is the level that is being compacted. Inputs from startLevel 582 // and outputLevel will be merged to produce a set of outputLevel files. 583 startLevel *compactionLevel 584 585 // outputLevel is the level that files are being produced in. outputLevel is 586 // equal to startLevel+1 except when: 587 // - if startLevel is 0, the output level equals compactionPicker.baseLevel(). 588 // - in multilevel compaction, the output level is the lowest level involved in 589 // the compaction 590 // A compaction's outputLevel is nil for delete-only compactions. 591 outputLevel *compactionLevel 592 593 // extraLevels point to additional levels in between the input and output 594 // levels that get compacted in multilevel compactions 595 extraLevels []*compactionLevel 596 597 inputs []compactionLevel 598 599 // maxOutputFileSize is the maximum size of an individual table created 600 // during compaction. 601 maxOutputFileSize uint64 602 // maxOverlapBytes is the maximum number of bytes of overlap allowed for a 603 // single output table with the tables in the grandparent level. 604 maxOverlapBytes uint64 605 // disableSpanElision disables elision of range tombstones and range keys. Used 606 // by tests to allow range tombstones or range keys to be added to tables where 607 // they would otherwise be elided. 608 disableSpanElision bool 609 610 // flushing contains the flushables (aka memtables) that are being flushed. 611 flushing flushableList 612 // bytesIterated contains the number of bytes that have been flushed/compacted. 613 bytesIterated uint64 614 // bytesWritten contains the number of bytes that have been written to outputs. 615 bytesWritten int64 616 617 // The boundaries of the input data. 618 smallest InternalKey 619 largest InternalKey 620 621 // The range deletion tombstone fragmenter. Adds range tombstones as they are 622 // returned from `compactionIter` and fragments them for output to files. 623 // Referenced by `compactionIter` which uses it to check whether keys are deleted. 624 rangeDelFrag keyspan.Fragmenter 625 // The range key fragmenter. Similar to rangeDelFrag in that it gets range 626 // keys from the compaction iter and fragments them for output to files. 627 rangeKeyFrag keyspan.Fragmenter 628 // The range deletion tombstone iterator, that merges and fragments 629 // tombstones across levels. This iterator is included within the compaction 630 // input iterator as a single level. 631 // TODO(jackson): Remove this when the refactor of FragmentIterator, 632 // InterleavingIterator, etc is complete. 633 rangeDelIter keyspan.InternalIteratorShim 634 // rangeKeyInterleaving is the interleaving iter for range keys. 635 rangeKeyInterleaving keyspan.InterleavingIter 636 637 // A list of objects to close when the compaction finishes. Used by input 638 // iteration to keep rangeDelIters open for the lifetime of the compaction, 639 // and only close them when the compaction finishes. 640 closers []io.Closer 641 642 // grandparents are the tables in level+2 that overlap with the files being 643 // compacted. Used to determine output table boundaries. Do not assume that the actual files 644 // in the grandparent when this compaction finishes will be the same. 645 grandparents manifest.LevelSlice 646 647 // Boundaries at which flushes to L0 should be split. Determined by 648 // L0Sublevels. If nil, flushes aren't split. 649 l0Limits [][]byte 650 651 // List of disjoint inuse key ranges the compaction overlaps with in 652 // grandparent and lower levels. See setupInuseKeyRanges() for the 653 // construction. Used by elideTombstone() and elideRangeTombstone() to 654 // determine if keys affected by a tombstone possibly exist at a lower level. 655 inuseKeyRanges []manifest.UserKeyRange 656 // inuseEntireRange is set if the above inuse key ranges wholly contain the 657 // compaction's key range. This allows compactions in higher levels to often 658 // elide key comparisons. 659 inuseEntireRange bool 660 elideTombstoneIndex int 661 662 // allowedZeroSeqNum is true if seqnums can be zeroed if there are no 663 // snapshots requiring them to be kept. This determination is made by 664 // looking for an sstable which overlaps the bounds of the compaction at a 665 // lower level in the LSM during runCompaction. 666 allowedZeroSeqNum bool 667 668 metrics map[int]*LevelMetrics 669 670 pickerMetrics compactionPickerMetrics 671 } 672 673 func (c *compaction) makeInfo(jobID int) CompactionInfo { 674 info := CompactionInfo{ 675 JobID: jobID, 676 Reason: c.kind.String(), 677 Input: make([]LevelInfo, 0, len(c.inputs)), 678 Annotations: []string{}, 679 } 680 for _, cl := range c.inputs { 681 inputInfo := LevelInfo{Level: cl.level, Tables: nil} 682 iter := cl.files.Iter() 683 for m := iter.First(); m != nil; m = iter.Next() { 684 inputInfo.Tables = append(inputInfo.Tables, m.TableInfo()) 685 } 686 info.Input = append(info.Input, inputInfo) 687 } 688 if c.outputLevel != nil { 689 info.Output.Level = c.outputLevel.level 690 691 // If there are no inputs from the output level (eg, a move 692 // compaction), add an empty LevelInfo to info.Input. 693 if len(c.inputs) > 0 && c.inputs[len(c.inputs)-1].level != c.outputLevel.level { 694 info.Input = append(info.Input, LevelInfo{Level: c.outputLevel.level}) 695 } 696 } else { 697 // For a delete-only compaction, set the output level to L6. The 698 // output level is not meaningful here, but complicating the 699 // info.Output interface with a pointer doesn't seem worth the 700 // semantic distinction. 701 info.Output.Level = numLevels - 1 702 } 703 704 for i, score := range c.pickerMetrics.scores { 705 info.Input[i].Score = score 706 } 707 info.SingleLevelOverlappingRatio = c.pickerMetrics.singleLevelOverlappingRatio 708 info.MultiLevelOverlappingRatio = c.pickerMetrics.multiLevelOverlappingRatio 709 if len(info.Input) > 2 { 710 info.Annotations = append(info.Annotations, "multilevel") 711 } 712 return info 713 } 714 715 func newCompaction( 716 pc *pickedCompaction, opts *Options, beganAt time.Time, provider objstorage.Provider, 717 ) *compaction { 718 c := &compaction{ 719 kind: compactionKindDefault, 720 cmp: pc.cmp, 721 equal: opts.equal(), 722 comparer: opts.Comparer, 723 formatKey: opts.Comparer.FormatKey, 724 inputs: pc.inputs, 725 smallest: pc.smallest, 726 largest: pc.largest, 727 logger: opts.Logger, 728 version: pc.version, 729 beganAt: beganAt, 730 maxOutputFileSize: pc.maxOutputFileSize, 731 maxOverlapBytes: pc.maxOverlapBytes, 732 pickerMetrics: pc.pickerMetrics, 733 } 734 c.startLevel = &c.inputs[0] 735 if pc.startLevel.l0SublevelInfo != nil { 736 c.startLevel.l0SublevelInfo = pc.startLevel.l0SublevelInfo 737 } 738 c.outputLevel = &c.inputs[1] 739 740 if len(pc.extraLevels) > 0 { 741 c.extraLevels = pc.extraLevels 742 c.outputLevel = &c.inputs[len(c.inputs)-1] 743 } 744 // Compute the set of outputLevel+1 files that overlap this compaction (these 745 // are the grandparent sstables). 746 if c.outputLevel.level+1 < numLevels { 747 c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.cmp, 748 c.smallest.UserKey, c.largest.UserKey, c.largest.IsExclusiveSentinel()) 749 } 750 c.setupInuseKeyRanges() 751 c.kind = pc.kind 752 753 if c.kind == compactionKindDefault && c.outputLevel.files.Empty() && !c.hasExtraLevelData() && 754 c.startLevel.files.Len() == 1 && c.grandparents.SizeSum() <= c.maxOverlapBytes { 755 // This compaction can be converted into a move or copy from one level 756 // to the next. We avoid such a move if there is lots of overlapping 757 // grandparent data. Otherwise, the move could create a parent file 758 // that will require a very expensive merge later on. 759 iter := c.startLevel.files.Iter() 760 meta := iter.First() 761 isRemote := false 762 // We should always be passed a provider, except in some unit tests. 763 if provider != nil { 764 objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) 765 if err != nil { 766 panic(errors.Wrapf(err, "cannot lookup table %s in provider", meta.FileBacking.DiskFileNum)) 767 } 768 isRemote = objMeta.IsRemote() 769 } 770 // Avoid a trivial move or copy if all of these are true, as rewriting a 771 // new file is better: 772 // 773 // 1) The source file is a virtual sstable 774 // 2) The existing file `meta` is on non-remote storage 775 // 3) The output level prefers shared storage 776 mustCopy := !isRemote && remote.ShouldCreateShared(opts.Experimental.CreateOnShared, c.outputLevel.level) 777 if mustCopy { 778 // If the source is virtual, it's best to just rewrite the file as all 779 // conditions in the above comment are met. 780 if !meta.Virtual { 781 c.kind = compactionKindCopy 782 } 783 } else { 784 c.kind = compactionKindMove 785 } 786 } 787 return c 788 } 789 790 func newDeleteOnlyCompaction( 791 opts *Options, cur *version, inputs []compactionLevel, beganAt time.Time, 792 ) *compaction { 793 c := &compaction{ 794 kind: compactionKindDeleteOnly, 795 cmp: opts.Comparer.Compare, 796 equal: opts.equal(), 797 comparer: opts.Comparer, 798 formatKey: opts.Comparer.FormatKey, 799 logger: opts.Logger, 800 version: cur, 801 beganAt: beganAt, 802 inputs: inputs, 803 } 804 805 // Set c.smallest, c.largest. 806 files := make([]manifest.LevelIterator, 0, len(inputs)) 807 for _, in := range inputs { 808 files = append(files, in.files.Iter()) 809 } 810 c.smallest, c.largest = manifest.KeyRange(opts.Comparer.Compare, files...) 811 return c 812 } 813 814 func adjustGrandparentOverlapBytesForFlush(c *compaction, flushingBytes uint64) { 815 // Heuristic to place a lower bound on compaction output file size 816 // caused by Lbase. Prior to this heuristic we have observed an L0 in 817 // production with 310K files of which 290K files were < 10KB in size. 818 // Our hypothesis is that it was caused by L1 having 2600 files and 819 // ~10GB, such that each flush got split into many tiny files due to 820 // overlapping with most of the files in Lbase. 821 // 822 // The computation below is general in that it accounts 823 // for flushing different volumes of data (e.g. we may be flushing 824 // many memtables). For illustration, we consider the typical 825 // example of flushing a 64MB memtable. So 12.8MB output, 826 // based on the compression guess below. If the compressed bytes 827 // guess is an over-estimate we will end up with smaller files, 828 // and if an under-estimate we will end up with larger files. 829 // With a 2MB target file size, 7 files. We are willing to accept 830 // 4x the number of files, if it results in better write amplification 831 // when later compacting to Lbase, i.e., ~450KB files (target file 832 // size / 4). 833 // 834 // Note that this is a pessimistic heuristic in that 835 // fileCountUpperBoundDueToGrandparents could be far from the actual 836 // number of files produced due to the grandparent limits. For 837 // example, in the extreme, consider a flush that overlaps with 1000 838 // files in Lbase f0...f999, and the initially calculated value of 839 // maxOverlapBytes will cause splits at f10, f20,..., f990, which 840 // means an upper bound file count of 100 files. Say the input bytes 841 // in the flush are such that acceptableFileCount=10. We will fatten 842 // up maxOverlapBytes by 10x to ensure that the upper bound file count 843 // drops to 10. However, it is possible that in practice, even without 844 // this change, we would have produced no more than 10 files, and that 845 // this change makes the files unnecessarily wide. Say the input bytes 846 // are distributed such that 10% are in f0...f9, 10% in f10...f19, ... 847 // 10% in f80...f89 and 10% in f990...f999. The original value of 848 // maxOverlapBytes would have actually produced only 10 sstables. But 849 // by increasing maxOverlapBytes by 10x, we may produce 1 sstable that 850 // spans f0...f89, i.e., a much wider sstable than necessary. 851 // 852 // We could produce a tighter estimate of 853 // fileCountUpperBoundDueToGrandparents if we had knowledge of the key 854 // distribution of the flush. The 4x multiplier mentioned earlier is 855 // a way to try to compensate for this pessimism. 856 // 857 // TODO(sumeer): we don't have compression info for the data being 858 // flushed, but it is likely that existing files that overlap with 859 // this flush in Lbase are representative wrt compression ratio. We 860 // could store the uncompressed size in FileMetadata and estimate 861 // the compression ratio. 862 const approxCompressionRatio = 0.2 863 approxOutputBytes := approxCompressionRatio * float64(flushingBytes) 864 approxNumFilesBasedOnTargetSize := 865 int(math.Ceil(approxOutputBytes / float64(c.maxOutputFileSize))) 866 acceptableFileCount := float64(4 * approxNumFilesBasedOnTargetSize) 867 // The byte calculation is linear in numGrandparentFiles, but we will 868 // incur this linear cost in findGrandparentLimit too, so we are also 869 // willing to pay it now. We could approximate this cheaply by using 870 // the mean file size of Lbase. 871 grandparentFileBytes := c.grandparents.SizeSum() 872 fileCountUpperBoundDueToGrandparents := 873 float64(grandparentFileBytes) / float64(c.maxOverlapBytes) 874 if fileCountUpperBoundDueToGrandparents > acceptableFileCount { 875 c.maxOverlapBytes = uint64( 876 float64(c.maxOverlapBytes) * 877 (fileCountUpperBoundDueToGrandparents / acceptableFileCount)) 878 } 879 } 880 881 func newFlush( 882 opts *Options, cur *version, baseLevel int, flushing flushableList, beganAt time.Time, 883 ) *compaction { 884 c := &compaction{ 885 kind: compactionKindFlush, 886 cmp: opts.Comparer.Compare, 887 equal: opts.equal(), 888 comparer: opts.Comparer, 889 formatKey: opts.Comparer.FormatKey, 890 logger: opts.Logger, 891 version: cur, 892 beganAt: beganAt, 893 inputs: []compactionLevel{{level: -1}, {level: 0}}, 894 maxOutputFileSize: math.MaxUint64, 895 maxOverlapBytes: math.MaxUint64, 896 flushing: flushing, 897 } 898 c.startLevel = &c.inputs[0] 899 c.outputLevel = &c.inputs[1] 900 901 if len(flushing) > 0 { 902 if _, ok := flushing[0].flushable.(*ingestedFlushable); ok { 903 if len(flushing) != 1 { 904 panic("pebble: ingestedFlushable must be flushed one at a time.") 905 } 906 c.kind = compactionKindIngestedFlushable 907 return c 908 } 909 } 910 911 // Make sure there's no ingestedFlushable after the first flushable in the 912 // list. 913 for _, f := range flushing { 914 if _, ok := f.flushable.(*ingestedFlushable); ok { 915 panic("pebble: flushing shouldn't contain ingestedFlushable flushable") 916 } 917 } 918 919 if cur.L0Sublevels != nil { 920 c.l0Limits = cur.L0Sublevels.FlushSplitKeys() 921 } 922 923 smallestSet, largestSet := false, false 924 updatePointBounds := func(iter internalIterator) { 925 if key, _ := iter.First(); key != nil { 926 if !smallestSet || 927 base.InternalCompare(c.cmp, c.smallest, *key) > 0 { 928 smallestSet = true 929 c.smallest = key.Clone() 930 } 931 } 932 if key, _ := iter.Last(); key != nil { 933 if !largestSet || 934 base.InternalCompare(c.cmp, c.largest, *key) < 0 { 935 largestSet = true 936 c.largest = key.Clone() 937 } 938 } 939 } 940 941 updateRangeBounds := func(iter keyspan.FragmentIterator) { 942 // File bounds require s != nil && !s.Empty(). We only need to check for 943 // s != nil here, as the memtable's FragmentIterator would never surface 944 // empty spans. 945 if s := iter.First(); s != nil { 946 if key := s.SmallestKey(); !smallestSet || 947 base.InternalCompare(c.cmp, c.smallest, key) > 0 { 948 smallestSet = true 949 c.smallest = key.Clone() 950 } 951 } 952 if s := iter.Last(); s != nil { 953 if key := s.LargestKey(); !largestSet || 954 base.InternalCompare(c.cmp, c.largest, key) < 0 { 955 largestSet = true 956 c.largest = key.Clone() 957 } 958 } 959 } 960 961 var flushingBytes uint64 962 for i := range flushing { 963 f := flushing[i] 964 updatePointBounds(f.newIter(nil)) 965 if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { 966 updateRangeBounds(rangeDelIter) 967 } 968 if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { 969 updateRangeBounds(rangeKeyIter) 970 } 971 flushingBytes += f.inuseBytes() 972 } 973 974 if opts.FlushSplitBytes > 0 { 975 c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize) 976 c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0) 977 c.grandparents = c.version.Overlaps(baseLevel, c.cmp, c.smallest.UserKey, 978 c.largest.UserKey, c.largest.IsExclusiveSentinel()) 979 adjustGrandparentOverlapBytesForFlush(c, flushingBytes) 980 } 981 982 c.setupInuseKeyRanges() 983 return c 984 } 985 986 func (c *compaction) hasExtraLevelData() bool { 987 if len(c.extraLevels) == 0 { 988 // not a multi level compaction 989 return false 990 } else if c.extraLevels[0].files.Empty() { 991 // a multi level compaction without data in the intermediate input level; 992 // e.g. for a multi level compaction with levels 4,5, and 6, this could 993 // occur if there is no files to compact in 5, or in 5 and 6 (i.e. a move). 994 return false 995 } 996 return true 997 } 998 999 func (c *compaction) setupInuseKeyRanges() { 1000 level := c.outputLevel.level + 1 1001 if c.outputLevel.level == 0 { 1002 level = 0 1003 } 1004 // calculateInuseKeyRanges will return a series of sorted spans. Overlapping 1005 // or abutting spans have already been merged. 1006 c.inuseKeyRanges = calculateInuseKeyRanges( 1007 c.version, c.cmp, level, numLevels-1, c.smallest.UserKey, c.largest.UserKey, 1008 ) 1009 // Check if there's a single in-use span that encompasses the entire key 1010 // range of the compaction. This is an optimization to avoid key comparisons 1011 // against inuseKeyRanges during the compaction when every key within the 1012 // compaction overlaps with an in-use span. 1013 if len(c.inuseKeyRanges) > 0 { 1014 c.inuseEntireRange = c.cmp(c.inuseKeyRanges[0].Start, c.smallest.UserKey) <= 0 && 1015 c.cmp(c.inuseKeyRanges[0].End, c.largest.UserKey) >= 0 1016 } 1017 } 1018 1019 func calculateInuseKeyRanges( 1020 v *version, cmp base.Compare, level, maxLevel int, smallest, largest []byte, 1021 ) []manifest.UserKeyRange { 1022 // Use two slices, alternating which one is input and which one is output 1023 // as we descend the LSM. 1024 var input, output []manifest.UserKeyRange 1025 1026 // L0 requires special treatment, since sstables within L0 may overlap. 1027 // We use the L0 Sublevels structure to efficiently calculate the merged 1028 // in-use key ranges. 1029 if level == 0 { 1030 output = v.L0Sublevels.InUseKeyRanges(smallest, largest) 1031 level++ 1032 } 1033 1034 for ; level <= maxLevel; level++ { 1035 // NB: We always treat `largest` as inclusive for simplicity, because 1036 // there's little consequence to calculating slightly broader in-use key 1037 // ranges. 1038 overlaps := v.Overlaps(level, cmp, smallest, largest, false /* exclusiveEnd */) 1039 iter := overlaps.Iter() 1040 1041 // We may already have in-use key ranges from higher levels. Iterate 1042 // through both our accumulated in-use key ranges and this level's 1043 // files, merging the two. 1044 // 1045 // Tables higher within the LSM have broader key spaces. We use this 1046 // when possible to seek past a level's files that are contained by 1047 // our current accumulated in-use key ranges. This helps avoid 1048 // per-sstable work during flushes or compactions in high levels which 1049 // overlap the majority of the LSM's sstables. 1050 input, output = output, input 1051 output = output[:0] 1052 1053 var currFile *fileMetadata 1054 var currAccum *manifest.UserKeyRange 1055 if len(input) > 0 { 1056 currAccum, input = &input[0], input[1:] 1057 } 1058 1059 // If we have an accumulated key range and its start is ≤ smallest, 1060 // we can seek to the accumulated range's end. Otherwise, we need to 1061 // start at the first overlapping file within the level. 1062 if currAccum != nil && cmp(currAccum.Start, smallest) <= 0 { 1063 currFile = seekGT(&iter, cmp, currAccum.End) 1064 } else { 1065 currFile = iter.First() 1066 } 1067 1068 for currFile != nil || currAccum != nil { 1069 // If we've exhausted either the files in the level or the 1070 // accumulated key ranges, we just need to append the one we have. 1071 // If we have both a currFile and a currAccum, they either overlap 1072 // or they're disjoint. If they're disjoint, we append whichever 1073 // one sorts first and move on to the next file or range. If they 1074 // overlap, we merge them into currAccum and proceed to the next 1075 // file. 1076 switch { 1077 case currAccum == nil || (currFile != nil && cmp(currFile.Largest.UserKey, currAccum.Start) < 0): 1078 // This file is strictly before the current accumulated range, 1079 // or there are no more accumulated ranges. 1080 output = append(output, manifest.UserKeyRange{ 1081 Start: currFile.Smallest.UserKey, 1082 End: currFile.Largest.UserKey, 1083 }) 1084 currFile = iter.Next() 1085 case currFile == nil || (currAccum != nil && cmp(currAccum.End, currFile.Smallest.UserKey) < 0): 1086 // The current accumulated key range is strictly before the 1087 // current file, or there are no more files. 1088 output = append(output, *currAccum) 1089 currAccum = nil 1090 if len(input) > 0 { 1091 currAccum, input = &input[0], input[1:] 1092 } 1093 default: 1094 // The current accumulated range and the current file overlap. 1095 // Adjust the accumulated range to be the union. 1096 if cmp(currFile.Smallest.UserKey, currAccum.Start) < 0 { 1097 currAccum.Start = currFile.Smallest.UserKey 1098 } 1099 if cmp(currFile.Largest.UserKey, currAccum.End) > 0 { 1100 currAccum.End = currFile.Largest.UserKey 1101 } 1102 1103 // Extending `currAccum`'s end boundary may have caused it to 1104 // overlap with `input` key ranges that we haven't processed 1105 // yet. Merge any such key ranges. 1106 for len(input) > 0 && cmp(input[0].Start, currAccum.End) <= 0 { 1107 if cmp(input[0].End, currAccum.End) > 0 { 1108 currAccum.End = input[0].End 1109 } 1110 input = input[1:] 1111 } 1112 // Seek the level iterator past our current accumulated end. 1113 currFile = seekGT(&iter, cmp, currAccum.End) 1114 } 1115 } 1116 } 1117 return output 1118 } 1119 1120 func seekGT(iter *manifest.LevelIterator, cmp base.Compare, key []byte) *manifest.FileMetadata { 1121 f := iter.SeekGE(cmp, key) 1122 for f != nil && cmp(f.Largest.UserKey, key) == 0 { 1123 f = iter.Next() 1124 } 1125 return f 1126 } 1127 1128 // findGrandparentLimit takes the start user key for a table and returns the 1129 // user key to which that table can extend without excessively overlapping 1130 // the grandparent level. If no limit is needed considering the grandparent 1131 // files, this function returns nil. This is done in order to prevent a table 1132 // at level N from overlapping too much data at level N+1. We want to avoid 1133 // such large overlaps because they translate into large compactions. The 1134 // current heuristic stops output of a table if the addition of another key 1135 // would cause the table to overlap more than 10x the target file size at 1136 // level N. See maxGrandparentOverlapBytes. 1137 func (c *compaction) findGrandparentLimit(start []byte) []byte { 1138 iter := c.grandparents.Iter() 1139 var overlappedBytes uint64 1140 var greater bool 1141 for f := iter.SeekGE(c.cmp, start); f != nil; f = iter.Next() { 1142 overlappedBytes += f.Size 1143 // To ensure forward progress we always return a larger user 1144 // key than where we started. See comments above clients of 1145 // this function for how this is used. 1146 greater = greater || c.cmp(f.Smallest.UserKey, start) > 0 1147 if !greater { 1148 continue 1149 } 1150 1151 // We return the smallest bound of a sstable rather than the 1152 // largest because the smallest is always inclusive, and limits 1153 // are used exlusively when truncating range tombstones. If we 1154 // truncated an output to the largest key while there's a 1155 // pending tombstone, the next output file would also overlap 1156 // the same grandparent f. 1157 if overlappedBytes > c.maxOverlapBytes { 1158 return f.Smallest.UserKey 1159 } 1160 } 1161 return nil 1162 } 1163 1164 // findL0Limit takes the start key for a table and returns the user key to which 1165 // that table can be extended without hitting the next l0Limit. Having flushed 1166 // sstables "bridging across" an l0Limit could lead to increased L0 -> LBase 1167 // compaction sizes as well as elevated read amplification. 1168 func (c *compaction) findL0Limit(start []byte) []byte { 1169 if c.startLevel.level > -1 || c.outputLevel.level != 0 || len(c.l0Limits) == 0 { 1170 return nil 1171 } 1172 index := sort.Search(len(c.l0Limits), func(i int) bool { 1173 return c.cmp(c.l0Limits[i], start) > 0 1174 }) 1175 if index < len(c.l0Limits) { 1176 return c.l0Limits[index] 1177 } 1178 return nil 1179 } 1180 1181 // errorOnUserKeyOverlap returns an error if the last two written sstables in 1182 // this compaction have revisions of the same user key present in both sstables, 1183 // when it shouldn't (eg. when splitting flushes). 1184 func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error { 1185 if n := len(ve.NewFiles); n > 1 { 1186 meta := ve.NewFiles[n-1].Meta 1187 prevMeta := ve.NewFiles[n-2].Meta 1188 if !prevMeta.Largest.IsExclusiveSentinel() && 1189 c.cmp(prevMeta.Largest.UserKey, meta.Smallest.UserKey) >= 0 { 1190 return errors.Errorf("pebble: compaction split user key across two sstables: %s in %s and %s", 1191 prevMeta.Largest.Pretty(c.formatKey), 1192 prevMeta.FileNum, 1193 meta.FileNum) 1194 } 1195 } 1196 return nil 1197 } 1198 1199 // allowZeroSeqNum returns true if seqnum's can be zeroed if there are no 1200 // snapshots requiring them to be kept. It performs this determination by 1201 // looking for an sstable which overlaps the bounds of the compaction at a 1202 // lower level in the LSM. 1203 func (c *compaction) allowZeroSeqNum() bool { 1204 return c.elideRangeTombstone(c.smallest.UserKey, c.largest.UserKey) 1205 } 1206 1207 // elideTombstone returns true if it is ok to elide a tombstone for the 1208 // specified key. A return value of true guarantees that there are no key/value 1209 // pairs at c.level+2 or higher that possibly contain the specified user 1210 // key. The keys in multiple invocations to elideTombstone must be supplied in 1211 // order. 1212 func (c *compaction) elideTombstone(key []byte) bool { 1213 if c.inuseEntireRange || len(c.flushing) != 0 { 1214 return false 1215 } 1216 1217 for ; c.elideTombstoneIndex < len(c.inuseKeyRanges); c.elideTombstoneIndex++ { 1218 r := &c.inuseKeyRanges[c.elideTombstoneIndex] 1219 if c.cmp(key, r.End) <= 0 { 1220 if c.cmp(key, r.Start) >= 0 { 1221 return false 1222 } 1223 break 1224 } 1225 } 1226 return true 1227 } 1228 1229 // elideRangeTombstone returns true if it is ok to elide the specified range 1230 // tombstone. A return value of true guarantees that there are no key/value 1231 // pairs at c.outputLevel.level+1 or higher that possibly overlap the specified 1232 // tombstone. 1233 func (c *compaction) elideRangeTombstone(start, end []byte) bool { 1234 // Disable range tombstone elision if the testing knob for that is enabled, 1235 // or if we are flushing memtables. The latter requirement is due to 1236 // inuseKeyRanges not accounting for key ranges in other memtables that are 1237 // being flushed in the same compaction. It's possible for a range tombstone 1238 // in one memtable to overlap keys in a preceding memtable in c.flushing. 1239 // 1240 // This function is also used in setting allowZeroSeqNum, so disabling 1241 // elision of range tombstones also disables zeroing of SeqNums. 1242 // 1243 // TODO(peter): we disable zeroing of seqnums during flushing to match 1244 // RocksDB behavior and to avoid generating overlapping sstables during 1245 // DB.replayWAL. When replaying WAL files at startup, we flush after each 1246 // WAL is replayed building up a single version edit that is 1247 // applied. Because we don't apply the version edit after each flush, this 1248 // code doesn't know that L0 contains files and zeroing of seqnums should 1249 // be disabled. That is fixable, but it seems safer to just match the 1250 // RocksDB behavior for now. 1251 if c.disableSpanElision || len(c.flushing) != 0 { 1252 return false 1253 } 1254 1255 lower := sort.Search(len(c.inuseKeyRanges), func(i int) bool { 1256 return c.cmp(c.inuseKeyRanges[i].End, start) >= 0 1257 }) 1258 upper := sort.Search(len(c.inuseKeyRanges), func(i int) bool { 1259 return c.cmp(c.inuseKeyRanges[i].Start, end) > 0 1260 }) 1261 return lower >= upper 1262 } 1263 1264 // elideRangeKey returns true if it is ok to elide the specified range key. A 1265 // return value of true guarantees that there are no key/value pairs at 1266 // c.outputLevel.level+1 or higher that possibly overlap the specified range key. 1267 func (c *compaction) elideRangeKey(start, end []byte) bool { 1268 // TODO(bilal): Track inuseKeyRanges separately for the range keyspace as 1269 // opposed to the point keyspace. Once that is done, elideRangeTombstone 1270 // can just check in the point keyspace, and this function can check for 1271 // inuseKeyRanges in the range keyspace. 1272 return c.elideRangeTombstone(start, end) 1273 } 1274 1275 // newInputIter returns an iterator over all the input tables in a compaction. 1276 func (c *compaction) newInputIter( 1277 newIters tableNewIters, newRangeKeyIter keyspan.TableNewSpanIter, snapshots []uint64, 1278 ) (_ internalIterator, retErr error) { 1279 // Validate the ordering of compaction input files for defense in depth. 1280 // TODO(jackson): Some of the CheckOrdering calls may be adapted to pass 1281 // ProhibitSplitUserKeys if we thread the active format major version in. Or 1282 // if we remove support for earlier FMVs, we can remove the parameter 1283 // altogether. 1284 if len(c.flushing) == 0 { 1285 if c.startLevel.level >= 0 { 1286 err := manifest.CheckOrdering(c.cmp, c.formatKey, 1287 manifest.Level(c.startLevel.level), c.startLevel.files.Iter(), 1288 manifest.AllowSplitUserKeys) 1289 if err != nil { 1290 return nil, err 1291 } 1292 } 1293 err := manifest.CheckOrdering(c.cmp, c.formatKey, 1294 manifest.Level(c.outputLevel.level), c.outputLevel.files.Iter(), 1295 manifest.AllowSplitUserKeys) 1296 if err != nil { 1297 return nil, err 1298 } 1299 if c.startLevel.level == 0 { 1300 if c.startLevel.l0SublevelInfo == nil { 1301 panic("l0SublevelInfo not created for compaction out of L0") 1302 } 1303 for _, info := range c.startLevel.l0SublevelInfo { 1304 err := manifest.CheckOrdering(c.cmp, c.formatKey, 1305 info.sublevel, info.Iter(), 1306 // NB: L0 sublevels have never allowed split user keys. 1307 manifest.ProhibitSplitUserKeys) 1308 if err != nil { 1309 return nil, err 1310 } 1311 } 1312 } 1313 if len(c.extraLevels) > 0 { 1314 if len(c.extraLevels) > 1 { 1315 panic("n>2 multi level compaction not implemented yet") 1316 } 1317 interLevel := c.extraLevels[0] 1318 err := manifest.CheckOrdering(c.cmp, c.formatKey, 1319 manifest.Level(interLevel.level), interLevel.files.Iter(), 1320 manifest.AllowSplitUserKeys) 1321 if err != nil { 1322 return nil, err 1323 } 1324 } 1325 } 1326 1327 // There are three classes of keys that a compaction needs to process: point 1328 // keys, range deletion tombstones and range keys. Collect all iterators for 1329 // all these classes of keys from all the levels. We'll aggregate them 1330 // together farther below. 1331 // 1332 // numInputLevels is an approximation of the number of iterator levels. Due 1333 // to idiosyncrasies in iterator construction, we may (rarely) exceed this 1334 // initial capacity. 1335 numInputLevels := max(len(c.flushing), len(c.inputs)) 1336 iters := make([]internalIterator, 0, numInputLevels) 1337 rangeDelIters := make([]keyspan.FragmentIterator, 0, numInputLevels) 1338 rangeKeyIters := make([]keyspan.FragmentIterator, 0, numInputLevels) 1339 1340 // If construction of the iterator inputs fails, ensure that we close all 1341 // the consitutent iterators. 1342 defer func() { 1343 if retErr != nil { 1344 for _, iter := range iters { 1345 if iter != nil { 1346 iter.Close() 1347 } 1348 } 1349 for _, rangeDelIter := range rangeDelIters { 1350 rangeDelIter.Close() 1351 } 1352 } 1353 }() 1354 iterOpts := IterOptions{ 1355 CategoryAndQoS: sstable.CategoryAndQoS{ 1356 Category: "pebble-compaction", 1357 QoSLevel: sstable.NonLatencySensitiveQoSLevel, 1358 }, 1359 logger: c.logger, 1360 } 1361 1362 // Populate iters, rangeDelIters and rangeKeyIters with the appropriate 1363 // constituent iterators. This depends on whether this is a flush or a 1364 // compaction. 1365 if len(c.flushing) != 0 { 1366 // If flushing, we need to build the input iterators over the memtables 1367 // stored in c.flushing. 1368 for i := range c.flushing { 1369 f := c.flushing[i] 1370 iters = append(iters, f.newFlushIter(nil, &c.bytesIterated)) 1371 rangeDelIter := f.newRangeDelIter(nil) 1372 if rangeDelIter != nil { 1373 rangeDelIters = append(rangeDelIters, rangeDelIter) 1374 } 1375 if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { 1376 rangeKeyIters = append(rangeKeyIters, rangeKeyIter) 1377 } 1378 } 1379 } else { 1380 addItersForLevel := func(level *compactionLevel, l manifest.Level) error { 1381 // Add a *levelIter for point iterators. Because we don't call 1382 // initRangeDel, the levelIter will close and forget the range 1383 // deletion iterator when it steps on to a new file. Surfacing range 1384 // deletions to compactions are handled below. 1385 iters = append(iters, newLevelIter(context.Background(), 1386 iterOpts, c.comparer, newIters, level.files.Iter(), l, internalIterOpts{ 1387 bytesIterated: &c.bytesIterated, 1388 bufferPool: &c.bufferPool, 1389 })) 1390 // TODO(jackson): Use keyspan.LevelIter to avoid loading all the range 1391 // deletions into memory upfront. (See #2015, which reverted this.) 1392 // There will be no user keys that are split between sstables 1393 // within a level in Cockroach 23.1, which unblocks this optimization. 1394 1395 // Add the range deletion iterator for each file as an independent level 1396 // in mergingIter, as opposed to making a levelIter out of those. This 1397 // is safer as levelIter expects all keys coming from underlying 1398 // iterators to be in order. Due to compaction / tombstone writing 1399 // logic in finishOutput(), it is possible for range tombstones to not 1400 // be strictly ordered across all files in one level. 1401 // 1402 // Consider this example from the metamorphic tests (also repeated in 1403 // finishOutput()), consisting of three L3 files with their bounds 1404 // specified in square brackets next to the file name: 1405 // 1406 // ./000240.sst [tmgc#391,MERGE-tmgc#391,MERGE] 1407 // tmgc#391,MERGE [786e627a] 1408 // tmgc-udkatvs#331,RANGEDEL 1409 // 1410 // ./000241.sst [tmgc#384,MERGE-tmgc#384,MERGE] 1411 // tmgc#384,MERGE [666c7070] 1412 // tmgc-tvsalezade#383,RANGEDEL 1413 // tmgc-tvsalezade#331,RANGEDEL 1414 // 1415 // ./000242.sst [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL] 1416 // tmgc-tvsalezade#383,RANGEDEL 1417 // tmgc#375,SET [72646c78766965616c72776865676e79] 1418 // tmgc-tvsalezade#356,RANGEDEL 1419 // 1420 // Here, the range tombstone in 000240.sst falls "after" one in 1421 // 000241.sst, despite 000240.sst being ordered "before" 000241.sst for 1422 // levelIter's purposes. While each file is still consistent before its 1423 // bounds, it's safer to have all rangedel iterators be visible to 1424 // mergingIter. 1425 iter := level.files.Iter() 1426 for f := iter.First(); f != nil; f = iter.Next() { 1427 rangeDelIter, closer, err := c.newRangeDelIter( 1428 newIters, iter.Take(), iterOpts, l, &c.bytesIterated) 1429 if err != nil { 1430 // The error will already be annotated with the BackingFileNum, so 1431 // we annotate it with the FileNum. 1432 return errors.Wrapf(err, "pebble: could not open table %s", errors.Safe(f.FileNum)) 1433 } 1434 if rangeDelIter == nil { 1435 continue 1436 } 1437 rangeDelIters = append(rangeDelIters, rangeDelIter) 1438 c.closers = append(c.closers, closer) 1439 } 1440 1441 // Check if this level has any range keys. 1442 hasRangeKeys := false 1443 for f := iter.First(); f != nil; f = iter.Next() { 1444 if f.HasRangeKeys { 1445 hasRangeKeys = true 1446 break 1447 } 1448 } 1449 if hasRangeKeys { 1450 li := &keyspan.LevelIter{} 1451 newRangeKeyIterWrapper := func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { 1452 iter, err := newRangeKeyIter(file, iterOptions) 1453 if err != nil { 1454 return nil, err 1455 } else if iter == nil { 1456 return emptyKeyspanIter, nil 1457 } 1458 // Ensure that the range key iter is not closed until the compaction is 1459 // finished. This is necessary because range key processing 1460 // requires the range keys to be held in memory for up to the 1461 // lifetime of the compaction. 1462 c.closers = append(c.closers, iter) 1463 iter = noCloseIter{iter} 1464 1465 // We do not need to truncate range keys to sstable boundaries, or 1466 // only read within the file's atomic compaction units, unlike with 1467 // range tombstones. This is because range keys were added after we 1468 // stopped splitting user keys across sstables, so all the range keys 1469 // in this sstable must wholly lie within the file's bounds. 1470 return iter, err 1471 } 1472 li.Init(keyspan.SpanIterOptions{}, c.cmp, newRangeKeyIterWrapper, level.files.Iter(), l, manifest.KeyTypeRange) 1473 rangeKeyIters = append(rangeKeyIters, li) 1474 } 1475 return nil 1476 } 1477 1478 for i := range c.inputs { 1479 // If the level is annotated with l0SublevelInfo, expand it into one 1480 // level per sublevel. 1481 // TODO(jackson): Perform this expansion even earlier when we pick the 1482 // compaction? 1483 if len(c.inputs[i].l0SublevelInfo) > 0 { 1484 for _, info := range c.startLevel.l0SublevelInfo { 1485 sublevelCompactionLevel := &compactionLevel{0, info.LevelSlice, nil} 1486 if err := addItersForLevel(sublevelCompactionLevel, info.sublevel); err != nil { 1487 return nil, err 1488 } 1489 } 1490 continue 1491 } 1492 if err := addItersForLevel(&c.inputs[i], manifest.Level(c.inputs[i].level)); err != nil { 1493 return nil, err 1494 } 1495 } 1496 } 1497 1498 // In normal operation, levelIter iterates over the point operations in a 1499 // level, and initializes a rangeDelIter pointer for the range deletions in 1500 // each table. During compaction, we want to iterate over the merged view of 1501 // point operations and range deletions. In order to do this we create one 1502 // levelIter per level to iterate over the point operations, and collect up 1503 // all the range deletion files. 1504 // 1505 // The range deletion levels are first combined with a keyspan.MergingIter 1506 // (currently wrapped by a keyspan.InternalIteratorShim to satisfy the 1507 // internal iterator interface). The resulting merged rangedel iterator is 1508 // then included with the point levels in a single mergingIter. 1509 // 1510 // Combine all the rangedel iterators using a keyspan.MergingIterator and a 1511 // InternalIteratorShim so that the range deletions may be interleaved in 1512 // the compaction input. 1513 // TODO(jackson): Replace the InternalIteratorShim with an interleaving 1514 // iterator. 1515 if len(rangeDelIters) > 0 { 1516 c.rangeDelIter.Init(c.cmp, rangeDelIters...) 1517 iters = append(iters, &c.rangeDelIter) 1518 } 1519 1520 // If there's only one constituent point iterator, we can avoid the overhead 1521 // of a *mergingIter. This is possible, for example, when performing a flush 1522 // of a single memtable. Otherwise, combine all the iterators into a merging 1523 // iter. 1524 iter := iters[0] 1525 if len(iters) > 0 { 1526 iter = newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...) 1527 } 1528 // If there are range key iterators, we need to combine them using 1529 // keyspan.MergingIter, and then interleave them among the points. 1530 if len(rangeKeyIters) > 0 { 1531 mi := &keyspan.MergingIter{} 1532 mi.Init(c.cmp, rangeKeyCompactionTransform(c.equal, snapshots, c.elideRangeKey), new(keyspan.MergingBuffers), rangeKeyIters...) 1533 di := &keyspan.DefragmentingIter{} 1534 di.Init(c.comparer, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, new(keyspan.DefragmentingBuffers)) 1535 c.rangeKeyInterleaving.Init(c.comparer, iter, di, keyspan.InterleavingIterOpts{}) 1536 iter = &c.rangeKeyInterleaving 1537 } 1538 return iter, nil 1539 } 1540 1541 func (c *compaction) newRangeDelIter( 1542 newIters tableNewIters, 1543 f manifest.LevelFile, 1544 opts IterOptions, 1545 l manifest.Level, 1546 bytesIterated *uint64, 1547 ) (keyspan.FragmentIterator, io.Closer, error) { 1548 opts.level = l 1549 iter, rangeDelIter, err := newIters(context.Background(), f.FileMetadata, 1550 &opts, internalIterOpts{ 1551 bytesIterated: &c.bytesIterated, 1552 bufferPool: &c.bufferPool, 1553 }) 1554 if err != nil { 1555 return nil, nil, err 1556 } 1557 // TODO(peter): It is mildly wasteful to open the point iterator only to 1558 // immediately close it. One way to solve this would be to add new 1559 // methods to tableCache for creating point and range-deletion iterators 1560 // independently. We'd only want to use those methods here, 1561 // though. Doesn't seem worth the hassle in the near term. 1562 if err = iter.Close(); err != nil { 1563 if rangeDelIter != nil { 1564 err = errors.CombineErrors(err, rangeDelIter.Close()) 1565 } 1566 return nil, nil, err 1567 } 1568 if rangeDelIter == nil { 1569 // The file doesn't contain any range deletions. 1570 return nil, nil, nil 1571 } 1572 1573 // Ensure that rangeDelIter is not closed until the compaction is 1574 // finished. This is necessary because range tombstone processing 1575 // requires the range tombstones to be held in memory for up to the 1576 // lifetime of the compaction. 1577 closer := rangeDelIter 1578 rangeDelIter = noCloseIter{rangeDelIter} 1579 1580 // Truncate the range tombstones returned by the iterator to the 1581 // upper bound of the atomic compaction unit of the file. We want to 1582 // truncate the range tombstone to the bounds of the file, but files 1583 // with split user keys pose an obstacle: The file's largest bound 1584 // is inclusive whereas the range tombstone's end is exclusive. 1585 // 1586 // Consider the example: 1587 // 1588 // 000001:[b-f#200] range del [c,k) 1589 // 000002:[f#190-g#inf] range del [c,k) 1590 // 000003:[g#500-i#3] 1591 // 1592 // Files 000001 and 000002 contain the untruncated range tombstones 1593 // [c,k). While the keyspace covered by 000003 was at one point 1594 // deleted by the tombstone [c,k), the tombstone may have already 1595 // been compacted away and the file does not contain an untruncated 1596 // range tombstone. We want to bound 000001's tombstone to the file 1597 // bounds, but it's not possible to encode a range tombstone with an 1598 // end boundary within a user key (eg, between sequence numbers 1599 // f#200 and f#190). Instead, we expand 000001 to its atomic 1600 // compaction unit (000001 and 000002) and truncate the tombstone to 1601 // g#inf. 1602 // 1603 // NB: We must not use the atomic compaction unit of the entire 1604 // compaction, because the [c,k) tombstone contained in the file 1605 // 000001 ≥ g. If 000001, 000002 and 000003 are all included in the 1606 // same compaction, the compaction's atomic compaction unit includes 1607 // 000003. However 000003's keys must not be covered by 000001's 1608 // untruncated range tombstone. 1609 // 1610 // Note that we need do this truncation at read time in order to 1611 // handle sstables generated by RocksDB and earlier versions of 1612 // Pebble which do not truncate range tombstones to atomic 1613 // compaction unit boundaries at write time. 1614 // 1615 // The current Pebble compaction logic DOES truncate tombstones to 1616 // atomic unit boundaries at compaction time too. 1617 atomicUnit, _ := expandToAtomicUnit(c.cmp, f.Slice(), true /* disableIsCompacting */) 1618 lowerBound, upperBound := manifest.KeyRange(c.cmp, atomicUnit.Iter()) 1619 // Range deletion tombstones are often written to sstables 1620 // untruncated on the end key side. However, they are still only 1621 // valid within a given file's bounds. The logic for writing range 1622 // tombstones to an output file sometimes has an incomplete view 1623 // of range tombstones outside the file's internal key bounds. Skip 1624 // any range tombstones completely outside file bounds. 1625 rangeDelIter = keyspan.Truncate( 1626 c.cmp, rangeDelIter, lowerBound.UserKey, upperBound.UserKey, 1627 &f.Smallest, &f.Largest, false, /* panicOnUpperTruncate */ 1628 ) 1629 return rangeDelIter, closer, nil 1630 } 1631 1632 func (c *compaction) String() string { 1633 if len(c.flushing) != 0 { 1634 return "flush\n" 1635 } 1636 1637 var buf bytes.Buffer 1638 for level := c.startLevel.level; level <= c.outputLevel.level; level++ { 1639 i := level - c.startLevel.level 1640 fmt.Fprintf(&buf, "%d:", level) 1641 iter := c.inputs[i].files.Iter() 1642 for f := iter.First(); f != nil; f = iter.Next() { 1643 fmt.Fprintf(&buf, " %s:%s-%s", f.FileNum, f.Smallest, f.Largest) 1644 } 1645 fmt.Fprintf(&buf, "\n") 1646 } 1647 return buf.String() 1648 } 1649 1650 type manualCompaction struct { 1651 // Count of the retries either due to too many concurrent compactions, or a 1652 // concurrent compaction to overlapping levels. 1653 retries int 1654 level int 1655 outputLevel int 1656 done chan error 1657 start []byte 1658 end []byte 1659 split bool 1660 } 1661 1662 type readCompaction struct { 1663 level int 1664 // [start, end] key ranges are used for de-duping. 1665 start []byte 1666 end []byte 1667 1668 // The file associated with the compaction. 1669 // If the file no longer belongs in the same 1670 // level, then we skip the compaction. 1671 fileNum base.FileNum 1672 } 1673 1674 type downloadSpan struct { 1675 start []byte 1676 end []byte 1677 // doneChans contains a list of channels passed into compactions as done 1678 // channels. Each channel has a buffer size of 1 and is only passed into 1679 // one compaction. This slice can grow over the lifetime of a downloadSpan. 1680 doneChans []chan error 1681 // compactionsStarted is the number of compactions started for this 1682 // downloadSpan. Must be equal to len(doneChans)-1, i.e. there's one spare 1683 // doneChan created each time a compaction starts up, for the next compaction. 1684 compactionsStarted int 1685 } 1686 1687 func (d *DB) addInProgressCompaction(c *compaction) { 1688 d.mu.compact.inProgress[c] = struct{}{} 1689 var isBase, isIntraL0 bool 1690 for _, cl := range c.inputs { 1691 iter := cl.files.Iter() 1692 for f := iter.First(); f != nil; f = iter.Next() { 1693 if f.IsCompacting() { 1694 d.opts.Logger.Fatalf("L%d->L%d: %s already being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum) 1695 } 1696 f.SetCompactionState(manifest.CompactionStateCompacting) 1697 if c.startLevel != nil && c.outputLevel != nil && c.startLevel.level == 0 { 1698 if c.outputLevel.level == 0 { 1699 f.IsIntraL0Compacting = true 1700 isIntraL0 = true 1701 } else { 1702 isBase = true 1703 } 1704 } 1705 } 1706 } 1707 1708 if (isIntraL0 || isBase) && c.version.L0Sublevels != nil { 1709 l0Inputs := []manifest.LevelSlice{c.startLevel.files} 1710 if isIntraL0 { 1711 l0Inputs = append(l0Inputs, c.outputLevel.files) 1712 } 1713 if err := c.version.L0Sublevels.UpdateStateForStartedCompaction(l0Inputs, isBase); err != nil { 1714 d.opts.Logger.Fatalf("could not update state for compaction: %s", err) 1715 } 1716 } 1717 } 1718 1719 // Removes compaction markers from files in a compaction. The rollback parameter 1720 // indicates whether the compaction state should be rolled back to its original 1721 // state in the case of an unsuccessful compaction. 1722 // 1723 // DB.mu must be held when calling this method, however this method can drop and 1724 // re-acquire that mutex. All writes to the manifest for this compaction should 1725 // have completed by this point. 1726 func (d *DB) clearCompactingState(c *compaction, rollback bool) { 1727 c.versionEditApplied = true 1728 for _, cl := range c.inputs { 1729 iter := cl.files.Iter() 1730 for f := iter.First(); f != nil; f = iter.Next() { 1731 if !f.IsCompacting() { 1732 d.opts.Logger.Fatalf("L%d->L%d: %s not being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum) 1733 } 1734 if !rollback { 1735 // On success all compactions other than move-compactions transition the 1736 // file into the Compacted state. Move-compacted files become eligible 1737 // for compaction again and transition back to NotCompacting. 1738 if c.kind != compactionKindMove { 1739 f.SetCompactionState(manifest.CompactionStateCompacted) 1740 } else { 1741 f.SetCompactionState(manifest.CompactionStateNotCompacting) 1742 } 1743 } else { 1744 // Else, on rollback, all input files unconditionally transition back to 1745 // NotCompacting. 1746 f.SetCompactionState(manifest.CompactionStateNotCompacting) 1747 } 1748 f.IsIntraL0Compacting = false 1749 } 1750 } 1751 l0InProgress := inProgressL0Compactions(d.getInProgressCompactionInfoLocked(c)) 1752 func() { 1753 // InitCompactingFileInfo requires that no other manifest writes be 1754 // happening in parallel with it, i.e. we're not in the midst of installing 1755 // another version. Otherwise, it's possible that we've created another 1756 // L0Sublevels instance, but not added it to the versions list, causing 1757 // all the indices in FileMetadata to be inaccurate. To ensure this, 1758 // grab the manifest lock. 1759 d.mu.versions.logLock() 1760 defer d.mu.versions.logUnlock() 1761 d.mu.versions.currentVersion().L0Sublevels.InitCompactingFileInfo(l0InProgress) 1762 }() 1763 } 1764 1765 func (d *DB) calculateDiskAvailableBytes() uint64 { 1766 if space, err := d.opts.FS.GetDiskUsage(d.dirname); err == nil { 1767 d.diskAvailBytes.Store(space.AvailBytes) 1768 return space.AvailBytes 1769 } else if !errors.Is(err, vfs.ErrUnsupported) { 1770 d.opts.EventListener.BackgroundError(err) 1771 } 1772 return d.diskAvailBytes.Load() 1773 } 1774 1775 func (d *DB) getDeletionPacerInfo() deletionPacerInfo { 1776 var pacerInfo deletionPacerInfo 1777 // Call GetDiskUsage after every file deletion. This may seem inefficient, 1778 // but in practice this was observed to take constant time, regardless of 1779 // volume size used, at least on linux with ext4 and zfs. All invocations 1780 // take 10 microseconds or less. 1781 pacerInfo.freeBytes = d.calculateDiskAvailableBytes() 1782 d.mu.Lock() 1783 pacerInfo.obsoleteBytes = d.mu.versions.metrics.Table.ObsoleteSize 1784 pacerInfo.liveBytes = uint64(d.mu.versions.metrics.Total().Size) 1785 d.mu.Unlock() 1786 return pacerInfo 1787 } 1788 1789 // onObsoleteTableDelete is called to update metrics when an sstable is deleted. 1790 func (d *DB) onObsoleteTableDelete(fileSize uint64) { 1791 d.mu.Lock() 1792 d.mu.versions.metrics.Table.ObsoleteCount-- 1793 d.mu.versions.metrics.Table.ObsoleteSize -= fileSize 1794 d.mu.Unlock() 1795 } 1796 1797 // maybeScheduleFlush schedules a flush if necessary. 1798 // 1799 // d.mu must be held when calling this. 1800 func (d *DB) maybeScheduleFlush() { 1801 if d.mu.compact.flushing || d.closed.Load() != nil || d.opts.ReadOnly { 1802 return 1803 } 1804 if len(d.mu.mem.queue) <= 1 { 1805 return 1806 } 1807 1808 if !d.passedFlushThreshold() { 1809 return 1810 } 1811 1812 d.mu.compact.flushing = true 1813 go d.flush() 1814 } 1815 1816 func (d *DB) passedFlushThreshold() bool { 1817 var n int 1818 var size uint64 1819 for ; n < len(d.mu.mem.queue)-1; n++ { 1820 if !d.mu.mem.queue[n].readyForFlush() { 1821 break 1822 } 1823 if d.mu.mem.queue[n].flushForced { 1824 // A flush was forced. Pretend the memtable size is the configured 1825 // size. See minFlushSize below. 1826 size += d.opts.MemTableSize 1827 } else { 1828 size += d.mu.mem.queue[n].totalBytes() 1829 } 1830 } 1831 if n == 0 { 1832 // None of the immutable memtables are ready for flushing. 1833 return false 1834 } 1835 1836 // Only flush once the sum of the queued memtable sizes exceeds half the 1837 // configured memtable size. This prevents flushing of memtables at startup 1838 // while we're undergoing the ramp period on the memtable size. See 1839 // DB.newMemTable(). 1840 minFlushSize := d.opts.MemTableSize / 2 1841 return size >= minFlushSize 1842 } 1843 1844 func (d *DB) maybeScheduleDelayedFlush(tbl *memTable, dur time.Duration) { 1845 var mem *flushableEntry 1846 for _, m := range d.mu.mem.queue { 1847 if m.flushable == tbl { 1848 mem = m 1849 break 1850 } 1851 } 1852 if mem == nil || mem.flushForced { 1853 return 1854 } 1855 deadline := d.timeNow().Add(dur) 1856 if !mem.delayedFlushForcedAt.IsZero() && deadline.After(mem.delayedFlushForcedAt) { 1857 // Already scheduled to flush sooner than within `dur`. 1858 return 1859 } 1860 mem.delayedFlushForcedAt = deadline 1861 go func() { 1862 timer := time.NewTimer(dur) 1863 defer timer.Stop() 1864 1865 select { 1866 case <-d.closedCh: 1867 return 1868 case <-mem.flushed: 1869 return 1870 case <-timer.C: 1871 d.commit.mu.Lock() 1872 defer d.commit.mu.Unlock() 1873 d.mu.Lock() 1874 defer d.mu.Unlock() 1875 1876 // NB: The timer may fire concurrently with a call to Close. If a 1877 // Close call beat us to acquiring d.mu, d.closed holds ErrClosed, 1878 // and it's too late to flush anything. Otherwise, the Close call 1879 // will block on locking d.mu until we've finished scheduling the 1880 // flush and set `d.mu.compact.flushing` to true. Close will wait 1881 // for the current flush to complete. 1882 if d.closed.Load() != nil { 1883 return 1884 } 1885 1886 if d.mu.mem.mutable == tbl { 1887 d.makeRoomForWrite(nil) 1888 } else { 1889 mem.flushForced = true 1890 } 1891 d.maybeScheduleFlush() 1892 } 1893 }() 1894 } 1895 1896 func (d *DB) flush() { 1897 pprof.Do(context.Background(), flushLabels, func(context.Context) { 1898 flushingWorkStart := time.Now() 1899 d.mu.Lock() 1900 defer d.mu.Unlock() 1901 idleDuration := flushingWorkStart.Sub(d.mu.compact.noOngoingFlushStartTime) 1902 var bytesFlushed uint64 1903 var err error 1904 if bytesFlushed, err = d.flush1(); err != nil { 1905 // TODO(peter): count consecutive flush errors and backoff. 1906 d.opts.EventListener.BackgroundError(err) 1907 } 1908 d.mu.compact.flushing = false 1909 d.mu.compact.noOngoingFlushStartTime = time.Now() 1910 workDuration := d.mu.compact.noOngoingFlushStartTime.Sub(flushingWorkStart) 1911 d.mu.compact.flushWriteThroughput.Bytes += int64(bytesFlushed) 1912 d.mu.compact.flushWriteThroughput.WorkDuration += workDuration 1913 d.mu.compact.flushWriteThroughput.IdleDuration += idleDuration 1914 // More flush work may have arrived while we were flushing, so schedule 1915 // another flush if needed. 1916 d.maybeScheduleFlush() 1917 // The flush may have produced too many files in a level, so schedule a 1918 // compaction if needed. 1919 d.maybeScheduleCompaction() 1920 d.mu.compact.cond.Broadcast() 1921 }) 1922 } 1923 1924 // runIngestFlush is used to generate a flush version edit for sstables which 1925 // were ingested as flushables. Both DB.mu and the manifest lock must be held 1926 // while runIngestFlush is called. 1927 func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) { 1928 if len(c.flushing) != 1 { 1929 panic("pebble: ingestedFlushable must be flushed one at a time.") 1930 } 1931 1932 // Construct the VersionEdit, levelMetrics etc. 1933 c.metrics = make(map[int]*LevelMetrics, numLevels) 1934 // Finding the target level for ingestion must use the latest version 1935 // after the logLock has been acquired. 1936 c.version = d.mu.versions.currentVersion() 1937 1938 baseLevel := d.mu.versions.picker.getBaseLevel() 1939 iterOpts := IterOptions{logger: d.opts.Logger} 1940 ve := &versionEdit{} 1941 var level int 1942 var err error 1943 var fileToSplit *fileMetadata 1944 var ingestSplitFiles []ingestSplitFile 1945 for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files { 1946 suggestSplit := d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit() && 1947 d.FormatMajorVersion() >= FormatVirtualSSTables 1948 level, fileToSplit, err = ingestTargetLevel( 1949 d.newIters, d.tableNewRangeKeyIter, iterOpts, d.opts.Comparer, 1950 c.version, baseLevel, d.mu.compact.inProgress, file.FileMetadata, 1951 suggestSplit, 1952 ) 1953 if err != nil { 1954 return nil, err 1955 } 1956 ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: file.FileMetadata}) 1957 if fileToSplit != nil { 1958 ingestSplitFiles = append(ingestSplitFiles, ingestSplitFile{ 1959 ingestFile: file.FileMetadata, 1960 splitFile: fileToSplit, 1961 level: level, 1962 }) 1963 } 1964 levelMetrics := c.metrics[level] 1965 if levelMetrics == nil { 1966 levelMetrics = &LevelMetrics{} 1967 c.metrics[level] = levelMetrics 1968 } 1969 levelMetrics.BytesIngested += file.Size 1970 levelMetrics.TablesIngested++ 1971 } 1972 1973 updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) { 1974 levelMetrics := c.metrics[level] 1975 if levelMetrics == nil { 1976 levelMetrics = &LevelMetrics{} 1977 c.metrics[level] = levelMetrics 1978 } 1979 levelMetrics.NumFiles-- 1980 levelMetrics.Size -= int64(m.Size) 1981 for i := range added { 1982 levelMetrics.NumFiles++ 1983 levelMetrics.Size += int64(added[i].Meta.Size) 1984 } 1985 } 1986 1987 if len(ingestSplitFiles) > 0 { 1988 ve.DeletedFiles = make(map[manifest.DeletedFileEntry]*manifest.FileMetadata) 1989 replacedFiles := make(map[base.FileNum][]newFileEntry) 1990 if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, ingestSplitFiles, replacedFiles); err != nil { 1991 return nil, err 1992 } 1993 } 1994 1995 return ve, nil 1996 } 1997 1998 // flush runs a compaction that copies the immutable memtables from memory to 1999 // disk. 2000 // 2001 // d.mu must be held when calling this, but the mutex may be dropped and 2002 // re-acquired during the course of this method. 2003 func (d *DB) flush1() (bytesFlushed uint64, err error) { 2004 // NB: The flushable queue can contain flushables of type ingestedFlushable. 2005 // The sstables in ingestedFlushable.files must be placed into the appropriate 2006 // level in the lsm. Let's say the flushable queue contains a prefix of 2007 // regular immutable memtables, then an ingestedFlushable, and then the 2008 // mutable memtable. When the flush of the ingestedFlushable is performed, 2009 // it needs an updated view of the lsm. That is, the prefix of immutable 2010 // memtables must have already been flushed. Similarly, if there are two 2011 // contiguous ingestedFlushables in the queue, then the first flushable must 2012 // be flushed, so that the second flushable can see an updated view of the 2013 // lsm. 2014 // 2015 // Given the above, we restrict flushes to either some prefix of regular 2016 // memtables, or a single flushable of type ingestedFlushable. The DB.flush 2017 // function will call DB.maybeScheduleFlush again, so a new flush to finish 2018 // the remaining flush work should be scheduled right away. 2019 // 2020 // NB: Large batches placed in the flushable queue share the WAL with the 2021 // previous memtable in the queue. We must ensure the property that both the 2022 // large batch and the memtable with which it shares a WAL are flushed 2023 // together. The property ensures that the minimum unflushed log number 2024 // isn't incremented incorrectly. Since a flushableBatch.readyToFlush always 2025 // returns true, and since the large batch will always be placed right after 2026 // the memtable with which it shares a WAL, the property is naturally 2027 // ensured. The large batch will always be placed after the memtable with 2028 // which it shares a WAL because we ensure it in DB.commitWrite by holding 2029 // the commitPipeline.mu and then holding DB.mu. As an extra defensive 2030 // measure, if we try to flush the memtable without also flushing the 2031 // flushable batch in the same flush, since the memtable and flushableBatch 2032 // have the same logNum, the logNum invariant check below will trigger. 2033 var n, inputs int 2034 var inputBytes uint64 2035 var ingest bool 2036 for ; n < len(d.mu.mem.queue)-1; n++ { 2037 if f, ok := d.mu.mem.queue[n].flushable.(*ingestedFlushable); ok { 2038 if n == 0 { 2039 // The first flushable is of type ingestedFlushable. Since these 2040 // must be flushed individually, we perform a flush for just 2041 // this. 2042 if !f.readyForFlush() { 2043 // This check is almost unnecessary, but we guard against it 2044 // just in case this invariant changes in the future. 2045 panic("pebble: ingestedFlushable should always be ready to flush.") 2046 } 2047 // By setting n = 1, we ensure that the first flushable(n == 0) 2048 // is scheduled for a flush. The number of tables added is equal to the 2049 // number of files in the ingest operation. 2050 n = 1 2051 inputs = len(f.files) 2052 ingest = true 2053 break 2054 } else { 2055 // There was some prefix of flushables which weren't of type 2056 // ingestedFlushable. So, perform a flush for those. 2057 break 2058 } 2059 } 2060 if !d.mu.mem.queue[n].readyForFlush() { 2061 break 2062 } 2063 inputBytes += d.mu.mem.queue[n].inuseBytes() 2064 } 2065 if n == 0 { 2066 // None of the immutable memtables are ready for flushing. 2067 return 0, nil 2068 } 2069 if !ingest { 2070 // Flushes of memtables add the prefix of n memtables from the flushable 2071 // queue. 2072 inputs = n 2073 } 2074 2075 // Require that every memtable being flushed has a log number less than the 2076 // new minimum unflushed log number. 2077 minUnflushedLogNum := d.mu.mem.queue[n].logNum 2078 if !d.opts.DisableWAL { 2079 for i := 0; i < n; i++ { 2080 if logNum := d.mu.mem.queue[i].logNum; logNum >= minUnflushedLogNum { 2081 panic(errors.AssertionFailedf("logNum invariant violated: flushing %d items; %d:type=%T,logNum=%d; %d:type=%T,logNum=%d", 2082 n, 2083 i, d.mu.mem.queue[i].flushable, logNum, 2084 n, d.mu.mem.queue[n].flushable, minUnflushedLogNum)) 2085 } 2086 } 2087 } 2088 2089 c := newFlush(d.opts, d.mu.versions.currentVersion(), 2090 d.mu.versions.picker.getBaseLevel(), d.mu.mem.queue[:n], d.timeNow()) 2091 d.addInProgressCompaction(c) 2092 2093 jobID := d.mu.nextJobID 2094 d.mu.nextJobID++ 2095 d.opts.EventListener.FlushBegin(FlushInfo{ 2096 JobID: jobID, 2097 Input: inputs, 2098 InputBytes: inputBytes, 2099 Ingest: ingest, 2100 }) 2101 startTime := d.timeNow() 2102 2103 var ve *manifest.VersionEdit 2104 var pendingOutputs []physicalMeta 2105 var stats compactStats 2106 // To determine the target level of the files in the ingestedFlushable, we 2107 // need to acquire the logLock, and not release it for that duration. Since, 2108 // we need to acquire the logLock below to perform the logAndApply step 2109 // anyway, we create the VersionEdit for ingestedFlushable outside of 2110 // runCompaction. For all other flush cases, we construct the VersionEdit 2111 // inside runCompaction. 2112 if c.kind != compactionKindIngestedFlushable { 2113 ve, pendingOutputs, stats, err = d.runCompaction(jobID, c) 2114 } 2115 2116 // Acquire logLock. This will be released either on an error, by way of 2117 // logUnlock, or through a call to logAndApply if there is no error. 2118 d.mu.versions.logLock() 2119 2120 if c.kind == compactionKindIngestedFlushable { 2121 ve, err = d.runIngestFlush(c) 2122 } 2123 2124 info := FlushInfo{ 2125 JobID: jobID, 2126 Input: inputs, 2127 InputBytes: inputBytes, 2128 Duration: d.timeNow().Sub(startTime), 2129 Done: true, 2130 Ingest: ingest, 2131 Err: err, 2132 } 2133 if err == nil { 2134 for i := range ve.NewFiles { 2135 e := &ve.NewFiles[i] 2136 info.Output = append(info.Output, e.Meta.TableInfo()) 2137 // Ingested tables are not necessarily flushed to L0. Record the level of 2138 // each ingested file explicitly. 2139 if ingest { 2140 info.IngestLevels = append(info.IngestLevels, e.Level) 2141 } 2142 } 2143 if len(ve.NewFiles) == 0 { 2144 info.Err = errEmptyTable 2145 } 2146 2147 // The flush succeeded or it produced an empty sstable. In either case we 2148 // want to bump the minimum unflushed log number to the log number of the 2149 // oldest unflushed memtable. 2150 ve.MinUnflushedLogNum = minUnflushedLogNum 2151 if c.kind != compactionKindIngestedFlushable { 2152 metrics := c.metrics[0] 2153 if d.opts.DisableWAL { 2154 // If the WAL is disabled, every flushable has a zero [logSize], 2155 // resulting in zero bytes in. Instead, use the number of bytes we 2156 // flushed as the BytesIn. This ensures we get a reasonable w-amp 2157 // calculation even when the WAL is disabled. 2158 metrics.BytesIn = metrics.BytesFlushed 2159 } else { 2160 metrics := c.metrics[0] 2161 for i := 0; i < n; i++ { 2162 metrics.BytesIn += d.mu.mem.queue[i].logSize 2163 } 2164 } 2165 } else if len(ve.DeletedFiles) > 0 { 2166 // c.kind == compactionKindIngestedFlushable && we have deleted files due 2167 // to ingest-time splits. 2168 // 2169 // Iterate through all other compactions, and check if their inputs have 2170 // been replaced due to an ingest-time split. In that case, cancel the 2171 // compaction. 2172 for c2 := range d.mu.compact.inProgress { 2173 for i := range c2.inputs { 2174 iter := c2.inputs[i].files.Iter() 2175 for f := iter.First(); f != nil; f = iter.Next() { 2176 if _, ok := ve.DeletedFiles[deletedFileEntry{FileNum: f.FileNum, Level: c2.inputs[i].level}]; ok { 2177 c2.cancel.Store(true) 2178 break 2179 } 2180 } 2181 } 2182 } 2183 } 2184 err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false, /* forceRotation */ 2185 func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) }) 2186 if err != nil { 2187 info.Err = err 2188 // TODO(peter): untested. 2189 for _, f := range pendingOutputs { 2190 // Note that the FileBacking for the file metadata might not have 2191 // been set yet. So, we directly use the FileNum. Since these 2192 // files were generated as compaction outputs, these must be 2193 // physical files on disk. This property might not hold once 2194 // https://github.com/cockroachdb/pebble/issues/389 is 2195 // implemented if #389 creates virtual sstables as output files. 2196 d.mu.versions.obsoleteTables = append( 2197 d.mu.versions.obsoleteTables, 2198 fileInfo{f.FileNum.DiskFileNum(), f.Size}, 2199 ) 2200 } 2201 d.mu.versions.updateObsoleteTableMetricsLocked() 2202 } 2203 } else { 2204 // We won't be performing the logAndApply step because of the error, 2205 // so logUnlock. 2206 d.mu.versions.logUnlock() 2207 } 2208 2209 bytesFlushed = c.bytesIterated 2210 2211 // If err != nil, then the flush will be retried, and we will recalculate 2212 // these metrics. 2213 if err == nil { 2214 d.mu.snapshots.cumulativePinnedCount += stats.cumulativePinnedKeys 2215 d.mu.snapshots.cumulativePinnedSize += stats.cumulativePinnedSize 2216 d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.countMissizedDels 2217 d.maybeUpdateDeleteCompactionHints(c) 2218 } 2219 2220 d.clearCompactingState(c, err != nil) 2221 delete(d.mu.compact.inProgress, c) 2222 d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics) 2223 2224 var flushed flushableList 2225 if err == nil { 2226 flushed = d.mu.mem.queue[:n] 2227 d.mu.mem.queue = d.mu.mem.queue[n:] 2228 d.updateReadStateLocked(d.opts.DebugCheck) 2229 d.updateTableStatsLocked(ve.NewFiles) 2230 if ingest { 2231 d.mu.versions.metrics.Flush.AsIngestCount++ 2232 for _, l := range c.metrics { 2233 d.mu.versions.metrics.Flush.AsIngestBytes += l.BytesIngested 2234 d.mu.versions.metrics.Flush.AsIngestTableCount += l.TablesIngested 2235 } 2236 } 2237 2238 // Update if any eventually file-only snapshots have now transitioned to 2239 // being file-only. 2240 earliestUnflushedSeqNum := d.getEarliestUnflushedSeqNumLocked() 2241 currentVersion := d.mu.versions.currentVersion() 2242 for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; { 2243 if s.efos == nil { 2244 s = s.next 2245 continue 2246 } 2247 if base.Visible(earliestUnflushedSeqNum, s.efos.seqNum, InternalKeySeqNumMax) { 2248 s = s.next 2249 continue 2250 } 2251 if s.efos.excised.Load() { 2252 // If a concurrent excise has happened that overlaps with one of the key 2253 // ranges this snapshot is interested in, this EFOS cannot transition to 2254 // a file-only snapshot as keys in that range could now be deleted. Move 2255 // onto the next snapshot. 2256 s = s.next 2257 continue 2258 } 2259 currentVersion.Ref() 2260 2261 // NB: s.efos.transitionToFileOnlySnapshot could close s, in which 2262 // case s.next would be nil. Save it before calling it. 2263 next := s.next 2264 _ = s.efos.transitionToFileOnlySnapshot(currentVersion) 2265 s = next 2266 } 2267 } 2268 // Signal FlushEnd after installing the new readState. This helps for unit 2269 // tests that use the callback to trigger a read using an iterator with 2270 // IterOptions.OnlyReadGuaranteedDurable. 2271 info.TotalDuration = d.timeNow().Sub(startTime) 2272 d.opts.EventListener.FlushEnd(info) 2273 2274 // The order of these operations matters here for ease of testing. 2275 // Removing the reader reference first allows tests to be guaranteed that 2276 // the memtable reservation has been released by the time a synchronous 2277 // flush returns. readerUnrefLocked may also produce obsolete files so the 2278 // call to deleteObsoleteFiles must happen after it. 2279 for i := range flushed { 2280 flushed[i].readerUnrefLocked(true) 2281 } 2282 2283 d.deleteObsoleteFiles(jobID) 2284 2285 // Mark all the memtables we flushed as flushed. 2286 for i := range flushed { 2287 close(flushed[i].flushed) 2288 } 2289 2290 return bytesFlushed, err 2291 } 2292 2293 // maybeScheduleCompactionAsync should be used when 2294 // we want to possibly schedule a compaction, but don't 2295 // want to eat the cost of running maybeScheduleCompaction. 2296 // This method should be launched in a separate goroutine. 2297 // d.mu must not be held when this is called. 2298 func (d *DB) maybeScheduleCompactionAsync() { 2299 defer d.compactionSchedulers.Done() 2300 2301 d.mu.Lock() 2302 d.maybeScheduleCompaction() 2303 d.mu.Unlock() 2304 } 2305 2306 // maybeScheduleCompaction schedules a compaction if necessary. 2307 // 2308 // d.mu must be held when calling this. 2309 func (d *DB) maybeScheduleCompaction() { 2310 d.maybeScheduleCompactionPicker(pickAuto) 2311 } 2312 2313 func pickAuto(picker compactionPicker, env compactionEnv) *pickedCompaction { 2314 return picker.pickAuto(env) 2315 } 2316 2317 func pickElisionOnly(picker compactionPicker, env compactionEnv) *pickedCompaction { 2318 return picker.pickElisionOnlyCompaction(env) 2319 } 2320 2321 // maybeScheduleDownloadCompaction schedules a download compaction. 2322 // 2323 // Requires d.mu to be held. 2324 func (d *DB) maybeScheduleDownloadCompaction(env compactionEnv, maxConcurrentCompactions int) { 2325 for len(d.mu.compact.downloads) > 0 && d.mu.compact.compactingCount < maxConcurrentCompactions { 2326 v := d.mu.versions.currentVersion() 2327 download := d.mu.compact.downloads[0] 2328 env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) 2329 var externalFile *fileMetadata 2330 var err error 2331 var level int 2332 for i := range v.Levels { 2333 overlaps := v.Overlaps(i, d.cmp, download.start, download.end, true /* exclusiveEnd */) 2334 iter := overlaps.Iter() 2335 provider := d.objProvider 2336 for f := iter.First(); f != nil; f = iter.Next() { 2337 var objMeta objstorage.ObjectMetadata 2338 objMeta, err = provider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum) 2339 if err != nil { 2340 break 2341 } 2342 if objMeta.IsExternal() { 2343 if f.IsCompacting() { 2344 continue 2345 } 2346 externalFile = f 2347 level = i 2348 break 2349 } 2350 } 2351 if externalFile != nil || err != nil { 2352 break 2353 } 2354 } 2355 if err != nil { 2356 d.mu.compact.downloads = d.mu.compact.downloads[1:] 2357 download.doneChans[download.compactionsStarted] <- err 2358 continue 2359 } 2360 if externalFile == nil { 2361 // The entirety of this span is downloaded, or is being downloaded right 2362 // now. No need to schedule additional downloads for this span. 2363 d.mu.compact.downloads = d.mu.compact.downloads[1:] 2364 continue 2365 } 2366 pc := pickDownloadCompaction(v, d.opts, env, d.mu.versions.picker.getBaseLevel(), download, level, externalFile) 2367 if pc != nil { 2368 doneCh := download.doneChans[download.compactionsStarted] 2369 download.compactionsStarted++ 2370 // Create another doneChan for the next compaction. 2371 download.doneChans = append(download.doneChans, make(chan error, 1)) 2372 2373 c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider()) 2374 d.mu.compact.compactingCount++ 2375 d.addInProgressCompaction(c) 2376 go d.compact(c, doneCh) 2377 } 2378 } 2379 } 2380 2381 // maybeScheduleCompactionPicker schedules a compaction if necessary, 2382 // calling `pickFunc` to pick automatic compactions. 2383 // 2384 // d.mu must be held when calling this. 2385 func (d *DB) maybeScheduleCompactionPicker( 2386 pickFunc func(compactionPicker, compactionEnv) *pickedCompaction, 2387 ) { 2388 if d.closed.Load() != nil || d.opts.ReadOnly { 2389 return 2390 } 2391 maxConcurrentCompactions := d.opts.MaxConcurrentCompactions() 2392 if d.mu.compact.compactingCount >= maxConcurrentCompactions { 2393 if len(d.mu.compact.manual) > 0 { 2394 // Inability to run head blocks later manual compactions. 2395 d.mu.compact.manual[0].retries++ 2396 } 2397 return 2398 } 2399 2400 // Compaction picking needs a coherent view of a Version. In particular, we 2401 // need to exlude concurrent ingestions from making a decision on which level 2402 // to ingest into that conflicts with our compaction 2403 // decision. versionSet.logLock provides the necessary mutual exclusion. 2404 d.mu.versions.logLock() 2405 defer d.mu.versions.logUnlock() 2406 2407 // Check for the closed flag again, in case the DB was closed while we were 2408 // waiting for logLock(). 2409 if d.closed.Load() != nil { 2410 return 2411 } 2412 2413 env := compactionEnv{ 2414 diskAvailBytes: d.diskAvailBytes.Load(), 2415 earliestSnapshotSeqNum: d.mu.snapshots.earliest(), 2416 earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(), 2417 } 2418 2419 // Check for delete-only compactions first, because they're expected to be 2420 // cheap and reduce future compaction work. 2421 if !d.opts.private.disableDeleteOnlyCompactions && 2422 len(d.mu.compact.deletionHints) > 0 && 2423 !d.opts.DisableAutomaticCompactions { 2424 v := d.mu.versions.currentVersion() 2425 snapshots := d.mu.snapshots.toSlice() 2426 inputs, unresolvedHints := checkDeleteCompactionHints(d.cmp, v, d.mu.compact.deletionHints, snapshots) 2427 d.mu.compact.deletionHints = unresolvedHints 2428 2429 if len(inputs) > 0 { 2430 c := newDeleteOnlyCompaction(d.opts, v, inputs, d.timeNow()) 2431 d.mu.compact.compactingCount++ 2432 d.addInProgressCompaction(c) 2433 go d.compact(c, nil) 2434 } 2435 } 2436 2437 for len(d.mu.compact.manual) > 0 && d.mu.compact.compactingCount < maxConcurrentCompactions { 2438 v := d.mu.versions.currentVersion() 2439 manual := d.mu.compact.manual[0] 2440 env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) 2441 pc, retryLater := pickManualCompaction(v, d.opts, env, d.mu.versions.picker.getBaseLevel(), manual) 2442 if pc != nil { 2443 c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider()) 2444 d.mu.compact.manual = d.mu.compact.manual[1:] 2445 d.mu.compact.compactingCount++ 2446 d.addInProgressCompaction(c) 2447 go d.compact(c, manual.done) 2448 } else if !retryLater { 2449 // Noop 2450 d.mu.compact.manual = d.mu.compact.manual[1:] 2451 manual.done <- nil 2452 } else { 2453 // Inability to run head blocks later manual compactions. 2454 manual.retries++ 2455 break 2456 } 2457 } 2458 2459 for !d.opts.DisableAutomaticCompactions && d.mu.compact.compactingCount < maxConcurrentCompactions { 2460 env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) 2461 env.readCompactionEnv = readCompactionEnv{ 2462 readCompactions: &d.mu.compact.readCompactions, 2463 flushing: d.mu.compact.flushing || d.passedFlushThreshold(), 2464 rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction, 2465 } 2466 pc := pickFunc(d.mu.versions.picker, env) 2467 if pc == nil { 2468 break 2469 } 2470 c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider()) 2471 d.mu.compact.compactingCount++ 2472 d.addInProgressCompaction(c) 2473 go d.compact(c, nil) 2474 } 2475 2476 d.maybeScheduleDownloadCompaction(env, maxConcurrentCompactions) 2477 } 2478 2479 // deleteCompactionHintType indicates whether the deleteCompactionHint was 2480 // generated from a span containing a range del (point key only), a range key 2481 // delete (range key only), or both a point and range key. 2482 type deleteCompactionHintType uint8 2483 2484 const ( 2485 // NOTE: While these are primarily used as enumeration types, they are also 2486 // used for some bitwise operations. Care should be taken when updating. 2487 deleteCompactionHintTypeUnknown deleteCompactionHintType = iota 2488 deleteCompactionHintTypePointKeyOnly 2489 deleteCompactionHintTypeRangeKeyOnly 2490 deleteCompactionHintTypePointAndRangeKey 2491 ) 2492 2493 // String implements fmt.Stringer. 2494 func (h deleteCompactionHintType) String() string { 2495 switch h { 2496 case deleteCompactionHintTypeUnknown: 2497 return "unknown" 2498 case deleteCompactionHintTypePointKeyOnly: 2499 return "point-key-only" 2500 case deleteCompactionHintTypeRangeKeyOnly: 2501 return "range-key-only" 2502 case deleteCompactionHintTypePointAndRangeKey: 2503 return "point-and-range-key" 2504 default: 2505 panic(fmt.Sprintf("unknown hint type: %d", h)) 2506 } 2507 } 2508 2509 // compactionHintFromKeys returns a deleteCompactionHintType given a slice of 2510 // keyspan.Keys. 2511 func compactionHintFromKeys(keys []keyspan.Key) deleteCompactionHintType { 2512 var hintType deleteCompactionHintType 2513 for _, k := range keys { 2514 switch k.Kind() { 2515 case base.InternalKeyKindRangeDelete: 2516 hintType |= deleteCompactionHintTypePointKeyOnly 2517 case base.InternalKeyKindRangeKeyDelete: 2518 hintType |= deleteCompactionHintTypeRangeKeyOnly 2519 default: 2520 panic(fmt.Sprintf("unsupported key kind: %s", k.Kind())) 2521 } 2522 } 2523 return hintType 2524 } 2525 2526 // A deleteCompactionHint records a user key and sequence number span that has been 2527 // deleted by a range tombstone. A hint is recorded if at least one sstable 2528 // falls completely within both the user key and sequence number spans. 2529 // Once the tombstones and the observed completely-contained sstables fall 2530 // into the same snapshot stripe, a delete-only compaction may delete any 2531 // sstables within the range. 2532 type deleteCompactionHint struct { 2533 // The type of key span that generated this hint (point key, range key, or 2534 // both). 2535 hintType deleteCompactionHintType 2536 // start and end are user keys specifying a key range [start, end) of 2537 // deleted keys. 2538 start []byte 2539 end []byte 2540 // The level of the file containing the range tombstone(s) when the hint 2541 // was created. Only lower levels need to be searched for files that may 2542 // be deleted. 2543 tombstoneLevel int 2544 // The file containing the range tombstone(s) that created the hint. 2545 tombstoneFile *fileMetadata 2546 // The smallest and largest sequence numbers of the abutting tombstones 2547 // merged to form this hint. All of a tables' keys must be less than the 2548 // tombstone smallest sequence number to be deleted. All of a tables' 2549 // sequence numbers must fall into the same snapshot stripe as the 2550 // tombstone largest sequence number to be deleted. 2551 tombstoneLargestSeqNum uint64 2552 tombstoneSmallestSeqNum uint64 2553 // The smallest sequence number of a sstable that was found to be covered 2554 // by this hint. The hint cannot be resolved until this sequence number is 2555 // in the same snapshot stripe as the largest tombstone sequence number. 2556 // This is set when a hint is created, so the LSM may look different and 2557 // notably no longer contain the sstable that contained the key at this 2558 // sequence number. 2559 fileSmallestSeqNum uint64 2560 } 2561 2562 func (h deleteCompactionHint) String() string { 2563 return fmt.Sprintf( 2564 "L%d.%s %s-%s seqnums(tombstone=%d-%d, file-smallest=%d, type=%s)", 2565 h.tombstoneLevel, h.tombstoneFile.FileNum, h.start, h.end, 2566 h.tombstoneSmallestSeqNum, h.tombstoneLargestSeqNum, h.fileSmallestSeqNum, 2567 h.hintType, 2568 ) 2569 } 2570 2571 func (h *deleteCompactionHint) canDelete(cmp Compare, m *fileMetadata, snapshots []uint64) bool { 2572 // The file can only be deleted if all of its keys are older than the 2573 // earliest tombstone aggregated into the hint. 2574 if m.LargestSeqNum >= h.tombstoneSmallestSeqNum || m.SmallestSeqNum < h.fileSmallestSeqNum { 2575 return false 2576 } 2577 2578 // The file's oldest key must be in the same snapshot stripe as the 2579 // newest tombstone. NB: We already checked the hint's sequence numbers, 2580 // but this file's oldest sequence number might be lower than the hint's 2581 // smallest sequence number despite the file falling within the key range 2582 // if this file was constructed after the hint by a compaction. 2583 ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots) 2584 fi, _ := snapshotIndex(m.SmallestSeqNum, snapshots) 2585 if ti != fi { 2586 return false 2587 } 2588 2589 switch h.hintType { 2590 case deleteCompactionHintTypePointKeyOnly: 2591 // A hint generated by a range del span cannot delete tables that contain 2592 // range keys. 2593 if m.HasRangeKeys { 2594 return false 2595 } 2596 case deleteCompactionHintTypeRangeKeyOnly: 2597 // A hint generated by a range key del span cannot delete tables that 2598 // contain point keys. 2599 if m.HasPointKeys { 2600 return false 2601 } 2602 case deleteCompactionHintTypePointAndRangeKey: 2603 // A hint from a span that contains both range dels *and* range keys can 2604 // only be deleted if both bounds fall within the hint. The next check takes 2605 // care of this. 2606 default: 2607 panic(fmt.Sprintf("pebble: unknown delete compaction hint type: %d", h.hintType)) 2608 } 2609 2610 // The file's keys must be completely contained within the hint range. 2611 return cmp(h.start, m.Smallest.UserKey) <= 0 && cmp(m.Largest.UserKey, h.end) < 0 2612 } 2613 2614 func (d *DB) maybeUpdateDeleteCompactionHints(c *compaction) { 2615 // Compactions that zero sequence numbers can interfere with compaction 2616 // deletion hints. Deletion hints apply to tables containing keys older 2617 // than a threshold. If a key more recent than the threshold is zeroed in 2618 // a compaction, a delete-only compaction may mistake it as meeting the 2619 // threshold and drop a table containing live data. 2620 // 2621 // To avoid this scenario, compactions that zero sequence numbers remove 2622 // any conflicting deletion hints. A deletion hint is conflicting if both 2623 // of the following conditions apply: 2624 // * its key space overlaps with the compaction 2625 // * at least one of its inputs contains a key as recent as one of the 2626 // hint's tombstones. 2627 // 2628 if !c.allowedZeroSeqNum { 2629 return 2630 } 2631 2632 updatedHints := d.mu.compact.deletionHints[:0] 2633 for _, h := range d.mu.compact.deletionHints { 2634 // If the compaction's key space is disjoint from the hint's key 2635 // space, the zeroing of sequence numbers won't affect the hint. Keep 2636 // the hint. 2637 keysDisjoint := d.cmp(h.end, c.smallest.UserKey) < 0 || d.cmp(h.start, c.largest.UserKey) > 0 2638 if keysDisjoint { 2639 updatedHints = append(updatedHints, h) 2640 continue 2641 } 2642 2643 // All of the compaction's inputs must be older than the hint's 2644 // tombstones. 2645 inputsOlder := true 2646 for _, in := range c.inputs { 2647 iter := in.files.Iter() 2648 for f := iter.First(); f != nil; f = iter.Next() { 2649 inputsOlder = inputsOlder && f.LargestSeqNum < h.tombstoneSmallestSeqNum 2650 } 2651 } 2652 if inputsOlder { 2653 updatedHints = append(updatedHints, h) 2654 continue 2655 } 2656 2657 // Drop h, because the compaction c may have zeroed sequence numbers 2658 // of keys more recent than some of h's tombstones. 2659 } 2660 d.mu.compact.deletionHints = updatedHints 2661 } 2662 2663 func checkDeleteCompactionHints( 2664 cmp Compare, v *version, hints []deleteCompactionHint, snapshots []uint64, 2665 ) ([]compactionLevel, []deleteCompactionHint) { 2666 var files map[*fileMetadata]bool 2667 var byLevel [numLevels][]*fileMetadata 2668 2669 unresolvedHints := hints[:0] 2670 for _, h := range hints { 2671 // Check each compaction hint to see if it's resolvable. Resolvable 2672 // hints are removed and trigger a delete-only compaction if any files 2673 // in the current LSM still meet their criteria. Unresolvable hints 2674 // are saved and don't trigger a delete-only compaction. 2675 // 2676 // When a compaction hint is created, the sequence numbers of the 2677 // range tombstones and the covered file with the oldest key are 2678 // recorded. The largest tombstone sequence number and the smallest 2679 // file sequence number must be in the same snapshot stripe for the 2680 // hint to be resolved. The below graphic models a compaction hint 2681 // covering the keyspace [b, r). The hint completely contains two 2682 // files, 000002 and 000003. The file 000003 contains the lowest 2683 // covered sequence number at #90. The tombstone b.RANGEDEL.230:h has 2684 // the highest tombstone sequence number incorporated into the hint. 2685 // The hint may be resolved only once the snapshots at #100, #180 and 2686 // #210 are all closed. File 000001 is not included within the hint 2687 // because it extends beyond the range tombstones in user key space. 2688 // 2689 // 250 2690 // 2691 // |-b...230:h-| 2692 // _____________________________________________________ snapshot #210 2693 // 200 |--h.RANGEDEL.200:r--| 2694 // 2695 // _____________________________________________________ snapshot #180 2696 // 2697 // 150 +--------+ 2698 // +---------+ | 000003 | 2699 // | 000002 | | | 2700 // +_________+ | | 2701 // 100_____________________|________|___________________ snapshot #100 2702 // +--------+ 2703 // _____________________________________________________ snapshot #70 2704 // +---------------+ 2705 // 50 | 000001 | 2706 // | | 2707 // +---------------+ 2708 // ______________________________________________________________ 2709 // a b c d e f g h i j k l m n o p q r s t u v w x y z 2710 2711 ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots) 2712 fi, _ := snapshotIndex(h.fileSmallestSeqNum, snapshots) 2713 if ti != fi { 2714 // Cannot resolve yet. 2715 unresolvedHints = append(unresolvedHints, h) 2716 continue 2717 } 2718 2719 // The hint h will be resolved and dropped, regardless of whether 2720 // there are any tables that can be deleted. 2721 for l := h.tombstoneLevel + 1; l < numLevels; l++ { 2722 overlaps := v.Overlaps(l, cmp, h.start, h.end, true /* exclusiveEnd */) 2723 iter := overlaps.Iter() 2724 for m := iter.First(); m != nil; m = iter.Next() { 2725 if m.IsCompacting() || !h.canDelete(cmp, m, snapshots) || files[m] { 2726 continue 2727 } 2728 if files == nil { 2729 // Construct files lazily, assuming most calls will not 2730 // produce delete-only compactions. 2731 files = make(map[*fileMetadata]bool) 2732 } 2733 files[m] = true 2734 byLevel[l] = append(byLevel[l], m) 2735 } 2736 } 2737 } 2738 2739 var compactLevels []compactionLevel 2740 for l, files := range byLevel { 2741 if len(files) == 0 { 2742 continue 2743 } 2744 compactLevels = append(compactLevels, compactionLevel{ 2745 level: l, 2746 files: manifest.NewLevelSliceKeySorted(cmp, files), 2747 }) 2748 } 2749 return compactLevels, unresolvedHints 2750 } 2751 2752 // compact runs one compaction and maybe schedules another call to compact. 2753 func (d *DB) compact(c *compaction, errChannel chan error) { 2754 pprof.Do(context.Background(), compactLabels, func(context.Context) { 2755 d.mu.Lock() 2756 defer d.mu.Unlock() 2757 if err := d.compact1(c, errChannel); err != nil { 2758 // TODO(peter): count consecutive compaction errors and backoff. 2759 d.opts.EventListener.BackgroundError(err) 2760 } 2761 d.mu.compact.compactingCount-- 2762 delete(d.mu.compact.inProgress, c) 2763 // Add this compaction's duration to the cumulative duration. NB: This 2764 // must be atomic with the above removal of c from 2765 // d.mu.compact.InProgress to ensure Metrics.Compact.Duration does not 2766 // miss or double count a completing compaction's duration. 2767 d.mu.compact.duration += d.timeNow().Sub(c.beganAt) 2768 2769 // The previous compaction may have produced too many files in a 2770 // level, so reschedule another compaction if needed. 2771 d.maybeScheduleCompaction() 2772 d.mu.compact.cond.Broadcast() 2773 }) 2774 } 2775 2776 // compact1 runs one compaction. 2777 // 2778 // d.mu must be held when calling this, but the mutex may be dropped and 2779 // re-acquired during the course of this method. 2780 func (d *DB) compact1(c *compaction, errChannel chan error) (err error) { 2781 if errChannel != nil { 2782 defer func() { 2783 errChannel <- err 2784 }() 2785 } 2786 2787 jobID := d.mu.nextJobID 2788 d.mu.nextJobID++ 2789 info := c.makeInfo(jobID) 2790 d.opts.EventListener.CompactionBegin(info) 2791 startTime := d.timeNow() 2792 2793 ve, pendingOutputs, stats, err := d.runCompaction(jobID, c) 2794 2795 info.Duration = d.timeNow().Sub(startTime) 2796 if err == nil { 2797 err = func() error { 2798 var err error 2799 d.mu.versions.logLock() 2800 // Check if this compaction had a conflicting operation (eg. a d.excise()) 2801 // that necessitates it restarting from scratch. Note that since we hold 2802 // the manifest lock, we don't expect this bool to change its value 2803 // as only the holder of the manifest lock will ever write to it. 2804 if c.cancel.Load() { 2805 err = firstError(err, ErrCancelledCompaction) 2806 } 2807 if err != nil { 2808 // logAndApply calls logUnlock. If we didn't call it, we need to call 2809 // logUnlock ourselves. 2810 d.mu.versions.logUnlock() 2811 return err 2812 } 2813 return d.mu.versions.logAndApply(jobID, ve, c.metrics, false /* forceRotation */, func() []compactionInfo { 2814 return d.getInProgressCompactionInfoLocked(c) 2815 }) 2816 }() 2817 if err != nil { 2818 // TODO(peter): untested. 2819 for _, f := range pendingOutputs { 2820 // Note that the FileBacking for the file metadata might not have 2821 // been set yet. So, we directly use the FileNum. Since these 2822 // files were generated as compaction outputs, these must be 2823 // physical files on disk. This property might not hold once 2824 // https://github.com/cockroachdb/pebble/issues/389 is 2825 // implemented if #389 creates virtual sstables as output files. 2826 d.mu.versions.obsoleteTables = append( 2827 d.mu.versions.obsoleteTables, 2828 fileInfo{f.FileNum.DiskFileNum(), f.Size}, 2829 ) 2830 } 2831 d.mu.versions.updateObsoleteTableMetricsLocked() 2832 } 2833 } 2834 2835 info.Done = true 2836 info.Err = err 2837 if err == nil { 2838 for i := range ve.NewFiles { 2839 e := &ve.NewFiles[i] 2840 info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo()) 2841 } 2842 d.mu.snapshots.cumulativePinnedCount += stats.cumulativePinnedKeys 2843 d.mu.snapshots.cumulativePinnedSize += stats.cumulativePinnedSize 2844 d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.countMissizedDels 2845 d.maybeUpdateDeleteCompactionHints(c) 2846 } 2847 2848 // NB: clearing compacting state must occur before updating the read state; 2849 // L0Sublevels initialization depends on it. 2850 d.clearCompactingState(c, err != nil) 2851 d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics) 2852 d.mu.versions.incrementCompactionBytes(-c.bytesWritten) 2853 2854 info.TotalDuration = d.timeNow().Sub(c.beganAt) 2855 d.opts.EventListener.CompactionEnd(info) 2856 2857 // Update the read state before deleting obsolete files because the 2858 // read-state update will cause the previous version to be unref'd and if 2859 // there are no references obsolete tables will be added to the obsolete 2860 // table list. 2861 if err == nil { 2862 d.updateReadStateLocked(d.opts.DebugCheck) 2863 d.updateTableStatsLocked(ve.NewFiles) 2864 } 2865 d.deleteObsoleteFiles(jobID) 2866 2867 return err 2868 } 2869 2870 type compactStats struct { 2871 cumulativePinnedKeys uint64 2872 cumulativePinnedSize uint64 2873 countMissizedDels uint64 2874 } 2875 2876 // runCopyCompaction runs a copy compaction where a new FileNum is created that 2877 // is a byte-for-byte copy of the input file. This is used in lieu of a move 2878 // compaction when a file is being moved across the local/remote storage 2879 // boundary. 2880 // 2881 // d.mu must be held when calling this method. 2882 func (d *DB) runCopyCompaction( 2883 jobID int, 2884 c *compaction, 2885 meta *fileMetadata, 2886 objMeta objstorage.ObjectMetadata, 2887 versionEdit *versionEdit, 2888 ) (ve *versionEdit, pendingOutputs []physicalMeta, retErr error) { 2889 ve = versionEdit 2890 if objMeta.IsRemote() || !remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level) { 2891 panic("pebble: scheduled a copy compaction that is not actually moving files to shared storage") 2892 } 2893 // Note that based on logic in the compaction picker, we're guaranteed 2894 // meta.Virtual is false. 2895 if meta.Virtual { 2896 panic(errors.AssertionFailedf("cannot do a copy compaction of a virtual sstable across local/remote storage")) 2897 } 2898 // We are in the relatively more complex case where we need to copy this 2899 // file to remote/shared storage. Drop the db mutex while we do the 2900 // copy. 2901 // 2902 // To ease up cleanup of the local file and tracking of refs, we create 2903 // a new FileNum. This has the potential of making the block cache less 2904 // effective, however. 2905 metaCopy := new(fileMetadata) 2906 *metaCopy = fileMetadata{ 2907 Size: meta.Size, 2908 CreationTime: meta.CreationTime, 2909 SmallestSeqNum: meta.SmallestSeqNum, 2910 LargestSeqNum: meta.LargestSeqNum, 2911 Stats: meta.Stats, 2912 Virtual: meta.Virtual, 2913 } 2914 if meta.HasPointKeys { 2915 metaCopy.ExtendPointKeyBounds(c.cmp, meta.SmallestPointKey, meta.LargestPointKey) 2916 } 2917 if meta.HasRangeKeys { 2918 metaCopy.ExtendRangeKeyBounds(c.cmp, meta.SmallestRangeKey, meta.LargestRangeKey) 2919 } 2920 metaCopy.FileNum = d.mu.versions.getNextFileNum() 2921 metaCopy.InitPhysicalBacking() 2922 c.metrics = map[int]*LevelMetrics{ 2923 c.outputLevel.level: { 2924 BytesIn: meta.Size, 2925 BytesCompacted: meta.Size, 2926 TablesCompacted: 1, 2927 }, 2928 } 2929 pendingOutputs = append(pendingOutputs, metaCopy.PhysicalMeta()) 2930 // Before dropping the db mutex, grab a ref to the current version. This 2931 // prevents any concurrent excises from deleting files that this compaction 2932 // needs to read/maintain a reference to. 2933 vers := d.mu.versions.currentVersion() 2934 vers.Ref() 2935 defer vers.UnrefLocked() 2936 2937 d.mu.Unlock() 2938 defer d.mu.Lock() 2939 _, err := d.objProvider.LinkOrCopyFromLocal(context.TODO(), d.opts.FS, 2940 d.objProvider.Path(objMeta), fileTypeTable, metaCopy.FileBacking.DiskFileNum, 2941 objstorage.CreateOptions{PreferSharedStorage: true}) 2942 if err != nil { 2943 return ve, pendingOutputs, err 2944 } 2945 ve.NewFiles[0].Meta = metaCopy 2946 2947 if err := d.objProvider.Sync(); err != nil { 2948 return nil, pendingOutputs, err 2949 } 2950 return ve, pendingOutputs, nil 2951 } 2952 2953 // runCompactions runs a compaction that produces new on-disk tables from 2954 // memtables or old on-disk tables. 2955 // 2956 // d.mu must be held when calling this, but the mutex may be dropped and 2957 // re-acquired during the course of this method. 2958 func (d *DB) runCompaction( 2959 jobID int, c *compaction, 2960 ) (ve *versionEdit, pendingOutputs []physicalMeta, stats compactStats, retErr error) { 2961 // As a sanity check, confirm that the smallest / largest keys for new and 2962 // deleted files in the new versionEdit pass a validation function before 2963 // returning the edit. 2964 defer func() { 2965 // If we're handling a panic, don't expect the version edit to validate. 2966 if r := recover(); r != nil { 2967 panic(r) 2968 } else if ve != nil { 2969 err := validateVersionEdit(ve, d.opts.Experimental.KeyValidationFunc, d.opts.Comparer.FormatKey) 2970 if err != nil { 2971 d.opts.Logger.Fatalf("pebble: version edit validation failed: %s", err) 2972 } 2973 } 2974 }() 2975 2976 // Check for a delete-only compaction. This can occur when wide range 2977 // tombstones completely contain sstables. 2978 if c.kind == compactionKindDeleteOnly { 2979 c.metrics = make(map[int]*LevelMetrics, len(c.inputs)) 2980 ve := &versionEdit{ 2981 DeletedFiles: map[deletedFileEntry]*fileMetadata{}, 2982 } 2983 for _, cl := range c.inputs { 2984 levelMetrics := &LevelMetrics{} 2985 iter := cl.files.Iter() 2986 for f := iter.First(); f != nil; f = iter.Next() { 2987 ve.DeletedFiles[deletedFileEntry{ 2988 Level: cl.level, 2989 FileNum: f.FileNum, 2990 }] = f 2991 } 2992 c.metrics[cl.level] = levelMetrics 2993 } 2994 return ve, nil, stats, nil 2995 } 2996 2997 if c.kind == compactionKindIngestedFlushable { 2998 panic("pebble: runCompaction cannot handle compactionKindIngestedFlushable.") 2999 } 3000 3001 // Check for a move or copy of one table from one level to the next. We avoid 3002 // such a move if there is lots of overlapping grandparent data. Otherwise, 3003 // the move could create a parent file that will require a very expensive 3004 // merge later on. 3005 if c.kind == compactionKindMove || c.kind == compactionKindCopy { 3006 iter := c.startLevel.files.Iter() 3007 meta := iter.First() 3008 if invariants.Enabled { 3009 if iter.Next() != nil { 3010 panic("got more than one file for a move or copy compaction") 3011 } 3012 } 3013 if c.cancel.Load() { 3014 return ve, nil, stats, ErrCancelledCompaction 3015 } 3016 objMeta, err := d.objProvider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) 3017 if err != nil { 3018 return ve, pendingOutputs, stats, err 3019 } 3020 c.metrics = map[int]*LevelMetrics{ 3021 c.outputLevel.level: { 3022 BytesMoved: meta.Size, 3023 TablesMoved: 1, 3024 }, 3025 } 3026 ve := &versionEdit{ 3027 DeletedFiles: map[deletedFileEntry]*fileMetadata{ 3028 {Level: c.startLevel.level, FileNum: meta.FileNum}: meta, 3029 }, 3030 NewFiles: []newFileEntry{ 3031 {Level: c.outputLevel.level, Meta: meta}, 3032 }, 3033 } 3034 if c.kind == compactionKindCopy { 3035 ve, pendingOutputs, retErr = d.runCopyCompaction(jobID, c, meta, objMeta, ve) 3036 if retErr != nil { 3037 return ve, pendingOutputs, stats, retErr 3038 } 3039 } 3040 return ve, nil, stats, nil 3041 } 3042 3043 defer func() { 3044 if retErr != nil { 3045 pendingOutputs = nil 3046 } 3047 }() 3048 3049 snapshots := d.mu.snapshots.toSlice() 3050 formatVers := d.FormatMajorVersion() 3051 3052 if c.flushing == nil { 3053 // Before dropping the db mutex, grab a ref to the current version. This 3054 // prevents any concurrent excises from deleting files that this compaction 3055 // needs to read/maintain a reference to. 3056 // 3057 // Note that unlike user iterators, compactionIter does not maintain a ref 3058 // of the version or read state. 3059 vers := d.mu.versions.currentVersion() 3060 vers.Ref() 3061 defer vers.UnrefLocked() 3062 } 3063 3064 if c.cancel.Load() { 3065 return ve, nil, stats, ErrCancelledCompaction 3066 } 3067 3068 // Release the d.mu lock while doing I/O. 3069 // Note the unusual order: Unlock and then Lock. 3070 d.mu.Unlock() 3071 defer d.mu.Lock() 3072 3073 // Compactions use a pool of buffers to read blocks, avoiding polluting the 3074 // block cache with blocks that will not be read again. We initialize the 3075 // buffer pool with a size 12. This initial size does not need to be 3076 // accurate, because the pool will grow to accommodate the maximum number of 3077 // blocks allocated at a given time over the course of the compaction. But 3078 // choosing a size larger than that working set avoids any additional 3079 // allocations to grow the size of the pool over the course of iteration. 3080 // 3081 // Justification for initial size 12: In a two-level compaction, at any 3082 // given moment we'll have 2 index blocks in-use and 2 data blocks in-use. 3083 // Additionally, when decoding a compressed block, we'll temporarily 3084 // allocate 1 additional block to hold the compressed buffer. In the worst 3085 // case that all input sstables have two-level index blocks (+2), value 3086 // blocks (+2), range deletion blocks (+n) and range key blocks (+n), we'll 3087 // additionally require 2n+4 blocks where n is the number of input sstables. 3088 // Range deletion and range key blocks are relatively rare, and the cost of 3089 // an additional allocation or two over the course of the compaction is 3090 // considered to be okay. A larger initial size would cause the pool to hold 3091 // on to more memory, even when it's not in-use because the pool will 3092 // recycle buffers up to the current capacity of the pool. The memory use of 3093 // a 12-buffer pool is expected to be within reason, even if all the buffers 3094 // grow to the typical size of an index block (256 KiB) which would 3095 // translate to 3 MiB per compaction. 3096 c.bufferPool.Init(12) 3097 defer c.bufferPool.Release() 3098 3099 iiter, err := c.newInputIter(d.newIters, d.tableNewRangeKeyIter, snapshots) 3100 if err != nil { 3101 return nil, pendingOutputs, stats, err 3102 } 3103 c.allowedZeroSeqNum = c.allowZeroSeqNum() 3104 iiter = invalidating.MaybeWrapIfInvariants(iiter) 3105 iter := newCompactionIter(c.cmp, c.equal, c.formatKey, d.merge, iiter, snapshots, 3106 &c.rangeDelFrag, &c.rangeKeyFrag, c.allowedZeroSeqNum, c.elideTombstone, 3107 c.elideRangeTombstone, d.opts.Experimental.IneffectualSingleDeleteCallback, 3108 d.opts.Experimental.SingleDeleteInvariantViolationCallback, 3109 d.FormatMajorVersion()) 3110 3111 var ( 3112 createdFiles []base.DiskFileNum 3113 tw *sstable.Writer 3114 pinnedKeySize uint64 3115 pinnedValueSize uint64 3116 pinnedCount uint64 3117 ) 3118 defer func() { 3119 if iter != nil { 3120 retErr = firstError(retErr, iter.Close()) 3121 } 3122 if tw != nil { 3123 retErr = firstError(retErr, tw.Close()) 3124 } 3125 if retErr != nil { 3126 for _, fileNum := range createdFiles { 3127 _ = d.objProvider.Remove(fileTypeTable, fileNum) 3128 } 3129 } 3130 for _, closer := range c.closers { 3131 retErr = firstError(retErr, closer.Close()) 3132 } 3133 }() 3134 3135 ve = &versionEdit{ 3136 DeletedFiles: map[deletedFileEntry]*fileMetadata{}, 3137 } 3138 3139 startLevelBytes := c.startLevel.files.SizeSum() 3140 outputMetrics := &LevelMetrics{ 3141 BytesIn: startLevelBytes, 3142 BytesRead: c.outputLevel.files.SizeSum(), 3143 } 3144 if len(c.extraLevels) > 0 { 3145 outputMetrics.BytesIn += c.extraLevels[0].files.SizeSum() 3146 } 3147 outputMetrics.BytesRead += outputMetrics.BytesIn 3148 3149 c.metrics = map[int]*LevelMetrics{ 3150 c.outputLevel.level: outputMetrics, 3151 } 3152 if len(c.flushing) == 0 && c.metrics[c.startLevel.level] == nil { 3153 c.metrics[c.startLevel.level] = &LevelMetrics{} 3154 } 3155 if len(c.extraLevels) > 0 { 3156 c.metrics[c.extraLevels[0].level] = &LevelMetrics{} 3157 outputMetrics.MultiLevel.BytesInTop = startLevelBytes 3158 outputMetrics.MultiLevel.BytesIn = outputMetrics.BytesIn 3159 outputMetrics.MultiLevel.BytesRead = outputMetrics.BytesRead 3160 } 3161 3162 // The table is typically written at the maximum allowable format implied by 3163 // the current format major version of the DB. 3164 tableFormat := formatVers.MaxTableFormat() 3165 3166 // In format major versions with maximum table formats of Pebblev3, value 3167 // blocks were conditional on an experimental setting. In format major 3168 // versions with maximum table formats of Pebblev4 and higher, value blocks 3169 // are always enabled. 3170 if tableFormat == sstable.TableFormatPebblev3 && 3171 (d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks()) { 3172 tableFormat = sstable.TableFormatPebblev2 3173 } 3174 3175 writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat) 3176 if formatVers < FormatBlockPropertyCollector { 3177 // Cannot yet write block properties. 3178 writerOpts.BlockPropertyCollectors = nil 3179 } 3180 3181 // prevPointKey is a sstable.WriterOption that provides access to 3182 // the last point key written to a writer's sstable. When a new 3183 // output begins in newOutput, prevPointKey is updated to point to 3184 // the new output's sstable.Writer. This allows the compaction loop 3185 // to access the last written point key without requiring the 3186 // compaction loop to make a copy of each key ahead of time. Users 3187 // must be careful, because the byte slice returned by UnsafeKey 3188 // points directly into the Writer's block buffer. 3189 var prevPointKey sstable.PreviousPointKeyOpt 3190 var cpuWorkHandle CPUWorkHandle 3191 defer func() { 3192 if cpuWorkHandle != nil { 3193 d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle) 3194 } 3195 }() 3196 3197 newOutput := func() error { 3198 // Check if we've been cancelled by a concurrent operation. 3199 if c.cancel.Load() { 3200 return ErrCancelledCompaction 3201 } 3202 fileMeta := &fileMetadata{} 3203 d.mu.Lock() 3204 fileNum := d.mu.versions.getNextFileNum() 3205 fileMeta.FileNum = fileNum 3206 pendingOutputs = append(pendingOutputs, fileMeta.PhysicalMeta()) 3207 d.mu.Unlock() 3208 3209 ctx := context.TODO() 3210 if objiotracing.Enabled { 3211 ctx = objiotracing.WithLevel(ctx, c.outputLevel.level) 3212 switch c.kind { 3213 case compactionKindFlush: 3214 ctx = objiotracing.WithReason(ctx, objiotracing.ForFlush) 3215 case compactionKindIngestedFlushable: 3216 ctx = objiotracing.WithReason(ctx, objiotracing.ForIngestion) 3217 default: 3218 ctx = objiotracing.WithReason(ctx, objiotracing.ForCompaction) 3219 } 3220 } 3221 // Prefer shared storage if present. 3222 createOpts := objstorage.CreateOptions{ 3223 PreferSharedStorage: remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level), 3224 } 3225 writable, objMeta, err := d.objProvider.Create(ctx, fileTypeTable, fileNum.DiskFileNum(), createOpts) 3226 if err != nil { 3227 return err 3228 } 3229 3230 reason := "flushing" 3231 if c.flushing == nil { 3232 reason = "compacting" 3233 } 3234 d.opts.EventListener.TableCreated(TableCreateInfo{ 3235 JobID: jobID, 3236 Reason: reason, 3237 Path: d.objProvider.Path(objMeta), 3238 FileNum: fileNum, 3239 }) 3240 if c.kind != compactionKindFlush { 3241 writable = &compactionWritable{ 3242 Writable: writable, 3243 versions: d.mu.versions, 3244 written: &c.bytesWritten, 3245 } 3246 } 3247 createdFiles = append(createdFiles, fileNum.DiskFileNum()) 3248 cacheOpts := private.SSTableCacheOpts(d.cacheID, fileNum.DiskFileNum()).(sstable.WriterOption) 3249 3250 const MaxFileWriteAdditionalCPUTime = time.Millisecond * 100 3251 cpuWorkHandle = d.opts.Experimental.CPUWorkPermissionGranter.GetPermission( 3252 MaxFileWriteAdditionalCPUTime, 3253 ) 3254 writerOpts.Parallelism = 3255 d.opts.Experimental.MaxWriterConcurrency > 0 && 3256 (cpuWorkHandle.Permitted() || d.opts.Experimental.ForceWriterParallelism) 3257 3258 tw = sstable.NewWriter(writable, writerOpts, cacheOpts, &prevPointKey) 3259 3260 fileMeta.CreationTime = time.Now().Unix() 3261 ve.NewFiles = append(ve.NewFiles, newFileEntry{ 3262 Level: c.outputLevel.level, 3263 Meta: fileMeta, 3264 }) 3265 return nil 3266 } 3267 3268 // splitL0Outputs is true during flushes and intra-L0 compactions with flush 3269 // splits enabled. 3270 splitL0Outputs := c.outputLevel.level == 0 && d.opts.FlushSplitBytes > 0 3271 3272 // finishOutput is called with the a user key up to which all tombstones 3273 // should be flushed. Typically, this is the first key of the next 3274 // sstable or an empty key if this output is the final sstable. 3275 finishOutput := func(splitKey []byte) error { 3276 // If we haven't output any point records to the sstable (tw == nil) then the 3277 // sstable will only contain range tombstones and/or range keys. The smallest 3278 // key in the sstable will be the start key of the first range tombstone or 3279 // range key added. We need to ensure that this start key is distinct from 3280 // the splitKey passed to finishOutput (if set), otherwise we would generate 3281 // an sstable where the largest key is smaller than the smallest key due to 3282 // how the largest key boundary is set below. NB: It is permissible for the 3283 // range tombstone / range key start key to be the empty string. 3284 // 3285 // TODO: It is unfortunate that we have to do this check here rather than 3286 // when we decide to finish the sstable in the runCompaction loop. A better 3287 // structure currently eludes us. 3288 if tw == nil { 3289 startKey := c.rangeDelFrag.Start() 3290 if len(iter.tombstones) > 0 { 3291 startKey = iter.tombstones[0].Start 3292 } 3293 if startKey == nil { 3294 startKey = c.rangeKeyFrag.Start() 3295 if len(iter.rangeKeys) > 0 { 3296 startKey = iter.rangeKeys[0].Start 3297 } 3298 } 3299 if splitKey != nil && d.cmp(startKey, splitKey) == 0 { 3300 return nil 3301 } 3302 } 3303 3304 // NB: clone the key because the data can be held on to by the call to 3305 // compactionIter.Tombstones via keyspan.Fragmenter.FlushTo, and by the 3306 // WriterMetadata.LargestRangeDel.UserKey. 3307 splitKey = append([]byte(nil), splitKey...) 3308 for _, v := range iter.Tombstones(splitKey) { 3309 if tw == nil { 3310 if err := newOutput(); err != nil { 3311 return err 3312 } 3313 } 3314 // The tombstone being added could be completely outside the 3315 // eventual bounds of the sstable. Consider this example (bounds 3316 // in square brackets next to table filename): 3317 // 3318 // ./000240.sst [tmgc#391,MERGE-tmgc#391,MERGE] 3319 // tmgc#391,MERGE [786e627a] 3320 // tmgc-udkatvs#331,RANGEDEL 3321 // 3322 // ./000241.sst [tmgc#384,MERGE-tmgc#384,MERGE] 3323 // tmgc#384,MERGE [666c7070] 3324 // tmgc-tvsalezade#383,RANGEDEL 3325 // tmgc-tvsalezade#331,RANGEDEL 3326 // 3327 // ./000242.sst [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL] 3328 // tmgc-tvsalezade#383,RANGEDEL 3329 // tmgc#375,SET [72646c78766965616c72776865676e79] 3330 // tmgc-tvsalezade#356,RANGEDEL 3331 // 3332 // Note that both of the top two SSTables have range tombstones 3333 // that start after the file's end keys. Since the file bound 3334 // computation happens well after all range tombstones have been 3335 // added to the writer, eliding out-of-file range tombstones based 3336 // on sequence number at this stage is difficult, and necessitates 3337 // read-time logic to ignore range tombstones outside file bounds. 3338 if err := rangedel.Encode(&v, tw.Add); err != nil { 3339 return err 3340 } 3341 } 3342 for _, v := range iter.RangeKeys(splitKey) { 3343 // Same logic as for range tombstones, except added using tw.AddRangeKey. 3344 if tw == nil { 3345 if err := newOutput(); err != nil { 3346 return err 3347 } 3348 } 3349 if err := rangekey.Encode(&v, tw.AddRangeKey); err != nil { 3350 return err 3351 } 3352 } 3353 3354 if tw == nil { 3355 return nil 3356 } 3357 { 3358 // Set internal sstable properties. 3359 p := getInternalWriterProperties(tw) 3360 // Set the external sst version to 0. This is what RocksDB expects for 3361 // db-internal sstables; otherwise, it could apply a global sequence number. 3362 p.ExternalFormatVersion = 0 3363 // Set the snapshot pinned totals. 3364 p.SnapshotPinnedKeys = pinnedCount 3365 p.SnapshotPinnedKeySize = pinnedKeySize 3366 p.SnapshotPinnedValueSize = pinnedValueSize 3367 stats.cumulativePinnedKeys += pinnedCount 3368 stats.cumulativePinnedSize += pinnedKeySize + pinnedValueSize 3369 pinnedCount = 0 3370 pinnedKeySize = 0 3371 pinnedValueSize = 0 3372 } 3373 if err := tw.Close(); err != nil { 3374 tw = nil 3375 return err 3376 } 3377 d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle) 3378 cpuWorkHandle = nil 3379 writerMeta, err := tw.Metadata() 3380 if err != nil { 3381 tw = nil 3382 return err 3383 } 3384 tw = nil 3385 meta := ve.NewFiles[len(ve.NewFiles)-1].Meta 3386 meta.Size = writerMeta.Size 3387 meta.SmallestSeqNum = writerMeta.SmallestSeqNum 3388 meta.LargestSeqNum = writerMeta.LargestSeqNum 3389 meta.InitPhysicalBacking() 3390 3391 // If the file didn't contain any range deletions, we can fill its 3392 // table stats now, avoiding unnecessarily loading the table later. 3393 maybeSetStatsFromProperties( 3394 meta.PhysicalMeta(), &writerMeta.Properties, 3395 ) 3396 3397 if c.flushing == nil { 3398 outputMetrics.TablesCompacted++ 3399 outputMetrics.BytesCompacted += meta.Size 3400 } else { 3401 outputMetrics.TablesFlushed++ 3402 outputMetrics.BytesFlushed += meta.Size 3403 } 3404 outputMetrics.Size += int64(meta.Size) 3405 outputMetrics.NumFiles++ 3406 outputMetrics.Additional.BytesWrittenDataBlocks += writerMeta.Properties.DataSize 3407 outputMetrics.Additional.BytesWrittenValueBlocks += writerMeta.Properties.ValueBlocksSize 3408 3409 if n := len(ve.NewFiles); n > 1 { 3410 // This is not the first output file. Ensure the sstable boundaries 3411 // are nonoverlapping. 3412 prevMeta := ve.NewFiles[n-2].Meta 3413 if writerMeta.SmallestRangeDel.UserKey != nil { 3414 c := d.cmp(writerMeta.SmallestRangeDel.UserKey, prevMeta.Largest.UserKey) 3415 if c < 0 { 3416 return errors.Errorf( 3417 "pebble: smallest range tombstone start key is less than previous sstable largest key: %s < %s", 3418 writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), 3419 prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey)) 3420 } else if c == 0 && !prevMeta.Largest.IsExclusiveSentinel() { 3421 // The user key portion of the range boundary start key is 3422 // equal to the previous table's largest key user key, and 3423 // the previous table's largest key is not exclusive. This 3424 // violates the invariant that tables are key-space 3425 // partitioned. 3426 return errors.Errorf( 3427 "pebble: invariant violation: previous sstable largest key %s, current sstable smallest rangedel: %s", 3428 prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey), 3429 writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), 3430 ) 3431 } 3432 } 3433 } 3434 3435 // Verify that all range deletions outputted to the sstable are 3436 // truncated to split key. 3437 if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil && 3438 d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) > 0 { 3439 return errors.Errorf( 3440 "pebble: invariant violation: rangedel largest key %q extends beyond split key %q", 3441 writerMeta.LargestRangeDel.Pretty(d.opts.Comparer.FormatKey), 3442 d.opts.Comparer.FormatKey(splitKey), 3443 ) 3444 } 3445 3446 if writerMeta.HasPointKeys { 3447 meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestPoint, writerMeta.LargestPoint) 3448 } 3449 if writerMeta.HasRangeDelKeys { 3450 meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestRangeDel, writerMeta.LargestRangeDel) 3451 } 3452 if writerMeta.HasRangeKeys { 3453 meta.ExtendRangeKeyBounds(d.cmp, writerMeta.SmallestRangeKey, writerMeta.LargestRangeKey) 3454 } 3455 3456 // Verify that the sstable bounds fall within the compaction input 3457 // bounds. This is a sanity check that we don't have a logic error 3458 // elsewhere that causes the sstable bounds to accidentally expand past the 3459 // compaction input bounds as doing so could lead to various badness such 3460 // as keys being deleted by a range tombstone incorrectly. 3461 if c.smallest.UserKey != nil { 3462 switch v := d.cmp(meta.Smallest.UserKey, c.smallest.UserKey); { 3463 case v >= 0: 3464 // Nothing to do. 3465 case v < 0: 3466 return errors.Errorf("pebble: compaction output grew beyond bounds of input: %s < %s", 3467 meta.Smallest.Pretty(d.opts.Comparer.FormatKey), 3468 c.smallest.Pretty(d.opts.Comparer.FormatKey)) 3469 } 3470 } 3471 if c.largest.UserKey != nil { 3472 switch v := d.cmp(meta.Largest.UserKey, c.largest.UserKey); { 3473 case v <= 0: 3474 // Nothing to do. 3475 case v > 0: 3476 return errors.Errorf("pebble: compaction output grew beyond bounds of input: %s > %s", 3477 meta.Largest.Pretty(d.opts.Comparer.FormatKey), 3478 c.largest.Pretty(d.opts.Comparer.FormatKey)) 3479 } 3480 } 3481 // Verify that we never split different revisions of the same user key 3482 // across two different sstables. 3483 if err := c.errorOnUserKeyOverlap(ve); err != nil { 3484 return err 3485 } 3486 if err := meta.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { 3487 return err 3488 } 3489 return nil 3490 } 3491 3492 // Build a compactionOutputSplitter that contains all logic to determine 3493 // whether the compaction loop should stop writing to one output sstable and 3494 // switch to a new one. Some splitters can wrap other splitters, and the 3495 // splitterGroup can be composed of multiple splitters. In this case, we 3496 // start off with splitters for file sizes, grandparent limits, and (for L0 3497 // splits) L0 limits, before wrapping them in an splitterGroup. 3498 sizeSplitter := newFileSizeSplitter(&iter.frontiers, c.maxOutputFileSize, c.grandparents.Iter()) 3499 unsafePrevUserKey := func() []byte { 3500 // Return the largest point key written to tw or the start of 3501 // the current range deletion in the fragmenter, whichever is 3502 // greater. 3503 prevPoint := prevPointKey.UnsafeKey() 3504 if c.cmp(prevPoint.UserKey, c.rangeDelFrag.Start()) > 0 { 3505 return prevPoint.UserKey 3506 } 3507 return c.rangeDelFrag.Start() 3508 } 3509 outputSplitters := []compactionOutputSplitter{ 3510 // We do not split the same user key across different sstables within 3511 // one flush or compaction. The fileSizeSplitter may request a split in 3512 // the middle of a user key, so the userKeyChangeSplitter ensures we are 3513 // at a user key change boundary when doing a split. 3514 &userKeyChangeSplitter{ 3515 cmp: c.cmp, 3516 splitter: sizeSplitter, 3517 unsafePrevUserKey: unsafePrevUserKey, 3518 }, 3519 newLimitFuncSplitter(&iter.frontiers, c.findGrandparentLimit), 3520 } 3521 if splitL0Outputs { 3522 outputSplitters = append(outputSplitters, newLimitFuncSplitter(&iter.frontiers, c.findL0Limit)) 3523 } 3524 splitter := &splitterGroup{cmp: c.cmp, splitters: outputSplitters} 3525 3526 // Each outer loop iteration produces one output file. An iteration that 3527 // produces a file containing point keys (and optionally range tombstones) 3528 // guarantees that the input iterator advanced. An iteration that produces 3529 // a file containing only range tombstones guarantees the limit passed to 3530 // `finishOutput()` advanced to a strictly greater user key corresponding 3531 // to a grandparent file largest key, or nil. Taken together, these 3532 // progress guarantees ensure that eventually the input iterator will be 3533 // exhausted and the range tombstone fragments will all be flushed. 3534 for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty() || !c.rangeKeyFrag.Empty(); { 3535 var firstKey []byte 3536 if key != nil { 3537 firstKey = key.UserKey 3538 } else if startKey := c.rangeDelFrag.Start(); startKey != nil { 3539 // Pass the start key of the first pending tombstone to find the 3540 // next limit. All pending tombstones have the same start key. We 3541 // use this as opposed to the end key of the last written sstable to 3542 // effectively handle cases like these: 3543 // 3544 // a.SET.3 3545 // (lf.limit at b) 3546 // d.RANGEDEL.4:f 3547 // 3548 // In this case, the partition after b has only range deletions, so 3549 // if we were to find the limit after the last written key at the 3550 // split point (key a), we'd get the limit b again, and 3551 // finishOutput() would not advance any further because the next 3552 // range tombstone to write does not start until after the L0 split 3553 // point. 3554 firstKey = startKey 3555 } 3556 splitterSuggestion := splitter.onNewOutput(firstKey) 3557 3558 // Each inner loop iteration processes one key from the input iterator. 3559 for ; key != nil; key, val = iter.Next() { 3560 if split := splitter.shouldSplitBefore(key, tw); split == splitNow { 3561 break 3562 } 3563 3564 switch key.Kind() { 3565 case InternalKeyKindRangeDelete: 3566 // Range tombstones are handled specially. They are fragmented, 3567 // and they're not written until later during `finishOutput()`. 3568 // We add them to the `Fragmenter` now to make them visible to 3569 // `compactionIter` so covered keys in the same snapshot stripe 3570 // can be elided. 3571 3572 // The interleaved range deletion might only be one of many with 3573 // these bounds. Some fragmenting is performed ahead of time by 3574 // keyspan.MergingIter. 3575 if s := c.rangeDelIter.Span(); !s.Empty() { 3576 // The memory management here is subtle. Range deletions 3577 // blocks do NOT use prefix compression, which ensures that 3578 // range deletion spans' memory is available as long we keep 3579 // the iterator open. However, the keyspan.MergingIter that 3580 // merges spans across levels only guarantees the lifetime 3581 // of the [start, end) bounds until the next positioning 3582 // method is called. 3583 // 3584 // Additionally, the Span.Keys slice is owned by the the 3585 // range deletion iterator stack, and it may be overwritten 3586 // when we advance. 3587 // 3588 // Clone the Keys slice and the start and end keys. 3589 // 3590 // TODO(jackson): Avoid the clone by removing c.rangeDelFrag 3591 // and performing explicit truncation of the pending 3592 // rangedel span as necessary. 3593 clone := keyspan.Span{ 3594 Start: iter.cloneKey(s.Start), 3595 End: iter.cloneKey(s.End), 3596 Keys: make([]keyspan.Key, len(s.Keys)), 3597 } 3598 copy(clone.Keys, s.Keys) 3599 c.rangeDelFrag.Add(clone) 3600 } 3601 continue 3602 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 3603 // Range keys are handled in the same way as range tombstones, except 3604 // with a dedicated fragmenter. 3605 if s := c.rangeKeyInterleaving.Span(); !s.Empty() { 3606 clone := keyspan.Span{ 3607 Start: iter.cloneKey(s.Start), 3608 End: iter.cloneKey(s.End), 3609 Keys: make([]keyspan.Key, len(s.Keys)), 3610 } 3611 // Since the keys' Suffix and Value fields are not deep cloned, the 3612 // underlying blockIter must be kept open for the lifetime of the 3613 // compaction. 3614 copy(clone.Keys, s.Keys) 3615 c.rangeKeyFrag.Add(clone) 3616 } 3617 continue 3618 } 3619 if tw == nil { 3620 if err := newOutput(); err != nil { 3621 return nil, pendingOutputs, stats, err 3622 } 3623 } 3624 if err := tw.AddWithForceObsolete(*key, val, iter.forceObsoleteDueToRangeDel); err != nil { 3625 return nil, pendingOutputs, stats, err 3626 } 3627 if iter.snapshotPinned { 3628 // The kv pair we just added to the sstable was only surfaced by 3629 // the compaction iterator because an open snapshot prevented 3630 // its elision. Increment the stats. 3631 pinnedCount++ 3632 pinnedKeySize += uint64(len(key.UserKey)) + base.InternalTrailerLen 3633 pinnedValueSize += uint64(len(val)) 3634 } 3635 } 3636 3637 // A splitter requested a split, and we're ready to finish the output. 3638 // We need to choose the key at which to split any pending range 3639 // tombstones. There are two options: 3640 // 1. splitterSuggestion — The key suggested by the splitter. This key 3641 // is guaranteed to be greater than the last key written to the 3642 // current output. 3643 // 2. key.UserKey — the first key of the next sstable output. This user 3644 // key is also guaranteed to be greater than the last user key 3645 // written to the current output (see userKeyChangeSplitter). 3646 // 3647 // Use whichever is smaller. Using the smaller of the two limits 3648 // overlap with grandparents. Consider the case where the 3649 // grandparent limit is calculated to be 'b', key is 'x', and 3650 // there exist many sstables between 'b' and 'x'. If the range 3651 // deletion fragmenter has a pending tombstone [a,x), splitting 3652 // at 'x' would cause the output table to overlap many 3653 // grandparents well beyond the calculated grandparent limit 3654 // 'b'. Splitting at the smaller `splitterSuggestion` avoids 3655 // this unbounded overlap with grandparent tables. 3656 splitKey := splitterSuggestion 3657 if key != nil && (splitKey == nil || c.cmp(splitKey, key.UserKey) > 0) { 3658 splitKey = key.UserKey 3659 } 3660 if err := finishOutput(splitKey); err != nil { 3661 return nil, pendingOutputs, stats, err 3662 } 3663 } 3664 3665 for _, cl := range c.inputs { 3666 iter := cl.files.Iter() 3667 for f := iter.First(); f != nil; f = iter.Next() { 3668 ve.DeletedFiles[deletedFileEntry{ 3669 Level: cl.level, 3670 FileNum: f.FileNum, 3671 }] = f 3672 } 3673 } 3674 3675 // The compaction iterator keeps track of a count of the number of DELSIZED 3676 // keys that encoded an incorrect size. Propagate it up as a part of 3677 // compactStats. 3678 stats.countMissizedDels = iter.stats.countMissizedDels 3679 3680 if err := d.objProvider.Sync(); err != nil { 3681 return nil, pendingOutputs, stats, err 3682 } 3683 3684 // Refresh the disk available statistic whenever a compaction/flush 3685 // completes, before re-acquiring the mutex. 3686 _ = d.calculateDiskAvailableBytes() 3687 3688 return ve, pendingOutputs, stats, nil 3689 } 3690 3691 // validateVersionEdit validates that start and end keys across new and deleted 3692 // files in a versionEdit pass the given validation function. 3693 func validateVersionEdit( 3694 ve *versionEdit, validateFn func([]byte) error, format base.FormatKey, 3695 ) error { 3696 validateMetaFn := func(f *manifest.FileMetadata) error { 3697 for _, key := range []InternalKey{f.Smallest, f.Largest} { 3698 if err := validateFn(key.UserKey); err != nil { 3699 return errors.Wrapf(err, "key=%q; file=%s", format(key.UserKey), f) 3700 } 3701 } 3702 return nil 3703 } 3704 3705 // Validate both new and deleted files. 3706 for _, f := range ve.NewFiles { 3707 if err := validateMetaFn(f.Meta); err != nil { 3708 return err 3709 } 3710 } 3711 for _, m := range ve.DeletedFiles { 3712 if err := validateMetaFn(m); err != nil { 3713 return err 3714 } 3715 } 3716 3717 return nil 3718 } 3719 3720 // scanObsoleteFiles scans the filesystem for files that are no longer needed 3721 // and adds those to the internal lists of obsolete files. Note that the files 3722 // are not actually deleted by this method. A subsequent call to 3723 // deleteObsoleteFiles must be performed. Must be not be called concurrently 3724 // with compactions and flushes. db.mu must be held when calling this function. 3725 func (d *DB) scanObsoleteFiles(list []string) { 3726 // Disable automatic compactions temporarily to avoid concurrent compactions / 3727 // flushes from interfering. The original value is restored on completion. 3728 disabledPrev := d.opts.DisableAutomaticCompactions 3729 defer func() { 3730 d.opts.DisableAutomaticCompactions = disabledPrev 3731 }() 3732 d.opts.DisableAutomaticCompactions = true 3733 3734 // Wait for any ongoing compaction to complete before continuing. 3735 for d.mu.compact.compactingCount > 0 || d.mu.compact.flushing { 3736 d.mu.compact.cond.Wait() 3737 } 3738 3739 liveFileNums := make(map[base.DiskFileNum]struct{}) 3740 d.mu.versions.addLiveFileNums(liveFileNums) 3741 // Protect against files which are only referred to by the ingestedFlushable 3742 // from being deleted. These are added to the flushable queue on WAL replay 3743 // during read only mode and aren't part of the Version. Note that if 3744 // !d.opts.ReadOnly, then all flushables of type ingestedFlushable have 3745 // already been flushed. 3746 for _, fEntry := range d.mu.mem.queue { 3747 if f, ok := fEntry.flushable.(*ingestedFlushable); ok { 3748 for _, file := range f.files { 3749 liveFileNums[file.FileBacking.DiskFileNum] = struct{}{} 3750 } 3751 } 3752 } 3753 3754 minUnflushedLogNum := d.mu.versions.minUnflushedLogNum 3755 manifestFileNum := d.mu.versions.manifestFileNum 3756 3757 var obsoleteLogs []fileInfo 3758 var obsoleteTables []fileInfo 3759 var obsoleteManifests []fileInfo 3760 var obsoleteOptions []fileInfo 3761 3762 for _, filename := range list { 3763 fileType, diskFileNum, ok := base.ParseFilename(d.opts.FS, filename) 3764 if !ok { 3765 continue 3766 } 3767 switch fileType { 3768 case fileTypeLog: 3769 if diskFileNum >= minUnflushedLogNum { 3770 continue 3771 } 3772 fi := fileInfo{fileNum: diskFileNum} 3773 if stat, err := d.opts.FS.Stat(filename); err == nil { 3774 fi.fileSize = uint64(stat.Size()) 3775 } 3776 obsoleteLogs = append(obsoleteLogs, fi) 3777 case fileTypeManifest: 3778 if diskFileNum >= manifestFileNum { 3779 continue 3780 } 3781 fi := fileInfo{fileNum: diskFileNum} 3782 if stat, err := d.opts.FS.Stat(filename); err == nil { 3783 fi.fileSize = uint64(stat.Size()) 3784 } 3785 obsoleteManifests = append(obsoleteManifests, fi) 3786 case fileTypeOptions: 3787 if diskFileNum.FileNum() >= d.optionsFileNum.FileNum() { 3788 continue 3789 } 3790 fi := fileInfo{fileNum: diskFileNum} 3791 if stat, err := d.opts.FS.Stat(filename); err == nil { 3792 fi.fileSize = uint64(stat.Size()) 3793 } 3794 obsoleteOptions = append(obsoleteOptions, fi) 3795 case fileTypeTable: 3796 // Objects are handled through the objstorage provider below. 3797 default: 3798 // Don't delete files we don't know about. 3799 } 3800 } 3801 3802 objects := d.objProvider.List() 3803 for _, obj := range objects { 3804 switch obj.FileType { 3805 case fileTypeTable: 3806 if _, ok := liveFileNums[obj.DiskFileNum]; ok { 3807 continue 3808 } 3809 fileInfo := fileInfo{ 3810 fileNum: obj.DiskFileNum, 3811 } 3812 if size, err := d.objProvider.Size(obj); err == nil { 3813 fileInfo.fileSize = uint64(size) 3814 } 3815 obsoleteTables = append(obsoleteTables, fileInfo) 3816 3817 default: 3818 // Ignore object types we don't know about. 3819 } 3820 } 3821 3822 d.mu.log.queue = merge(d.mu.log.queue, obsoleteLogs) 3823 d.mu.versions.metrics.WAL.Files = int64(len(d.mu.log.queue)) 3824 d.mu.versions.obsoleteTables = merge(d.mu.versions.obsoleteTables, obsoleteTables) 3825 d.mu.versions.updateObsoleteTableMetricsLocked() 3826 d.mu.versions.obsoleteManifests = merge(d.mu.versions.obsoleteManifests, obsoleteManifests) 3827 d.mu.versions.obsoleteOptions = merge(d.mu.versions.obsoleteOptions, obsoleteOptions) 3828 } 3829 3830 // disableFileDeletions disables file deletions and then waits for any 3831 // in-progress deletion to finish. The caller is required to call 3832 // enableFileDeletions in order to enable file deletions again. It is ok for 3833 // multiple callers to disable file deletions simultaneously, though they must 3834 // all invoke enableFileDeletions in order for file deletions to be re-enabled 3835 // (there is an internal reference count on file deletion disablement). 3836 // 3837 // d.mu must be held when calling this method. 3838 func (d *DB) disableFileDeletions() { 3839 d.mu.disableFileDeletions++ 3840 d.mu.Unlock() 3841 defer d.mu.Lock() 3842 d.cleanupManager.Wait() 3843 } 3844 3845 // enableFileDeletions enables previously disabled file deletions. A cleanup job 3846 // is queued if necessary. 3847 // 3848 // d.mu must be held when calling this method. 3849 func (d *DB) enableFileDeletions() { 3850 if d.mu.disableFileDeletions <= 0 { 3851 panic("pebble: file deletion disablement invariant violated") 3852 } 3853 d.mu.disableFileDeletions-- 3854 if d.mu.disableFileDeletions > 0 { 3855 return 3856 } 3857 jobID := d.mu.nextJobID 3858 d.mu.nextJobID++ 3859 d.deleteObsoleteFiles(jobID) 3860 } 3861 3862 type fileInfo struct { 3863 fileNum base.DiskFileNum 3864 fileSize uint64 3865 } 3866 3867 // deleteObsoleteFiles enqueues a cleanup job to the cleanup manager, if necessary. 3868 // 3869 // d.mu must be held when calling this. The function will release and re-aquire the mutex. 3870 // 3871 // Does nothing if file deletions are disabled (see disableFileDeletions). A 3872 // cleanup job will be scheduled when file deletions are re-enabled. 3873 func (d *DB) deleteObsoleteFiles(jobID int) { 3874 if d.mu.disableFileDeletions > 0 { 3875 return 3876 } 3877 3878 var obsoleteLogs []fileInfo 3879 for i := range d.mu.log.queue { 3880 // NB: d.mu.versions.minUnflushedLogNum is the log number of the earliest 3881 // log that has not had its contents flushed to an sstable. We can recycle 3882 // the prefix of d.mu.log.queue with log numbers less than 3883 // minUnflushedLogNum. 3884 if d.mu.log.queue[i].fileNum >= d.mu.versions.minUnflushedLogNum { 3885 obsoleteLogs = d.mu.log.queue[:i] 3886 d.mu.log.queue = d.mu.log.queue[i:] 3887 d.mu.versions.metrics.WAL.Files -= int64(len(obsoleteLogs)) 3888 break 3889 } 3890 } 3891 3892 obsoleteTables := append([]fileInfo(nil), d.mu.versions.obsoleteTables...) 3893 d.mu.versions.obsoleteTables = nil 3894 3895 for _, tbl := range obsoleteTables { 3896 delete(d.mu.versions.zombieTables, tbl.fileNum) 3897 } 3898 3899 // Sort the manifests cause we want to delete some contiguous prefix 3900 // of the older manifests. 3901 slices.SortFunc(d.mu.versions.obsoleteManifests, func(a, b fileInfo) int { 3902 return cmp.Compare(a.fileNum, b.fileNum) 3903 }) 3904 3905 var obsoleteManifests []fileInfo 3906 manifestsToDelete := len(d.mu.versions.obsoleteManifests) - d.opts.NumPrevManifest 3907 if manifestsToDelete > 0 { 3908 obsoleteManifests = d.mu.versions.obsoleteManifests[:manifestsToDelete] 3909 d.mu.versions.obsoleteManifests = d.mu.versions.obsoleteManifests[manifestsToDelete:] 3910 if len(d.mu.versions.obsoleteManifests) == 0 { 3911 d.mu.versions.obsoleteManifests = nil 3912 } 3913 } 3914 3915 obsoleteOptions := d.mu.versions.obsoleteOptions 3916 d.mu.versions.obsoleteOptions = nil 3917 3918 // Release d.mu while preparing the cleanup job and possibly waiting. 3919 // Note the unusual order: Unlock and then Lock. 3920 d.mu.Unlock() 3921 defer d.mu.Lock() 3922 3923 files := [4]struct { 3924 fileType fileType 3925 obsolete []fileInfo 3926 }{ 3927 {fileTypeLog, obsoleteLogs}, 3928 {fileTypeTable, obsoleteTables}, 3929 {fileTypeManifest, obsoleteManifests}, 3930 {fileTypeOptions, obsoleteOptions}, 3931 } 3932 _, noRecycle := d.opts.Cleaner.(base.NeedsFileContents) 3933 filesToDelete := make([]obsoleteFile, 0, len(obsoleteLogs)+len(obsoleteTables)+len(obsoleteManifests)+len(obsoleteOptions)) 3934 for _, f := range files { 3935 // We sort to make the order of deletions deterministic, which is nice for 3936 // tests. 3937 slices.SortFunc(f.obsolete, func(a, b fileInfo) int { 3938 return cmp.Compare(a.fileNum, b.fileNum) 3939 }) 3940 for _, fi := range f.obsolete { 3941 dir := d.dirname 3942 switch f.fileType { 3943 case fileTypeLog: 3944 if !noRecycle && d.logRecycler.add(fi) { 3945 continue 3946 } 3947 dir = d.walDirname 3948 case fileTypeTable: 3949 d.tableCache.evict(fi.fileNum) 3950 } 3951 3952 filesToDelete = append(filesToDelete, obsoleteFile{ 3953 dir: dir, 3954 fileNum: fi.fileNum, 3955 fileType: f.fileType, 3956 fileSize: fi.fileSize, 3957 }) 3958 } 3959 } 3960 if len(filesToDelete) > 0 { 3961 d.cleanupManager.EnqueueJob(jobID, filesToDelete) 3962 } 3963 if d.opts.private.testingAlwaysWaitForCleanup { 3964 d.cleanupManager.Wait() 3965 } 3966 } 3967 3968 func (d *DB) maybeScheduleObsoleteTableDeletion() { 3969 d.mu.Lock() 3970 defer d.mu.Unlock() 3971 d.maybeScheduleObsoleteTableDeletionLocked() 3972 } 3973 3974 func (d *DB) maybeScheduleObsoleteTableDeletionLocked() { 3975 if len(d.mu.versions.obsoleteTables) > 0 { 3976 jobID := d.mu.nextJobID 3977 d.mu.nextJobID++ 3978 d.deleteObsoleteFiles(jobID) 3979 } 3980 } 3981 3982 func merge(a, b []fileInfo) []fileInfo { 3983 if len(b) == 0 { 3984 return a 3985 } 3986 3987 a = append(a, b...) 3988 slices.SortFunc(a, func(a, b fileInfo) int { 3989 return cmp.Compare(a.fileNum, b.fileNum) 3990 }) 3991 return slices.CompactFunc(a, func(a, b fileInfo) bool { 3992 return a.fileNum == b.fileNum 3993 }) 3994 }