github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/compaction.go (about) 1 // Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "errors" 10 "fmt" 11 "math" 12 "os" 13 "sort" 14 "sync/atomic" 15 "unsafe" 16 17 "github.com/petermattis/pebble/internal/base" 18 "github.com/petermattis/pebble/internal/manifest" 19 "github.com/petermattis/pebble/internal/rangedel" 20 "github.com/petermattis/pebble/sstable" 21 "github.com/petermattis/pebble/vfs" 22 ) 23 24 var errEmptyTable = errors.New("pebble: empty table") 25 26 // expandedCompactionByteSizeLimit is the maximum number of bytes in all 27 // compacted files. We avoid expanding the lower level file set of a compaction 28 // if it would make the total compaction cover more than this many bytes. 29 func expandedCompactionByteSizeLimit(opts *Options, level int) uint64 { 30 return uint64(25 * opts.Level(level).TargetFileSize) 31 } 32 33 // maxGrandparentOverlapBytes is the maximum bytes of overlap with level+2 34 // before we stop building a single file in a level to level+1 compaction. 35 func maxGrandparentOverlapBytes(opts *Options, level int) uint64 { 36 return uint64(10 * opts.Level(level).TargetFileSize) 37 } 38 39 // totalSize returns the total size of all the files in f. 40 func totalSize(f []fileMetadata) (size uint64) { 41 for _, x := range f { 42 size += x.Size 43 } 44 return size 45 } 46 47 // compaction is a table compaction from one level to the next, starting from a 48 // given version. 49 type compaction struct { 50 cmp Compare 51 version *version 52 53 // startLevel is the level that is being compacted. Inputs from startLevel 54 // and outputLevel will be merged to produce a set of outputLevel files. 55 startLevel int 56 // outputLevel is the level that files are being produced in. outputLevel is 57 // equal to startLevel+1 except when startLevel is 0 in which case it is 58 // equal to compactionPicker.baseLevel. 59 outputLevel int 60 61 // maxOutputFileSize is the maximum size of an individual table created 62 // during compaction. 63 maxOutputFileSize uint64 64 // maxOverlapBytes is the maximum number of bytes of overlap allowed for a 65 // single output table with the tables in the grandparent level. 66 maxOverlapBytes uint64 67 // maxExpandedBytes is the maximum size of an expanded compaction. If growing 68 // a compaction results in a larger size, the original compaction is used 69 // instead. 70 maxExpandedBytes uint64 71 // disableRangeTombstoneElision disables elision of range tombstones. Used by 72 // tests to allow range tombstones to be added to tables where they would 73 // otherwise be elided. 74 disableRangeTombstoneElision bool 75 76 // flushing contains the flushables (aka memtables) that are being flushed. 77 flushing []flushable 78 // bytesIterated contains the number of bytes that have been flushed/compacted. 79 bytesIterated uint64 80 // atomicBytesIterated points to the variable to increment during iteration. 81 // atomicBytesIterated must be read/written atomically. Flushing will increment 82 // the shared variable which compaction will read. This allows for the 83 // compaction routine to know how many bytes have been flushed before the flush 84 // is applied. 85 atomicBytesIterated *uint64 86 // inputs are the tables to be compacted. 
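// inputs[0] holds the tables from startLevel and inputs[1] the overlapping
// tables from outputLevel (populated by setupOtherInputs).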
87 inputs [2][]fileMetadata 88 89 // grandparents are the tables in level+2 that overlap with the files being 90 // compacted. Used to determine output table boundaries. 91 grandparents []fileMetadata 92 overlappedBytes uint64 // bytes of overlap with grandparent tables 93 seenKey bool // some output key has been seen 94 95 metrics map[int]*LevelMetrics 96 } 97 98 func newCompaction( 99 opts *Options, 100 cur *version, 101 startLevel, 102 baseLevel int, 103 bytesCompacted *uint64, 104 ) *compaction { 105 if startLevel > 0 && startLevel < baseLevel { 106 panic(fmt.Sprintf("invalid compaction: start level %d should be empty (base level %d)", 107 startLevel, baseLevel)) 108 } 109 110 outputLevel := startLevel + 1 111 if startLevel == 0 { 112 outputLevel = baseLevel 113 } 114 if outputLevel >= numLevels-1 { 115 outputLevel = numLevels - 1 116 } 117 // Output level is in the range [baseLevel,numLevels]. For the purpose of 118 // determining the target output file size, overlap bytes, and expanded 119 // bytes, we want to adjust the range to [1,numLevels]. 120 adjustedOutputLevel := 1 + outputLevel - baseLevel 121 122 return &compaction{ 123 cmp: opts.Comparer.Compare, 124 version: cur, 125 startLevel: startLevel, 126 outputLevel: outputLevel, 127 maxOutputFileSize: uint64(opts.Level(adjustedOutputLevel).TargetFileSize), 128 maxOverlapBytes: maxGrandparentOverlapBytes(opts, adjustedOutputLevel), 129 maxExpandedBytes: expandedCompactionByteSizeLimit(opts, adjustedOutputLevel), 130 atomicBytesIterated: bytesCompacted, 131 } 132 } 133 134 func newFlush( 135 opts *Options, 136 cur *version, 137 baseLevel int, 138 flushing []flushable, 139 bytesFlushed *uint64, 140 ) *compaction { 141 c := &compaction{ 142 cmp: opts.Comparer.Compare, 143 version: cur, 144 startLevel: -1, 145 outputLevel: 0, 146 maxOutputFileSize: math.MaxUint64, 147 maxOverlapBytes: math.MaxUint64, 148 maxExpandedBytes: math.MaxUint64, 149 flushing: flushing, 150 atomicBytesIterated: bytesFlushed, 151 } 152 153 // TODO(peter): When we allow flushing to create multiple tables we'll want 154 // to choose sstable boundaries based on the grandparents. But for now we 155 // want to create a single table during flushing so this is all commented 156 // out. 
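// The disabled block below sketches that logic: it derives the flush's key
// bounds from the memtable point and range-deletion iterators and uses them
// to look up the overlapping grandparent tables.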
157 if false { 158 c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize) 159 c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0) 160 c.maxExpandedBytes = expandedCompactionByteSizeLimit(opts, 0) 161 162 var smallest InternalKey 163 var largest InternalKey 164 smallestSet, largestSet := false, false 165 166 updatePointBounds := func(iter internalIterator) { 167 if key, _ := iter.First(); key != nil { 168 if !smallestSet || 169 base.InternalCompare(c.cmp, smallest, *key) > 0 { 170 smallestSet = true 171 smallest = key.Clone() 172 } 173 } 174 if key, _ := iter.Last(); key != nil { 175 if !largestSet || 176 base.InternalCompare(c.cmp, largest, *key) < 0 { 177 largestSet = true 178 largest = key.Clone() 179 } 180 } 181 } 182 183 updateRangeBounds := func(iter internalIterator) { 184 if key, _ := iter.First(); key != nil { 185 if !smallestSet || 186 base.InternalCompare(c.cmp, smallest, *key) > 0 { 187 smallestSet = true 188 smallest = key.Clone() 189 } 190 } 191 } 192 193 for i := range flushing { 194 f := flushing[i] 195 updatePointBounds(f.newIter(nil)) 196 if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { 197 updateRangeBounds(rangeDelIter) 198 } 199 } 200 201 c.grandparents = c.version.Overlaps(baseLevel, c.cmp, smallest.UserKey, largest.UserKey) 202 } 203 return c 204 } 205 206 // setupOtherInputs fills in the rest of the compaction inputs, regardless of 207 // whether the compaction was automatically scheduled or user initiated. 208 func (c *compaction) setupOtherInputs() { 209 c.inputs[0] = c.expandInputs(c.inputs[0]) 210 smallest0, largest0 := manifest.KeyRange(c.cmp, c.inputs[0], nil) 211 c.inputs[1] = c.version.Overlaps(c.outputLevel, c.cmp, smallest0.UserKey, largest0.UserKey) 212 smallest01, largest01 := manifest.KeyRange(c.cmp, c.inputs[0], c.inputs[1]) 213 214 // Grow the inputs if it doesn't affect the number of outputLevel files. 215 if c.grow(smallest01, largest01) { 216 smallest01, largest01 = manifest.KeyRange(c.cmp, c.inputs[0], c.inputs[1]) 217 } 218 219 // Compute the set of outputLevel+1 files that overlap this compaction. 220 if c.outputLevel+1 < numLevels { 221 c.grandparents = c.version.Overlaps(c.outputLevel+1, c.cmp, smallest01.UserKey, largest01.UserKey) 222 } 223 } 224 225 // expandInputs expands the files in inputs[0] in order to maintain the 226 // invariant that the versions of keys at level+1 are older than the versions 227 // of keys at level. This is achieved by adding tables to the right of the 228 // current input tables such that the rightmost table has a "clean cut". A 229 // clean cut is either a change in user keys, or a table whose largest key is a range deletion sentinel (such a key does not actually exist in the table). 230 func (c *compaction) expandInputs(inputs []fileMetadata) []fileMetadata { 231 if c.startLevel == 0 { 232 // We already call version.Overlaps for L0 and that call guarantees that we 233 // get a "clean cut". 234 return inputs 235 } 236 files := c.version.Files[c.startLevel] 237 // Pointer arithmetic to figure out the index of inputs[0] within 238 // files. This requires that the inputs slice is a sub-slice of 239 // files. This is true for non-L0 files returned from version.Overlaps.
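// The unsafe pointer arithmetic below is simply a constant-time way of
// locating inputs[0] within files. An equivalent, illustrative (linear-time)
// formulation would be:
//
//   start := -1
//   for i := range files {
//       if &files[i] == &inputs[0] {
//           start = i
//           break
//       }
//   }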
240 if uintptr(unsafe.Pointer(&inputs[0])) < uintptr(unsafe.Pointer(&files[0])) { 241 panic("pebble: invalid input slice") 242 } 243 start := int((uintptr(unsafe.Pointer(&inputs[0])) - 244 uintptr(unsafe.Pointer(&files[0]))) / unsafe.Sizeof(inputs[0])) 245 if start >= len(files) { 246 panic("pebble: invalid input slice") 247 } 248 end := start + len(inputs) 249 for ; end < len(files); end++ { 250 cur := &files[end-1] 251 next := &files[end] 252 if c.cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 { 253 break 254 } 255 if cur.Largest.Trailer == InternalKeyRangeDeleteSentinel { 256 // The range deletion sentinel key is set for the largest key in a table 257 // when a range deletion tombstone straddles a table. It isn't necessary 258 // to include the next table in the compaction as cur.largest.UserKey 259 // does not actually exist in the table. 260 break 261 } 262 // cur.largest.UserKey == next.smallest.UserKey, so we need to include next 263 // in the compaction. 264 } 265 return files[start:end] 266 } 267 268 // grow grows the number of inputs at c.startLevel without changing the number of 269 // c.outputLevel files in the compaction, and returns whether the inputs grew. sm 270 // and la are the smallest and largest InternalKeys in all of the inputs. 271 func (c *compaction) grow(sm, la InternalKey) bool { 272 if len(c.inputs[1]) == 0 { 273 return false 274 } 275 grow0 := c.version.Overlaps(c.startLevel, c.cmp, sm.UserKey, la.UserKey) 276 grow0 = c.expandInputs(grow0) 277 if len(grow0) <= len(c.inputs[0]) { 278 return false 279 } 280 if totalSize(grow0)+totalSize(c.inputs[1]) >= c.maxExpandedBytes { 281 return false 282 } 283 sm1, la1 := manifest.KeyRange(c.cmp, grow0, nil) 284 grow1 := c.version.Overlaps(c.outputLevel, c.cmp, sm1.UserKey, la1.UserKey) 285 if len(grow1) != len(c.inputs[1]) { 286 return false 287 } 288 c.inputs[0] = grow0 289 c.inputs[1] = grow1 290 return true 291 } 292 293 func (c *compaction) trivialMove() bool { 294 if len(c.flushing) != 0 { 295 return false 296 } 297 // Check for a trivial move of one table from one level to the next. We avoid 298 // such a move if there is lots of overlapping grandparent data. Otherwise, 299 // the move could create a parent file that will require a very expensive 300 // merge later on. 301 if len(c.inputs[0]) == 1 && len(c.inputs[1]) == 0 && 302 totalSize(c.grandparents) <= c.maxOverlapBytes { 303 return true 304 } 305 return false 306 } 307 308 // shouldStopBefore returns true if the output to the current table should be 309 // finished and a new table started before adding the specified key. This is 310 // done in order to prevent a table at level N from overlapping too much data 311 // at level N+1. We want to avoid such large overlaps because they translate 312 // into large compactions. The current heuristic stops output of a table if the 313 // addition of another key would cause the table to overlap more than 10x the 314 // target file size at level N. See maxGrandparentOverlapBytes. 315 // 316 // TODO(peter): Stopping compaction output in the middle of a user-key creates 317 // 2 sstables that need to be compacted together as an "atomic compaction 318 // unit". This is unfortunate as it removes the benefit of stopping output to 319 // an sstable in order to prevent a large compaction with the next level. Seems 320 // better to adjust shouldStopBefore to not stop output in the middle of a 321 // user-key.
Perhaps this isn't a problem if the compaction picking heuristics 322 // always pick the right (older) sibling for compaction first. 323 func (c *compaction) shouldStopBefore(key InternalKey) bool { 324 for len(c.grandparents) > 0 { 325 g := &c.grandparents[0] 326 if base.InternalCompare(c.cmp, key, g.Largest) <= 0 { 327 break 328 } 329 if c.seenKey { 330 c.overlappedBytes += g.Size 331 } 332 c.grandparents = c.grandparents[1:] 333 } 334 c.seenKey = true 335 if c.overlappedBytes > c.maxOverlapBytes { 336 c.overlappedBytes = 0 337 return true 338 } 339 return false 340 } 341 342 // allowZeroSeqNum returns true if seqnums can be zeroed when there are no 343 // snapshots requiring them to be kept. It performs this determination by 344 // looking for an sstable which overlaps the bounds of the compaction at a 345 // lower level in the LSM. 346 func (c *compaction) allowZeroSeqNum(iter internalIterator) bool { 347 if len(c.flushing) != 0 { 348 if len(c.version.Files[0]) > 0 { 349 // We can only allow zeroing of seqnum for L0 tables if no other L0 tables 350 // exist. Otherwise we may violate the invariant that L0 tables are ordered 351 // by increasing seqnum. This could be relaxed with a bit more intelligence 352 // in how a new L0 table is merged into the existing set of L0 tables. 353 return false 354 } 355 lower, _ := iter.First() 356 upper, _ := iter.Last() 357 if lower == nil || upper == nil { 358 return false 359 } 360 return c.elideRangeTombstone(lower.UserKey, upper.UserKey) 361 } 362 363 var lower, upper []byte 364 for i := range c.inputs { 365 files := c.inputs[i] 366 for j := range files { 367 f := &files[j] 368 if lower == nil || c.cmp(lower, f.Smallest.UserKey) > 0 { 369 lower = f.Smallest.UserKey 370 } 371 if upper == nil || c.cmp(upper, f.Largest.UserKey) < 0 { 372 upper = f.Largest.UserKey 373 } 374 } 375 } 376 // [lower,upper] now cover the bounds of the compaction inputs. Check to see 377 // if those bounds overlap an sstable at a lower level. 378 return c.elideRangeTombstone(lower, upper) 379 } 380 381 // elideTombstone returns true if it is ok to elide a tombstone for the 382 // specified key. A return value of true guarantees that there are no key/value 383 // pairs at c.outputLevel+1 or higher that possibly contain the specified user key. 384 func (c *compaction) elideTombstone(key []byte) bool { 385 if len(c.flushing) != 0 { 386 return false 387 } 388 389 level := c.outputLevel + 1 390 if c.outputLevel == 0 { 391 // Level 0 can contain overlapping sstables so we need to check it for 392 // overlaps. 393 level = 0 394 } 395 396 // TODO(peter): this can be faster if key is always increasing between 397 // successive elideTombstone calls and we can keep some state in between 398 // calls. 399 for ; level < numLevels; level++ { 400 for _, f := range c.version.Files[level] { 401 if c.cmp(key, f.Largest.UserKey) <= 0 { 402 if c.cmp(key, f.Smallest.UserKey) >= 0 { 403 return false 404 } 405 // For levels other than level 0, the files within a level are in 406 // increasing ikey order, so we can break early. 407 break 408 } 409 } 410 } 411 return true 412 } 413 414 // elideRangeTombstone returns true if it is ok to elide the specified range 415 // tombstone. A return value of true guarantees that there are no key/value 416 // pairs at c.outputLevel+1 or higher that possibly overlap the specified 417 // tombstone.
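//
// For example, if this compaction outputs into L3 and no sstable in L4
// through the bottommost level overlaps [start, end), the tombstone does not
// overlap anything in a deeper level and is therefore safe to elide.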
418 func (c *compaction) elideRangeTombstone(start, end []byte) bool { 419 if c.disableRangeTombstoneElision { 420 return false 421 } 422 423 level := c.outputLevel + 1 424 if c.outputLevel == 0 { 425 // Level 0 can contain overlapping sstables so we need to check it for 426 // overlaps. 427 level = 0 428 } 429 430 for ; level < numLevels; level++ { 431 overlaps := c.version.Overlaps(level, c.cmp, start, end) 432 if len(overlaps) > 0 { 433 return false 434 } 435 } 436 return true 437 } 438 439 // atomicUnitBounds returns the bounds of the atomic compaction unit containing 440 // the specified sstable (identified by a pointer to its fileMetadata). 441 func (c *compaction) atomicUnitBounds(f *fileMetadata) (lower, upper []byte) { 442 for i := range c.inputs { 443 files := c.inputs[i] 444 for j := range files { 445 if f == &files[j] { 446 lowerBound := f.Smallest.UserKey 447 for k := j; k > 0; k-- { 448 cur := &files[k] 449 prev := &files[k-1] 450 if c.cmp(prev.Largest.UserKey, cur.Smallest.UserKey) < 0 { 451 break 452 } 453 if prev.Largest.Trailer == InternalKeyRangeDeleteSentinel { 454 // The range deletion sentinel key is set for the largest key in a 455 // table when a range deletion tombstone straddles a table. It 456 // isn't necessary to include the next table in the atomic 457 // compaction unit as cur.largest.UserKey does not actually exist 458 // in the table. 459 break 460 } 461 lowerBound = prev.Smallest.UserKey 462 } 463 464 upperBound := f.Largest.UserKey 465 for k := j + 1; k < len(files); k++ { 466 cur := &files[k-1] 467 next := &files[k] 468 if c.cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 { 469 break 470 } 471 if cur.Largest.Trailer == InternalKeyRangeDeleteSentinel { 472 // The range deletion sentinel key is set for the largest key in a 473 // table when a range deletion tombstone straddles a table. It 474 // isn't necessary to include the next table in the atomic 475 // compaction unit as cur.largest.UserKey does not actually exist 476 // in the table. 477 break 478 } 479 // cur.largest.UserKey == next.largest.UserKey, so next is part of 480 // the atomic compaction unit. 481 upperBound = next.Largest.UserKey 482 } 483 return lowerBound, upperBound 484 } 485 } 486 } 487 return nil, nil 488 } 489 490 // newInputIter returns an iterator over all the input tables in a compaction. 491 func (c *compaction) newInputIter( 492 newIters tableNewIters, 493 ) (_ internalIterator, retErr error) { 494 if len(c.flushing) != 0 { 495 if len(c.flushing) == 1 { 496 f := c.flushing[0] 497 iter := f.newFlushIter(nil, &c.bytesIterated) 498 if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { 499 return newMergingIter(c.cmp, iter, rangeDelIter), nil 500 } 501 return iter, nil 502 } 503 iters := make([]internalIterator, 0, 2*len(c.flushing)) 504 for i := range c.flushing { 505 f := c.flushing[i] 506 iters = append(iters, f.newFlushIter(nil, &c.bytesIterated)) 507 rangeDelIter := f.newRangeDelIter(nil) 508 if rangeDelIter != nil { 509 iters = append(iters, rangeDelIter) 510 } 511 } 512 return newMergingIter(c.cmp, iters...), nil 513 } 514 515 iters := make([]internalIterator, 0, 2*len(c.inputs[0])+1) 516 defer func() { 517 if retErr != nil { 518 for _, iter := range iters { 519 if iter != nil { 520 iter.Close() 521 } 522 } 523 } 524 }() 525 526 // In normal operation, levelIter iterates over the point operations in a 527 // level, and initializes a rangeDelIter pointer for the range deletions in 528 // each table. 
During compaction, we want to iterate over the merged view of 529 // point operations and range deletions. In order to do this we create two 530 // levelIters per level, one which iterates over the point operations, and 531 // one which iterates over the range deletions. These two iterators are 532 // combined with a mergingIter. 533 newRangeDelIter := func( 534 f *fileMetadata, _ *IterOptions, bytesIterated *uint64, 535 ) (internalIterator, internalIterator, error) { 536 iter, rangeDelIter, err := newIters(f, nil /* iter options */, &c.bytesIterated) 537 if err == nil { 538 // TODO(peter): It is mildly wasteful to open the point iterator only to 539 // immediately close it. One way to solve this would be to add new 540 // methods to tableCache for creating point and range-deletion iterators 541 // independently. We'd only want to use those methods here, 542 // though. Doesn't seem worth the hassle in the near term. 543 if err = iter.Close(); err != nil { 544 rangeDelIter.Close() 545 rangeDelIter = nil 546 } 547 } 548 if rangeDelIter != nil { 549 // Truncate the range tombstones returned by the iterator to the upper 550 // bound of the atomic compaction unit. 551 lowerBound, upperBound := c.atomicUnitBounds(f) 552 if lowerBound != nil || upperBound != nil { 553 rangeDelIter = rangedel.Truncate(c.cmp, rangeDelIter, lowerBound, upperBound) 554 } 555 } 556 return rangeDelIter, nil, err 557 } 558 559 if c.startLevel != 0 { 560 iters = append(iters, newLevelIter(nil, c.cmp, newIters, c.inputs[0], &c.bytesIterated)) 561 iters = append(iters, newLevelIter(nil, c.cmp, newRangeDelIter, c.inputs[0], &c.bytesIterated)) 562 } else { 563 for i := range c.inputs[0] { 564 f := &c.inputs[0][i] 565 iter, rangeDelIter, err := newIters(f, nil /* iter options */, &c.bytesIterated) 566 if err != nil { 567 return nil, fmt.Errorf("pebble: could not open table %d: %v", f.FileNum, err) 568 } 569 iters = append(iters, iter) 570 if rangeDelIter != nil { 571 iters = append(iters, rangeDelIter) 572 } 573 } 574 } 575 576 iters = append(iters, newLevelIter(nil, c.cmp, newIters, c.inputs[1], &c.bytesIterated)) 577 iters = append(iters, newLevelIter(nil, c.cmp, newRangeDelIter, c.inputs[1], &c.bytesIterated)) 578 return newMergingIter(c.cmp, iters...), nil 579 } 580 581 func (c *compaction) String() string { 582 if len(c.flushing) != 0 { 583 return "flush\n" 584 } 585 586 var buf bytes.Buffer 587 for i := range c.inputs { 588 level := c.startLevel 589 if i == 1 { 590 level = c.outputLevel 591 } 592 fmt.Fprintf(&buf, "%d:", level) 593 for _, f := range c.inputs[i] { 594 fmt.Fprintf(&buf, " %d:%s-%s", f.FileNum, f.Smallest, f.Largest) 595 } 596 fmt.Fprintf(&buf, "\n") 597 } 598 return buf.String() 599 } 600 601 type manualCompaction struct { 602 level int 603 outputLevel int 604 done chan error 605 start InternalKey 606 end InternalKey 607 } 608 609 func (d *DB) getCompactionPacerInfo() compactionPacerInfo { 610 bytesFlushed := atomic.LoadUint64(&d.bytesFlushed) 611 612 d.mu.Lock() 613 estimatedMaxWAmp := d.mu.versions.picker.estimatedMaxWAmp 614 pacerInfo := compactionPacerInfo{ 615 slowdownThreshold: uint64(estimatedMaxWAmp * float64(d.opts.MemTableSize)), 616 totalCompactionDebt: d.mu.versions.picker.estimatedCompactionDebt(bytesFlushed), 617 } 618 for _, m := range d.mu.mem.queue { 619 pacerInfo.totalDirtyBytes += m.totalBytes() 620 } 621 d.mu.Unlock() 622 623 return pacerInfo 624 } 625 626 func (d *DB) getFlushPacerInfo() flushPacerInfo { 627 var pacerInfo flushPacerInfo 628 d.mu.Lock() 629 for _, m := range 
d.mu.mem.queue { 630 pacerInfo.totalBytes += m.totalBytes() 631 } 632 d.mu.Unlock() 633 return pacerInfo 634 } 635 636 // maybeScheduleFlush schedules a flush if necessary. 637 // 638 // d.mu must be held when calling this. 639 func (d *DB) maybeScheduleFlush() { 640 if d.mu.compact.flushing || atomic.LoadInt32(&d.closed) != 0 || d.opts.ReadOnly { 641 return 642 } 643 if len(d.mu.mem.queue) <= 1 { 644 return 645 } 646 if !d.mu.mem.queue[0].readyForFlush() { 647 return 648 } 649 650 d.mu.compact.flushing = true 651 go d.flush() 652 } 653 654 func (d *DB) flush() { 655 d.mu.Lock() 656 defer d.mu.Unlock() 657 if err := d.flush1(); err != nil { 658 // TODO(peter): count consecutive flush errors and backoff. 659 if d.opts.EventListener.BackgroundError != nil { 660 d.opts.EventListener.BackgroundError(err) 661 } 662 } 663 d.mu.compact.flushing = false 664 // More flush work may have arrived while we were flushing, so schedule 665 // another flush if needed. 666 d.maybeScheduleFlush() 667 // The flush may have produced too many files in a level, so schedule a 668 // compaction if needed. 669 d.maybeScheduleCompaction() 670 d.mu.compact.cond.Broadcast() 671 } 672 673 // flush runs a compaction that copies the immutable memtables from memory to 674 // disk. 675 // 676 // d.mu must be held when calling this, but the mutex may be dropped and 677 // re-acquired during the course of this method. 678 func (d *DB) flush1() error { 679 var n int 680 for ; n < len(d.mu.mem.queue)-1; n++ { 681 if !d.mu.mem.queue[n].readyForFlush() { 682 break 683 } 684 } 685 if n == 0 { 686 // None of the immutable memtables are ready for flushing. 687 return nil 688 } 689 690 c := newFlush(d.opts, d.mu.versions.currentVersion(), 691 d.mu.versions.picker.baseLevel, d.mu.mem.queue[:n], &d.bytesFlushed) 692 693 jobID := d.mu.nextJobID 694 d.mu.nextJobID++ 695 if d.opts.EventListener.FlushBegin != nil { 696 d.opts.EventListener.FlushBegin(FlushInfo{ 697 JobID: jobID, 698 }) 699 } 700 701 flushPacer := newFlushPacer(flushPacerEnv{ 702 limiter: d.flushLimiter, 703 memTableSize: uint64(d.opts.MemTableSize), 704 getInfo: d.getFlushPacerInfo, 705 }) 706 ve, pendingOutputs, err := d.runCompaction(jobID, c, flushPacer) 707 708 if d.opts.EventListener.FlushEnd != nil { 709 info := FlushInfo{ 710 JobID: jobID, 711 Err: err, 712 } 713 if err == nil { 714 for i := range ve.NewFiles { 715 e := &ve.NewFiles[i] 716 info.Output = append(info.Output, e.Meta.TableInfo(d.dirname)) 717 } 718 if len(ve.NewFiles) == 0 { 719 info.Err = errEmptyTable 720 } 721 } 722 d.opts.EventListener.FlushEnd(info) 723 } 724 725 if err != nil { 726 return err 727 } 728 729 // The flush succeeded or it produced an empty sstable. In either case we 730 // want to bump the log number. 731 ve.LogNum, _ = d.mu.mem.queue[n].logInfo() 732 metrics := c.metrics[0] 733 for i := 0; i < n; i++ { 734 _, size := d.mu.mem.queue[i].logInfo() 735 metrics.BytesIn += size 736 } 737 738 err = d.mu.versions.logAndApply(jobID, ve, c.metrics, d.dataDir) 739 for _, fileNum := range pendingOutputs { 740 if _, ok := d.mu.compact.pendingOutputs[fileNum]; !ok { 741 panic("pebble: expected pending output not present") 742 } 743 delete(d.mu.compact.pendingOutputs, fileNum) 744 } 745 if err != nil { 746 return err 747 } 748 749 // Refresh bytes flushed count. 
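// d.bytesFlushed is also read by getCompactionPacerInfo to estimate the
// current compaction debt, so reset it now that this flush has been applied.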
750 atomic.StoreUint64(&d.bytesFlushed, 0) 751 752 flushed := d.mu.mem.queue[:n] 753 d.mu.mem.queue = d.mu.mem.queue[n:] 754 d.updateReadStateLocked() 755 d.deleteObsoleteFiles(jobID) 756 757 // Mark all the memtables we flushed as flushed. Note that we do this last so 758 // that a synchronous call to DB.Flush() will not return until the deletion 759 // of obsolete files from this job have completed. This makes testing easier 760 // and provides similar behavior to manual compactions where the compaction 761 // is not marked as completed until the deletion of obsolete files job has 762 // completed. 763 for i := range flushed { 764 close(flushed[i].flushed()) 765 } 766 return nil 767 } 768 769 // maybeScheduleCompaction schedules a compaction if necessary. 770 // 771 // d.mu must be held when calling this. 772 func (d *DB) maybeScheduleCompaction() { 773 if d.mu.compact.compacting || atomic.LoadInt32(&d.closed) != 0 || d.opts.ReadOnly { 774 return 775 } 776 777 if len(d.mu.compact.manual) > 0 { 778 d.mu.compact.compacting = true 779 go d.compact() 780 return 781 } 782 783 if !d.mu.versions.picker.compactionNeeded() { 784 // There is no work to be done. 785 return 786 } 787 788 d.mu.compact.compacting = true 789 go d.compact() 790 } 791 792 // compact runs one compaction and maybe schedules another call to compact. 793 func (d *DB) compact() { 794 d.mu.Lock() 795 defer d.mu.Unlock() 796 if err := d.compact1(); err != nil { 797 // TODO(peter): count consecutive compaction errors and backoff. 798 if d.opts.EventListener.BackgroundError != nil { 799 d.opts.EventListener.BackgroundError(err) 800 } 801 } 802 d.mu.compact.compacting = false 803 // The previous compaction may have produced too many files in a 804 // level, so reschedule another compaction if needed. 805 d.maybeScheduleCompaction() 806 d.mu.compact.cond.Broadcast() 807 } 808 809 // compact1 runs one compaction. 810 // 811 // d.mu must be held when calling this, but the mutex may be dropped and 812 // re-acquired during the course of this method. 
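//
// A queued manual compaction takes precedence over an automatically picked
// one; its done channel receives the resulting error when the compaction
// finishes.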
813 func (d *DB) compact1() (err error) { 814 var c *compaction 815 if len(d.mu.compact.manual) > 0 { 816 manual := d.mu.compact.manual[0] 817 d.mu.compact.manual = d.mu.compact.manual[1:] 818 c = d.mu.versions.picker.pickManual(d.opts, manual, &d.bytesCompacted) 819 defer func() { 820 manual.done <- err 821 }() 822 } else { 823 c = d.mu.versions.picker.pickAuto(d.opts, &d.bytesCompacted) 824 } 825 if c == nil { 826 return nil 827 } 828 829 jobID := d.mu.nextJobID 830 d.mu.nextJobID++ 831 info := CompactionInfo{ 832 JobID: jobID, 833 } 834 if d.opts.EventListener.CompactionBegin != nil || d.opts.EventListener.CompactionEnd != nil { 835 info.Input.Level = c.startLevel 836 info.Output.Level = c.outputLevel 837 for i := range c.inputs { 838 for j := range c.inputs[i] { 839 m := &c.inputs[i][j] 840 info.Input.Tables[i] = append(info.Input.Tables[i], m.TableInfo(d.dirname)) 841 } 842 } 843 } 844 if d.opts.EventListener.CompactionBegin != nil { 845 d.opts.EventListener.CompactionBegin(info) 846 } 847 848 compactionPacer := newCompactionPacer(compactionPacerEnv{ 849 limiter: d.compactionLimiter, 850 memTableSize: uint64(d.opts.MemTableSize), 851 getInfo: d.getCompactionPacerInfo, 852 }) 853 ve, pendingOutputs, err := d.runCompaction(jobID, c, compactionPacer) 854 855 if d.opts.EventListener.CompactionEnd != nil { 856 info.Err = err 857 if err == nil { 858 for i := range ve.NewFiles { 859 e := &ve.NewFiles[i] 860 info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo(d.dirname)) 861 } 862 } 863 d.opts.EventListener.CompactionEnd(info) 864 } 865 866 if err != nil { 867 return err 868 } 869 err = d.mu.versions.logAndApply(jobID, ve, c.metrics, d.dataDir) 870 for _, fileNum := range pendingOutputs { 871 if _, ok := d.mu.compact.pendingOutputs[fileNum]; !ok { 872 panic("pebble: expected pending output not present") 873 } 874 delete(d.mu.compact.pendingOutputs, fileNum) 875 } 876 if err != nil { 877 return err 878 } 879 880 d.updateReadStateLocked() 881 d.deleteObsoleteFiles(jobID) 882 return nil 883 } 884 885 // runCompactions runs a compaction that produces new on-disk tables from 886 // memtables or old on-disk tables. 887 // 888 // d.mu must be held when calling this, but the mutex may be dropped and 889 // re-acquired during the course of this method. 890 func (d *DB) runCompaction(jobID int, c *compaction, pacer pacer) ( 891 ve *versionEdit, pendingOutputs []uint64, retErr error, 892 ) { 893 // Check for a trivial move of one table from one level to the next. We avoid 894 // such a move if there is lots of overlapping grandparent data. Otherwise, 895 // the move could create a parent file that will require a very expensive 896 // merge later on. 897 if c.trivialMove() { 898 meta := &c.inputs[0][0] 899 c.metrics = map[int]*LevelMetrics{ 900 c.outputLevel: &LevelMetrics{ 901 BytesMoved: meta.Size, 902 }, 903 } 904 ve := &versionEdit{ 905 DeletedFiles: map[deletedFileEntry]bool{ 906 deletedFileEntry{Level: c.startLevel, FileNum: meta.FileNum}: true, 907 }, 908 NewFiles: []newFileEntry{ 909 {Level: c.outputLevel, Meta: *meta}, 910 }, 911 } 912 return ve, nil, nil 913 } 914 915 defer func() { 916 if retErr != nil { 917 for _, fileNum := range pendingOutputs { 918 delete(d.mu.compact.pendingOutputs, fileNum) 919 } 920 pendingOutputs = nil 921 } 922 }() 923 924 snapshots := d.mu.snapshots.toSlice() 925 926 // Release the d.mu lock while doing I/O. 927 // Note the unusual order: Unlock and then Lock. 
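// The deferred Lock re-acquires d.mu before runCompaction returns, so the
// caller still holds the mutex on return as documented above.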
928 d.mu.Unlock() 929 defer d.mu.Lock() 930 931 iiter, err := c.newInputIter(d.newIters) 932 if err != nil { 933 return nil, pendingOutputs, err 934 } 935 iter := newCompactionIter(c.cmp, d.merge, iiter, snapshots, 936 c.allowZeroSeqNum(iiter), c.elideTombstone, c.elideRangeTombstone) 937 938 var ( 939 filenames []string 940 tw *sstable.Writer 941 ) 942 defer func() { 943 if iter != nil { 944 retErr = firstError(retErr, iter.Close()) 945 } 946 if tw != nil { 947 retErr = firstError(retErr, tw.Close()) 948 } 949 if retErr != nil { 950 for _, filename := range filenames { 951 d.opts.FS.Remove(filename) 952 } 953 } 954 }() 955 956 ve = &versionEdit{ 957 DeletedFiles: map[deletedFileEntry]bool{}, 958 } 959 960 metrics := &LevelMetrics{ 961 BytesIn: totalSize(c.inputs[0]), 962 BytesRead: totalSize(c.inputs[1]), 963 } 964 metrics.BytesRead += metrics.BytesIn 965 c.metrics = map[int]*LevelMetrics{ 966 c.outputLevel: metrics, 967 } 968 969 newOutput := func() error { 970 d.mu.Lock() 971 fileNum := d.mu.versions.getNextFileNum() 972 d.mu.compact.pendingOutputs[fileNum] = struct{}{} 973 pendingOutputs = append(pendingOutputs, fileNum) 974 d.mu.Unlock() 975 976 filename := base.MakeFilename(d.dirname, fileTypeTable, fileNum) 977 file, err := d.opts.FS.Create(filename) 978 if err != nil { 979 return err 980 } 981 if d.opts.EventListener.TableCreated != nil { 982 reason := "flushing" 983 if c.flushing == nil { 984 reason = "compacting" 985 } 986 d.opts.EventListener.TableCreated(TableCreateInfo{ 987 JobID: jobID, 988 Reason: reason, 989 Path: filename, 990 FileNum: fileNum, 991 }) 992 } 993 file = vfs.NewSyncingFile(file, vfs.SyncingFileOptions{ 994 BytesPerSync: d.opts.BytesPerSync, 995 }) 996 filenames = append(filenames, filename) 997 tw = sstable.NewWriter(file, d.opts, d.opts.Level(c.outputLevel)) 998 999 ve.NewFiles = append(ve.NewFiles, newFileEntry{ 1000 Level: c.outputLevel, 1001 Meta: fileMetadata{ 1002 FileNum: fileNum, 1003 }, 1004 }) 1005 return nil 1006 } 1007 1008 finishOutput := func(key InternalKey) error { 1009 // NB: clone the key because the data can be held on to by the call to 1010 // compactionIter.Tombstones via rangedel.Fragmenter.FlushTo. 1011 key = key.Clone() 1012 for _, v := range iter.Tombstones(key.UserKey) { 1013 if tw == nil { 1014 if err := newOutput(); err != nil { 1015 return err 1016 } 1017 } 1018 if err := tw.Add(v.Start, v.End); err != nil { 1019 return err 1020 } 1021 } 1022 1023 if tw == nil { 1024 return nil 1025 } 1026 1027 if err := tw.Close(); err != nil { 1028 tw = nil 1029 return err 1030 } 1031 writerMeta, err := tw.Metadata() 1032 if err != nil { 1033 tw = nil 1034 return err 1035 } 1036 tw = nil 1037 meta := &ve.NewFiles[len(ve.NewFiles)-1].Meta 1038 meta.Size = writerMeta.Size 1039 meta.SmallestSeqNum = writerMeta.SmallestSeqNum 1040 meta.LargestSeqNum = writerMeta.LargestSeqNum 1041 1042 metrics.BytesWritten += meta.Size 1043 1044 // The handling of range boundaries is a bit complicated. 1045 if n := len(ve.NewFiles); n > 1 { 1046 // This is not the first output. Bound the smallest range key by the 1047 // previous tables largest key. 1048 prevMeta := &ve.NewFiles[n-2].Meta 1049 if writerMeta.SmallestRange.UserKey != nil && 1050 d.cmp(writerMeta.SmallestRange.UserKey, prevMeta.Largest.UserKey) <= 0 { 1051 // The range boundary user key is less than or equal to the previous 1052 // table's largest key. We need the tables to be key-space partitioned, 1053 // so force the boundary to a key that we know is larger than the 1054 // previous key. 
1055 // 1056 // We use seqnum zero since seqnums are in descending order, and our 1057 // goal is to ensure this forged key does not overlap with the previous 1058 // file. `InternalKeyRangeDeleteSentinel` is actually the first key 1059 // kind as key kinds are also in descending order. But, this is OK 1060 // because choosing seqnum zero is already enough to prevent overlap 1061 // (the previous file could not end with a key at seqnum zero if this 1062 // file had a tombstone extending into it). 1063 writerMeta.SmallestRange = base.MakeInternalKey( 1064 prevMeta.Largest.UserKey, 0, InternalKeyKindRangeDelete) 1065 } 1066 } 1067 1068 if key.UserKey != nil && writerMeta.LargestRange.UserKey != nil { 1069 if d.cmp(writerMeta.LargestRange.UserKey, key.UserKey) >= 0 { 1070 writerMeta.LargestRange = key 1071 writerMeta.LargestRange.Trailer = InternalKeyRangeDeleteSentinel 1072 } 1073 } 1074 1075 meta.Smallest = writerMeta.Smallest(d.cmp) 1076 meta.Largest = writerMeta.Largest(d.cmp) 1077 return nil 1078 } 1079 1080 for key, val := iter.First(); key != nil; key, val = iter.Next() { 1081 atomic.StoreUint64(c.atomicBytesIterated, c.bytesIterated) 1082 1083 if err := pacer.maybeThrottle(c.bytesIterated); err != nil { 1084 return nil, pendingOutputs, err 1085 } 1086 1087 // TODO(peter,rangedel): Need to incorporate the range tombstones in the 1088 // shouldStopBefore decision. 1089 if tw != nil && (tw.EstimatedSize() >= c.maxOutputFileSize || c.shouldStopBefore(*key)) { 1090 if err := finishOutput(*key); err != nil { 1091 return nil, pendingOutputs, err 1092 } 1093 } 1094 1095 if tw == nil { 1096 if err := newOutput(); err != nil { 1097 return nil, pendingOutputs, err 1098 } 1099 } 1100 1101 if err := tw.Add(*key, val); err != nil { 1102 return nil, pendingOutputs, err 1103 } 1104 } 1105 1106 if err := finishOutput(InternalKey{}); err != nil { 1107 return nil, pendingOutputs, err 1108 } 1109 1110 for i := range c.inputs { 1111 level := c.startLevel 1112 if i == 1 { 1113 level = c.outputLevel 1114 } 1115 for _, f := range c.inputs[i] { 1116 ve.DeletedFiles[deletedFileEntry{ 1117 Level: level, 1118 FileNum: f.FileNum, 1119 }] = true 1120 } 1121 } 1122 1123 if err := d.dataDir.Sync(); err != nil { 1124 return nil, pendingOutputs, err 1125 } 1126 return ve, pendingOutputs, nil 1127 } 1128 1129 // scanObsoleteFiles scans the filesystem for files that are no longer needed 1130 // and adds those to the internal lists of obsolete files. Note that he files 1131 // are not actually deleted by this method. A subsequent call to 1132 // deleteObsoleteFiles must be performed. 1133 func (d *DB) scanObsoleteFiles(list []string) { 1134 liveFileNums := make(map[uint64]struct{}, len(d.mu.compact.pendingOutputs)) 1135 for fileNum := range d.mu.compact.pendingOutputs { 1136 liveFileNums[fileNum] = struct{}{} 1137 } 1138 d.mu.versions.addLiveFileNums(liveFileNums) 1139 logNumber := d.mu.versions.logNum 1140 manifestFileNumber := d.mu.versions.manifestFileNum 1141 1142 var obsoleteLogs []uint64 1143 var obsoleteTables []uint64 1144 var obsoleteManifests []uint64 1145 var obsoleteOptions []uint64 1146 1147 for _, filename := range list { 1148 fileType, fileNum, ok := base.ParseFilename(filename) 1149 if !ok { 1150 continue 1151 } 1152 switch fileType { 1153 case fileTypeLog: 1154 // TODO(peter): also look at prevLogNumber? 
1155 if fileNum >= logNumber { 1156 continue 1157 } 1158 obsoleteLogs = append(obsoleteLogs, fileNum) 1159 case fileTypeManifest: 1160 if fileNum >= manifestFileNumber { 1161 continue 1162 } 1163 obsoleteManifests = append(obsoleteManifests, fileNum) 1164 case fileTypeOptions: 1165 if fileNum >= d.optionsFileNum { 1166 continue 1167 } 1168 obsoleteOptions = append(obsoleteOptions, fileNum) 1169 case fileTypeTable: 1170 if _, ok := liveFileNums[fileNum]; ok { 1171 continue 1172 } 1173 obsoleteTables = append(obsoleteTables, fileNum) 1174 default: 1175 // Don't delete files we don't know about. 1176 continue 1177 } 1178 } 1179 1180 d.mu.log.queue = merge(d.mu.log.queue, obsoleteLogs) 1181 d.mu.versions.metrics.WAL.Files += int64(len(obsoleteLogs)) 1182 d.mu.versions.obsoleteTables = merge(d.mu.versions.obsoleteTables, obsoleteTables) 1183 d.mu.versions.obsoleteManifests = merge(d.mu.versions.obsoleteManifests, obsoleteManifests) 1184 d.mu.versions.obsoleteOptions = merge(d.mu.versions.obsoleteOptions, obsoleteOptions) 1185 } 1186 1187 // deleteObsoleteFiles deletes those files that are no longer needed. 1188 // 1189 // d.mu must be held when calling this, but the mutex may be dropped and 1190 // re-acquired during the course of this method. 1191 func (d *DB) deleteObsoleteFiles(jobID int) { 1192 // Only allow a single delete obsolete files job to run at a time. 1193 for d.mu.cleaner.cleaning { 1194 d.mu.cleaner.cond.Wait() 1195 } 1196 d.mu.cleaner.cleaning = true 1197 defer func() { 1198 d.mu.cleaner.cleaning = false 1199 d.mu.cleaner.cond.Signal() 1200 }() 1201 1202 var obsoleteLogs []uint64 1203 for i := range d.mu.log.queue { 1204 // NB: d.mu.versions.logNum is the file number of the latest log that 1205 // has had its contents persisted to the LSM. 1206 if d.mu.log.queue[i] >= d.mu.versions.logNum { 1207 obsoleteLogs = d.mu.log.queue[:i] 1208 d.mu.log.queue = d.mu.log.queue[i:] 1209 d.mu.versions.metrics.WAL.Files -= int64(len(obsoleteLogs)) 1210 break 1211 } 1212 } 1213 1214 obsoleteTables := d.mu.versions.obsoleteTables 1215 d.mu.versions.obsoleteTables = nil 1216 1217 obsoleteManifests := d.mu.versions.obsoleteManifests 1218 d.mu.versions.obsoleteManifests = nil 1219 1220 obsoleteOptions := d.mu.versions.obsoleteOptions 1221 d.mu.versions.obsoleteOptions = nil 1222 1223 // Release d.mu while doing I/O. 1224 // Note the unusual order: Unlock and then Lock. 1225 d.mu.Unlock() 1226 defer d.mu.Lock() 1227 1228 files := [4]struct { 1229 fileType fileType 1230 obsolete []uint64 1231 }{ 1232 {fileTypeLog, obsoleteLogs}, 1233 {fileTypeTable, obsoleteTables}, 1234 {fileTypeManifest, obsoleteManifests}, 1235 {fileTypeOptions, obsoleteOptions}, 1236 } 1237 for _, f := range files { 1238 // We sort to make the order of deletions deterministic, which is nice for 1239 // tests. 1240 sort.Slice(f.obsolete, func(i, j int) bool { 1241 return f.obsolete[i] < f.obsolete[j] 1242 }) 1243 for _, fileNum := range f.obsolete { 1244 switch f.fileType { 1245 case fileTypeLog: 1246 if d.logRecycler.add(fileNum) { 1247 continue 1248 } 1249 case fileTypeTable: 1250 d.tableCache.evict(fileNum) 1251 } 1252 1253 path := base.MakeFilename(d.dirname, f.fileType, fileNum) 1254 err := d.opts.FS.Remove(path) 1255 if err == os.ErrNotExist { 1256 continue 1257 } 1258 1259 // TODO(peter): need to handle this error, probably by re-adding the 1260 // file that couldn't be deleted to one of the obsolete file slices.
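// Notify any configured event listener of the deletion. An error from
// FS.Remove (other than os.ErrNotExist, which was skipped above) is surfaced
// via the Err field of the info struct.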
1261 1262 switch f.fileType { 1263 case fileTypeLog: 1264 if d.opts.EventListener.WALDeleted != nil { 1265 d.opts.EventListener.WALDeleted(WALDeleteInfo{ 1266 JobID: jobID, 1267 Path: path, 1268 FileNum: fileNum, 1269 Err: err, 1270 }) 1271 } 1272 case fileTypeManifest: 1273 if d.opts.EventListener.ManifestDeleted != nil { 1274 d.opts.EventListener.ManifestDeleted(ManifestDeleteInfo{ 1275 JobID: jobID, 1276 Path: path, 1277 FileNum: fileNum, 1278 Err: err, 1279 }) 1280 } 1281 case fileTypeTable: 1282 if d.opts.EventListener.TableDeleted != nil { 1283 d.opts.EventListener.TableDeleted(TableDeleteInfo{ 1284 JobID: jobID, 1285 Path: path, 1286 FileNum: fileNum, 1287 Err: err, 1288 }) 1289 } 1290 } 1291 } 1292 } 1293 } 1294 1295 func merge(a, b []uint64) []uint64 { 1296 if len(b) == 0 { 1297 return a 1298 } 1299 1300 a = append(a, b...) 1301 sort.Slice(a, func(i, j int) bool { 1302 return a[i] < a[j] 1303 }) 1304 1305 n := 0 1306 for i := 0; i < len(a); i++ { 1307 if n == 0 || a[i] != a[n-1] { 1308 a[n] = a[i] 1309 n++ 1310 } 1311 } 1312 return a[:n] 1313 }
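// For instance (illustrative): merge([]uint64{3, 7}, []uint64{5, 7}) returns
// []uint64{3, 5, 7}; the two slices are concatenated, sorted, and
// de-duplicated in place.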