github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/levels.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package badger 18 19 import ( 20 "bytes" 21 "fmt" 22 "os" 23 "sort" 24 "time" 25 26 "github.com/coocood/badger/epoch" 27 "github.com/coocood/badger/options" 28 "github.com/coocood/badger/protos" 29 "github.com/coocood/badger/table" 30 "github.com/coocood/badger/table/sstable" 31 "github.com/coocood/badger/y" 32 "github.com/ncw/directio" 33 "github.com/pingcap/errors" 34 "github.com/pingcap/log" 35 "go.uber.org/zap" 36 "golang.org/x/time/rate" 37 ) 38 39 type levelsController struct { 40 nextFileID uint64 // Atomic 41 42 // The following are initialized once and const. 43 resourceMgr *epoch.ResourceManager 44 levels []*levelHandler 45 kv *DB 46 47 cstatus compactStatus 48 49 opt options.TableBuilderOptions 50 } 51 52 var ( 53 // This is for getting timings between stalls. 54 lastUnstalled time.Time 55 ) 56 57 // revertToManifest checks that all necessary table files exist and removes all table files not 58 // referenced by the manifest. idMap is a set of table file id's that were read from the directory 59 // listing. 60 func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error { 61 // 1. Check all files in manifest exist. 
62 for id := range mf.Tables { 63 if _, ok := idMap[id]; !ok { 64 return fmt.Errorf("file does not exist for table %d", id) 65 } 66 } 67 68 // 2. Delete files that shouldn't exist. 69 for id := range idMap { 70 if _, ok := mf.Tables[id]; !ok { 71 log.Info("table file not referenced in MANIFEST", zap.Uint64("id", id)) 72 filename := sstable.NewFilename(id, kv.opt.Dir) 73 if err := os.Remove(filename); err != nil { 74 return y.Wrapf(err, "While removing table %d", id) 75 } 76 } 77 } 78 79 return nil 80 } 81 82 func newLevelsController(kv *DB, mf *Manifest, mgr *epoch.ResourceManager, opt options.TableBuilderOptions) (*levelsController, error) { 83 y.Assert(kv.opt.NumLevelZeroTablesStall > kv.opt.NumLevelZeroTables) 84 s := &levelsController{ 85 kv: kv, 86 levels: make([]*levelHandler, kv.opt.TableBuilderOptions.MaxLevels), 87 opt: opt, 88 resourceMgr: mgr, 89 } 90 s.cstatus.levels = make([]*levelCompactStatus, kv.opt.TableBuilderOptions.MaxLevels) 91 92 for i := 0; i < kv.opt.TableBuilderOptions.MaxLevels; i++ { 93 s.levels[i] = newLevelHandler(kv, i) 94 if i == 0 { 95 // Do nothing. 96 } else if i == 1 { 97 // Level 1 probably shouldn't be too much bigger than level 0. 98 s.levels[i].maxTotalSize = kv.opt.LevelOneSize 99 } else { 100 s.levels[i].maxTotalSize = s.levels[i-1].maxTotalSize * int64(kv.opt.TableBuilderOptions.LevelSizeMultiplier) 101 } 102 s.cstatus.levels[i] = new(levelCompactStatus) 103 } 104 105 // Compare manifest against directory, check for existent/non-existent files, and remove. 106 if err := revertToManifest(kv, mf, getIDMap(kv.opt.Dir)); err != nil { 107 return nil, err 108 } 109 110 // Some files may be deleted. Let's reload. 
111 tables := make([][]table.Table, kv.opt.TableBuilderOptions.MaxLevels) 112 var maxFileID uint64 113 for fileID, tableManifest := range mf.Tables { 114 fname := sstable.NewFilename(fileID, kv.opt.Dir) 115 var flags uint32 = y.Sync 116 if kv.opt.ReadOnly { 117 flags |= y.ReadOnly 118 } 119 120 t, err := sstable.OpenTable(fname, kv.blockCache, kv.indexCache) 121 if err != nil { 122 closeAllTables(tables) 123 return nil, errors.Wrapf(err, "Opening table: %q", fname) 124 } 125 126 level := tableManifest.Level 127 tables[level] = append(tables[level], t) 128 129 if fileID > maxFileID { 130 maxFileID = fileID 131 } 132 } 133 s.nextFileID = maxFileID + 1 134 for i, tbls := range tables { 135 s.levels[i].initTables(tbls) 136 } 137 138 // Make sure key ranges do not overlap etc. 139 if err := s.validate(); err != nil { 140 _ = s.cleanupLevels() 141 return nil, errors.Wrap(err, "Level validation") 142 } 143 144 // Sync directory (because we have at least removed some files, or previously created the 145 // manifest file). 146 if err := syncDir(kv.opt.Dir); err != nil { 147 _ = s.close() 148 return nil, err 149 } 150 151 return s, nil 152 } 153 154 // Closes the tables, for cleanup in newLevelsController. (We Close() instead of using DecrRef() 155 // because that would delete the underlying files.) We ignore errors, which is OK because tables 156 // are read-only. 
157 func closeAllTables(tables [][]table.Table) { 158 for _, tableSlice := range tables { 159 for _, table := range tableSlice { 160 _ = table.Close() 161 } 162 } 163 } 164 165 func (lc *levelsController) cleanupLevels() error { 166 var firstErr error 167 for _, l := range lc.levels { 168 if err := l.close(); err != nil && firstErr == nil { 169 firstErr = err 170 } 171 } 172 return firstErr 173 } 174 175 func (lc *levelsController) startCompact(c *y.Closer) { 176 n := lc.kv.opt.NumCompactors 177 c.AddRunning(n - 1) 178 for i := 0; i < n; i++ { 179 // The first half compaction workers take level as priority, others take score 180 // as priority. 181 go lc.runWorker(c, i*2 >= n) 182 } 183 } 184 185 func (lc *levelsController) runWorker(c *y.Closer, scorePriority bool) { 186 defer c.Done() 187 if lc.kv.opt.DoNotCompact { 188 return 189 } 190 191 for { 192 guard := lc.resourceMgr.Acquire() 193 prios := lc.pickCompactLevels() 194 if scorePriority { 195 sort.Slice(prios, func(i, j int) bool { 196 return prios[i].score > prios[j].score 197 }) 198 } 199 var didCompact bool 200 for _, p := range prios { 201 // TODO: Handle error. 202 didCompact, _ = lc.doCompact(p, guard) 203 if didCompact { 204 break 205 } 206 } 207 guard.Done() 208 waitDur := time.Second * 3 209 if didCompact { 210 waitDur /= 10 211 } 212 timer := time.NewTimer(waitDur) 213 select { 214 case <-c.HasBeenClosed(): 215 timer.Stop() 216 return 217 case <-timer.C: 218 } 219 } 220 } 221 222 // Returns true if level zero may be compacted, without accounting for compactions that already 223 // might be happening. 224 func (lc *levelsController) isL0Compactable() bool { 225 return lc.levels[0].numTables() >= lc.kv.opt.NumLevelZeroTables 226 } 227 228 // Returns true if the non-zero level may be compacted. 
// Returns true if the non-zero level may be compacted. deltaSize provides the size of the tables
// which are currently being compacted so that we treat them as already having started being
// compacted (because they have been, yet their size is already counted in getTotalSize).
func (l *levelHandler) isCompactable(deltaSize int64) bool {
	return l.getTotalSize() >= l.maxTotalSize+deltaSize
}

// compactionPriority names one candidate level for compaction; a higher score
// means the level is further over its limit.
type compactionPriority struct {
	level int
	score float64
}

// pickCompactLevels determines which level to compact.
// Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
func (lc *levelsController) pickCompactLevels() (prios []compactionPriority) {
	// This function must use identical criteria for guaranteeing compaction's progress that
	// addLevel0Table uses.

	// cstatus is checked to see if level 0's tables are already being compacted.
	if !lc.cstatus.overlapsWith(0, infRange) && lc.isL0Compactable() {
		pri := compactionPriority{
			level: 0,
			// L0 score is table-count based, not size based.
			score: float64(lc.levels[0].numTables()) / float64(lc.kv.opt.NumLevelZeroTables),
		}
		prios = append(prios, pri)
	}

	// Now calculate scores from level 1 onwards.
	for levelNum := 1; levelNum < len(lc.levels); levelNum++ {
		// Don't consider those tables that are already being compacted right now.
		deltaSize := lc.cstatus.deltaSize(levelNum)

		l := lc.levels[levelNum]
		if l.isCompactable(deltaSize) {
			pri := compactionPriority{
				level: levelNum,
				score: float64(l.getTotalSize()-deltaSize) / float64(l.maxTotalSize),
			}
			prios = append(prios, pri)
		}
	}
	// We used to sort compaction priorities based on the score. But, we
	// decided to compact based on the level, not the priority. So, upper
	// levels (level 0, level 1, etc) always get compacted first, before the
	// lower levels -- this allows us to avoid stalls.
	return prios
}

// hasOverlapTable reports whether the key range covered by cd.top overlaps any
// table in the levels strictly below cd.nextLevel. If it does, deletion
// tombstones must be kept during compaction (see compactBuildTables).
func (lc *levelsController) hasOverlapTable(cd *compactDef) bool {
	kr := getKeyRange(cd.top)
	for i := cd.nextLevel.level + 1; i < len(lc.levels); i++ {
		lh := lc.levels[i]
		lh.RLock()
		left, right := lh.overlappingTables(levelHandlerRLocked{}, kr)
		lh.RUnlock()
		if right-left > 0 {
			return true
		}
	}
	return false
}

// DiscardStats aggregates, over one compaction run, the entries that were
// dropped and the blob pointers they referenced, so the value log can be
// informed for garbage collection.
type DiscardStats struct {
	numSkips     int64         // number of discarded entries
	skippedBytes int64         // total bytes referenced by discarded blob pointers
	ptrs         []blobPointer // pointers of discarded values living in the blob/value log
}

// collect records one discarded entry. If the value is a pointer into the
// blob store (bitValuePointer set), the decoded pointer and its length are
// accumulated as reclaimable.
func (ds *DiscardStats) collect(vs y.ValueStruct) {
	if vs.Meta&bitValuePointer > 0 {
		var bp blobPointer
		bp.decode(vs.Value)
		ds.ptrs = append(ds.ptrs, bp)
		ds.skippedBytes += int64(bp.length)
	}
	ds.numSkips++
}

// String implements fmt.Stringer for logging.
func (ds *DiscardStats) String() string {
	return fmt.Sprintf("numSkips:%d, skippedBytes:%d", ds.numSkips, ds.skippedBytes)
}

// shouldFinishFile decides whether the table builder should close the current
// output file before writing `key`. A guard boundary forces a cut (so keys
// with different guard prefixes land in different files); otherwise the file
// is cut when it exceeds maxSize (possibly tightened to guard.MinSize when the
// guard match is broken).
func shouldFinishFile(key, lastKey y.Key, guard *Guard, currentSize, maxSize int64) bool {
	if lastKey.IsEmpty() {
		return false
	}
	if guard != nil {
		if !bytes.HasPrefix(key.UserKey, guard.Prefix) {
			return true
		}
		if !matchGuard(key.UserKey, lastKey.UserKey, guard) {
			if maxSize > guard.MinSize {
				maxSize = guard.MinSize
			}
		}
	}
	return currentSize > maxSize
}

// matchGuard reports whether key shares lastKey's first MatchLen bytes.
func matchGuard(key, lastKey []byte, guard *Guard) bool {
	if len(lastKey) < guard.MatchLen {
		return false
	}
	return bytes.HasPrefix(key, lastKey[:guard.MatchLen])
}

// searchGuard returns the guard with the longest prefix matching key, or nil.
func searchGuard(key []byte, guards []Guard) *Guard {
	var maxMatchGuard *Guard
	for i := range guards {
		guard := &guards[i]
		if bytes.HasPrefix(key, guard.Prefix) {
			if maxMatchGuard == nil || len(guard.Prefix) > len(maxMatchGuard.Prefix) {
				maxMatchGuard = guard
			}
		}
	}
	return maxMatchGuard
}

// overSkipTables drops from skippedTables every table whose biggest key is
// strictly below key, returning the remaining tables and whether any table was
// passed over. skippedTables is assumed to be sorted by key range.
func overSkipTables(key y.Key, skippedTables []table.Table) (newSkippedTables []table.Table, over bool) {
	var i int
	for i < len(skippedTables) {
		t := skippedTables[i]
		if key.Compare(t.Biggest()) > 0 {
			i++
		} else {
			break
		}
	}
	return skippedTables[i:], i > 0
}

// compactBuildTables merge topTables and botTables to form a list of new tables.
// It merges all input iterators, discards versions older than the compact-safe
// timestamp, applies the optional compaction filter, and splits output into
// files on guard boundaries, size limits and splitHints.
func (lc *levelsController) compactBuildTables(level int, cd *compactDef,
	limiter *rate.Limiter, splitHints []y.Key) (newTables []table.Table, err error) {
	topTables := cd.top
	botTables := cd.bot

	hasOverlap := lc.hasOverlapTable(cd)
	log.Info("check range with lower level", zap.Bool("overlapped", hasOverlap))

	// Try to collect stats so that we can inform value log about GC. That would help us find which
	// value log file should be GCed.
	discardStats := &DiscardStats{}

	// Create iterators across all the tables involved first.
	var iters []y.Iterator
	if level == 0 {
		// L0 tables may overlap each other; newer tables must win, hence the
		// reversed per-table iterators instead of a concat iterator.
		iters = appendIteratorsReversed(iters, topTables, false)
	} else {
		iters = []y.Iterator{table.NewConcatIterator(topTables, false)}
	}

	// Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap.
	iters = append(iters, table.NewConcatIterator(botTables, false))
	it := table.NewMergeIterator(iters, false)

	it.Rewind()

	// Pick up the currently pending transactions' min readTs, so we can discard versions below this
	// readTs. We should never discard any versions starting from above this timestamp, because that
	// would affect the snapshot view guarantee provided by transactions.
	safeTs := lc.kv.getCompactSafeTs()

	var filter CompactionFilter
	var guards []Guard
	if lc.kv.opt.CompactionFilterFactory != nil {
		filter = lc.kv.opt.CompactionFilterFactory(level+1, cd.smallest().UserKey, cd.biggest().UserKey)
		guards = filter.Guards()
	}
	skippedTbls := cd.skippedTbls

	var lastKey, skipKey y.Key
	var builder *sstable.Builder
	var bytesRead, bytesWrite, numRead, numWrite int
	// Outer loop: one iteration per output file.
	for it.Valid() {
		fileID := lc.reserveFileID()
		filename := sstable.NewFilename(fileID, lc.kv.opt.Dir)
		var fd *os.File
		fd, err = directio.OpenFile(filename, os.O_CREATE|os.O_RDWR, 0666)
		if err != nil {
			return
		}
		if builder == nil {
			builder = sstable.NewTableBuilder(fd, limiter, cd.nextLevel.level, lc.opt)
		} else {
			builder.Reset(fd)
		}
		lastKey.Reset()
		guard := searchGuard(it.Key().UserKey, guards)
		// Inner loop: one iteration per key version, until a file cut.
		for ; it.Valid(); y.NextAllVersion(it) {
			numRead++
			vs := it.Value()
			key := it.Key()
			kvSize := int(vs.EncodedSize()) + key.Len()
			bytesRead += kvSize
			// See if we need to skip this key.
			if !skipKey.IsEmpty() {
				if key.SameUserKey(skipKey) {
					discardStats.collect(vs)
					continue
				} else {
					skipKey.Reset()
				}
			}
			if !key.SameUserKey(lastKey) {
				// Only break if we are on a different key, and have reached capacity. We want
				// to ensure that all versions of the key are stored in the same sstable, and
				// not divided across multiple tables at the same level.
				if len(skippedTbls) > 0 {
					var over bool
					skippedTbls, over = overSkipTables(key, skippedTbls)
					if over && !builder.Empty() {
						// Cut the file so the skipped table's range stays untouched.
						break
					}
				}
				if shouldFinishFile(key, lastKey, guard, int64(builder.EstimateSize()), lc.kv.opt.MaxTableSize) {
					break
				}
				if len(splitHints) != 0 && key.Compare(splitHints[0]) >= 0 {
					// Consume every hint at or below key before cutting the file.
					splitHints = splitHints[1:]
					for len(splitHints) > 0 && key.Compare(splitHints[0]) >= 0 {
						splitHints = splitHints[1:]
					}
					break
				}
				lastKey.Copy(key)
			}

			// Only consider the versions which are below the minReadTs, otherwise, we might end up discarding the
			// only valid version for a running transaction.
			if key.Version <= safeTs {
				// key is the latest readable version of this key, so we simply discard all the rest of the versions.
				skipKey.Copy(key)

				if isDeleted(vs.Meta) {
					// If this key range has overlap with lower levels, then keep the deletion
					// marker with the latest version, discarding the rest. We have set skipKey,
					// so the following key versions would be skipped. Otherwise discard the deletion marker.
					if !hasOverlap {
						continue
					}
				} else if filter != nil {
					switch filter.Filter(key.UserKey, vs.Value, vs.UserMeta) {
					case DecisionMarkTombstone:
						discardStats.collect(vs)
						if hasOverlap {
							// There may be old versions for this key, so convert to delete tombstone.
							builder.Add(key, y.ValueStruct{Meta: bitDelete})
						}
						continue
					case DecisionDrop:
						discardStats.collect(vs)
						continue
					case DecisionKeep:
					}
				}
			}
			builder.Add(key, vs)
			numWrite++
			bytesWrite += kvSize
		}
		if builder.Empty() {
			continue
		}
		if err = builder.Finish(); err != nil {
			return
		}
		fd.Close()
		var tbl table.Table
		tbl, err = sstable.OpenTable(filename, lc.kv.blockCache, lc.kv.indexCache)
		if err != nil {
			return
		}
		if tbl.Smallest().IsEmpty() {
			// Table is empty after all; delete the file instead of keeping it.
			tbl.Delete()
		} else {
			newTables = append(newTables, tbl)
		}
	}

	stats := &y.CompactionStats{
		KeysRead:     numRead,
		BytesRead:    bytesRead,
		KeysWrite:    numWrite,
		BytesWrite:   bytesWrite,
		KeysDiscard:  int(discardStats.numSkips),
		BytesDiscard: int(discardStats.skippedBytes),
	}
	cd.nextLevel.metrics.UpdateCompactionStats(stats)
	// Ensure created files' directory entries are visible. We don't mind the extra latency
	// from not doing this ASAP after all file creation has finished because this is a
	// background operation.
	err = syncDir(lc.kv.opt.Dir)
	if err != nil {
		log.Error("compact sync dir error", zap.Error(err))
		return
	}
	sortTables(newTables)
	log.Info("compact send discard stats", zap.Stringer("stats", discardStats))
	if len(discardStats.ptrs) > 0 {
		lc.kv.blobManger.discardCh <- discardStats
	}
	return
}

// buildChangeSet turns a finished compaction into manifest changes: create one
// entry per new table at the next level, delete every compacted input table.
func buildChangeSet(cd *compactDef, newTables []table.Table) protos.ManifestChangeSet {
	changes := []*protos.ManifestChange{}
	for _, table := range newTables {
		changes = append(changes,
			newCreateChange(table.ID(), cd.nextLevel.level))
	}
	for _, table := range cd.top {
		changes = append(changes, newDeleteChange(table.ID()))
	}
	for _, table := range cd.bot {
		changes = append(changes, newDeleteChange(table.ID()))
	}
	return protos.ManifestChangeSet{Changes: changes}
}

// compactDef describes one compaction: the tables chosen from thisLevel (top)
// and nextLevel (bot), the bottom tables deliberately skipped, the covered key
// ranges, and the index/size bookkeeping used while picking tables.
type compactDef struct {
	thisLevel *levelHandler
	nextLevel *levelHandler

	top []table.Table // tables from thisLevel to compact
	bot []table.Table // overlapping tables from nextLevel

	// nextLevel tables overlapped by top but excluded from the merge to
	// reduce write amplification (see fillBottomTables).
	skippedTbls []table.Table

	thisRange keyRange
	nextRange keyRange

	topSize     int64
	topLeftIdx  int
	topRightIdx int
	botSize     int64
	botLeftIdx  int
	botRightIdx int
}

// String implements fmt.Stringer for compaction logging.
func (cd *compactDef) String() string {
	return fmt.Sprintf("%d top:[%d:%d](%d), bot:[%d:%d](%d), skip:%d, write_amp:%.2f",
		cd.thisLevel.level, cd.topLeftIdx, cd.topRightIdx, cd.topSize,
		cd.botLeftIdx, cd.botRightIdx, cd.botSize, len(cd.skippedTbls), float64(cd.topSize+cd.botSize)/float64(cd.topSize))
}

// lockLevels read-locks both levels; unlockLevels releases in reverse order.
func (cd *compactDef) lockLevels() {
	cd.thisLevel.RLock()
	cd.nextLevel.RLock()
}

func (cd *compactDef) unlockLevels() {
	cd.nextLevel.RUnlock()
	cd.thisLevel.RUnlock()
}

// smallest returns the smaller of the two levels' left range boundaries.
func (cd *compactDef) smallest() y.Key {
	if len(cd.bot) > 0 && cd.nextRange.left.Compare(cd.thisRange.left) < 0 {
		return cd.nextRange.left
	}
	return cd.thisRange.left
}

// biggest returns the larger of the two levels' right range boundaries.
func (cd *compactDef) biggest() y.Key {
	if len(cd.bot) > 0 && cd.nextRange.right.Compare(cd.thisRange.right) > 0 {
		return cd.nextRange.right
	}
	return cd.thisRange.right
}

// markTablesCompacting flags every table involved (top, bot and skipped) so
// other compaction pickers leave them alone.
func (cd *compactDef) markTablesCompacting() {
	for _, tbl := range cd.top {
		tbl.MarkCompacting(true)
	}
	for _, tbl := range cd.bot {
		tbl.MarkCompacting(true)
	}
	for _, tbl := range cd.skippedTbls {
		tbl.MarkCompacting(true)
	}
}

// fillTablesL0 selects all of level 0 as the top of the compaction plus the
// overlapping tables of the next level as the bottom. Returns false if level 0
// is empty or the ranges are already being compacted.
func (lc *levelsController) fillTablesL0(cd *compactDef) bool {
	cd.lockLevels()
	defer cd.unlockLevels()

	if len(cd.thisLevel.tables) == 0 {
		return false
	}

	cd.top = make([]table.Table, len(cd.thisLevel.tables))
	copy(cd.top, cd.thisLevel.tables)
	for _, t := range cd.top {
		cd.topSize += t.Size()
	}
	cd.topRightIdx = len(cd.top)

	// L0 tables overlap arbitrarily, so reserve the whole key space.
	cd.thisRange = infRange

	kr := getKeyRange(cd.top)
	left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, kr)
	overlappingTables := cd.nextLevel.tables[left:right]
	cd.botLeftIdx = left
	cd.botRightIdx = right
	lc.fillBottomTables(cd, overlappingTables)
	for _, t := range cd.bot {
		cd.botSize += t.Size()
	}

	if len(overlappingTables) == 0 { // the bottom-most level
		cd.nextRange = kr
	} else {
		cd.nextRange = getKeyRange(overlappingTables)
	}

	if !lc.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
		return false
	}

	return true
}

const minSkippedTableSize = 1024 * 1024

// fillBottomTables partitions the overlapping next-level tables into cd.bot
// (must be merged) and cd.skippedTbls (large tables no top table overlaps, so
// they can be left in place).
func (lc *levelsController) fillBottomTables(cd *compactDef, overlappingTables []table.Table) {
	for _, t := range overlappingTables {
		// If none of the top tables contains the range in an overlapping bottom table,
		// we can skip it during compaction to reduce write amplification.
		var added bool
		for _, topTbl := range cd.top {
			if topTbl.HasOverlap(t.Smallest(), t.Biggest(), true) {
				cd.bot = append(cd.bot, t)
				added = true
				break
			}
		}
		if !added {
			if t.Size() >= minSkippedTableSize {
				// We need to limit the minimum size of the table to be skipped,
				// otherwise the number of tables in a level will keep growing
				// until we meet too many open files error.
				cd.skippedTbls = append(cd.skippedTbls, t)
			} else {
				cd.bot = append(cd.bot, t)
			}
		}
	}
}

const maxCompactionExpandSize = 1 << 30 // 1GB

// fillTables picks tables for a non-L0 compaction: first the single top table
// with the best top/bottom size ratio, then greedily expands left and right
// while the ratio does not decrease and the total stays under
// maxCompactionExpandSize. Returns false if nothing compactable was found or
// the chosen ranges are already being compacted.
func (lc *levelsController) fillTables(cd *compactDef) bool {
	cd.lockLevels()
	defer cd.unlockLevels()

	if len(cd.thisLevel.tables) == 0 {
		return false
	}
	this := make([]table.Table, len(cd.thisLevel.tables))
	copy(this, cd.thisLevel.tables)
	next := make([]table.Table, len(cd.nextLevel.tables))
	copy(next, cd.nextLevel.tables)

	// First pick one table has max topSize/bottomSize ratio.
	var candidateRatio float64
	for i, t := range this {
		if lc.isCompacting(cd.thisLevel.level, t) {
			continue
		}
		left, right := getTablesInRange(next, t.Smallest(), t.Biggest())
		if lc.isCompacting(cd.nextLevel.level, next[left:right]...) {
			continue
		}
		botSize := sumTableSize(next[left:right])
		ratio := calcRatio(t.Size(), botSize)
		if ratio > candidateRatio {
			candidateRatio = ratio
			cd.topLeftIdx = i
			cd.topRightIdx = i + 1
			// Three-index slice so later appends cannot stomp `this`.
			cd.top = this[cd.topLeftIdx:cd.topRightIdx:cd.topRightIdx]
			cd.topSize = t.Size()
			cd.botLeftIdx = left
			cd.botRightIdx = right
			cd.botSize = botSize
		}
	}
	if len(cd.top) == 0 {
		return false
	}
	bots := next[cd.botLeftIdx:cd.botRightIdx:cd.botRightIdx]
	// Expand to left to include more tops as long as the ratio doesn't decrease and the total size
	// do not exceeds maxCompactionExpandSize.
	for i := cd.topLeftIdx - 1; i >= 0; i-- {
		t := this[i]
		if lc.isCompacting(cd.thisLevel.level, t) {
			break
		}
		left, right := getTablesInRange(next, t.Smallest(), t.Biggest())
		if right < cd.botLeftIdx {
			// A bottom table is skipped, we can compact in another run.
			break
		}
		if lc.isCompacting(cd.nextLevel.level, next[left:cd.botLeftIdx]...) {
			break
		}
		newTopSize := t.Size() + cd.topSize
		newBotSize := sumTableSize(next[left:cd.botLeftIdx]) + cd.botSize
		newRatio := calcRatio(newTopSize, newBotSize)
		if newRatio > candidateRatio && (newTopSize+newBotSize) < maxCompactionExpandSize {
			cd.top = append([]table.Table{t}, cd.top...)
			cd.topLeftIdx--
			bots = append(next[left:cd.botLeftIdx:cd.botLeftIdx], bots...)
			cd.botLeftIdx = left
			cd.topSize = newTopSize
			cd.botSize = newBotSize
		} else {
			break
		}
	}
	// Expand to right to include more tops as long as the ratio doesn't decrease and the total size
	// do not exceeds maxCompactionExpandSize.
	for i := cd.topRightIdx; i < len(this); i++ {
		t := this[i]
		if lc.isCompacting(cd.thisLevel.level, t) {
			break
		}
		left, right := getTablesInRange(next, t.Smallest(), t.Biggest())
		if left > cd.botRightIdx {
			// A bottom table is skipped, we can compact in another run.
			break
		}
		if lc.isCompacting(cd.nextLevel.level, next[cd.botRightIdx:right]...) {
			break
		}
		newTopSize := t.Size() + cd.topSize
		newBotSize := sumTableSize(next[cd.botRightIdx:right]) + cd.botSize
		newRatio := calcRatio(newTopSize, newBotSize)
		if newRatio > candidateRatio && (newTopSize+newBotSize) < maxCompactionExpandSize {
			cd.top = append(cd.top, t)
			cd.topRightIdx++
			bots = append(bots, next[cd.botRightIdx:right]...)
			cd.botRightIdx = right
			cd.topSize = newTopSize
			cd.botSize = newBotSize
		} else {
			break
		}
	}
	cd.thisRange = keyRange{left: cd.top[0].Smallest(), right: cd.top[len(cd.top)-1].Biggest()}
	if len(bots) > 0 {
		cd.nextRange = keyRange{left: bots[0].Smallest(), right: bots[len(bots)-1].Biggest()}
	} else {
		cd.nextRange = cd.thisRange
	}
	lc.fillBottomTables(cd, bots)
	// Skipped tables were counted in botSize while expanding; back them out.
	for _, t := range cd.skippedTbls {
		cd.botSize -= t.Size()
	}
	return lc.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd)
}

// sumTableSize returns the total on-disk size of the given tables.
func sumTableSize(tables []table.Table) int64 {
	var size int64
	for _, t := range tables {
		size += t.Size()
	}
	return size
}

// calcRatio returns topSize/botSize, treating an empty bottom as ratio topSize.
func calcRatio(topSize, botSize int64) float64 {
	if botSize == 0 {
		return float64(topSize)
	}
	return float64(topSize) / float64(botSize)
}

// isCompacting reports whether the key range spanned by the given (sorted)
// tables overlaps an in-flight compaction on the given level.
func (lc *levelsController) isCompacting(level int, tables ...table.Table) bool {
	if len(tables) == 0 {
		return false
	}
	kr := keyRange{
		left:  tables[0].Smallest(),
		right: tables[len(tables)-1].Biggest(),
	}
	y.Assert(!kr.left.IsEmpty())
	y.Assert(!kr.right.IsEmpty())
	return lc.cstatus.overlapsWith(level, kr)
}

// runCompactDef executes a filled compactDef: either moves the top tables
// down unchanged (no bottom/skipped tables involved) or merges them via
// compactBuildTables, then records the changes in the manifest and installs
// the new tables into the level handlers.
func (lc *levelsController) runCompactDef(l int, cd *compactDef, limiter *rate.Limiter, guard *epoch.Guard) error {
	timeStart := time.Now()

	thisLevel := cd.thisLevel
	nextLevel := cd.nextLevel

	var newTables []table.Table
	var changeSet protos.ManifestChangeSet
	var topMove bool
	defer func() {
		// Clear the compacting flag whether we succeeded or failed.
		for _, tbl := range newTables {
			tbl.MarkCompacting(false)
		}
		for _, tbl := range cd.skippedTbls {
			tbl.MarkCompacting(false)
		}
	}()

	if l > 0 && len(cd.bot) == 0 && len(cd.skippedTbls) == 0 {
		// Move-down fast path. Skip level 0, since its tables may have many
		// overlaps with each other.
		newTables = cd.top
		changeSet = protos.ManifestChangeSet{}
		for _, t := range newTables {
			changeSet.Changes = append(changeSet.Changes, newMoveDownChange(t.ID(), cd.nextLevel.level))
		}
		topMove = true
	} else {
		var err error
		newTables, err = lc.compactBuildTables(l, cd, limiter, nil)
		if err != nil {
			return err
		}
		changeSet = buildChangeSet(cd, newTables)
	}

	// We write to the manifest _before_ we delete files (and after we created files)
	if err := lc.kv.manifest.addChanges(changeSet.Changes, nil); err != nil {
		return err
	}

	// See comment earlier in this function about the ordering of these ops, and the order in which
	// we access levels when reading.
	nextLevel.replaceTables(newTables, cd, guard)
	thisLevel.deleteTables(cd.top, guard, topMove)

	// Note: For level 0, while doCompact is running, it is possible that new tables are added.
	// However, the tables are added only to the end, so it is ok to just delete the first table.

	log.Info("compaction done",
		zap.Stringer("def", cd), zap.Int("deleted", len(cd.top)+len(cd.bot)), zap.Int("added", len(newTables)),
		zap.Duration("duration", time.Since(timeStart)))
	return nil
}

// doCompact picks some table on level l and compacts it away to the next level.
// Returns (true, nil) on success, (false, nil) when nothing could be picked,
// and (false, err) on failure.
func (lc *levelsController) doCompact(p compactionPriority, guard *epoch.Guard) (bool, error) {
	l := p.level
	y.Assert(l+1 < lc.kv.opt.TableBuilderOptions.MaxLevels) // Sanity check.

	cd := &compactDef{
		thisLevel: lc.levels[l],
		nextLevel: lc.levels[l+1],
	}

	log.Info("start compaction", zap.Int("level", p.level), zap.Float64("score", p.score))

	// While picking tables to be compacted, both levels' tables are expected to
	// remain unchanged.
	if l == 0 {
		if !lc.fillTablesL0(cd) {
			log.Info("build compaction fill tables failed", zap.Int("level", l))
			return false, nil
		}
	} else {
		if !lc.fillTables(cd) {
			log.Info("build compaction fill tables failed", zap.Int("level", l))
			return false, nil
		}
	}
	defer lc.cstatus.delete(cd) // Remove the ranges from compaction status.

	log.Info("running compaction", zap.Stringer("def", cd))
	if err := lc.runCompactDef(l, cd, lc.kv.limiter, guard); err != nil {
		// This compaction couldn't be done successfully.
		log.Info("compact failed", zap.Stringer("def", cd), zap.Error(err))
		return false, err
	}

	log.Info("compaction done", zap.Int("level", cd.thisLevel.level))
	return true, nil
}

// addLevel0Table registers a freshly flushed table in the manifest and adds it
// to level 0, stalling (with progress logging) while level 0 is full.
func (lc *levelsController) addLevel0Table(t table.Table, head *protos.HeadInfo) error {
	// We update the manifest _before_ the table becomes part of a levelHandler, because at that
	// point it could get used in some compaction. This ensures the manifest file gets updated in
	// the proper order. (That means this update happens before that of some compaction which
	// deletes the table.)
	err := lc.kv.manifest.addChanges([]*protos.ManifestChange{
		newCreateChange(t.ID(), 0),
	}, head)
	if err != nil {
		return err
	}

	for !lc.levels[0].tryAddLevel0Table(t) {
		// Stall. Make sure all levels are healthy before we unstall.
		var timeStart time.Time
		{
			log.Warn("STALLED STALLED STALLED", zap.Duration("duration", time.Since(lastUnstalled)))
			lc.cstatus.RLock()
			for i := 0; i < lc.kv.opt.TableBuilderOptions.MaxLevels; i++ {
				log.Warn("dump level status", zap.Int("level", i), zap.String("status", lc.cstatus.levels[i].debug()),
					zap.Int64("size", lc.levels[i].getTotalSize()))
			}
			lc.cstatus.RUnlock()
			timeStart = time.Now()
		}
		// Before we unstall, we need to make sure that level 0 is healthy. Otherwise, we
		// will very quickly fill up level 0 again.
		for i := 0; ; i++ {
			// It's crucial that this behavior replicates pickCompactLevels' behavior in
			// computing compactability in order to guarantee progress.
			// Break the loop once L0 has enough space to accommodate new tables.
			if !lc.isL0Compactable() {
				break
			}
			time.Sleep(10 * time.Millisecond)
			if i%100 == 0 {
				prios := lc.pickCompactLevels()
				log.S().Warnf("waiting to add level 0 table, %+v", prios)
				i = 0
			}
		}
		log.Info("UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED", zap.Duration("duration", time.Since(timeStart)))
		lastUnstalled = time.Now()
	}

	return nil
}

// close shuts down all level handlers.
func (s *levelsController) close() error {
	err := s.cleanupLevels()
	return errors.Wrap(err, "levelsController.Close")
}

// get returns the found value if any. If not found, we return nil.
func (s *levelsController) get(key y.Key, keyHash uint64) y.ValueStruct {
	// It's important that we iterate the levels from 0 on upward. The reason is, if we iterated
	// in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could
	// read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do
	// parallelize this, we will need to call the h.RLock() function by increasing order of level
	// number.)
	start := time.Now()
	defer s.kv.metrics.LSMGetDuration.Observe(time.Since(start).Seconds())
	for _, h := range s.levels {
		vs := h.get(key, keyHash) // Calls h.RLock() and h.RUnlock().
		if vs.Valid() {
			return vs
		}
	}
	return y.ValueStruct{}
}

// multiGet resolves a batch of key lookups level by level (same ordering
// argument as get).
func (s *levelsController) multiGet(pairs []keyValuePair) {
	start := time.Now()
	for _, h := range s.levels {
		h.multiGet(pairs)
	}
	s.kv.metrics.LSMMultiGetDuration.Observe(time.Since(start).Seconds())
}

// appendIteratorsReversed appends one iterator per table, newest table first,
// so that MergeIterator prefers newer L0 tables.
func appendIteratorsReversed(out []y.Iterator, th []table.Table, reversed bool) []y.Iterator {
	for i := len(th) - 1; i >= 0; i-- {
		// This will increment the reference of the table handler.
		out = append(out, table.NewConcatIterator(th[i:i+1], reversed))
	}
	return out
}

// appendIterators appends iterators to an array of iterators, for merging.
// Note: This obtains references for the table handlers. Remember to close these iterators.
func (s *levelsController) appendIterators(
	iters []y.Iterator, opts *IteratorOptions) []y.Iterator {
	// Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing
	// data when there's a compaction.
	for _, level := range s.levels {
		iters = level.appendIterators(iters, opts)
	}
	return iters
}

// TableInfo is a point-in-time summary of one SSTable for introspection.
type TableInfo struct {
	ID    uint64
	Level int
	Left  []byte
	Right []byte
}

// getTableInfo snapshots every table in every level, sorted by level then id.
func (lc *levelsController) getTableInfo() (result []TableInfo) {
	for _, l := range lc.levels {
		for _, t := range l.tables {
			info := TableInfo{
				ID:    t.ID(),
				Level: l.level,
				Left:  t.Smallest().UserKey,
				Right: t.Biggest().UserKey,
			}
			result = append(result, info)
		}
	}
	sort.Slice(result, func(i, j int) bool {
		if result[i].Level != result[j].Level {
			return result[i].Level < result[j].Level
		}
		return result[i].ID < result[j].ID
	})
	return
}