github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/levels.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package badger 18 19 import ( 20 "bytes" 21 "fmt" 22 "os" 23 "sort" 24 "time" 25 26 "github.com/pingcap/badger/directio" 27 "github.com/pingcap/badger/epoch" 28 "github.com/pingcap/badger/options" 29 "github.com/pingcap/badger/protos" 30 "github.com/pingcap/badger/table" 31 "github.com/pingcap/badger/table/sstable" 32 "github.com/pingcap/badger/y" 33 "github.com/pingcap/errors" 34 "github.com/pingcap/log" 35 "go.uber.org/zap" 36 ) 37 38 type levelsController struct { 39 nextFileID uint64 // Atomic 40 41 // The following are initialized once and const. 42 resourceMgr *epoch.ResourceManager 43 levels []*levelHandler 44 kv *DB 45 46 cstatus compactStatus 47 48 opt options.TableBuilderOptions 49 } 50 51 var ( 52 // This is for getting timings between stalls. 53 lastUnstalled time.Time 54 ) 55 56 // revertToManifest checks that all necessary table files exist and removes all table files not 57 // referenced by the manifest. idMap is a set of table file id's that were read from the directory 58 // listing. 59 func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error { 60 // 1. Check all files in manifest exist. 61 for id := range mf.Tables { 62 if _, ok := idMap[id]; !ok { 63 return fmt.Errorf("file does not exist for table %d", id) 64 } 65 } 66 67 // 2. Delete files that shouldn't exist. 68 for id := range idMap { 69 if _, ok := mf.Tables[id]; !ok { 70 log.Info("table file not referenced in MANIFEST", zap.Uint64("id", id)) 71 filename := sstable.NewFilename(id, kv.opt.Dir) 72 if err := os.Remove(filename); err != nil { 73 return y.Wrapf(err, "While removing table %d", id) 74 } 75 } 76 } 77 78 return nil 79 } 80 81 func newLevelsController(kv *DB, mf *Manifest, mgr *epoch.ResourceManager, opt options.TableBuilderOptions) (*levelsController, error) { 82 y.Assert(kv.opt.NumLevelZeroTablesStall > kv.opt.NumLevelZeroTables) 83 s := &levelsController{ 84 kv: kv, 85 levels: make([]*levelHandler, kv.opt.TableBuilderOptions.MaxLevels), 86 opt: opt, 87 resourceMgr: mgr, 88 } 89 s.cstatus.levels = make([]*levelCompactStatus, kv.opt.TableBuilderOptions.MaxLevels) 90 91 for i := 0; i < kv.opt.TableBuilderOptions.MaxLevels; i++ { 92 s.levels[i] = newLevelHandler(kv, i) 93 if i == 0 { 94 // Do nothing. 95 } else if i == 1 { 96 // Level 1 probably shouldn't be too much bigger than level 0. 97 s.levels[i].maxTotalSize = kv.opt.LevelOneSize 98 } else { 99 s.levels[i].maxTotalSize = s.levels[i-1].maxTotalSize * int64(kv.opt.TableBuilderOptions.LevelSizeMultiplier) 100 } 101 s.cstatus.levels[i] = new(levelCompactStatus) 102 } 103 104 // Compare manifest against directory, check for existent/non-existent files, and remove. 105 if err := revertToManifest(kv, mf, getIDMap(kv.opt.Dir)); err != nil { 106 return nil, err 107 } 108 109 // Some files may be deleted. Let's reload. 110 tables := make([][]table.Table, kv.opt.TableBuilderOptions.MaxLevels) 111 var maxFileID uint64 112 for fileID, tableManifest := range mf.Tables { 113 fname := sstable.NewFilename(fileID, kv.opt.Dir) 114 var flags uint32 = y.Sync 115 if kv.opt.ReadOnly { 116 flags |= y.ReadOnly 117 } 118 119 t, err := sstable.OpenTable(fname, kv.blockCache, kv.indexCache) 120 if err != nil { 121 closeAllTables(tables) 122 return nil, errors.Wrapf(err, "Opening table: %q", fname) 123 } 124 125 level := tableManifest.Level 126 tables[level] = append(tables[level], t) 127 128 if fileID > maxFileID { 129 maxFileID = fileID 130 } 131 } 132 s.nextFileID = maxFileID + 1 133 for i, tbls := range tables { 134 s.levels[i].initTables(tbls) 135 } 136 137 // Make sure key ranges do not overlap etc. 138 if err := s.validate(); err != nil { 139 _ = s.cleanupLevels() 140 return nil, errors.Wrap(err, "Level validation") 141 } 142 143 // Sync directory (because we have at least removed some files, or previously created the 144 // manifest file). 145 if err := syncDir(kv.opt.Dir); err != nil { 146 _ = s.close() 147 return nil, err 148 } 149 150 return s, nil 151 } 152 153 // Closes the tables, for cleanup in newLevelsController. (We Close() instead of using DecrRef() 154 // because that would delete the underlying files.) We ignore errors, which is OK because tables 155 // are read-only. 156 func closeAllTables(tables [][]table.Table) { 157 for _, tableSlice := range tables { 158 for _, table := range tableSlice { 159 _ = table.Close() 160 } 161 } 162 } 163 164 func (lc *levelsController) cleanupLevels() error { 165 var firstErr error 166 for _, l := range lc.levels { 167 if err := l.close(); err != nil && firstErr == nil { 168 firstErr = err 169 } 170 } 171 return firstErr 172 } 173 174 func (lc *levelsController) startCompact(c *y.Closer) { 175 n := lc.kv.opt.NumCompactors 176 c.AddRunning(n - 1) 177 for i := 0; i < n; i++ { 178 // The first half compaction workers take level as priority, others take score 179 // as priority. 180 go lc.runWorker(c, i*2 >= n) 181 } 182 } 183 184 func (lc *levelsController) runWorker(c *y.Closer, scorePriority bool) { 185 defer c.Done() 186 if lc.kv.opt.DoNotCompact { 187 return 188 } 189 190 for { 191 guard := lc.resourceMgr.Acquire() 192 prios := lc.pickCompactLevels() 193 if scorePriority { 194 sort.Slice(prios, func(i, j int) bool { 195 return prios[i].score > prios[j].score 196 }) 197 } 198 var didCompact bool 199 for _, p := range prios { 200 // TODO: Handle error. 201 didCompact, _ = lc.doCompact(p, guard) 202 if didCompact { 203 break 204 } 205 } 206 guard.Done() 207 waitDur := time.Second * 3 208 if didCompact { 209 waitDur /= 10 210 } 211 timer := time.NewTimer(waitDur) 212 select { 213 case <-c.HasBeenClosed(): 214 timer.Stop() 215 return 216 case <-timer.C: 217 } 218 } 219 } 220 221 // Returns true if level zero may be compacted, without accounting for compactions that already 222 // might be happening. 223 func (lc *levelsController) isL0Compactable() bool { 224 return lc.levels[0].numTables() >= lc.kv.opt.NumLevelZeroTables 225 } 226 227 // Returns true if the non-zero level may be compacted. deltaSize provides the size of the tables 228 // which are currently being compacted so that we treat them as already having started being 229 // compacted (because they have been, yet their size is already counted in getTotalSize). 230 func (l *levelHandler) isCompactable(deltaSize int64) bool { 231 return l.getTotalSize() >= l.maxTotalSize+deltaSize 232 } 233 234 type compactionPriority struct { 235 level int 236 score float64 237 } 238 239 // pickCompactLevel determines which level to compact. 240 // Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction 241 func (lc *levelsController) pickCompactLevels() (prios []compactionPriority) { 242 // This function must use identical criteria for guaranteeing compaction's progress that 243 // addLevel0Table uses. 244 245 // cstatus is checked to see if level 0's tables are already being compacted 246 if !lc.cstatus.overlapsWith(0, infRange) && lc.isL0Compactable() { 247 pri := compactionPriority{ 248 level: 0, 249 score: float64(lc.levels[0].numTables()) / float64(lc.kv.opt.NumLevelZeroTables), 250 } 251 prios = append(prios, pri) 252 } 253 254 // now calcalute scores from level 1 255 for levelNum := 1; levelNum < len(lc.levels); levelNum++ { 256 // Don't consider those tables that are already being compacted right now. 257 deltaSize := lc.cstatus.deltaSize(levelNum) 258 259 l := lc.levels[levelNum] 260 if l.isCompactable(deltaSize) { 261 pri := compactionPriority{ 262 level: levelNum, 263 score: float64(l.getTotalSize()-deltaSize) / float64(l.maxTotalSize), 264 } 265 prios = append(prios, pri) 266 } 267 } 268 // We used to sort compaction priorities based on the score. But, we 269 // decided to compact based on the level, not the priority. So, upper 270 // levels (level 0, level 1, etc) always get compacted first, before the 271 // lower levels -- this allows us to avoid stalls. 272 return prios 273 } 274 275 func (lc *levelsController) setHasOverlapTable(cd *CompactDef) { 276 if cd.moveDown() { 277 return 278 } 279 kr := getKeyRange(cd.Top) 280 for i := cd.Level + 2; i < len(lc.levels); i++ { 281 lh := lc.levels[i] 282 lh.RLock() 283 left, right := lh.overlappingTables(levelHandlerRLocked{}, kr) 284 lh.RUnlock() 285 if right-left > 0 { 286 cd.HasOverlap = true 287 return 288 } 289 } 290 return 291 } 292 293 type DiscardStats struct { 294 numSkips int64 295 skippedBytes int64 296 ptrs []blobPointer 297 } 298 299 func (ds *DiscardStats) collect(vs y.ValueStruct) { 300 if vs.Meta&bitValuePointer > 0 { 301 var bp blobPointer 302 bp.decode(vs.Value) 303 ds.ptrs = append(ds.ptrs, bp) 304 ds.skippedBytes += int64(bp.length) 305 } 306 ds.numSkips++ 307 } 308 309 func (ds *DiscardStats) String() string { 310 return fmt.Sprintf("numSkips:%d, skippedBytes:%d", ds.numSkips, ds.skippedBytes) 311 } 312 313 func shouldFinishFile(key, lastKey y.Key, guard *Guard, currentSize, maxSize int64) bool { 314 if lastKey.IsEmpty() { 315 return false 316 } 317 if guard != nil { 318 if !bytes.HasPrefix(key.UserKey, guard.Prefix) { 319 return true 320 } 321 if !matchGuard(key.UserKey, lastKey.UserKey, guard) { 322 if maxSize > guard.MinSize { 323 maxSize = guard.MinSize 324 } 325 } 326 } 327 return currentSize > maxSize 328 } 329 330 func matchGuard(key, lastKey []byte, guard *Guard) bool { 331 if len(lastKey) < guard.MatchLen { 332 return false 333 } 334 return bytes.HasPrefix(key, lastKey[:guard.MatchLen]) 335 } 336 337 func searchGuard(key []byte, guards []Guard) *Guard { 338 var maxMatchGuard *Guard 339 for i := range guards { 340 guard := &guards[i] 341 if bytes.HasPrefix(key, guard.Prefix) { 342 if maxMatchGuard == nil || len(guard.Prefix) > len(maxMatchGuard.Prefix) { 343 maxMatchGuard = guard 344 } 345 } 346 } 347 return maxMatchGuard 348 } 349 350 func overSkipTables(key y.Key, skippedTables []table.Table) (newSkippedTables []table.Table, over bool) { 351 var i int 352 for i < len(skippedTables) { 353 t := skippedTables[i] 354 if key.Compare(t.Biggest()) > 0 { 355 i++ 356 } else { 357 break 358 } 359 } 360 return skippedTables[i:], i > 0 361 } 362 363 func (lc *levelsController) prepareCompactionDef(cd *CompactDef) { 364 // Pick up the currently pending transactions' min readTs, so we can discard versions below this 365 // readTs. We should never discard any versions starting from above this timestamp, because that 366 // would affect the snapshot view guarantee provided by transactions. 367 cd.SafeTS = lc.kv.getCompactSafeTs() 368 if lc.kv.opt.CompactionFilterFactory != nil { 369 cd.Filter = lc.kv.opt.CompactionFilterFactory(cd.Level+1, cd.smallest().UserKey, cd.biggest().UserKey) 370 cd.Guards = cd.Filter.Guards() 371 } 372 cd.Opt = lc.opt 373 cd.Dir = lc.kv.opt.Dir 374 cd.AllocIDFunc = lc.reserveFileID 375 cd.Limiter = lc.kv.limiter 376 } 377 378 func (lc *levelsController) getCompactor(cd *CompactDef) compactor { 379 if len(cd.SkippedTbls) > 0 || lc.kv.opt.RemoteCompactionAddr == "" || lc.kv.opt.ValueThreshold > 0 { 380 return &localCompactor{} 381 } 382 return &remoteCompactor{ 383 remoteAddr: lc.kv.opt.RemoteCompactionAddr, 384 } 385 } 386 387 // compactBuildTables merge topTables and botTables to form a list of new tables. 388 func (lc *levelsController) compactBuildTables(cd *CompactDef) (newTables []table.Table, err error) { 389 390 // Try to collect stats so that we can inform value log about GC. That would help us find which 391 // value log file should be GCed. 392 lc.prepareCompactionDef(cd) 393 stats := &y.CompactionStats{} 394 discardStats := &DiscardStats{} 395 buildResults, err := lc.getCompactor(cd).compact(cd, stats, discardStats) 396 if err != nil { 397 return nil, err 398 } 399 newTables, err = lc.openTables(buildResults) 400 if err != nil { 401 return nil, err 402 } 403 lc.handleStats(cd.Level+1, stats, discardStats) 404 return 405 } 406 407 // CompactTables compacts tables in CompactDef and returns the file names. 408 func CompactTables(cd *CompactDef, stats *y.CompactionStats, discardStats *DiscardStats) ([]*sstable.BuildResult, error) { 409 var buildResults []*sstable.BuildResult 410 it := cd.buildIterator() 411 defer it.Close() 412 413 skippedTbls := cd.SkippedTbls 414 splitHints := cd.splitHints 415 416 var lastKey, skipKey y.Key 417 var builder *sstable.Builder 418 for it.Valid() { 419 var fd *os.File 420 if !cd.InMemory { 421 fileID := cd.AllocIDFunc() 422 filename := sstable.NewFilename(fileID, cd.Dir) 423 var err error 424 fd, err = directio.OpenFile(filename, os.O_CREATE|os.O_RDWR, 0666) 425 if err != nil { 426 return nil, err 427 } 428 } 429 if builder == nil { 430 builder = sstable.NewTableBuilder(fd, cd.Limiter, cd.Level+1, cd.Opt) 431 } else { 432 builder.Reset(fd) 433 } 434 lastKey.Reset() 435 guard := searchGuard(it.Key().UserKey, cd.Guards) 436 for ; it.Valid(); y.NextAllVersion(it) { 437 stats.KeysRead++ 438 vs := it.Value() 439 key := it.Key() 440 kvSize := int(vs.EncodedSize()) + key.Len() 441 stats.BytesRead += kvSize 442 // See if we need to skip this key. 443 if !skipKey.IsEmpty() { 444 if key.SameUserKey(skipKey) { 445 discardStats.collect(vs) 446 continue 447 } else { 448 skipKey.Reset() 449 } 450 } 451 if !key.SameUserKey(lastKey) { 452 // Only break if we are on a different key, and have reached capacity. We want 453 // to ensure that all versions of the key are stored in the same sstable, and 454 // not divided across multiple tables at the same level. 455 if len(skippedTbls) > 0 { 456 var over bool 457 skippedTbls, over = overSkipTables(key, skippedTbls) 458 if over && !builder.Empty() { 459 break 460 } 461 } 462 if shouldFinishFile(key, lastKey, guard, int64(builder.EstimateSize()+kvSize), cd.Opt.MaxTableSize) { 463 break 464 } 465 if len(splitHints) != 0 && key.Compare(splitHints[0]) >= 0 { 466 splitHints = splitHints[1:] 467 for len(splitHints) > 0 && key.Compare(splitHints[0]) >= 0 { 468 splitHints = splitHints[1:] 469 } 470 break 471 } 472 lastKey.Copy(key) 473 } 474 475 // Only consider the versions which are below the minReadTs, otherwise, we might end up discarding the 476 // only valid version for a running transaction. 477 if key.Version <= cd.SafeTS { 478 // key is the latest readable version of this key, so we simply discard all the rest of the versions. 479 skipKey.Copy(key) 480 481 if isDeleted(vs.Meta) { 482 // If this key range has overlap with lower levels, then keep the deletion 483 // marker with the latest version, discarding the rest. We have set skipKey, 484 // so the following key versions would be skipped. Otherwise discard the deletion marker. 485 if !cd.HasOverlap { 486 continue 487 } 488 } else if cd.Filter != nil { 489 switch cd.Filter.Filter(key.UserKey, vs.Value, vs.UserMeta) { 490 case DecisionMarkTombstone: 491 discardStats.collect(vs) 492 if cd.HasOverlap { 493 // There may have ole versions for this key, so convert to delete tombstone. 494 builder.Add(key, y.ValueStruct{Meta: bitDelete}) 495 } 496 continue 497 case DecisionDrop: 498 discardStats.collect(vs) 499 continue 500 case DecisionKeep: 501 } 502 } 503 } 504 builder.Add(key, vs) 505 stats.KeysWrite++ 506 stats.BytesWrite += kvSize 507 } 508 if builder.Empty() { 509 continue 510 } 511 result, err := builder.Finish() 512 if err != nil { 513 return nil, err 514 } 515 fd.Close() 516 buildResults = append(buildResults, result) 517 } 518 return buildResults, nil 519 } 520 521 func (lc *levelsController) openTables(buildResults []*sstable.BuildResult) (newTables []table.Table, err error) { 522 for _, result := range buildResults { 523 var tbl table.Table 524 tbl, err = sstable.OpenTable(result.FileName, lc.kv.blockCache, lc.kv.indexCache) 525 if err != nil { 526 return 527 } 528 newTables = append(newTables, tbl) 529 } 530 // Ensure created files' directory entries are visible. We don't mind the extra latency 531 // from not doing this ASAP after all file creation has finished because this is a 532 // background operation. 533 err = syncDir(lc.kv.opt.Dir) 534 if err != nil { 535 log.Error("compact sync dir error", zap.Error(err)) 536 return 537 } 538 sortTables(newTables) 539 return 540 } 541 542 func (lc *levelsController) handleStats(nexLevel int, stats *y.CompactionStats, discardStats *DiscardStats) { 543 stats.KeysDiscard = int(discardStats.numSkips) 544 stats.BytesDiscard = int(discardStats.skippedBytes) 545 lc.levels[nexLevel].metrics.UpdateCompactionStats(stats) 546 log.Info("compact send discard stats", zap.Stringer("stats", discardStats)) 547 if len(discardStats.ptrs) > 0 { 548 lc.kv.blobManger.discardCh <- discardStats 549 } 550 } 551 552 func buildChangeSet(cd *CompactDef, newTables []table.Table) protos.ManifestChangeSet { 553 changes := []*protos.ManifestChange{} 554 for _, table := range newTables { 555 changes = append(changes, 556 newCreateChange(table.ID(), cd.Level+1)) 557 } 558 for _, table := range cd.Top { 559 changes = append(changes, newDeleteChange(table.ID())) 560 } 561 for _, table := range cd.Bot { 562 changes = append(changes, newDeleteChange(table.ID())) 563 } 564 return protos.ManifestChangeSet{Changes: changes} 565 } 566 567 func sumTableSize(tables []table.Table) int64 { 568 var size int64 569 for _, t := range tables { 570 size += t.Size() 571 } 572 return size 573 } 574 575 func calcRatio(topSize, botSize int64) float64 { 576 if botSize == 0 { 577 return float64(topSize) 578 } 579 return float64(topSize) / float64(botSize) 580 } 581 582 func (lc *levelsController) runCompactDef(cd *CompactDef, guard *epoch.Guard) error { 583 timeStart := time.Now() 584 585 thisLevel := lc.levels[cd.Level] 586 nextLevel := lc.levels[cd.Level+1] 587 588 var newTables []table.Table 589 var changeSet protos.ManifestChangeSet 590 defer func() { 591 for _, tbl := range newTables { 592 tbl.MarkCompacting(false) 593 } 594 for _, tbl := range cd.SkippedTbls { 595 tbl.MarkCompacting(false) 596 } 597 }() 598 599 if cd.moveDown() { 600 // skip level 0, since it may has many table overlap with each other 601 newTables = cd.Top 602 changeSet = protos.ManifestChangeSet{} 603 for _, t := range newTables { 604 changeSet.Changes = append(changeSet.Changes, newMoveDownChange(t.ID(), cd.Level+1)) 605 } 606 } else { 607 var err error 608 newTables, err = lc.compactBuildTables(cd) 609 if err != nil { 610 return err 611 } 612 changeSet = buildChangeSet(cd, newTables) 613 } 614 615 // We write to the manifest _before_ we delete files (and after we created files) 616 if err := lc.kv.manifest.addChanges(changeSet.Changes, nil); err != nil { 617 return err 618 } 619 620 // See comment earlier in this function about the ordering of these ops, and the order in which 621 // we access levels when reading. 622 nextLevel.replaceTables(newTables, cd, guard) 623 thisLevel.deleteTables(cd.Top, guard, cd.moveDown()) 624 625 // Note: For level 0, while doCompact is running, it is possible that new tables are added. 626 // However, the tables are added only to the end, so it is ok to just delete the first table. 627 628 log.Info("compaction done", 629 zap.Stringer("def", cd), zap.Int("deleted", len(cd.Top)+len(cd.Bot)), zap.Int("added", len(newTables)), 630 zap.Duration("duration", time.Since(timeStart))) 631 return nil 632 } 633 634 // doCompact picks some table on level l and compacts it away to the next level. 635 func (lc *levelsController) doCompact(p compactionPriority, guard *epoch.Guard) (bool, error) { 636 l := p.level 637 y.Assert(l+1 < lc.kv.opt.TableBuilderOptions.MaxLevels) // Sanity check. 638 639 cd := &CompactDef{ 640 Level: l, 641 } 642 thisLevel := lc.levels[cd.Level] 643 nextLevel := lc.levels[cd.Level+1] 644 645 log.Info("start compaction", zap.Int("level", p.level), zap.Float64("score", p.score)) 646 647 // While picking tables to be compacted, both levels' tables are expected to 648 // remain unchanged. 649 if l == 0 { 650 if !cd.fillTablesL0(&lc.cstatus, thisLevel, nextLevel) { 651 log.Info("build compaction fill tables failed", zap.Int("level", l)) 652 return false, nil 653 } 654 } else { 655 if !cd.fillTables(&lc.cstatus, thisLevel, nextLevel) { 656 log.Info("build compaction fill tables failed", zap.Int("level", l)) 657 return false, nil 658 } 659 } 660 lc.setHasOverlapTable(cd) 661 defer lc.cstatus.delete(cd) // Remove the ranges from compaction status. 662 663 log.Info("running compaction", zap.Stringer("def", cd)) 664 if err := lc.runCompactDef(cd, guard); err != nil { 665 // This compaction couldn't be done successfully. 666 log.Info("compact failed", zap.Stringer("def", cd), zap.Error(err)) 667 return false, err 668 } 669 670 log.Info("compaction done", zap.Int("level", cd.Level)) 671 return true, nil 672 } 673 674 func (lc *levelsController) addLevel0Table(t table.Table, head *protos.HeadInfo) error { 675 // We update the manifest _before_ the table becomes part of a levelHandler, because at that 676 // point it could get used in some compaction. This ensures the manifest file gets updated in 677 // the proper order. (That means this update happens before that of some compaction which 678 // deletes the table.) 679 err := lc.kv.manifest.addChanges([]*protos.ManifestChange{ 680 newCreateChange(t.ID(), 0), 681 }, head) 682 if err != nil { 683 return err 684 } 685 686 for !lc.levels[0].tryAddLevel0Table(t) { 687 // Stall. Make sure all levels are healthy before we unstall. 688 var timeStart time.Time 689 { 690 log.Warn("STALLED STALLED STALLED", zap.Duration("duration", time.Since(lastUnstalled))) 691 for i := 0; i < lc.kv.opt.TableBuilderOptions.MaxLevels; i++ { 692 lc.cstatus.RLock() 693 status := lc.cstatus.levels[i].debug() 694 lc.cstatus.RUnlock() 695 log.Warn("dump level status", zap.Int("level", i), zap.String("status", status), 696 zap.Int64("size", lc.levels[i].getTotalSize())) 697 } 698 timeStart = time.Now() 699 } 700 // Before we unstall, we need to make sure that level 0 is healthy. Otherwise, we 701 // will very quickly fill up level 0 again. 702 for i := 0; ; i++ { 703 // It's crucial that this behavior replicates pickCompactLevels' behavior in 704 // computing compactability in order to guarantee progress. 705 // Break the loop once L0 has enough space to accommodate new tables. 706 if !lc.isL0Compactable() { 707 break 708 } 709 time.Sleep(10 * time.Millisecond) 710 if i%100 == 0 { 711 prios := lc.pickCompactLevels() 712 log.S().Warnf("waiting to add level 0 table, %+v", prios) 713 i = 0 714 } 715 } 716 log.Info("UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED", zap.Duration("duration", time.Since(timeStart))) 717 lastUnstalled = time.Now() 718 } 719 720 return nil 721 } 722 723 func (s *levelsController) close() error { 724 err := s.cleanupLevels() 725 return errors.Wrap(err, "levelsController.Close") 726 } 727 728 // get returns the found value if any. If not found, we return nil. 729 func (s *levelsController) get(key y.Key, keyHash uint64) y.ValueStruct { 730 // It's important that we iterate the levels from 0 on upward. The reason is, if we iterated 731 // in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could 732 // read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do 733 // parallelize this, we will need to call the h.RLock() function by increasing order of level 734 // number.) 735 start := time.Now() 736 defer s.kv.metrics.LSMGetDuration.Observe(time.Since(start).Seconds()) 737 for _, h := range s.levels { 738 vs := h.get(key, keyHash) // Calls h.RLock() and h.RUnlock(). 739 if vs.Valid() { 740 return vs 741 } 742 } 743 return y.ValueStruct{} 744 } 745 746 func (s *levelsController) multiGet(pairs []keyValuePair) { 747 start := time.Now() 748 for _, h := range s.levels { 749 h.multiGet(pairs) 750 } 751 s.kv.metrics.LSMMultiGetDuration.Observe(time.Since(start).Seconds()) 752 } 753 754 func appendIteratorsReversed(out []y.Iterator, th []table.Table, reversed bool) []y.Iterator { 755 for i := len(th) - 1; i >= 0; i-- { 756 // This will increment the reference of the table handler. 757 out = append(out, table.NewConcatIterator(th[i:i+1], reversed)) 758 } 759 return out 760 } 761 762 // appendIterators appends iterators to an array of iterators, for merging. 763 // Note: This obtains references for the table handlers. Remember to close these iterators. 764 func (s *levelsController) appendIterators( 765 iters []y.Iterator, opts *IteratorOptions) []y.Iterator { 766 // Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing 767 // data when there's a compaction. 768 for _, level := range s.levels { 769 iters = level.appendIterators(iters, opts) 770 } 771 return iters 772 } 773 774 type TableInfo struct { 775 ID uint64 776 Level int 777 Left []byte 778 Right []byte 779 } 780 781 func (lc *levelsController) getTableInfo() (result []TableInfo) { 782 for _, l := range lc.levels { 783 for _, t := range l.tables { 784 info := TableInfo{ 785 ID: t.ID(), 786 Level: l.level, 787 Left: t.Smallest().UserKey, 788 Right: t.Biggest().UserKey, 789 } 790 result = append(result, info) 791 } 792 } 793 sort.Slice(result, func(i, j int) bool { 794 if result[i].Level != result[j].Level { 795 return result[i].Level < result[j].Level 796 } 797 return result[i].ID < result[j].ID 798 }) 799 return 800 }