github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/levels.go

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package badger
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"os"
    23  	"sort"
    24  	"time"
    25  
    26  	"github.com/pingcap/badger/directio"
    27  	"github.com/pingcap/badger/epoch"
    28  	"github.com/pingcap/badger/options"
    29  	"github.com/pingcap/badger/protos"
    30  	"github.com/pingcap/badger/table"
    31  	"github.com/pingcap/badger/table/sstable"
    32  	"github.com/pingcap/badger/y"
    33  	"github.com/pingcap/errors"
    34  	"github.com/pingcap/log"
    35  	"go.uber.org/zap"
    36  )
    37  
    38  type levelsController struct {
    39  	nextFileID uint64 // Atomic
    40  
    41  	// The following are initialized once and const.
    42  	resourceMgr *epoch.ResourceManager
    43  	levels      []*levelHandler
    44  	kv          *DB
    45  
    46  	cstatus compactStatus
    47  
    48  	opt options.TableBuilderOptions
    49  }
    50  
    51  var (
    52  	// This is for getting timings between stalls.
    53  	lastUnstalled time.Time
    54  )
    55  
    56  // revertToManifest checks that all necessary table files exist and removes all table files not
    57  // referenced by the manifest. idMap is the set of table file IDs that were read from the directory
    58  // listing.
    59  func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error {
    60  	// 1. Check all files in manifest exist.
    61  	for id := range mf.Tables {
    62  		if _, ok := idMap[id]; !ok {
    63  			return fmt.Errorf("file does not exist for table %d", id)
    64  		}
    65  	}
    66  
    67  	// 2. Delete files that shouldn't exist.
    68  	for id := range idMap {
    69  		if _, ok := mf.Tables[id]; !ok {
    70  			log.Info("table file not referenced in MANIFEST", zap.Uint64("id", id))
    71  			filename := sstable.NewFilename(id, kv.opt.Dir)
    72  			if err := os.Remove(filename); err != nil {
    73  				return y.Wrapf(err, "While removing table %d", id)
    74  			}
    75  		}
    76  	}
    77  
    78  	return nil
    79  }
    80  
    81  func newLevelsController(kv *DB, mf *Manifest, mgr *epoch.ResourceManager, opt options.TableBuilderOptions) (*levelsController, error) {
    82  	y.Assert(kv.opt.NumLevelZeroTablesStall > kv.opt.NumLevelZeroTables)
    83  	s := &levelsController{
    84  		kv:          kv,
    85  		levels:      make([]*levelHandler, kv.opt.TableBuilderOptions.MaxLevels),
    86  		opt:         opt,
    87  		resourceMgr: mgr,
    88  	}
    89  	s.cstatus.levels = make([]*levelCompactStatus, kv.opt.TableBuilderOptions.MaxLevels)
    90  
    91  	for i := 0; i < kv.opt.TableBuilderOptions.MaxLevels; i++ {
    92  		s.levels[i] = newLevelHandler(kv, i)
    93  		if i == 0 {
    94  			// Do nothing.
    95  		} else if i == 1 {
    96  			// Level 1 probably shouldn't be too much bigger than level 0.
    97  			s.levels[i].maxTotalSize = kv.opt.LevelOneSize
    98  		} else {
    99  			s.levels[i].maxTotalSize = s.levels[i-1].maxTotalSize * int64(kv.opt.TableBuilderOptions.LevelSizeMultiplier)
   100  		}
   101  		s.cstatus.levels[i] = new(levelCompactStatus)
   102  	}
   103  
   104  	// Compare the manifest against the directory: check that referenced files exist and remove files not referenced by the manifest.
   105  	if err := revertToManifest(kv, mf, getIDMap(kv.opt.Dir)); err != nil {
   106  		return nil, err
   107  	}
   108  
   109  	// Some files may have been deleted. Let's reload.
   110  	tables := make([][]table.Table, kv.opt.TableBuilderOptions.MaxLevels)
   111  	var maxFileID uint64
   112  	for fileID, tableManifest := range mf.Tables {
   113  		fname := sstable.NewFilename(fileID, kv.opt.Dir)
   114  		var flags uint32 = y.Sync
   115  		if kv.opt.ReadOnly {
   116  			flags |= y.ReadOnly
   117  		}
   118  
   119  		t, err := sstable.OpenTable(fname, kv.blockCache, kv.indexCache)
   120  		if err != nil {
   121  			closeAllTables(tables)
   122  			return nil, errors.Wrapf(err, "Opening table: %q", fname)
   123  		}
   124  
   125  		level := tableManifest.Level
   126  		tables[level] = append(tables[level], t)
   127  
   128  		if fileID > maxFileID {
   129  			maxFileID = fileID
   130  		}
   131  	}
   132  	s.nextFileID = maxFileID + 1
   133  	for i, tbls := range tables {
   134  		s.levels[i].initTables(tbls)
   135  	}
   136  
   137  	// Make sure key ranges do not overlap etc.
   138  	if err := s.validate(); err != nil {
   139  		_ = s.cleanupLevels()
   140  		return nil, errors.Wrap(err, "Level validation")
   141  	}
   142  
   143  	// Sync directory (because we have at least removed some files, or previously created the
   144  	// manifest file).
   145  	if err := syncDir(kv.opt.Dir); err != nil {
   146  		_ = s.close()
   147  		return nil, err
   148  	}
   149  
   150  	return s, nil
   151  }
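
// exampleLevelMaxSizes is an illustrative sketch, not part of badger's API: the function name and
// its parameters are invented here to show how the loop above derives the per-level size limits.
// Level 1 is capped at LevelOneSize and every deeper level is LevelSizeMultiplier times larger than
// the one above it; level 0 is limited by table count rather than by size.
func exampleLevelMaxSizes(levelOneSize int64, multiplier, maxLevels int) []int64 {
	sizes := make([]int64, maxLevels)
	for i := 1; i < maxLevels; i++ {
		if i == 1 {
			sizes[i] = levelOneSize
			continue
		}
		sizes[i] = sizes[i-1] * int64(multiplier)
	}
	// With levelOneSize = 256 MB and multiplier = 10 (illustrative values only), this yields
	// 256 MB, 2.5 GB, 25 GB, ... for levels 1, 2, 3, ...
	return sizes
}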
   152  
   153  // Closes the tables, for cleanup in newLevelsController.  (We Close() instead of using DecrRef()
   154  // because that would delete the underlying files.)  We ignore errors, which is OK because tables
   155  // are read-only.
   156  func closeAllTables(tables [][]table.Table) {
   157  	for _, tableSlice := range tables {
   158  		for _, table := range tableSlice {
   159  			_ = table.Close()
   160  		}
   161  	}
   162  }
   163  
   164  func (lc *levelsController) cleanupLevels() error {
   165  	var firstErr error
   166  	for _, l := range lc.levels {
   167  		if err := l.close(); err != nil && firstErr == nil {
   168  			firstErr = err
   169  		}
   170  	}
   171  	return firstErr
   172  }
   173  
   174  func (lc *levelsController) startCompact(c *y.Closer) {
   175  	n := lc.kv.opt.NumCompactors
   176  	c.AddRunning(n - 1)
   177  	for i := 0; i < n; i++ {
   178  		// The first half of the compaction workers prioritize candidates by level; the
   179  		// rest prioritize by score.
   180  		go lc.runWorker(c, i*2 >= n)
   181  	}
   182  }
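
// exampleWorkerSplit is an illustrative sketch (the function is not part of badger) of the split
// used by startCompact/runWorker above: workers whose index satisfies i*2 >= n re-sort compaction
// candidates by score, while the rest keep the level order returned by pickCompactLevels.
func exampleWorkerSplit(n int) (levelOrdered, scoreOrdered []int) {
	for i := 0; i < n; i++ {
		if i*2 >= n {
			scoreOrdered = append(scoreOrdered, i)
		} else {
			levelOrdered = append(levelOrdered, i)
		}
	}
	// For n = 4 this gives levelOrdered = [0 1] and scoreOrdered = [2 3].
	return
}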
   183  
   184  func (lc *levelsController) runWorker(c *y.Closer, scorePriority bool) {
   185  	defer c.Done()
   186  	if lc.kv.opt.DoNotCompact {
   187  		return
   188  	}
   189  
   190  	for {
   191  		guard := lc.resourceMgr.Acquire()
   192  		prios := lc.pickCompactLevels()
   193  		if scorePriority {
   194  			sort.Slice(prios, func(i, j int) bool {
   195  				return prios[i].score > prios[j].score
   196  			})
   197  		}
   198  		var didCompact bool
   199  		for _, p := range prios {
   200  			// TODO: Handle error.
   201  			didCompact, _ = lc.doCompact(p, guard)
   202  			if didCompact {
   203  				break
   204  			}
   205  		}
   206  		guard.Done()
   207  		waitDur := time.Second * 3
   208  		if didCompact {
   209  			waitDur /= 10
   210  		}
   211  		timer := time.NewTimer(waitDur)
   212  		select {
   213  		case <-c.HasBeenClosed():
   214  			timer.Stop()
   215  			return
   216  		case <-timer.C:
   217  		}
   218  	}
   219  }
   220  
   221  // Returns true if level zero may be compacted, without accounting for compactions that might
   222  // already be happening.
   223  func (lc *levelsController) isL0Compactable() bool {
   224  	return lc.levels[0].numTables() >= lc.kv.opt.NumLevelZeroTables
   225  }
   226  
   227  // Returns true if the non-zero level may be compacted. deltaSize is the total size of this
   228  // level's tables that are already being compacted away; it is added to the threshold so those
   229  // tables, whose size is still counted by getTotalSize, are treated as if they were already gone.
   230  func (l *levelHandler) isCompactable(deltaSize int64) bool {
   231  	return l.getTotalSize() >= l.maxTotalSize+deltaSize
   232  }
   233  
   234  type compactionPriority struct {
   235  	level int
   236  	score float64
   237  }
   238  
   239  // pickCompactLevels determines which levels to compact.
   240  // Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
   241  func (lc *levelsController) pickCompactLevels() (prios []compactionPriority) {
   242  	// This function must use the same criteria for guaranteeing compaction progress that
   243  	// addLevel0Table uses.
   244  
   245  	// cstatus is checked to see if level 0's tables are already being compacted
   246  	if !lc.cstatus.overlapsWith(0, infRange) && lc.isL0Compactable() {
   247  		pri := compactionPriority{
   248  			level: 0,
   249  			score: float64(lc.levels[0].numTables()) / float64(lc.kv.opt.NumLevelZeroTables),
   250  		}
   251  		prios = append(prios, pri)
   252  	}
   253  
   254  	// Now calculate scores for level 1 and above.
   255  	for levelNum := 1; levelNum < len(lc.levels); levelNum++ {
   256  		// Don't consider those tables that are already being compacted right now.
   257  		deltaSize := lc.cstatus.deltaSize(levelNum)
   258  
   259  		l := lc.levels[levelNum]
   260  		if l.isCompactable(deltaSize) {
   261  			pri := compactionPriority{
   262  				level: levelNum,
   263  				score: float64(l.getTotalSize()-deltaSize) / float64(l.maxTotalSize),
   264  			}
   265  			prios = append(prios, pri)
   266  		}
   267  	}
   268  	// Priorities are returned in level order rather than sorted by score. Unless a
   269  	// worker re-sorts them by score (see runWorker), upper levels (level 0, level 1,
   270  	// etc.) always get compacted first, before the lower levels -- this helps us
   271  	// avoid stalls.
   272  	return prios
   273  }
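
// exampleCompactionScore is an illustrative sketch (not part of badger's API) of the scoring used
// by pickCompactLevels above: level 0 is scored by table count relative to NumLevelZeroTables,
// while the other levels are scored by their size, minus the size already being compacted away,
// relative to maxTotalSize. A score of 1 or more means the level is over its soft limit.
func exampleCompactionScore(level, numL0Tables, numL0Limit int, size, deltaSize, maxSize int64) float64 {
	if level == 0 {
		return float64(numL0Tables) / float64(numL0Limit)
	}
	return float64(size-deltaSize) / float64(maxSize)
}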
   274  
   275  func (lc *levelsController) setHasOverlapTable(cd *CompactDef) {
   276  	if cd.moveDown() {
   277  		return
   278  	}
   279  	kr := getKeyRange(cd.Top)
   280  	for i := cd.Level + 2; i < len(lc.levels); i++ {
   281  		lh := lc.levels[i]
   282  		lh.RLock()
   283  		left, right := lh.overlappingTables(levelHandlerRLocked{}, kr)
   284  		lh.RUnlock()
   285  		if right-left > 0 {
   286  			cd.HasOverlap = true
   287  			return
   288  		}
   289  	}
   290  	return
   291  }
   292  
   293  type DiscardStats struct {
   294  	numSkips     int64
   295  	skippedBytes int64
   296  	ptrs         []blobPointer
   297  }
   298  
   299  func (ds *DiscardStats) collect(vs y.ValueStruct) {
   300  	if vs.Meta&bitValuePointer > 0 {
   301  		var bp blobPointer
   302  		bp.decode(vs.Value)
   303  		ds.ptrs = append(ds.ptrs, bp)
   304  		ds.skippedBytes += int64(bp.length)
   305  	}
   306  	ds.numSkips++
   307  }
   308  
   309  func (ds *DiscardStats) String() string {
   310  	return fmt.Sprintf("numSkips:%d, skippedBytes:%d", ds.numSkips, ds.skippedBytes)
   311  }
   312  
   313  func shouldFinishFile(key, lastKey y.Key, guard *Guard, currentSize, maxSize int64) bool {
   314  	if lastKey.IsEmpty() {
   315  		return false
   316  	}
   317  	if guard != nil {
   318  		if !bytes.HasPrefix(key.UserKey, guard.Prefix) {
   319  			return true
   320  		}
   321  		if !matchGuard(key.UserKey, lastKey.UserKey, guard) {
   322  			if maxSize > guard.MinSize {
   323  				maxSize = guard.MinSize
   324  			}
   325  		}
   326  	}
   327  	return currentSize > maxSize
   328  }
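
// exampleShouldFinishFile is an illustrative usage sketch with hypothetical guard and size values
// (none of them are badger defaults). Leaving the guard's prefix finishes the file immediately;
// otherwise the file is only cut once it outgrows the size limit, which may be lowered to
// guard.MinSize when the keys stop sharing the guard's MatchLen-byte prefix.
func exampleShouldFinishFile() bool {
	guard := &Guard{Prefix: []byte("t_r"), MatchLen: 5, MinSize: 16 << 20}
	lastKey := y.Key{UserKey: []byte("t_r_0001"), Version: 10}
	key := y.Key{UserKey: []byte("t_r_0002"), Version: 10}
	// Same guard prefix, same first MatchLen bytes, and 8 MB is below the 64 MB limit,
	// so this returns false and the current file keeps growing.
	return shouldFinishFile(key, lastKey, guard, 8<<20, 64<<20)
}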
   329  
   330  func matchGuard(key, lastKey []byte, guard *Guard) bool {
   331  	if len(lastKey) < guard.MatchLen {
   332  		return false
   333  	}
   334  	return bytes.HasPrefix(key, lastKey[:guard.MatchLen])
   335  }
   336  
   337  func searchGuard(key []byte, guards []Guard) *Guard {
   338  	var maxMatchGuard *Guard
   339  	for i := range guards {
   340  		guard := &guards[i]
   341  		if bytes.HasPrefix(key, guard.Prefix) {
   342  			if maxMatchGuard == nil || len(guard.Prefix) > len(maxMatchGuard.Prefix) {
   343  				maxMatchGuard = guard
   344  			}
   345  		}
   346  	}
   347  	return maxMatchGuard
   348  }
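
// exampleSearchGuardUsage is an illustrative sketch with hypothetical guards (the prefixes below
// are made up): searchGuard returns the guard with the longest prefix matching the key, or nil
// when no guard matches.
func exampleSearchGuardUsage() *Guard {
	guards := []Guard{
		{Prefix: []byte("t")},
		{Prefix: []byte("t_r")},
	}
	// "t_r_0001" matches both guards; the longer prefix "t_r" wins.
	return searchGuard([]byte("t_r_0001"), guards)
}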
   349  
   350  func overSkipTables(key y.Key, skippedTables []table.Table) (newSkippedTables []table.Table, over bool) {
   351  	var i int
   352  	for i < len(skippedTables) {
   353  		t := skippedTables[i]
   354  		if key.Compare(t.Biggest()) > 0 {
   355  			i++
   356  		} else {
   357  			break
   358  		}
   359  	}
   360  	return skippedTables[i:], i > 0
   361  }
   362  
   363  func (lc *levelsController) prepareCompactionDef(cd *CompactDef) {
   364  	// Pick up the currently pending transactions' min readTs, so we can discard versions below this
   365  	// readTs. We must never discard any version above this timestamp, because that would break the
   366  	// snapshot view guarantee provided by transactions.
   367  	cd.SafeTS = lc.kv.getCompactSafeTs()
   368  	if lc.kv.opt.CompactionFilterFactory != nil {
   369  		cd.Filter = lc.kv.opt.CompactionFilterFactory(cd.Level+1, cd.smallest().UserKey, cd.biggest().UserKey)
   370  		cd.Guards = cd.Filter.Guards()
   371  	}
   372  	cd.Opt = lc.opt
   373  	cd.Dir = lc.kv.opt.Dir
   374  	cd.AllocIDFunc = lc.reserveFileID
   375  	cd.Limiter = lc.kv.limiter
   376  }
   377  
   378  func (lc *levelsController) getCompactor(cd *CompactDef) compactor {
   379  	if len(cd.SkippedTbls) > 0 || lc.kv.opt.RemoteCompactionAddr == "" || lc.kv.opt.ValueThreshold > 0 {
   380  		return &localCompactor{}
   381  	}
   382  	return &remoteCompactor{
   383  		remoteAddr: lc.kv.opt.RemoteCompactionAddr,
   384  	}
   385  }
   386  
   387  // compactBuildTables merges the top and bottom tables of cd to form a list of new tables.
   388  func (lc *levelsController) compactBuildTables(cd *CompactDef) (newTables []table.Table, err error) {
   389  
   390  	// Try to collect stats so that we can inform value log about GC. That would help us find which
   391  	// value log file should be GCed.
   392  	lc.prepareCompactionDef(cd)
   393  	stats := &y.CompactionStats{}
   394  	discardStats := &DiscardStats{}
   395  	buildResults, err := lc.getCompactor(cd).compact(cd, stats, discardStats)
   396  	if err != nil {
   397  		return nil, err
   398  	}
   399  	newTables, err = lc.openTables(buildResults)
   400  	if err != nil {
   401  		return nil, err
   402  	}
   403  	lc.handleStats(cd.Level+1, stats, discardStats)
   404  	return
   405  }
   406  
   407  // CompactTables compacts the tables in CompactDef and returns the build results.
   408  func CompactTables(cd *CompactDef, stats *y.CompactionStats, discardStats *DiscardStats) ([]*sstable.BuildResult, error) {
   409  	var buildResults []*sstable.BuildResult
   410  	it := cd.buildIterator()
   411  	defer it.Close()
   412  
   413  	skippedTbls := cd.SkippedTbls
   414  	splitHints := cd.splitHints
   415  
   416  	var lastKey, skipKey y.Key
   417  	var builder *sstable.Builder
   418  	for it.Valid() {
   419  		var fd *os.File
   420  		if !cd.InMemory {
   421  			fileID := cd.AllocIDFunc()
   422  			filename := sstable.NewFilename(fileID, cd.Dir)
   423  			var err error
   424  			fd, err = directio.OpenFile(filename, os.O_CREATE|os.O_RDWR, 0666)
   425  			if err != nil {
   426  				return nil, err
   427  			}
   428  		}
   429  		if builder == nil {
   430  			builder = sstable.NewTableBuilder(fd, cd.Limiter, cd.Level+1, cd.Opt)
   431  		} else {
   432  			builder.Reset(fd)
   433  		}
   434  		lastKey.Reset()
   435  		guard := searchGuard(it.Key().UserKey, cd.Guards)
   436  		for ; it.Valid(); y.NextAllVersion(it) {
   437  			stats.KeysRead++
   438  			vs := it.Value()
   439  			key := it.Key()
   440  			kvSize := int(vs.EncodedSize()) + key.Len()
   441  			stats.BytesRead += kvSize
   442  			// See if we need to skip this key.
   443  			if !skipKey.IsEmpty() {
   444  				if key.SameUserKey(skipKey) {
   445  					discardStats.collect(vs)
   446  					continue
   447  				} else {
   448  					skipKey.Reset()
   449  				}
   450  			}
   451  			if !key.SameUserKey(lastKey) {
   452  				// Only break if we are on a different key, and have reached capacity. We want
   453  				// to ensure that all versions of the key are stored in the same sstable, and
   454  				// not divided across multiple tables at the same level.
   455  				if len(skippedTbls) > 0 {
   456  					var over bool
   457  					skippedTbls, over = overSkipTables(key, skippedTbls)
   458  					if over && !builder.Empty() {
   459  						break
   460  					}
   461  				}
   462  				if shouldFinishFile(key, lastKey, guard, int64(builder.EstimateSize()+kvSize), cd.Opt.MaxTableSize) {
   463  					break
   464  				}
   465  				if len(splitHints) != 0 && key.Compare(splitHints[0]) >= 0 {
   466  					splitHints = splitHints[1:]
   467  					for len(splitHints) > 0 && key.Compare(splitHints[0]) >= 0 {
   468  						splitHints = splitHints[1:]
   469  					}
   470  					break
   471  				}
   472  				lastKey.Copy(key)
   473  			}
   474  
   475  			// Only consider versions at or below the compaction-safe timestamp; otherwise we might end up
   476  			// discarding the only valid version for a running transaction.
   477  			if key.Version <= cd.SafeTS {
   478  				// key is the latest readable version of this key, so we simply discard all the rest of the versions.
   479  				skipKey.Copy(key)
   480  
   481  				if isDeleted(vs.Meta) {
   482  					// If this key range overlaps with lower levels, keep the deletion marker with
   483  					// the latest version and discard the rest; skipKey is already set, so the
   484  					// following versions of this key will be skipped. Otherwise drop the deletion marker too.
   485  					if !cd.HasOverlap {
   486  						continue
   487  					}
   488  				} else if cd.Filter != nil {
   489  					switch cd.Filter.Filter(key.UserKey, vs.Value, vs.UserMeta) {
   490  					case DecisionMarkTombstone:
   491  						discardStats.collect(vs)
   492  						if cd.HasOverlap {
   493  						// There may be old versions of this key, so convert it to a delete tombstone.
   494  							builder.Add(key, y.ValueStruct{Meta: bitDelete})
   495  						}
   496  						continue
   497  					case DecisionDrop:
   498  						discardStats.collect(vs)
   499  						continue
   500  					case DecisionKeep:
   501  					}
   502  				}
   503  			}
   504  			builder.Add(key, vs)
   505  			stats.KeysWrite++
   506  			stats.BytesWrite += kvSize
   507  		}
   508  		if builder.Empty() {
   509  			continue
   510  		}
   511  		result, err := builder.Finish()
   512  		if err != nil {
   513  			return nil, err
   514  		}
   515  		fd.Close()
   516  		buildResults = append(buildResults, result)
   517  	}
   518  	return buildResults, nil
   519  }
   520  
   521  func (lc *levelsController) openTables(buildResults []*sstable.BuildResult) (newTables []table.Table, err error) {
   522  	for _, result := range buildResults {
   523  		var tbl table.Table
   524  		tbl, err = sstable.OpenTable(result.FileName, lc.kv.blockCache, lc.kv.indexCache)
   525  		if err != nil {
   526  			return
   527  		}
   528  		newTables = append(newTables, tbl)
   529  	}
   530  	// Ensure created files' directory entries are visible.  We don't mind the extra latency
   531  	// from not doing this ASAP after all file creation has finished because this is a
   532  	// background operation.
   533  	err = syncDir(lc.kv.opt.Dir)
   534  	if err != nil {
   535  		log.Error("compact sync dir error", zap.Error(err))
   536  		return
   537  	}
   538  	sortTables(newTables)
   539  	return
   540  }
   541  
   542  func (lc *levelsController) handleStats(nextLevel int, stats *y.CompactionStats, discardStats *DiscardStats) {
   543  	stats.KeysDiscard = int(discardStats.numSkips)
   544  	stats.BytesDiscard = int(discardStats.skippedBytes)
   545  	lc.levels[nextLevel].metrics.UpdateCompactionStats(stats)
   546  	log.Info("compact send discard stats", zap.Stringer("stats", discardStats))
   547  	if len(discardStats.ptrs) > 0 {
   548  		lc.kv.blobManger.discardCh <- discardStats
   549  	}
   550  }
   551  
   552  func buildChangeSet(cd *CompactDef, newTables []table.Table) protos.ManifestChangeSet {
   553  	changes := []*protos.ManifestChange{}
   554  	for _, table := range newTables {
   555  		changes = append(changes,
   556  			newCreateChange(table.ID(), cd.Level+1))
   557  	}
   558  	for _, table := range cd.Top {
   559  		changes = append(changes, newDeleteChange(table.ID()))
   560  	}
   561  	for _, table := range cd.Bot {
   562  		changes = append(changes, newDeleteChange(table.ID()))
   563  	}
   564  	return protos.ManifestChangeSet{Changes: changes}
   565  }
   566  
   567  func sumTableSize(tables []table.Table) int64 {
   568  	var size int64
   569  	for _, t := range tables {
   570  		size += t.Size()
   571  	}
   572  	return size
   573  }
   574  
   575  func calcRatio(topSize, botSize int64) float64 {
   576  	if botSize == 0 {
   577  		return float64(topSize)
   578  	}
   579  	return float64(topSize) / float64(botSize)
   580  }
   581  
   582  func (lc *levelsController) runCompactDef(cd *CompactDef, guard *epoch.Guard) error {
   583  	timeStart := time.Now()
   584  
   585  	thisLevel := lc.levels[cd.Level]
   586  	nextLevel := lc.levels[cd.Level+1]
   587  
   588  	var newTables []table.Table
   589  	var changeSet protos.ManifestChangeSet
   590  	defer func() {
   591  		for _, tbl := range newTables {
   592  			tbl.MarkCompacting(false)
   593  		}
   594  		for _, tbl := range cd.SkippedTbls {
   595  			tbl.MarkCompacting(false)
   596  		}
   597  	}()
   598  
   599  	if cd.moveDown() {
   600  		// Move the tables down without rewriting them. Level 0 is skipped here, since its tables may overlap with each other.
   601  		newTables = cd.Top
   602  		changeSet = protos.ManifestChangeSet{}
   603  		for _, t := range newTables {
   604  			changeSet.Changes = append(changeSet.Changes, newMoveDownChange(t.ID(), cd.Level+1))
   605  		}
   606  	} else {
   607  		var err error
   608  		newTables, err = lc.compactBuildTables(cd)
   609  		if err != nil {
   610  			return err
   611  		}
   612  		changeSet = buildChangeSet(cd, newTables)
   613  	}
   614  
   615  	// We write to the manifest _before_ we delete files (and after we created files)
   616  	if err := lc.kv.manifest.addChanges(changeSet.Changes, nil); err != nil {
   617  		return err
   618  	}
   619  
   620  	// Apply the new tables to nextLevel before deleting the old ones from thisLevel: readers walk the
   621  	// levels from 0 upward (see get), so data must be visible in nextLevel before it leaves thisLevel.
   622  	nextLevel.replaceTables(newTables, cd, guard)
   623  	thisLevel.deleteTables(cd.Top, guard, cd.moveDown())
   624  
   625  	// Note: For level 0, while doCompact is running, it is possible that new tables are added.
   626  	// However, the tables are added only to the end, so it is ok to just delete the first table.
   627  
   628  	log.Info("compaction done",
   629  		zap.Stringer("def", cd), zap.Int("deleted", len(cd.Top)+len(cd.Bot)), zap.Int("added", len(newTables)),
   630  		zap.Duration("duration", time.Since(timeStart)))
   631  	return nil
   632  }
   633  
   634  // doCompact picks some table on level l and compacts it away to the next level.
   635  func (lc *levelsController) doCompact(p compactionPriority, guard *epoch.Guard) (bool, error) {
   636  	l := p.level
   637  	y.Assert(l+1 < lc.kv.opt.TableBuilderOptions.MaxLevels) // Sanity check.
   638  
   639  	cd := &CompactDef{
   640  		Level: l,
   641  	}
   642  	thisLevel := lc.levels[cd.Level]
   643  	nextLevel := lc.levels[cd.Level+1]
   644  
   645  	log.Info("start compaction", zap.Int("level", p.level), zap.Float64("score", p.score))
   646  
   647  	// While picking tables to be compacted, both levels' tables are expected to
   648  	// remain unchanged.
   649  	if l == 0 {
   650  		if !cd.fillTablesL0(&lc.cstatus, thisLevel, nextLevel) {
   651  			log.Info("build compaction fill tables failed", zap.Int("level", l))
   652  			return false, nil
   653  		}
   654  	} else {
   655  		if !cd.fillTables(&lc.cstatus, thisLevel, nextLevel) {
   656  			log.Info("build compaction fill tables failed", zap.Int("level", l))
   657  			return false, nil
   658  		}
   659  	}
   660  	lc.setHasOverlapTable(cd)
   661  	defer lc.cstatus.delete(cd) // Remove the ranges from compaction status.
   662  
   663  	log.Info("running compaction", zap.Stringer("def", cd))
   664  	if err := lc.runCompactDef(cd, guard); err != nil {
   665  		// This compaction couldn't be done successfully.
   666  		log.Info("compact failed", zap.Stringer("def", cd), zap.Error(err))
   667  		return false, err
   668  	}
   669  
   670  	log.Info("compaction done", zap.Int("level", cd.Level))
   671  	return true, nil
   672  }
   673  
   674  func (lc *levelsController) addLevel0Table(t table.Table, head *protos.HeadInfo) error {
   675  	// We update the manifest _before_ the table becomes part of a levelHandler, because at that
   676  	// point it could get used in some compaction.  This ensures the manifest file gets updated in
   677  	// the proper order. (That means this update happens before that of some compaction which
   678  	// deletes the table.)
   679  	err := lc.kv.manifest.addChanges([]*protos.ManifestChange{
   680  		newCreateChange(t.ID(), 0),
   681  	}, head)
   682  	if err != nil {
   683  		return err
   684  	}
   685  
   686  	for !lc.levels[0].tryAddLevel0Table(t) {
   687  		// Stall. Make sure all levels are healthy before we unstall.
   688  		var timeStart time.Time
   689  		{
   690  			log.Warn("STALLED STALLED STALLED", zap.Duration("duration", time.Since(lastUnstalled)))
   691  			for i := 0; i < lc.kv.opt.TableBuilderOptions.MaxLevels; i++ {
   692  				lc.cstatus.RLock()
   693  				status := lc.cstatus.levels[i].debug()
   694  				lc.cstatus.RUnlock()
   695  				log.Warn("dump level status", zap.Int("level", i), zap.String("status", status),
   696  					zap.Int64("size", lc.levels[i].getTotalSize()))
   697  			}
   698  			timeStart = time.Now()
   699  		}
   700  		// Before we unstall, we need to make sure that level 0 is healthy. Otherwise, we
   701  		// will very quickly fill up level 0 again.
   702  		for i := 0; ; i++ {
   703  			// It's crucial that this behavior replicates pickCompactLevels' behavior in
   704  			// computing compactability in order to guarantee progress.
   705  			// Break the loop once L0 has enough space to accommodate new tables.
   706  			if !lc.isL0Compactable() {
   707  				break
   708  			}
   709  			time.Sleep(10 * time.Millisecond)
   710  			if i%100 == 0 {
   711  				prios := lc.pickCompactLevels()
   712  				log.S().Warnf("waiting to add level 0 table, %+v", prios)
   713  				i = 0
   714  			}
   715  		}
   716  		log.Info("UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED", zap.Duration("duration", time.Since(timeStart)))
   717  		lastUnstalled = time.Now()
   718  	}
   719  
   720  	return nil
   721  }
   722  
   723  func (s *levelsController) close() error {
   724  	err := s.cleanupLevels()
   725  	return errors.Wrap(err, "levelsController.Close")
   726  }
   727  
   728  // get returns the found value if any. If not found, it returns an empty y.ValueStruct.
   729  func (s *levelsController) get(key y.Key, keyHash uint64) y.ValueStruct {
   730  	// It's important that we iterate the levels from 0 on upward.  The reason is, if we iterated
   731  	// in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could
   732  	// read level L's tables post-compaction and level L+1's tables pre-compaction.  (If we do
   733  	// parallelize this, we will need to call the h.RLock() function by increasing order of level
   734  	// number.)
   735  	start := time.Now()
   736  	defer s.kv.metrics.LSMGetDuration.Observe(time.Since(start).Seconds())
   737  	for _, h := range s.levels {
   738  		vs := h.get(key, keyHash) // Calls h.RLock() and h.RUnlock().
   739  		if vs.Valid() {
   740  			return vs
   741  		}
   742  	}
   743  	return y.ValueStruct{}
   744  }
   745  
   746  func (s *levelsController) multiGet(pairs []keyValuePair) {
   747  	start := time.Now()
   748  	for _, h := range s.levels {
   749  		h.multiGet(pairs)
   750  	}
   751  	s.kv.metrics.LSMMultiGetDuration.Observe(time.Since(start).Seconds())
   752  }
   753  
   754  func appendIteratorsReversed(out []y.Iterator, th []table.Table, reversed bool) []y.Iterator {
   755  	for i := len(th) - 1; i >= 0; i-- {
   756  		// This will increment the reference of the table handler.
   757  		out = append(out, table.NewConcatIterator(th[i:i+1], reversed))
   758  	}
   759  	return out
   760  }
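
// exampleReversedOrder is an illustrative sketch (not part of badger's API) of why the loop above
// walks the slice backwards: level-0 tables are appended in creation order, so the newest table
// sits at the end of the slice and its iterator must come first in the merge.
func exampleReversedOrder(tableCount int) []int {
	order := make([]int, 0, tableCount)
	for i := tableCount - 1; i >= 0; i-- {
		order = append(order, i)
	}
	// tableCount = 3 yields [2 1 0]: the newest table (index 2) is consulted first.
	return order
}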
   761  
   762  // appendIterators appends iterators to an array of iterators, for merging.
   763  // Note: This obtains references for the table handlers. Remember to close these iterators.
   764  func (s *levelsController) appendIterators(
   765  	iters []y.Iterator, opts *IteratorOptions) []y.Iterator {
   766  	// Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing
   767  	// data when there's a compaction.
   768  	for _, level := range s.levels {
   769  		iters = level.appendIterators(iters, opts)
   770  	}
   771  	return iters
   772  }
   773  
   774  type TableInfo struct {
   775  	ID    uint64
   776  	Level int
   777  	Left  []byte
   778  	Right []byte
   779  }
   780  
   781  func (lc *levelsController) getTableInfo() (result []TableInfo) {
   782  	for _, l := range lc.levels {
   783  		for _, t := range l.tables {
   784  			info := TableInfo{
   785  				ID:    t.ID(),
   786  				Level: l.level,
   787  				Left:  t.Smallest().UserKey,
   788  				Right: t.Biggest().UserKey,
   789  			}
   790  			result = append(result, info)
   791  		}
   792  	}
   793  	sort.Slice(result, func(i, j int) bool {
   794  		if result[i].Level != result[j].Level {
   795  			return result[i].Level < result[j].Level
   796  		}
   797  		return result[i].ID < result[j].ID
   798  	})
   799  	return
   800  }