github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/levels.go

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package badger
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"os"
    23  	"sort"
    24  	"time"
    25  
    26  	"github.com/coocood/badger/epoch"
    27  	"github.com/coocood/badger/options"
    28  	"github.com/coocood/badger/protos"
    29  	"github.com/coocood/badger/table"
    30  	"github.com/coocood/badger/table/sstable"
    31  	"github.com/coocood/badger/y"
    32  	"github.com/ncw/directio"
    33  	"github.com/pingcap/errors"
    34  	"github.com/pingcap/log"
    35  	"go.uber.org/zap"
    36  	"golang.org/x/time/rate"
    37  )
    38  
    39  type levelsController struct {
    40  	nextFileID uint64 // Atomic
    41  
    42  	// The following are initialized once and const.
    43  	resourceMgr *epoch.ResourceManager
    44  	levels      []*levelHandler
    45  	kv          *DB
    46  
    47  	cstatus compactStatus
    48  
    49  	opt options.TableBuilderOptions
    50  }
    51  
    52  var (
    53  	// This is for getting timings between stalls.
    54  	lastUnstalled time.Time
    55  )
    56  
     57  // revertToManifest checks that all necessary table files exist and removes all table files not
     58  // referenced by the manifest. idMap is a set of table file IDs that were read from the directory
     59  // listing.
    60  func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error {
    61  	// 1. Check all files in manifest exist.
    62  	for id := range mf.Tables {
    63  		if _, ok := idMap[id]; !ok {
    64  			return fmt.Errorf("file does not exist for table %d", id)
    65  		}
    66  	}
    67  
    68  	// 2. Delete files that shouldn't exist.
    69  	for id := range idMap {
    70  		if _, ok := mf.Tables[id]; !ok {
    71  			log.Info("table file not referenced in MANIFEST", zap.Uint64("id", id))
    72  			filename := sstable.NewFilename(id, kv.opt.Dir)
    73  			if err := os.Remove(filename); err != nil {
    74  				return y.Wrapf(err, "While removing table %d", id)
    75  			}
    76  		}
    77  	}
    78  
    79  	return nil
    80  }
    81  
    82  func newLevelsController(kv *DB, mf *Manifest, mgr *epoch.ResourceManager, opt options.TableBuilderOptions) (*levelsController, error) {
    83  	y.Assert(kv.opt.NumLevelZeroTablesStall > kv.opt.NumLevelZeroTables)
    84  	s := &levelsController{
    85  		kv:          kv,
    86  		levels:      make([]*levelHandler, kv.opt.TableBuilderOptions.MaxLevels),
    87  		opt:         opt,
    88  		resourceMgr: mgr,
    89  	}
    90  	s.cstatus.levels = make([]*levelCompactStatus, kv.opt.TableBuilderOptions.MaxLevels)
    91  
    92  	for i := 0; i < kv.opt.TableBuilderOptions.MaxLevels; i++ {
    93  		s.levels[i] = newLevelHandler(kv, i)
    94  		if i == 0 {
    95  			// Do nothing.
    96  		} else if i == 1 {
    97  			// Level 1 probably shouldn't be too much bigger than level 0.
    98  			s.levels[i].maxTotalSize = kv.opt.LevelOneSize
    99  		} else {
   100  			s.levels[i].maxTotalSize = s.levels[i-1].maxTotalSize * int64(kv.opt.TableBuilderOptions.LevelSizeMultiplier)
   101  		}
   102  		s.cstatus.levels[i] = new(levelCompactStatus)
   103  	}
   104  
   105  	// Compare manifest against directory, check for existent/non-existent files, and remove.
   106  	if err := revertToManifest(kv, mf, getIDMap(kv.opt.Dir)); err != nil {
   107  		return nil, err
   108  	}
   109  
   110  	// Some files may be deleted. Let's reload.
   111  	tables := make([][]table.Table, kv.opt.TableBuilderOptions.MaxLevels)
   112  	var maxFileID uint64
   113  	for fileID, tableManifest := range mf.Tables {
   114  		fname := sstable.NewFilename(fileID, kv.opt.Dir)
   115  		var flags uint32 = y.Sync
   116  		if kv.opt.ReadOnly {
   117  			flags |= y.ReadOnly
   118  		}
   119  
   120  		t, err := sstable.OpenTable(fname, kv.blockCache, kv.indexCache)
   121  		if err != nil {
   122  			closeAllTables(tables)
   123  			return nil, errors.Wrapf(err, "Opening table: %q", fname)
   124  		}
   125  
   126  		level := tableManifest.Level
   127  		tables[level] = append(tables[level], t)
   128  
   129  		if fileID > maxFileID {
   130  			maxFileID = fileID
   131  		}
   132  	}
   133  	s.nextFileID = maxFileID + 1
   134  	for i, tbls := range tables {
   135  		s.levels[i].initTables(tbls)
   136  	}
   137  
   138  	// Make sure key ranges do not overlap etc.
   139  	if err := s.validate(); err != nil {
   140  		_ = s.cleanupLevels()
   141  		return nil, errors.Wrap(err, "Level validation")
   142  	}
   143  
   144  	// Sync directory (because we have at least removed some files, or previously created the
   145  	// manifest file).
   146  	if err := syncDir(kv.opt.Dir); err != nil {
   147  		_ = s.close()
   148  		return nil, err
   149  	}
   150  
   151  	return s, nil
   152  }
   153  
   154  // Closes the tables, for cleanup in newLevelsController.  (We Close() instead of using DecrRef()
   155  // because that would delete the underlying files.)  We ignore errors, which is OK because tables
   156  // are read-only.
   157  func closeAllTables(tables [][]table.Table) {
   158  	for _, tableSlice := range tables {
   159  		for _, table := range tableSlice {
   160  			_ = table.Close()
   161  		}
   162  	}
   163  }
   164  
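         // cleanupLevels closes every level handler and returns the first error encountered, if any.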
   165  func (lc *levelsController) cleanupLevels() error {
   166  	var firstErr error
   167  	for _, l := range lc.levels {
   168  		if err := l.close(); err != nil && firstErr == nil {
   169  			firstErr = err
   170  		}
   171  	}
   172  	return firstErr
   173  }
   174  
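         // startCompact launches NumCompactors compaction workers. The Closer is assumed to have been
         // created with a count of one, so only n-1 additional running workers are registered here.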
   175  func (lc *levelsController) startCompact(c *y.Closer) {
   176  	n := lc.kv.opt.NumCompactors
   177  	c.AddRunning(n - 1)
   178  	for i := 0; i < n; i++ {
    179  		// The first half of the compaction workers use level as the priority, the
    180  		// rest use score as the priority.
   181  		go lc.runWorker(c, i*2 >= n)
   182  	}
   183  }
   184  
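         // runWorker repeatedly picks compaction priorities and runs at most one compaction per iteration.
         // It waits three seconds between iterations (300ms after a successful compaction) until closed.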
   185  func (lc *levelsController) runWorker(c *y.Closer, scorePriority bool) {
   186  	defer c.Done()
   187  	if lc.kv.opt.DoNotCompact {
   188  		return
   189  	}
   190  
   191  	for {
   192  		guard := lc.resourceMgr.Acquire()
   193  		prios := lc.pickCompactLevels()
   194  		if scorePriority {
   195  			sort.Slice(prios, func(i, j int) bool {
   196  				return prios[i].score > prios[j].score
   197  			})
   198  		}
   199  		var didCompact bool
   200  		for _, p := range prios {
   201  			// TODO: Handle error.
   202  			didCompact, _ = lc.doCompact(p, guard)
   203  			if didCompact {
   204  				break
   205  			}
   206  		}
   207  		guard.Done()
   208  		waitDur := time.Second * 3
   209  		if didCompact {
   210  			waitDur /= 10
   211  		}
   212  		timer := time.NewTimer(waitDur)
   213  		select {
   214  		case <-c.HasBeenClosed():
   215  			timer.Stop()
   216  			return
   217  		case <-timer.C:
   218  		}
   219  	}
   220  }
   221  
   222  // Returns true if level zero may be compacted, without accounting for compactions that already
   223  // might be happening.
   224  func (lc *levelsController) isL0Compactable() bool {
   225  	return lc.levels[0].numTables() >= lc.kv.opt.NumLevelZeroTables
   226  }
   227  
    228  // Returns true if the non-zero level may be compacted. deltaSize is the total size of the tables on
    229  // this level that are already being compacted; adding it to maxTotalSize means the level only counts
    230  // as compactable if it would still exceed its limit once those compactions finish.
   231  func (l *levelHandler) isCompactable(deltaSize int64) bool {
   232  	return l.getTotalSize() >= l.maxTotalSize+deltaSize
   233  }
   234  
   235  type compactionPriority struct {
   236  	level int
   237  	score float64
   238  }
   239  
    240  // pickCompactLevels determines which levels should be compacted.
   241  // Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
   242  func (lc *levelsController) pickCompactLevels() (prios []compactionPriority) {
    243  	// This function must use the same criteria that addLevel0Table uses for guaranteeing
    244  	// compaction's progress.
   245  
   246  	// cstatus is checked to see if level 0's tables are already being compacted
   247  	if !lc.cstatus.overlapsWith(0, infRange) && lc.isL0Compactable() {
   248  		pri := compactionPriority{
   249  			level: 0,
   250  			score: float64(lc.levels[0].numTables()) / float64(lc.kv.opt.NumLevelZeroTables),
   251  		}
   252  		prios = append(prios, pri)
   253  	}
   254  
    255  	// Now calculate scores from level 1 upwards.
   256  	for levelNum := 1; levelNum < len(lc.levels); levelNum++ {
   257  		// Don't consider those tables that are already being compacted right now.
   258  		deltaSize := lc.cstatus.deltaSize(levelNum)
   259  
   260  		l := lc.levels[levelNum]
   261  		if l.isCompactable(deltaSize) {
   262  			pri := compactionPriority{
   263  				level: levelNum,
   264  				score: float64(l.getTotalSize()-deltaSize) / float64(l.maxTotalSize),
   265  			}
   266  			prios = append(prios, pri)
   267  		}
   268  	}
   269  	// We used to sort compaction priorities based on the score. But, we
   270  	// decided to compact based on the level, not the priority. So, upper
   271  	// levels (level 0, level 1, etc) always get compacted first, before the
   272  	// lower levels -- this allows us to avoid stalls.
   273  	return prios
   274  }
   275  
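         // hasOverlapTable reports whether the key range of cd.top overlaps any table in the levels below
         // cd.nextLevel. If there is no overlap, deletion markers and filtered keys can be dropped safely
         // during the compaction.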
   276  func (lc *levelsController) hasOverlapTable(cd *compactDef) bool {
   277  	kr := getKeyRange(cd.top)
   278  	for i := cd.nextLevel.level + 1; i < len(lc.levels); i++ {
   279  		lh := lc.levels[i]
   280  		lh.RLock()
   281  		left, right := lh.overlappingTables(levelHandlerRLocked{}, kr)
   282  		lh.RUnlock()
   283  		if right-left > 0 {
   284  			return true
   285  		}
   286  	}
   287  	return false
   288  }
   289  
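         // DiscardStats accumulates statistics (and blob pointers) for values discarded during a compaction;
         // the result is sent to the blob manager so it can pick value log files for GC.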
   290  type DiscardStats struct {
   291  	numSkips     int64
   292  	skippedBytes int64
   293  	ptrs         []blobPointer
   294  }
   295  
   296  func (ds *DiscardStats) collect(vs y.ValueStruct) {
   297  	if vs.Meta&bitValuePointer > 0 {
   298  		var bp blobPointer
   299  		bp.decode(vs.Value)
   300  		ds.ptrs = append(ds.ptrs, bp)
   301  		ds.skippedBytes += int64(bp.length)
   302  	}
   303  	ds.numSkips++
   304  }
   305  
   306  func (ds *DiscardStats) String() string {
   307  	return fmt.Sprintf("numSkips:%d, skippedBytes:%d", ds.numSkips, ds.skippedBytes)
   308  }
   309  
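         // shouldFinishFile reports whether the current table builder should be finished before adding key.
         // A table is cut when the key leaves the guard's prefix, or when currentSize exceeds maxSize, which
         // is capped at guard.MinSize once key and lastKey no longer share the guard's MatchLen prefix.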
   310  func shouldFinishFile(key, lastKey y.Key, guard *Guard, currentSize, maxSize int64) bool {
   311  	if lastKey.IsEmpty() {
   312  		return false
   313  	}
   314  	if guard != nil {
   315  		if !bytes.HasPrefix(key.UserKey, guard.Prefix) {
   316  			return true
   317  		}
   318  		if !matchGuard(key.UserKey, lastKey.UserKey, guard) {
   319  			if maxSize > guard.MinSize {
   320  				maxSize = guard.MinSize
   321  			}
   322  		}
   323  	}
   324  	return currentSize > maxSize
   325  }
   326  
   327  func matchGuard(key, lastKey []byte, guard *Guard) bool {
   328  	if len(lastKey) < guard.MatchLen {
   329  		return false
   330  	}
   331  	return bytes.HasPrefix(key, lastKey[:guard.MatchLen])
   332  }
   333  
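         // searchGuard returns the guard with the longest prefix matching key, or nil if no guard matches.
         // For example, with guards for "a" and "ab", the key "abc" picks the "ab" guard.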
   334  func searchGuard(key []byte, guards []Guard) *Guard {
   335  	var maxMatchGuard *Guard
   336  	for i := range guards {
   337  		guard := &guards[i]
   338  		if bytes.HasPrefix(key, guard.Prefix) {
   339  			if maxMatchGuard == nil || len(guard.Prefix) > len(maxMatchGuard.Prefix) {
   340  				maxMatchGuard = guard
   341  			}
   342  		}
   343  	}
   344  	return maxMatchGuard
   345  }
   346  
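         // overSkipTables drops the leading skipped tables whose biggest key is below key and reports whether
         // any were passed over, which tells the caller to cut the current output table so that the skipped
         // tables can stay in place.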
   347  func overSkipTables(key y.Key, skippedTables []table.Table) (newSkippedTables []table.Table, over bool) {
   348  	var i int
   349  	for i < len(skippedTables) {
   350  		t := skippedTables[i]
   351  		if key.Compare(t.Biggest()) > 0 {
   352  			i++
   353  		} else {
   354  			break
   355  		}
   356  	}
   357  	return skippedTables[i:], i > 0
   358  }
   359  
    360  // compactBuildTables merges topTables and botTables to form a list of new tables.
   361  func (lc *levelsController) compactBuildTables(level int, cd *compactDef,
   362  	limiter *rate.Limiter, splitHints []y.Key) (newTables []table.Table, err error) {
   363  	topTables := cd.top
   364  	botTables := cd.bot
   365  
   366  	hasOverlap := lc.hasOverlapTable(cd)
   367  	log.Info("check range with lower level", zap.Bool("overlapped", hasOverlap))
   368  
   369  	// Try to collect stats so that we can inform value log about GC. That would help us find which
   370  	// value log file should be GCed.
   371  	discardStats := &DiscardStats{}
   372  
   373  	// Create iterators across all the tables involved first.
   374  	var iters []y.Iterator
   375  	if level == 0 {
   376  		iters = appendIteratorsReversed(iters, topTables, false)
   377  	} else {
   378  		iters = []y.Iterator{table.NewConcatIterator(topTables, false)}
   379  	}
   380  
   381  	// Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap.
   382  	iters = append(iters, table.NewConcatIterator(botTables, false))
   383  	it := table.NewMergeIterator(iters, false)
   384  
   385  	it.Rewind()
   386  
   387  	// Pick up the currently pending transactions' min readTs, so we can discard versions below this
   388  	// readTs. We should never discard any versions starting from above this timestamp, because that
   389  	// would affect the snapshot view guarantee provided by transactions.
   390  	safeTs := lc.kv.getCompactSafeTs()
   391  
   392  	var filter CompactionFilter
   393  	var guards []Guard
   394  	if lc.kv.opt.CompactionFilterFactory != nil {
   395  		filter = lc.kv.opt.CompactionFilterFactory(level+1, cd.smallest().UserKey, cd.biggest().UserKey)
   396  		guards = filter.Guards()
   397  	}
   398  	skippedTbls := cd.skippedTbls
   399  
   400  	var lastKey, skipKey y.Key
   401  	var builder *sstable.Builder
   402  	var bytesRead, bytesWrite, numRead, numWrite int
   403  	for it.Valid() {
   404  		fileID := lc.reserveFileID()
   405  		filename := sstable.NewFilename(fileID, lc.kv.opt.Dir)
   406  		var fd *os.File
   407  		fd, err = directio.OpenFile(filename, os.O_CREATE|os.O_RDWR, 0666)
   408  		if err != nil {
   409  			return
   410  		}
   411  		if builder == nil {
   412  			builder = sstable.NewTableBuilder(fd, limiter, cd.nextLevel.level, lc.opt)
   413  		} else {
   414  			builder.Reset(fd)
   415  		}
   416  		lastKey.Reset()
   417  		guard := searchGuard(it.Key().UserKey, guards)
   418  		for ; it.Valid(); y.NextAllVersion(it) {
   419  			numRead++
   420  			vs := it.Value()
   421  			key := it.Key()
   422  			kvSize := int(vs.EncodedSize()) + key.Len()
   423  			bytesRead += kvSize
   424  			// See if we need to skip this key.
   425  			if !skipKey.IsEmpty() {
   426  				if key.SameUserKey(skipKey) {
   427  					discardStats.collect(vs)
   428  					continue
   429  				} else {
   430  					skipKey.Reset()
   431  				}
   432  			}
   433  			if !key.SameUserKey(lastKey) {
   434  				// Only break if we are on a different key, and have reached capacity. We want
   435  				// to ensure that all versions of the key are stored in the same sstable, and
   436  				// not divided across multiple tables at the same level.
   437  				if len(skippedTbls) > 0 {
   438  					var over bool
   439  					skippedTbls, over = overSkipTables(key, skippedTbls)
   440  					if over && !builder.Empty() {
   441  						break
   442  					}
   443  				}
   444  				if shouldFinishFile(key, lastKey, guard, int64(builder.EstimateSize()), lc.kv.opt.MaxTableSize) {
   445  					break
   446  				}
   447  				if len(splitHints) != 0 && key.Compare(splitHints[0]) >= 0 {
   448  					splitHints = splitHints[1:]
   449  					for len(splitHints) > 0 && key.Compare(splitHints[0]) >= 0 {
   450  						splitHints = splitHints[1:]
   451  					}
   452  					break
   453  				}
   454  				lastKey.Copy(key)
   455  			}
   456  
    457  			// Only consider versions at or below the compaction safe ts; otherwise we might end up
    458  			// discarding the only version visible to a running transaction.
   459  			if key.Version <= safeTs {
   460  				// key is the latest readable version of this key, so we simply discard all the rest of the versions.
   461  				skipKey.Copy(key)
   462  
   463  				if isDeleted(vs.Meta) {
   464  					// If this key range has overlap with lower levels, then keep the deletion
   465  					// marker with the latest version, discarding the rest. We have set skipKey,
   466  					// so the following key versions would be skipped. Otherwise discard the deletion marker.
   467  					if !hasOverlap {
   468  						continue
   469  					}
   470  				} else if filter != nil {
   471  					switch filter.Filter(key.UserKey, vs.Value, vs.UserMeta) {
   472  					case DecisionMarkTombstone:
   473  						discardStats.collect(vs)
   474  						if hasOverlap {
    475  						// There may be old versions of this key, so convert it to a delete tombstone.
   476  							builder.Add(key, y.ValueStruct{Meta: bitDelete})
   477  						}
   478  						continue
   479  					case DecisionDrop:
   480  						discardStats.collect(vs)
   481  						continue
   482  					case DecisionKeep:
   483  					}
   484  				}
   485  			}
   486  			builder.Add(key, vs)
   487  			numWrite++
   488  			bytesWrite += kvSize
   489  		}
   490  		if builder.Empty() {
   491  			continue
   492  		}
   493  		if err = builder.Finish(); err != nil {
   494  			return
   495  		}
   496  		fd.Close()
   497  		var tbl table.Table
   498  		tbl, err = sstable.OpenTable(filename, lc.kv.blockCache, lc.kv.indexCache)
   499  		if err != nil {
   500  			return
   501  		}
   502  		if tbl.Smallest().IsEmpty() {
   503  			tbl.Delete()
   504  		} else {
   505  			newTables = append(newTables, tbl)
   506  		}
   507  	}
   508  
   509  	stats := &y.CompactionStats{
   510  		KeysRead:     numRead,
   511  		BytesRead:    bytesRead,
   512  		KeysWrite:    numWrite,
   513  		BytesWrite:   bytesWrite,
   514  		KeysDiscard:  int(discardStats.numSkips),
   515  		BytesDiscard: int(discardStats.skippedBytes),
   516  	}
   517  	cd.nextLevel.metrics.UpdateCompactionStats(stats)
   518  	// Ensure created files' directory entries are visible.  We don't mind the extra latency
   519  	// from not doing this ASAP after all file creation has finished because this is a
   520  	// background operation.
   521  	err = syncDir(lc.kv.opt.Dir)
   522  	if err != nil {
   523  		log.Error("compact sync dir error", zap.Error(err))
   524  		return
   525  	}
   526  	sortTables(newTables)
   527  	log.Info("compact send discard stats", zap.Stringer("stats", discardStats))
   528  	if len(discardStats.ptrs) > 0 {
   529  		lc.kv.blobManger.discardCh <- discardStats
   530  	}
   531  	return
   532  }
   533  
   534  func buildChangeSet(cd *compactDef, newTables []table.Table) protos.ManifestChangeSet {
   535  	changes := []*protos.ManifestChange{}
   536  	for _, table := range newTables {
   537  		changes = append(changes,
   538  			newCreateChange(table.ID(), cd.nextLevel.level))
   539  	}
   540  	for _, table := range cd.top {
   541  		changes = append(changes, newDeleteChange(table.ID()))
   542  	}
   543  	for _, table := range cd.bot {
   544  		changes = append(changes, newDeleteChange(table.ID()))
   545  	}
   546  	return protos.ManifestChangeSet{Changes: changes}
   547  }
   548  
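         // compactDef describes a single compaction: the tables picked from thisLevel (top) and from
         // nextLevel (bot), the bottom tables that can be skipped because no top table overlaps them,
         // and the key ranges and sizes of both sides.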
   549  type compactDef struct {
   550  	thisLevel *levelHandler
   551  	nextLevel *levelHandler
   552  
   553  	top []table.Table
   554  	bot []table.Table
   555  
   556  	skippedTbls []table.Table
   557  
   558  	thisRange keyRange
   559  	nextRange keyRange
   560  
   561  	topSize     int64
   562  	topLeftIdx  int
   563  	topRightIdx int
   564  	botSize     int64
   565  	botLeftIdx  int
   566  	botRightIdx int
   567  }
   568  
   569  func (cd *compactDef) String() string {
   570  	return fmt.Sprintf("%d top:[%d:%d](%d), bot:[%d:%d](%d), skip:%d, write_amp:%.2f",
   571  		cd.thisLevel.level, cd.topLeftIdx, cd.topRightIdx, cd.topSize,
   572  		cd.botLeftIdx, cd.botRightIdx, cd.botSize, len(cd.skippedTbls), float64(cd.topSize+cd.botSize)/float64(cd.topSize))
   573  }
   574  
   575  func (cd *compactDef) lockLevels() {
   576  	cd.thisLevel.RLock()
   577  	cd.nextLevel.RLock()
   578  }
   579  
   580  func (cd *compactDef) unlockLevels() {
   581  	cd.nextLevel.RUnlock()
   582  	cd.thisLevel.RUnlock()
   583  }
   584  
   585  func (cd *compactDef) smallest() y.Key {
   586  	if len(cd.bot) > 0 && cd.nextRange.left.Compare(cd.thisRange.left) < 0 {
   587  		return cd.nextRange.left
   588  	}
   589  	return cd.thisRange.left
   590  }
   591  
   592  func (cd *compactDef) biggest() y.Key {
   593  	if len(cd.bot) > 0 && cd.nextRange.right.Compare(cd.thisRange.right) > 0 {
   594  		return cd.nextRange.right
   595  	}
   596  	return cd.thisRange.right
   597  }
   598  
   599  func (cd *compactDef) markTablesCompacting() {
   600  	for _, tbl := range cd.top {
   601  		tbl.MarkCompacting(true)
   602  	}
   603  	for _, tbl := range cd.bot {
   604  		tbl.MarkCompacting(true)
   605  	}
   606  	for _, tbl := range cd.skippedTbls {
   607  		tbl.MarkCompacting(true)
   608  	}
   609  }
   610  
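         // fillTablesL0 picks every level 0 table plus the overlapping tables on the next level for
         // compaction. It returns false if level 0 is empty or the ranges are already being compacted.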
   611  func (lc *levelsController) fillTablesL0(cd *compactDef) bool {
   612  	cd.lockLevels()
   613  	defer cd.unlockLevels()
   614  
   615  	if len(cd.thisLevel.tables) == 0 {
   616  		return false
   617  	}
   618  
   619  	cd.top = make([]table.Table, len(cd.thisLevel.tables))
   620  	copy(cd.top, cd.thisLevel.tables)
   621  	for _, t := range cd.top {
   622  		cd.topSize += t.Size()
   623  	}
   624  	cd.topRightIdx = len(cd.top)
   625  
   626  	cd.thisRange = infRange
   627  
   628  	kr := getKeyRange(cd.top)
   629  	left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, kr)
   630  	overlappingTables := cd.nextLevel.tables[left:right]
   631  	cd.botLeftIdx = left
   632  	cd.botRightIdx = right
   633  	lc.fillBottomTables(cd, overlappingTables)
   634  	for _, t := range cd.bot {
   635  		cd.botSize += t.Size()
   636  	}
   637  
   638  	if len(overlappingTables) == 0 { // the bottom-most level
   639  		cd.nextRange = kr
   640  	} else {
   641  		cd.nextRange = getKeyRange(overlappingTables)
   642  	}
   643  
   644  	if !lc.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
   645  		return false
   646  	}
   647  
   648  	return true
   649  }
   650  
   651  const minSkippedTableSize = 1024 * 1024
   652  
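         // fillBottomTables splits the overlapping next-level tables into those that must be rewritten (bot)
         // and those that can be left in place (skippedTbls) because no top table overlaps them and they are
         // at least minSkippedTableSize.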
   653  func (lc *levelsController) fillBottomTables(cd *compactDef, overlappingTables []table.Table) {
   654  	for _, t := range overlappingTables {
    655  		// If no top table overlaps the key range of a bottom table, we can skip that bottom table
    656  		// during compaction to reduce write amplification.
   657  		var added bool
   658  		for _, topTbl := range cd.top {
   659  			if topTbl.HasOverlap(t.Smallest(), t.Biggest(), true) {
   660  				cd.bot = append(cd.bot, t)
   661  				added = true
   662  				break
   663  			}
   664  		}
   665  		if !added {
   666  			if t.Size() >= minSkippedTableSize {
    667  				// We need to limit the minimum size of a table that can be skipped; otherwise
    668  				// the number of tables in a level will keep growing until we hit a
    669  				// "too many open files" error.
   670  				cd.skippedTbls = append(cd.skippedTbls, t)
   671  			} else {
   672  				cd.bot = append(cd.bot, t)
   673  			}
   674  		}
   675  	}
   676  }
   677  
   678  const maxCompactionExpandSize = 1 << 30 // 1GB
   679  
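         // fillTables picks the tables for a non-level-0 compaction: it starts from the single table with
         // the best top/bottom size ratio, then expands to neighbouring tables on both sides while the ratio
         // keeps improving and the total size stays under maxCompactionExpandSize.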
   680  func (lc *levelsController) fillTables(cd *compactDef) bool {
   681  	cd.lockLevels()
   682  	defer cd.unlockLevels()
   683  
   684  	if len(cd.thisLevel.tables) == 0 {
   685  		return false
   686  	}
   687  	this := make([]table.Table, len(cd.thisLevel.tables))
   688  	copy(this, cd.thisLevel.tables)
   689  	next := make([]table.Table, len(cd.nextLevel.tables))
   690  	copy(next, cd.nextLevel.tables)
   691  
    692  	// First pick the table with the max topSize/botSize ratio.
   693  	var candidateRatio float64
   694  	for i, t := range this {
   695  		if lc.isCompacting(cd.thisLevel.level, t) {
   696  			continue
   697  		}
   698  		left, right := getTablesInRange(next, t.Smallest(), t.Biggest())
   699  		if lc.isCompacting(cd.nextLevel.level, next[left:right]...) {
   700  			continue
   701  		}
   702  		botSize := sumTableSize(next[left:right])
   703  		ratio := calcRatio(t.Size(), botSize)
   704  		if ratio > candidateRatio {
   705  			candidateRatio = ratio
   706  			cd.topLeftIdx = i
   707  			cd.topRightIdx = i + 1
   708  			cd.top = this[cd.topLeftIdx:cd.topRightIdx:cd.topRightIdx]
   709  			cd.topSize = t.Size()
   710  			cd.botLeftIdx = left
   711  			cd.botRightIdx = right
   712  			cd.botSize = botSize
   713  		}
   714  	}
   715  	if len(cd.top) == 0 {
   716  		return false
   717  	}
   718  	bots := next[cd.botLeftIdx:cd.botRightIdx:cd.botRightIdx]
    719  	// Expand to the left to include more top tables as long as the ratio doesn't decrease and the
    720  	// total size does not exceed maxCompactionExpandSize.
   721  	for i := cd.topLeftIdx - 1; i >= 0; i-- {
   722  		t := this[i]
   723  		if lc.isCompacting(cd.thisLevel.level, t) {
   724  			break
   725  		}
   726  		left, right := getTablesInRange(next, t.Smallest(), t.Biggest())
   727  		if right < cd.botLeftIdx {
    728  			// A bottom table would be skipped, so stop expanding; it can be compacted in another run.
   729  			break
   730  		}
   731  		if lc.isCompacting(cd.nextLevel.level, next[left:cd.botLeftIdx]...) {
   732  			break
   733  		}
   734  		newTopSize := t.Size() + cd.topSize
   735  		newBotSize := sumTableSize(next[left:cd.botLeftIdx]) + cd.botSize
   736  		newRatio := calcRatio(newTopSize, newBotSize)
   737  		if newRatio > candidateRatio && (newTopSize+newBotSize) < maxCompactionExpandSize {
   738  			cd.top = append([]table.Table{t}, cd.top...)
   739  			cd.topLeftIdx--
   740  			bots = append(next[left:cd.botLeftIdx:cd.botLeftIdx], bots...)
   741  			cd.botLeftIdx = left
   742  			cd.topSize = newTopSize
   743  			cd.botSize = newBotSize
   744  		} else {
   745  			break
   746  		}
   747  	}
    748  	// Expand to the right to include more top tables as long as the ratio doesn't decrease and the
    749  	// total size does not exceed maxCompactionExpandSize.
   750  	for i := cd.topRightIdx; i < len(this); i++ {
   751  		t := this[i]
   752  		if lc.isCompacting(cd.thisLevel.level, t) {
   753  			break
   754  		}
   755  		left, right := getTablesInRange(next, t.Smallest(), t.Biggest())
   756  		if left > cd.botRightIdx {
    757  			// A bottom table would be skipped, so stop expanding; it can be compacted in another run.
   758  			break
   759  		}
   760  		if lc.isCompacting(cd.nextLevel.level, next[cd.botRightIdx:right]...) {
   761  			break
   762  		}
   763  		newTopSize := t.Size() + cd.topSize
   764  		newBotSize := sumTableSize(next[cd.botRightIdx:right]) + cd.botSize
   765  		newRatio := calcRatio(newTopSize, newBotSize)
   766  		if newRatio > candidateRatio && (newTopSize+newBotSize) < maxCompactionExpandSize {
   767  			cd.top = append(cd.top, t)
   768  			cd.topRightIdx++
   769  			bots = append(bots, next[cd.botRightIdx:right]...)
   770  			cd.botRightIdx = right
   771  			cd.topSize = newTopSize
   772  			cd.botSize = newBotSize
   773  		} else {
   774  			break
   775  		}
   776  	}
   777  	cd.thisRange = keyRange{left: cd.top[0].Smallest(), right: cd.top[len(cd.top)-1].Biggest()}
   778  	if len(bots) > 0 {
   779  		cd.nextRange = keyRange{left: bots[0].Smallest(), right: bots[len(bots)-1].Biggest()}
   780  	} else {
   781  		cd.nextRange = cd.thisRange
   782  	}
   783  	lc.fillBottomTables(cd, bots)
   784  	for _, t := range cd.skippedTbls {
   785  		cd.botSize -= t.Size()
   786  	}
   787  	return lc.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd)
   788  }
   789  
   790  func sumTableSize(tables []table.Table) int64 {
   791  	var size int64
   792  	for _, t := range tables {
   793  		size += t.Size()
   794  	}
   795  	return size
   796  }
   797  
   798  func calcRatio(topSize, botSize int64) float64 {
   799  	if botSize == 0 {
   800  		return float64(topSize)
   801  	}
   802  	return float64(topSize) / float64(botSize)
   803  }
   804  
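         // isCompacting reports whether the key range spanned by tables overlaps a compaction that is
         // already running on the given level.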
   805  func (lc *levelsController) isCompacting(level int, tables ...table.Table) bool {
   806  	if len(tables) == 0 {
   807  		return false
   808  	}
   809  	kr := keyRange{
   810  		left:  tables[0].Smallest(),
   811  		right: tables[len(tables)-1].Biggest(),
   812  	}
   813  	y.Assert(!kr.left.IsEmpty())
   814  	y.Assert(!kr.right.IsEmpty())
   815  	return lc.cstatus.overlapsWith(level, kr)
   816  }
   817  
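         // runCompactDef executes a prepared compaction. Above level 0, when no next-level tables are
         // involved, the top tables are simply moved down; otherwise they are rewritten via
         // compactBuildTables. The resulting change set is written to the manifest before the level
         // handlers swap their tables.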
   818  func (lc *levelsController) runCompactDef(l int, cd *compactDef, limiter *rate.Limiter, guard *epoch.Guard) error {
   819  	timeStart := time.Now()
   820  
   821  	thisLevel := cd.thisLevel
   822  	nextLevel := cd.nextLevel
   823  
   824  	var newTables []table.Table
   825  	var changeSet protos.ManifestChangeSet
   826  	var topMove bool
   827  	defer func() {
   828  		for _, tbl := range newTables {
   829  			tbl.MarkCompacting(false)
   830  		}
   831  		for _, tbl := range cd.skippedTbls {
   832  			tbl.MarkCompacting(false)
   833  		}
   834  	}()
   835  
   836  	if l > 0 && len(cd.bot) == 0 && len(cd.skippedTbls) == 0 {
    837  		// Level 0 is excluded from this move-down path, since its tables may overlap with each other.
   838  		newTables = cd.top
   839  		changeSet = protos.ManifestChangeSet{}
   840  		for _, t := range newTables {
   841  			changeSet.Changes = append(changeSet.Changes, newMoveDownChange(t.ID(), cd.nextLevel.level))
   842  		}
   843  		topMove = true
   844  	} else {
   845  		var err error
   846  		newTables, err = lc.compactBuildTables(l, cd, limiter, nil)
   847  		if err != nil {
   848  			return err
   849  		}
   850  		changeSet = buildChangeSet(cd, newTables)
   851  	}
   852  
   853  	// We write to the manifest _before_ we delete files (and after we created files)
   854  	if err := lc.kv.manifest.addChanges(changeSet.Changes, nil); err != nil {
   855  		return err
   856  	}
   857  
   858  	// See comment earlier in this function about the ordering of these ops, and the order in which
   859  	// we access levels when reading.
   860  	nextLevel.replaceTables(newTables, cd, guard)
   861  	thisLevel.deleteTables(cd.top, guard, topMove)
   862  
   863  	// Note: For level 0, while doCompact is running, it is possible that new tables are added.
   864  	// However, the tables are added only to the end, so it is ok to just delete the first table.
   865  
   866  	log.Info("compaction done",
   867  		zap.Stringer("def", cd), zap.Int("deleted", len(cd.top)+len(cd.bot)), zap.Int("added", len(newTables)),
   868  		zap.Duration("duration", time.Since(timeStart)))
   869  	return nil
   870  }
   871  
   872  // doCompact picks some table on level l and compacts it away to the next level.
   873  func (lc *levelsController) doCompact(p compactionPriority, guard *epoch.Guard) (bool, error) {
   874  	l := p.level
   875  	y.Assert(l+1 < lc.kv.opt.TableBuilderOptions.MaxLevels) // Sanity check.
   876  
   877  	cd := &compactDef{
   878  		thisLevel: lc.levels[l],
   879  		nextLevel: lc.levels[l+1],
   880  	}
   881  
   882  	log.Info("start compaction", zap.Int("level", p.level), zap.Float64("score", p.score))
   883  
   884  	// While picking tables to be compacted, both levels' tables are expected to
   885  	// remain unchanged.
   886  	if l == 0 {
   887  		if !lc.fillTablesL0(cd) {
   888  			log.Info("build compaction fill tables failed", zap.Int("level", l))
   889  			return false, nil
   890  		}
   891  	} else {
   892  		if !lc.fillTables(cd) {
   893  			log.Info("build compaction fill tables failed", zap.Int("level", l))
   894  			return false, nil
   895  		}
   896  	}
   897  	defer lc.cstatus.delete(cd) // Remove the ranges from compaction status.
   898  
   899  	log.Info("running compaction", zap.Stringer("def", cd))
   900  	if err := lc.runCompactDef(l, cd, lc.kv.limiter, guard); err != nil {
   901  		// This compaction couldn't be done successfully.
   902  		log.Info("compact failed", zap.Stringer("def", cd), zap.Error(err))
   903  		return false, err
   904  	}
   905  
   906  	log.Info("compaction done", zap.Int("level", cd.thisLevel.level))
   907  	return true, nil
   908  }
   909  
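         // addLevel0Table records the new table in the manifest and then adds it to level 0, stalling the
         // caller until level 0 has room whenever it is already full.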
   910  func (lc *levelsController) addLevel0Table(t table.Table, head *protos.HeadInfo) error {
   911  	// We update the manifest _before_ the table becomes part of a levelHandler, because at that
   912  	// point it could get used in some compaction.  This ensures the manifest file gets updated in
   913  	// the proper order. (That means this update happens before that of some compaction which
   914  	// deletes the table.)
   915  	err := lc.kv.manifest.addChanges([]*protos.ManifestChange{
   916  		newCreateChange(t.ID(), 0),
   917  	}, head)
   918  	if err != nil {
   919  		return err
   920  	}
   921  
   922  	for !lc.levels[0].tryAddLevel0Table(t) {
   923  		// Stall. Make sure all levels are healthy before we unstall.
   924  		var timeStart time.Time
   925  		{
   926  			log.Warn("STALLED STALLED STALLED", zap.Duration("duration", time.Since(lastUnstalled)))
   927  			lc.cstatus.RLock()
   928  			for i := 0; i < lc.kv.opt.TableBuilderOptions.MaxLevels; i++ {
   929  				log.Warn("dump level status", zap.Int("level", i), zap.String("status", lc.cstatus.levels[i].debug()),
   930  					zap.Int64("size", lc.levels[i].getTotalSize()))
   931  			}
   932  			lc.cstatus.RUnlock()
   933  			timeStart = time.Now()
   934  		}
   935  		// Before we unstall, we need to make sure that level 0 is healthy. Otherwise, we
   936  		// will very quickly fill up level 0 again.
   937  		for i := 0; ; i++ {
   938  			// It's crucial that this behavior replicates pickCompactLevels' behavior in
   939  			// computing compactability in order to guarantee progress.
   940  			// Break the loop once L0 has enough space to accommodate new tables.
   941  			if !lc.isL0Compactable() {
   942  				break
   943  			}
   944  			time.Sleep(10 * time.Millisecond)
   945  			if i%100 == 0 {
   946  				prios := lc.pickCompactLevels()
   947  				log.S().Warnf("waiting to add level 0 table, %+v", prios)
   948  				i = 0
   949  			}
   950  		}
   951  		log.Info("UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED", zap.Duration("duration", time.Since(timeStart)))
   952  		lastUnstalled = time.Now()
   953  	}
   954  
   955  	return nil
   956  }
   957  
   958  func (s *levelsController) close() error {
   959  	err := s.cleanupLevels()
   960  	return errors.Wrap(err, "levelsController.Close")
   961  }
   962  
    963  // get returns the found value if any. If not found, we return an empty y.ValueStruct.
   964  func (s *levelsController) get(key y.Key, keyHash uint64) y.ValueStruct {
   965  	// It's important that we iterate the levels from 0 on upward.  The reason is, if we iterated
   966  	// in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could
   967  	// read level L's tables post-compaction and level L+1's tables pre-compaction.  (If we do
   968  	// parallelize this, we will need to call the h.RLock() function by increasing order of level
   969  	// number.)
   970  	start := time.Now()
   971  	defer s.kv.metrics.LSMGetDuration.Observe(time.Since(start).Seconds())
   972  	for _, h := range s.levels {
   973  		vs := h.get(key, keyHash) // Calls h.RLock() and h.RUnlock().
   974  		if vs.Valid() {
   975  			return vs
   976  		}
   977  	}
   978  	return y.ValueStruct{}
   979  }
   980  
   981  func (s *levelsController) multiGet(pairs []keyValuePair) {
   982  	start := time.Now()
   983  	for _, h := range s.levels {
   984  		h.multiGet(pairs)
   985  	}
   986  	s.kv.metrics.LSMMultiGetDuration.Observe(time.Since(start).Seconds())
   987  }
   988  
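         // appendIteratorsReversed adds one ConcatIterator per table, iterating the slice in reverse so that
         // the newest level 0 tables (which are appended at the end of the slice) are iterated first.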
   989  func appendIteratorsReversed(out []y.Iterator, th []table.Table, reversed bool) []y.Iterator {
   990  	for i := len(th) - 1; i >= 0; i-- {
   991  		// This will increment the reference of the table handler.
   992  		out = append(out, table.NewConcatIterator(th[i:i+1], reversed))
   993  	}
   994  	return out
   995  }
   996  
   997  // appendIterators appends iterators to an array of iterators, for merging.
   998  // Note: This obtains references for the table handlers. Remember to close these iterators.
   999  func (s *levelsController) appendIterators(
  1000  	iters []y.Iterator, opts *IteratorOptions) []y.Iterator {
  1001  	// Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing
  1002  	// data when there's a compaction.
  1003  	for _, level := range s.levels {
  1004  		iters = level.appendIterators(iters, opts)
  1005  	}
  1006  	return iters
  1007  }
  1008  
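         // TableInfo describes a single table: its ID, level, and the user-key range it covers.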
  1009  type TableInfo struct {
  1010  	ID    uint64
  1011  	Level int
  1012  	Left  []byte
  1013  	Right []byte
  1014  }
  1015  
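         // getTableInfo returns a snapshot of every table's ID, level and key range, sorted by level then ID.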
  1016  func (lc *levelsController) getTableInfo() (result []TableInfo) {
  1017  	for _, l := range lc.levels {
  1018  		for _, t := range l.tables {
  1019  			info := TableInfo{
  1020  				ID:    t.ID(),
  1021  				Level: l.level,
  1022  				Left:  t.Smallest().UserKey,
  1023  				Right: t.Biggest().UserKey,
  1024  			}
  1025  			result = append(result, info)
  1026  		}
  1027  	}
  1028  	sort.Slice(result, func(i, j int) bool {
  1029  		if result[i].Level != result[j].Level {
  1030  			return result[i].Level < result[j].Level
  1031  		}
  1032  		return result[i].ID < result[j].ID
  1033  	})
  1034  	return
  1035  }