github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/db.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package badger
    18  
    19  import (
    20  	"bytes"
    21  	"io"
    22  	"math"
    23  	"os"
    24  	"path/filepath"
    25  	"sort"
    26  	"strconv"
    27  	"sync"
    28  	"sync/atomic"
    29  	"time"
    30  
    31  	"github.com/dgryski/go-farm"
    32  	"github.com/pingcap/badger/cache"
    33  	"github.com/pingcap/badger/directio"
    34  	"github.com/pingcap/badger/epoch"
    35  	"github.com/pingcap/badger/options"
    36  	"github.com/pingcap/badger/protos"
    37  	"github.com/pingcap/badger/table"
    38  	"github.com/pingcap/badger/table/memtable"
    39  	"github.com/pingcap/badger/table/sstable"
    40  	"github.com/pingcap/badger/y"
    41  	"github.com/pingcap/errors"
    42  	"github.com/pingcap/log"
    43  	"go.uber.org/zap"
    44  	"golang.org/x/time/rate"
    45  )
    46  
var (
	// txnKey is the key of the marker entry that signals the end of the entries
	// belonging to one transaction.
	txnKey = []byte("!badger!txn") // For indicating end of entries in txn.
)
    50  
// closers groups the y.Closer handles that DB.Close signals to stop the DB's
// background work. Handles that are never created stay nil; Close nil-checks
// most of them before signalling.
type closers struct {
	updateSize      *y.Closer // stops the db.updateSize goroutine started in Open
	compactors      *y.Closer // stops compaction; only created when the DB is not read-only
	resourceManager *y.Closer // handed to epoch.NewResourceManager in Open
	blobManager     *y.Closer // signalled in Close; presumably created by the blob manager - TODO confirm
	memtable        *y.Closer // stops the memtable pre-allocator and the memtable flusher
	writes          *y.Closer // stops the write worker started at the end of Open
}
    59  
// DB provides the various functions required to interact with Badger.
// DB is thread-safe.
//
// A DB is obtained from Open and must be released with Close.
type DB struct {
	// dirLockGuard is the lock acquired on opt.Dir in Open; it is released in Close.
	dirLockGuard *directoryLockGuard
	// nil if Dir and ValueDir are the same
	valueDirGuard *directoryLockGuard

	// mtbls holds a *memTables; it is replaced as a whole whenever a new mutable
	// memtable is installed.
	closers   closers
	mtbls     atomic.Value
	opt       Options
	manifest  *manifestFile
	lc        *levelsController
	vlog      valueLog
	logOff    logOffset // less than or equal to a pointer to the last vlog value put into mt
	syncedFid uint32    // The log fid that has been flushed to SST, older log files are safe to be deleted.
	writeCh   chan *request
	flushChan chan *flushTask // For flushing memtables.
	ingestCh  chan *ingestTask

	// mem table buffer to avoid expensive allocating big chunk of memory
	memTableCh chan *memtable.Table

	orc           *oracle
	safeTsTracker safeTsTracker

	// limiter is only set when opt.TableBuilderOptions.BytesPerSecond is positive.
	limiter *rate.Limiter

	// Both caches stay nil when opt.MaxBlockCacheSize is zero.
	blockCache *cache.Cache
	indexCache *cache.Cache

	metrics      *y.MetricsSet
	lsmSize      int64
	vlogSize     int64
	volatileMode bool

	// NOTE(review): the field name is a misspelling of "blobManager"; it is kept
	// because other code refers to it by this name.
	blobManger blobManager

	resourceMgr *epoch.ResourceManager
}
    99  
// memTables is a snapshot of the memtables of a DB. A new value is built by
// newMemTables every time a fresh mutable memtable is installed.
type memTables struct {
	tables []*memtable.Table // tables from new to old, the first one is mutable.
	length uint32            // The length is updated by the flusher.
}
   104  
   105  func (tbls *memTables) getMutable() *memtable.Table {
   106  	return tbls.tables[0]
   107  }
   108  
   109  func newMemTables(mt *memtable.Table, old *memTables) *memTables {
   110  	newTbls := &memTables{}
   111  	newTbls.tables = make([]*memtable.Table, 1+atomic.LoadUint32(&old.length))
   112  	newTbls.tables[0] = mt
   113  	copy(newTbls.tables[1:], old.tables)
   114  	newTbls.length = uint32(len(newTbls.tables))
   115  	return newTbls
   116  }
   117  
const (
	// kvWriteChCapacity is the buffer size of DB.writeCh, the channel that
	// carries write requests to the write worker.
	kvWriteChCapacity = 1000
)
   121  
   122  func replayFunction(out *DB) func(Entry) error {
   123  	type txnEntry struct {
   124  		nk y.Key
   125  		v  y.ValueStruct
   126  	}
   127  
   128  	var txn []txnEntry
   129  	var lastCommit uint64
   130  
   131  	toLSM := func(nk y.Key, vs y.ValueStruct) {
   132  		e := memtable.Entry{Key: nk.UserKey, Value: vs}
   133  		mTbls := out.mtbls.Load().(*memTables)
   134  		if out.ensureRoomForWrite(mTbls.getMutable(), e.EstimateSize()) == out.opt.MaxMemTableSize {
   135  			mTbls = out.mtbls.Load().(*memTables)
   136  		}
   137  		mTbls.getMutable().PutToSkl(nk.UserKey, vs)
   138  	}
   139  
   140  	first := true
   141  	return func(e Entry) error { // Function for replaying.
   142  		if first {
   143  			log.Info("replay wal", zap.Stringer("first key", e.Key))
   144  		}
   145  		first = false
   146  
   147  		if out.orc.curRead < e.Key.Version {
   148  			out.orc.curRead = e.Key.Version
   149  		}
   150  
   151  		var nk y.Key
   152  		nk.Copy(e.Key)
   153  		nv := make([]byte, len(e.Value))
   154  		copy(nv, e.Value)
   155  
   156  		v := y.ValueStruct{
   157  			Value:    nv,
   158  			Meta:     e.meta,
   159  			UserMeta: e.UserMeta,
   160  			Version:  nk.Version,
   161  		}
   162  
   163  		if e.meta&bitFinTxn > 0 {
   164  			txnTs, err := strconv.ParseUint(string(e.Value), 10, 64)
   165  			if err != nil {
   166  				return errors.Wrapf(err, "Unable to parse txn fin: %q", e.Value)
   167  			}
   168  			if !out.IsManaged() {
   169  				y.Assert(lastCommit == txnTs)
   170  			}
   171  			y.Assert(len(txn) > 0)
   172  			// Got the end of txn. Now we can store them.
   173  			for _, t := range txn {
   174  				toLSM(t.nk, t.v)
   175  			}
   176  			txn = txn[:0]
   177  			lastCommit = 0
   178  
   179  		} else if e.meta&bitTxn == 0 {
   180  			// This entry is from a rewrite.
   181  			toLSM(nk, v)
   182  
   183  			// We shouldn't get this entry in the middle of a transaction.
   184  			y.Assert(lastCommit == 0)
   185  			y.Assert(len(txn) == 0)
   186  
   187  		} else {
   188  			if lastCommit == 0 {
   189  				lastCommit = e.Key.Version
   190  			}
   191  			if !out.IsManaged() {
   192  				y.Assert(lastCommit == e.Key.Version)
   193  			}
   194  			te := txnEntry{nk: nk, v: v}
   195  			txn = append(txn, te)
   196  		}
   197  		return nil
   198  	}
   199  }
   200  
   201  // Open returns a new DB object.
   202  func Open(opt Options) (db *DB, err error) {
   203  	opt.maxBatchSize = (15 * opt.MaxMemTableSize) / 100
   204  	opt.maxBatchCount = opt.maxBatchSize / int64(memtable.MaxNodeSize)
   205  
   206  	if opt.ValueThreshold > math.MaxUint16-16 {
   207  		return nil, ErrValueThreshold
   208  	}
   209  
   210  	if opt.ReadOnly {
   211  		// Can't truncate if the DB is read only.
   212  		opt.Truncate = false
   213  	}
   214  
   215  	for _, path := range []string{opt.Dir, opt.ValueDir} {
   216  		dirExists, err := exists(path)
   217  		if err != nil {
   218  			return nil, y.Wrapf(err, "Invalid Dir: %q", path)
   219  		}
   220  		if !dirExists {
   221  			if opt.ReadOnly {
   222  				return nil, y.Wrapf(err, "Cannot find Dir for read-only open: %q", path)
   223  			}
   224  			// Try to create the directory
   225  			err = os.Mkdir(path, 0700)
   226  			if err != nil {
   227  				return nil, y.Wrapf(err, "Error Creating Dir: %q", path)
   228  			}
   229  		}
   230  	}
   231  	absDir, err := filepath.Abs(opt.Dir)
   232  	if err != nil {
   233  		return nil, err
   234  	}
   235  	absValueDir, err := filepath.Abs(opt.ValueDir)
   236  	if err != nil {
   237  		return nil, err
   238  	}
   239  	var dirLockGuard, valueDirLockGuard *directoryLockGuard
   240  	dirLockGuard, err = acquireDirectoryLock(opt.Dir, lockFile, opt.ReadOnly)
   241  	if err != nil {
   242  		return nil, err
   243  	}
   244  	defer func() {
   245  		if dirLockGuard != nil {
   246  			_ = dirLockGuard.release()
   247  		}
   248  	}()
   249  	if absValueDir != absDir {
   250  		valueDirLockGuard, err = acquireDirectoryLock(opt.ValueDir, lockFile, opt.ReadOnly)
   251  		if err != nil {
   252  			return nil, err
   253  		}
   254  	}
   255  	defer func() {
   256  		if valueDirLockGuard != nil {
   257  			_ = valueDirLockGuard.release()
   258  		}
   259  	}()
   260  	if !(opt.ValueLogFileSize <= 2<<30 && opt.ValueLogFileSize >= 1<<20) {
   261  		return nil, ErrValueLogSize
   262  	}
   263  	manifestFile, manifest, err := openOrCreateManifestFile(opt.Dir, opt.ReadOnly)
   264  	if err != nil {
   265  		return nil, err
   266  	}
   267  	defer func() {
   268  		if manifestFile != nil {
   269  			_ = manifestFile.close()
   270  		}
   271  	}()
   272  
   273  	orc := &oracle{
   274  		isManaged:  opt.ManagedTxns,
   275  		nextCommit: 1,
   276  		commits:    make(map[uint64]uint64),
   277  	}
   278  
   279  	var blkCache, idxCache *cache.Cache
   280  	if opt.MaxBlockCacheSize != 0 {
   281  		var err error
   282  		blkCache, err = cache.NewCache(&cache.Config{
   283  			// The expected keys is MaxCacheSize / BlockSize, then x10 as documentation suggests.
   284  			NumCounters: opt.MaxBlockCacheSize / int64(opt.TableBuilderOptions.BlockSize) * 10,
   285  			MaxCost:     opt.MaxBlockCacheSize,
   286  			BufferItems: 64,
   287  			OnEvict:     sstable.OnEvict,
   288  		})
   289  		if err != nil {
   290  			return nil, errors.Wrap(err, "failed to create block cache")
   291  		}
   292  
   293  		indexSizeHint := float64(opt.TableBuilderOptions.MaxTableSize) / 6.0
   294  		idxCache, err = cache.NewCache(&cache.Config{
   295  			NumCounters: int64(float64(opt.MaxIndexCacheSize) / indexSizeHint * 10),
   296  			MaxCost:     opt.MaxIndexCacheSize,
   297  			BufferItems: 64,
   298  		})
   299  		if err != nil {
   300  			return nil, errors.Wrap(err, "failed to create index cache")
   301  		}
   302  	}
   303  	db = &DB{
   304  		flushChan:     make(chan *flushTask, opt.NumMemtables),
   305  		writeCh:       make(chan *request, kvWriteChCapacity),
   306  		memTableCh:    make(chan *memtable.Table, 1),
   307  		ingestCh:      make(chan *ingestTask),
   308  		opt:           opt,
   309  		manifest:      manifestFile,
   310  		dirLockGuard:  dirLockGuard,
   311  		valueDirGuard: valueDirLockGuard,
   312  		orc:           orc,
   313  		metrics:       y.NewMetricSet(opt.Dir),
   314  		blockCache:    blkCache,
   315  		indexCache:    idxCache,
   316  		volatileMode:  opt.VolatileMode,
   317  	}
   318  	db.vlog.metrics = db.metrics
   319  
   320  	rateLimit := opt.TableBuilderOptions.BytesPerSecond
   321  	if rateLimit > 0 {
   322  		db.limiter = rate.NewLimiter(rate.Limit(rateLimit), rateLimit)
   323  	}
   324  
   325  	// Calculate initial size.
   326  	db.calculateSize()
   327  	db.closers.updateSize = y.NewCloser(1)
   328  	go db.updateSize(db.closers.updateSize)
   329  
   330  	db.closers.resourceManager = y.NewCloser(0)
   331  	db.resourceMgr = epoch.NewResourceManager(db.closers.resourceManager, &db.safeTsTracker)
   332  
   333  	// newLevelsController potentially loads files in directory.
   334  	if db.lc, err = newLevelsController(db, &manifest, db.resourceMgr, opt.TableBuilderOptions); err != nil {
   335  		return nil, err
   336  	}
   337  
   338  	db.closers.memtable = y.NewCloser(1)
   339  	go func() {
   340  		lc := db.closers.memtable
   341  		for {
   342  			select {
   343  			case db.memTableCh <- memtable.New(arenaSize(db.opt), db.lc.reserveFileID()):
   344  			case <-lc.HasBeenClosed():
   345  				lc.Done()
   346  				return
   347  			}
   348  		}
   349  	}()
   350  	db.mtbls.Store(newMemTables(<-db.memTableCh, &memTables{}))
   351  
   352  	if err = db.blobManger.Open(db, opt); err != nil {
   353  		return nil, err
   354  	}
   355  
   356  	if !opt.ReadOnly {
   357  		db.closers.compactors = y.NewCloser(1)
   358  		db.lc.startCompact(db.closers.compactors)
   359  
   360  		db.closers.memtable.AddRunning(1)
   361  		go db.runFlushMemTable(db.closers.memtable) // Need levels controller to be up.
   362  	}
   363  
   364  	if err = db.vlog.Open(db, opt); err != nil {
   365  		return nil, err
   366  	}
   367  
   368  	var logOff logOffset
   369  	head := manifest.Head
   370  	if head != nil {
   371  		db.orc.curRead = head.Version
   372  		logOff.fid = head.LogID
   373  		logOff.offset = head.LogOffset
   374  	}
   375  
   376  	// lastUsedCasCounter will either be the value stored in !badger!head, or some subsequently
   377  	// written value log entry that we replay.  (Subsequent value log entries might be _less_
   378  	// than lastUsedCasCounter, if there was value log gc so we have to max() values while
   379  	// replaying.)
   380  	// out.lastUsedCasCounter = item.casCounter
   381  	// TODO: Figure this out. This would update the read timestamp, and set nextCommitTs.
   382  
   383  	replayCloser := startWriteWorker(db)
   384  
   385  	if err = db.vlog.Replay(logOff, replayFunction(db)); err != nil {
   386  		return db, err
   387  	}
   388  
   389  	replayCloser.SignalAndWait() // Wait for replay to be applied first.
   390  	// Now that we have the curRead, we can update the nextCommit.
   391  	db.orc.Lock()
   392  	db.orc.nextCommit = db.orc.curRead + 1
   393  	db.orc.Unlock()
   394  
   395  	db.writeCh = make(chan *request, kvWriteChCapacity)
   396  	db.closers.writes = startWriteWorker(db)
   397  
   398  	valueDirLockGuard = nil
   399  	dirLockGuard = nil
   400  	manifestFile = nil
   401  	return db, nil
   402  }
   403  
   404  // DeleteFilesInRange delete files in [start, end).
   405  // If some file contains keys outside the range, they will not be deleted.
   406  // This function is designed to reclaim space quickly.
   407  // If you want to ensure no future transaction can read keys in range,
   408  // considering iterate and delete the remained keys, or using compaction filter to cleanup them asynchronously.
   409  func (db *DB) DeleteFilesInRange(start, end []byte) {
   410  	var (
   411  		changes   []*protos.ManifestChange
   412  		pruneTbls []table.Table
   413  		startKey  = y.KeyWithTs(start, math.MaxUint64)
   414  		endKey    = y.KeyWithTs(end, 0)
   415  		guard     = db.resourceMgr.Acquire()
   416  	)
   417  
   418  	for level, lc := range db.lc.levels {
   419  		lc.Lock()
   420  		left, right := 0, len(lc.tables)
   421  		if lc.level > 0 {
   422  			left, right = getTablesInRange(lc.tables, startKey, endKey)
   423  		}
   424  		if left >= right {
   425  			lc.Unlock()
   426  			continue
   427  		}
   428  
   429  		newTables := lc.tables[:left]
   430  		for _, tbl := range lc.tables[left:right] {
   431  			if !isRangeCoversTable(startKey, endKey, tbl) || tbl.IsCompacting() {
   432  				newTables = append(newTables, tbl)
   433  				continue
   434  			}
   435  			pruneTbls = append(pruneTbls, tbl)
   436  			changes = append(changes, newDeleteChange(tbl.ID()))
   437  		}
   438  		newTables = append(newTables, lc.tables[right:]...)
   439  		for i := len(newTables); i < len(lc.tables); i++ {
   440  			lc.tables[i] = nil
   441  		}
   442  		assertTablesOrder(level, newTables, nil)
   443  		lc.tables = newTables
   444  		lc.Unlock()
   445  	}
   446  
   447  	db.manifest.addChanges(changes, nil)
   448  	var discardStats DiscardStats
   449  	deletes := make([]epoch.Resource, len(pruneTbls))
   450  	for i, tbl := range pruneTbls {
   451  		it := tbl.NewIterator(false)
   452  		// TODO: use rate limiter to avoid burst IO.
   453  		for it.Rewind(); it.Valid(); y.NextAllVersion(it) {
   454  			discardStats.collect(it.Value())
   455  		}
   456  		deletes[i] = tbl
   457  		it.Close()
   458  	}
   459  	if len(discardStats.ptrs) > 0 {
   460  		db.blobManger.discardCh <- &discardStats
   461  	}
   462  	guard.Delete(deletes)
   463  	guard.Done()
   464  }
   465  
   466  func isRangeCoversTable(start, end y.Key, t table.Table) bool {
   467  	left := start.Compare(t.Smallest()) <= 0
   468  	right := t.Biggest().Compare(end) < 0
   469  	return left && right
   470  }
   471  
   472  // NewExternalTableBuilder returns a new sst builder.
   473  func (db *DB) NewExternalTableBuilder(f *os.File, compression options.CompressionType, limiter *rate.Limiter) *sstable.Builder {
   474  	return sstable.NewExternalTableBuilder(f, limiter, db.opt.TableBuilderOptions, compression)
   475  }
   476  
// ErrExternalTableOverlap is returned by IngestExternalFiles when the key
// ranges of the external tables overlap.
var ErrExternalTableOverlap = errors.New("keys of external tables has overlap")
   479  
// ExternalTableSpec describes one externally built table that is passed to
// IngestExternalFiles.
type ExternalTableSpec struct {
	// Filename is the path of the table file. The matching index file is
	// located through sstable.IndexFilename(Filename).
	Filename string
}
   483  
   484  // IngestExternalFiles ingest external constructed tables into DB.
   485  // Note: insure there is no concurrent write overlap with tables to be ingested.
   486  func (db *DB) IngestExternalFiles(files []ExternalTableSpec) (int, error) {
   487  	tbls, err := db.prepareExternalFiles(files)
   488  	if err != nil {
   489  		return 0, err
   490  	}
   491  
   492  	if err := db.checkExternalTables(tbls); err != nil {
   493  		return 0, err
   494  	}
   495  
   496  	task := &ingestTask{tbls: tbls}
   497  	task.Add(1)
   498  	db.ingestCh <- task
   499  	task.Wait()
   500  	return task.cnt, task.err
   501  }
   502  
   503  func (db *DB) prepareExternalFiles(specs []ExternalTableSpec) ([]table.Table, error) {
   504  	tbls := make([]table.Table, len(specs))
   505  	for i, spec := range specs {
   506  		id := db.lc.reserveFileID()
   507  		filename := sstable.NewFilename(id, db.opt.Dir)
   508  
   509  		err := os.Link(spec.Filename, filename)
   510  		if err != nil {
   511  			return nil, err
   512  		}
   513  
   514  		err = os.Link(sstable.IndexFilename(spec.Filename), sstable.IndexFilename(filename))
   515  		if err != nil {
   516  			return nil, err
   517  		}
   518  
   519  		tbl, err := sstable.OpenTable(filename, db.blockCache, db.indexCache)
   520  		if err != nil {
   521  			return nil, err
   522  		}
   523  
   524  		tbls[i] = tbl
   525  	}
   526  
   527  	sort.Slice(tbls, func(i, j int) bool {
   528  		return tbls[i].Smallest().Compare(tbls[j].Smallest()) < 0
   529  	})
   530  
   531  	return tbls, syncDir(db.lc.kv.opt.Dir)
   532  }
   533  
   534  func (db *DB) checkExternalTables(tbls []table.Table) error {
   535  	keys := make([][]byte, 0, len(tbls)*2)
   536  	for _, t := range tbls {
   537  		keys = append(keys, t.Smallest().UserKey, t.Biggest().UserKey)
   538  	}
   539  	ok := sort.SliceIsSorted(keys, func(i, j int) bool {
   540  		return bytes.Compare(keys[i], keys[j]) < 0
   541  	})
   542  	if !ok {
   543  		return ErrExternalTableOverlap
   544  	}
   545  
   546  	for i := 1; i < len(keys)-1; i += 2 {
   547  		if bytes.Compare(keys[i], keys[i+1]) == 0 {
   548  			return ErrExternalTableOverlap
   549  		}
   550  	}
   551  
   552  	return nil
   553  }
   554  
// CacheMetrics returns the metrics for the underlying cache.
// It currently always returns nil because cache metrics are disabled (see the
// comment in the body).
func (db *DB) CacheMetrics() *cache.Metrics {
	// Do not enable ristretto metrics in badger until issue
	// https://github.com/dgraph-io/ristretto/issues/92 is resolved.
	// return db.blockCache.Metrics()
	return nil
}
   562  
// Close closes a DB. It's crucial to call it to ensure all the pending updates
// make their way to disk. Calling DB.Close() multiple times is not safe and would
// cause panic.
//
// The shutdown order is: stop the write worker, close the value log, flush the
// mutable memtable (unless it is empty or the DB is in volatile mode), stop the
// flusher and the compactors, optionally compact level 0, stop the blob and
// resource managers, close the levels controller and the caches, release the
// directory locks, close the manifest and finally fsync both directories.
//
// Only the first error encountered is returned: every later step is still
// executed, but its error is only recorded while err is still nil.
func (db *DB) Close() (err error) {
	log.Info("Closing database")

	// Stop writes next.
	db.closers.writes.SignalAndWait()

	// Now close the value log.
	if vlogErr := db.vlog.Close(); err == nil {
		err = errors.Wrap(vlogErr, "DB.Close")
	}

	// Make sure that block writer is done pushing stuff into memtable!
	// Otherwise, you will have a race condition: we are trying to flush memtables
	// and remove them completely, while the block / memtable writer is still
	// trying to push stuff into the memtable. This will also resolve the value
	// offset problem: as we push into memtable, we update value offsets there.
	mTbls := db.mtbls.Load().(*memTables)
	if !mTbls.getMutable().Empty() && !db.volatileMode {
		log.Info("Flushing memtable")
		// A nil table is installed as the mutable one, since no further writes
		// are accepted at this point.
		db.mtbls.Store(newMemTables(nil, mTbls))
		db.flushChan <- newFlushTask(mTbls.getMutable(), db.logOff)
	}
	db.flushChan <- newFlushTask(nil, logOffset{}) // Tell flusher to quit.

	if db.closers.memtable != nil {
		db.closers.memtable.SignalAndWait()
		log.Info("Memtable flushed")
	}
	if db.closers.compactors != nil {
		db.closers.compactors.SignalAndWait()
		log.Info("Compaction finished")
	}
	if db.opt.CompactL0WhenClose && !db.volatileMode {
		// Force Compact L0
		// We don't need to care about cstatus since no parallel compaction is running.
		cd := &CompactDef{}
		guard := db.resourceMgr.Acquire()
		// NOTE(review): this guard is only released when Close returns, i.e. after
		// the resource manager closer below has been signalled - confirm that this
		// ordering is intended.
		defer guard.Done()
		if cd.fillTablesL0(&db.lc.cstatus, db.lc.levels[0], db.lc.levels[1]) {
			// A failed compaction is only logged; it does not fail Close.
			if err := db.lc.runCompactDef(cd, guard); err != nil {
				log.Info("LOG Compact FAILED", zap.Stringer("compact def", cd), zap.Error(err))
			}
		} else {
			log.Info("fillTables failed for level zero. No compaction required")
		}
	}

	if db.closers.blobManager != nil {
		db.closers.blobManager.SignalAndWait()
		log.Info("BlobManager finished")
	}
	if db.closers.resourceManager != nil {
		db.closers.resourceManager.SignalAndWait()
		log.Info("ResourceManager finished")
	}

	if lcErr := db.lc.close(); err == nil {
		err = errors.Wrap(lcErr, "DB.Close")
	}
	log.Info("Waiting for closer")
	db.closers.updateSize.SignalAndWait()
	// The caches are nil when no block cache size was configured.
	if db.blockCache != nil {
		db.blockCache.Close()
	}

	if db.indexCache != nil {
		db.indexCache.Close()
	}

	if db.dirLockGuard != nil {
		if guardErr := db.dirLockGuard.release(); err == nil {
			err = errors.Wrap(guardErr, "DB.Close")
		}
	}
	// valueDirGuard is nil if Dir and ValueDir are the same.
	if db.valueDirGuard != nil {
		if guardErr := db.valueDirGuard.release(); err == nil {
			err = errors.Wrap(guardErr, "DB.Close")
		}
	}
	if manifestErr := db.manifest.close(); err == nil {
		err = errors.Wrap(manifestErr, "DB.Close")
	}

	// Fsync directories to ensure that lock file, and any other removed files whose directory
	// we haven't specifically fsynced, are guaranteed to have their directory entry removal
	// persisted to disk.
	if syncErr := syncDir(db.opt.Dir); err == nil {
		err = errors.Wrap(syncErr, "DB.Close")
	}
	if syncErr := syncDir(db.opt.ValueDir); err == nil {
		err = errors.Wrap(syncErr, "DB.Close")
	}

	return err
}
   661  
const (
	// lockFile is the name of the lock file that Open passes to
	// acquireDirectoryLock for opt.Dir and opt.ValueDir.
	lockFile = "LOCK"
)
   665  
   666  // When you create or delete a file, you have to ensure the directory entry for the file is synced
   667  // in order to guarantee the file is visible (if the system crashes).  (See the man page for fsync,
   668  // or see https://github.com/coreos/etcd/issues/6368 for an example.)
   669  func syncDir(dir string) error {
   670  	f, err := openDir(dir)
   671  	if err != nil {
   672  		return errors.Wrapf(err, "While opening directory: %s.", dir)
   673  	}
   674  	err = f.Sync()
   675  	closeErr := f.Close()
   676  	if err != nil {
   677  		return errors.Wrapf(err, "While syncing directory: %s.", dir)
   678  	}
   679  	return errors.Wrapf(closeErr, "While closing directory: %s.", dir)
   680  }
   681  
   682  // getMemtables returns the current memtables.
   683  func (db *DB) getMemTables() []*memtable.Table {
   684  	tbls := db.mtbls.Load().(*memTables)
   685  	l := atomic.LoadUint32(&tbls.length)
   686  	return tbls.tables[:l]
   687  }
   688  
   689  // get returns the value in memtable or disk for given key.
   690  // Note that value will include meta byte.
   691  //
   692  // IMPORTANT: We should never write an entry with an older timestamp for the same key, We need to
   693  // maintain this invariant to search for the latest value of a key, or else we need to search in all
   694  // tables and find the max version among them.  To maintain this invariant, we also need to ensure
   695  // that all versions of a key are always present in the same table from level 1, because compaction
   696  // can push any table down.
   697  func (db *DB) get(key y.Key) y.ValueStruct {
   698  	tables := db.getMemTables() // Lock should be released.
   699  
   700  	db.metrics.NumGets.Inc()
   701  	for _, table := range tables {
   702  		db.metrics.NumMemtableGets.Inc()
   703  		vs, err := table.Get(key, 0)
   704  		if err != nil {
   705  			log.Error("search table meets error", zap.Error(err))
   706  		}
   707  		if vs.Valid() {
   708  			return vs
   709  		}
   710  	}
   711  	keyHash := farm.Fingerprint64(key.UserKey)
   712  	return db.lc.get(key, keyHash)
   713  }
   714  
   715  func (db *DB) multiGet(pairs []keyValuePair) {
   716  	tables := db.getMemTables() // Lock should be released.
   717  
   718  	var foundCount, mtGets int
   719  	for _, table := range tables {
   720  		for j := range pairs {
   721  			pair := &pairs[j]
   722  			if pair.found {
   723  				continue
   724  			}
   725  			for {
   726  				val, err := table.Get(pair.key, 0)
   727  				if err != nil {
   728  					log.Error("search table meets error", zap.Error(err))
   729  				}
   730  				if val.Valid() {
   731  					pair.val = val
   732  					pair.found = true
   733  					foundCount++
   734  				}
   735  				mtGets++
   736  				break
   737  			}
   738  		}
   739  	}
   740  	db.metrics.NumMemtableGets.Add(float64(mtGets))
   741  	db.metrics.NumGets.Add(float64(len(pairs)))
   742  
   743  	if foundCount == len(pairs) {
   744  		return
   745  	}
   746  	db.lc.multiGet(pairs)
   747  }
   748  
   749  func (db *DB) updateOffset(off logOffset) {
   750  	y.Assert(!off.Less(db.logOff))
   751  	// We don't need to protect it by a lock because the value is never accessed
   752  	// by more than one goroutine at the same time.
   753  	db.logOff = off
   754  }
   755  
// requestPool recycles request objects; sendToWriteCh takes its requests from
// this pool.
var requestPool = sync.Pool{
	New: func() interface{} {
		return new(request)
	},
}
   761  
   762  func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) {
   763  	var count, size int64
   764  	for _, e := range entries {
   765  		size += int64(e.estimateSize())
   766  		count++
   767  	}
   768  
   769  	// We can only service one request because we need each txn to be stored in a contigous section.
   770  	// Txns should not interleave among other txns or rewrites.
   771  	req := requestPool.Get().(*request)
   772  	req.Entries = entries
   773  	req.Wg = sync.WaitGroup{}
   774  	req.Wg.Add(1)
   775  	db.writeCh <- req // Handled in writeWorker.
   776  	db.metrics.NumPuts.Add(float64(len(entries)))
   777  
   778  	return req, nil
   779  }
   780  
   781  // batchSet applies a list of badger.Entry. If a request level error occurs it
   782  // will be returned.
   783  //
   784  //	Check(kv.BatchSet(entries))
   785  func (db *DB) batchSet(entries []*Entry) error {
   786  	sort.Slice(entries, func(i, j int) bool {
   787  		return entries[i].Key.Compare(entries[j].Key) < 0
   788  	})
   789  	req, err := db.sendToWriteCh(entries)
   790  	if err != nil {
   791  		return err
   792  	}
   793  
   794  	return req.Wait()
   795  }
   796  
   797  // batchSetAsync is the asynchronous version of batchSet. It accepts a callback
   798  // function which is called when all the sets are complete. If a request level
   799  // error occurs, it will be passed back via the callback.
   800  //
   801  //	err := kv.BatchSetAsync(entries, func(err error)) {
   802  //	   Check(err)
   803  //	}
   804  func (db *DB) batchSetAsync(entries []*Entry, f func(error)) error {
   805  	req, err := db.sendToWriteCh(entries)
   806  	if err != nil {
   807  		return err
   808  	}
   809  	go func() {
   810  		err := req.Wait()
   811  		// Write is complete. Let's call the callback function now.
   812  		f(err)
   813  	}()
   814  	return nil
   815  }
   816  
   817  // ensureRoomForWrite is always called serially.
   818  func (db *DB) ensureRoomForWrite(mt *memtable.Table, minSize int64) int64 {
   819  	free := db.opt.MaxMemTableSize - mt.Size()
   820  	if free >= minSize {
   821  		return free
   822  	}
   823  	_ = db.flushMemTable()
   824  	return db.opt.MaxMemTableSize
   825  }
   826  
   827  func (db *DB) flushMemTable() *sync.WaitGroup {
   828  	mTbls := db.mtbls.Load().(*memTables)
   829  	newTbls := newMemTables(<-db.memTableCh, mTbls)
   830  	db.mtbls.Store(newTbls)
   831  	ft := newFlushTask(mTbls.getMutable(), db.logOff)
   832  	db.flushChan <- ft
   833  	log.Info("flushing memtable", zap.Int64("memtable size", mTbls.getMutable().Size()), zap.Int("size of flushChan", len(db.flushChan)))
   834  
   835  	// New memtable is empty. We certainly have room.
   836  	return &ft.wg
   837  }
   838  
   839  func arenaSize(opt Options) int64 {
   840  	return opt.MaxMemTableSize + opt.maxBatchCount*int64(memtable.MaxNodeSize)
   841  }
   842  
   843  // WriteLevel0Table flushes memtable. It drops deleteValues.
   844  func (db *DB) writeLevel0Table(s *memtable.Table, f *os.File) error {
   845  	iter := s.NewIterator(false)
   846  	defer iter.Close()
   847  	var (
   848  		bb                   *blobFileBuilder
   849  		numWrite, bytesWrite int
   850  		err                  error
   851  	)
   852  	b := sstable.NewTableBuilder(f, db.limiter, 0, db.opt.TableBuilderOptions)
   853  	defer b.Close()
   854  
   855  	for iter.Rewind(); iter.Valid(); y.NextAllVersion(iter) {
   856  		key := iter.Key()
   857  		value := iter.Value()
   858  		if db.opt.ValueThreshold > 0 && len(value.Value) > db.opt.ValueThreshold {
   859  			if bb == nil {
   860  				if bb, err = db.newBlobFileBuilder(); err != nil {
   861  					return y.Wrap(err)
   862  				}
   863  			}
   864  
   865  			bp, err := bb.append(value.Value)
   866  			if err != nil {
   867  				return err
   868  			}
   869  			value.Meta |= bitValuePointer
   870  			value.Value = bp
   871  		}
   872  		if err = b.Add(key, value); err != nil {
   873  			return err
   874  		}
   875  		numWrite++
   876  		bytesWrite += key.Len() + int(value.EncodedSize())
   877  	}
   878  	stats := &y.CompactionStats{
   879  		KeysWrite:  numWrite,
   880  		BytesWrite: bytesWrite,
   881  	}
   882  	db.lc.levels[0].metrics.UpdateCompactionStats(stats)
   883  
   884  	if _, err = b.Finish(); err != nil {
   885  		return y.Wrap(err)
   886  	}
   887  	if bb != nil {
   888  		bf, err1 := bb.finish()
   889  		if err1 != nil {
   890  			return err1
   891  		}
   892  		log.Info("build L0 blob", zap.Uint32("id", bf.fid), zap.Uint32("size", bf.fileSize))
   893  		err1 = db.blobManger.addFile(bf)
   894  		if err1 != nil {
   895  			return err1
   896  		}
   897  	}
   898  	return nil
   899  }
   900  
   901  func (db *DB) newBlobFileBuilder() (*blobFileBuilder, error) {
   902  	return newBlobFileBuilder(db.blobManger.allocFileID(), db.opt.Dir, db.opt.TableBuilderOptions.WriteBufferSize)
   903  }
   904  
   905  type flushTask struct {
   906  	mt  *memtable.Table
   907  	off logOffset
   908  	wg  sync.WaitGroup
   909  }
   910  
   911  func newFlushTask(mt *memtable.Table, off logOffset) *flushTask {
   912  	ft := &flushTask{mt: mt, off: off}
   913  	ft.wg.Add(1)
   914  	return ft
   915  }
   916  
   917  // TODO: Ensure that this function doesn't return, or is handled by another wrapper function.
   918  // Otherwise, we would have no goroutine which can flush memtables.
   919  func (db *DB) runFlushMemTable(c *y.Closer) error {
   920  	defer c.Done()
   921  
   922  	for ft := range db.flushChan {
   923  		if ft.mt == nil {
   924  			return nil
   925  		}
   926  		guard := db.resourceMgr.Acquire()
   927  		var headInfo *protos.HeadInfo
   928  		if !ft.mt.Empty() {
   929  			headInfo = &protos.HeadInfo{
   930  				// Pick the max commit ts, so in case of crash, our read ts would be higher than all the
   931  				// commits.
   932  				Version:   db.orc.commitTs(),
   933  				LogID:     ft.off.fid,
   934  				LogOffset: ft.off.offset,
   935  			}
   936  			// Store badger head even if vptr is zero, need it for readTs
   937  			log.Info("flush memtable storing offset", zap.Uint32("fid", ft.off.fid), zap.Uint32("offset", ft.off.offset))
   938  		}
   939  
   940  		fileID := ft.mt.ID()
   941  		filename := sstable.NewFilename(fileID, db.opt.Dir)
   942  		fd, err := directio.OpenFile(filename, os.O_CREATE|os.O_RDWR, 0666)
   943  		if err != nil {
   944  			log.Error("error while writing to level 0", zap.Error(err))
   945  			return y.Wrap(err)
   946  		}
   947  
   948  		// Don't block just to sync the directory entry.
   949  		dirSyncCh := make(chan error)
   950  		go func() { dirSyncCh <- syncDir(db.opt.Dir) }()
   951  
   952  		err = db.writeLevel0Table(ft.mt, fd)
   953  		dirSyncErr := <-dirSyncCh
   954  		if err != nil {
   955  			log.Error("error while writing to level 0", zap.Error(err))
   956  			return err
   957  		}
   958  		if dirSyncErr != nil {
   959  			log.Error("error while syncing level directory", zap.Error(dirSyncErr))
   960  			return err
   961  		}
   962  		atomic.StoreUint32(&db.syncedFid, ft.off.fid)
   963  		fd.Close()
   964  		tbl, err := sstable.OpenTable(filename, db.blockCache, db.indexCache)
   965  		if err != nil {
   966  			log.Info("error while opening table", zap.Error(err))
   967  			return err
   968  		}
   969  		err = db.lc.addLevel0Table(tbl, headInfo)
   970  		if err != nil {
   971  			log.Error("error while syncing level directory", zap.Error(err))
   972  			return err
   973  		}
   974  		mTbls := db.mtbls.Load().(*memTables)
   975  		// Update the length of mTbls.
   976  		for i, tbl := range mTbls.tables {
   977  			if tbl == ft.mt {
   978  				atomic.StoreUint32(&mTbls.length, uint32(i))
   979  				break
   980  			}
   981  		}
   982  		guard.Delete([]epoch.Resource{ft.mt})
   983  		guard.Done()
   984  		ft.wg.Done()
   985  	}
   986  	return nil
   987  }
   988  
   989  func exists(path string) (bool, error) {
   990  	_, err := os.Stat(path)
   991  	if err == nil {
   992  		return true, nil
   993  	}
   994  	if os.IsNotExist(err) {
   995  		return false, nil
   996  	}
   997  	return true, err
   998  }
   999  
  1000  // This function does a filewalk, calculates the size of vlog and sst files and stores it in
  1001  // y.LSMSize and y.VlogSize.
  1002  func (db *DB) calculateSize() {
  1003  	totalSize := func(dir string) (int64, int64) {
  1004  		var lsmSize, vlogSize int64
  1005  		err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
  1006  			if err != nil {
  1007  				return err
  1008  			}
  1009  			ext := filepath.Ext(path)
  1010  			if ext == ".sst" {
  1011  				lsmSize += info.Size()
  1012  			} else if ext == ".vlog" {
  1013  				vlogSize += info.Size()
  1014  			}
  1015  			return nil
  1016  		})
  1017  		if err != nil {
  1018  			log.Info("error while calculating total size of directory", zap.String("path", dir))
  1019  		}
  1020  		return lsmSize, vlogSize
  1021  	}
  1022  
  1023  	lsmSize, vlogSize := totalSize(db.opt.Dir)
  1024  	// If valueDir is different from dir, we'd have to do another walk.
  1025  	if db.opt.ValueDir != db.opt.Dir {
  1026  		_, vlogSize = totalSize(db.opt.ValueDir)
  1027  	}
  1028  	atomic.StoreInt64(&db.lsmSize, lsmSize)
  1029  	atomic.StoreInt64(&db.vlogSize, vlogSize)
  1030  	db.metrics.LSMSize.Set(float64(lsmSize))
  1031  	db.metrics.VlogSize.Set(float64(vlogSize))
  1032  }
  1033  
  1034  func (db *DB) updateSize(c *y.Closer) {
  1035  	defer c.Done()
  1036  
  1037  	metricsTicker := time.NewTicker(time.Minute)
  1038  	defer metricsTicker.Stop()
  1039  
  1040  	for {
  1041  		select {
  1042  		case <-metricsTicker.C:
  1043  			db.calculateSize()
  1044  		case <-c.HasBeenClosed():
  1045  			return
  1046  		}
  1047  	}
  1048  }
  1049  
  1050  // Size returns the size of lsm and value log files in bytes. It can be used to decide how often to
  1051  // call RunValueLogGC.
  1052  func (db *DB) Size() (lsm int64, vlog int64) {
  1053  	return atomic.LoadInt64(&db.lsmSize), atomic.LoadInt64(&db.vlogSize)
  1054  }
  1055  
  1056  func (db *DB) Tables() []TableInfo {
  1057  	return db.lc.getTableInfo()
  1058  }
  1059  
  1060  func (db *DB) GetVLogOffset() uint64 {
  1061  	return db.vlog.getMaxPtr()
  1062  }
  1063  
  1064  // IterateVLog iterates VLog for external replay, this function should be called only when there is no
  1065  // concurrent write operation on the DB.
  1066  func (db *DB) IterateVLog(offset uint64, fn func(e Entry)) error {
  1067  	startFid := uint32(offset >> 32)
  1068  	vOffset := uint32(offset)
  1069  	for fid := startFid; fid <= db.vlog.maxFid(); fid++ {
  1070  		lf, err := db.vlog.getFile(fid)
  1071  		if err != nil {
  1072  			return err
  1073  		}
  1074  		if fid != startFid {
  1075  			vOffset = 0
  1076  		}
  1077  		endOffset, err := db.vlog.iterate(lf, vOffset, func(e Entry) error {
  1078  			if e.meta&bitTxn > 0 {
  1079  				fn(e)
  1080  			}
  1081  			return nil
  1082  		})
  1083  		if err != nil {
  1084  			return err
  1085  		}
  1086  		if fid == db.vlog.maxFid() {
  1087  			_, err = lf.fd.Seek(int64(endOffset), io.SeekStart)
  1088  			if err != nil {
  1089  				return err
  1090  			}
  1091  		}
  1092  	}
  1093  	return nil
  1094  }
  1095  
  1096  func (db *DB) getCompactSafeTs() uint64 {
  1097  	return atomic.LoadUint64(&db.safeTsTracker.safeTs)
  1098  }
  1099  
  1100  // UpdateSafeTs is used for Managed DB, during compaction old version smaller than the safe ts will be discarded.
  1101  // If this is not called, all old versions are kept.
  1102  func (db *DB) UpdateSafeTs(ts uint64) {
  1103  	y.Assert(db.IsManaged())
  1104  	for {
  1105  		old := db.getCompactSafeTs()
  1106  		if old < ts {
  1107  			if !atomic.CompareAndSwapUint64(&db.safeTsTracker.safeTs, old, ts) {
  1108  				continue
  1109  			}
  1110  		}
  1111  		break
  1112  	}
  1113  }
  1114  
  1115  func (db *DB) IsManaged() bool {
  1116  	return db.opt.ManagedTxns
  1117  }
  1118  
  1119  type safeTsTracker struct {
  1120  	safeTs uint64
  1121  
  1122  	maxInactive uint64
  1123  	minActive   uint64
  1124  }
  1125  
  1126  func (t *safeTsTracker) Begin() {
  1127  	// t.maxInactive = 0
  1128  	t.minActive = math.MaxUint64
  1129  }
  1130  
  1131  func (t *safeTsTracker) Inspect(payload interface{}, isActive bool) {
  1132  	ts, ok := payload.(uint64)
  1133  	if !ok {
  1134  		return
  1135  	}
  1136  
  1137  	if isActive {
  1138  		if ts < t.minActive {
  1139  			t.minActive = ts
  1140  		}
  1141  	} else {
  1142  		if ts > t.maxInactive {
  1143  			t.maxInactive = ts
  1144  		}
  1145  	}
  1146  }
  1147  
  1148  func (t *safeTsTracker) End() {
  1149  	var safe uint64
  1150  	if t.minActive == math.MaxUint64 {
  1151  		safe = t.maxInactive
  1152  	} else {
  1153  		safe = t.minActive - 1
  1154  	}
  1155  
  1156  	if safe > atomic.LoadUint64(&t.safeTs) {
  1157  		atomic.StoreUint64(&t.safeTs, safe)
  1158  	}
  1159  }