github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/version_set.go

// Copyright 2012 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"bytes"
	"fmt"
	"io"
	"math"
	"sync"
	"sync/atomic"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/errors/oserror"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/manifest"
	"github.com/zuoyebang/bitalostable/record"
	"github.com/zuoyebang/bitalostable/vfs"
	"github.com/zuoyebang/bitalostable/vfs/atomicfs"
)

const numLevels = manifest.NumLevels

const manifestMarkerName = `manifest`

// Provide type aliases for the various manifest structs.
type bulkVersionEdit = manifest.BulkVersionEdit
type deletedFileEntry = manifest.DeletedFileEntry
type fileMetadata = manifest.FileMetadata
type newFileEntry = manifest.NewFileEntry
type version = manifest.Version
type versionEdit = manifest.VersionEdit
type versionList = manifest.VersionList

// versionSet manages a collection of immutable versions, and manages the
// creation of a new version from the most recent version. A new version is
// created from an existing version by applying a version edit which is just
// like it sounds: a delta from the previous version. Version edits are logged
// to the MANIFEST file, which is replayed at startup.
type versionSet struct {
	// WARNING: The following struct `atomic` contains fields which are
	// accessed atomically.
	//
	// Go allocations are guaranteed to be 64-bit aligned which we take advantage
	// of by placing the 64-bit fields which we access atomically at the beginning
	// of the versionSet struct.
	// For more information, see https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
	atomic struct {
		logSeqNum uint64 // next seqNum to use for WAL writes

		// The upper bound on sequence numbers that have been assigned so far.
		// A suffix of these sequence numbers may not have been written to a
		// WAL. Both logSeqNum and visibleSeqNum are atomically updated by the
		// commitPipeline.
		visibleSeqNum uint64 // visible seqNum (<= logSeqNum)

		// Number of bytes present in sstables being written by in-progress
		// compactions. This value will be zero if there are no in-progress
		// compactions. Updated and read atomically.
		atomicInProgressBytes int64
	}

	// Immutable fields.
	dirname string
	// Set to DB.mu.
	mu             *sync.Mutex
	opts           *Options
	fs             vfs.FS
	cmp            Compare
	cmpName        string
	diskAvailBytes func() uint64
	// Dynamic base level allows the dynamic base level computation to be
	// disabled. Used by tests which want to create specific LSM structures.
	dynamicBaseLevel bool

	// Mutable fields.
	versions versionList
	picker   compactionPicker

	metrics Metrics

	// A pointer to versionSet.addObsoleteLocked. Avoids allocating a new closure
	// on the creation of every version.
	obsoleteFn        func(obsolete []*manifest.FileMetadata)
	obsoleteTables    []*manifest.FileMetadata
	obsoleteManifests []fileInfo
	obsoleteOptions   []fileInfo

	// Zombie tables which have been removed from the current version but are
	// still referenced by an in-use iterator.
	zombieTables map[FileNum]uint64 // filenum -> size

	// minUnflushedLogNum is the smallest WAL log file number corresponding to
	// mutations that have not been flushed to an sstable.
	minUnflushedLogNum FileNum

	// The next file number. A single counter is used to assign file numbers
	// for the WAL, MANIFEST, sstable, and OPTIONS files.
	nextFileNum FileNum

	// The current manifest file number.
	manifestFileNum FileNum
	manifestMarker  *atomicfs.Marker

	manifestFile vfs.File
	manifest     *record.Writer
	setCurrent   func(FileNum) error

	writing    bool
	writerCond sync.Cond
}

func (vs *versionSet) init(
	dirname string,
	opts *Options,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	mu *sync.Mutex,
) {
	vs.dirname = dirname
	vs.mu = mu
	vs.writerCond.L = mu
	vs.opts = opts
	vs.fs = opts.FS
	vs.cmp = opts.Comparer.Compare
	vs.cmpName = opts.Comparer.Name
	vs.dynamicBaseLevel = true
	vs.versions.Init(mu)
	vs.obsoleteFn = vs.addObsoleteLocked
	vs.zombieTables = make(map[FileNum]uint64)
	vs.nextFileNum = 1
	vs.manifestMarker = marker
	vs.setCurrent = setCurrent
	if vs.diskAvailBytes == nil {
		vs.diskAvailBytes = func() uint64 { return math.MaxUint64 }
	}
}

// create creates a version set for a fresh DB.
func (vs *versionSet) create(
	jobID int,
	dirname string,
	opts *Options,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	mu *sync.Mutex,
) error {
	vs.init(dirname, opts, marker, setCurrent, mu)
	newVersion := &version{}
	vs.append(newVersion)
	var err error

	vs.picker = newCompactionPicker(newVersion, vs.opts, nil, vs.metrics.levelSizes(), vs.diskAvailBytes)

	// Note that a "snapshot" version edit is written to the manifest when it is
	// created.
	vs.manifestFileNum = vs.getNextFileNum()
	err = vs.createManifest(vs.dirname, vs.manifestFileNum, vs.minUnflushedLogNum, vs.nextFileNum)
	if err == nil {
		if err = vs.manifest.Flush(); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST flush failed: %v", err)
		}
	}
	if err == nil {
		if err = vs.manifestFile.Sync(); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST sync failed: %v", err)
		}
	}
	if err == nil {
		// NB: setCurrent is responsible for syncing the data directory.
		if err = vs.setCurrent(vs.manifestFileNum); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST set current failed: %v", err)
		}
	}

	vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
		JobID:   jobID,
		Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, vs.manifestFileNum),
		FileNum: vs.manifestFileNum,
		Err:     err,
	})
	if err != nil {
		return err
	}
	return nil
}

// load loads the version set from the manifest file.
func (vs *versionSet) load(
	dirname string,
	opts *Options,
	manifestFileNum FileNum,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	mu *sync.Mutex,
) error {
	vs.init(dirname, opts, marker, setCurrent, mu)

	vs.manifestFileNum = manifestFileNum
	manifestPath := base.MakeFilepath(opts.FS, dirname, fileTypeManifest, vs.manifestFileNum)
	manifestFilename := opts.FS.PathBase(manifestPath)

	// Read the versionEdits in the manifest file.
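	// Each record decodes into a single versionEdit. The edits are accumulated
	// into one bulkVersionEdit and applied below to reconstruct the most recent
	// version; minUnflushedLogNum, nextFileNum and logSeqNum are taken from the
	// last edit that set them.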
	var bve bulkVersionEdit
	bve.AddedByFileNum = make(map[base.FileNum]*fileMetadata)
	manifest, err := vs.fs.Open(manifestPath)
	if err != nil {
		return errors.Wrapf(err, "bitalostable: could not open manifest file %q for DB %q",
			errors.Safe(manifestFilename), dirname)
	}
	defer manifest.Close()
	rr := record.NewReader(manifest, 0 /* logNum */)
	for {
		r, err := rr.Next()
		if err == io.EOF || record.IsInvalidRecord(err) {
			break
		}
		if err != nil {
			return errors.Wrapf(err, "bitalostable: error when loading manifest file %q",
				errors.Safe(manifestFilename))
		}
		var ve versionEdit
		err = ve.Decode(r)
		if err != nil {
			// Break instead of returning an error if the record is corrupted
			// or invalid.
			if err == io.EOF || record.IsInvalidRecord(err) {
				break
			}
			return err
		}
		if ve.ComparerName != "" {
			if ve.ComparerName != vs.cmpName {
				return errors.Errorf("bitalostable: manifest file %q for DB %q: "+
					"comparer name from file %q != comparer name from Options %q",
					errors.Safe(manifestFilename), dirname, errors.Safe(ve.ComparerName), errors.Safe(vs.cmpName))
			}
		}
		if err := bve.Accumulate(&ve); err != nil {
			return err
		}
		if ve.MinUnflushedLogNum != 0 {
			vs.minUnflushedLogNum = ve.MinUnflushedLogNum
		}
		if ve.NextFileNum != 0 {
			vs.nextFileNum = ve.NextFileNum
		}
		if ve.LastSeqNum != 0 {
			// logSeqNum is the _next_ sequence number that will be assigned,
			// while LastSeqNum is the last assigned sequence number. Note that
			// this behaviour mimics that in RocksDB; the first sequence number
			// assigned is one greater than the one present in the manifest
			// (assuming no WALs contain higher sequence numbers than the
			// manifest's LastSeqNum). Increment LastSeqNum by 1 to get the
			// next sequence number that will be assigned.
			vs.atomic.logSeqNum = ve.LastSeqNum + 1
		}
	}
	// vs.nextFileNum was initialized to a non-zero value in init and could only
	// have been updated above to some other non-zero value, so it cannot be 0
	// here.
	if vs.minUnflushedLogNum == 0 {
		if vs.nextFileNum >= 2 {
			// We either have a freshly created DB, or a DB created by RocksDB
			// that has not had a single flushed SSTable yet. This is because
			// RocksDB bumps up nextFileNum in this case without bumping up
			// minUnflushedLogNum, even if WALs with non-zero file numbers are
			// present in the directory.
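			// Leave minUnflushedLogNum at zero; the markFileNumUsed call below
			// is then a no-op.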
		} else {
			return base.CorruptionErrorf("bitalostable: malformed manifest file %q for DB %q",
				errors.Safe(manifestFilename), dirname)
		}
	}
	vs.markFileNumUsed(vs.minUnflushedLogNum)

	newVersion, _, err := bve.Apply(nil, vs.cmp, opts.Comparer.FormatKey, opts.FlushSplitBytes, opts.Experimental.ReadCompactionRate)
	if err != nil {
		return err
	}
	newVersion.L0Sublevels.InitCompactingFileInfo(nil /* in-progress compactions */)
	vs.append(newVersion)

	for i := range vs.metrics.Levels {
		l := &vs.metrics.Levels[i]
		l.NumFiles = int64(newVersion.Levels[i].Len())
		files := newVersion.Levels[i].Slice()
		l.Size = int64(files.SizeSum())
	}

	vs.picker = newCompactionPicker(newVersion, vs.opts, nil, vs.metrics.levelSizes(), vs.diskAvailBytes)
	return nil
}

func (vs *versionSet) close() error {
	if vs.manifestFile != nil {
		if err := vs.manifestFile.Close(); err != nil {
			return err
		}
	}
	if vs.manifestMarker != nil {
		if err := vs.manifestMarker.Close(); err != nil {
			return err
		}
	}
	return nil
}

// logLock locks the manifest for writing. The lock must be released by either
// a call to logUnlock or logAndApply.
//
// DB.mu must be held when calling this method, but the mutex may be dropped and
// re-acquired during the course of this method.
func (vs *versionSet) logLock() {
	// Wait for any existing writing to the manifest to complete, then mark the
	// manifest as busy.
	for vs.writing {
		vs.writerCond.Wait()
	}
	vs.writing = true
}

// logUnlock releases the lock for manifest writing.
//
// DB.mu must be held when calling this method.
func (vs *versionSet) logUnlock() {
	if !vs.writing {
		vs.opts.Logger.Fatalf("MANIFEST not locked for writing")
	}
	vs.writing = false
	vs.writerCond.Signal()
}

// logAndApply logs the version edit to the manifest, applies the version edit
// to the current version, and installs the new version.
//
// DB.mu must be held when calling this method and will be released temporarily
// while performing file I/O. Requires that the manifest is locked for writing
// (see logLock). Will unconditionally release the manifest lock (via
// logUnlock) even if an error occurs.
//
// inProgressCompactions is called while DB.mu is held, to get the list of
// in-progress compactions.
func (vs *versionSet) logAndApply(
	jobID int,
	ve *versionEdit,
	metrics map[int]*LevelMetrics,
	forceRotation bool,
	inProgressCompactions func() []compactionInfo,
) error {
	if !vs.writing {
		vs.opts.Logger.Fatalf("MANIFEST not locked for writing")
	}
	defer vs.logUnlock()

	if ve.MinUnflushedLogNum != 0 {
		if ve.MinUnflushedLogNum < vs.minUnflushedLogNum ||
			vs.nextFileNum <= ve.MinUnflushedLogNum {
			panic(fmt.Sprintf("bitalostable: inconsistent versionEdit minUnflushedLogNum %d",
				ve.MinUnflushedLogNum))
		}
	}

	// This is the next manifest filenum, but if the current file is too big we
	// will write this ve to the next file which means what ve encodes is the
	// current filenum and not the next one.
	//
	// TODO(sbhola): figure out why this is correct and update comment.
	ve.NextFileNum = vs.nextFileNum

	// LastSeqNum is set to the current upper bound on the assigned sequence
	// numbers. Note that this is exactly the behavior of RocksDB. LastSeqNum is
	// used to initialize versionSet.logSeqNum and versionSet.visibleSeqNum on
	// replay. It must be higher than or equal to any sequence number
	// written to an sstable, including sequence numbers in ingested files.
	// Note that LastSeqNum is not (and cannot be) the minimum unflushed sequence
	// number. This is fallout from ingestion which allows a sequence number X to
	// be assigned to an ingested sstable even though sequence number X-1 resides
	// in an unflushed memtable. logSeqNum is the _next_ sequence number that
	// will be assigned, so subtract 1 from it to get the upper bound on the
	// last assigned sequence number.
	logSeqNum := atomic.LoadUint64(&vs.atomic.logSeqNum)
	ve.LastSeqNum = logSeqNum - 1
	if logSeqNum == 0 {
		// logSeqNum is initialized to 1 in Open() if there are no previous WAL
		// or manifest records, so this case should never happen.
		vs.opts.Logger.Fatalf("logSeqNum must be a positive integer: %d", logSeqNum)
	}

	currentVersion := vs.currentVersion()
	var newVersion *version

	// Generate a new manifest if we don't currently have one, or the current one
	// is too large.
	var newManifestFileNum FileNum
	var prevManifestFileSize uint64
	if forceRotation || vs.manifest == nil || vs.manifest.Size() >= vs.opts.MaxManifestFileSize {
		newManifestFileNum = vs.getNextFileNum()
		prevManifestFileSize = uint64(vs.manifest.Size())
	}

	// Grab certain values before releasing vs.mu, in case createManifest() needs
	// to be called.
	minUnflushedLogNum := vs.minUnflushedLogNum
	nextFileNum := vs.nextFileNum

	var zombies map[FileNum]uint64
	if err := func() error {
		vs.mu.Unlock()
		defer vs.mu.Lock()

		var bve bulkVersionEdit
		if err := bve.Accumulate(ve); err != nil {
			return err
		}

		var err error
		newVersion, zombies, err = bve.Apply(currentVersion, vs.cmp, vs.opts.Comparer.FormatKey, vs.opts.FlushSplitBytes, vs.opts.Experimental.ReadCompactionRate)
		if err != nil {
			return errors.Wrap(err, "MANIFEST apply failed")
		}

		if newManifestFileNum != 0 {
			if err := vs.createManifest(vs.dirname, newManifestFileNum, minUnflushedLogNum, nextFileNum); err != nil {
				vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
					JobID:   jobID,
					Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum),
					FileNum: newManifestFileNum,
					Err:     err,
				})
				return errors.Wrap(err, "MANIFEST create failed")
			}
		}

		w, err := vs.manifest.Next()
		if err != nil {
			return errors.Wrap(err, "MANIFEST next record write failed")
		}
		// NB: Any error from this point on is considered fatal as we don't know if
		// the MANIFEST write occurred or not. Trying to determine that is
		// fraught. Instead we rely on the standard recovery mechanism run when a
		// database is open. In particular, that mechanism generates a new MANIFEST
		// and ensures it is synced.
		if err := ve.Encode(w); err != nil {
			return errors.Wrap(err, "MANIFEST write failed")
		}
		if err := vs.manifest.Flush(); err != nil {
			return errors.Wrap(err, "MANIFEST flush failed")
		}
		if err := vs.manifestFile.Sync(); err != nil {
			return errors.Wrap(err, "MANIFEST sync failed")
		}
		if newManifestFileNum != 0 {
			// NB: setCurrent is responsible for syncing the data directory.
			if err := vs.setCurrent(newManifestFileNum); err != nil {
				return errors.Wrap(err, "MANIFEST set current failed")
			}
			vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
				JobID:   jobID,
				Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum),
				FileNum: newManifestFileNum,
			})
		}
		return nil
	}(); err != nil {
		// Any error encountered during any of the operations in the previous
		// closure is considered fatal. Treating such errors as fatal is preferred
		// to attempting to unwind various file and b-tree reference counts, and
		// re-generating L0 sublevel metadata. This may change in the future, if
		// certain manifest / WAL operations become retryable. For more context, see
		// #1159 and #1792.
		vs.opts.Logger.Fatalf("%s", err)
		return err
	}

	// Now that DB.mu is held again, initialize compacting file info in
	// L0Sublevels.
	inProgress := inProgressCompactions()

	newVersion.L0Sublevels.InitCompactingFileInfo(inProgressL0Compactions(inProgress))

	// Update the zombie tables set first, as installation of the new version
	// will unref the previous version which could result in addObsoleteLocked
	// being called.
	for fileNum, size := range zombies {
		vs.zombieTables[fileNum] = size
	}

	// Install the new version.
	vs.append(newVersion)
	if ve.MinUnflushedLogNum != 0 {
		vs.minUnflushedLogNum = ve.MinUnflushedLogNum
	}
	if newManifestFileNum != 0 {
		if vs.manifestFileNum != 0 {
			vs.obsoleteManifests = append(vs.obsoleteManifests, fileInfo{
				fileNum:  vs.manifestFileNum,
				fileSize: prevManifestFileSize,
			})
		}
		vs.manifestFileNum = newManifestFileNum
	}

	for level, update := range metrics {
		vs.metrics.Levels[level].Add(update)
	}
	for i := range vs.metrics.Levels {
		l := &vs.metrics.Levels[i]
		l.Sublevels = 0
		if l.NumFiles > 0 {
			l.Sublevels = 1
		}
		if invariants.Enabled {
			if count := int64(newVersion.Levels[i].Len()); l.NumFiles != count {
				vs.opts.Logger.Fatalf("versionSet metrics L%d NumFiles = %d, actual count = %d", i, l.NumFiles, count)
			}
			levelFiles := newVersion.Levels[i].Slice()
			if size := int64(levelFiles.SizeSum()); l.Size != size {
				vs.opts.Logger.Fatalf("versionSet metrics L%d Size = %d, actual size = %d", i, l.Size, size)
			}
		}
	}
	vs.metrics.Levels[0].Sublevels = int32(len(newVersion.L0SublevelFiles))

	vs.picker = newCompactionPicker(newVersion, vs.opts, inProgress, vs.metrics.levelSizes(), vs.diskAvailBytes)
	if !vs.dynamicBaseLevel {
		vs.picker.forceBaseLevel1()
	}
	return nil
}

func (vs *versionSet) incrementCompactions(kind compactionKind, extraLevels []*compactionLevel) {
	switch kind {
	case compactionKindDefault:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.DefaultCount++

	case compactionKindFlush:
		vs.metrics.Flush.Count++

	case compactionKindMove:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.MoveCount++

	case compactionKindDeleteOnly:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.DeleteOnlyCount++

	case compactionKindElisionOnly:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.ElisionOnlyCount++

	case compactionKindRead:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.ReadCount++

	case compactionKindRewrite:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.RewriteCount++
	}
	if len(extraLevels) > 0 {
		vs.metrics.Compact.MultiLevelCount++
	}
}

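// incrementCompactionBytes adjusts the running total of bytes written by
// in-progress compactions (vs.atomic.atomicInProgressBytes) by numBytes.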
func (vs *versionSet) incrementCompactionBytes(numBytes int64) {
	atomic.AddInt64(&vs.atomic.atomicInProgressBytes, numBytes)
}

// createManifest creates a manifest file that contains a snapshot of vs.
func (vs *versionSet) createManifest(
	dirname string, fileNum, minUnflushedLogNum, nextFileNum FileNum,
) (err error) {
	var (
		filename     = base.MakeFilepath(vs.fs, dirname, fileTypeManifest, fileNum)
		manifestFile vfs.File
		manifest     *record.Writer
	)
	defer func() {
		if manifest != nil {
			manifest.Close()
		}
		if manifestFile != nil {
			manifestFile.Close()
		}
		if err != nil {
			vs.fs.Remove(filename)
		}
	}()
	manifestFile, err = vs.fs.Create(filename)
	if err != nil {
		return err
	}
	manifest = record.NewWriter(manifestFile)

	snapshot := versionEdit{
		ComparerName: vs.cmpName,
	}
	for level, levelMetadata := range vs.currentVersion().Levels {
		iter := levelMetadata.Iter()
		for meta := iter.First(); meta != nil; meta = iter.Next() {
			snapshot.NewFiles = append(snapshot.NewFiles, newFileEntry{
				Level: level,
				Meta:  meta,
			})
		}
	}

	// When creating a version snapshot for an existing DB, this snapshot VersionEdit will be
	// immediately followed by another VersionEdit (being written in logAndApply()). That
	// VersionEdit always contains a LastSeqNum, so we don't need to include that in the snapshot.
	// But it does not necessarily include MinUnflushedLogNum, NextFileNum, so we initialize those
	// using the corresponding fields in the versionSet (which came from the latest preceding
	// VersionEdit that had those fields).
	snapshot.MinUnflushedLogNum = minUnflushedLogNum
	snapshot.NextFileNum = nextFileNum

	w, err1 := manifest.Next()
	if err1 != nil {
		return err1
	}
	if err := snapshot.Encode(w); err != nil {
		return err
	}

	if vs.manifest != nil {
		vs.manifest.Close()
		vs.manifest = nil
	}
	if vs.manifestFile != nil {
		if err := vs.manifestFile.Close(); err != nil {
			return err
		}
		vs.manifestFile = nil
	}

	vs.manifest, manifest = manifest, nil
	vs.manifestFile, manifestFile = manifestFile, nil
	return nil
}

func (vs *versionSet) markFileNumUsed(fileNum FileNum) {
	if vs.nextFileNum <= fileNum {
		vs.nextFileNum = fileNum + 1
	}
}

func (vs *versionSet) getNextFileNum() FileNum {
	x := vs.nextFileNum
	vs.nextFileNum++
	return x
}

func (vs *versionSet) append(v *version) {
	if v.Refs() != 0 {
		panic("bitalostable: version should be unreferenced")
	}
	if !vs.versions.Empty() {
		vs.versions.Back().UnrefLocked()
	}
	v.Deleted = vs.obsoleteFn
	v.Ref()
	vs.versions.PushBack(v)
}

func (vs *versionSet) currentVersion() *version {
	return vs.versions.Back()
}

func (vs *versionSet) addLiveFileNums(m map[FileNum]struct{}) {
	current := vs.currentVersion()
	for v := vs.versions.Front(); true; v = v.Next() {
		for _, lm := range v.Levels {
			iter := lm.Iter()
			for f := iter.First(); f != nil; f = iter.Next() {
				m[f.FileNum] = struct{}{}
			}
		}
		if v == current {
			break
		}
	}
}

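// addObsoleteLocked appends the specified tables to the obsolete tables list
// and updates the obsolete table metrics. The tables must already be present
// in the zombie tables map. DB.mu must be held when calling this method.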
func (vs *versionSet) addObsoleteLocked(obsolete []*manifest.FileMetadata) {
	for _, fileMeta := range obsolete {
		// Note that the obsolete tables are no longer zombie by the definition of
		// zombie, but we leave them in the zombie tables map until they are
		// deleted from disk.
		if _, ok := vs.zombieTables[fileMeta.FileNum]; !ok {
			vs.opts.Logger.Fatalf("MANIFEST obsolete table %s not marked as zombie", fileMeta.FileNum)
		}
	}
	vs.obsoleteTables = append(vs.obsoleteTables, obsolete...)
	vs.incrementObsoleteTablesLocked(obsolete)
}

func (vs *versionSet) incrementObsoleteTablesLocked(obsolete []*manifest.FileMetadata) {
	for _, fileMeta := range obsolete {
		vs.metrics.Table.ObsoleteCount++
		vs.metrics.Table.ObsoleteSize += fileMeta.Size
	}
}

func setCurrentFunc(
	vers FormatMajorVersion, marker *atomicfs.Marker, fs vfs.FS, dirname string, dir vfs.File,
) func(FileNum) error {
	if vers < formatVersionedManifestMarker {
		// Pebble versions before `formatVersionedManifestMarker` used
		// the CURRENT file to signal which MANIFEST is current. Ignore
		// the filename read during LocateMarker.
		return func(manifestFileNum FileNum) error {
			if err := setCurrentFile(dirname, fs, manifestFileNum); err != nil {
				return err
			}
			if err := dir.Sync(); err != nil {
				// This is a panic here, rather than higher in the call
				// stack, for parity with the atomicfs.Marker behavior.
				// A panic is always necessary because failed Syncs are
				// unrecoverable.
				panic(errors.Wrap(err, "fatal: MANIFEST dirsync failed"))
			}
			return nil
		}
	}
	return setCurrentFuncMarker(marker, fs, dirname)
}

func setCurrentFuncMarker(marker *atomicfs.Marker, fs vfs.FS, dirname string) func(FileNum) error {
	return func(manifestFileNum FileNum) error {
		return marker.Move(base.MakeFilename(fileTypeManifest, manifestFileNum))
	}
}

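// findCurrentManifest locates the current MANIFEST and returns its file
// number. For format major versions below formatVersionedManifestMarker the
// file number is read from the CURRENT file; otherwise the manifest marker is
// the source of truth. The returned marker is non-nil even when no manifest
// exists yet, in which case exists is false.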
func findCurrentManifest(
	vers FormatMajorVersion, fs vfs.FS, dirname string,
) (marker *atomicfs.Marker, manifestNum FileNum, exists bool, err error) {
	// NB: We always locate the manifest marker, even if we might not
	// actually use it (because we're opening the database at an earlier
	// format major version that uses the CURRENT file). Locating a
	// marker should succeed even if the marker has never been placed.
	var filename string
	marker, filename, err = atomicfs.LocateMarker(fs, dirname, manifestMarkerName)
	if err != nil {
		return nil, 0, false, err
	}

	if vers < formatVersionedManifestMarker {
		// Pebble versions before `formatVersionedManifestMarker` used
		// the CURRENT file to signal which MANIFEST is current. Ignore
		// the filename read during LocateMarker.

		manifestNum, err = readCurrentFile(fs, dirname)
		if oserror.IsNotExist(err) {
			return marker, 0, false, nil
		} else if err != nil {
			return marker, 0, false, err
		}
		return marker, manifestNum, true, nil
	}

	// The current format major version is >=
	// formatVersionedManifestMarker indicating that the
	// atomicfs.Marker is the source of truth on the current manifest.

	if filename == "" {
		// The marker hasn't been set yet. This database doesn't exist.
		return marker, 0, false, nil
	}

	var ok bool
	_, manifestNum, ok = base.ParseFilename(fs, filename)
	if !ok {
		return marker, 0, false, base.CorruptionErrorf("bitalostable: MANIFEST name %q is malformed", errors.Safe(filename))
	}
	return marker, manifestNum, true, nil
}

func readCurrentFile(fs vfs.FS, dirname string) (FileNum, error) {
	// Read the CURRENT file to find the current manifest file.
	current, err := fs.Open(base.MakeFilepath(fs, dirname, fileTypeCurrent, 0))
	if err != nil {
		return 0, errors.Wrapf(err, "bitalostable: could not open CURRENT file for DB %q", dirname)
	}
	defer current.Close()
	stat, err := current.Stat()
	if err != nil {
		return 0, err
	}
	n := stat.Size()
	if n == 0 {
		return 0, errors.Errorf("bitalostable: CURRENT file for DB %q is empty", dirname)
	}
	if n > 4096 {
		return 0, errors.Errorf("bitalostable: CURRENT file for DB %q is too large", dirname)
	}
	b := make([]byte, n)
	_, err = current.ReadAt(b, 0)
	if err != nil {
		return 0, err
	}
	if b[n-1] != '\n' {
		return 0, base.CorruptionErrorf("bitalostable: CURRENT file for DB %q is malformed", dirname)
	}
	b = bytes.TrimSpace(b)

	_, manifestFileNum, ok := base.ParseFilename(fs, string(b))
	if !ok {
		return 0, base.CorruptionErrorf("bitalostable: MANIFEST name %q is malformed", errors.Safe(b))
	}
	return manifestFileNum, nil
}

func newFileMetrics(newFiles []manifest.NewFileEntry) map[int]*LevelMetrics {
	m := map[int]*LevelMetrics{}
	for _, nf := range newFiles {
		lm := m[nf.Level]
		if lm == nil {
			lm = &LevelMetrics{}
			m[nf.Level] = lm
		}
		lm.NumFiles++
		lm.Size += int64(nf.Meta.Size)
	}
	return m
}