github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/version_set.go

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"io"
    11  	"sync"
    12  	"sync/atomic"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/errors/oserror"
    16  	"github.com/cockroachdb/pebble/internal/base"
    17  	"github.com/cockroachdb/pebble/internal/invariants"
    18  	"github.com/cockroachdb/pebble/internal/manifest"
    19  	"github.com/cockroachdb/pebble/record"
    20  	"github.com/cockroachdb/pebble/vfs"
    21  	"github.com/cockroachdb/pebble/vfs/atomicfs"
    22  )
    23  
    24  const numLevels = manifest.NumLevels
    25  
    26  const manifestMarkerName = `manifest`
    27  
    28  // Provide type aliases for the various manifest structs.
    29  type bulkVersionEdit = manifest.BulkVersionEdit
    30  type deletedFileEntry = manifest.DeletedFileEntry
    31  type fileMetadata = manifest.FileMetadata
    32  type physicalMeta = manifest.PhysicalFileMeta
    33  type virtualMeta = manifest.VirtualFileMeta
    34  type fileBacking = manifest.FileBacking
    35  type newFileEntry = manifest.NewFileEntry
    36  type version = manifest.Version
    37  type versionEdit = manifest.VersionEdit
    38  type versionList = manifest.VersionList
    39  
     40  // versionSet manages a collection of immutable versions and the creation of
     41  // a new version from the most recent version. A new version is created from
     42  // an existing version by applying a version edit, which is just what it
     43  // sounds like: a delta from the previous version. Version edits are logged
    44  // to the MANIFEST file, which is replayed at startup.
    45  type versionSet struct {
    46  	// Next seqNum to use for WAL writes.
    47  	logSeqNum atomic.Uint64
    48  
    49  	// The upper bound on sequence numbers that have been assigned so far. A
    50  	// suffix of these sequence numbers may not have been written to a WAL. Both
    51  	// logSeqNum and visibleSeqNum are atomically updated by the commitPipeline.
    52  	// visibleSeqNum is <= logSeqNum.
    53  	visibleSeqNum atomic.Uint64
    54  
    55  	// Number of bytes present in sstables being written by in-progress
    56  	// compactions. This value will be zero if there are no in-progress
    57  	// compactions. Updated and read atomically.
    58  	atomicInProgressBytes atomic.Int64
    59  
    60  	// Immutable fields.
    61  	dirname string
    62  	// Set to DB.mu.
    63  	mu      *sync.Mutex
    64  	opts    *Options
    65  	fs      vfs.FS
    66  	cmp     Compare
    67  	cmpName string
    68  	// Dynamic base level allows the dynamic base level computation to be
    69  	// disabled. Used by tests which want to create specific LSM structures.
    70  	dynamicBaseLevel bool
    71  
    72  	// Mutable fields.
    73  	versions versionList
    74  	picker   compactionPicker
    75  
    76  	metrics Metrics
    77  
    78  	// A pointer to versionSet.addObsoleteLocked. Avoids allocating a new closure
    79  	// on the creation of every version.
    80  	obsoleteFn        func(obsolete []*fileBacking)
    81  	obsoleteTables    []fileInfo
    82  	obsoleteManifests []fileInfo
    83  	obsoleteOptions   []fileInfo
    84  
    85  	// Zombie tables which have been removed from the current version but are
     86  	// still referenced by an in-use iterator.
    87  	zombieTables map[base.DiskFileNum]uint64 // filenum -> size
    88  
    89  	// backingState is protected by the versionSet.logLock. It's populated
    90  	// during Open in versionSet.load, but it's not used concurrently during
    91  	// load.
    92  	backingState struct {
     93  		// fileBackingMap is a map of the FileBackings that support virtual
     94  		// sstables in the latest version. Once a file backing no longer backs
     95  		// any virtual sstable in the latest version, it is removed from this
     96  		// map and the corresponding state is added to the zombieTables map.
     97  		// Note that we don't keep track of file backings that only support
     98  		// virtual sstables which are not in the latest version.
    99  		fileBackingMap map[base.DiskFileNum]*fileBacking
   100  		// fileBackingSize is the sum of the sizes of the fileBackings in the
   101  		// fileBackingMap.
   102  		fileBackingSize uint64
   103  	}
   104  
   105  	// minUnflushedLogNum is the smallest WAL log file number corresponding to
   106  	// mutations that have not been flushed to an sstable.
   107  	minUnflushedLogNum base.DiskFileNum
   108  
   109  	// The next file number. A single counter is used to assign file
   110  	// numbers for the WAL, MANIFEST, sstable, and OPTIONS files.
   111  	nextFileNum uint64
   112  
   113  	// The current manifest file number.
   114  	manifestFileNum base.DiskFileNum
   115  	manifestMarker  *atomicfs.Marker
   116  
   117  	manifestFile          vfs.File
   118  	manifest              *record.Writer
   119  	setCurrent            func(base.DiskFileNum) error
   120  	getFormatMajorVersion func() FormatMajorVersion
   121  
   122  	writing    bool
   123  	writerCond sync.Cond
   124  	// State for deciding when to write a snapshot. Protected by mu.
   125  	rotationHelper record.RotationHelper
   126  }
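
// Editor's illustrative sketch (not part of the original file): a version
// edit is a delta against the current version. A flush, for example, would
// log an edit shaped like the one below, adding one new L0 table and bumping
// MinUnflushedLogNum; the parameters here are hypothetical placeholders.
func exampleFlushVersionEdit(meta *fileMetadata, minUnflushedLogNum base.DiskFileNum) *versionEdit {
	return &versionEdit{
		// WALs with file numbers below this one no longer hold unflushed data.
		MinUnflushedLogNum: minUnflushedLogNum,
		// The sstable produced by the flush, placed in L0.
		NewFiles: []newFileEntry{{Level: 0, Meta: meta}},
	}
}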
   127  
   128  func (vs *versionSet) init(
   129  	dirname string,
   130  	opts *Options,
   131  	marker *atomicfs.Marker,
   132  	setCurrent func(base.DiskFileNum) error,
   133  	getFMV func() FormatMajorVersion,
   134  	mu *sync.Mutex,
   135  ) {
   136  	vs.dirname = dirname
   137  	vs.mu = mu
   138  	vs.writerCond.L = mu
   139  	vs.opts = opts
   140  	vs.fs = opts.FS
   141  	vs.cmp = opts.Comparer.Compare
   142  	vs.cmpName = opts.Comparer.Name
   143  	vs.dynamicBaseLevel = true
   144  	vs.versions.Init(mu)
   145  	vs.obsoleteFn = vs.addObsoleteLocked
   146  	vs.zombieTables = make(map[base.DiskFileNum]uint64)
   147  	vs.backingState.fileBackingMap = make(map[base.DiskFileNum]*fileBacking)
   148  	vs.backingState.fileBackingSize = 0
   149  	vs.nextFileNum = 1
   150  	vs.manifestMarker = marker
   151  	vs.setCurrent = setCurrent
   152  	vs.getFormatMajorVersion = getFMV
   153  }
   154  
   155  // create creates a version set for a fresh DB.
   156  func (vs *versionSet) create(
   157  	jobID int,
   158  	dirname string,
   159  	opts *Options,
   160  	marker *atomicfs.Marker,
   161  	setCurrent func(base.DiskFileNum) error,
   162  	getFormatMajorVersion func() FormatMajorVersion,
   163  	mu *sync.Mutex,
   164  ) error {
   165  	vs.init(dirname, opts, marker, setCurrent, getFormatMajorVersion, mu)
   166  	newVersion := &version{}
   167  	vs.append(newVersion)
   168  	var err error
   169  
   170  	vs.picker = newCompactionPicker(newVersion, vs.opts, nil)
   171  	// Note that a "snapshot" version edit is written to the manifest when it is
   172  	// created.
   173  	vs.manifestFileNum = vs.getNextDiskFileNum()
   174  	err = vs.createManifest(vs.dirname, vs.manifestFileNum, vs.minUnflushedLogNum, vs.nextFileNum)
   175  	if err == nil {
   176  		if err = vs.manifest.Flush(); err != nil {
   177  			vs.opts.Logger.Fatalf("MANIFEST flush failed: %v", err)
   178  		}
   179  	}
   180  	if err == nil {
   181  		if err = vs.manifestFile.Sync(); err != nil {
   182  			vs.opts.Logger.Fatalf("MANIFEST sync failed: %v", err)
   183  		}
   184  	}
   185  	if err == nil {
   186  		// NB: setCurrent is responsible for syncing the data directory.
   187  		if err = vs.setCurrent(vs.manifestFileNum); err != nil {
   188  			vs.opts.Logger.Fatalf("MANIFEST set current failed: %v", err)
   189  		}
   190  	}
   191  
   192  	vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
   193  		JobID:   jobID,
   194  		Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, vs.manifestFileNum),
   195  		FileNum: vs.manifestFileNum,
   196  		Err:     err,
   197  	})
   198  	if err != nil {
   199  		return err
   200  	}
   201  	return nil
   202  }
   203  
   204  // load loads the version set from the manifest file.
   205  func (vs *versionSet) load(
   206  	dirname string,
   207  	opts *Options,
   208  	manifestFileNum base.DiskFileNum,
   209  	marker *atomicfs.Marker,
   210  	setCurrent func(base.DiskFileNum) error,
   211  	getFormatMajorVersion func() FormatMajorVersion,
   212  	mu *sync.Mutex,
   213  ) error {
   214  	vs.init(dirname, opts, marker, setCurrent, getFormatMajorVersion, mu)
   215  
   216  	vs.manifestFileNum = manifestFileNum
   217  	manifestPath := base.MakeFilepath(opts.FS, dirname, fileTypeManifest, vs.manifestFileNum)
   218  	manifestFilename := opts.FS.PathBase(manifestPath)
   219  
   220  	// Read the versionEdits in the manifest file.
   221  	var bve bulkVersionEdit
   222  	bve.AddedByFileNum = make(map[base.FileNum]*fileMetadata)
   223  	manifest, err := vs.fs.Open(manifestPath)
   224  	if err != nil {
   225  		return errors.Wrapf(err, "pebble: could not open manifest file %q for DB %q",
   226  			errors.Safe(manifestFilename), dirname)
   227  	}
   228  	defer manifest.Close()
   229  	rr := record.NewReader(manifest, 0 /* logNum */)
   230  	for {
   231  		r, err := rr.Next()
   232  		if err == io.EOF || record.IsInvalidRecord(err) {
   233  			break
   234  		}
   235  		if err != nil {
   236  			return errors.Wrapf(err, "pebble: error when loading manifest file %q",
   237  				errors.Safe(manifestFilename))
   238  		}
   239  		var ve versionEdit
   240  		err = ve.Decode(r)
   241  		if err != nil {
   242  			// Break instead of returning an error if the record is corrupted
   243  			// or invalid.
   244  			if err == io.EOF || record.IsInvalidRecord(err) {
   245  				break
   246  			}
   247  			return err
   248  		}
   249  		if ve.ComparerName != "" {
   250  			if ve.ComparerName != vs.cmpName {
   251  				return errors.Errorf("pebble: manifest file %q for DB %q: "+
   252  					"comparer name from file %q != comparer name from Options %q",
   253  					errors.Safe(manifestFilename), dirname, errors.Safe(ve.ComparerName), errors.Safe(vs.cmpName))
   254  			}
   255  		}
   256  		if err := bve.Accumulate(&ve); err != nil {
   257  			return err
   258  		}
   259  		if ve.MinUnflushedLogNum != 0 {
   260  			vs.minUnflushedLogNum = ve.MinUnflushedLogNum
   261  		}
   262  		if ve.NextFileNum != 0 {
   263  			vs.nextFileNum = ve.NextFileNum
   264  		}
   265  		if ve.LastSeqNum != 0 {
   266  			// logSeqNum is the _next_ sequence number that will be assigned,
   267  			// while LastSeqNum is the last assigned sequence number. Note that
   268  			// this behaviour mimics that in RocksDB; the first sequence number
   269  			// assigned is one greater than the one present in the manifest
   270  			// (assuming no WALs contain higher sequence numbers than the
   271  			// manifest's LastSeqNum). Increment LastSeqNum by 1 to get the
   272  			// next sequence number that will be assigned.
   273  			//
   274  			// If LastSeqNum is less than SeqNumStart, increase it to at least
   275  			// SeqNumStart to leave ample room for reserved sequence numbers.
   276  			if ve.LastSeqNum+1 < base.SeqNumStart {
   277  				vs.logSeqNum.Store(base.SeqNumStart)
   278  			} else {
   279  				vs.logSeqNum.Store(ve.LastSeqNum + 1)
   280  			}
   281  		}
   282  	}
    283  	// vs.init (called at the top of this function) set vs.nextFileNum to 1,
    284  	// and the replay loop above could only have updated it to some other
    285  	// non-zero value, so it cannot be 0 here.
   286  	if vs.minUnflushedLogNum == 0 {
   287  		if vs.nextFileNum >= 2 {
   288  			// We either have a freshly created DB, or a DB created by RocksDB
   289  			// that has not had a single flushed SSTable yet. This is because
   290  			// RocksDB bumps up nextFileNum in this case without bumping up
   291  			// minUnflushedLogNum, even if WALs with non-zero file numbers are
   292  			// present in the directory.
   293  		} else {
   294  			return base.CorruptionErrorf("pebble: malformed manifest file %q for DB %q",
   295  				errors.Safe(manifestFilename), dirname)
   296  		}
   297  	}
   298  	vs.markFileNumUsed(vs.minUnflushedLogNum)
   299  
    300  	// Populate the fileBackingMap with the FileBackings for virtual sstables,
    301  	// now that we have finished version edit accumulation.
   302  	for _, s := range bve.AddedFileBacking {
   303  		vs.addFileBacking(s)
   304  	}
   305  
   306  	for _, fileNum := range bve.RemovedFileBacking {
   307  		vs.removeFileBacking(fileNum)
   308  	}
   309  
   310  	newVersion, err := bve.Apply(
   311  		nil, vs.cmp, opts.Comparer.FormatKey, opts.FlushSplitBytes,
   312  		opts.Experimental.ReadCompactionRate, nil, /* zombies */
   313  		getFormatMajorVersion().orderingInvariants(),
   314  	)
   315  	if err != nil {
   316  		return err
   317  	}
   318  	newVersion.L0Sublevels.InitCompactingFileInfo(nil /* in-progress compactions */)
   319  	vs.append(newVersion)
   320  
   321  	for i := range vs.metrics.Levels {
   322  		l := &vs.metrics.Levels[i]
   323  		l.NumFiles = int64(newVersion.Levels[i].Len())
   324  		files := newVersion.Levels[i].Slice()
   325  		l.Size = int64(files.SizeSum())
   326  	}
   327  
   328  	vs.picker = newCompactionPicker(newVersion, vs.opts, nil)
   329  	return nil
   330  }
   331  
   332  func (vs *versionSet) close() error {
   333  	if vs.manifestFile != nil {
   334  		if err := vs.manifestFile.Close(); err != nil {
   335  			return err
   336  		}
   337  	}
   338  	if vs.manifestMarker != nil {
   339  		if err := vs.manifestMarker.Close(); err != nil {
   340  			return err
   341  		}
   342  	}
   343  	return nil
   344  }
   345  
   346  // logLock locks the manifest for writing. The lock must be released by either
   347  // a call to logUnlock or logAndApply.
   348  //
   349  // DB.mu must be held when calling this method, but the mutex may be dropped and
   350  // re-acquired during the course of this method.
   351  func (vs *versionSet) logLock() {
   352  	// Wait for any existing writing to the manifest to complete, then mark the
   353  	// manifest as busy.
   354  	for vs.writing {
   355  		vs.writerCond.Wait()
   356  	}
   357  	vs.writing = true
   358  }
   359  
   360  // logUnlock releases the lock for manifest writing.
   361  //
   362  // DB.mu must be held when calling this method.
   363  func (vs *versionSet) logUnlock() {
   364  	if !vs.writing {
   365  		vs.opts.Logger.Fatalf("MANIFEST not locked for writing")
   366  	}
   367  	vs.writing = false
   368  	vs.writerCond.Signal()
   369  }
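
// Editor's illustrative sketch (not part of the original file): the typical
// sequence a caller follows to install a version edit. DB.mu is held, the
// manifest is locked via logLock, and logAndApply both logs the edit and
// releases the manifest lock, even on error. The jobID and edit are
// hypothetical; a real caller would also pass its in-progress compactions
// instead of nil, and may compute richer per-level metrics than
// newFileMetrics provides.
func exampleInstallVersionEdit(d *DB, jobID int, ve *versionEdit) error {
	d.mu.Lock()
	defer d.mu.Unlock()
	d.mu.versions.logLock()
	return d.mu.versions.logAndApply(
		jobID, ve, newFileMetrics(ve.NewFiles), false, /* forceRotation */
		func() []compactionInfo { return nil },
	)
}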
   370  
   371  // Only call if the DiskFileNum doesn't exist in the fileBackingMap.
   372  func (vs *versionSet) addFileBacking(backing *manifest.FileBacking) {
   373  	_, ok := vs.backingState.fileBackingMap[backing.DiskFileNum]
   374  	if ok {
   375  		panic("pebble: trying to add an existing file backing")
   376  	}
   377  	vs.backingState.fileBackingMap[backing.DiskFileNum] = backing
   378  	vs.backingState.fileBackingSize += backing.Size
   379  }
   380  
    381  // Only call if the DiskFileNum exists in the fileBackingMap.
   382  func (vs *versionSet) removeFileBacking(dfn base.DiskFileNum) {
   383  	backing, ok := vs.backingState.fileBackingMap[dfn]
   384  	if !ok {
   385  		panic("pebble: trying to remove an unknown file backing")
   386  	}
   387  	delete(vs.backingState.fileBackingMap, dfn)
   388  	vs.backingState.fileBackingSize -= backing.Size
   389  }
   390  
   391  // logAndApply logs the version edit to the manifest, applies the version edit
   392  // to the current version, and installs the new version.
   393  //
   394  // DB.mu must be held when calling this method and will be released temporarily
   395  // while performing file I/O. Requires that the manifest is locked for writing
   396  // (see logLock). Will unconditionally release the manifest lock (via
   397  // logUnlock) even if an error occurs.
   398  //
   399  // inProgressCompactions is called while DB.mu is held, to get the list of
   400  // in-progress compactions.
   401  func (vs *versionSet) logAndApply(
   402  	jobID int,
   403  	ve *versionEdit,
   404  	metrics map[int]*LevelMetrics,
   405  	forceRotation bool,
   406  	inProgressCompactions func() []compactionInfo,
   407  ) error {
   408  	if !vs.writing {
   409  		vs.opts.Logger.Fatalf("MANIFEST not locked for writing")
   410  	}
   411  	defer vs.logUnlock()
   412  
   413  	if ve.MinUnflushedLogNum != 0 {
   414  		if ve.MinUnflushedLogNum < vs.minUnflushedLogNum ||
   415  			vs.nextFileNum <= uint64(ve.MinUnflushedLogNum) {
   416  			panic(fmt.Sprintf("pebble: inconsistent versionEdit minUnflushedLogNum %d",
   417  				ve.MinUnflushedLogNum))
   418  		}
   419  	}
   420  
    421  	// This is the next manifest filenum, but if the current file is too big we
    422  	// will write this ve to the next file, which means that what ve encodes is
    423  	// the current filenum and not the next one.
   424  	//
   425  	// TODO(sbhola): figure out why this is correct and update comment.
   426  	ve.NextFileNum = vs.nextFileNum
   427  
   428  	// LastSeqNum is set to the current upper bound on the assigned sequence
   429  	// numbers. Note that this is exactly the behavior of RocksDB. LastSeqNum is
   430  	// used to initialize versionSet.logSeqNum and versionSet.visibleSeqNum on
    431  	// replay. It must be higher than or equal to any sequence number
   432  	// written to an sstable, including sequence numbers in ingested files.
   433  	// Note that LastSeqNum is not (and cannot be) the minimum unflushed sequence
   434  	// number. This is fallout from ingestion which allows a sequence number X to
   435  	// be assigned to an ingested sstable even though sequence number X-1 resides
   436  	// in an unflushed memtable. logSeqNum is the _next_ sequence number that
    437  	// will be assigned, so subtract 1 from it to get the upper bound on the
   438  	// last assigned sequence number.
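	//
	// Editor's illustrative arithmetic (not in the original): if logSeqNum is
	// currently 101, i.e. 101 is the next sequence number that will be
	// assigned, then sequence numbers up through 100 have already been
	// assigned and ve.LastSeqNum is recorded as 100.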
   439  	logSeqNum := vs.logSeqNum.Load()
   440  	ve.LastSeqNum = logSeqNum - 1
   441  	if logSeqNum == 0 {
   442  		// logSeqNum is initialized to 1 in Open() if there are no previous WAL
   443  		// or manifest records, so this case should never happen.
   444  		vs.opts.Logger.Fatalf("logSeqNum must be a positive integer: %d", logSeqNum)
   445  	}
   446  
   447  	currentVersion := vs.currentVersion()
   448  	fmv := vs.getFormatMajorVersion()
   449  	orderingInvariants := fmv.orderingInvariants()
   450  	var newVersion *version
   451  
   452  	// Generate a new manifest if we don't currently have one, or forceRotation
   453  	// is true, or the current one is too large.
   454  	//
    455  	// For largeness, we do not exclusively use the MaxManifestFileSize
    456  	// threshold, since we have had incidents where, due to either large keys or
   457  	// large numbers of files, each edit results in a snapshot + write of the
   458  	// edit. This slows the system down since each flush or compaction is
   459  	// writing a new manifest snapshot. The primary goal of the size-based
   460  	// rollover logic is to ensure that when reopening a DB, the number of edits
   461  	// that need to be replayed on top of the snapshot is "sane". Rolling over
   462  	// to a new manifest after each edit is not relevant to that goal.
   463  	//
   464  	// Consider the following cases:
   465  	// - The number of live files F in the DB is roughly stable: after writing
   466  	//   the snapshot (with F files), say we require that there be enough edits
   467  	//   such that the cumulative number of files in those edits, E, be greater
   468  	//   than F. This will ensure that the total amount of time in logAndApply
   469  	//   that is spent in snapshot writing is ~50%.
   470  	//
   471  	// - The number of live files F in the DB is shrinking drastically, say from
   472  	//   F to F/10: This can happen for various reasons, like wide range
    473  	//   tombstones, or large numbers of smaller-than-usual files that are being
    474  	//   merged together into larger files. Say the number of new files generated
    475  	//   during this shrinkage is insignificant compared to F/10, so for this
    476  	//   example we will assume it is effectively 0. After this shrinking,
   477  	//   E = 0.9F, and so if we used the previous snapshot file count, F, as the
   478  	//   threshold that needs to be exceeded, we will further delay the snapshot
    479  	//   writing, which means on DB reopen we will need to replay 0.9F edits to
   480  	//   get to a version with 0.1F files. It would be better to create a new
   481  	//   snapshot when E exceeds the number of files in the current version.
   482  	//
   483  	// - The number of live files F in the DB is growing via perfect ingests
   484  	//   into L6: Say we wrote the snapshot when there were F files and now we
   485  	//   have 10F files, so E = 9F. We will further delay writing a new
   486  	//   snapshot. This case can be critiqued as contrived, but we consider it
   487  	//   nonetheless.
   488  	//
   489  	// The logic below uses the min of the last snapshot file count and the file
   490  	// count in the current version.
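	//
	// Editor's illustrative arithmetic (not in the original): if the last
	// snapshot was written when the version held 1,000 files and the version
	// has since shrunk to 100 files, the threshold used is min(1,000, 100) =
	// 100, so a new snapshot is written after roughly 100 files' worth of
	// accumulated edits rather than after 1,000.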
   491  	vs.rotationHelper.AddRecord(int64(len(ve.DeletedFiles) + len(ve.NewFiles)))
   492  	sizeExceeded := vs.manifest.Size() >= vs.opts.MaxManifestFileSize
   493  	requireRotation := forceRotation || vs.manifest == nil
   494  
   495  	var nextSnapshotFilecount int64
   496  	for i := range vs.metrics.Levels {
   497  		nextSnapshotFilecount += vs.metrics.Levels[i].NumFiles
   498  	}
   499  	if sizeExceeded && !requireRotation {
   500  		requireRotation = vs.rotationHelper.ShouldRotate(nextSnapshotFilecount)
   501  	}
   502  	var newManifestFileNum base.DiskFileNum
   503  	var prevManifestFileSize uint64
   504  	if requireRotation {
   505  		newManifestFileNum = vs.getNextDiskFileNum()
   506  		prevManifestFileSize = uint64(vs.manifest.Size())
   507  	}
   508  
   509  	// Grab certain values before releasing vs.mu, in case createManifest() needs
   510  	// to be called.
   511  	minUnflushedLogNum := vs.minUnflushedLogNum
   512  	nextFileNum := vs.nextFileNum
   513  
   514  	var zombies map[base.DiskFileNum]uint64
   515  	if err := func() error {
   516  		vs.mu.Unlock()
   517  		defer vs.mu.Lock()
   518  
   519  		var err error
   520  		if vs.getFormatMajorVersion() < FormatVirtualSSTables && len(ve.CreatedBackingTables) > 0 {
   521  			return errors.AssertionFailedf("MANIFEST cannot contain virtual sstable records due to format major version")
   522  		}
   523  		newVersion, zombies, err = manifest.AccumulateIncompleteAndApplySingleVE(
   524  			ve, currentVersion, vs.cmp, vs.opts.Comparer.FormatKey,
   525  			vs.opts.FlushSplitBytes, vs.opts.Experimental.ReadCompactionRate,
   526  			vs.backingState.fileBackingMap, vs.addFileBacking, vs.removeFileBacking,
   527  			orderingInvariants,
   528  		)
   529  		if err != nil {
   530  			return errors.Wrap(err, "MANIFEST apply failed")
   531  		}
   532  
   533  		if newManifestFileNum != 0 {
   534  			if err := vs.createManifest(vs.dirname, newManifestFileNum, minUnflushedLogNum, nextFileNum); err != nil {
   535  				vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
   536  					JobID:   jobID,
   537  					Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum),
   538  					FileNum: newManifestFileNum,
   539  					Err:     err,
   540  				})
   541  				return errors.Wrap(err, "MANIFEST create failed")
   542  			}
   543  		}
   544  
   545  		w, err := vs.manifest.Next()
   546  		if err != nil {
   547  			return errors.Wrap(err, "MANIFEST next record write failed")
   548  		}
   549  
   550  		// NB: Any error from this point on is considered fatal as we don't know if
   551  		// the MANIFEST write occurred or not. Trying to determine that is
   552  		// fraught. Instead we rely on the standard recovery mechanism run when a
    553  		// database is opened. In particular, that mechanism generates a new MANIFEST
   554  		// and ensures it is synced.
   555  		if err := ve.Encode(w); err != nil {
   556  			return errors.Wrap(err, "MANIFEST write failed")
   557  		}
   558  		if err := vs.manifest.Flush(); err != nil {
   559  			return errors.Wrap(err, "MANIFEST flush failed")
   560  		}
   561  		if err := vs.manifestFile.Sync(); err != nil {
   562  			return errors.Wrap(err, "MANIFEST sync failed")
   563  		}
   564  		if newManifestFileNum != 0 {
   565  			// NB: setCurrent is responsible for syncing the data directory.
   566  			if err := vs.setCurrent(newManifestFileNum); err != nil {
   567  				return errors.Wrap(err, "MANIFEST set current failed")
   568  			}
   569  			vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
   570  				JobID:   jobID,
   571  				Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum),
   572  				FileNum: newManifestFileNum,
   573  			})
   574  		}
   575  		return nil
   576  	}(); err != nil {
   577  		// Any error encountered during any of the operations in the previous
    578  		// closure is considered fatal. Treating such errors as fatal is preferred
   579  		// to attempting to unwind various file and b-tree reference counts, and
   580  		// re-generating L0 sublevel metadata. This may change in the future, if
   581  		// certain manifest / WAL operations become retryable. For more context, see
   582  		// #1159 and #1792.
   583  		vs.opts.Logger.Fatalf("%s", err)
   584  		return err
   585  	}
   586  
   587  	if requireRotation {
   588  		// Successfully rotated.
   589  		vs.rotationHelper.Rotate(nextSnapshotFilecount)
   590  	}
   591  	// Now that DB.mu is held again, initialize compacting file info in
   592  	// L0Sublevels.
   593  	inProgress := inProgressCompactions()
   594  
   595  	newVersion.L0Sublevels.InitCompactingFileInfo(inProgressL0Compactions(inProgress))
   596  
   597  	// Update the zombie tables set first, as installation of the new version
    598  	// will unref the previous version, which could result in addObsoleteLocked
   599  	// being called.
   600  	for fileNum, size := range zombies {
   601  		vs.zombieTables[fileNum] = size
   602  	}
   603  
   604  	// Install the new version.
   605  	vs.append(newVersion)
   606  	if ve.MinUnflushedLogNum != 0 {
   607  		vs.minUnflushedLogNum = ve.MinUnflushedLogNum
   608  	}
   609  	if newManifestFileNum != 0 {
   610  		if vs.manifestFileNum != 0 {
   611  			vs.obsoleteManifests = append(vs.obsoleteManifests, fileInfo{
   612  				fileNum:  vs.manifestFileNum,
   613  				fileSize: prevManifestFileSize,
   614  			})
   615  		}
   616  		vs.manifestFileNum = newManifestFileNum
   617  	}
   618  
   619  	for level, update := range metrics {
   620  		vs.metrics.Levels[level].Add(update)
   621  	}
   622  	for i := range vs.metrics.Levels {
   623  		l := &vs.metrics.Levels[i]
   624  		l.NumFiles = int64(newVersion.Levels[i].Len())
   625  		l.NumVirtualFiles = newVersion.Levels[i].NumVirtual
   626  		l.VirtualSize = newVersion.Levels[i].VirtualSize
   627  		l.Size = int64(newVersion.Levels[i].Size())
   628  
   629  		l.Sublevels = 0
   630  		if l.NumFiles > 0 {
   631  			l.Sublevels = 1
   632  		}
   633  		if invariants.Enabled {
   634  			levelFiles := newVersion.Levels[i].Slice()
   635  			if size := int64(levelFiles.SizeSum()); l.Size != size {
   636  				vs.opts.Logger.Fatalf("versionSet metrics L%d Size = %d, actual size = %d", i, l.Size, size)
   637  			}
   638  			if nVirtual := levelFiles.NumVirtual(); nVirtual != l.NumVirtualFiles {
   639  				vs.opts.Logger.Fatalf(
   640  					"versionSet metrics L%d NumVirtual = %d, actual NumVirtual = %d",
   641  					i, l.NumVirtualFiles, nVirtual,
   642  				)
   643  			}
   644  			if vSize := levelFiles.VirtualSizeSum(); vSize != l.VirtualSize {
   645  				vs.opts.Logger.Fatalf(
   646  					"versionSet metrics L%d Virtual size = %d, actual size = %d",
   647  					i, l.VirtualSize, vSize,
   648  				)
   649  			}
   650  		}
   651  	}
   652  	vs.metrics.Levels[0].Sublevels = int32(len(newVersion.L0SublevelFiles))
   653  
   654  	vs.picker = newCompactionPicker(newVersion, vs.opts, inProgress)
   655  	if !vs.dynamicBaseLevel {
   656  		vs.picker.forceBaseLevel1()
   657  	}
   658  	return nil
   659  }
   660  
   661  func (vs *versionSet) incrementCompactions(
   662  	kind compactionKind, extraLevels []*compactionLevel, pickerMetrics compactionPickerMetrics,
   663  ) {
   664  	switch kind {
   665  	case compactionKindDefault:
   666  		vs.metrics.Compact.Count++
   667  		vs.metrics.Compact.DefaultCount++
   668  
   669  	case compactionKindFlush, compactionKindIngestedFlushable:
   670  		vs.metrics.Flush.Count++
   671  
   672  	case compactionKindMove:
   673  		vs.metrics.Compact.Count++
   674  		vs.metrics.Compact.MoveCount++
   675  
   676  	case compactionKindDeleteOnly:
   677  		vs.metrics.Compact.Count++
   678  		vs.metrics.Compact.DeleteOnlyCount++
   679  
   680  	case compactionKindElisionOnly:
   681  		vs.metrics.Compact.Count++
   682  		vs.metrics.Compact.ElisionOnlyCount++
   683  
   684  	case compactionKindRead:
   685  		vs.metrics.Compact.Count++
   686  		vs.metrics.Compact.ReadCount++
   687  
   688  	case compactionKindRewrite:
   689  		vs.metrics.Compact.Count++
   690  		vs.metrics.Compact.RewriteCount++
   691  	}
   692  	if len(extraLevels) > 0 {
   693  		vs.metrics.Compact.MultiLevelCount++
   694  	}
   695  }
   696  
   697  func (vs *versionSet) incrementCompactionBytes(numBytes int64) {
   698  	vs.atomicInProgressBytes.Add(numBytes)
   699  }
   700  
   701  // createManifest creates a manifest file that contains a snapshot of vs.
   702  func (vs *versionSet) createManifest(
   703  	dirname string, fileNum, minUnflushedLogNum base.DiskFileNum, nextFileNum uint64,
   704  ) (err error) {
   705  	var (
   706  		filename     = base.MakeFilepath(vs.fs, dirname, fileTypeManifest, fileNum)
   707  		manifestFile vfs.File
   708  		manifest     *record.Writer
   709  	)
   710  	defer func() {
   711  		if manifest != nil {
   712  			manifest.Close()
   713  		}
   714  		if manifestFile != nil {
   715  			manifestFile.Close()
   716  		}
   717  		if err != nil {
   718  			vs.fs.Remove(filename)
   719  		}
   720  	}()
   721  	manifestFile, err = vs.fs.Create(filename)
   722  	if err != nil {
   723  		return err
   724  	}
   725  	manifest = record.NewWriter(manifestFile)
   726  
   727  	snapshot := versionEdit{
   728  		ComparerName: vs.cmpName,
   729  	}
   730  	dedup := make(map[base.DiskFileNum]struct{})
   731  	for level, levelMetadata := range vs.currentVersion().Levels {
   732  		iter := levelMetadata.Iter()
   733  		for meta := iter.First(); meta != nil; meta = iter.Next() {
   734  			snapshot.NewFiles = append(snapshot.NewFiles, newFileEntry{
   735  				Level: level,
   736  				Meta:  meta,
   737  			})
   738  			if _, ok := dedup[meta.FileBacking.DiskFileNum]; meta.Virtual && !ok {
   739  				dedup[meta.FileBacking.DiskFileNum] = struct{}{}
   740  				snapshot.CreatedBackingTables = append(
   741  					snapshot.CreatedBackingTables,
   742  					meta.FileBacking,
   743  				)
   744  			}
   745  		}
   746  	}
   747  
   748  	// When creating a version snapshot for an existing DB, this snapshot VersionEdit will be
   749  	// immediately followed by another VersionEdit (being written in logAndApply()). That
   750  	// VersionEdit always contains a LastSeqNum, so we don't need to include that in the snapshot.
    751  	// But it does not necessarily include MinUnflushedLogNum or NextFileNum, so we initialize those
   752  	// using the corresponding fields in the versionSet (which came from the latest preceding
   753  	// VersionEdit that had those fields).
   754  	snapshot.MinUnflushedLogNum = minUnflushedLogNum
   755  	snapshot.NextFileNum = nextFileNum
   756  
   757  	w, err1 := manifest.Next()
   758  	if err1 != nil {
   759  		return err1
   760  	}
   761  	if err := snapshot.Encode(w); err != nil {
   762  		return err
   763  	}
   764  
   765  	if vs.manifest != nil {
   766  		vs.manifest.Close()
   767  		vs.manifest = nil
   768  	}
   769  	if vs.manifestFile != nil {
   770  		if err := vs.manifestFile.Close(); err != nil {
   771  			return err
   772  		}
   773  		vs.manifestFile = nil
   774  	}
   775  
   776  	vs.manifest, manifest = manifest, nil
   777  	vs.manifestFile, manifestFile = manifestFile, nil
   778  	return nil
   779  }
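
// Editor's note (not part of the original file): the snapshot versionEdit
// written by createManifest lists every live file in every level as a
// NewFiles entry, plus CreatedBackingTables entries for virtual sstables, so
// replaying that single record reconstructs the full LSM shape; the records
// that follow it in the same MANIFEST are the incremental deltas appended by
// logAndApply.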
   780  
   781  func (vs *versionSet) markFileNumUsed(fileNum base.DiskFileNum) {
   782  	if vs.nextFileNum <= uint64(fileNum) {
   783  		vs.nextFileNum = uint64(fileNum + 1)
   784  	}
   785  }
   786  
   787  func (vs *versionSet) getNextFileNum() base.FileNum {
   788  	x := vs.nextFileNum
   789  	vs.nextFileNum++
   790  	return base.FileNum(x)
   791  }
   792  
   793  func (vs *versionSet) getNextDiskFileNum() base.DiskFileNum {
   794  	x := vs.nextFileNum
   795  	vs.nextFileNum++
   796  	return base.DiskFileNum(x)
   797  }
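
// Editor's note (not part of the original file): because WALs, MANIFESTs,
// sstables, and OPTIONS files all draw from the same counter, consecutive
// calls hand out consecutive numbers regardless of file type; for example, a
// WAL might receive file number 12, the next sstable 13, and the next
// MANIFEST 14 (hypothetical values).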
   798  
   799  func (vs *versionSet) append(v *version) {
   800  	if v.Refs() != 0 {
   801  		panic("pebble: version should be unreferenced")
   802  	}
   803  	if !vs.versions.Empty() {
   804  		vs.versions.Back().UnrefLocked()
   805  	}
   806  	v.Deleted = vs.obsoleteFn
   807  	v.Ref()
   808  	vs.versions.PushBack(v)
   809  }
   810  
   811  func (vs *versionSet) currentVersion() *version {
   812  	return vs.versions.Back()
   813  }
   814  
   815  func (vs *versionSet) addLiveFileNums(m map[base.DiskFileNum]struct{}) {
   816  	current := vs.currentVersion()
   817  	for v := vs.versions.Front(); true; v = v.Next() {
   818  		for _, lm := range v.Levels {
   819  			iter := lm.Iter()
   820  			for f := iter.First(); f != nil; f = iter.Next() {
   821  				m[f.FileBacking.DiskFileNum] = struct{}{}
   822  			}
   823  		}
   824  		if v == current {
   825  			break
   826  		}
   827  	}
   828  }
   829  
   830  // addObsoleteLocked will add the fileInfo associated with obsolete backing
   831  // sstables to the obsolete tables list.
   832  //
   833  // The file backings in the obsolete list must not appear more than once.
   834  //
   835  // DB.mu must be held when addObsoleteLocked is called.
   836  func (vs *versionSet) addObsoleteLocked(obsolete []*fileBacking) {
   837  	if len(obsolete) == 0 {
   838  		return
   839  	}
   840  
   841  	obsoleteFileInfo := make([]fileInfo, len(obsolete))
   842  	for i, bs := range obsolete {
   843  		obsoleteFileInfo[i].fileNum = bs.DiskFileNum
   844  		obsoleteFileInfo[i].fileSize = bs.Size
   845  	}
   846  
   847  	if invariants.Enabled {
   848  		dedup := make(map[base.DiskFileNum]struct{})
   849  		for _, fi := range obsoleteFileInfo {
   850  			dedup[fi.fileNum] = struct{}{}
   851  		}
   852  		if len(dedup) != len(obsoleteFileInfo) {
   853  			panic("pebble: duplicate FileBacking present in obsolete list")
   854  		}
   855  	}
   856  
   857  	for _, fi := range obsoleteFileInfo {
   858  		// Note that the obsolete tables are no longer zombie by the definition of
   859  		// zombie, but we leave them in the zombie tables map until they are
   860  		// deleted from disk.
   861  		if _, ok := vs.zombieTables[fi.fileNum]; !ok {
   862  			vs.opts.Logger.Fatalf("MANIFEST obsolete table %s not marked as zombie", fi.fileNum)
   863  		}
   864  	}
   865  
   866  	vs.obsoleteTables = append(vs.obsoleteTables, obsoleteFileInfo...)
   867  	vs.updateObsoleteTableMetricsLocked()
   868  }
   869  
   870  // addObsolete will acquire DB.mu, so DB.mu must not be held when this is
   871  // called.
   872  func (vs *versionSet) addObsolete(obsolete []*fileBacking) {
   873  	vs.mu.Lock()
   874  	defer vs.mu.Unlock()
   875  	vs.addObsoleteLocked(obsolete)
   876  }
   877  
   878  func (vs *versionSet) updateObsoleteTableMetricsLocked() {
   879  	vs.metrics.Table.ObsoleteCount = int64(len(vs.obsoleteTables))
   880  	vs.metrics.Table.ObsoleteSize = 0
   881  	for _, fi := range vs.obsoleteTables {
   882  		vs.metrics.Table.ObsoleteSize += fi.fileSize
   883  	}
   884  }
   885  
   886  func setCurrentFunc(
   887  	vers FormatMajorVersion, marker *atomicfs.Marker, fs vfs.FS, dirname string, dir vfs.File,
   888  ) func(base.DiskFileNum) error {
   889  	if vers < formatVersionedManifestMarker {
   890  		// Pebble versions before `formatVersionedManifestMarker` used
   891  		// the CURRENT file to signal which MANIFEST is current. Ignore
   892  		// the filename read during LocateMarker.
   893  		return func(manifestFileNum base.DiskFileNum) error {
   894  			if err := setCurrentFile(dirname, fs, manifestFileNum); err != nil {
   895  				return err
   896  			}
   897  			if err := dir.Sync(); err != nil {
    898  			// We panic here, rather than higher in the call
   899  				// stack, for parity with the atomicfs.Marker behavior.
   900  				// A panic is always necessary because failed Syncs are
   901  				// unrecoverable.
   902  				panic(errors.Wrap(err, "fatal: MANIFEST dirsync failed"))
   903  			}
   904  			return nil
   905  		}
   906  	}
   907  	return setCurrentFuncMarker(marker, fs, dirname)
   908  }
   909  
   910  func setCurrentFuncMarker(
   911  	marker *atomicfs.Marker, fs vfs.FS, dirname string,
   912  ) func(base.DiskFileNum) error {
   913  	return func(manifestFileNum base.DiskFileNum) error {
   914  		return marker.Move(base.MakeFilename(fileTypeManifest, manifestFileNum))
   915  	}
   916  }
   917  
   918  func findCurrentManifest(
   919  	vers FormatMajorVersion, fs vfs.FS, dirname string,
   920  ) (marker *atomicfs.Marker, manifestNum base.DiskFileNum, exists bool, err error) {
   921  	// NB: We always locate the manifest marker, even if we might not
   922  	// actually use it (because we're opening the database at an earlier
   923  	// format major version that uses the CURRENT file).  Locating a
   924  	// marker should succeed even if the marker has never been placed.
   925  	var filename string
   926  	marker, filename, err = atomicfs.LocateMarker(fs, dirname, manifestMarkerName)
   927  	if err != nil {
   928  		return nil, base.FileNum(0).DiskFileNum(), false, err
   929  	}
   930  
   931  	if vers < formatVersionedManifestMarker {
   932  		// Pebble versions before `formatVersionedManifestMarker` used
   933  		// the CURRENT file to signal which MANIFEST is current. Ignore
   934  		// the filename read during LocateMarker.
   935  
   936  		manifestNum, err = readCurrentFile(fs, dirname)
   937  		if oserror.IsNotExist(err) {
   938  			return marker, base.FileNum(0).DiskFileNum(), false, nil
   939  		} else if err != nil {
   940  			return marker, base.FileNum(0).DiskFileNum(), false, err
   941  		}
   942  		return marker, manifestNum, true, nil
   943  	}
   944  
   945  	// The current format major version is >=
    946  	// formatVersionedManifestMarker, indicating that the
   947  	// atomicfs.Marker is the source of truth on the current manifest.
   948  
   949  	if filename == "" {
   950  		// The marker hasn't been set yet. This database doesn't exist.
   951  		return marker, base.FileNum(0).DiskFileNum(), false, nil
   952  	}
   953  
   954  	var ok bool
   955  	_, manifestNum, ok = base.ParseFilename(fs, filename)
   956  	if !ok {
   957  		return marker, base.FileNum(0).DiskFileNum(), false, base.CorruptionErrorf("pebble: MANIFEST name %q is malformed", errors.Safe(filename))
   958  	}
   959  	return marker, manifestNum, true, nil
   960  }
   961  
   962  func readCurrentFile(fs vfs.FS, dirname string) (base.DiskFileNum, error) {
   963  	// Read the CURRENT file to find the current manifest file.
   964  	current, err := fs.Open(base.MakeFilepath(fs, dirname, fileTypeCurrent, base.FileNum(0).DiskFileNum()))
   965  	if err != nil {
   966  		return base.FileNum(0).DiskFileNum(), errors.Wrapf(err, "pebble: could not open CURRENT file for DB %q", dirname)
   967  	}
   968  	defer current.Close()
   969  	stat, err := current.Stat()
   970  	if err != nil {
   971  		return base.FileNum(0).DiskFileNum(), err
   972  	}
   973  	n := stat.Size()
   974  	if n == 0 {
   975  		return base.FileNum(0).DiskFileNum(), errors.Errorf("pebble: CURRENT file for DB %q is empty", dirname)
   976  	}
   977  	if n > 4096 {
   978  		return base.FileNum(0).DiskFileNum(), errors.Errorf("pebble: CURRENT file for DB %q is too large", dirname)
   979  	}
   980  	b := make([]byte, n)
   981  	_, err = current.ReadAt(b, 0)
   982  	if err != nil {
   983  		return base.FileNum(0).DiskFileNum(), err
   984  	}
   985  	if b[n-1] != '\n' {
   986  		return base.FileNum(0).DiskFileNum(), base.CorruptionErrorf("pebble: CURRENT file for DB %q is malformed", dirname)
   987  	}
   988  	b = bytes.TrimSpace(b)
   989  
   990  	_, manifestFileNum, ok := base.ParseFilename(fs, string(b))
   991  	if !ok {
   992  		return base.FileNum(0).DiskFileNum(), base.CorruptionErrorf("pebble: MANIFEST name %q is malformed", errors.Safe(b))
   993  	}
   994  	return manifestFileNum, nil
   995  }
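
// Editor's note (not part of the original file): a well-formed CURRENT file
// holds a single manifest filename followed by a newline, for example
// "MANIFEST-000007\n" (hypothetical number), which readCurrentFile parses
// into the corresponding DiskFileNum.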
   996  
   997  func newFileMetrics(newFiles []manifest.NewFileEntry) map[int]*LevelMetrics {
   998  	m := map[int]*LevelMetrics{}
   999  	for _, nf := range newFiles {
  1000  		lm := m[nf.Level]
  1001  		if lm == nil {
  1002  			lm = &LevelMetrics{}
  1003  			m[nf.Level] = lm
  1004  		}
  1005  		lm.NumFiles++
  1006  		lm.Size += int64(nf.Meta.Size)
  1007  	}
  1008  	return m
  1009  }
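
// exampleNewFileMetricsUsage is an editor's illustrative sketch (not part of
// the original file): for a hypothetical version edit that adds two L0 tables
// of 10 and 20 bytes, newFileMetrics returns a single L0 entry with
// NumFiles == 2 and Size == 30.
func exampleNewFileMetricsUsage() map[int]*LevelMetrics {
	ve := &versionEdit{
		NewFiles: []newFileEntry{
			{Level: 0, Meta: &fileMetadata{Size: 10}},
			{Level: 0, Meta: &fileMetadata{Size: 20}},
		},
	}
	return newFileMetrics(ve.NewFiles)
}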