github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/open.go

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"bytes"
     9  	"cmp"
    10  	"context"
    11  	"encoding/binary"
    12  	"fmt"
    13  	"io"
    14  	"math"
    15  	"os"
    16  	"slices"
    17  	"sync/atomic"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/errors"
    21  	"github.com/cockroachdb/errors/oserror"
    22  	"github.com/cockroachdb/pebble/internal/arenaskl"
    23  	"github.com/cockroachdb/pebble/internal/base"
    24  	"github.com/cockroachdb/pebble/internal/cache"
    25  	"github.com/cockroachdb/pebble/internal/constants"
    26  	"github.com/cockroachdb/pebble/internal/invariants"
    27  	"github.com/cockroachdb/pebble/internal/manifest"
    28  	"github.com/cockroachdb/pebble/internal/manual"
    29  	"github.com/cockroachdb/pebble/objstorage"
    30  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
    31  	"github.com/cockroachdb/pebble/record"
    32  	"github.com/cockroachdb/pebble/sstable"
    33  	"github.com/cockroachdb/pebble/vfs"
    34  	"github.com/prometheus/client_golang/prometheus"
    35  )
    36  
    37  const (
    38  	initialMemTableSize = 256 << 10 // 256 KB
    39  
    40  	// The max batch size is limited by the uint32 offsets stored in
    41  	// internal/batchskl.node, DeferredBatchOp, and flushableBatchEntry.
    42  	//
    43  	// We limit the size to MaxUint32 (just short of 4GB) so that the exclusive
    44  	// end of an allocation fits in uint32.
    45  	//
    46  	// On 32-bit systems, slices are naturally limited to MaxInt (just short of
    47  	// 2GB).
    48  	maxBatchSize = constants.MaxUint32OrInt
    49  
    50  	// The max memtable size is limited by the uint32 offsets stored in
    51  	// internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry.
    52  	//
    53  	// We limit the size to MaxUint32 (just short of 4GB) so that the exclusive
    54  	// end of an allocation fits in uint32.
    55  	//
    56  	// On 32-bit systems, slices are naturally limited to MaxInt (just short of
    57  	// 2GB).
    58  	maxMemTableSize = constants.MaxUint32OrInt
    59  )
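
        // A rough sketch of the idea behind constants.MaxUint32OrInt (the
        // authoritative definition lives in internal/constants): the limit is the
        // smaller of MaxUint32 and the platform's MaxInt, e.g.
        //
        //	limit := uint64(math.MaxUint32) // 64-bit: just short of 4GB
        //	if uint64(math.MaxInt) < limit {
        //		limit = uint64(math.MaxInt) // 32-bit: just short of 2GB
        //	}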
    60  
    61  // TableCacheSize can be used to determine the table cache size for a
    62  // single DB, given the maximum number of open files that can be
    63  // consumed by a table cache which is used only by that single DB.
    64  // The result is never less than minTableCacheSize.
    65  func TableCacheSize(maxOpenFiles int) int {
    66  	tableCacheSize := maxOpenFiles - numNonTableCacheFiles
    67  	if tableCacheSize < minTableCacheSize {
    68  		tableCacheSize = minTableCacheSize
    69  	}
    70  	return tableCacheSize
    71  }
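
        // For instance, a caller sizing Options.MaxOpenFiles can estimate the table
        // cache budget left for this DB (hypothetical value; caller-side sketch):
        //
        //	opts := &pebble.Options{MaxOpenFiles: 1000}
        //	entries := pebble.TableCacheSize(opts.MaxOpenFiles)
        //	_ = entries // number of files the table cache may keep open for this DB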
    72  
    73  // Open opens a DB whose files live in the given directory.
    74  func Open(dirname string, opts *Options) (db *DB, _ error) {
    75  	// Make a copy of the options so that we don't mutate the passed in options.
    76  	opts = opts.Clone()
    77  	opts = opts.EnsureDefaults()
    78  	if err := opts.Validate(); err != nil {
    79  		return nil, err
    80  	}
    81  	if opts.LoggerAndTracer == nil {
    82  		opts.LoggerAndTracer = &base.LoggerWithNoopTracer{Logger: opts.Logger}
    83  	} else {
    84  		opts.Logger = opts.LoggerAndTracer
    85  	}
    86  
    87  	// In all error cases, we return db = nil; this is used by various
    88  	// deferred cleanups.
    89  
    90  	// Open the database and WAL directories first.
    91  	walDirname, dataDir, walDir, err := prepareAndOpenDirs(dirname, opts)
    92  	if err != nil {
    93  		return nil, errors.Wrapf(err, "error opening database at %q", dirname)
    94  	}
    95  	defer func() {
    96  		if db == nil {
    97  			if walDir != dataDir {
    98  				walDir.Close()
    99  			}
   100  			dataDir.Close()
   101  		}
   102  	}()
   103  
   104  	// Lock the database directory.
   105  	var fileLock *Lock
   106  	if opts.Lock != nil {
   107  		// The caller already acquired the database lock. Ensure that the
   108  		// directory matches.
   109  		if dirname != opts.Lock.dirname {
   110  			return nil, errors.Newf("pebble: opts.Lock acquired in %q not %q", opts.Lock.dirname, dirname)
   111  		}
   112  		if err := opts.Lock.refForOpen(); err != nil {
   113  			return nil, err
   114  		}
   115  		fileLock = opts.Lock
   116  	} else {
   117  		fileLock, err = LockDirectory(dirname, opts.FS)
   118  		if err != nil {
   119  			return nil, err
   120  		}
   121  	}
   122  	defer func() {
   123  		if db == nil {
   124  			fileLock.Close()
   125  		}
   126  	}()
   127  
   128  	// Establish the format major version.
   129  	formatVersion, formatVersionMarker, err := lookupFormatMajorVersion(opts.FS, dirname)
   130  	if err != nil {
   131  		return nil, err
   132  	}
   133  	defer func() {
   134  		if db == nil {
   135  			formatVersionMarker.Close()
   136  		}
   137  	}()
   138  
   139  	// Find the currently active manifest, if there is one.
   140  	manifestMarker, manifestFileNum, manifestExists, err := findCurrentManifest(formatVersion, opts.FS, dirname)
   141  	if err != nil {
   142  		return nil, errors.Wrapf(err, "pebble: database %q", dirname)
   143  	}
   144  	defer func() {
   145  		if db == nil {
   146  			manifestMarker.Close()
   147  		}
   148  	}()
   149  
   150  	// Atomic markers may leave behind obsolete files if there's a crash
   151  	// mid-update. Clean these up if we're not in read-only mode.
   152  	if !opts.ReadOnly {
   153  		if err := formatVersionMarker.RemoveObsolete(); err != nil {
   154  			return nil, err
   155  		}
   156  		if err := manifestMarker.RemoveObsolete(); err != nil {
   157  			return nil, err
   158  		}
   159  	}
   160  
   161  	if opts.Cache == nil {
   162  		opts.Cache = cache.New(cacheDefaultSize)
   163  	} else {
   164  		opts.Cache.Ref()
   165  	}
   166  
   167  	d := &DB{
   168  		cacheID:             opts.Cache.NewID(),
   169  		dirname:             dirname,
   170  		walDirname:          walDirname,
   171  		opts:                opts,
   172  		cmp:                 opts.Comparer.Compare,
   173  		equal:               opts.equal(),
   174  		merge:               opts.Merger.Merge,
   175  		split:               opts.Comparer.Split,
   176  		abbreviatedKey:      opts.Comparer.AbbreviatedKey,
   177  		largeBatchThreshold: (opts.MemTableSize - uint64(memTableEmptySize)) / 2,
   178  		fileLock:            fileLock,
   179  		dataDir:             dataDir,
   180  		walDir:              walDir,
   181  		logRecycler:         logRecycler{limit: opts.MemTableStopWritesThreshold + 1},
   182  		closed:              new(atomic.Value),
   183  		closedCh:            make(chan struct{}),
   184  	}
   185  	d.mu.versions = &versionSet{}
   186  	d.diskAvailBytes.Store(math.MaxUint64)
   187  
   188  	defer func() {
   189  		// If an error or panic occurs during open, attempt to release the manually
   190  		// allocated memory resources. Note that rather than look for an error, we
   191  		// look for the return of a nil DB pointer.
   192  		if r := recover(); db == nil {
    193  			// Release our references to the Cache. Note that both the DB and
    194  			// the tableCache hold a reference. When we release the reference to
    195  			// the tableCache, and if there are no other references to
    196  			// the tableCache, then the tableCache will also release its
    197  			// reference to the cache.
   198  			opts.Cache.Unref()
   199  
   200  			if d.tableCache != nil {
   201  				_ = d.tableCache.close()
   202  			}
   203  
   204  			for _, mem := range d.mu.mem.queue {
   205  				switch t := mem.flushable.(type) {
   206  				case *memTable:
   207  					manual.Free(t.arenaBuf)
   208  					t.arenaBuf = nil
   209  				}
   210  			}
   211  			if d.cleanupManager != nil {
   212  				d.cleanupManager.Close()
   213  			}
   214  			if d.objProvider != nil {
   215  				d.objProvider.Close()
   216  			}
   217  			if r != nil {
   218  				panic(r)
   219  			}
   220  		}
   221  	}()
   222  
   223  	d.commit = newCommitPipeline(commitEnv{
   224  		logSeqNum:     &d.mu.versions.logSeqNum,
   225  		visibleSeqNum: &d.mu.versions.visibleSeqNum,
   226  		apply:         d.commitApply,
   227  		write:         d.commitWrite,
   228  	})
   229  	d.mu.nextJobID = 1
   230  	d.mu.mem.nextSize = opts.MemTableSize
   231  	if d.mu.mem.nextSize > initialMemTableSize {
   232  		d.mu.mem.nextSize = initialMemTableSize
   233  	}
   234  	d.mu.compact.cond.L = &d.mu.Mutex
   235  	d.mu.compact.inProgress = make(map[*compaction]struct{})
   236  	d.mu.compact.noOngoingFlushStartTime = time.Now()
   237  	d.mu.snapshots.init()
   238  	// logSeqNum is the next sequence number that will be assigned.
   239  	// Start assigning sequence numbers from base.SeqNumStart to leave
   240  	// room for reserved sequence numbers (see comments around
   241  	// SeqNumStart).
   242  	d.mu.versions.logSeqNum.Store(base.SeqNumStart)
   243  	d.mu.formatVers.vers.Store(uint64(formatVersion))
   244  	d.mu.formatVers.marker = formatVersionMarker
   245  
   246  	d.timeNow = time.Now
   247  	d.openedAt = d.timeNow()
   248  
   249  	d.mu.Lock()
   250  	defer d.mu.Unlock()
   251  
   252  	jobID := d.mu.nextJobID
   253  	d.mu.nextJobID++
   254  
   255  	setCurrent := setCurrentFunc(d.FormatMajorVersion(), manifestMarker, opts.FS, dirname, d.dataDir)
   256  
   257  	if !manifestExists {
   258  		// DB does not exist.
   259  		if d.opts.ErrorIfNotExists || d.opts.ReadOnly {
   260  			return nil, errors.Wrapf(ErrDBDoesNotExist, "dirname=%q", dirname)
   261  		}
   262  
   263  		// Create the DB.
   264  		if err := d.mu.versions.create(jobID, dirname, opts, manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil {
   265  			return nil, err
   266  		}
   267  	} else {
   268  		if opts.ErrorIfExists {
   269  			return nil, errors.Wrapf(ErrDBAlreadyExists, "dirname=%q", dirname)
   270  		}
   271  		// Load the version set.
   272  		if err := d.mu.versions.load(dirname, opts, manifestFileNum, manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil {
   273  			return nil, err
   274  		}
   275  		if opts.ErrorIfNotPristine {
   276  			liveFileNums := make(map[base.DiskFileNum]struct{})
   277  			d.mu.versions.addLiveFileNums(liveFileNums)
   278  			if len(liveFileNums) != 0 {
   279  				return nil, errors.Wrapf(ErrDBNotPristine, "dirname=%q", dirname)
   280  			}
   281  		}
   282  	}
   283  
   284  	// In read-only mode, we replay directly into the mutable memtable but never
   285  	// flush it. We need to delay creation of the memtable until we know the
   286  	// sequence number of the first batch that will be inserted.
   287  	if !d.opts.ReadOnly {
   288  		var entry *flushableEntry
   289  		d.mu.mem.mutable, entry = d.newMemTable(0 /* logNum */, d.mu.versions.logSeqNum.Load())
   290  		d.mu.mem.queue = append(d.mu.mem.queue, entry)
   291  	}
   292  
    293  	// List the contents of the WAL directory (and of the data directory, if distinct).
   294  	ls, err := opts.FS.List(d.walDirname)
   295  	if err != nil {
   296  		return nil, err
   297  	}
   298  	if d.dirname != d.walDirname {
   299  		ls2, err := opts.FS.List(d.dirname)
   300  		if err != nil {
   301  			return nil, err
   302  		}
   303  		ls = append(ls, ls2...)
   304  	}
   305  	providerSettings := objstorageprovider.Settings{
   306  		Logger:              opts.Logger,
   307  		FS:                  opts.FS,
   308  		FSDirName:           dirname,
   309  		FSDirInitialListing: ls,
   310  		FSCleaner:           opts.Cleaner,
   311  		NoSyncOnClose:       opts.NoSyncOnClose,
   312  		BytesPerSync:        opts.BytesPerSync,
   313  	}
   314  	providerSettings.Remote.StorageFactory = opts.Experimental.RemoteStorage
   315  	providerSettings.Remote.CreateOnShared = opts.Experimental.CreateOnShared
   316  	providerSettings.Remote.CreateOnSharedLocator = opts.Experimental.CreateOnSharedLocator
   317  	providerSettings.Remote.CacheSizeBytes = opts.Experimental.SecondaryCacheSizeBytes
   318  
   319  	d.objProvider, err = objstorageprovider.Open(providerSettings)
   320  	if err != nil {
   321  		return nil, err
   322  	}
   323  
   324  	d.cleanupManager = openCleanupManager(opts, d.objProvider, d.onObsoleteTableDelete, d.getDeletionPacerInfo)
   325  
   326  	if manifestExists {
   327  		curVersion := d.mu.versions.currentVersion()
   328  		if err := checkConsistency(curVersion, dirname, d.objProvider); err != nil {
   329  			return nil, err
   330  		}
   331  	}
   332  
   333  	tableCacheSize := TableCacheSize(opts.MaxOpenFiles)
   334  	d.tableCache = newTableCacheContainer(
   335  		opts.TableCache, d.cacheID, d.objProvider, d.opts, tableCacheSize,
   336  		&sstable.CategoryStatsCollector{})
   337  	d.newIters = d.tableCache.newIters
   338  	d.tableNewRangeKeyIter = d.tableCache.newRangeKeyIter
   339  
    340  	// Replay any log files newer than the ones named in the manifest.
   341  	type fileNumAndName struct {
   342  		num  base.DiskFileNum
   343  		name string
   344  	}
   345  	var logFiles []fileNumAndName
   346  	var previousOptionsFileNum FileNum
   347  	var previousOptionsFilename string
   348  	for _, filename := range ls {
   349  		ft, fn, ok := base.ParseFilename(opts.FS, filename)
   350  		if !ok {
   351  			continue
   352  		}
   353  
   354  		// Don't reuse any obsolete file numbers to avoid modifying an
   355  		// ingested sstable's original external file.
   356  		if d.mu.versions.nextFileNum <= uint64(fn.FileNum()) {
   357  			d.mu.versions.nextFileNum = uint64(fn.FileNum()) + 1
   358  		}
   359  
   360  		switch ft {
   361  		case fileTypeLog:
   362  			if fn >= d.mu.versions.minUnflushedLogNum {
   363  				logFiles = append(logFiles, fileNumAndName{fn, filename})
   364  			}
   365  			if d.logRecycler.minRecycleLogNum <= fn.FileNum() {
   366  				d.logRecycler.minRecycleLogNum = fn.FileNum() + 1
   367  			}
   368  		case fileTypeOptions:
   369  			if previousOptionsFileNum < fn.FileNum() {
   370  				previousOptionsFileNum = fn.FileNum()
   371  				previousOptionsFilename = filename
   372  			}
   373  		case fileTypeTemp, fileTypeOldTemp:
   374  			if !d.opts.ReadOnly {
   375  				// Some codepaths write to a temporary file and then
   376  				// rename it to its final location when complete.  A
   377  				// temp file is leftover if a process exits before the
   378  				// rename.  Remove it.
   379  				err := opts.FS.Remove(opts.FS.PathJoin(dirname, filename))
   380  				if err != nil {
   381  					return nil, err
   382  				}
   383  			}
   384  		}
   385  	}
   386  
   387  	// Ratchet d.mu.versions.nextFileNum ahead of all known objects in the
   388  	// objProvider. This avoids FileNum collisions with obsolete sstables.
   389  	objects := d.objProvider.List()
   390  	for _, obj := range objects {
   391  		if d.mu.versions.nextFileNum <= uint64(obj.DiskFileNum) {
   392  			d.mu.versions.nextFileNum = uint64(obj.DiskFileNum) + 1
   393  		}
   394  	}
   395  
   396  	// Validate the most-recent OPTIONS file, if there is one.
   397  	var strictWALTail bool
   398  	if previousOptionsFilename != "" {
   399  		path := opts.FS.PathJoin(dirname, previousOptionsFilename)
   400  		strictWALTail, err = checkOptions(opts, path)
   401  		if err != nil {
   402  			return nil, err
   403  		}
   404  	}
   405  
   406  	slices.SortFunc(logFiles, func(a, b fileNumAndName) int {
   407  		return cmp.Compare(a.num, b.num)
   408  	})
   409  
   410  	var ve versionEdit
   411  	var toFlush flushableList
   412  	for i, lf := range logFiles {
   413  		lastWAL := i == len(logFiles)-1
   414  		flush, maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS,
   415  			opts.FS.PathJoin(d.walDirname, lf.name), lf.num, strictWALTail && !lastWAL)
   416  		if err != nil {
   417  			return nil, err
   418  		}
   419  		toFlush = append(toFlush, flush...)
   420  		d.mu.versions.markFileNumUsed(lf.num)
   421  		if d.mu.versions.logSeqNum.Load() < maxSeqNum {
   422  			d.mu.versions.logSeqNum.Store(maxSeqNum)
   423  		}
   424  	}
   425  	d.mu.versions.visibleSeqNum.Store(d.mu.versions.logSeqNum.Load())
   426  
   427  	if !d.opts.ReadOnly {
   428  		// Create an empty .log file.
   429  		newLogNum := d.mu.versions.getNextDiskFileNum()
   430  
   431  		// This logic is slightly different than RocksDB's. Specifically, RocksDB
   432  		// sets MinUnflushedLogNum to max-recovered-log-num + 1. We set it to the
   433  		// newLogNum. There should be no difference in using either value.
   434  		ve.MinUnflushedLogNum = newLogNum
   435  
   436  		// Create the manifest with the updated MinUnflushedLogNum before
   437  		// creating the new log file. If we created the log file first, a
   438  		// crash before the manifest is synced could leave two WALs with
   439  		// unclean tails.
   440  		d.mu.versions.logLock()
   441  		if err := d.mu.versions.logAndApply(jobID, &ve, newFileMetrics(ve.NewFiles), false /* forceRotation */, func() []compactionInfo {
   442  			return nil
   443  		}); err != nil {
   444  			return nil, err
   445  		}
   446  
   447  		for _, entry := range toFlush {
   448  			entry.readerUnrefLocked(true)
   449  		}
   450  
   451  		newLogName := base.MakeFilepath(opts.FS, d.walDirname, fileTypeLog, newLogNum)
   452  		d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum, fileSize: 0})
   453  		logFile, err := opts.FS.Create(newLogName)
   454  		if err != nil {
   455  			return nil, err
   456  		}
   457  		if err := d.walDir.Sync(); err != nil {
   458  			return nil, err
   459  		}
   460  		d.opts.EventListener.WALCreated(WALCreateInfo{
   461  			JobID:   jobID,
   462  			Path:    newLogName,
   463  			FileNum: newLogNum,
   464  		})
   465  		// This isn't strictly necessary as we don't use the log number for
   466  		// memtables being flushed, only for the next unflushed memtable.
   467  		d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum = newLogNum
   468  
   469  		logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{
   470  			NoSyncOnClose:   d.opts.NoSyncOnClose,
   471  			BytesPerSync:    d.opts.WALBytesPerSync,
   472  			PreallocateSize: d.walPreallocateSize(),
   473  		})
   474  		d.mu.log.metrics.fsyncLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
   475  			Buckets: FsyncLatencyBuckets,
   476  		})
   477  
   478  		logWriterConfig := record.LogWriterConfig{
   479  			WALMinSyncInterval: d.opts.WALMinSyncInterval,
   480  			WALFsyncLatency:    d.mu.log.metrics.fsyncLatency,
   481  			QueueSemChan:       d.commit.logSyncQSem,
   482  		}
   483  		d.mu.log.LogWriter = record.NewLogWriter(logFile, newLogNum, logWriterConfig)
   484  		d.mu.versions.metrics.WAL.Files++
   485  	}
   486  	d.updateReadStateLocked(d.opts.DebugCheck)
   487  
   488  	// If the Options specify a format major version higher than the
   489  	// loaded database's, upgrade it. If this is a new database, this
   490  	// code path also performs an initial upgrade from the starting
   491  	// implicit MostCompatible version.
   492  	//
   493  	// We ratchet the version this far into Open so that migrations have a read
   494  	// state available.
   495  	if !d.opts.ReadOnly && opts.FormatMajorVersion > d.FormatMajorVersion() {
   496  		if err := d.ratchetFormatMajorVersionLocked(opts.FormatMajorVersion); err != nil {
   497  			return nil, err
   498  		}
   499  	}
   500  
   501  	if !d.opts.ReadOnly {
   502  		// Write the current options to disk.
   503  		d.optionsFileNum = d.mu.versions.getNextDiskFileNum()
   504  		tmpPath := base.MakeFilepath(opts.FS, dirname, fileTypeTemp, d.optionsFileNum)
   505  		optionsPath := base.MakeFilepath(opts.FS, dirname, fileTypeOptions, d.optionsFileNum)
   506  
   507  		// Write them to a temporary file first, in case we crash before
   508  		// we're done. A corrupt options file prevents opening the
   509  		// database.
   510  		optionsFile, err := opts.FS.Create(tmpPath)
   511  		if err != nil {
   512  			return nil, err
   513  		}
   514  		serializedOpts := []byte(opts.String())
   515  		if _, err := optionsFile.Write(serializedOpts); err != nil {
   516  			return nil, errors.CombineErrors(err, optionsFile.Close())
   517  		}
   518  		d.optionsFileSize = uint64(len(serializedOpts))
   519  		if err := optionsFile.Sync(); err != nil {
   520  			return nil, errors.CombineErrors(err, optionsFile.Close())
   521  		}
   522  		if err := optionsFile.Close(); err != nil {
   523  			return nil, err
   524  		}
   525  		// Atomically rename to the OPTIONS-XXXXXX path. This rename is
   526  		// guaranteed to be atomic because the destination path does not
   527  		// exist.
   528  		if err := opts.FS.Rename(tmpPath, optionsPath); err != nil {
   529  			return nil, err
   530  		}
   531  		if err := d.dataDir.Sync(); err != nil {
   532  			return nil, err
   533  		}
   534  	}
   535  
   536  	if !d.opts.ReadOnly {
   537  		d.scanObsoleteFiles(ls)
   538  		d.deleteObsoleteFiles(jobID)
   539  	} else {
   540  		// All the log files are obsolete.
   541  		d.mu.versions.metrics.WAL.Files = int64(len(logFiles))
   542  	}
   543  	d.mu.tableStats.cond.L = &d.mu.Mutex
   544  	d.mu.tableValidation.cond.L = &d.mu.Mutex
   545  	if !d.opts.ReadOnly {
   546  		d.maybeCollectTableStatsLocked()
   547  	}
   548  	d.calculateDiskAvailableBytes()
   549  
   550  	d.maybeScheduleFlush()
   551  	d.maybeScheduleCompaction()
   552  
   553  	// Note: this is a no-op if invariants are disabled or race is enabled.
   554  	//
   555  	// Setting a finalizer on *DB causes *DB to never be reclaimed and the
   556  	// finalizer to never be run. The problem is due to this limitation of
    557  	// finalizers mentioned in the SetFinalizer docs:
   558  	//
   559  	//   If a cyclic structure includes a block with a finalizer, that cycle is
   560  	//   not guaranteed to be garbage collected and the finalizer is not
   561  	//   guaranteed to run, because there is no ordering that respects the
   562  	//   dependencies.
   563  	//
   564  	// DB has cycles with several of its internal structures: readState,
    565  	// newIters, tableCache, versions, etc. Each of these individually causes a
    566  	// cycle and prevents the finalizer from being run. But we can work around this
    567  	// finalizer limitation by setting a finalizer on another object that is
   568  	// tied to the lifetime of DB: the DB.closed atomic.Value.
   569  	dPtr := fmt.Sprintf("%p", d)
   570  	invariants.SetFinalizer(d.closed, func(obj interface{}) {
   571  		v := obj.(*atomic.Value)
   572  		if err := v.Load(); err == nil {
   573  			fmt.Fprintf(os.Stderr, "%s: unreferenced DB not closed\n", dPtr)
   574  			os.Exit(1)
   575  		}
   576  	})
   577  
   578  	return d, nil
   579  }
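
        // A minimal caller-side sketch of Open (hypothetical path and options; the
        // Set call is only there to show the returned DB is ready for use):
        //
        //	db, err := pebble.Open("demo-db", &pebble.Options{})
        //	if err != nil {
        //		log.Fatal(err)
        //	}
        //	defer db.Close()
        //	if err := db.Set([]byte("hello"), []byte("world"), pebble.Sync); err != nil {
        //		log.Fatal(err)
        //	}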
   580  
   581  // prepareAndOpenDirs opens the directories for the store (and creates them if
   582  // necessary).
   583  //
   584  // Returns an error if ReadOnly is set and the directories don't exist.
   585  func prepareAndOpenDirs(
   586  	dirname string, opts *Options,
   587  ) (walDirname string, dataDir vfs.File, walDir vfs.File, err error) {
   588  	walDirname = opts.WALDir
   589  	if opts.WALDir == "" {
   590  		walDirname = dirname
   591  	}
   592  
   593  	// Create directories if needed.
   594  	if !opts.ReadOnly {
   595  		if err := opts.FS.MkdirAll(dirname, 0755); err != nil {
   596  			return "", nil, nil, err
   597  		}
   598  		if walDirname != dirname {
   599  			if err := opts.FS.MkdirAll(walDirname, 0755); err != nil {
   600  				return "", nil, nil, err
   601  			}
   602  		}
   603  	}
   604  
   605  	dataDir, err = opts.FS.OpenDir(dirname)
   606  	if err != nil {
   607  		if opts.ReadOnly && oserror.IsNotExist(err) {
   608  			return "", nil, nil, errors.Errorf("pebble: database %q does not exist", dirname)
   609  		}
   610  		return "", nil, nil, err
   611  	}
   612  
   613  	if walDirname == dirname {
   614  		walDir = dataDir
   615  	} else {
   616  		walDir, err = opts.FS.OpenDir(walDirname)
   617  		if err != nil {
   618  			dataDir.Close()
   619  			return "", nil, nil, err
   620  		}
   621  	}
   622  	return walDirname, dataDir, walDir, nil
   623  }
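
        // A caller-side sketch of placing the WAL in a separate directory
        // (hypothetical paths; the same WALDir should be supplied on subsequent
        // opens so the existing log files are found):
        //
        //	opts := &pebble.Options{WALDir: "/mnt/fast/demo-wal"}
        //	db, err := pebble.Open("/mnt/big/demo-db", opts)
        //	if err != nil {
        //		log.Fatal(err)
        //	}
        //	defer db.Close()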
   624  
   625  // GetVersion returns the engine version string from the latest options
   626  // file present in dir. Used to check what Pebble or RocksDB version was last
   627  // used to write to the database stored in this directory. An empty string is
   628  // returned if no valid OPTIONS file with a version key was found.
   629  func GetVersion(dir string, fs vfs.FS) (string, error) {
   630  	ls, err := fs.List(dir)
   631  	if err != nil {
   632  		return "", err
   633  	}
   634  	var version string
   635  	lastOptionsSeen := FileNum(0)
   636  	for _, filename := range ls {
   637  		ft, fn, ok := base.ParseFilename(fs, filename)
   638  		if !ok {
   639  			continue
   640  		}
   641  		switch ft {
   642  		case fileTypeOptions:
   643  			// If this file has a higher number than the last options file
   644  			// processed, reset version. This is because rocksdb often
   645  			// writes multiple options files without deleting previous ones.
   646  			// Otherwise, skip parsing this options file.
   647  			if fn.FileNum() > lastOptionsSeen {
   648  				version = ""
   649  				lastOptionsSeen = fn.FileNum()
   650  			} else {
   651  				continue
   652  			}
   653  			f, err := fs.Open(fs.PathJoin(dir, filename))
   654  			if err != nil {
   655  				return "", err
   656  			}
   657  			data, err := io.ReadAll(f)
   658  			f.Close()
   659  
   660  			if err != nil {
   661  				return "", err
   662  			}
   663  			err = parseOptions(string(data), func(section, key, value string) error {
   664  				switch {
   665  				case section == "Version":
   666  					switch key {
   667  					case "pebble_version":
   668  						version = value
   669  					case "rocksdb_version":
   670  						version = fmt.Sprintf("rocksdb v%s", value)
   671  					}
   672  				}
   673  				return nil
   674  			})
   675  			if err != nil {
   676  				return "", err
   677  			}
   678  		}
   679  	}
   680  	return version, nil
   681  }
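
        // Caller-side sketch (hypothetical directory; vfs.Default is the ordinary
        // filesystem):
        //
        //	version, err := pebble.GetVersion("demo-db", vfs.Default)
        //	if err != nil {
        //		log.Fatal(err)
        //	}
        //	if version == "" {
        //		// No OPTIONS file with a version key was found.
        //	}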
   682  
   683  // replayWAL replays the edits in the specified log file. If the DB is in
    684  // read-only mode, then the WALs are replayed into memtables and not flushed. If
    685  // the DB is not in read-only mode, then the contents of the WAL are guaranteed
   686  // to be flushed.
   687  //
   688  // The toFlush return value is a list of flushables associated with the WAL
   689  // being replayed which will be flushed. Once the version edit has been applied
   690  // to the manifest, it is up to the caller of replayWAL to unreference the
   691  // toFlush flushables returned by replayWAL.
   692  //
   693  // d.mu must be held when calling this, but the mutex may be dropped and
   694  // re-acquired during the course of this method.
   695  func (d *DB) replayWAL(
   696  	jobID int,
   697  	ve *versionEdit,
   698  	fs vfs.FS,
   699  	filename string,
   700  	logNum base.DiskFileNum,
   701  	strictWALTail bool,
   702  ) (toFlush flushableList, maxSeqNum uint64, err error) {
   703  	file, err := fs.Open(filename)
   704  	if err != nil {
   705  		return nil, 0, err
   706  	}
   707  	defer file.Close()
   708  	var (
   709  		b               Batch
   710  		buf             bytes.Buffer
   711  		mem             *memTable
   712  		entry           *flushableEntry
   713  		rr              = record.NewReader(file, logNum)
   714  		offset          int64 // byte offset in rr
   715  		lastFlushOffset int64
   716  		keysReplayed    int64 // number of keys replayed
   717  		batchesReplayed int64 // number of batches replayed
   718  	)
   719  
   720  	// TODO(jackson): This function is interspersed with panics, in addition to
   721  	// corruption error propagation. Audit them to ensure we're truly only
   722  	// panicking where the error points to Pebble bug and not user or
   723  	// hardware-induced corruption.
   724  
   725  	if d.opts.ReadOnly {
   726  		// In read-only mode, we replay directly into the mutable memtable which will
   727  		// never be flushed.
   728  		mem = d.mu.mem.mutable
   729  		if mem != nil {
   730  			entry = d.mu.mem.queue[len(d.mu.mem.queue)-1]
   731  		}
   732  	}
   733  
   734  	// Flushes the current memtable, if not nil.
   735  	flushMem := func() {
   736  		if mem == nil {
   737  			return
   738  		}
   739  		var logSize uint64
   740  		if offset >= lastFlushOffset {
   741  			logSize = uint64(offset - lastFlushOffset)
   742  		}
   743  		// Else, this was the initial memtable in the read-only case which must have
   744  		// been empty, but we need to flush it since we don't want to add to it later.
   745  		lastFlushOffset = offset
   746  		entry.logSize = logSize
   747  		if !d.opts.ReadOnly {
   748  			toFlush = append(toFlush, entry)
   749  		}
   750  		mem, entry = nil, nil
   751  	}
   752  	// Creates a new memtable if there is no current memtable.
   753  	ensureMem := func(seqNum uint64) {
   754  		if mem != nil {
   755  			return
   756  		}
   757  		mem, entry = d.newMemTable(logNum, seqNum)
   758  		if d.opts.ReadOnly {
   759  			d.mu.mem.mutable = mem
   760  			d.mu.mem.queue = append(d.mu.mem.queue, entry)
   761  		}
   762  	}
   763  
   764  	// updateVE is used to update ve with information about new files created
   765  	// during the flush of any flushable not of type ingestedFlushable. For the
   766  	// flushable of type ingestedFlushable we use custom handling below.
   767  	updateVE := func() error {
   768  		// TODO(bananabrick): See if we can use the actual base level here,
   769  		// instead of using 1.
   770  		c := newFlush(d.opts, d.mu.versions.currentVersion(),
   771  			1 /* base level */, toFlush, d.timeNow())
   772  		newVE, _, _, err := d.runCompaction(jobID, c)
   773  		if err != nil {
   774  			return errors.Wrapf(err, "running compaction during WAL replay")
   775  		}
   776  		ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...)
   777  		return nil
   778  	}
   779  	defer func() {
   780  		if err != nil {
   781  			err = errors.WithDetailf(err, "replaying log %s, offset %d", logNum, offset)
   782  		}
   783  	}()
   784  
   785  	for {
   786  		offset = rr.Offset()
   787  		r, err := rr.Next()
   788  		if err == nil {
   789  			_, err = io.Copy(&buf, r)
   790  		}
   791  		if err != nil {
   792  			// It is common to encounter a zeroed or invalid chunk due to WAL
   793  			// preallocation and WAL recycling. We need to distinguish these
   794  			// errors from EOF in order to recognize that the record was
   795  			// truncated and to avoid replaying subsequent WALs, but want
   796  			// to otherwise treat them like EOF.
   797  			if err == io.EOF {
   798  				break
   799  			} else if record.IsInvalidRecord(err) && !strictWALTail {
   800  				break
   801  			}
   802  			return nil, 0, errors.Wrap(err, "pebble: error when replaying WAL")
   803  		}
   804  
   805  		if buf.Len() < batchHeaderLen {
   806  			return nil, 0, base.CorruptionErrorf("pebble: corrupt log file %q (num %s)",
   807  				filename, errors.Safe(logNum))
   808  		}
   809  
   810  		if d.opts.ErrorIfNotPristine {
   811  			return nil, 0, errors.WithDetailf(ErrDBNotPristine, "location: %q", d.dirname)
   812  		}
   813  
   814  		// Specify Batch.db so that Batch.SetRepr will compute Batch.memTableSize
   815  		// which is used below.
   816  		b = Batch{}
   817  		b.db = d
   818  		b.SetRepr(buf.Bytes())
   819  		seqNum := b.SeqNum()
   820  		maxSeqNum = seqNum + uint64(b.Count())
   821  		keysReplayed += int64(b.Count())
   822  		batchesReplayed++
   823  		{
   824  			br := b.Reader()
   825  			if kind, encodedFileNum, _, ok, err := br.Next(); err != nil {
   826  				return nil, 0, err
   827  			} else if ok && kind == InternalKeyKindIngestSST {
   828  				fileNums := make([]base.DiskFileNum, 0, b.Count())
   829  				addFileNum := func(encodedFileNum []byte) {
   830  					fileNum, n := binary.Uvarint(encodedFileNum)
   831  					if n <= 0 {
   832  						panic("pebble: ingest sstable file num is invalid.")
   833  					}
   834  					fileNums = append(fileNums, base.FileNum(fileNum).DiskFileNum())
   835  				}
   836  				addFileNum(encodedFileNum)
   837  
   838  				for i := 1; i < int(b.Count()); i++ {
   839  					kind, encodedFileNum, _, ok, err := br.Next()
   840  					if err != nil {
   841  						return nil, 0, err
   842  					}
   843  					if kind != InternalKeyKindIngestSST {
   844  						panic("pebble: invalid batch key kind.")
   845  					}
   846  					if !ok {
   847  						panic("pebble: invalid batch count.")
   848  					}
   849  					addFileNum(encodedFileNum)
   850  				}
   851  
   852  				if _, _, _, ok, err := br.Next(); err != nil {
   853  					return nil, 0, err
   854  				} else if ok {
   855  					panic("pebble: invalid number of entries in batch.")
   856  				}
   857  
   858  				meta := make([]*fileMetadata, len(fileNums))
   859  				for i, n := range fileNums {
   860  					var readable objstorage.Readable
   861  					objMeta, err := d.objProvider.Lookup(fileTypeTable, n)
   862  					if err != nil {
   863  						return nil, 0, errors.Wrap(err, "pebble: error when looking up ingested SSTs")
   864  					}
   865  					if objMeta.IsRemote() {
   866  						readable, err = d.objProvider.OpenForReading(context.TODO(), fileTypeTable, n, objstorage.OpenOptions{MustExist: true})
   867  						if err != nil {
   868  							return nil, 0, errors.Wrap(err, "pebble: error when opening flushable ingest files")
   869  						}
   870  					} else {
   871  						path := base.MakeFilepath(d.opts.FS, d.dirname, fileTypeTable, n)
   872  						f, err := d.opts.FS.Open(path)
   873  						if err != nil {
   874  							return nil, 0, err
   875  						}
   876  
   877  						readable, err = sstable.NewSimpleReadable(f)
   878  						if err != nil {
   879  							return nil, 0, err
   880  						}
   881  					}
   882  					// NB: ingestLoad1 will close readable.
   883  					meta[i], err = ingestLoad1(d.opts, d.FormatMajorVersion(), readable, d.cacheID, n)
   884  					if err != nil {
   885  						return nil, 0, errors.Wrap(err, "pebble: error when loading flushable ingest files")
   886  					}
   887  				}
   888  
   889  				if uint32(len(meta)) != b.Count() {
   890  					panic("pebble: couldn't load all files in WAL entry.")
   891  				}
   892  
   893  				entry, err = d.newIngestedFlushableEntry(
   894  					meta, seqNum, logNum,
   895  				)
   896  				if err != nil {
   897  					return nil, 0, err
   898  				}
   899  
   900  				if d.opts.ReadOnly {
   901  					d.mu.mem.queue = append(d.mu.mem.queue, entry)
   902  					// We added the IngestSST flushable to the queue. But there
   903  					// must be at least one WAL entry waiting to be replayed. We
   904  					// have to ensure this newer WAL entry isn't replayed into
   905  					// the current value of d.mu.mem.mutable because the current
   906  					// mutable memtable exists before this flushable entry in
   907  					// the memtable queue. To ensure this, we just need to unset
   908  					// d.mu.mem.mutable. When a newer WAL is replayed, we will
   909  					// set d.mu.mem.mutable to a newer value.
   910  					d.mu.mem.mutable = nil
   911  				} else {
   912  					toFlush = append(toFlush, entry)
    913  					// During WAL replay, the LSM only has L0, hence the
    914  					// baseLevel is 1. For simplicity, we place the
    915  					// ingested files in L0 here instead of finding
    916  					// their target levels. WAL replay is expected to
    917  					// be rare, and flushables of type ingestedFlushable
    918  					// should be rarer still, so placing the ingested
    919  					// files in L0, rather than in lower levels, is
    920  					// acceptable.
   921  					//
   922  					// TODO(bananabrick): Maybe refactor this function to allow
   923  					// us to easily place ingested files in levels as low as
   924  					// possible during WAL replay. It would require breaking up
   925  					// the application of ve to the manifest into chunks and is
   926  					// not pretty w/o a refactor to this function and how it's
   927  					// used.
   928  					c := newFlush(
   929  						d.opts, d.mu.versions.currentVersion(),
   930  						1, /* base level */
   931  						[]*flushableEntry{entry},
   932  						d.timeNow(),
   933  					)
   934  					for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files {
   935  						ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: 0, Meta: file.FileMetadata})
   936  					}
   937  				}
   938  				return toFlush, maxSeqNum, nil
   939  			}
   940  		}
   941  
   942  		if b.memTableSize >= uint64(d.largeBatchThreshold) {
   943  			flushMem()
   944  			// Make a copy of the data slice since it is currently owned by buf and will
   945  			// be reused in the next iteration.
   946  			b.data = slices.Clone(b.data)
   947  			b.flushable, err = newFlushableBatch(&b, d.opts.Comparer)
   948  			if err != nil {
   949  				return nil, 0, err
   950  			}
   951  			entry := d.newFlushableEntry(b.flushable, logNum, b.SeqNum())
   952  			// Disable memory accounting by adding a reader ref that will never be
   953  			// removed.
   954  			entry.readerRefs.Add(1)
   955  			if d.opts.ReadOnly {
   956  				d.mu.mem.queue = append(d.mu.mem.queue, entry)
    957  				// We added the flushable batch to the flushable queue.
   958  				// But there must be at least one WAL entry waiting to be
   959  				// replayed. We have to ensure this newer WAL entry isn't
   960  				// replayed into the current value of d.mu.mem.mutable because
   961  				// the current mutable memtable exists before this flushable
   962  				// entry in the memtable queue. To ensure this, we just need to
   963  				// unset d.mu.mem.mutable. When a newer WAL is replayed, we will
   964  				// set d.mu.mem.mutable to a newer value.
   965  				d.mu.mem.mutable = nil
   966  			} else {
   967  				toFlush = append(toFlush, entry)
   968  			}
   969  		} else {
   970  			ensureMem(seqNum)
   971  			if err = mem.prepare(&b); err != nil && err != arenaskl.ErrArenaFull {
   972  				return nil, 0, err
   973  			}
   974  			// We loop since DB.newMemTable() slowly grows the size of allocated memtables, so the
   975  			// batch may not initially fit, but will eventually fit (since it is smaller than
   976  			// largeBatchThreshold).
   977  			for err == arenaskl.ErrArenaFull {
   978  				flushMem()
   979  				ensureMem(seqNum)
   980  				err = mem.prepare(&b)
   981  				if err != nil && err != arenaskl.ErrArenaFull {
   982  					return nil, 0, err
   983  				}
   984  			}
   985  			if err = mem.apply(&b, seqNum); err != nil {
   986  				return nil, 0, err
   987  			}
   988  			mem.writerUnref()
   989  		}
   990  		buf.Reset()
   991  	}
   992  
   993  	d.opts.Logger.Infof("[JOB %d] WAL file %s with log number %s stopped reading at offset: %d; replayed %d keys in %d batches", jobID, filename, logNum.String(), offset, keysReplayed, batchesReplayed)
   994  	flushMem()
   995  
   996  	// mem is nil here.
   997  	if !d.opts.ReadOnly {
   998  		err = updateVE()
   999  		if err != nil {
  1000  			return nil, 0, err
  1001  		}
  1002  	}
  1003  	return toFlush, maxSeqNum, err
  1004  }
  1005  
  1006  func checkOptions(opts *Options, path string) (strictWALTail bool, err error) {
  1007  	f, err := opts.FS.Open(path)
  1008  	if err != nil {
  1009  		return false, err
  1010  	}
  1011  	defer f.Close()
  1012  
  1013  	data, err := io.ReadAll(f)
  1014  	if err != nil {
  1015  		return false, err
  1016  	}
  1017  	return opts.checkOptions(string(data))
  1018  }
  1019  
  1020  // DBDesc briefly describes high-level state about a database.
  1021  type DBDesc struct {
  1022  	// Exists is true if an existing database was found.
  1023  	Exists bool
  1024  	// FormatMajorVersion indicates the database's current format
  1025  	// version.
  1026  	FormatMajorVersion FormatMajorVersion
  1027  	// ManifestFilename is the filename of the current active manifest,
  1028  	// if the database exists.
  1029  	ManifestFilename string
  1030  }
  1031  
  1032  // Peek looks for an existing database in dirname on the provided FS. It
  1033  // returns a brief description of the database. Peek is read-only and
  1034  // does not open the database.
  1035  func Peek(dirname string, fs vfs.FS) (*DBDesc, error) {
  1036  	vers, versMarker, err := lookupFormatMajorVersion(fs, dirname)
  1037  	if err != nil {
  1038  		return nil, err
  1039  	}
  1040  	// TODO(jackson): Immediately closing the marker is clunky. Add a
  1041  	// PeekMarker variant that avoids opening the directory.
  1042  	if err := versMarker.Close(); err != nil {
  1043  		return nil, err
  1044  	}
  1045  
  1046  	// Find the currently active manifest, if there is one.
  1047  	manifestMarker, manifestFileNum, exists, err := findCurrentManifest(vers, fs, dirname)
  1048  	if err != nil {
  1049  		return nil, err
  1050  	}
  1051  	// TODO(jackson): Immediately closing the marker is clunky. Add a
  1052  	// PeekMarker variant that avoids opening the directory.
  1053  	if err := manifestMarker.Close(); err != nil {
  1054  		return nil, err
  1055  	}
  1056  
  1057  	desc := &DBDesc{
  1058  		Exists:             exists,
  1059  		FormatMajorVersion: vers,
  1060  	}
  1061  	if exists {
  1062  		desc.ManifestFilename = base.MakeFilepath(fs, dirname, fileTypeManifest, manifestFileNum)
  1063  	}
  1064  	return desc, nil
  1065  }
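
        // Caller-side sketch (hypothetical directory):
        //
        //	desc, err := pebble.Peek("demo-db", vfs.Default)
        //	if err != nil {
        //		log.Fatal(err)
        //	}
        //	if desc.Exists {
        //		fmt.Printf("format %v, manifest %s\n", desc.FormatMajorVersion, desc.ManifestFilename)
        //	}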
  1066  
  1067  // LockDirectory acquires the database directory lock in the named directory,
  1068  // preventing another process from opening the database. LockDirectory returns a
  1069  // handle to the held lock that may be passed to Open through Options.Lock to
  1070  // subsequently open the database, skipping lock acquisition during Open.
  1071  //
  1072  // LockDirectory may be used to expand the critical section protected by the
  1073  // database lock to include setup before the call to Open.
  1074  func LockDirectory(dirname string, fs vfs.FS) (*Lock, error) {
  1075  	fileLock, err := fs.Lock(base.MakeFilepath(fs, dirname, fileTypeLock, base.FileNum(0).DiskFileNum()))
  1076  	if err != nil {
  1077  		return nil, err
  1078  	}
  1079  	l := &Lock{dirname: dirname, fileLock: fileLock}
  1080  	l.refs.Store(1)
  1081  	invariants.SetFinalizer(l, func(obj interface{}) {
  1082  		if refs := obj.(*Lock).refs.Load(); refs > 0 {
  1083  			panic(errors.AssertionFailedf("lock for %q finalized with %d refs", dirname, refs))
  1084  		}
  1085  	})
  1086  	return l, nil
  1087  }
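
        // Caller-side sketch of widening the critical section (hypothetical path;
        // the Lock must be closed only after the DB that used it has been closed,
        // which the LIFO defer order below respects):
        //
        //	lock, err := pebble.LockDirectory("demo-db", vfs.Default)
        //	if err != nil {
        //		log.Fatal(err)
        //	}
        //	defer lock.Close()
        //	// ... setup that must happen while the directory lock is held ...
        //	db, err := pebble.Open("demo-db", &pebble.Options{Lock: lock})
        //	if err != nil {
        //		log.Fatal(err)
        //	}
        //	defer db.Close()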
  1088  
  1089  // Lock represents a file lock on a directory. It may be passed to Open through
  1090  // Options.Lock to elide lock acquisition during Open.
  1091  type Lock struct {
  1092  	dirname  string
  1093  	fileLock io.Closer
  1094  	// refs is a count of the number of handles on the lock. refs must be 0, 1
  1095  	// or 2.
  1096  	//
  1097  	// When acquired by the client and passed to Open, refs = 1 and the Open
  1098  	// call increments it to 2. When the database is closed, it's decremented to
  1099  	// 1. Finally, when the original caller calls Close on the Lock, it's
  1100  	// decremented to zero and the underlying file lock is released.
  1101  	//
  1102  	// When Open acquires the file lock, refs remains at 1 until the database is
  1103  	// closed.
  1104  	refs atomic.Int32
  1105  }
  1106  
  1107  func (l *Lock) refForOpen() error {
  1108  	// During Open, when a user passes in a lock, the reference count must be
  1109  	// exactly 1. If it's zero, the lock is no longer held and is invalid. If
  1110  	// it's 2, the lock is already in use by another database within the
  1111  	// process.
  1112  	if !l.refs.CompareAndSwap(1, 2) {
  1113  		return errors.Errorf("pebble: unexpected Lock reference count; is the lock already in use?")
  1114  	}
  1115  	return nil
  1116  }
  1117  
  1118  // Close releases the lock, permitting another process to lock and open the
  1119  // database. Close must not be called until after a database using the Lock has
  1120  // been closed.
  1121  func (l *Lock) Close() error {
  1122  	if l.refs.Add(-1) > 0 {
  1123  		return nil
  1124  	}
  1125  	defer func() { l.fileLock = nil }()
  1126  	return l.fileLock.Close()
  1127  }
  1128  
  1129  // ErrDBDoesNotExist is generated when ErrorIfNotExists is set and the database
  1130  // does not exist.
  1131  //
  1132  // Note that errors can be wrapped with more details; use errors.Is().
  1133  var ErrDBDoesNotExist = errors.New("pebble: database does not exist")
  1134  
  1135  // ErrDBAlreadyExists is generated when ErrorIfExists is set and the database
  1136  // already exists.
  1137  //
  1138  // Note that errors can be wrapped with more details; use errors.Is().
  1139  var ErrDBAlreadyExists = errors.New("pebble: database already exists")
  1140  
  1141  // ErrDBNotPristine is generated when ErrorIfNotPristine is set and the database
  1142  // already exists and is not pristine.
  1143  //
  1144  // Note that errors can be wrapped with more details; use errors.Is().
  1145  var ErrDBNotPristine = errors.New("pebble: database already exists and is not pristine")
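
        // As the notes above say, match these sentinels with errors.Is rather than
        // ==, since Open wraps them with extra detail (caller-side sketch;
        // hypothetical directory):
        //
        //	_, err := pebble.Open("demo-db", &pebble.Options{ErrorIfNotExists: true})
        //	if errors.Is(err, pebble.ErrDBDoesNotExist) {
        //		// No database exists at this path; create it or report a clearer error.
        //	}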
  1146  
  1147  // IsCorruptionError returns true if the given error indicates database
  1148  // corruption.
  1149  func IsCorruptionError(err error) bool {
  1150  	return errors.Is(err, base.ErrCorruption)
  1151  }
  1152  
  1153  func checkConsistency(v *manifest.Version, dirname string, objProvider objstorage.Provider) error {
  1154  	var errs []error
  1155  	dedup := make(map[base.DiskFileNum]struct{})
  1156  	for level, files := range v.Levels {
  1157  		iter := files.Iter()
  1158  		for f := iter.First(); f != nil; f = iter.Next() {
  1159  			backingState := f.FileBacking
  1160  			if _, ok := dedup[backingState.DiskFileNum]; ok {
  1161  				continue
  1162  			}
  1163  			dedup[backingState.DiskFileNum] = struct{}{}
  1164  			fileNum := backingState.DiskFileNum
  1165  			fileSize := backingState.Size
  1166  			// We skip over remote objects; those are instead checked asynchronously
  1167  			// by the table stats loading job.
  1168  			meta, err := objProvider.Lookup(base.FileTypeTable, fileNum)
  1169  			var size int64
  1170  			if err == nil {
  1171  				if meta.IsRemote() {
  1172  					continue
  1173  				}
  1174  				size, err = objProvider.Size(meta)
  1175  			}
  1176  			if err != nil {
  1177  				errs = append(errs, errors.Wrapf(err, "L%d: %s", errors.Safe(level), fileNum))
  1178  				continue
  1179  			}
  1180  
  1181  			if size != int64(fileSize) {
  1182  				errs = append(errs, errors.Errorf(
  1183  					"L%d: %s: object size mismatch (%s): %d (disk) != %d (MANIFEST)",
  1184  					errors.Safe(level), fileNum, objProvider.Path(meta),
  1185  					errors.Safe(size), errors.Safe(fileSize)))
  1186  				continue
  1187  			}
  1188  		}
  1189  	}
  1190  	return errors.Join(errs...)
  1191  }