github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/open.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"os"
	"sort"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/errors/oserror"
	"github.com/cockroachdb/pebble/internal/arenaskl"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/cache"
	"github.com/cockroachdb/pebble/internal/constants"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/internal/manual"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
	"github.com/cockroachdb/pebble/record"
	"github.com/cockroachdb/pebble/sstable"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/prometheus/client_golang/prometheus"
)

const (
	initialMemTableSize = 256 << 10 // 256 KB

	// The max batch size is limited by the uint32 offsets stored in
	// internal/batchskl.node, DeferredBatchOp, and flushableBatchEntry.
	//
	// We limit the size to MaxUint32 (just short of 4GB) so that the exclusive
	// end of an allocation fits in uint32.
	//
	// On 32-bit systems, slices are naturally limited to MaxInt (just short of
	// 2GB).
	maxBatchSize = constants.MaxUint32OrInt

	// The max memtable size is limited by the uint32 offsets stored in
	// internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry.
	//
	// We limit the size to MaxUint32 (just short of 4GB) so that the exclusive
	// end of an allocation fits in uint32.
	//
	// On 32-bit systems, slices are naturally limited to MaxInt (just short of
	// 2GB).
	maxMemTableSize = constants.MaxUint32OrInt
)

// TableCacheSize can be used to determine the table cache size for a single
// db, given the maximum number of open files that can be used by a table
// cache which is only used by a single db.
func TableCacheSize(maxOpenFiles int) int {
	tableCacheSize := maxOpenFiles - numNonTableCacheFiles
	if tableCacheSize < minTableCacheSize {
		tableCacheSize = minTableCacheSize
	}
	return tableCacheSize
}
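// The following is an illustrative sketch, not part of the original file. It
// shows how TableCacheSize relates an open-file budget (e.g. a value destined
// for Options.MaxOpenFiles) to the table cache size that Open computes below.
// The function name and the concrete number are assumptions for the example.
func exampleTableCacheSizing() {
	// With a per-DB budget of 1000 open files, the table cache gets whatever
	// remains after the non-table-cache file slots, with a floor of
	// minTableCacheSize.
	fmt.Println("table cache size:", TableCacheSize(1000))
}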
// Open opens a DB whose files live in the given directory.
func Open(dirname string, opts *Options) (db *DB, _ error) {
	// Make a copy of the options so that we don't mutate the passed in options.
	opts = opts.Clone()
	opts = opts.EnsureDefaults()
	if err := opts.Validate(); err != nil {
		return nil, err
	}
	if opts.LoggerAndTracer == nil {
		opts.LoggerAndTracer = &base.LoggerWithNoopTracer{Logger: opts.Logger}
	} else {
		opts.Logger = opts.LoggerAndTracer
	}

	// In all error cases, we return db = nil; this is used by various
	// deferred cleanups.

	// Open the database and WAL directories first.
	walDirname, dataDir, walDir, err := prepareAndOpenDirs(dirname, opts)
	if err != nil {
		return nil, errors.Wrapf(err, "error opening database at %q", dirname)
	}
	defer func() {
		if db == nil {
			if walDir != dataDir {
				walDir.Close()
			}
			dataDir.Close()
		}
	}()

	// Lock the database directory.
	var fileLock *Lock
	if opts.Lock != nil {
		// The caller already acquired the database lock. Ensure that the
		// directory matches.
		if dirname != opts.Lock.dirname {
			return nil, errors.Newf("pebble: opts.Lock acquired in %q not %q", opts.Lock.dirname, dirname)
		}
		if err := opts.Lock.refForOpen(); err != nil {
			return nil, err
		}
		fileLock = opts.Lock
	} else {
		fileLock, err = LockDirectory(dirname, opts.FS)
		if err != nil {
			return nil, err
		}
	}
	defer func() {
		if db == nil {
			fileLock.Close()
		}
	}()

	// Establish the format major version.
	formatVersion, formatVersionMarker, err := lookupFormatMajorVersion(opts.FS, dirname)
	if err != nil {
		return nil, err
	}
	defer func() {
		if db == nil {
			formatVersionMarker.Close()
		}
	}()

	// Find the currently active manifest, if there is one.
	manifestMarker, manifestFileNum, manifestExists, err := findCurrentManifest(formatVersion, opts.FS, dirname)
	if err != nil {
		return nil, errors.Wrapf(err, "pebble: database %q", dirname)
	}
	defer func() {
		if db == nil {
			manifestMarker.Close()
		}
	}()

	// Atomic markers may leave behind obsolete files if there's a crash
	// mid-update. Clean these up if we're not in read-only mode.
	if !opts.ReadOnly {
		if err := formatVersionMarker.RemoveObsolete(); err != nil {
			return nil, err
		}
		if err := manifestMarker.RemoveObsolete(); err != nil {
			return nil, err
		}
	}

	if opts.Cache == nil {
		opts.Cache = cache.New(cacheDefaultSize)
	} else {
		opts.Cache.Ref()
	}

	d := &DB{
		cacheID:             opts.Cache.NewID(),
		dirname:             dirname,
		walDirname:          walDirname,
		opts:                opts,
		cmp:                 opts.Comparer.Compare,
		equal:               opts.equal(),
		merge:               opts.Merger.Merge,
		split:               opts.Comparer.Split,
		abbreviatedKey:      opts.Comparer.AbbreviatedKey,
		largeBatchThreshold: (opts.MemTableSize - uint64(memTableEmptySize)) / 2,
		fileLock:            fileLock,
		dataDir:             dataDir,
		walDir:              walDir,
		logRecycler:         logRecycler{limit: opts.MemTableStopWritesThreshold + 1},
		closed:              new(atomic.Value),
		closedCh:            make(chan struct{}),
	}
	d.mu.versions = &versionSet{}
	d.diskAvailBytes.Store(math.MaxUint64)

	defer func() {
		// If an error or panic occurs during open, attempt to release the manually
		// allocated memory resources. Note that rather than look for an error, we
		// look for the return of a nil DB pointer.
		if r := recover(); db == nil {
			// Release our references to the Cache. Note that both the DB, and
			// tableCache have a reference. When we release the reference to
			// the tableCache, and if there are no other references to
			// the tableCache, then the tableCache will also release its
			// reference to the cache.
			opts.Cache.Unref()

			if d.tableCache != nil {
				_ = d.tableCache.close()
			}

			for _, mem := range d.mu.mem.queue {
				switch t := mem.flushable.(type) {
				case *memTable:
					manual.Free(t.arenaBuf)
					t.arenaBuf = nil
				}
			}
			if d.cleanupManager != nil {
				d.cleanupManager.Close()
			}
			if d.objProvider != nil {
				d.objProvider.Close()
			}
			if r != nil {
				panic(r)
			}
		}
	}()

	d.commit = newCommitPipeline(commitEnv{
		logSeqNum:     &d.mu.versions.logSeqNum,
		visibleSeqNum: &d.mu.versions.visibleSeqNum,
		apply:         d.commitApply,
		write:         d.commitWrite,
	})
	d.mu.nextJobID = 1
	d.mu.mem.nextSize = opts.MemTableSize
	if d.mu.mem.nextSize > initialMemTableSize {
		d.mu.mem.nextSize = initialMemTableSize
	}
	d.mu.compact.cond.L = &d.mu.Mutex
	d.mu.compact.inProgress = make(map[*compaction]struct{})
	d.mu.compact.noOngoingFlushStartTime = time.Now()
	d.mu.snapshots.init()
	// logSeqNum is the next sequence number that will be assigned.
	// Start assigning sequence numbers from base.SeqNumStart to leave
	// room for reserved sequence numbers (see comments around
	// SeqNumStart).
	d.mu.versions.logSeqNum.Store(base.SeqNumStart)
	d.mu.formatVers.vers.Store(uint64(formatVersion))
	d.mu.formatVers.marker = formatVersionMarker

	d.timeNow = time.Now
	d.openedAt = d.timeNow()

	d.mu.Lock()
	defer d.mu.Unlock()

	jobID := d.mu.nextJobID
	d.mu.nextJobID++

	setCurrent := setCurrentFunc(d.FormatMajorVersion(), manifestMarker, opts.FS, dirname, d.dataDir)

	if !manifestExists {
		// DB does not exist.
		if d.opts.ErrorIfNotExists || d.opts.ReadOnly {
			return nil, errors.Wrapf(ErrDBDoesNotExist, "dirname=%q", dirname)
		}

		// Create the DB.
		if err := d.mu.versions.create(jobID, dirname, opts, manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil {
			return nil, err
		}
	} else {
		if opts.ErrorIfExists {
			return nil, errors.Wrapf(ErrDBAlreadyExists, "dirname=%q", dirname)
		}
		// Load the version set.
		if err := d.mu.versions.load(dirname, opts, manifestFileNum.FileNum(), manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil {
			return nil, err
		}
		if opts.ErrorIfNotPristine {
			liveFileNums := make(map[base.DiskFileNum]struct{})
			d.mu.versions.addLiveFileNums(liveFileNums)
			if len(liveFileNums) != 0 {
				return nil, errors.Wrapf(ErrDBNotPristine, "dirname=%q", dirname)
			}
		}
	}

	// In read-only mode, we replay directly into the mutable memtable but never
	// flush it. We need to delay creation of the memtable until we know the
	// sequence number of the first batch that will be inserted.
	if !d.opts.ReadOnly {
		var entry *flushableEntry
		d.mu.mem.mutable, entry = d.newMemTable(0 /* logNum */, d.mu.versions.logSeqNum.Load())
		d.mu.mem.queue = append(d.mu.mem.queue, entry)
	}

	// List the objects
	ls, err := opts.FS.List(d.walDirname)
	if err != nil {
		return nil, err
	}
	if d.dirname != d.walDirname {
		ls2, err := opts.FS.List(d.dirname)
		if err != nil {
			return nil, err
		}
		ls = append(ls, ls2...)
	}
	providerSettings := objstorageprovider.Settings{
		Logger:              opts.Logger,
		FS:                  opts.FS,
		FSDirName:           dirname,
		FSDirInitialListing: ls,
		FSCleaner:           opts.Cleaner,
		NoSyncOnClose:       opts.NoSyncOnClose,
		BytesPerSync:        opts.BytesPerSync,
	}
	providerSettings.Remote.StorageFactory = opts.Experimental.RemoteStorage
	providerSettings.Remote.CreateOnShared = opts.Experimental.CreateOnShared
	providerSettings.Remote.CreateOnSharedLocator = opts.Experimental.CreateOnSharedLocator
	providerSettings.Remote.CacheSizeBytes = opts.Experimental.SecondaryCacheSizeBytes

	d.objProvider, err = objstorageprovider.Open(providerSettings)
	if err != nil {
		return nil, err
	}

	d.cleanupManager = openCleanupManager(opts, d.objProvider, d.onObsoleteTableDelete, d.getDeletionPacerInfo)

	if manifestExists {
		curVersion := d.mu.versions.currentVersion()
		if err := checkConsistency(curVersion, dirname, d.objProvider); err != nil {
			return nil, err
		}
	}

	tableCacheSize := TableCacheSize(opts.MaxOpenFiles)
	d.tableCache = newTableCacheContainer(opts.TableCache, d.cacheID, d.objProvider, d.opts, tableCacheSize)
	d.newIters = d.tableCache.newIters
	d.tableNewRangeKeyIter = d.tableCache.newRangeKeyIter

	// Replay any newer log files than the ones named in the manifest.
	type fileNumAndName struct {
		num  FileNum
		name string
	}
	var logFiles []fileNumAndName
	var previousOptionsFileNum FileNum
	var previousOptionsFilename string
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(opts.FS, filename)
		if !ok {
			continue
		}

		// Don't reuse any obsolete file numbers to avoid modifying an
		// ingested sstable's original external file.
		if d.mu.versions.nextFileNum <= fn.FileNum() {
			d.mu.versions.nextFileNum = fn.FileNum() + 1
		}

		switch ft {
		case fileTypeLog:
			if fn.FileNum() >= d.mu.versions.minUnflushedLogNum {
				logFiles = append(logFiles, fileNumAndName{fn.FileNum(), filename})
			}
			if d.logRecycler.minRecycleLogNum <= fn.FileNum() {
				d.logRecycler.minRecycleLogNum = fn.FileNum() + 1
			}
		case fileTypeOptions:
			if previousOptionsFileNum < fn.FileNum() {
				previousOptionsFileNum = fn.FileNum()
				previousOptionsFilename = filename
			}
		case fileTypeTemp, fileTypeOldTemp:
			if !d.opts.ReadOnly {
				// Some codepaths write to a temporary file and then
				// rename it to its final location when complete. A
				// temp file is leftover if a process exits before the
				// rename. Remove it.
				err := opts.FS.Remove(opts.FS.PathJoin(dirname, filename))
				if err != nil {
					return nil, err
				}
			}
		}
	}

	// Ratchet d.mu.versions.nextFileNum ahead of all known objects in the
	// objProvider. This avoids FileNum collisions with obsolete sstables.
	objects := d.objProvider.List()
	for _, obj := range objects {
		if d.mu.versions.nextFileNum <= obj.DiskFileNum.FileNum() {
			d.mu.versions.nextFileNum = obj.DiskFileNum.FileNum() + 1
		}
	}

	// Validate the most-recent OPTIONS file, if there is one.
	var strictWALTail bool
	if previousOptionsFilename != "" {
		path := opts.FS.PathJoin(dirname, previousOptionsFilename)
		strictWALTail, err = checkOptions(opts, path)
		if err != nil {
			return nil, err
		}
	}

	sort.Slice(logFiles, func(i, j int) bool {
		return logFiles[i].num < logFiles[j].num
	})

	var ve versionEdit
	var toFlush flushableList
	for i, lf := range logFiles {
		lastWAL := i == len(logFiles)-1
		flush, maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS,
			opts.FS.PathJoin(d.walDirname, lf.name), lf.num, strictWALTail && !lastWAL)
		if err != nil {
			return nil, err
		}
		toFlush = append(toFlush, flush...)
		d.mu.versions.markFileNumUsed(lf.num)
		if d.mu.versions.logSeqNum.Load() < maxSeqNum {
			d.mu.versions.logSeqNum.Store(maxSeqNum)
		}
	}
	d.mu.versions.visibleSeqNum.Store(d.mu.versions.logSeqNum.Load())

	if !d.opts.ReadOnly {
		// Create an empty .log file.
		newLogNum := d.mu.versions.getNextFileNum()

		// This logic is slightly different than RocksDB's. Specifically, RocksDB
		// sets MinUnflushedLogNum to max-recovered-log-num + 1. We set it to the
		// newLogNum. There should be no difference in using either value.
		ve.MinUnflushedLogNum = newLogNum

		// Create the manifest with the updated MinUnflushedLogNum before
		// creating the new log file. If we created the log file first, a
		// crash before the manifest is synced could leave two WALs with
		// unclean tails.
		d.mu.versions.logLock()
		if err := d.mu.versions.logAndApply(jobID, &ve, newFileMetrics(ve.NewFiles), false /* forceRotation */, func() []compactionInfo {
			return nil
		}); err != nil {
			return nil, err
		}

		for _, entry := range toFlush {
			entry.readerUnrefLocked(true)
		}

		newLogName := base.MakeFilepath(opts.FS, d.walDirname, fileTypeLog, newLogNum.DiskFileNum())
		d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum.DiskFileNum(), fileSize: 0})
		logFile, err := opts.FS.Create(newLogName)
		if err != nil {
			return nil, err
		}
		if err := d.walDir.Sync(); err != nil {
			return nil, err
		}
		d.opts.EventListener.WALCreated(WALCreateInfo{
			JobID:   jobID,
			Path:    newLogName,
			FileNum: newLogNum,
		})
		// This isn't strictly necessary as we don't use the log number for
		// memtables being flushed, only for the next unflushed memtable.
		d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum = newLogNum

		logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{
			NoSyncOnClose:   d.opts.NoSyncOnClose,
			BytesPerSync:    d.opts.WALBytesPerSync,
			PreallocateSize: d.walPreallocateSize(),
		})
		d.mu.log.metrics.fsyncLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
			Buckets: FsyncLatencyBuckets,
		})

		logWriterConfig := record.LogWriterConfig{
			WALMinSyncInterval: d.opts.WALMinSyncInterval,
			WALFsyncLatency:    d.mu.log.metrics.fsyncLatency,
			QueueSemChan:       d.commit.logSyncQSem,
		}
		d.mu.log.LogWriter = record.NewLogWriter(logFile, newLogNum, logWriterConfig)
		d.mu.versions.metrics.WAL.Files++
	}
	d.updateReadStateLocked(d.opts.DebugCheck)

	// If the Options specify a format major version higher than the
	// loaded database's, upgrade it. If this is a new database, this
	// code path also performs an initial upgrade from the starting
	// implicit MostCompatible version.
	//
	// We ratchet the version this far into Open so that migrations have a read
	// state available.
	if !d.opts.ReadOnly && opts.FormatMajorVersion > d.FormatMajorVersion() {
		if err := d.ratchetFormatMajorVersionLocked(opts.FormatMajorVersion); err != nil {
			return nil, err
		}
	}

	if !d.opts.ReadOnly {
		// Write the current options to disk.
		d.optionsFileNum = d.mu.versions.getNextFileNum().DiskFileNum()
		tmpPath := base.MakeFilepath(opts.FS, dirname, fileTypeTemp, d.optionsFileNum)
		optionsPath := base.MakeFilepath(opts.FS, dirname, fileTypeOptions, d.optionsFileNum)

		// Write them to a temporary file first, in case we crash before
		// we're done. A corrupt options file prevents opening the
		// database.
		optionsFile, err := opts.FS.Create(tmpPath)
		if err != nil {
			return nil, err
		}
		serializedOpts := []byte(opts.String())
		if _, err := optionsFile.Write(serializedOpts); err != nil {
			return nil, errors.CombineErrors(err, optionsFile.Close())
		}
		d.optionsFileSize = uint64(len(serializedOpts))
		if err := optionsFile.Sync(); err != nil {
			return nil, errors.CombineErrors(err, optionsFile.Close())
		}
		if err := optionsFile.Close(); err != nil {
			return nil, err
		}
		// Atomically rename to the OPTIONS-XXXXXX path. This rename is
		// guaranteed to be atomic because the destination path does not
		// exist.
		if err := opts.FS.Rename(tmpPath, optionsPath); err != nil {
			return nil, err
		}
		if err := d.dataDir.Sync(); err != nil {
			return nil, err
		}
	}

	if !d.opts.ReadOnly {
		d.scanObsoleteFiles(ls)
		d.deleteObsoleteFiles(jobID)
	} else {
		// All the log files are obsolete.
		d.mu.versions.metrics.WAL.Files = int64(len(logFiles))
	}
	d.mu.tableStats.cond.L = &d.mu.Mutex
	d.mu.tableValidation.cond.L = &d.mu.Mutex
	if !d.opts.ReadOnly {
		d.maybeCollectTableStatsLocked()
	}
	d.calculateDiskAvailableBytes()

	d.maybeScheduleFlush()
	d.maybeScheduleCompaction()

	// Note: this is a no-op if invariants are disabled or race is enabled.
	//
	// Setting a finalizer on *DB causes *DB to never be reclaimed and the
	// finalizer to never be run. The problem is due to this limitation of
	// finalizers mentioned in the SetFinalizer docs:
	//
	//   If a cyclic structure includes a block with a finalizer, that cycle is
	//   not guaranteed to be garbage collected and the finalizer is not
	//   guaranteed to run, because there is no ordering that respects the
	//   dependencies.
	//
	// DB has cycles with several of its internal structures: readState,
	// newIters, tableCache, versions, etc. Each of these individually causes a
	// cycle and prevents the finalizer from being run. But we can work around
	// this finalizer limitation by setting a finalizer on another object that
	// is tied to the lifetime of DB: the DB.closed atomic.Value.
	dPtr := fmt.Sprintf("%p", d)
	invariants.SetFinalizer(d.closed, func(obj interface{}) {
		v := obj.(*atomic.Value)
		if err := v.Load(); err == nil {
			fmt.Fprintf(os.Stderr, "%s: unreferenced DB not closed\n", dPtr)
			os.Exit(1)
		}
	})

	return d, nil
}
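// The sketch below is not part of the original file; it illustrates a typical
// call into Open: an on-disk store, a caller-provided block cache, and
// ErrorIfNotExists left false so a missing directory is created rather than
// rejected. The path, cache size, and function name are assumptions for the
// example only.
func exampleOpen() error {
	opts := &Options{
		Cache:        NewCache(64 << 20), // 64 MB block cache; Open takes its own reference.
		MemTableSize: 64 << 20,           // 64 MB memtables
	}
	// Drop the example's reference; the DB holds its own until Close.
	defer opts.Cache.Unref()

	db, err := Open("/tmp/pebble-example" /* hypothetical directory */, opts)
	if err != nil {
		return err
	}
	return db.Close()
}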
// prepareAndOpenDirs opens the directories for the store (and creates them if
// necessary).
//
// Returns an error if ReadOnly is set and the directories don't exist.
func prepareAndOpenDirs(
	dirname string, opts *Options,
) (walDirname string, dataDir vfs.File, walDir vfs.File, err error) {
	walDirname = opts.WALDir
	if opts.WALDir == "" {
		walDirname = dirname
	}

	// Create directories if needed.
	if !opts.ReadOnly {
		if err := opts.FS.MkdirAll(dirname, 0755); err != nil {
			return "", nil, nil, err
		}
		if walDirname != dirname {
			if err := opts.FS.MkdirAll(walDirname, 0755); err != nil {
				return "", nil, nil, err
			}
		}
	}

	dataDir, err = opts.FS.OpenDir(dirname)
	if err != nil {
		if opts.ReadOnly && oserror.IsNotExist(err) {
			return "", nil, nil, errors.Errorf("pebble: database %q does not exist", dirname)
		}
		return "", nil, nil, err
	}

	if walDirname == dirname {
		walDir = dataDir
	} else {
		walDir, err = opts.FS.OpenDir(walDirname)
		if err != nil {
			dataDir.Close()
			return "", nil, nil, err
		}
	}
	return walDirname, dataDir, walDir, nil
}

// GetVersion returns the engine version string from the latest options
// file present in dir. Used to check what Pebble or RocksDB version was last
// used to write to the database stored in this directory. An empty string is
// returned if no valid OPTIONS file with a version key was found.
func GetVersion(dir string, fs vfs.FS) (string, error) {
	ls, err := fs.List(dir)
	if err != nil {
		return "", err
	}
	var version string
	lastOptionsSeen := FileNum(0)
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(fs, filename)
		if !ok {
			continue
		}
		switch ft {
		case fileTypeOptions:
			// If this file has a higher number than the last options file
			// processed, reset version. This is because rocksdb often
			// writes multiple options files without deleting previous ones.
			// Otherwise, skip parsing this options file.
			if fn.FileNum() > lastOptionsSeen {
				version = ""
				lastOptionsSeen = fn.FileNum()
			} else {
				continue
			}
			f, err := fs.Open(fs.PathJoin(dir, filename))
			if err != nil {
				return "", err
			}
			data, err := io.ReadAll(f)
			f.Close()

			if err != nil {
				return "", err
			}
			err = parseOptions(string(data), func(section, key, value string) error {
				switch {
				case section == "Version":
					switch key {
					case "pebble_version":
						version = value
					case "rocksdb_version":
						version = fmt.Sprintf("rocksdb v%s", value)
					}
				}
				return nil
			})
			if err != nil {
				return "", err
			}
		}
	}
	return version, nil
}
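// A minimal sketch (not in the original file) of using GetVersion before
// opening a store, e.g. to log which engine version last wrote to the
// directory. The path and function name are assumptions for the example.
func exampleGetVersion() {
	version, err := GetVersion("/tmp/pebble-example" /* hypothetical path */, vfs.Default)
	if err != nil {
		// The directory could not be listed or an OPTIONS file was unreadable.
		return
	}
	if version == "" {
		// No OPTIONS file with a version key was found.
		return
	}
	fmt.Println("last written by", version)
}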
// replayWAL replays the edits in the specified log file. If the DB is in
// read only mode, then the WALs are replayed into memtables and not flushed. If
// the DB is not in read only mode, then the contents of the WAL are guaranteed
// to be flushed.
//
// The toFlush return value is a list of flushables associated with the WAL
// being replayed which will be flushed. Once the version edit has been applied
// to the manifest, it is up to the caller of replayWAL to unreference the
// toFlush flushables returned by replayWAL.
//
// d.mu must be held when calling this, but the mutex may be dropped and
// re-acquired during the course of this method.
func (d *DB) replayWAL(
	jobID int, ve *versionEdit, fs vfs.FS, filename string, logNum FileNum, strictWALTail bool,
) (toFlush flushableList, maxSeqNum uint64, err error) {
	file, err := fs.Open(filename)
	if err != nil {
		return nil, 0, err
	}
	defer file.Close()
	var (
		b               Batch
		buf             bytes.Buffer
		mem             *memTable
		entry           *flushableEntry
		rr              = record.NewReader(file, logNum)
		offset          int64 // byte offset in rr
		lastFlushOffset int64
		keysReplayed    int64 // number of keys replayed
		batchesReplayed int64 // number of batches replayed
	)

	// TODO(jackson): This function is interspersed with panics, in addition to
	// corruption error propagation. Audit them to ensure we're truly only
	// panicking where the error points to a Pebble bug and not user or
	// hardware-induced corruption.

	if d.opts.ReadOnly {
		// In read-only mode, we replay directly into the mutable memtable which will
		// never be flushed.
		mem = d.mu.mem.mutable
		if mem != nil {
			entry = d.mu.mem.queue[len(d.mu.mem.queue)-1]
		}
	}

	// Flushes the current memtable, if not nil.
	flushMem := func() {
		if mem == nil {
			return
		}
		var logSize uint64
		if offset >= lastFlushOffset {
			logSize = uint64(offset - lastFlushOffset)
		}
		// Else, this was the initial memtable in the read-only case which must have
		// been empty, but we need to flush it since we don't want to add to it later.
		lastFlushOffset = offset
		entry.logSize = logSize
		if !d.opts.ReadOnly {
			toFlush = append(toFlush, entry)
		}
		mem, entry = nil, nil
	}
	// Creates a new memtable if there is no current memtable.
	ensureMem := func(seqNum uint64) {
		if mem != nil {
			return
		}
		mem, entry = d.newMemTable(logNum, seqNum)
		if d.opts.ReadOnly {
			d.mu.mem.mutable = mem
			d.mu.mem.queue = append(d.mu.mem.queue, entry)
		}
	}

	// updateVE is used to update ve with information about new files created
	// during the flush of any flushable not of type ingestedFlushable. For the
	// flushable of type ingestedFlushable we use custom handling below.
	updateVE := func() error {
		// TODO(bananabrick): See if we can use the actual base level here,
		// instead of using 1.
		c := newFlush(d.opts, d.mu.versions.currentVersion(),
			1 /* base level */, toFlush, d.timeNow())
		newVE, _, _, err := d.runCompaction(jobID, c)
		if err != nil {
			return errors.Wrapf(err, "running compaction during WAL replay")
		}
		ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...)
		return nil
	}
	defer func() {
		if err != nil {
			err = errors.WithDetailf(err, "replaying log %s, offset %d", logNum, offset)
		}
	}()

	for {
		offset = rr.Offset()
		r, err := rr.Next()
		if err == nil {
			_, err = io.Copy(&buf, r)
		}
		if err != nil {
			// It is common to encounter a zeroed or invalid chunk due to WAL
			// preallocation and WAL recycling. We need to distinguish these
			// errors from EOF in order to recognize that the record was
			// truncated and to avoid replaying subsequent WALs, but want
			// to otherwise treat them like EOF.
			if err == io.EOF {
				break
			} else if record.IsInvalidRecord(err) && !strictWALTail {
				break
			}
			return nil, 0, errors.Wrap(err, "pebble: error when replaying WAL")
		}

		if buf.Len() < batchHeaderLen {
			return nil, 0, base.CorruptionErrorf("pebble: corrupt log file %q (num %s)",
				filename, errors.Safe(logNum))
		}

		if d.opts.ErrorIfNotPristine {
			return nil, 0, errors.WithDetailf(ErrDBNotPristine, "location: %q", d.dirname)
		}

		// Specify Batch.db so that Batch.SetRepr will compute Batch.memTableSize
		// which is used below.
		b = Batch{}
		b.db = d
		b.SetRepr(buf.Bytes())
		seqNum := b.SeqNum()
		maxSeqNum = seqNum + uint64(b.Count())
		keysReplayed += int64(b.Count())
		batchesReplayed++
		{
			br := b.Reader()
			if kind, encodedFileNum, _, ok, err := br.Next(); err != nil {
				return nil, 0, err
			} else if ok && kind == InternalKeyKindIngestSST {
				fileNums := make([]base.DiskFileNum, 0, b.Count())
				addFileNum := func(encodedFileNum []byte) {
					fileNum, n := binary.Uvarint(encodedFileNum)
					if n <= 0 {
						panic("pebble: ingest sstable file num is invalid.")
					}
					fileNums = append(fileNums, base.FileNum(fileNum).DiskFileNum())
				}
				addFileNum(encodedFileNum)

				for i := 1; i < int(b.Count()); i++ {
					kind, encodedFileNum, _, ok, err := br.Next()
					if err != nil {
						return nil, 0, err
					}
					if kind != InternalKeyKindIngestSST {
						panic("pebble: invalid batch key kind.")
					}
					if !ok {
						panic("pebble: invalid batch count.")
					}
					addFileNum(encodedFileNum)
				}

				if _, _, _, ok, err := br.Next(); err != nil {
					return nil, 0, err
				} else if ok {
					panic("pebble: invalid number of entries in batch.")
				}

				meta := make([]*fileMetadata, len(fileNums))
				for i, n := range fileNums {
					var readable objstorage.Readable
					objMeta, err := d.objProvider.Lookup(fileTypeTable, n)
					if err != nil {
						return nil, 0, errors.Wrap(err, "pebble: error when looking up ingested SSTs")
					}
					if objMeta.IsRemote() {
						readable, err = d.objProvider.OpenForReading(context.TODO(), fileTypeTable, n, objstorage.OpenOptions{MustExist: true})
						if err != nil {
							return nil, 0, errors.Wrap(err, "pebble: error when opening flushable ingest files")
						}
					} else {
						path := base.MakeFilepath(d.opts.FS, d.dirname, fileTypeTable, n)
						f, err := d.opts.FS.Open(path)
						if err != nil {
							return nil, 0, err
						}

						readable, err = sstable.NewSimpleReadable(f)
						if err != nil {
							return nil, 0, err
						}
					}
					// NB: ingestLoad1 will close readable.
					meta[i], err = ingestLoad1(d.opts, d.FormatMajorVersion(), readable, d.cacheID, n)
					if err != nil {
						return nil, 0, errors.Wrap(err, "pebble: error when loading flushable ingest files")
					}
				}

				if uint32(len(meta)) != b.Count() {
					panic("pebble: couldn't load all files in WAL entry.")
				}

				entry, err = d.newIngestedFlushableEntry(
					meta, seqNum, logNum,
				)
				if err != nil {
					return nil, 0, err
				}

				if d.opts.ReadOnly {
					d.mu.mem.queue = append(d.mu.mem.queue, entry)
					// We added the IngestSST flushable to the queue. But there
					// must be at least one WAL entry waiting to be replayed. We
					// have to ensure this newer WAL entry isn't replayed into
					// the current value of d.mu.mem.mutable because the current
					// mutable memtable exists before this flushable entry in
					// the memtable queue. To ensure this, we just need to unset
					// d.mu.mem.mutable. When a newer WAL is replayed, we will
					// set d.mu.mem.mutable to a newer value.
					d.mu.mem.mutable = nil
				} else {
					toFlush = append(toFlush, entry)
					// During WAL replay, the LSM only has L0, hence the
					// baseLevel is 1. For the sake of simplicity, we place the
					// ingested files in L0 here, instead of finding their
					// target levels. It is expected that WAL replay should be
					// rare, and that flushables of type ingestedFlushable
					// should also be rare. So, placing the ingested files in L0
					// is alright.
					//
					// TODO(bananabrick): Maybe refactor this function to allow
					// us to easily place ingested files in levels as low as
					// possible during WAL replay. It would require breaking up
					// the application of ve to the manifest into chunks and is
					// not pretty w/o a refactor to this function and how it's
					// used.
					c := newFlush(
						d.opts, d.mu.versions.currentVersion(),
						1, /* base level */
						[]*flushableEntry{entry},
						d.timeNow(),
					)
					for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files {
						ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: 0, Meta: file.FileMetadata})
					}
				}
				return toFlush, maxSeqNum, nil
			}
		}

		if b.memTableSize >= uint64(d.largeBatchThreshold) {
			flushMem()
			// Make a copy of the data slice since it is currently owned by buf and will
			// be reused in the next iteration.
			b.data = append([]byte(nil), b.data...)
			b.flushable, err = newFlushableBatch(&b, d.opts.Comparer)
			if err != nil {
				return nil, 0, err
			}
			entry := d.newFlushableEntry(b.flushable, logNum, b.SeqNum())
			// Disable memory accounting by adding a reader ref that will never be
			// removed.
			entry.readerRefs.Add(1)
			if d.opts.ReadOnly {
				d.mu.mem.queue = append(d.mu.mem.queue, entry)
				// We added the flushable batch to the flushable queue. But
				// there must be at least one WAL entry waiting to be replayed.
				// We have to ensure this newer WAL entry isn't replayed into
				// the current value of d.mu.mem.mutable because the current
				// mutable memtable exists before this flushable entry in the
				// memtable queue. To ensure this, we just need to unset
				// d.mu.mem.mutable. When a newer WAL is replayed, we will set
				// d.mu.mem.mutable to a newer value.
				d.mu.mem.mutable = nil
			} else {
				toFlush = append(toFlush, entry)
			}
		} else {
			ensureMem(seqNum)
			if err = mem.prepare(&b); err != nil && err != arenaskl.ErrArenaFull {
				return nil, 0, err
			}
			// We loop since DB.newMemTable() slowly grows the size of allocated memtables, so the
			// batch may not initially fit, but will eventually fit (since it is smaller than
			// largeBatchThreshold).
			for err == arenaskl.ErrArenaFull {
				flushMem()
				ensureMem(seqNum)
				err = mem.prepare(&b)
				if err != nil && err != arenaskl.ErrArenaFull {
					return nil, 0, err
				}
			}
			if err = mem.apply(&b, seqNum); err != nil {
				return nil, 0, err
			}
			mem.writerUnref()
		}
		buf.Reset()
	}

	d.opts.Logger.Infof("[JOB %d] WAL file %s with log number %s stopped reading at offset: %d; replayed %d keys in %d batches", jobID, filename, logNum.String(), offset, keysReplayed, batchesReplayed)
	flushMem()

	// mem is nil here.
	if !d.opts.ReadOnly {
		err = updateVE()
		if err != nil {
			return nil, 0, err
		}
	}
	return toFlush, maxSeqNum, err
}
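// Sketch (not part of the original file): opening an existing store read-only.
// As described in replayWAL above, in read-only mode WAL contents are replayed
// into memtables but never flushed, so Open leaves the directory untouched.
// The path, key, and function name are assumptions for the example.
func exampleOpenReadOnly() error {
	db, err := Open("/tmp/pebble-example" /* hypothetical path */, &Options{ReadOnly: true})
	if err != nil {
		return err
	}
	defer db.Close()

	_, closer, err := db.Get([]byte("some-key"))
	if err != nil {
		return err // includes ErrNotFound when the key is absent
	}
	return closer.Close()
}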
func checkOptions(opts *Options, path string) (strictWALTail bool, err error) {
	f, err := opts.FS.Open(path)
	if err != nil {
		return false, err
	}
	defer f.Close()

	data, err := io.ReadAll(f)
	if err != nil {
		return false, err
	}
	return opts.checkOptions(string(data))
}

// DBDesc briefly describes high-level state about a database.
type DBDesc struct {
	// Exists is true if an existing database was found.
	Exists bool
	// FormatMajorVersion indicates the database's current format
	// version.
	FormatMajorVersion FormatMajorVersion
	// ManifestFilename is the filename of the current active manifest,
	// if the database exists.
	ManifestFilename string
}

// Peek looks for an existing database in dirname on the provided FS. It
// returns a brief description of the database. Peek is read-only and
// does not open the database.
func Peek(dirname string, fs vfs.FS) (*DBDesc, error) {
	vers, versMarker, err := lookupFormatMajorVersion(fs, dirname)
	if err != nil {
		return nil, err
	}
	// TODO(jackson): Immediately closing the marker is clunky. Add a
	// PeekMarker variant that avoids opening the directory.
	if err := versMarker.Close(); err != nil {
		return nil, err
	}

	// Find the currently active manifest, if there is one.
	manifestMarker, manifestFileNum, exists, err := findCurrentManifest(vers, fs, dirname)
	if err != nil {
		return nil, err
	}
	// TODO(jackson): Immediately closing the marker is clunky. Add a
	// PeekMarker variant that avoids opening the directory.
	if err := manifestMarker.Close(); err != nil {
		return nil, err
	}

	desc := &DBDesc{
		Exists:             exists,
		FormatMajorVersion: vers,
	}
	if exists {
		desc.ManifestFilename = base.MakeFilepath(fs, dirname, fileTypeManifest, manifestFileNum)
	}
	return desc, nil
}
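// Sketch (not part of the original file): using Peek to check for an existing
// store without opening it, e.g. to decide whether Open would create a fresh
// DB. The path and function name are assumptions for the example.
func examplePeek() {
	desc, err := Peek("/tmp/pebble-example" /* hypothetical path */, vfs.Default)
	if err != nil {
		return
	}
	if !desc.Exists {
		fmt.Println("no database here; Open would create one")
		return
	}
	fmt.Println("format version:", desc.FormatMajorVersion, "manifest:", desc.ManifestFilename)
}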
// LockDirectory acquires the database directory lock in the named directory,
// preventing another process from opening the database. LockDirectory returns a
// handle to the held lock that may be passed to Open through Options.Lock to
// subsequently open the database, skipping lock acquisition during Open.
//
// LockDirectory may be used to expand the critical section protected by the
// database lock to include setup before the call to Open.
func LockDirectory(dirname string, fs vfs.FS) (*Lock, error) {
	fileLock, err := fs.Lock(base.MakeFilepath(fs, dirname, fileTypeLock, base.FileNum(0).DiskFileNum()))
	if err != nil {
		return nil, err
	}
	l := &Lock{dirname: dirname, fileLock: fileLock}
	l.refs.Store(1)
	invariants.SetFinalizer(l, func(obj interface{}) {
		if refs := obj.(*Lock).refs.Load(); refs > 0 {
			panic(errors.AssertionFailedf("lock for %q finalized with %d refs", dirname, refs))
		}
	})
	return l, nil
}

// Lock represents a file lock on a directory. It may be passed to Open through
// Options.Lock to elide lock acquisition during Open.
type Lock struct {
	dirname  string
	fileLock io.Closer
	// refs is a count of the number of handles on the lock. refs must be 0, 1
	// or 2.
	//
	// When acquired by the client and passed to Open, refs = 1 and the Open
	// call increments it to 2. When the database is closed, it's decremented to
	// 1. Finally, when the original caller calls Close on the Lock, it's
	// decremented to zero and the underlying file lock is released.
	//
	// When Open acquires the file lock, refs remains at 1 until the database is
	// closed.
	refs atomic.Int32
}

func (l *Lock) refForOpen() error {
	// During Open, when a user passed in a lock, the reference count must be
	// exactly 1. If it's zero, the lock is no longer held and is invalid. If
	// it's 2, the lock is already in use by another database within the
	// process.
	if !l.refs.CompareAndSwap(1, 2) {
		return errors.Errorf("pebble: unexpected Lock reference count; is the lock already in use?")
	}
	return nil
}

// Close releases the lock, permitting another process to lock and open the
// database. Close must not be called until after a database using the Lock has
// been closed.
func (l *Lock) Close() error {
	if l.refs.Add(-1) > 0 {
		return nil
	}
	defer func() { l.fileLock = nil }()
	return l.fileLock.Close()
}

// ErrDBDoesNotExist is generated when ErrorIfNotExists is set and the database
// does not exist.
//
// Note that errors can be wrapped with more details; use errors.Is().
var ErrDBDoesNotExist = errors.New("pebble: database does not exist")

// ErrDBAlreadyExists is generated when ErrorIfExists is set and the database
// already exists.
//
// Note that errors can be wrapped with more details; use errors.Is().
var ErrDBAlreadyExists = errors.New("pebble: database already exists")

// ErrDBNotPristine is generated when ErrorIfNotPristine is set and the database
// already exists and is not pristine.
//
// Note that errors can be wrapped with more details; use errors.Is().
var ErrDBNotPristine = errors.New("pebble: database already exists and is not pristine")

// IsCorruptionError returns true if the given error indicates database
// corruption.
func IsCorruptionError(err error) bool {
	return errors.Is(err, base.ErrCorruption)
}
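// Sketch (not part of the original file): acquiring the directory lock ahead
// of time with LockDirectory, then handing it to Open via Options.Lock. Error
// handling uses errors.Is with the sentinel errors defined above. The path and
// function name are assumptions for the example.
func exampleLockThenOpen() error {
	lock, err := LockDirectory("/tmp/pebble-example" /* hypothetical path */, vfs.Default)
	if err != nil {
		return err
	}
	// The caller keeps its own reference on the Lock; release it last. The
	// deferred Close runs after db.Close() below has completed.
	defer lock.Close()

	db, err := Open("/tmp/pebble-example", &Options{
		Lock:             lock,
		ErrorIfNotExists: true, // fail instead of creating a new store
	})
	switch {
	case errors.Is(err, ErrDBDoesNotExist):
		return err // no store at this path
	case IsCorruptionError(err):
		return err // on-disk state is corrupt
	case err != nil:
		return err
	}
	return db.Close()
}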
func checkConsistency(v *manifest.Version, dirname string, objProvider objstorage.Provider) error {
	var buf bytes.Buffer
	var args []interface{}

	dedup := make(map[base.DiskFileNum]struct{})
	for level, files := range v.Levels {
		iter := files.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			backingState := f.FileBacking
			if _, ok := dedup[backingState.DiskFileNum]; ok {
				continue
			}
			dedup[backingState.DiskFileNum] = struct{}{}
			fileNum := backingState.DiskFileNum
			fileSize := backingState.Size
			// We allow foreign objects to have a mismatch between sizes. This is
			// because we might skew the backing size stored by our objprovider
			// to prevent us from over-prioritizing this file for compaction.
			meta, err := objProvider.Lookup(base.FileTypeTable, fileNum)
			var size int64
			if err == nil {
				if objProvider.IsSharedForeign(meta) {
					continue
				}
				size, err = objProvider.Size(meta)
			}
			if err != nil {
				buf.WriteString("L%d: %s: %v\n")
				args = append(args, errors.Safe(level), errors.Safe(fileNum), err)
				continue
			}

			if size != int64(fileSize) {
				buf.WriteString("L%d: %s: object size mismatch (%s): %d (disk) != %d (MANIFEST)\n")
				args = append(args, errors.Safe(level), errors.Safe(fileNum), objProvider.Path(meta),
					errors.Safe(size), errors.Safe(fileSize))
				continue
			}
		}
	}

	if buf.Len() == 0 {
		return nil
	}
	return errors.Errorf(buf.String(), args...)
}
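// Final sketch (not part of the original file): an in-memory store, handy for
// tests. Setting Options.FormatMajorVersion asks Open to ratchet a newly
// created (or older) store up to that version, as done near the end of Open
// above. The function name is an assumption for the example.
func exampleInMemoryOpen() error {
	db, err := Open("", &Options{
		FS:                 vfs.NewMem(),
		FormatMajorVersion: FormatNewest,
	})
	if err != nil {
		return err
	}
	return db.Close()
}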