github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/open.go

// Copyright 2012 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"math"
	"os"
	"sort"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/arenaskl"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/cache"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/manual"
	"github.com/zuoyebang/bitalostable/internal/rate"
	"github.com/zuoyebang/bitalostable/record"
	"github.com/zuoyebang/bitalostable/vfs"
)

const (
	initialMemTableSize = 256 << 10 // 256 KB

	// The max batch size is limited by the uint32 offsets stored in
	// internal/batchskl.node, DeferredBatchOp, and flushableBatchEntry.
	maxBatchSize = 4 << 30 // 4 GB

	// The max memtable size is limited by the uint32 offsets stored in
	// internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry.
	maxMemTableSize = 4 << 30 // 4 GB
)

// TableCacheSize returns the size to use for a table cache that serves a
// single DB, given the maximum number of files that DB may hold open.
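//
// For example, with MaxOpenFiles set to 1000 (a hypothetical value), the
// table cache is sized to 1000 minus the descriptors reserved for non-table
// files (numNonTableCacheFiles), floored at minTableCacheSize.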
func TableCacheSize(maxOpenFiles int) int {
	tableCacheSize := maxOpenFiles - numNonTableCacheFiles
	if tableCacheSize < minTableCacheSize {
		tableCacheSize = minTableCacheSize
	}
	return tableCacheSize
}

// Open opens a DB whose files live in the given directory.
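//
// A minimal usage sketch from an importing package (hypothetical path, error
// handling elided):
//
//	db, err := bitalostable.Open("demo-db", &bitalostable.Options{})
//	if err != nil {
//		return err
//	}
//	defer db.Close()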
func Open(dirname string, opts *Options) (db *DB, _ error) {
	// Make a copy of the options so that we don't mutate the passed in options.
	opts = opts.Clone()
	opts = opts.EnsureDefaults()
	if err := opts.Validate(); err != nil {
		return nil, err
	}

	if opts.Cache == nil {
		opts.Cache = cache.New(cacheDefaultSize)
	} else {
		opts.Cache.Ref()
	}

	d := &DB{
		cacheID:             opts.Cache.NewID(),
		dirname:             dirname,
		walDirname:          opts.WALDir,
		opts:                opts,
		cmp:                 opts.Comparer.Compare,
		equal:               opts.equal(),
		merge:               opts.Merger.Merge,
		split:               opts.Comparer.Split,
		abbreviatedKey:      opts.Comparer.AbbreviatedKey,
		largeBatchThreshold: (opts.MemTableSize - int(memTableEmptySize)) / 2,
		logRecycler:         logRecycler{limit: opts.MemTableStopWritesThreshold + 1},
		closed:              new(atomic.Value),
		closedCh:            make(chan struct{}),
	}
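	// Note: largeBatchThreshold above means a batch whose memtable size is at
	// least half the memtable is never applied to a memtable; it is instead
	// wrapped in a flushableBatch and queued for flush (see the large-batch
	// path in replayWAL).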
	d.mu.versions = &versionSet{}
	d.atomic.diskAvailBytes = math.MaxUint64
	d.mu.versions.diskAvailBytes = d.getDiskAvailableBytesCached

	defer func() {
		// If an error or panic occurs during open, attempt to release the manually
		// allocated memory resources. Note that rather than look for an error, we
		// look for the return of a nil DB pointer.
		if r := recover(); db == nil {
			// Release our references to the Cache. Note that both the DB, and
			// tableCache have a reference. When we release the reference to
			// the tableCache, and if there are no other references to
			// the tableCache, then the tableCache will also release its
			// reference to the cache.
			opts.Cache.Unref()

			if d.tableCache != nil {
				_ = d.tableCache.close()
			}

			for _, mem := range d.mu.mem.queue {
				switch t := mem.flushable.(type) {
				case *memTable:
					manual.Free(t.arenaBuf)
					t.arenaBuf = nil
				}
			}
			if r != nil {
				panic(r)
			}
		}
	}()

	tableCacheSize := TableCacheSize(opts.MaxOpenFiles)
	d.tableCache = newTableCacheContainer(opts.TableCache, d.cacheID, dirname, opts.FS, d.opts, tableCacheSize)
	d.newIters = d.tableCache.newIters
	d.tableNewRangeKeyIter = d.tableCache.newRangeKeyIter

	d.commit = newCommitPipeline(commitEnv{
		logSeqNum:     &d.mu.versions.atomic.logSeqNum,
		visibleSeqNum: &d.mu.versions.atomic.visibleSeqNum,
		apply:         d.commitApply,
		write:         d.commitWrite,
	})
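	// The commit pipeline just constructed orders concurrent batch commits:
	// commitWrite appends a batch to the WAL, commitApply applies it to the
	// memtable, and the pipeline publishes its sequence numbers by ratcheting
	// visibleSeqNum.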
	d.deletionLimiter = rate.NewLimiter(
		rate.Limit(d.opts.Experimental.MinDeletionRate),
		d.opts.Experimental.MinDeletionRate)
	d.mu.nextJobID = 1
	d.mu.mem.nextSize = opts.MemTableSize
	//if d.mu.mem.nextSize > initialMemTableSize {
	//	d.mu.mem.nextSize = initialMemTableSize
	//}
	d.mu.mem.cond.L = &d.mu.Mutex
	d.mu.cleaner.cond.L = &d.mu.Mutex
	d.mu.compact.cond.L = &d.mu.Mutex
	d.mu.compact.inProgress = make(map[*compaction]struct{})
	d.mu.compact.noOngoingFlushStartTime = time.Now()
	d.mu.snapshots.init()
	// logSeqNum is the next sequence number that will be assigned. Start
	// assigning sequence numbers from 1 to match rocksdb.
	d.mu.versions.atomic.logSeqNum = 1

	d.timeNow = time.Now

	d.mu.Lock()
	defer d.mu.Unlock()

	if !d.opts.ReadOnly {
		err := opts.FS.MkdirAll(dirname, 0755)
		if err != nil {
			return nil, err
		}
	}

	// Ensure we close resources if we error out early. If the database is
	// successfully opened, the named return value `db` will be set to `d`.
	defer func() {
		if db != nil {
			// The database was successfully opened.
			return
		}
		if d.dataDir != nil {
			d.dataDir.Close()
		}
		if d.walDirname != d.dirname && d.walDir != nil {
			d.walDir.Close()
		}
		if d.mu.formatVers.marker != nil {
			d.mu.formatVers.marker.Close()
		}
	}()

	// Open the database and WAL directories first in order to check for their
	// existence.
	var err error
	d.dataDir, err = opts.FS.OpenDir(dirname)
	if err != nil {
		return nil, err
	}
	if d.walDirname == "" {
		d.walDirname = d.dirname
	}
	if d.walDirname == d.dirname {
		d.walDir = d.dataDir
	} else {
		if !d.opts.ReadOnly {
			err := opts.FS.MkdirAll(d.walDirname, 0755)
			if err != nil {
				return nil, err
			}
		}
		d.walDir, err = opts.FS.OpenDir(d.walDirname)
		if err != nil {
			return nil, err
		}
	}

	// Lock the database directory.
	fileLock, err := opts.FS.Lock(base.MakeFilepath(opts.FS, dirname, fileTypeLock, 0))
	if err != nil {
		d.dataDir.Close()
		if d.dataDir != d.walDir {
			d.walDir.Close()
		}
		return nil, err
	}
	defer func() {
		if fileLock != nil {
			fileLock.Close()
		}
	}()
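	// On success, ownership of fileLock is transferred to d.fileLock at the
	// end of Open, making the deferred Close above a no-op.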

	// Establish the format major version.
	{
		d.mu.formatVers.vers, d.mu.formatVers.marker, err = lookupFormatMajorVersion(opts.FS, dirname)
		if err != nil {
			return nil, err
		}
		if !d.opts.ReadOnly {
			if err := d.mu.formatVers.marker.RemoveObsolete(); err != nil {
				return nil, err
			}
		}
	}

	jobID := d.mu.nextJobID
	d.mu.nextJobID++

	// Find the currently active manifest, if there is one.
	manifestMarker, manifestFileNum, exists, err := findCurrentManifest(d.mu.formatVers.vers, opts.FS, dirname)
	setCurrent := setCurrentFunc(d.mu.formatVers.vers, manifestMarker, opts.FS, dirname, d.dataDir)
	defer func() {
		// Ensure we close the manifest marker if we error out for any reason.
		// If the database is successfully opened, the *versionSet will take
		// ownership over the manifest marker, ensuring it's closed when the DB
		// is closed.
		if db == nil {
			manifestMarker.Close()
		}
	}()
	if err != nil {
		return nil, errors.Wrapf(err, "bitalostable: database %q", dirname)
	} else if !exists && !d.opts.ReadOnly && !d.opts.ErrorIfNotExists {
		// Create the DB if it did not already exist.

		if err := d.mu.versions.create(jobID, dirname, opts, manifestMarker, setCurrent, &d.mu.Mutex); err != nil {
			return nil, err
		}
	} else if opts.ErrorIfExists {
		return nil, errors.Errorf("bitalostable: database %q already exists", dirname)
	} else {
		// Load the version set.
		if err := d.mu.versions.load(dirname, opts, manifestFileNum, manifestMarker, setCurrent, &d.mu.Mutex); err != nil {
			return nil, err
		}
		if err := d.mu.versions.currentVersion().CheckConsistency(dirname, opts.FS); err != nil {
			return nil, err
		}
	}

	// If the Options specify a format major version higher than the
	// loaded database's, upgrade it. If this is a new database, this
	// code path also performs an initial upgrade from the starting
	// implicit MostCompatible version.
	if !d.opts.ReadOnly && opts.FormatMajorVersion > d.mu.formatVers.vers {
		if err := d.ratchetFormatMajorVersionLocked(opts.FormatMajorVersion); err != nil {
			return nil, err
		}
	}
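	// Note: ratcheting steps through each intermediate format major version,
	// running its migration and persisting the new version via the marker
	// file before moving on to the next.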

	// Atomic markers like the one used for the MANIFEST may leave
	// behind obsolete files if there's a crash mid-update. Clean these
	// up if we're not in read-only mode.
	if !d.opts.ReadOnly {
		if err := manifestMarker.RemoveObsolete(); err != nil {
			return nil, err
		}
	}

	// In read-only mode, we replay directly into the mutable memtable but never
	// flush it. We need to delay creation of the memtable until we know the
	// sequence number of the first batch that will be inserted.
	if !d.opts.ReadOnly {
		var entry *flushableEntry
		d.mu.mem.mutable, entry = d.newMemTable(0 /* logNum */, d.mu.versions.atomic.logSeqNum)
		d.mu.mem.queue = append(d.mu.mem.queue, entry)
	}

	ls, err := opts.FS.List(d.walDirname)
	if err != nil {
		return nil, err
	}
	if d.dirname != d.walDirname {
		ls2, err := opts.FS.List(d.dirname)
		if err != nil {
			return nil, err
		}
		ls = append(ls, ls2...)
	}

	// Replay any newer log files than the ones named in the manifest.
	type fileNumAndName struct {
		num  FileNum
		name string
	}
	var logFiles []fileNumAndName
	var previousOptionsFileNum FileNum
	var previousOptionsFilename string
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(opts.FS, filename)
		if !ok {
			continue
		}

		// Don't reuse any obsolete file numbers to avoid modifying an
		// ingested sstable's original external file.
		if d.mu.versions.nextFileNum <= fn {
			d.mu.versions.nextFileNum = fn + 1
		}

		switch ft {
		case fileTypeLog:
			if fn >= d.mu.versions.minUnflushedLogNum {
				logFiles = append(logFiles, fileNumAndName{fn, filename})
			}
			if d.logRecycler.minRecycleLogNum <= fn {
				d.logRecycler.minRecycleLogNum = fn + 1
			}
		case fileTypeOptions:
			if previousOptionsFileNum < fn {
				previousOptionsFileNum = fn
				previousOptionsFilename = filename
			}
		case fileTypeTemp, fileTypeOldTemp:
			if !d.opts.ReadOnly {
				// Some codepaths write to a temporary file and then
				// rename it to its final location when complete.  A
				// temp file is leftover if a process exits before the
				// rename.  Remove it.
				err := opts.FS.Remove(opts.FS.PathJoin(dirname, filename))
				if err != nil {
					return nil, err
				}
			}
		}
	}

	// Validate the most-recent OPTIONS file, if there is one.
	var strictWALTail bool
	if previousOptionsFilename != "" {
		path := opts.FS.PathJoin(dirname, previousOptionsFilename)
		strictWALTail, err = checkOptions(opts, path)
		if err != nil {
			return nil, err
		}
	}

	sort.Slice(logFiles, func(i, j int) bool {
		return logFiles[i].num < logFiles[j].num
	})
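	// Replaying the WALs in increasing file-number order reapplies batches in
	// their original commit order, since log numbers are allocated
	// monotonically.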

	var ve versionEdit
	for i, lf := range logFiles {
		lastWAL := i == len(logFiles)-1
		maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS,
			opts.FS.PathJoin(d.walDirname, lf.name), lf.num, strictWALTail && !lastWAL)
		if err != nil {
			return nil, err
		}
		d.mu.versions.markFileNumUsed(lf.num)
		if d.mu.versions.atomic.logSeqNum < maxSeqNum {
			d.mu.versions.atomic.logSeqNum = maxSeqNum
		}
	}
	d.mu.versions.atomic.visibleSeqNum = d.mu.versions.atomic.logSeqNum

	if !d.opts.ReadOnly {
		// Create an empty .log file.
		newLogNum := d.mu.versions.getNextFileNum()

		// This logic is slightly different than RocksDB's. Specifically, RocksDB
		// sets MinUnflushedLogNum to max-recovered-log-num + 1. We set it to the
		// newLogNum. There should be no difference in using either value.
		ve.MinUnflushedLogNum = newLogNum

		// Create the manifest with the updated MinUnflushedLogNum before
		// creating the new log file. If we created the log file first, a
		// crash before the manifest is synced could leave two WALs with
		// unclean tails.
		d.mu.versions.logLock()
		if err := d.mu.versions.logAndApply(jobID, &ve, newFileMetrics(ve.NewFiles), false /* forceRotation */, func() []compactionInfo {
			return nil
		}); err != nil {
			return nil, err
		}

		newLogName := base.MakeFilepath(opts.FS, d.walDirname, fileTypeLog, newLogNum)
		d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum, fileSize: 0})
		logFile, err := opts.FS.Create(newLogName)
		if err != nil {
			return nil, err
		}
		if err := d.walDir.Sync(); err != nil {
			return nil, err
		}
		d.opts.EventListener.WALCreated(WALCreateInfo{
			JobID:   jobID,
			Path:    newLogName,
			FileNum: newLogNum,
		})
		// This isn't strictly necessary as we don't use the log number for
		// memtables being flushed, only for the next unflushed memtable.
		d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum = newLogNum

		logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{
			NoSyncOnClose:   d.opts.NoSyncOnClose,
			BytesPerSync:    d.opts.WALBytesPerSync,
			PreallocateSize: d.walPreallocateSize(),
		})
		d.mu.log.LogWriter = record.NewLogWriter(logFile, newLogNum)
		d.mu.log.LogWriter.SetMinSyncInterval(d.opts.WALMinSyncInterval)
		d.mu.versions.metrics.WAL.Files++
	}
	d.updateReadStateLocked(d.opts.DebugCheck)

	if !d.opts.ReadOnly {
		// Write the current options to disk.
		d.optionsFileNum = d.mu.versions.getNextFileNum()
		tmpPath := base.MakeFilepath(opts.FS, dirname, fileTypeTemp, d.optionsFileNum)
		optionsPath := base.MakeFilepath(opts.FS, dirname, fileTypeOptions, d.optionsFileNum)

		// Write them to a temporary file first, in case we crash before
		// we're done. A corrupt options file prevents opening the
		// database.
		optionsFile, err := opts.FS.Create(tmpPath)
		if err != nil {
			return nil, err
		}
		serializedOpts := []byte(opts.String())
		if _, err := optionsFile.Write(serializedOpts); err != nil {
			return nil, errors.CombineErrors(err, optionsFile.Close())
		}
		d.optionsFileSize = uint64(len(serializedOpts))
		if err := optionsFile.Sync(); err != nil {
			return nil, errors.CombineErrors(err, optionsFile.Close())
		}
		if err := optionsFile.Close(); err != nil {
			return nil, err
		}
		// Atomically rename to the OPTIONS-XXXXXX path. This rename is
		// guaranteed to be atomic because the destination path does not
		// exist.
		if err := opts.FS.Rename(tmpPath, optionsPath); err != nil {
			return nil, err
		}
		if err := d.dataDir.Sync(); err != nil {
			return nil, err
		}
	}

	if !d.opts.ReadOnly {
		d.scanObsoleteFiles(ls)
		d.deleteObsoleteFiles(jobID, true /* waitForOngoing */)
	} else {
		// All the log files are obsolete.
		d.mu.versions.metrics.WAL.Files = int64(len(logFiles))
	}
	d.mu.tableStats.cond.L = &d.mu.Mutex
	d.mu.tableValidation.cond.L = &d.mu.Mutex
	if !d.opts.ReadOnly && !d.opts.private.disableTableStats {
		d.maybeCollectTableStatsLocked()
	}
	d.calculateDiskAvailableBytes()

	d.maybeScheduleFlush(false)
	d.maybeScheduleCompaction()

	// Note: this is a no-op if invariants are disabled or race is enabled.
	//
	// Setting a finalizer on *DB causes *DB to never be reclaimed and the
	// finalizer to never be run. The problem is due to this limitation of
	// finalizers mentioned in the SetFinalizer docs:
	//
	//   If a cyclic structure includes a block with a finalizer, that cycle is
	//   not guaranteed to be garbage collected and the finalizer is not
	//   guaranteed to run, because there is no ordering that respects the
	//   dependencies.
	//
	// DB has cycles with several of its internal structures: readState,
	// newIters, tableCache, versions, etc. Each of these individually causes a
	// cycle and prevents the finalizer from being run. But we can work around
	// this finalizer limitation by setting a finalizer on another object that
	// is tied to the lifetime of DB: the DB.closed atomic.Value.
	dPtr := fmt.Sprintf("%p", d)
	invariants.SetFinalizer(d.closed, func(obj interface{}) {
		v := obj.(*atomic.Value)
		if err := v.Load(); err == nil {
			fmt.Fprintf(os.Stderr, "%s: unreferenced DB not closed\n", dPtr)
			os.Exit(1)
		}
	})

	d.fileLock, fileLock = fileLock, nil
	d.opts.Logger.Info("open bitalostable success")
	return d, nil
}

// GetVersion returns the engine version string from the latest options
// file present in dir. Used to check what Pebble or RocksDB version was last
// used to write to the database stored in this directory. An empty string is
// returned if no valid OPTIONS file with a version key was found.
func GetVersion(dir string, fs vfs.FS) (string, error) {
	ls, err := fs.List(dir)
	if err != nil {
		return "", err
	}
	var version string
	lastOptionsSeen := FileNum(0)
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(fs, filename)
		if !ok {
			continue
		}
		switch ft {
		case fileTypeOptions:
			// If this file has a higher number than the last options file
			// processed, reset version. This is because rocksdb often
			// writes multiple options files without deleting previous ones.
			// Otherwise, skip parsing this options file.
			if fn > lastOptionsSeen {
				version = ""
				lastOptionsSeen = fn
			} else {
				continue
			}
			f, err := fs.Open(fs.PathJoin(dir, filename))
			if err != nil {
				return "", err
			}
			data, err := ioutil.ReadAll(f)
			f.Close()

			if err != nil {
				return "", err
			}
			err = parseOptions(string(data), func(section, key, value string) error {
				switch {
				case section == "Version":
					switch key {
					case "bitalostable_version":
						version = value
					case "rocksdb_version":
						version = fmt.Sprintf("rocksdb v%s", value)
					}
				}
				return nil
			})
			if err != nil {
				return "", err
			}
		}
	}
	return version, nil
}

// replayWAL replays the edits in the specified log file.
//
// d.mu must be held when calling this, but the mutex may be dropped and
// re-acquired during the course of this method.
func (d *DB) replayWAL(
	jobID int, ve *versionEdit, fs vfs.FS, filename string, logNum FileNum, strictWALTail bool,
) (maxSeqNum uint64, err error) {
	file, err := fs.Open(filename)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	var (
		b               Batch
		buf             bytes.Buffer
		mem             *memTable
		entry           *flushableEntry
		toFlush         flushableList
		rr              = record.NewReader(file, logNum)
		offset          int64 // byte offset in rr
		lastFlushOffset int64
	)
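	// Each WAL record holds the serialized repr of one committed batch. Replay
	// reconstructs each batch and either applies it to a replay memtable or,
	// for large batches, wraps it directly in a flushableBatch.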

	if d.opts.ReadOnly {
		// In read-only mode, we replay directly into the mutable memtable which will
		// never be flushed.
		mem = d.mu.mem.mutable
		if mem != nil {
			entry = d.mu.mem.queue[len(d.mu.mem.queue)-1]
		}
	}

	// Flushes the current memtable, if not nil.
	flushMem := func() {
		if mem == nil {
			return
		}
		var logSize uint64
		if offset >= lastFlushOffset {
			logSize = uint64(offset - lastFlushOffset)
		}
		// Else, this was the initial memtable in the read-only case which must have
		// been empty, but we need to flush it since we don't want to add to it later.
		lastFlushOffset = offset
		entry.logSize = logSize
		if !d.opts.ReadOnly {
			toFlush = append(toFlush, entry)
		}
		mem, entry = nil, nil
	}
	// Creates a new memtable if there is no current memtable.
	ensureMem := func(seqNum uint64) {
		if mem != nil {
			return
		}
		mem, entry = d.newMemTable(logNum, seqNum)
		if d.opts.ReadOnly {
			d.mu.mem.mutable = mem
			d.mu.mem.queue = append(d.mu.mem.queue, entry)
		}
	}
	for {
		offset = rr.Offset()
		r, err := rr.Next()
		if err == nil {
			_, err = io.Copy(&buf, r)
		}
		if err != nil {
			// It is common to encounter a zeroed or invalid chunk due to WAL
			// preallocation and WAL recycling. We need to distinguish these
			// errors from EOF in order to recognize that the record was
			// truncated and to avoid replaying subsequent WALs, but want
			// to otherwise treat them like EOF.
			if err == io.EOF {
				break
			} else if record.IsInvalidRecord(err) && !strictWALTail {
				break
			}
			return 0, errors.Wrap(err, "bitalostable: error when replaying WAL")
		}

		if buf.Len() < batchHeaderLen {
			return 0, base.CorruptionErrorf("bitalostable: corrupt log file %q (num %s)",
				filename, errors.Safe(logNum))
		}

		// Specify Batch.db so that Batch.SetRepr will compute Batch.memTableSize
		// which is used below.
		b = Batch{db: d}
		b.SetRepr(buf.Bytes())
		seqNum := b.SeqNum()
		maxSeqNum = seqNum + uint64(b.Count())
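		// Each operation in the batch consumes one sequence number starting
		// at SeqNum(), so the next unassigned sequence number after this
		// batch is seqNum+Count().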

		if b.memTableSize >= uint64(d.largeBatchThreshold) {
			flushMem()
			// Make a copy of the data slice since it is currently owned by buf and will
			// be reused in the next iteration.
			b.data = append([]byte(nil), b.data...)
			b.flushable = newFlushableBatch(&b, d.opts.Comparer)
			entry := d.newFlushableEntry(b.flushable, logNum, b.SeqNum())
			// Disable memory accounting by adding a reader ref that will never be
			// removed.
			entry.readerRefs++
			if d.opts.ReadOnly {
				d.mu.mem.queue = append(d.mu.mem.queue, entry)
			} else {
				toFlush = append(toFlush, entry)
			}
		} else {
			ensureMem(seqNum)
			if err = mem.prepare(&b); err != nil && err != arenaskl.ErrArenaFull {
				return 0, err
			}
			// We loop since DB.newMemTable() slowly grows the size of allocated memtables, so the
			// batch may not initially fit, but will eventually fit (since it is smaller than
			// largeBatchThreshold).
			for err == arenaskl.ErrArenaFull {
				flushMem()
				ensureMem(seqNum)
				err = mem.prepare(&b)
				if err != nil && err != arenaskl.ErrArenaFull {
					return 0, err
				}
			}
			if err = mem.apply(&b, seqNum); err != nil {
				return 0, err
			}
			mem.writerUnref()
		}
		buf.Reset()
	}
	flushMem()
	// mem is nil here.
	if !d.opts.ReadOnly {
		c := newFlush(d.opts, d.mu.versions.currentVersion(),
			1 /* base level */, toFlush)
		newVE, _, err := d.runCompaction(jobID, c)
		if err != nil {
			return 0, err
		}
		ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...)
		for i := range toFlush {
			toFlush[i].readerUnref()
		}
	}
	return maxSeqNum, err
}

func checkOptions(opts *Options, path string) (strictWALTail bool, err error) {
	f, err := opts.FS.Open(path)
	if err != nil {
		return false, err
	}
	defer f.Close()

	data, err := ioutil.ReadAll(f)
	if err != nil {
		return false, err
	}
	return opts.checkOptions(string(data))
}

// DBDesc briefly describes high-level state about a database.
type DBDesc struct {
	// Exists is true if an existing database was found.
	Exists bool
	// FormatMajorVersion indicates the database's current format
	// version.
	FormatMajorVersion FormatMajorVersion
	// ManifestFilename is the filename of the current active manifest,
	// if the database exists.
	ManifestFilename string
}

// Peek looks for an existing database in dirname on the provided FS. It
// returns a brief description of the database. Peek is read-only and
// does not open the database.
func Peek(dirname string, fs vfs.FS) (*DBDesc, error) {
	vers, versMarker, err := lookupFormatMajorVersion(fs, dirname)
	if err != nil {
		return nil, err
	}
	// TODO(jackson): Immediately closing the marker is clunky. Add a
	// PeekMarker variant that avoids opening the directory.
	if err := versMarker.Close(); err != nil {
		return nil, err
	}

	// Find the currently active manifest, if there is one.
	manifestMarker, manifestFileNum, exists, err := findCurrentManifest(vers, fs, dirname)
	if err != nil {
		return nil, err
	}
	// TODO(jackson): Immediately closing the marker is clunky. Add a
	// PeekMarker variant that avoids opening the directory.
	if err := manifestMarker.Close(); err != nil {
		return nil, err
	}

	desc := &DBDesc{
		Exists:             exists,
		FormatMajorVersion: vers,
	}
	if exists {
		desc.ManifestFilename = base.MakeFilepath(fs, dirname, fileTypeManifest, manifestFileNum)
	}
	return desc, nil
}
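
// A minimal usage sketch for Peek (hypothetical path):
//
//	desc, err := bitalostable.Peek("demo-db", vfs.Default)
//	if err == nil && desc.Exists {
//		fmt.Println(desc.ManifestFilename)
//	}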