github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/format_major_version.go (about)

     1  // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"fmt"
     9  	"strconv"
    10  
    11  	"github.com/cockroachdb/errors"
    12  	"github.com/cockroachdb/pebble/internal/base"
    13  	"github.com/cockroachdb/pebble/internal/manifest"
    14  	"github.com/cockroachdb/pebble/sstable"
    15  	"github.com/cockroachdb/pebble/vfs"
    16  	"github.com/cockroachdb/pebble/vfs/atomicfs"
    17  )
    18  
    19  // FormatMajorVersion is a constant controlling the format of persisted
    20  // data. Backwards incompatible changes to durable formats are gated
    21  // behind new format major versions.
    22  //
    23  // At any point, a database's format major version may be bumped.
    24  // However, once a database's format major version is increased,
    25  // previous versions of Pebble will refuse to open the database.
    26  //
    27  // The zero value format is the FormatDefault constant. The exact
    28  // FormatVersion that the default corresponds to may change with time.
    29  type FormatMajorVersion uint64
    30  
    31  // SafeValue implements redact.SafeValue.
    32  func (v FormatMajorVersion) SafeValue() {}
    33  
    34  // String implements fmt.Stringer.
    35  func (v FormatMajorVersion) String() string {
    36  	// NB: This must not change. It's used as the value for the on-disk
    37  	// version marker file.
    38  	//
    39  	// Specifically, this value must always parse as a base 10 integer
    40  	// that fits in a uint64. We format it as zero-padded, 3-digit
    41  	// number today, but the padding may change.
    42  	return fmt.Sprintf("%03d", v)
    43  }
    44  
// The format major versions, in ascending order. Each constant's numeric
// value is assigned by iota, so it is determined by the constant's position
// in this block. FormatMajorVersion.String formats that value for the
// on-disk version marker and lookupFormatMajorVersion parses it back.
const (
	// 21.2 versions.

	// FormatDefault leaves the format version unspecified. The
	// FormatDefault constant may be ratcheted upwards over time.
	// (Numeric value 0.)
	FormatDefault FormatMajorVersion = iota
	// FormatMostCompatible maintains the most backwards compatibility,
	// maintaining bi-directional compatibility with RocksDB 6.2.1 in
	// the particular configuration described in the Pebble README.
	// (Numeric value 1.)
	FormatMostCompatible
	// formatVersionedManifestMarker is the first
	// backwards-incompatible change made to Pebble, introducing the
	// format-version marker file for handling backwards-incompatible
	// changes more broadly, and replacing the `CURRENT` file with a
	// marker file.
	//
	// This format version is intended as an intermediary version state.
	// It is deliberately unexported to discourage direct use of this
	// format major version. Clients should use FormatVersioned which
	// also ensures earlier versions of Pebble fail to open a database
	// written in a future format major version.
	// (Numeric value 2.)
	formatVersionedManifestMarker
	// FormatVersioned is a new format major version that replaces the
	// old `CURRENT` file with a new 'marker' file scheme. Previous
	// Pebble versions will be unable to open the database unless
	// they're aware of format versions.
	// (Numeric value 3.)
	FormatVersioned
	// FormatSetWithDelete is a format major version that introduces a new key
	// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
	// unable to open this database.
	// (Numeric value 4.)
	FormatSetWithDelete

	// 22.1 versions.

	// FormatBlockPropertyCollector is a format major version that introduces
	// BlockPropertyCollectors.
	// (Numeric value 5.)
	FormatBlockPropertyCollector
	// FormatSplitUserKeysMarked is a format major version that guarantees that
	// all files that share user keys with neighbors are marked for compaction
	// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
	// (without holding mutexes) until the scan of the LSM is complete and the
	// manifest has been rotated.
	// (Numeric value 6.)
	FormatSplitUserKeysMarked

	// 22.2 versions.

	// FormatSplitUserKeysMarkedCompacted is a format major version that
	// guarantees that all files explicitly marked for compaction in the manifest
	// have been compacted. Combined with the FormatSplitUserKeysMarked format
	// major version, this version guarantees that there are no user keys split
	// across multiple files within a level L1+. Ratcheting to this format version
	// will block (without holding mutexes) until all necessary compactions for
	// files marked for compaction are complete.
	// (Numeric value 7.)
	FormatSplitUserKeysMarkedCompacted
	// FormatRangeKeys is a format major version that introduces range keys.
	// (Numeric value 8.)
	FormatRangeKeys
	// FormatMinTableFormatPebblev1 is a format major version that guarantees that
	// tables created by or ingested into the DB at or above this format major
	// version will have a table format version of at least Pebblev1 (Block
	// Properties).
	// (Numeric value 9.)
	FormatMinTableFormatPebblev1
	// FormatPrePebblev1Marked is a format major version that guarantees that all
	// sstables with a table format version pre-Pebblev1 (i.e. those that are
	// guaranteed to not contain block properties) are marked for compaction in
	// the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
	// holding mutexes) until the scan of the LSM is complete and the manifest has
	// been rotated.
	// (Numeric value 10.)
	FormatPrePebblev1Marked

	// 23.1 versions.

	// formatUnusedPrePebblev1MarkedCompacted is an unused format major version.
	// This format major version was originally intended to ship in the 23.1
	// release. It was later decided that this should be deferred until a
	// subsequent release. The original ordering is preserved so as not to
	// introduce breaking changes in Cockroach.
	// (Numeric value 11.)
	formatUnusedPrePebblev1MarkedCompacted

	// FormatSSTableValueBlocks is a format major version that adds support for
	// storing values in value blocks in the sstable. Value block support is not
	// necessarily enabled when writing sstables, when running with this format
	// major version.
	//
	// WARNING: In development, so no production code should upgrade to this
	// format, since a DB with this format major version will not actually
	// interoperate correctly with another DB with the same format major
	// version. This format major version is introduced so that tests can start
	// being executed up to this version. Note that these tests succeed despite
	// the incomplete support since they do not enable value blocks and use
	// TableFormatPebblev2.
	//
	// NOTE(review): MaxTableFormat in this file returns TableFormatPebblev3 for
	// this version, so the warning above may be stale -- confirm.
	// (Numeric value 12.)
	FormatSSTableValueBlocks

	// FormatFlushableIngest is a format major version that enables lazy
	// addition of ingested sstables into the LSM structure. When an ingest
	// overlaps with a memtable, a record of the ingest is written to the WAL
	// without waiting for a flush. Subsequent reads treat the ingested files as
	// a level above the overlapping memtable. Once the memtable is flushed, the
	// ingested files are moved into the lowest possible levels.
	//
	// This feature is behind a format major version because it required
	// breaking changes to the WAL format.
	// (Numeric value 13.)
	FormatFlushableIngest

	// 23.2 versions.

	// FormatPrePebblev1MarkedCompacted is a format major version that guarantees
	// that all sstables explicitly marked for compaction in the manifest (see
	// FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
	// version will block (without holding mutexes) until all necessary
	// compactions for files marked for compaction are complete.
	// (Numeric value 14.)
	FormatPrePebblev1MarkedCompacted

	// FormatDeleteSizedAndObsolete is a format major version that adds support
	// for deletion tombstones that encode the size of the value they're
	// expected to delete. This format major version is required before the
	// associated key kind may be committed through batch applications or
	// ingests. It also adds support for keys that are marked obsolete (see
	// sstable/format.go for details).
	// (Numeric value 15.)
	FormatDeleteSizedAndObsolete

	// FormatVirtualSSTables is a format major version that adds support for
	// virtual sstables that can reference a sub-range of keys in an underlying
	// physical sstable. This information is persisted through new,
	// backward-incompatible fields in the Manifest, and therefore requires
	// a format major version.
	// (Numeric value 16.)
	FormatVirtualSSTables

	// internalFormatNewest holds the newest format major version, including
	// experimental ones excluded from the exported FormatNewest constant until
	// they've stabilized. Used in tests.
	//
	// The expression `iota - 1` evaluates to the value of the constant declared
	// immediately above this one, so this tracks the last version in the block
	// as long as new versions are inserted before this line.
	internalFormatNewest FormatMajorVersion = iota - 1

	// FormatNewest always contains the most recent format major version.
	FormatNewest FormatMajorVersion = internalFormatNewest
)
   180  
   181  // MaxTableFormat returns the maximum sstable.TableFormat that can be used at
   182  // this FormatMajorVersion.
   183  func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
   184  	switch v {
   185  	case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker,
   186  		FormatVersioned, FormatSetWithDelete:
   187  		return sstable.TableFormatRocksDBv2
   188  	case FormatBlockPropertyCollector, FormatSplitUserKeysMarked,
   189  		FormatSplitUserKeysMarkedCompacted:
   190  		return sstable.TableFormatPebblev1
   191  	case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked,
   192  		formatUnusedPrePebblev1MarkedCompacted:
   193  		return sstable.TableFormatPebblev2
   194  	case FormatSSTableValueBlocks, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted:
   195  		return sstable.TableFormatPebblev3
   196  	case FormatDeleteSizedAndObsolete, FormatVirtualSSTables:
   197  		return sstable.TableFormatPebblev4
   198  	default:
   199  		panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
   200  	}
   201  }
   202  
   203  // MinTableFormat returns the minimum sstable.TableFormat that can be used at
   204  // this FormatMajorVersion.
   205  func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat {
   206  	switch v {
   207  	case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker,
   208  		FormatVersioned, FormatSetWithDelete, FormatBlockPropertyCollector,
   209  		FormatSplitUserKeysMarked, FormatSplitUserKeysMarkedCompacted,
   210  		FormatRangeKeys:
   211  		return sstable.TableFormatLevelDB
   212  	case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked,
   213  		formatUnusedPrePebblev1MarkedCompacted, FormatSSTableValueBlocks,
   214  		FormatFlushableIngest, FormatPrePebblev1MarkedCompacted,
   215  		FormatDeleteSizedAndObsolete, FormatVirtualSSTables:
   216  		return sstable.TableFormatPebblev1
   217  	default:
   218  		panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
   219  	}
   220  }
   221  
   222  // orderingInvariants returns an enum encoding the set of invariants that must
   223  // hold within the receiver format major version. Invariants only get stricter
   224  // as the format major version advances, so it is okay to retrieve the
   225  // invariants from the current format major version and by the time the
   226  // invariants are enforced, the format major version has advanced.
   227  func (v FormatMajorVersion) orderingInvariants() manifest.OrderingInvariants {
   228  	if v < FormatSplitUserKeysMarkedCompacted {
   229  		return manifest.AllowSplitUserKeys
   230  	}
   231  	return manifest.ProhibitSplitUserKeys
   232  }
   233  
// formatMajorVersionMigrations defines the migrations from one format
// major version to the next. Each migration is defined as a closure
// which will be invoked on the database before the new format major
// version is committed. Migrations must be idempotent. Migrations are
// invoked with d.mu locked.
//
// The map is keyed by the version being migrated *to*:
// ratchetFormatMajorVersionLocked looks up and runs the entry for each
// successive version above the database's current one.
//
// Each migration is responsible for invoking finalizeFormatVersUpgrade
// to set the new format major version. ratchetFormatMajorVersionLocked
// reports a fatal error through the DB's Logger if a migration returns a
// nil error but fails to finalize the new format major version.
var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
	// NOTE(review): this entry does not finalize the upgrade. Presumably it
	// is never invoked, since lookupFormatMajorVersion reports at least
	// FormatMostCompatible for an existing database -- confirm.
	FormatMostCompatible: func(d *DB) error { return nil },
	formatVersionedManifestMarker: func(d *DB) error {
		// formatVersionedManifestMarker introduces the use of a marker
		// file for pointing to the current MANIFEST file.

		// Lock the manifest.
		d.mu.versions.logLock()
		defer d.mu.versions.logUnlock()

		// Construct the filename of the currently active manifest and
		// move the manifest marker to that filename. The marker is
		// guaranteed to exist, because we unconditionally locate it
		// during Open.
		manifestFileNum := d.mu.versions.manifestFileNum
		filename := base.MakeFilename(fileTypeManifest, manifestFileNum.DiskFileNum())
		if err := d.mu.versions.manifestMarker.Move(filename); err != nil {
			return errors.Wrap(err, "moving manifest marker")
		}

		// Now that we have a manifest marker file in place and pointing
		// to the current MANIFEST, finalize the upgrade. If we fail for
		// some reason, a retry of this migration is guaranteed to again
		// move the manifest marker file to the latest manifest. If
		// we're unable to finalize the upgrade, a subsequent call to
		// Open will ignore the manifest marker.
		if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil {
			return err
		}

		// We've finalized the upgrade. All subsequent Open calls will
		// ignore the CURRENT file and instead read the manifest marker.
		// Before we unlock the manifest, we need to update versionSet
		// to use the manifest marker on future rotations.
		d.mu.versions.setCurrent = setCurrentFuncMarker(
			d.mu.versions.manifestMarker,
			d.mu.versions.fs,
			d.mu.versions.dirname)
		return nil
	},
	// The FormatVersioned version is split into two, each with their
	// own migration to ensure the post-migration cleanup happens even
	// if there's a crash immediately after finalizing the version. Once
	// a new format major version is finalized, its migration will never
	// run again. Post-migration cleanup like the one in the migration
	// below must be performed in a separate migration or every time the
	// database opens.
	FormatVersioned: func(d *DB) error {
		// Replace the `CURRENT` file with one that points to the
		// nonexistent `MANIFEST-000000` file. If an earlier Pebble
		// version that does not know about format major versions
		// attempts to open the database, it will error avoiding
		// accidental corruption.
		if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, base.FileNum(0).DiskFileNum()); err != nil {
			return err
		}
		return d.finalizeFormatVersUpgrade(FormatVersioned)
	},
	// As SetWithDelete is a new key kind, there is nothing to migrate. We can
	// simply finalize the format version and we're done.
	FormatSetWithDelete: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatSetWithDelete)
	},
	// No data migration is performed here; the version is finalized directly.
	FormatBlockPropertyCollector: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector)
	},
	FormatSplitUserKeysMarked: func(d *DB) error {
		// Mark any unmarked files with split-user keys. Note all format major
		// versions migrations are invoked with DB.mu locked.
		if err := d.markFilesLocked(markFilesWithSplitUserKeys(d.opts.Comparer.Equal)); err != nil {
			return err
		}
		return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarked)
	},
	FormatSplitUserKeysMarkedCompacted: func(d *DB) error {
		// Before finalizing the format major version, rewrite any sstables
		// still marked for compaction. Note all format major versions
		// migrations are invoked with DB.mu locked.
		if err := d.compactMarkedFilesLocked(); err != nil {
			return err
		}
		return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarkedCompacted)
	},
	// No data migration is performed here; the version is finalized directly.
	FormatRangeKeys: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatRangeKeys)
	},
	// No data migration is performed here; the version is finalized directly.
	FormatMinTableFormatPebblev1: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatMinTableFormatPebblev1)
	},
	FormatPrePebblev1Marked: func(d *DB) error {
		// Mark any unmarked files that contain only table properties. Note all
		// format major versions migrations are invoked with DB.mu locked.
		if err := d.markFilesLocked(markFilesPrePebblev1(d.tableCache)); err != nil {
			return err
		}
		return d.finalizeFormatVersUpgrade(FormatPrePebblev1Marked)
	},
	formatUnusedPrePebblev1MarkedCompacted: func(d *DB) error {
		// Intentional no-op.
		return d.finalizeFormatVersUpgrade(formatUnusedPrePebblev1MarkedCompacted)
	},
	// No data migration is performed here; the version is finalized directly.
	FormatSSTableValueBlocks: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks)
	},
	// No data migration is performed here; the version is finalized directly.
	FormatFlushableIngest: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatFlushableIngest)
	},
	FormatPrePebblev1MarkedCompacted: func(d *DB) error {
		// Before finalizing the format major version, rewrite any sstables
		// still marked for compaction. Note all format major versions
		// migrations are invoked with DB.mu locked.
		if err := d.compactMarkedFilesLocked(); err != nil {
			return err
		}
		return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted)
	},
	// No data migration is performed here; the version is finalized directly.
	FormatDeleteSizedAndObsolete: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete)
	},
	// No data migration is performed here; the version is finalized directly.
	FormatVirtualSSTables: func(d *DB) error {
		return d.finalizeFormatVersUpgrade(FormatVirtualSSTables)
	},
}
   367  
// formatVersionMarkerName is the marker name that lookupFormatMajorVersion
// passes to atomicfs.LocateMarker to find the marker recording the
// database's format major version.
const formatVersionMarkerName = `format-version`
   369  
   370  func lookupFormatMajorVersion(
   371  	fs vfs.FS, dirname string,
   372  ) (FormatMajorVersion, *atomicfs.Marker, error) {
   373  	m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName)
   374  	if err != nil {
   375  		return 0, nil, err
   376  	}
   377  	if versString == "" {
   378  		return FormatMostCompatible, m, nil
   379  	}
   380  	v, err := strconv.ParseUint(versString, 10, 64)
   381  	if err != nil {
   382  		return 0, nil, errors.Wrap(err, "parsing format major version")
   383  	}
   384  	vers := FormatMajorVersion(v)
   385  	if vers == FormatDefault {
   386  		return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers)
   387  	}
   388  	if vers > internalFormatNewest {
   389  		return 0, nil, errors.Newf("pebble: database %q written in format major version %d", dirname, vers)
   390  	}
   391  	return vers, m, nil
   392  }
   393  
   394  // FormatMajorVersion returns the database's active format major
   395  // version. The format major version may be higher than the one
   396  // provided in Options when the database was opened if the existing
   397  // database was written with a higher format version.
   398  func (d *DB) FormatMajorVersion() FormatMajorVersion {
   399  	return FormatMajorVersion(d.mu.formatVers.vers.Load())
   400  }
   401  
   402  // RatchetFormatMajorVersion ratchets the opened database's format major
   403  // version to the provided version. It errors if the provided format
   404  // major version is below the database's current version. Once a
   405  // database's format major version is upgraded, previous Pebble versions
   406  // that do not know of the format version will be unable to open the
   407  // database.
   408  func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
   409  	if err := d.closed.Load(); err != nil {
   410  		panic(err)
   411  	}
   412  
   413  	d.mu.Lock()
   414  	defer d.mu.Unlock()
   415  	return d.ratchetFormatMajorVersionLocked(fmv)
   416  }
   417  
   418  func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
   419  	if d.opts.ReadOnly {
   420  		return ErrReadOnly
   421  	}
   422  	if formatVers > internalFormatNewest {
   423  		// Guard against accidentally forgetting to update internalFormatNewest.
   424  		return errors.Errorf("pebble: unknown format version %d", formatVers)
   425  	}
   426  	if currentVers := d.FormatMajorVersion(); currentVers > formatVers {
   427  		return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
   428  			currentVers, formatVers)
   429  	}
   430  	if d.mu.formatVers.ratcheting {
   431  		return errors.Newf("pebble: database format major version upgrade is in-progress")
   432  	}
   433  	d.mu.formatVers.ratcheting = true
   434  	defer func() { d.mu.formatVers.ratcheting = false }()
   435  
   436  	for nextVers := d.FormatMajorVersion() + 1; nextVers <= formatVers; nextVers++ {
   437  		if err := formatMajorVersionMigrations[nextVers](d); err != nil {
   438  			return errors.Wrapf(err, "migrating to version %d", nextVers)
   439  		}
   440  
   441  		// NB: The migration is responsible for calling
   442  		// finalizeFormatVersUpgrade to finalize the upgrade. This
   443  		// structure is necessary because some migrations may need to
   444  		// update in-memory state (without ever dropping locks) after
   445  		// the upgrade is finalized. Here we assert that the upgrade
   446  		// did occur.
   447  		if d.FormatMajorVersion() != nextVers {
   448  			d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", nextVers)
   449  		}
   450  	}
   451  	return nil
   452  }
   453  
   454  // finalizeFormatVersUpgrade is typically only be called from within a
   455  // format major version migration.
   456  //
   457  // See formatMajorVersionMigrations.
   458  func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
   459  	// We use the marker to encode the active format version in the
   460  	// marker filename. Unlike other uses of the atomic marker, there is
   461  	// no file with the filename `formatVers.String()` on the
   462  	// filesystem.
   463  	if err := d.mu.formatVers.marker.Move(formatVers.String()); err != nil {
   464  		return err
   465  	}
   466  	d.mu.formatVers.vers.Store(uint64(formatVers))
   467  	d.opts.EventListener.FormatUpgrade(formatVers)
   468  	return nil
   469  }
   470  
// compactMarkedFilesLocked performs a migration that schedules rewrite
// compactions to compact away any sstables marked for compaction.
// compactMarkedFilesLocked is run while ratcheting the database's format major
// version to FormatSplitUserKeysMarkedCompacted or to
// FormatPrePebblev1MarkedCompacted (see formatMajorVersionMigrations).
//
// Note that while this method is called with the DB.mu held, and will not
// return until all marked files have been compacted, the mutex is dropped while
// waiting for compactions to complete (or for slots to free up).
//
// It returns the stored close error if the database is found to be closed
// while files remain marked. It panics if files remain marked but no
// compaction is in progress.
func (d *DB) compactMarkedFilesLocked() error {
	curr := d.mu.versions.currentVersion()
	// Loop until the current version reports no files marked for compaction.
	for curr.Stats.MarkedForCompaction > 0 {
		// Attempt to schedule a compaction to rewrite a file marked for
		// compaction.
		d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction {
			return picker.pickRewriteCompaction(env)
		})

		// The above attempt might succeed and schedule a rewrite compaction. Or
		// there might not be available compaction concurrency to schedule the
		// compaction.  Or compaction of the file might have already been in
		// progress. In any scenario, wait until there's some change in the
		// state of active compactions.

		// Before waiting, check that the database hasn't been closed. Trying to
		// schedule the compaction may have dropped d.mu while waiting for a
		// manifest write to complete. In that dropped interim, the database may
		// have been closed.
		if err := d.closed.Load(); err != nil {
			return err.(error)
		}

		// Some flush or compaction may have scheduled or completed while we waited
		// for the manifest lock in maybeScheduleCompactionPicker. Get the latest
		// Version before waiting on a compaction.
		curr = d.mu.versions.currentVersion()

		// Only wait on compactions if there are files still marked for compaction.
		// NB: Waiting on this condition variable drops d.mu while blocked.
		if curr.Stats.MarkedForCompaction > 0 {
			// With marked files remaining, some compaction is expected to be
			// running; otherwise the wait below could never be woken.
			if d.mu.compact.compactingCount == 0 {
				panic("expected a compaction of marked files in progress")
			}
			d.mu.compact.cond.Wait()
			// Refresh the current version again.
			curr = d.mu.versions.currentVersion()
		}
	}
	return nil
}
   520  
// findFilesFunc scans the LSM version v for files, returning true if at
// least one file was found. The returned array contains the matched files,
// if any, indexed by level. markFilesLocked invokes a findFilesFunc to pick
// the files it marks for compaction.
type findFilesFunc func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error)
   525  
   526  // markFilesWithSplitUserKeys scans the LSM's levels 1 through 6 for adjacent
   527  // files that contain the same user key. Such arrangements of files were
   528  // permitted in RocksDB and in Pebble up to SHA a860bbad.
   529  var markFilesWithSplitUserKeys = func(equal Equal) findFilesFunc {
   530  	return func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) {
   531  		// Files with split user keys are expected to be rare and performing key
   532  		// comparisons for every file within the LSM is expensive, so drop the
   533  		// database lock while scanning the file metadata.
   534  		for l := numLevels - 1; l > 0; l-- {
   535  			iter := v.Levels[l].Iter()
   536  			var prevFile *fileMetadata
   537  			var prevUserKey []byte
   538  			for f := iter.First(); f != nil; f = iter.Next() {
   539  				if prevUserKey != nil && equal(prevUserKey, f.Smallest.UserKey) {
   540  					// NB: We may append a file twice, once as prevFile and once
   541  					// as f. That's okay, and handled below.
   542  					files[l] = append(files[l], prevFile, f)
   543  					found = true
   544  				}
   545  				if f.Largest.IsExclusiveSentinel() {
   546  					prevUserKey = nil
   547  					prevFile = nil
   548  				} else {
   549  					prevUserKey = f.Largest.UserKey
   550  					prevFile = f
   551  				}
   552  			}
   553  		}
   554  		return
   555  	}
   556  }
   557  
   558  // markFilesPrePebblev1 scans the LSM for files that do not support block
   559  // properties (i.e. a table format version pre-Pebblev1).
   560  var markFilesPrePebblev1 = func(tc *tableCacheContainer) findFilesFunc {
   561  	return func(v *version) (found bool, files [numLevels][]*fileMetadata, err error) {
   562  		for l := numLevels - 1; l > 0; l-- {
   563  			iter := v.Levels[l].Iter()
   564  			for f := iter.First(); f != nil; f = iter.Next() {
   565  				if f.Virtual {
   566  					// Any physical sstable which has been virtualized must
   567  					// have already undergone this migration, and we don't
   568  					// need to worry about the virtual sstable themselves.
   569  					panic("pebble: unexpected virtual sstable during migration")
   570  				}
   571  				err = tc.withReader(
   572  					f.PhysicalMeta(), func(r *sstable.Reader) error {
   573  						tf, err := r.TableFormat()
   574  						if err != nil {
   575  							return err
   576  						}
   577  						if tf < sstable.TableFormatPebblev1 {
   578  							found = true
   579  							files[l] = append(files[l], f)
   580  						}
   581  						return nil
   582  					})
   583  				if err != nil {
   584  					return
   585  				}
   586  			}
   587  		}
   588  		return
   589  	}
   590  }
   591  
// markFilesLocked durably marks the files that match the given findFilesFunc
// for compaction.
//
// It is called with d.mu held. The mutex is released while findFn scans the
// LSM and re-acquired before returning. If findFn reports an error, that
// error is returned; if it finds no files, nil is returned without touching
// the manifest. Otherwise the matching files are marked in memory and a
// manifest rotation is forced so the marks are persisted; the result of that
// logAndApply call is returned.
func (d *DB) markFilesLocked(findFn findFilesFunc) error {
	// Reserve a job ID for the logAndApply call at the end.
	jobID := d.mu.nextJobID
	d.mu.nextJobID++

	// Acquire a read state to have a view of the LSM and a guarantee that none
	// of the referenced files will be deleted until we've unreferenced the read
	// state. Some findFilesFuncs may read the files, requiring they not be
	// deleted.
	rs := d.loadReadState()
	var (
		found bool
		files [numLevels][]*fileMetadata
		err   error
	)
	func() {
		// Deferred calls run in reverse order, so d.mu is re-acquired (below)
		// before unrefLocked runs.
		defer rs.unrefLocked()
		// Note the unusual locking: unlock, defer Lock(). The scan of the files in
		// the version does not need to block other operations that require the
		// DB.mu. Drop it for the scan, before re-acquiring it.
		d.mu.Unlock()
		defer d.mu.Lock()
		found, files, err = findFn(rs.current)
	}()
	if err != nil {
		return err
	}

	// The database lock has been acquired again by the defer within the above
	// anonymous function.
	if !found {
		// Nothing to do.
		return nil
	}

	// After scanning, if we found files to mark, we fetch the current state of
	// the LSM (which may have changed) and set MarkedForCompaction on the files,
	// and update the version's Stats.MarkedForCompaction count, which are both
	// protected by d.mu.

	// Lock the manifest for a coherent view of the LSM. The database lock has
	// been re-acquired by the defer within the above anonymous function.
	d.mu.versions.logLock()
	vers := d.mu.versions.currentVersion()
	for l, filesToMark := range files {
		if len(filesToMark) == 0 {
			continue
		}
		for _, f := range filesToMark {
			// Ignore files to be marked that have already been compacted or marked.
			// This also makes a file that appears more than once in filesToMark
			// count only once.
			if f.CompactionState == manifest.CompactionStateCompacted ||
				f.MarkedForCompaction {
				continue
			}
			// Else, mark the file for compaction in this version.
			vers.Stats.MarkedForCompaction++
			f.MarkedForCompaction = true
		}
		// The compaction picker uses the markedForCompactionAnnotator to
		// quickly find files marked for compaction, or to quickly determine
		// that there are no such files marked for compaction within a level.
		// A b-tree node may be annotated with an annotation recording that
		// there are no files marked for compaction within the node's subtree,
		// based on the assumption that it's static.
		//
		// Since we're marking files for compaction, these b-tree nodes'
		// annotations will be out of date. Clear the compaction-picking
		// annotation, so that it's recomputed the next time the compaction
		// picker looks for a file marked for compaction.
		vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{})
	}

	// The 'marked-for-compaction' bit is persisted in the MANIFEST file
	// metadata. We've already modified the in-memory file metadata, but the
	// manifest hasn't been updated. Force rotation to a new MANIFEST file,
	// which will write every file metadata to the new manifest file and ensure
	// that the now marked-for-compaction file metadata are persisted as marked.
	// NB: This call to logAndApply will unlock the MANIFEST, which we locked up
	// above before obtaining `vers`.
	return d.mu.versions.logAndApply(
		jobID,
		&manifest.VersionEdit{},
		map[int]*LevelMetrics{},
		true, /* forceRotation */
		func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) })
}