github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/manifest/version.go (about)

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	stdcmp "cmp"
    10  	"fmt"
    11  	"sort"
    12  	"strconv"
    13  	"strings"
    14  	"sync"
    15  	"sync/atomic"
    16  	"unicode"
    17  
    18  	"github.com/cockroachdb/errors"
    19  	"github.com/cockroachdb/pebble/internal/base"
    20  	"github.com/cockroachdb/pebble/internal/invariants"
    21  )
    22  
// Compare exports the base.Compare type: a user-key comparison function
// returning a negative, zero, or positive value (standard comparator
// contract, as used by Overlaps and the bound-extension methods below).
type Compare = base.Compare

// InternalKey exports the base.InternalKey type.
type InternalKey = base.InternalKey
    28  
// TableInfo contains the common information for table related events.
type TableInfo struct {
	// FileNum is the internal DB identifier for the table.
	FileNum base.FileNum
	// Size is the size of the file in bytes.
	Size uint64
	// Smallest is the smallest internal key in the table.
	Smallest InternalKey
	// Largest is the largest internal key in the table.
	Largest InternalKey
	// SmallestSeqNum is the smallest sequence number in the table.
	SmallestSeqNum uint64
	// LargestSeqNum is the largest sequence number in the table.
	LargestSeqNum uint64
}
    44  
// TableStats contains statistics on a table used for compaction heuristics,
// and export via Metrics.
type TableStats struct {
	// NumEntries is the total number of entries in the table.
	NumEntries uint64
	// NumDeletions is the number of point and range deletion entries in the
	// table.
	NumDeletions uint64
	// NumRangeKeySets is the total number of range key sets in the table.
	//
	// NB: If there's a chance that the sstable contains any range key sets,
	// then NumRangeKeySets must be > 0.
	NumRangeKeySets uint64
	// PointDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's point deletions by compacting them.
	PointDeletionsBytesEstimate uint64
	// RangeDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's range deletions by compacting them. This
	// estimate is at data-block granularity and is not updated if compactions
	// beneath the table reduce the amount of reclaimable disk space. It also
	// does not account for overlapping data in L0 and ignores L0 sublevels,
	// but the error that introduces is expected to be small.
	//
	// Tables in the bottommost level of the LSM may have a nonzero estimate if
	// snapshots or move compactions prevented the elision of their range
	// tombstones. A table in the bottommost level that was ingested into L6
	// will have a zero estimate, because the file's sequence numbers indicate
	// that the tombstone cannot drop any data contained within the file itself.
	RangeDeletionsBytesEstimate uint64
	// ValueBlocksSize is the total size of value blocks and value index block.
	ValueBlocksSize uint64
}
    76  
// boundType represents the type of key (point or range) present as the smallest
// and largest keys.
type boundType uint8

const (
	// NB: the zero value is deliberately left unused so that an unset bound
	// type is distinguishable from both valid values (see boundsMarker).
	boundTypePointKey boundType = iota + 1
	boundTypeRangeKey
)
    85  
// CompactionState is the compaction state of a file.
//
// The following shows the valid state transitions:
//
//	NotCompacting --> Compacting --> Compacted
//	      ^               |
//	      |               |
//	      +-------<-------+
//
// Input files to a compaction transition to Compacting when a compaction is
// picked. A file that has finished compacting typically transitions into the
// Compacted state, at which point it is effectively obsolete ("zombied") and
// will eventually be removed from the LSM. A file that has been move-compacted
// will transition from Compacting back into the NotCompacting state, signaling
// that the file may be selected for a subsequent compaction. A failed
// compaction will result in all input tables transitioning from Compacting to
// NotCompacting.
//
// This state is in-memory only. It is not persisted to the manifest.
type CompactionState uint8

// CompactionStates. The zero value is CompactionStateNotCompacting, so a
// freshly created file is considered not compacting. Transitions are
// validated by FileMetadata.SetCompactionState.
const (
	CompactionStateNotCompacting CompactionState = iota
	CompactionStateCompacting
	CompactionStateCompacted
)
   113  
   114  // String implements fmt.Stringer.
   115  func (s CompactionState) String() string {
   116  	switch s {
   117  	case CompactionStateNotCompacting:
   118  		return "NotCompacting"
   119  	case CompactionStateCompacting:
   120  		return "Compacting"
   121  	case CompactionStateCompacted:
   122  		return "Compacted"
   123  	default:
   124  		panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
   125  	}
   126  }
   127  
// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
// some version. FileMetadata does not contain the actual level of the sst,
// since such leveled-ssts can move across levels in different versions, while
// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is FileBacking. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete
// and can be deleted once it is no longer required by any physical or virtual
// sst in any version.
//
// We maintain some invariants:
//
//  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
//     and there will be exactly one FileMetadata associated with the FileNum.
//
//  2. Within a version, a backing-sst is either only referred to by one
//     physical sst or one or more virtual ssts.
//
//  3. Once a backing-sst is referred to by a virtual sst in the latest version,
//     it cannot go back to being referred to by a physical sst in any future
//     version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the file metadata associated with it. We will still maintain the
// FileBacking associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
type FileMetadata struct {
	// AllowedSeeks is used to determine if a file should be picked for
	// a read triggered compaction. It is decremented when read sampling
	// in pebble.Iterator after every positioning operation that returns
	// a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
	AllowedSeeks atomic.Int64

	// statsValid indicates if stats have been loaded for the table. The
	// TableStats structure is populated only if valid is true. Accessed
	// via StatsValid/StatsMarkValid.
	statsValid atomic.Bool

	// FileBacking is the state which backs either a physical or virtual
	// sstables.
	FileBacking *FileBacking

	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// FileNum is the file number.
	//
	// INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
	FileNum base.FileNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANTS:
	// - When !FileMetadata.Virtual, Size == FileBacking.Size.
	// - Size should be non-zero. Size 0 virtual sstables must not be created.
	Size uint64
	// CreationTime is the file creation time in seconds since the epoch
	// (1970-01-01 00:00:00 UTC). For ingested sstables, this corresponds to
	// the time the file was ingested. For virtual sstables, this corresponds
	// to the wall clock time when the FileMetadata for the virtual sstable
	// was first created.
	CreationTime int64
	// SmallestSeqNum and LargestSeqNum are lower and upper bounds for the
	// smallest and largest sequence numbers in the table, across both point
	// and range keys. For physical sstables, these values are tight bounds.
	// For virtual sstables, there is no guarantee that there will be keys
	// with SmallestSeqNum or LargestSeqNum within virtual sstable bounds.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: these fields should be set using ExtendPointKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestPointKey InternalKey
	LargestPointKey  InternalKey
	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestRangeKey InternalKey
	LargestRangeKey  InternalKey
	// Smallest and Largest are the inclusive bounds for the internal keys stored
	// in the table, across both point and range keys.
	// NB: these fields are derived from their point and range key equivalents,
	// and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
	Smallest InternalKey
	Largest  InternalKey
	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats

	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
	// pick L0 compactions. Only accurate for the most recent Version.
	SubLevel         int
	L0Index          int
	minIntervalIndex int
	maxIntervalIndex int

	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// Compacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	// CompactionState tracks the file's position in the compaction state
	// machine; see the CompactionState type. Mutate via SetCompactionState.
	CompactionState CompactionState
	// MarkedForCompaction is true if compaction of this file has been
	// explicitly requested. Previously, RocksDB and earlier versions of
	// Pebble allowed this flag to be set by a user table property collector.
	// Some earlier versions of Pebble respected this flag, while other more
	// recent versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointsKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// boundsSet tracks whether the overall bounds (Smallest/Largest) have
	// been set; see extendOverallBounds.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// Virtual is true if the FileMetadata belongs to a virtual sstable.
	Virtual bool
}
   273  
// PhysicalFileMeta is used by functions which want a guarantee that their input
// belongs to a physical sst and not a virtual sst.
//
// NB: This type should only be constructed by calling
// FileMetadata.PhysicalMeta, which panics if the metadata is virtual.
type PhysicalFileMeta struct {
	*FileMetadata
}
   282  
// VirtualFileMeta is used by functions which want a guarantee that their input
// belongs to a virtual sst and not a physical sst.
//
// A VirtualFileMeta inherits all the same fields as a FileMetadata. These
// fields have additional invariants imposed on them, and/or slightly varying
// meanings:
//   - Smallest and Largest (and their counterparts
//     {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
//     key at that exact bound. We make the effort to determine the next smallest
//     or largest key in an sstable after virtualizing it, to maintain this
//     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
//     could mean that a rangedel or range key ends at that user key, or has been
//     truncated to that user key.
//   - One invariant is that if a rangedel or range key is truncated on its
//     upper bound, the virtual sstable *must* have a rangedel or range key
//     sentinel key as its upper bound. This is because truncation yields
//     an exclusive upper bound for the rangedel/rangekey, and if there are
//     any points at that exclusive upper bound within the same virtual
//     sstable, those could get uncovered by this truncation. We enforce this
//     invariant in calls to keyspan.Truncate.
//   - Size is an estimate of the size of the virtualized portion of this sstable.
//     The underlying file's size is stored in FileBacking.Size, though it could
//     also be estimated or could correspond to just the referenced portion of
//     a file (eg. if the file originated on another node).
//   - Size must be > 0.
//   - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
//     This means that all keys in the virtual sstable must have seqnums within
//     [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
//     a key with a seqnum at either of the bounds. Calculating tight seqnum
//     bounds would be too expensive and deliver little value.
//
// NB: This type should only be constructed by calling FileMetadata.VirtualMeta,
// which panics if the metadata is not virtual.
type VirtualFileMeta struct {
	*FileMetadata
}
   318  
   319  // PhysicalMeta should be the only source of creating the PhysicalFileMeta
   320  // wrapper type.
   321  func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
   322  	if m.Virtual {
   323  		panic("pebble: file metadata does not belong to a physical sstable")
   324  	}
   325  	return PhysicalFileMeta{
   326  		m,
   327  	}
   328  }
   329  
   330  // VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
   331  // type.
   332  func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
   333  	if !m.Virtual {
   334  		panic("pebble: file metadata does not belong to a virtual sstable")
   335  	}
   336  	return VirtualFileMeta{
   337  		m,
   338  	}
   339  }
   340  
// FileBacking either backs a single physical sstable, or one or more virtual
// sstables.
//
// See the comment above the FileMetadata type for sstable terminology.
type FileBacking struct {
	// refs is the reference count for the backing file on disk: incremented
	// when a physical or virtual sstable which is backed by the FileBacking
	// is added to a version and decremented when the version is
	// unreferenced. We ref count in order to determine when it is safe to
	// delete a backing sst file from disk. The backing file is obsolete
	// when the reference count falls to zero. Manipulated via
	// FileMetadata.Ref/Unref/Refs.
	refs atomic.Int32
	// latestVersionRefs are the references to the FileBacking in the
	// latest version. This reference can be through a single physical
	// sstable in the latest version, or one or more virtual sstables in the
	// latest version. Manipulated via FileMetadata.LatestRef/LatestUnref.
	//
	// INVARIANT: latestVersionRefs <= refs.
	latestVersionRefs atomic.Int32
	// VirtualizedSize is set iff the backing sst is only referred to by
	// virtual ssts in the latest version. VirtualizedSize is the sum of the
	// virtual sstable sizes of all of the virtual sstables in the latest
	// version which are backed by the physical sstable. When a virtual
	// sstable is removed from the latest version, we will decrement the
	// VirtualizedSize. During compaction picking, we'll compensate a
	// virtual sstable file size by
	// (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs.
	// The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize
	// is high, then the space amplification due to virtual sstables is
	// high, and we should pick the virtual sstable with a higher priority.
	//
	// TODO(bananabrick): Compensate the virtual sstable file size using
	// the VirtualizedSize during compaction picking and test.
	VirtualizedSize atomic.Uint64
	// DiskFileNum identifies the backing file on disk.
	DiskFileNum base.DiskFileNum
	// Size is the size of the backing file on disk, in bytes.
	Size uint64
}
   378  
   379  // InitPhysicalBacking allocates and sets the FileBacking which is required by a
   380  // physical sstable FileMetadata.
   381  //
   382  // Ensure that the state required by FileBacking, such as the FileNum, is
   383  // already set on the FileMetadata before InitPhysicalBacking is called.
   384  // Calling InitPhysicalBacking only after the relevant state has been set in the
   385  // FileMetadata is not necessary in tests which don't rely on FileBacking.
   386  func (m *FileMetadata) InitPhysicalBacking() {
   387  	if m.Virtual {
   388  		panic("pebble: virtual sstables should use a pre-existing FileBacking")
   389  	}
   390  	if m.FileBacking == nil {
   391  		m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()}
   392  	}
   393  }
   394  
   395  // InitProviderBacking creates a new FileBacking for a file backed by
   396  // an objstorage.Provider.
   397  func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) {
   398  	if !m.Virtual {
   399  		panic("pebble: provider-backed sstables must be virtual")
   400  	}
   401  	if m.FileBacking == nil {
   402  		m.FileBacking = &FileBacking{DiskFileNum: fileNum}
   403  	}
   404  }
   405  
   406  // ValidateVirtual should be called once the FileMetadata for a virtual sstable
   407  // is created to verify that the fields of the virtual sstable are sound.
   408  func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
   409  	if !m.Virtual {
   410  		panic("pebble: invalid virtual sstable")
   411  	}
   412  
   413  	if createdFrom.SmallestSeqNum != m.SmallestSeqNum {
   414  		panic("pebble: invalid smallest sequence number for virtual sstable")
   415  	}
   416  
   417  	if createdFrom.LargestSeqNum != m.LargestSeqNum {
   418  		panic("pebble: invalid largest sequence number for virtual sstable")
   419  	}
   420  
   421  	if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking {
   422  		panic("pebble: invalid physical sstable state for virtual sstable")
   423  	}
   424  
   425  	if m.Size == 0 {
   426  		panic("pebble: virtual sstable size must be set upon creation")
   427  	}
   428  }
   429  
   430  // Refs returns the refcount of backing sstable.
   431  func (m *FileMetadata) Refs() int32 {
   432  	return m.FileBacking.refs.Load()
   433  }
   434  
   435  // Ref increments the ref count associated with the backing sstable.
   436  func (m *FileMetadata) Ref() {
   437  	m.FileBacking.refs.Add(1)
   438  }
   439  
   440  // Unref decrements the ref count associated with the backing sstable.
   441  func (m *FileMetadata) Unref() int32 {
   442  	v := m.FileBacking.refs.Add(-1)
   443  	if invariants.Enabled && v < 0 {
   444  		panic("pebble: invalid FileMetadata refcounting")
   445  	}
   446  	return v
   447  }
   448  
   449  // LatestRef increments the latest ref count associated with the backing
   450  // sstable.
   451  func (m *FileMetadata) LatestRef() {
   452  	m.FileBacking.latestVersionRefs.Add(1)
   453  
   454  	if m.Virtual {
   455  		m.FileBacking.VirtualizedSize.Add(m.Size)
   456  	}
   457  }
   458  
   459  // LatestUnref decrements the latest ref count associated with the backing
   460  // sstable.
   461  func (m *FileMetadata) LatestUnref() int32 {
   462  	if m.Virtual {
   463  		m.FileBacking.VirtualizedSize.Add(-m.Size)
   464  	}
   465  
   466  	v := m.FileBacking.latestVersionRefs.Add(-1)
   467  	if invariants.Enabled && v < 0 {
   468  		panic("pebble: invalid FileMetadata latest refcounting")
   469  	}
   470  	return v
   471  }
   472  
   473  // LatestRefs returns the latest ref count associated with the backing sstable.
   474  func (m *FileMetadata) LatestRefs() int32 {
   475  	return m.FileBacking.latestVersionRefs.Load()
   476  }
   477  
   478  // SetCompactionState transitions this file's compaction state to the given
   479  // state. Protected by DB.mu.
   480  func (m *FileMetadata) SetCompactionState(to CompactionState) {
   481  	if invariants.Enabled {
   482  		transitionErr := func() error {
   483  			return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
   484  		}
   485  		switch m.CompactionState {
   486  		case CompactionStateNotCompacting:
   487  			if to != CompactionStateCompacting {
   488  				panic(transitionErr())
   489  			}
   490  		case CompactionStateCompacting:
   491  			if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
   492  				panic(transitionErr())
   493  			}
   494  		case CompactionStateCompacted:
   495  			panic(transitionErr())
   496  		default:
   497  			panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
   498  		}
   499  	}
   500  	m.CompactionState = to
   501  }
   502  
   503  // IsCompacting returns true if this file's compaction state is
   504  // CompactionStateCompacting. Protected by DB.mu.
   505  func (m *FileMetadata) IsCompacting() bool {
   506  	return m.CompactionState == CompactionStateCompacting
   507  }
   508  
   509  // StatsValid returns true if the table stats have been populated. If StatValid
   510  // returns true, the Stats field may be read (with or without holding the
   511  // database mutex).
   512  func (m *FileMetadata) StatsValid() bool {
   513  	return m.statsValid.Load()
   514  }
   515  
   516  // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
   517  // while populating TableStats and calling StatsMarkValud. Once stats are
   518  // populated, they must not be mutated.
   519  func (m *FileMetadata) StatsMarkValid() {
   520  	m.statsValid.Store(true)
   521  }
   522  
   523  // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
   524  // and overall table bounds with the given smallest and largest keys. The
   525  // smallest and largest bounds may not be extended if the table already has a
   526  // bound that is smaller or larger, respectively. The receiver is returned.
   527  // NB: calling this method should be preferred to manually setting the bounds by
   528  // manipulating the fields directly, to maintain certain invariants.
   529  func (m *FileMetadata) ExtendPointKeyBounds(
   530  	cmp Compare, smallest, largest InternalKey,
   531  ) *FileMetadata {
   532  	// Update the point key bounds.
   533  	if !m.HasPointKeys {
   534  		m.SmallestPointKey, m.LargestPointKey = smallest, largest
   535  		m.HasPointKeys = true
   536  	} else {
   537  		if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
   538  			m.SmallestPointKey = smallest
   539  		}
   540  		if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
   541  			m.LargestPointKey = largest
   542  		}
   543  	}
   544  	// Update the overall bounds.
   545  	m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
   546  	return m
   547  }
   548  
   549  // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
   550  // and overall table bounds with the given smallest and largest keys. The
   551  // smallest and largest bounds may not be extended if the table already has a
   552  // bound that is smaller or larger, respectively. The receiver is returned.
   553  // NB: calling this method should be preferred to manually setting the bounds by
   554  // manipulating the fields directly, to maintain certain invariants.
   555  func (m *FileMetadata) ExtendRangeKeyBounds(
   556  	cmp Compare, smallest, largest InternalKey,
   557  ) *FileMetadata {
   558  	// Update the range key bounds.
   559  	if !m.HasRangeKeys {
   560  		m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
   561  		m.HasRangeKeys = true
   562  	} else {
   563  		if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
   564  			m.SmallestRangeKey = smallest
   565  		}
   566  		if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
   567  			m.LargestRangeKey = largest
   568  		}
   569  	}
   570  	// Update the overall bounds.
   571  	m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
   572  	return m
   573  }
   574  
   575  // extendOverallBounds attempts to extend the overall table lower and upper
   576  // bounds. The given bounds may not be used if a lower or upper bound already
   577  // exists that is smaller or larger than the given keys, respectively. The given
   578  // boundType will be used if the bounds are updated.
   579  func (m *FileMetadata) extendOverallBounds(
   580  	cmp Compare, smallest, largest InternalKey, bTyp boundType,
   581  ) {
   582  	if !m.boundsSet {
   583  		m.Smallest, m.Largest = smallest, largest
   584  		m.boundsSet = true
   585  		m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
   586  	} else {
   587  		if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
   588  			m.Smallest = smallest
   589  			m.boundTypeSmallest = bTyp
   590  		}
   591  		if base.InternalCompare(cmp, largest, m.Largest) > 0 {
   592  			m.Largest = largest
   593  			m.boundTypeLargest = bTyp
   594  		}
   595  	}
   596  }
   597  
   598  // Overlaps returns true if the file key range overlaps with the given range.
   599  func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool {
   600  	if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) {
   601  		// f is completely before the specified range; no overlap.
   602  		return false
   603  	}
   604  	if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) {
   605  		// f is completely after the specified range; no overlap.
   606  		return false
   607  	}
   608  	return true
   609  }
   610  
   611  // ContainedWithinSpan returns true if the file key range completely overlaps with the
   612  // given range ("end" is assumed to exclusive).
   613  func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
   614  	lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end)
   615  	return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel()))
   616  }
   617  
   618  // ContainsKeyType returns whether or not the file contains keys of the provided
   619  // type.
   620  func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
   621  	switch kt {
   622  	case KeyTypePointAndRange:
   623  		return true
   624  	case KeyTypePoint:
   625  		return m.HasPointKeys
   626  	case KeyTypeRange:
   627  		return m.HasRangeKeys
   628  	default:
   629  		panic("unrecognized key type")
   630  	}
   631  }
   632  
   633  // SmallestBound returns the file's smallest bound of the key type. It returns a
   634  // false second return value if the file does not contain any keys of the key
   635  // type.
   636  func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
   637  	switch kt {
   638  	case KeyTypePointAndRange:
   639  		return &m.Smallest, true
   640  	case KeyTypePoint:
   641  		return &m.SmallestPointKey, m.HasPointKeys
   642  	case KeyTypeRange:
   643  		return &m.SmallestRangeKey, m.HasRangeKeys
   644  	default:
   645  		panic("unrecognized key type")
   646  	}
   647  }
   648  
   649  // LargestBound returns the file's largest bound of the key type. It returns a
   650  // false second return value if the file does not contain any keys of the key
   651  // type.
   652  func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
   653  	switch kt {
   654  	case KeyTypePointAndRange:
   655  		return &m.Largest, true
   656  	case KeyTypePoint:
   657  		return &m.LargestPointKey, m.HasPointKeys
   658  	case KeyTypeRange:
   659  		return &m.LargestRangeKey, m.HasRangeKeys
   660  	default:
   661  		panic("unrecognized key type")
   662  	}
   663  }
   664  
// Bit masks composed into the marker byte returned by boundsMarker.
const (
	// maskContainsPointKeys is set if the table contains any point keys.
	maskContainsPointKeys = 1 << 0
	// maskSmallest is set if the table's smallest bound is a point key.
	maskSmallest = 1 << 1
	// maskLargest is set if the table's largest bound is a point key.
	maskLargest = 1 << 2
)
   670  
   671  // boundsMarker returns a marker byte whose bits encode the following
   672  // information (in order from least significant bit):
   673  // - if the table contains point keys
   674  // - if the table's smallest key is a point key
   675  // - if the table's largest key is a point key
   676  func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
   677  	if m.HasPointKeys {
   678  		sentinel |= maskContainsPointKeys
   679  	}
   680  	switch m.boundTypeSmallest {
   681  	case boundTypePointKey:
   682  		sentinel |= maskSmallest
   683  	case boundTypeRangeKey:
   684  		// No op - leave bit unset.
   685  	default:
   686  		return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
   687  	}
   688  	switch m.boundTypeLargest {
   689  	case boundTypePointKey:
   690  		sentinel |= maskLargest
   691  	case boundTypeRangeKey:
   692  		// No op - leave bit unset.
   693  	default:
   694  		return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
   695  	}
   696  	return
   697  }
   698  
// String implements fmt.Stringer, printing the file number and the overall
// table bounds. See DebugString for a more verbose representation that also
// includes seqnums and per-key-type bounds.
func (m *FileMetadata) String() string {
	return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
}
   704  
   705  // DebugString returns a verbose representation of FileMetadata, typically for
   706  // use in tests and debugging, returning the file number and the point, range
   707  // and overall bounds for the table.
   708  func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
   709  	var b bytes.Buffer
   710  	fmt.Fprintf(&b, "%s:[%s-%s]",
   711  		m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
   712  	if !verbose {
   713  		return b.String()
   714  	}
   715  	fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
   716  	if m.HasPointKeys {
   717  		fmt.Fprintf(&b, " points:[%s-%s]",
   718  			m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
   719  	}
   720  	if m.HasRangeKeys {
   721  		fmt.Fprintf(&b, " ranges:[%s-%s]",
   722  			m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
   723  	}
   724  	return b.String()
   725  }
   726  
// ParseFileMetadataDebug parses a FileMetadata from its DebugString
// representation.
func ParseFileMetadataDebug(s string) (*FileMetadata, error) {
	// Split lines of the form:
	//  000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...]
	fields := strings.FieldsFunc(s, func(c rune) bool {
		switch c {
		case ':', '[', '-', ']':
			return true
		default:
			return unicode.IsSpace(c) // NB: also trim whitespace padding.
		}
	})
	// After splitting, each section contributes exactly three fields: a
	// prefix (file number, "seqnums", "points", or "ranges") followed by two
	// operands. Anything else is malformed.
	if len(fields)%3 != 0 {
		return nil, errors.Newf("malformed input: %s", s)
	}
	m := &FileMetadata{}
	// Consume fields in triples.
	for len(fields) > 0 {
		prefix := fields[0]
		if prefix == "seqnums" {
			smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum
			fields = fields[3:]
			continue
		}
		// All remaining triples carry a pair of pretty-printed internal keys.
		smallest := base.ParsePrettyInternalKey(fields[1])
		largest := base.ParsePrettyInternalKey(fields[2])
		switch prefix {
		case "points":
			m.SmallestPointKey, m.LargestPointKey = smallest, largest
			m.HasPointKeys = true
		case "ranges":
			m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
			m.HasRangeKeys = true
		default:
			// The prefix is the file number; the keys are the overall bounds.
			fileNum, err := strconv.ParseUint(prefix, 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.FileNum = base.FileNum(fileNum)
			m.Smallest, m.Largest = smallest, largest
			m.boundsSet = true
		}
		fields = fields[3:]
	}
	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
		m.HasPointKeys = true
	}
	m.InitPhysicalBacking()
	return m, nil
}
   789  
// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent. It checks that the file has at least one key type,
// that the overall, point and range key bounds are each internally ordered,
// that the per-type bounds fall within the overall bounds, that the seqnum
// bounds are ordered, and that FileBacking was initialized.
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.

	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.FileNum))
	}
	// Overall bounds must be ordered.
	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
			m.Largest.Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
	}

	// Point key validation.

	if m.HasPointKeys {
		// Point key bounds must be ordered, and contained within the overall
		// bounds.
		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
				m.LargestPointKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
			)
		}
	}

	// Range key validation.

	if m.HasRangeKeys {
		// Range key bounds must be ordered, and contained within the overall
		// bounds.
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
				m.LargestRangeKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
			)
		}
	}

	// Ensure that FileMetadata.Init was called.
	if m.FileBacking == nil {
		return base.CorruptionErrorf("file metadata FileBacking not set")
	}

	return nil
}
   856  
// TableInfo returns a subset of the FileMetadata state formatted as a
// TableInfo. Only the overall (combined point/range) bounds are copied; the
// per-key-type bounds are not part of TableInfo.
func (m *FileMetadata) TableInfo() TableInfo {
	return TableInfo{
		FileNum:        m.FileNum,
		Size:           m.Size,
		Smallest:       m.Smallest,
		Largest:        m.Largest,
		SmallestSeqNum: m.SmallestSeqNum,
		LargestSeqNum:  m.LargestSeqNum,
	}
}
   869  
   870  func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
   871  	// NB: This is the same ordering that RocksDB uses for L0 files.
   872  
   873  	// Sort first by largest sequence number.
   874  	if v := stdcmp.Compare(m.LargestSeqNum, b.LargestSeqNum); v != 0 {
   875  		return v
   876  	}
   877  	// Then by smallest sequence number.
   878  	if v := stdcmp.Compare(m.SmallestSeqNum, b.SmallestSeqNum); v != 0 {
   879  		return v
   880  	}
   881  	// Break ties by file number.
   882  	return stdcmp.Compare(m.FileNum, b.FileNum)
   883  }
   884  
// lessSeqNum reports whether m orders before b under cmpSeqNum, i.e. by
// (LargestSeqNum, SmallestSeqNum, FileNum).
func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
	return m.cmpSeqNum(b) < 0
}
   888  
// cmpSmallestKey orders two files by their smallest internal key, using the
// provided comparator for user keys.
func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
	return base.InternalCompare(cmp, m.Smallest, b.Smallest)
}
   892  
   893  // KeyRange returns the minimum smallest and maximum largest internalKey for
   894  // all the FileMetadata in iters.
   895  func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
   896  	first := true
   897  	for _, iter := range iters {
   898  		for meta := iter.First(); meta != nil; meta = iter.Next() {
   899  			if first {
   900  				first = false
   901  				smallest, largest = meta.Smallest, meta.Largest
   902  				continue
   903  			}
   904  			if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
   905  				smallest = meta.Smallest
   906  			}
   907  			if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
   908  				largest = meta.Largest
   909  			}
   910  		}
   911  	}
   912  	return smallest, largest
   913  }
   914  
// bySeqNum implements sort.Interface, ordering files by cmpSeqNum, i.e. by
// (LargestSeqNum, SmallestSeqNum, FileNum).
type bySeqNum []*FileMetadata

func (b bySeqNum) Len() int { return len(b) }
func (b bySeqNum) Less(i, j int) bool {
	return b[i].lessSeqNum(b[j])
}
func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
   922  
// SortBySeqNum sorts the specified files by increasing sequence number.
// Although sort.Sort is not stable, cmpSeqNum breaks ties by file number, so
// the resulting order is deterministic for distinct files.
func SortBySeqNum(files []*FileMetadata) {
	sort.Sort(bySeqNum(files))
}
   927  
// bySmallest implements sort.Interface, ordering files by their smallest
// internal key under the captured user-key comparator.
type bySmallest struct {
	files []*FileMetadata
	cmp   Compare
}

func (b bySmallest) Len() int { return len(b.files) }
func (b bySmallest) Less(i, j int) bool {
	return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
}
func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }
   938  
// SortBySmallest sorts the specified files by smallest key using the supplied
// comparison function to order user keys.
func SortBySmallest(files []*FileMetadata, cmp Compare) {
	sort.Sort(bySmallest{files, cmp})
}
   944  
// overlaps returns the subsequence of the level's files whose user key ranges
// intersect [start, end] (or [start, end) when exclusiveEnd is true), as a
// LevelSlice bounded by the first and last overlapping files. It assumes the
// iterated files are sorted and non-overlapping (i.e. a non-L0 level).
func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
	startIter := iter.Clone()
	{
		startIterFile := startIter.SeekGE(cmp, start)
		// SeekGE compares user keys. The user key `start` may be equal to the
		// f.Largest because f.Largest is a range deletion sentinel, indicating
		// that the user key `start` is NOT contained within the file f. If
		// that's the case, we can narrow the overlapping bounds to exclude the
		// file with the sentinel.
		if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&
			cmp(startIterFile.Largest.UserKey, start) == 0 {
			startIterFile = startIter.Next()
		}
		_ = startIterFile // Ignore unused assignment.
	}

	endIter := iter.Clone()
	{
		endIterFile := endIter.SeekGE(cmp, end)

		if !exclusiveEnd {
			// endIter is now pointing at the *first* file with a largest key >= end.
			// If there are multiple files including the user key `end`, we want all
			// of them, so move forward.
			for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 {
				endIterFile = endIter.Next()
			}
		}

		// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
		// or nexted too far because Largest.UserKey equaled `end`, go back.
		//
		// Consider !exclusiveEnd and end = 'f', with the following file bounds:
		//
		//     [b,d] [e, f] [f, f] [g, h]
		//
		// the above for loop will Next until it arrives at [g, h]. We need to
		// observe that g > f, and Prev to the file with bounds [f, f].
		if endIterFile == nil {
			endIterFile = endIter.Prev()
		} else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
			endIterFile = endIter.Prev()
		}
		_ = endIterFile // Ignore unused assignment.
	}
	// The slice is defined by the positions of the two iterators, not the
	// returned files themselves.
	return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter)
}
   992  
// NumLevels is the number of levels a Version contains (L0 through L6).
const NumLevels = 7
   995  
// NewVersion constructs a new Version with the provided files. It requires
// the provided files are already well-ordered. It's intended for testing.
// It panics if L0 sublevel initialization fails.
func NewVersion(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
) *Version {
	var v Version
	for l := range files {
		// NB: We specifically insert `files` into the B-Tree in the order
		// they appear within `files`. Some tests depend on this behavior in
		// order to test consistency checking, etc. Once we've constructed the
		// initial B-Tree, we swap out the btreeCmp for the correct one.
		// TODO(jackson): Adjust or remove the tests and remove this.
		v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
		v.Levels[l].level = l
		if l == 0 {
			// L0 is ordered by sequence number; other levels by smallest key.
			v.Levels[l].tree.cmp = btreeCmpSeqNum
		} else {
			v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp)
		}
		// Accumulate the level's aggregate file size.
		for _, f := range files[l] {
			v.Levels[l].totalSize += f.Size
		}
	}
	if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
		panic(err)
	}
	return &v
}
  1024  
// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are sorted by largest sequence number. Due to file
// ingestion, there may be overlap in the ranges of sequence numbers contain in
// level 0 sstables. In particular, it is valid for one level 0 sstable to have
// the seqnum range [1,100] while an adjacent sstable has the seqnum range
// [50,50]. This occurs when the [50,50] table was ingested and given a global
// seqnum. The ingestion code will have ensured that the [50,50] sstable will
// not have any keys that overlap with the [1,100] in the seqnum range
// [1,49]. The range of internal keys [fileMetadata.smallest,
// fileMetadata.largest] in each level 0 table may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.
type Version struct {
	// refs counts the live references to this version; manipulated via
	// Ref/Unref/UnrefLocked. When it drops to zero the version is removed
	// from its list and Deleted is invoked.
	refs atomic.Int32

	// The level 0 sstables are organized in a series of sublevels. Similar to
	// the seqnum invariant in normal levels, there is no internal key in a
	// higher level table that has both the same user key and a higher sequence
	// number. Within a sublevel, tables are sorted by their internal key range
	// and any two tables at the same sublevel do not overlap. Unlike the normal
	// levels, sublevel n contains older tables (lower sequence numbers) than
	// sublevel n+1.
	//
	// The L0Sublevels struct is mostly used for compaction picking. As most
	// internal data structures in it are only necessary for compaction picking
	// and not for iterator creation, the reference to L0Sublevels is nil'd
	// after this version becomes the non-newest version, to reduce memory
	// usage.
	//
	// L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
	// in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
	// a reference to that slice, as that slice is necessary for iterator
	// creation and needs to outlast L0Sublevels.
	L0Sublevels     *L0Sublevels
	L0SublevelFiles []LevelSlice

	// Levels holds the file metadata for each of the NumLevels levels.
	Levels [NumLevels]LevelMetadata

	// RangeKeyLevels holds a subset of the same files as Levels that contain range
	// keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
	// duplication should be minimal, as range keys are expected to be rare.
	RangeKeyLevels [NumLevels]LevelMetadata

	// The callback to invoke when the last reference to a version is
	// removed. Will be called with list.mu held.
	Deleted func(obsolete []*FileBacking)

	// Stats holds aggregated stats about the version maintained from
	// version to version.
	Stats struct {
		// MarkedForCompaction records the count of files marked for
		// compaction within the version.
		MarkedForCompaction int
	}

	// The list the version is linked into.
	list *VersionList

	// The next/prev link for the versionList doubly-linked list of versions.
	prev, next *Version
}
  1098  
// String implements fmt.Stringer, printing the FileMetadata for each level in
// the Version using the default key formatter and non-verbose file output.
func (v *Version) String() string {
	return v.string(base.DefaultFormatter, false)
}
  1104  
// DebugString returns an alternative format to String() which includes sequence
// number and kind information for the sstable boundaries.
func (v *Version) DebugString(format base.FormatKey) string {
	return v.string(format, true)
}
  1110  
  1111  func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string {
  1112  	var buf bytes.Buffer
  1113  	for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- {
  1114  		fmt.Fprintf(&buf, "0.%d:\n", sublevel)
  1115  		sublevels[sublevel].Each(func(f *FileMetadata) {
  1116  			fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
  1117  		})
  1118  	}
  1119  	return buf.String()
  1120  }
  1121  
  1122  func (v *Version) string(format base.FormatKey, verbose bool) string {
  1123  	var buf bytes.Buffer
  1124  	if len(v.L0SublevelFiles) > 0 {
  1125  		fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles))
  1126  	}
  1127  	for level := 1; level < NumLevels; level++ {
  1128  		if v.Levels[level].Empty() {
  1129  			continue
  1130  		}
  1131  		fmt.Fprintf(&buf, "%d:\n", level)
  1132  		iter := v.Levels[level].Iter()
  1133  		for f := iter.First(); f != nil; f = iter.Next() {
  1134  			fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
  1135  		}
  1136  	}
  1137  	return buf.String()
  1138  }
  1139  
  1140  // ParseVersionDebug parses a Version from its DebugString output.
  1141  func ParseVersionDebug(
  1142  	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string,
  1143  ) (*Version, error) {
  1144  	var level int
  1145  	var files [NumLevels][]*FileMetadata
  1146  	for _, l := range strings.Split(s, "\n") {
  1147  		l = strings.TrimSpace(l)
  1148  
  1149  		switch l[:2] {
  1150  		case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:":
  1151  			var err error
  1152  			level, err = strconv.Atoi(l[:1])
  1153  			if err != nil {
  1154  				return nil, err
  1155  			}
  1156  		default:
  1157  			m, err := ParseFileMetadataDebug(l)
  1158  			if err != nil {
  1159  				return nil, err
  1160  			}
  1161  			// If we only parsed overall bounds, default to setting the point bounds.
  1162  			if !m.HasPointKeys && !m.HasRangeKeys {
  1163  				m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
  1164  				m.HasPointKeys = true
  1165  			}
  1166  			files[level] = append(files[level], m)
  1167  		}
  1168  	}
  1169  	// Reverse the order of L0 files. This ensures we construct the same
  1170  	// sublevels. (They're printed from higher sublevel to lower, which means in
  1171  	// a partial order that represents newest to oldest).
  1172  	for i := 0; i < len(files[0])/2; i++ {
  1173  		files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i]
  1174  	}
  1175  	return NewVersion(cmp, formatKey, flushSplitBytes, files), nil
  1176  }
  1177  
// Refs returns the current number of references to the version.
func (v *Version) Refs() int32 {
	return v.refs.Load()
}
  1182  
// Ref increments the version refcount.
func (v *Version) Ref() {
	v.refs.Add(1)
}
  1187  
// Unref decrements the version refcount. If the last reference to the version
// was removed, the version is removed from the list of versions and the
// Deleted callback is invoked. Requires that the VersionList mutex is NOT
// locked.
func (v *Version) Unref() {
	if v.refs.Add(-1) == 0 {
		l := v.list
		// Acquire the list mutex before unlinking; Deleted is documented (on
		// the Version.Deleted field) to be called with list.mu held.
		l.mu.Lock()
		l.Remove(v)
		v.Deleted(v.unrefFiles())
		l.mu.Unlock()
	}
}
  1201  
// UnrefLocked decrements the version refcount. If the last reference to the
// version was removed, the version is removed from the list of versions and
// the Deleted callback is invoked. Requires that the VersionList mutex is
// already locked.
func (v *Version) UnrefLocked() {
	if v.refs.Add(-1) == 0 {
		v.list.Remove(v)
		v.Deleted(v.unrefFiles())
	}
}
  1212  
// unrefFiles releases this version's reference on the files in every level
// (and range-key level), returning the FileBackings that became obsolete as
// a result.
func (v *Version) unrefFiles() []*FileBacking {
	var obsolete []*FileBacking
	for _, lm := range v.Levels {
		obsolete = append(obsolete, lm.release()...)
	}
	for _, lm := range v.RangeKeyLevels {
		obsolete = append(obsolete, lm.release()...)
	}
	return obsolete
}
  1223  
// Next returns the next (newer) version in the list of versions.
func (v *Version) Next() *Version {
	return v.next
}
  1228  
  1229  // InitL0Sublevels initializes the L0Sublevels
  1230  func (v *Version) InitL0Sublevels(
  1231  	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64,
  1232  ) error {
  1233  	var err error
  1234  	v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
  1235  	if err == nil && v.L0Sublevels != nil {
  1236  		v.L0SublevelFiles = v.L0Sublevels.Levels
  1237  	}
  1238  	return err
  1239  }
  1240  
  1241  // Contains returns a boolean indicating whether the provided file exists in
  1242  // the version at the given level. If level is non-zero then Contains binary
  1243  // searches among the files. If level is zero, Contains scans the entire
  1244  // level.
  1245  func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
  1246  	iter := v.Levels[level].Iter()
  1247  	if level > 0 {
  1248  		overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey,
  1249  			m.Largest.IsExclusiveSentinel())
  1250  		iter = overlaps.Iter()
  1251  	}
  1252  	for f := iter.First(); f != nil; f = iter.Next() {
  1253  		if f == m {
  1254  			return true
  1255  		}
  1256  	}
  1257  	return false
  1258  }
  1259  
// Overlaps returns all elements of v.files[level] whose user key range
// intersects the given range. If level is non-zero then the user key ranges of
// v.files[level] are assumed to not overlap (although they may touch). If level
// is zero then that assumption cannot be made, and the [start, end] range is
// expanded to the union of those matching ranges so far and the computation is
// repeated until [start, end] stabilizes.
// The returned files are a subsequence of the input files, i.e., the ordering
// is not changed.
func (v *Version) Overlaps(
	level int, cmp Compare, start, end []byte, exclusiveEnd bool,
) LevelSlice {
	if level == 0 {
		// Indices that have been selected as overlapping.
		l0 := v.Levels[level]
		l0Iter := l0.Iter()
		selectedIndices := make([]bool, l0.Len())
		numSelected := 0
		var slice LevelSlice
		// Fixed-point iteration: keep rescanning until a pass selects no new
		// files (i.e. [start, end] has stabilized).
		for {
			restart := false
			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
				selected := selectedIndices[i]
				if selected {
					continue
				}
				if !meta.Overlaps(cmp, start, end, exclusiveEnd) {
					// meta is completely outside the specified range; skip it.
					continue
				}
				// Overlaps.
				selectedIndices[i] = true
				numSelected++

				smallest := meta.Smallest.UserKey
				largest := meta.Largest.UserKey
				// Since level == 0, check if the newly added fileMetadata has
				// expanded the range. We expand the range immediately for files
				// we have remaining to check in this loop. All already checked
				// and unselected files will need to be rechecked via the
				// restart below.
				if cmp(smallest, start) < 0 {
					start = smallest
					restart = true
				}
				if v := cmp(largest, end); v > 0 {
					end = largest
					exclusiveEnd = meta.Largest.IsExclusiveSentinel()
					restart = true
				} else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() {
					// Only update the exclusivity of our existing `end`
					// bound.
					exclusiveEnd = false
					restart = true
				}
			}

			if !restart {
				// Construct a B-Tree containing only the matching items.
				var tr btree
				tr.cmp = v.Levels[level].tree.cmp
				for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
					if selectedIndices[i] {
						err := tr.Insert(meta)
						if err != nil {
							panic(err)
						}
					}
				}
				slice = newLevelSlice(tr.Iter())
				// TODO(jackson): Avoid the oddity of constructing and
				// immediately releasing a B-Tree. Make LevelSlice an
				// interface?
				tr.Release()
				break
			}
			// Continue looping to retry the files that were not selected.
		}
		return slice
	}

	// Non-zero levels are sorted and non-overlapping; a pair of seeks
	// suffices.
	return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
}
  1342  
// CheckOrdering checks that the files are consistent with respect to
// increasing file numbers (for level 0 files) and increasing and non-
// overlapping internal key ranges (for level non-0 files). Both the L0
// sublevels and every level are validated; on failure the error includes the
// version's DebugString for context.
func (v *Version) CheckOrdering(
	cmp Compare, format base.FormatKey, order OrderingInvariants,
) error {
	for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
		sublevelIter := v.L0SublevelFiles[sublevel].Iter()
		// Sublevels have NEVER allowed split user keys, so we can pass
		// ProhibitSplitUserKeys.
		if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter, ProhibitSplitUserKeys); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}

	for level, lm := range v.Levels {
		if err := CheckOrdering(cmp, format, Level(level), lm.Iter(), order); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}
	return nil
}
  1365  
// VersionList holds a list of versions. The versions are ordered from oldest
// to newest. It is implemented as a circular doubly-linked list with a
// sentinel root node; the mutex guarding the list is shared with the caller
// via Init.
type VersionList struct {
	mu   *sync.Mutex
	root Version
}
  1372  
  1373  // Init initializes the version list.
  1374  func (l *VersionList) Init(mu *sync.Mutex) {
  1375  	l.mu = mu
  1376  	l.root.next = &l.root
  1377  	l.root.prev = &l.root
  1378  }
  1379  
// Empty returns true if the list is empty, and false otherwise. The list is
// empty exactly when the sentinel root links to itself.
func (l *VersionList) Empty() bool {
	return l.root.next == &l.root
}
  1384  
// Front returns the oldest version in the list. Note that this version is only
// valid if Empty() returns false; on an empty list it returns the sentinel
// root node.
func (l *VersionList) Front() *Version {
	return l.root.next
}
  1390  
// Back returns the newest version in the list. Note that this version is only
// valid if Empty() returns false; on an empty list it returns the sentinel
// root node.
func (l *VersionList) Back() *Version {
	return l.root.prev
}
  1396  
// PushBack adds a new version to the back of the list. This new version
// becomes the "newest" version in the list. Panics if v is already linked
// into a list.
func (l *VersionList) PushBack(v *Version) {
	if v.list != nil || v.prev != nil || v.next != nil {
		panic("pebble: version list is inconsistent")
	}
	// Splice v between the current back (root.prev) and the sentinel root.
	v.prev = l.root.prev
	v.prev.next = v
	v.next = &l.root
	v.next.prev = v
	v.list = l
	// Let L0Sublevels on the second newest version get GC'd, as it is no longer
	// necessary. See the comment in Version.
	v.prev.L0Sublevels = nil
}
  1412  
// Remove removes the specified version from the list. Panics if v is the
// sentinel root node or is not a member of this list.
func (l *VersionList) Remove(v *Version) {
	if v == &l.root {
		panic("pebble: cannot remove version list root node")
	}
	if v.list != l {
		panic("pebble: version list is inconsistent")
	}
	// Unlink v and clear its pointers so it does not retain neighbors.
	v.prev.next = v.next
	v.next.prev = v.prev
	v.next = nil // avoid memory leaks
	v.prev = nil // avoid memory leaks
	v.list = nil // avoid memory leaks
}
  1427  
// OrderingInvariants dictates the file ordering invariants active. It is
// passed to CheckOrdering to select between the strict (modern) and
// historical (split-user-key tolerant) checks.
type OrderingInvariants int8

const (
	// ProhibitSplitUserKeys indicates that adjacent files within a level cannot
	// contain the same user key.
	ProhibitSplitUserKeys OrderingInvariants = iota
	// AllowSplitUserKeys indicates that adjacent files within a level may
	// contain the same user key. This is only allowed by historical format
	// major versions.
	//
	// TODO(jackson): Remove.
	AllowSplitUserKeys
)
  1442  
  1443  // CheckOrdering checks that the files are consistent with respect to
  1444  // seqnums (for level 0 files -- see detailed comment below) and increasing and non-
  1445  // overlapping internal key ranges (for non-level 0 files).
  1446  //
  1447  // The ordering field may be passed AllowSplitUserKeys to allow adjacent files that are both
  1448  // inclusive of the same user key. Pebble no longer creates version edits
  1449  // installing such files, and Pebble databases with sufficiently high format
  1450  // major version should no longer have any such files within their LSM.
  1451  // TODO(jackson): Remove AllowSplitUserKeys when we remove support for the
  1452  // earlier format major versions.
  1453  func CheckOrdering(
  1454  	cmp Compare, format base.FormatKey, level Level, files LevelIterator, ordering OrderingInvariants,
  1455  ) error {
  1456  	// The invariants to check for L0 sublevels are the same as the ones to
  1457  	// check for all other levels. However, if L0 is not organized into
  1458  	// sublevels, or if all L0 files are being passed in, we do the legacy L0
  1459  	// checks, defined in the detailed comment below.
  1460  	if level == Level(0) {
  1461  		// We have 2 kinds of files:
  1462  		// - Files with exactly one sequence number: these could be either ingested files
  1463  		//   or flushed files. We cannot tell the difference between them based on FileMetadata,
  1464  		//   so our consistency checking here uses the weaker checks assuming it is a narrow
  1465  		//   flushed file. We cannot error on ingested files having sequence numbers coincident
  1466  		//   with flushed files as the seemingly ingested file could just be a flushed file
  1467  		//   with just one key in it which is a truncated range tombstone sharing sequence numbers
  1468  		//   with other files in the same flush.
  1469  		// - Files with multiple sequence numbers: these are necessarily flushed files.
  1470  		//
  1471  		// Three cases of overlapping sequence numbers:
  1472  		// Case 1:
  1473  		// An ingested file contained in the sequence numbers of the flushed file -- it must be
  1474  		// fully contained (not coincident with either end of the flushed file) since the memtable
  1475  		// must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
  1476  		// num b, and the memtable got a subsequent update that was given sequence num b+1, before
  1477  		// being flushed.
  1478  		//
  1479  		// So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
  1480  		// third file are inconsistent with each other. So comparing adjacent files is insufficient
  1481  		// for consistency checking.
  1482  		//
  1483  		// Visually we have something like
  1484  		// x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
  1485  		//     y       y  y        y             (y's represent ingested files)
  1486  		// And these are ordered in increasing order of y. Note that y's must be unique.
  1487  		//
  1488  		// Case 2:
  1489  		// A flushed file that did not overlap in keys with any file in any level, but does overlap
  1490  		// in the file key intervals. This file is placed in L0 since it overlaps in the file
  1491  		// key intervals but since it has no overlapping data, it is assigned a sequence number
  1492  		// of 0 in RocksDB. We handle this case for compatibility with RocksDB.
  1493  		//
  1494  		// Case 3:
  1495  		// A sequence of flushed files that overlap in sequence numbers with one another,
  1496  		// but do not overlap in keys inside the sstables. These files correspond to
  1497  		// partitioned flushes or the results of intra-L0 compactions of partitioned
  1498  		// flushes.
  1499  		//
  1500  		// Since these types of SSTables violate most other sequence number
  1501  		// overlap invariants, and handling this case is important for compatibility
  1502  		// with future versions of pebble, this method relaxes most L0 invariant
  1503  		// checks.
  1504  
  1505  		var prev *FileMetadata
  1506  		for f := files.First(); f != nil; f, prev = files.Next(), f {
  1507  			if prev == nil {
  1508  				continue
  1509  			}
  1510  			// Validate that the sorting is sane.
  1511  			if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
  1512  				// Multiple files satisfying case 2 mentioned above.
  1513  			} else if !prev.lessSeqNum(f) {
  1514  				return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
  1515  					errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
  1516  					errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
  1517  					errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
  1518  			}
  1519  		}
  1520  	} else {
  1521  		var prev *FileMetadata
  1522  		for f := files.First(); f != nil; f, prev = files.Next(), f {
  1523  			if err := f.Validate(cmp, format); err != nil {
  1524  				return errors.Wrapf(err, "%s ", level)
  1525  			}
  1526  			if prev != nil {
  1527  				if prev.cmpSmallestKey(f, cmp) >= 0 {
  1528  					return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
  1529  						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
  1530  						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
  1531  						f.Smallest.Pretty(format), f.Largest.Pretty(format))
  1532  				}
  1533  
  1534  				// What's considered "overlapping" is dependent on the format
  1535  				// major version. If ordering=ProhibitSplitUserKeys, then both
  1536  				// files cannot contain keys with the same user keys. If the
  1537  				// bounds have the same user key, the previous file's boundary
  1538  				// must have a Trailer indicating that it's exclusive.
  1539  				switch ordering {
  1540  				case AllowSplitUserKeys:
  1541  					if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
  1542  						return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
  1543  							errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
  1544  							prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
  1545  							f.Smallest.Pretty(format), f.Largest.Pretty(format))
  1546  					}
  1547  				case ProhibitSplitUserKeys:
  1548  					if v := cmp(prev.Largest.UserKey, f.Smallest.UserKey); v > 0 || (v == 0 && !prev.Largest.IsExclusiveSentinel()) {
  1549  						return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
  1550  							errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
  1551  							prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
  1552  							f.Smallest.Pretty(format), f.Largest.Pretty(format))
  1553  					}
  1554  				default:
  1555  					panic("unreachable")
  1556  				}
  1557  			}
  1558  		}
  1559  	}
  1560  	return nil
  1561  }