github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/manifest/version.go (about)

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"sort"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"sync/atomic"
    15  	"unicode"
    16  
    17  	"github.com/cockroachdb/errors"
    18  	"github.com/cockroachdb/pebble/internal/base"
    19  	"github.com/cockroachdb/pebble/internal/invariants"
    20  )
    21  
// Compare exports the base.Compare type, the user-key comparison function
// used throughout this package.
type Compare = base.Compare

// InternalKey exports the base.InternalKey type, a user key plus trailer
// (sequence number and kind).
type InternalKey = base.InternalKey
    27  
// TableInfo contains the common information for table related events, e.g.
// table creation, deletion and ingestion events.
type TableInfo struct {
	// FileNum is the internal DB identifier for the table.
	FileNum base.FileNum
	// Size is the size of the file in bytes.
	Size uint64
	// Smallest is the smallest internal key in the table.
	Smallest InternalKey
	// Largest is the largest internal key in the table.
	Largest InternalKey
	// SmallestSeqNum is the smallest sequence number in the table.
	SmallestSeqNum uint64
	// LargestSeqNum is the largest sequence number in the table.
	LargestSeqNum uint64
}
    43  
// TableStats contains statistics on a table used for compaction heuristics,
// and export via Metrics.
type TableStats struct {
	// NumEntries is the total number of entries in the table.
	NumEntries uint64
	// NumDeletions is the number of point and range deletion entries in the
	// table.
	NumDeletions uint64
	// NumRangeKeySets is the total number of range key sets in the table.
	//
	// NB: If there's a chance that the sstable contains any range key sets,
	// then NumRangeKeySets must be > 0.
	NumRangeKeySets uint64
	// PointDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's point deletions by compacting them.
	PointDeletionsBytesEstimate uint64
	// RangeDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's range deletions by compacting them. This
	// estimate is at data-block granularity and is not updated if compactions
	// beneath the table reduce the amount of reclaimable disk space. It also
	// does not account for overlapping data in L0 and ignores L0 sublevels,
	// but the error that introduces is expected to be small.
	//
	// Tables in the bottommost level of the LSM may have a nonzero estimate if
	// snapshots or move compactions prevented the elision of their range
	// tombstones. A table in the bottommost level that was ingested into L6
	// will have a zero estimate, because the file's sequence numbers indicate
	// that the tombstone cannot drop any data contained within the file itself.
	RangeDeletionsBytesEstimate uint64
	// ValueBlocksSize is the total size of value blocks and the value index
	// block.
	ValueBlocksSize uint64
}
    75  
// boundType represents the type of key (point or range) present as the smallest
// and largest keys.
type boundType uint8

const (
	// NB: iota starts at 1 so that the zero value is invalid, making an unset
	// bound type distinguishable from a set one.
	boundTypePointKey boundType = iota + 1
	boundTypeRangeKey
)
    84  
    85  // CompactionState is the compaction state of a file.
    86  //
    87  // The following shows the valid state transitions:
    88  //
    89  //	NotCompacting --> Compacting --> Compacted
    90  //	      ^               |
    91  //	      |               |
    92  //	      +-------<-------+
    93  //
    94  // Input files to a compaction transition to Compacting when a compaction is
    95  // picked. A file that has finished compacting typically transitions into the
    96  // Compacted state, at which point it is effectively obsolete ("zombied") and
    97  // will eventually be removed from the LSM. A file that has been move-compacted
    98  // will transition from Compacting back into the NotCompacting state, signaling
    99  // that the file may be selected for a subsequent compaction. A failed
   100  // compaction will result in all input tables transitioning from Compacting to
   101  // NotCompacting.
   102  //
   103  // This state is in-memory only. It is not persisted to the manifest.
   104  type CompactionState uint8
   105  
   106  // CompactionStates.
   107  const (
   108  	CompactionStateNotCompacting CompactionState = iota
   109  	CompactionStateCompacting
   110  	CompactionStateCompacted
   111  )
   112  
   113  // String implements fmt.Stringer.
   114  func (s CompactionState) String() string {
   115  	switch s {
   116  	case CompactionStateNotCompacting:
   117  		return "NotCompacting"
   118  	case CompactionStateCompacting:
   119  		return "Compacting"
   120  	case CompactionStateCompacted:
   121  		return "Compacted"
   122  	default:
   123  		panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
   124  	}
   125  }
   126  
// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
// some version. FileMetadata does not contain the actual level of the sst,
// since such leveled-ssts can move across levels in different versions, while
// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is FileBacking. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete
// and can be deleted once it is no longer required by any physical or virtual
// sst in any version.
//
// We maintain some invariants:
//
//  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
//     and there will be exactly one FileMetadata associated with the FileNum.
//
//  2. Within a version, a backing-sst is either only referred to by one
//     physical sst or one or more virtual ssts.
//
//  3. Once a backing-sst is referred to by a virtual sst in the latest version,
//     it cannot go back to being referred to by a physical sst in any future
//     version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the file metadata associated with it. We will still maintain the
// FileBacking associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
type FileMetadata struct {
	// AllowedSeeks is used to determine if a file should be picked for
	// a read triggered compaction. It is decremented when read sampling
	// in pebble.Iterator after every positioning operation that returns
	// a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
	AllowedSeeks atomic.Int64

	// statsValid indicates if stats have been loaded for the table. The
	// TableStats structure is populated only if valid is true.
	statsValid atomic.Bool

	// FileBacking is the state which backs either a physical or virtual
	// sstable.
	FileBacking *FileBacking

	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// FileNum is the file number.
	//
	// INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
	FileNum base.FileNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANTS:
	// - When !FileMetadata.Virtual, Size == FileBacking.Size.
	// - Size should be non-zero. Size 0 virtual sstables must not be created.
	Size uint64
	// CreationTime is the file creation time in seconds since the epoch
	// (1970-01-01 00:00:00 UTC). For ingested sstables, this corresponds to
	// the time the file was ingested. For virtual sstables, this corresponds
	// to the wall clock time when the FileMetadata for the virtual sstable was
	// first created.
	CreationTime int64
	// SmallestSeqNum and LargestSeqNum are lower and upper bounds for the
	// smallest and largest sequence numbers in the table, across both point
	// and range keys. For physical sstables, these values are tight bounds.
	// For virtual sstables, there is no guarantee that there will be keys with
	// SmallestSeqNum or LargestSeqNum within virtual sstable bounds.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: these fields should be set using ExtendPointKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestPointKey InternalKey
	LargestPointKey  InternalKey
	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestRangeKey InternalKey
	LargestRangeKey  InternalKey
	// Smallest and Largest are the inclusive bounds for the internal keys stored
	// in the table, across both point and range keys.
	// NB: these fields are derived from their point and range key equivalents,
	// and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
	Smallest InternalKey
	Largest  InternalKey
	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats

	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
	// pick L0 compactions. Only accurate for the most recent Version.
	SubLevel         int
	L0Index          int
	minIntervalIndex int
	maxIntervalIndex int

	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// Compacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	CompactionState     CompactionState
	// MarkedForCompaction is true if compaction of this file has been
	// explicitly requested. Previously, RocksDB and earlier versions of
	// Pebble allowed this flag to be set by a user table property collector.
	// Some earlier versions of Pebble respected this flag, while other more
	// recent versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// boundsSet tracks whether the overall bounds have been set.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// Virtual is true if the FileMetadata belongs to a virtual sstable.
	Virtual bool
}
   272  
// PhysicalFileMeta is used by functions which want a guarantee that their input
// belongs to a physical sst and not a virtual sst.
//
// NB: This type should only be constructed by calling
// FileMetadata.PhysicalMeta.
type PhysicalFileMeta struct {
	*FileMetadata
}
   281  
// VirtualFileMeta is used by functions which want a guarantee that their input
// belongs to a virtual sst and not a physical sst.
//
// A VirtualFileMeta inherits all the same fields as a FileMetadata. These
// fields have additional invariants imposed on them, and/or slightly varying
// meanings:
//   - Smallest and Largest (and their counterparts
//     {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
//     key at that exact bound. We make the effort to determine the next smallest
//     or largest key in an sstable after virtualizing it, to maintain this
//     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
//     could mean that a rangedel or range key ends at that user key, or has been
//     truncated to that user key.
//   - One invariant is that if a rangedel or range key is truncated on its
//     upper bound, the virtual sstable *must* have a rangedel or range key
//     sentinel key as its upper bound. This is because truncation yields
//     an exclusive upper bound for the rangedel/rangekey, and if there are
//     any points at that exclusive upper bound within the same virtual
//     sstable, those could get uncovered by this truncation. We enforce this
//     invariant in calls to keyspan.Truncate.
//   - Size is an estimate of the size of the virtualized portion of this sstable.
//     The underlying file's size is stored in FileBacking.Size, though it could
//     also be estimated or could correspond to just the referenced portion of
//     a file (eg. if the file originated on another node).
//   - Size must be > 0.
//   - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
//     This means that all keys in the virtual sstable must have seqnums within
//     [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
//     a key with a seqnum at either of the bounds. Calculating tight seqnum
//     bounds would be too expensive and deliver little value.
//
// NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
type VirtualFileMeta struct {
	*FileMetadata
}
   317  
   318  // PhysicalMeta should be the only source of creating the PhysicalFileMeta
   319  // wrapper type.
   320  func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
   321  	if m.Virtual {
   322  		panic("pebble: file metadata does not belong to a physical sstable")
   323  	}
   324  	return PhysicalFileMeta{
   325  		m,
   326  	}
   327  }
   328  
   329  // VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
   330  // type.
   331  func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
   332  	if !m.Virtual {
   333  		panic("pebble: file metadata does not belong to a virtual sstable")
   334  	}
   335  	return VirtualFileMeta{
   336  		m,
   337  	}
   338  }
   339  
// FileBacking either backs a single physical sstable, or one or more virtual
// sstables.
//
// See the comment above the FileMetadata type for sstable terminology.
type FileBacking struct {
	// refs is the reference count for the backing file on disk: incremented
	// when a physical or virtual sstable which is backed by the FileBacking
	// is added to a version and decremented when the version is unreferenced.
	// We ref count in order to determine when it is safe to delete a
	// backing sst file from disk. The backing file is obsolete when the
	// reference count falls to zero.
	refs atomic.Int32
	// latestVersionRefs are the references to the FileBacking in the
	// latest version. This reference can be through a single physical
	// sstable in the latest version, or one or more virtual sstables in the
	// latest version.
	//
	// INVARIANT: latestVersionRefs <= refs.
	latestVersionRefs atomic.Int32
	// VirtualizedSize is set iff the backing sst is only referred to by
	// virtual ssts in the latest version. VirtualizedSize is the sum of the
	// virtual sstable sizes of all of the virtual sstables in the latest
	// version which are backed by the physical sstable. When a virtual
	// sstable is removed from the latest version, we will decrement the
	// VirtualizedSize. During compaction picking, we'll compensate a
	// virtual sstable file size by
	// (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs.
	// The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize
	// is high, then the space amplification due to virtual sstables is
	// high, and we should pick the virtual sstable with a higher priority.
	//
	// TODO(bananabrick): Compensate the virtual sstable file size using
	// the VirtualizedSize during compaction picking and test.
	VirtualizedSize atomic.Uint64
	// DiskFileNum identifies the backing file on disk.
	DiskFileNum base.DiskFileNum
	// Size is the size of the backing file on disk, in bytes.
	Size uint64
   377  
   378  // InitPhysicalBacking allocates and sets the FileBacking which is required by a
   379  // physical sstable FileMetadata.
   380  //
   381  // Ensure that the state required by FileBacking, such as the FileNum, is
   382  // already set on the FileMetadata before InitPhysicalBacking is called.
   383  // Calling InitPhysicalBacking only after the relevant state has been set in the
   384  // FileMetadata is not necessary in tests which don't rely on FileBacking.
   385  func (m *FileMetadata) InitPhysicalBacking() {
   386  	if m.Virtual {
   387  		panic("pebble: virtual sstables should use a pre-existing FileBacking")
   388  	}
   389  	if m.FileBacking == nil {
   390  		m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()}
   391  	}
   392  }
   393  
   394  // InitProviderBacking creates a new FileBacking for a file backed by
   395  // an objstorage.Provider.
   396  func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) {
   397  	if !m.Virtual {
   398  		panic("pebble: provider-backed sstables must be virtual")
   399  	}
   400  	if m.FileBacking == nil {
   401  		m.FileBacking = &FileBacking{DiskFileNum: fileNum}
   402  	}
   403  }
   404  
   405  // ValidateVirtual should be called once the FileMetadata for a virtual sstable
   406  // is created to verify that the fields of the virtual sstable are sound.
   407  func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
   408  	if !m.Virtual {
   409  		panic("pebble: invalid virtual sstable")
   410  	}
   411  
   412  	if createdFrom.SmallestSeqNum != m.SmallestSeqNum {
   413  		panic("pebble: invalid smallest sequence number for virtual sstable")
   414  	}
   415  
   416  	if createdFrom.LargestSeqNum != m.LargestSeqNum {
   417  		panic("pebble: invalid largest sequence number for virtual sstable")
   418  	}
   419  
   420  	if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking {
   421  		panic("pebble: invalid physical sstable state for virtual sstable")
   422  	}
   423  
   424  	if m.Size == 0 {
   425  		panic("pebble: virtual sstable size must be set upon creation")
   426  	}
   427  }
   428  
   429  // Refs returns the refcount of backing sstable.
   430  func (m *FileMetadata) Refs() int32 {
   431  	return m.FileBacking.refs.Load()
   432  }
   433  
   434  // Ref increments the ref count associated with the backing sstable.
   435  func (m *FileMetadata) Ref() {
   436  	m.FileBacking.refs.Add(1)
   437  }
   438  
   439  // Unref decrements the ref count associated with the backing sstable.
   440  func (m *FileMetadata) Unref() int32 {
   441  	v := m.FileBacking.refs.Add(-1)
   442  	if invariants.Enabled && v < 0 {
   443  		panic("pebble: invalid FileMetadata refcounting")
   444  	}
   445  	return v
   446  }
   447  
   448  // LatestRef increments the latest ref count associated with the backing
   449  // sstable.
   450  func (m *FileMetadata) LatestRef() {
   451  	m.FileBacking.latestVersionRefs.Add(1)
   452  
   453  	if m.Virtual {
   454  		m.FileBacking.VirtualizedSize.Add(m.Size)
   455  	}
   456  }
   457  
   458  // LatestUnref decrements the latest ref count associated with the backing
   459  // sstable.
   460  func (m *FileMetadata) LatestUnref() int32 {
   461  	if m.Virtual {
   462  		m.FileBacking.VirtualizedSize.Add(-m.Size)
   463  	}
   464  
   465  	v := m.FileBacking.latestVersionRefs.Add(-1)
   466  	if invariants.Enabled && v < 0 {
   467  		panic("pebble: invalid FileMetadata latest refcounting")
   468  	}
   469  	return v
   470  }
   471  
   472  // LatestRefs returns the latest ref count associated with the backing sstable.
   473  func (m *FileMetadata) LatestRefs() int32 {
   474  	return m.FileBacking.latestVersionRefs.Load()
   475  }
   476  
   477  // SetCompactionState transitions this file's compaction state to the given
   478  // state. Protected by DB.mu.
   479  func (m *FileMetadata) SetCompactionState(to CompactionState) {
   480  	if invariants.Enabled {
   481  		transitionErr := func() error {
   482  			return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
   483  		}
   484  		switch m.CompactionState {
   485  		case CompactionStateNotCompacting:
   486  			if to != CompactionStateCompacting {
   487  				panic(transitionErr())
   488  			}
   489  		case CompactionStateCompacting:
   490  			if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
   491  				panic(transitionErr())
   492  			}
   493  		case CompactionStateCompacted:
   494  			panic(transitionErr())
   495  		default:
   496  			panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
   497  		}
   498  	}
   499  	m.CompactionState = to
   500  }
   501  
   502  // IsCompacting returns true if this file's compaction state is
   503  // CompactionStateCompacting. Protected by DB.mu.
   504  func (m *FileMetadata) IsCompacting() bool {
   505  	return m.CompactionState == CompactionStateCompacting
   506  }
   507  
   508  // StatsValid returns true if the table stats have been populated. If StatValid
   509  // returns true, the Stats field may be read (with or without holding the
   510  // database mutex).
   511  func (m *FileMetadata) StatsValid() bool {
   512  	return m.statsValid.Load()
   513  }
   514  
   515  // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
   516  // while populating TableStats and calling StatsMarkValud. Once stats are
   517  // populated, they must not be mutated.
   518  func (m *FileMetadata) StatsMarkValid() {
   519  	m.statsValid.Store(true)
   520  }
   521  
   522  // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
   523  // and overall table bounds with the given smallest and largest keys. The
   524  // smallest and largest bounds may not be extended if the table already has a
   525  // bound that is smaller or larger, respectively. The receiver is returned.
   526  // NB: calling this method should be preferred to manually setting the bounds by
   527  // manipulating the fields directly, to maintain certain invariants.
   528  func (m *FileMetadata) ExtendPointKeyBounds(
   529  	cmp Compare, smallest, largest InternalKey,
   530  ) *FileMetadata {
   531  	// Update the point key bounds.
   532  	if !m.HasPointKeys {
   533  		m.SmallestPointKey, m.LargestPointKey = smallest, largest
   534  		m.HasPointKeys = true
   535  	} else {
   536  		if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
   537  			m.SmallestPointKey = smallest
   538  		}
   539  		if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
   540  			m.LargestPointKey = largest
   541  		}
   542  	}
   543  	// Update the overall bounds.
   544  	m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
   545  	return m
   546  }
   547  
   548  // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
   549  // and overall table bounds with the given smallest and largest keys. The
   550  // smallest and largest bounds may not be extended if the table already has a
   551  // bound that is smaller or larger, respectively. The receiver is returned.
   552  // NB: calling this method should be preferred to manually setting the bounds by
   553  // manipulating the fields directly, to maintain certain invariants.
   554  func (m *FileMetadata) ExtendRangeKeyBounds(
   555  	cmp Compare, smallest, largest InternalKey,
   556  ) *FileMetadata {
   557  	// Update the range key bounds.
   558  	if !m.HasRangeKeys {
   559  		m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
   560  		m.HasRangeKeys = true
   561  	} else {
   562  		if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
   563  			m.SmallestRangeKey = smallest
   564  		}
   565  		if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
   566  			m.LargestRangeKey = largest
   567  		}
   568  	}
   569  	// Update the overall bounds.
   570  	m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
   571  	return m
   572  }
   573  
   574  // extendOverallBounds attempts to extend the overall table lower and upper
   575  // bounds. The given bounds may not be used if a lower or upper bound already
   576  // exists that is smaller or larger than the given keys, respectively. The given
   577  // boundType will be used if the bounds are updated.
   578  func (m *FileMetadata) extendOverallBounds(
   579  	cmp Compare, smallest, largest InternalKey, bTyp boundType,
   580  ) {
   581  	if !m.boundsSet {
   582  		m.Smallest, m.Largest = smallest, largest
   583  		m.boundsSet = true
   584  		m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
   585  	} else {
   586  		if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
   587  			m.Smallest = smallest
   588  			m.boundTypeSmallest = bTyp
   589  		}
   590  		if base.InternalCompare(cmp, largest, m.Largest) > 0 {
   591  			m.Largest = largest
   592  			m.boundTypeLargest = bTyp
   593  		}
   594  	}
   595  }
   596  
   597  // Overlaps returns true if the file key range overlaps with the given range.
   598  func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool {
   599  	if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) {
   600  		// f is completely before the specified range; no overlap.
   601  		return false
   602  	}
   603  	if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) {
   604  		// f is completely after the specified range; no overlap.
   605  		return false
   606  	}
   607  	return true
   608  }
   609  
   610  // ContainedWithinSpan returns true if the file key range completely overlaps with the
   611  // given range ("end" is assumed to exclusive).
   612  func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
   613  	lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end)
   614  	return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel()))
   615  }
   616  
   617  // ContainsKeyType returns whether or not the file contains keys of the provided
   618  // type.
   619  func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
   620  	switch kt {
   621  	case KeyTypePointAndRange:
   622  		return true
   623  	case KeyTypePoint:
   624  		return m.HasPointKeys
   625  	case KeyTypeRange:
   626  		return m.HasRangeKeys
   627  	default:
   628  		panic("unrecognized key type")
   629  	}
   630  }
   631  
   632  // SmallestBound returns the file's smallest bound of the key type. It returns a
   633  // false second return value if the file does not contain any keys of the key
   634  // type.
   635  func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
   636  	switch kt {
   637  	case KeyTypePointAndRange:
   638  		return &m.Smallest, true
   639  	case KeyTypePoint:
   640  		return &m.SmallestPointKey, m.HasPointKeys
   641  	case KeyTypeRange:
   642  		return &m.SmallestRangeKey, m.HasRangeKeys
   643  	default:
   644  		panic("unrecognized key type")
   645  	}
   646  }
   647  
   648  // LargestBound returns the file's largest bound of the key type. It returns a
   649  // false second return value if the file does not contain any keys of the key
   650  // type.
   651  func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
   652  	switch kt {
   653  	case KeyTypePointAndRange:
   654  		return &m.Largest, true
   655  	case KeyTypePoint:
   656  		return &m.LargestPointKey, m.HasPointKeys
   657  	case KeyTypeRange:
   658  		return &m.LargestRangeKey, m.HasRangeKeys
   659  	default:
   660  		panic("unrecognized key type")
   661  	}
   662  }
   663  
// Bit masks combined by boundsMarker into a single marker byte.
const (
	// maskContainsPointKeys is set if the table contains any point keys.
	maskContainsPointKeys = 1 << 0
	// maskSmallest is set if the table's smallest key is a point key.
	maskSmallest = 1 << 1
	// maskLargest is set if the table's largest key is a point key.
	maskLargest = 1 << 2
)
   669  
   670  // boundsMarker returns a marker byte whose bits encode the following
   671  // information (in order from least significant bit):
   672  // - if the table contains point keys
   673  // - if the table's smallest key is a point key
   674  // - if the table's largest key is a point key
   675  func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
   676  	if m.HasPointKeys {
   677  		sentinel |= maskContainsPointKeys
   678  	}
   679  	switch m.boundTypeSmallest {
   680  	case boundTypePointKey:
   681  		sentinel |= maskSmallest
   682  	case boundTypeRangeKey:
   683  		// No op - leave bit unset.
   684  	default:
   685  		return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
   686  	}
   687  	switch m.boundTypeLargest {
   688  	case boundTypePointKey:
   689  		sentinel |= maskLargest
   690  	case boundTypeRangeKey:
   691  		// No op - leave bit unset.
   692  	default:
   693  		return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
   694  	}
   695  	return
   696  }
   697  
// String implements fmt.Stringer, printing the file number and the overall
// table bounds. See DebugString for a more verbose representation that also
// includes the per-key-type and seqnum bounds.
func (m *FileMetadata) String() string {
	return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
}
   703  
   704  // DebugString returns a verbose representation of FileMetadata, typically for
   705  // use in tests and debugging, returning the file number and the point, range
   706  // and overall bounds for the table.
   707  func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
   708  	var b bytes.Buffer
   709  	fmt.Fprintf(&b, "%s:[%s-%s]",
   710  		m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
   711  	if !verbose {
   712  		return b.String()
   713  	}
   714  	fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
   715  	if m.HasPointKeys {
   716  		fmt.Fprintf(&b, " points:[%s-%s]",
   717  			m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
   718  	}
   719  	if m.HasRangeKeys {
   720  		fmt.Fprintf(&b, " ranges:[%s-%s]",
   721  			m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
   722  	}
   723  	return b.String()
   724  }
   725  
// ParseFileMetadataDebug parses a FileMetadata from its DebugString
// representation.
func ParseFileMetadataDebug(s string) (*FileMetadata, error) {
	// Split lines of the form:
	//  000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...]
	// Splitting on the punctuation and whitespace yields groups of exactly
	// three fields: a prefix ("seqnums", "points", "ranges", or the file
	// number) followed by the two bracketed values.
	fields := strings.FieldsFunc(s, func(c rune) bool {
		switch c {
		case ':', '[', '-', ']':
			return true
		default:
			return unicode.IsSpace(c) // NB: also trim whitespace padding.
		}
	})
	if len(fields)%3 != 0 {
		return nil, errors.Newf("malformed input: %s", s)
	}
	m := &FileMetadata{}
	for len(fields) > 0 {
		prefix := fields[0]
		if prefix == "seqnums" {
			smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum
			fields = fields[3:]
			continue
		}
		smallest := base.ParsePrettyInternalKey(fields[1])
		largest := base.ParsePrettyInternalKey(fields[2])
		switch prefix {
		case "points":
			m.SmallestPointKey, m.LargestPointKey = smallest, largest
			m.HasPointKeys = true
		case "ranges":
			m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
			m.HasRangeKeys = true
		default:
			// Not a recognized prefix: this group must be the leading
			// "<fileNum>:[smallest-largest]" overall bounds.
			fileNum, err := strconv.ParseUint(prefix, 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.FileNum = base.FileNum(fileNum)
			m.Smallest, m.Largest = smallest, largest
			m.boundsSet = true
		}
		fields = fields[3:]
	}
	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
		m.HasPointKeys = true
	}
	m.InitPhysicalBacking()
	return m, nil
}
   788  
// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent.
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.

	// Every table must contain keys of at least one key type.
	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.FileNum))
	}
	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
			m.Largest.Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
	}

	// Point key validation.

	if m.HasPointKeys {
		// The point key bounds must be internally consistent, and contained
		// within the overall table bounds.
		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
				m.LargestPointKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
			)
		}
	}

	// Range key validation.

	if m.HasRangeKeys {
		// The range key bounds must be internally consistent, and contained
		// within the overall table bounds.
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
				m.LargestRangeKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
			)
		}
	}

	// Ensure that FileMetadata.Init was called.
	if m.FileBacking == nil {
		return base.CorruptionErrorf("file metadata FileBacking not set")
	}

	return nil
}
   855  
   856  // TableInfo returns a subset of the FileMetadata state formatted as a
   857  // TableInfo.
   858  func (m *FileMetadata) TableInfo() TableInfo {
   859  	return TableInfo{
   860  		FileNum:        m.FileNum,
   861  		Size:           m.Size,
   862  		Smallest:       m.Smallest,
   863  		Largest:        m.Largest,
   864  		SmallestSeqNum: m.SmallestSeqNum,
   865  		LargestSeqNum:  m.LargestSeqNum,
   866  	}
   867  }
   868  
   869  func cmpUint64(a, b uint64) int {
   870  	switch {
   871  	case a < b:
   872  		return -1
   873  	case a > b:
   874  		return +1
   875  	default:
   876  		return 0
   877  	}
   878  }
   879  
   880  func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
   881  	// NB: This is the same ordering that RocksDB uses for L0 files.
   882  
   883  	// Sort first by largest sequence number.
   884  	if m.LargestSeqNum != b.LargestSeqNum {
   885  		return cmpUint64(m.LargestSeqNum, b.LargestSeqNum)
   886  	}
   887  	// Then by smallest sequence number.
   888  	if m.SmallestSeqNum != b.SmallestSeqNum {
   889  		return cmpUint64(m.SmallestSeqNum, b.SmallestSeqNum)
   890  	}
   891  	// Break ties by file number.
   892  	return cmpUint64(uint64(m.FileNum), uint64(b.FileNum))
   893  }
   894  
// lessSeqNum reports whether m orders before b under the seqnum-based
// ordering implemented by cmpSeqNum.
func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
	return m.cmpSeqNum(b) < 0
}

// cmpSmallestKey compares the files' smallest internal keys, using cmp to
// order user keys.
func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
	return base.InternalCompare(cmp, m.Smallest, b.Smallest)
}
   902  
   903  // KeyRange returns the minimum smallest and maximum largest internalKey for
   904  // all the FileMetadata in iters.
   905  func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
   906  	first := true
   907  	for _, iter := range iters {
   908  		for meta := iter.First(); meta != nil; meta = iter.Next() {
   909  			if first {
   910  				first = false
   911  				smallest, largest = meta.Smallest, meta.Largest
   912  				continue
   913  			}
   914  			if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
   915  				smallest = meta.Smallest
   916  			}
   917  			if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
   918  				largest = meta.Largest
   919  			}
   920  		}
   921  	}
   922  	return smallest, largest
   923  }
   924  
// bySeqNum implements sort.Interface, ordering files by the seqnum-based
// ordering of FileMetadata.lessSeqNum.
type bySeqNum []*FileMetadata

func (b bySeqNum) Len() int { return len(b) }
func (b bySeqNum) Less(i, j int) bool {
	return b[i].lessSeqNum(b[j])
}
func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

// SortBySeqNum sorts the specified files by increasing sequence number.
func SortBySeqNum(files []*FileMetadata) {
	sort.Sort(bySeqNum(files))
}

// bySmallest implements sort.Interface, ordering files by their smallest
// internal key under the supplied user-key comparison function.
type bySmallest struct {
	files []*FileMetadata
	cmp   Compare
}

func (b bySmallest) Len() int { return len(b.files) }
func (b bySmallest) Less(i, j int) bool {
	return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
}
func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }

// SortBySmallest sorts the specified files by smallest key using the supplied
// comparison function to order user keys.
func SortBySmallest(files []*FileMetadata, cmp Compare) {
	sort.Sort(bySmallest{files, cmp})
}
   954  
// overlaps returns the slice of files in iter whose user-key ranges intersect
// [start, end] (or [start, end) when exclusiveEnd is true). It assumes the
// level's files are sorted by smallest key with non-overlapping key ranges.
func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
	startIter := iter.Clone()
	{
		startIterFile := startIter.SeekGE(cmp, start)
		// SeekGE compares user keys. The user key `start` may be equal to the
		// f.Largest because f.Largest is a range deletion sentinel, indicating
		// that the user key `start` is NOT contained within the file f. If
		// that's the case, we can narrow the overlapping bounds to exclude the
		// file with the sentinel.
		if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&
			cmp(startIterFile.Largest.UserKey, start) == 0 {
			startIterFile = startIter.Next()
		}
		_ = startIterFile // Ignore unused assignment.
	}

	endIter := iter.Clone()
	{
		endIterFile := endIter.SeekGE(cmp, end)

		if !exclusiveEnd {
			// endIter is now pointing at the *first* file with a largest key >= end.
			// If there are multiple files including the user key `end`, we want all
			// of them, so move forward.
			for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 {
				endIterFile = endIter.Next()
			}
		}

		// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
		// or nexted too far because Largest.UserKey equaled `end`, go back.
		//
		// Consider !exclusiveEnd and end = 'f', with the following file bounds:
		//
		//     [b,d] [e, f] [f, f] [g, h]
		//
		// the above for loop will Next until it arrives at [g, h]. We need to
		// observe that g > f, and Prev to the file with bounds [f, f].
		if endIterFile == nil {
			endIterFile = endIter.Prev()
		} else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
			endIterFile = endIter.Prev()
		}
		_ = endIterFile // Ignore unused assignment.
	}
	// The bounded slice is defined by the positions of the two iterators.
	return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter)
}
  1002  
// NumLevels is the number of levels a Version contains.
const NumLevels = 7

// NewVersion constructs a new Version with the provided files. It requires
// the provided files are already well-ordered. It's intended for testing.
func NewVersion(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
) *Version {
	var v Version
	for l := range files {
		// NB: We specifically insert `files` into the B-Tree in the order
		// they appear within `files`. Some tests depend on this behavior in
		// order to test consistency checking, etc. Once we've constructed the
		// initial B-Tree, we swap out the btreeCmp for the correct one.
		// TODO(jackson): Adjust or remove the tests and remove this.
		v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
		v.Levels[l].level = l
		if l == 0 {
			// L0 uses the seqnum-based ordering; all other levels order files
			// by smallest key.
			v.Levels[l].tree.cmp = btreeCmpSeqNum
		} else {
			v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp)
		}
		for _, f := range files[l] {
			v.Levels[l].totalSize += f.Size
		}
	}
	if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
		panic(err)
	}
	return &v
}
  1034  
// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are sorted by largest sequence number. Due to file
// ingestion, there may be overlap in the ranges of sequence numbers contain in
// level 0 sstables. In particular, it is valid for one level 0 sstable to have
// the seqnum range [1,100] while an adjacent sstable has the seqnum range
// [50,50]. This occurs when the [50,50] table was ingested and given a global
// seqnum. The ingestion code will have ensured that the [50,50] sstable will
// not have any keys that overlap with the [1,100] in the seqnum range
// [1,49]. The range of internal keys [fileMetadata.smallest,
// fileMetadata.largest] in each level 0 table may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.
type Version struct {
	// refs counts the references held on this version; when it drops to zero
	// the version is removed from its list and its files are released.
	refs atomic.Int32

	// The level 0 sstables are organized in a series of sublevels. Similar to
	// the seqnum invariant in normal levels, there is no internal key in a
	// higher level table that has both the same user key and a higher sequence
	// number. Within a sublevel, tables are sorted by their internal key range
	// and any two tables at the same sublevel do not overlap. Unlike the normal
	// levels, sublevel n contains older tables (lower sequence numbers) than
	// sublevel n+1.
	//
	// The L0Sublevels struct is mostly used for compaction picking. As most
	// internal data structures in it are only necessary for compaction picking
	// and not for iterator creation, the reference to L0Sublevels is nil'd
	// after this version becomes the non-newest version, to reduce memory
	// usage.
	//
	// L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
	// in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
	// a reference to that slice, as that slice is necessary for iterator
	// creation and needs to outlast L0Sublevels.
	L0Sublevels     *L0Sublevels
	L0SublevelFiles []LevelSlice

	// Levels holds the file metadata for each level of the LSM.
	Levels [NumLevels]LevelMetadata

	// RangeKeyLevels holds a subset of the same files as Levels that contain range
	// keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
	// duplication should be minimal, as range keys are expected to be rare.
	RangeKeyLevels [NumLevels]LevelMetadata

	// The callback to invoke when the last reference to a version is
	// removed. Will be called with list.mu held.
	Deleted func(obsolete []*FileBacking)

	// Stats holds aggregated stats about the version maintained from
	// version to version.
	Stats struct {
		// MarkedForCompaction records the count of files marked for
		// compaction within the version.
		MarkedForCompaction int
	}

	// The list the version is linked into.
	list *VersionList

	// The next/prev link for the versionList doubly-linked list of versions.
	prev, next *Version
}
  1108  
// String implements fmt.Stringer, printing the FileMetadata for each level in
// the Version. It uses the default key formatter and non-verbose output.
func (v *Version) String() string {
	return v.string(base.DefaultFormatter, false)
}

// DebugString returns an alternative format to String() which includes sequence
// number and kind information for the sstable boundaries.
func (v *Version) DebugString(format base.FormatKey) string {
	return v.string(format, true)
}
  1120  
  1121  func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string {
  1122  	var buf bytes.Buffer
  1123  	for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- {
  1124  		fmt.Fprintf(&buf, "0.%d:\n", sublevel)
  1125  		sublevels[sublevel].Each(func(f *FileMetadata) {
  1126  			fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
  1127  		})
  1128  	}
  1129  	return buf.String()
  1130  }
  1131  
  1132  func (v *Version) string(format base.FormatKey, verbose bool) string {
  1133  	var buf bytes.Buffer
  1134  	if len(v.L0SublevelFiles) > 0 {
  1135  		fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles))
  1136  	}
  1137  	for level := 1; level < NumLevels; level++ {
  1138  		if v.Levels[level].Empty() {
  1139  			continue
  1140  		}
  1141  		fmt.Fprintf(&buf, "%d:\n", level)
  1142  		iter := v.Levels[level].Iter()
  1143  		for f := iter.First(); f != nil; f = iter.Next() {
  1144  			fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
  1145  		}
  1146  	}
  1147  	return buf.String()
  1148  }
  1149  
  1150  // ParseVersionDebug parses a Version from its DebugString output.
  1151  func ParseVersionDebug(
  1152  	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string,
  1153  ) (*Version, error) {
  1154  	var level int
  1155  	var files [NumLevels][]*FileMetadata
  1156  	for _, l := range strings.Split(s, "\n") {
  1157  		l = strings.TrimSpace(l)
  1158  
  1159  		switch l[:2] {
  1160  		case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:":
  1161  			var err error
  1162  			level, err = strconv.Atoi(l[:1])
  1163  			if err != nil {
  1164  				return nil, err
  1165  			}
  1166  		default:
  1167  			m, err := ParseFileMetadataDebug(l)
  1168  			if err != nil {
  1169  				return nil, err
  1170  			}
  1171  			// If we only parsed overall bounds, default to setting the point bounds.
  1172  			if !m.HasPointKeys && !m.HasRangeKeys {
  1173  				m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
  1174  				m.HasPointKeys = true
  1175  			}
  1176  			files[level] = append(files[level], m)
  1177  		}
  1178  	}
  1179  	// Reverse the order of L0 files. This ensures we construct the same
  1180  	// sublevels. (They're printed from higher sublevel to lower, which means in
  1181  	// a partial order that represents newest to oldest).
  1182  	for i := 0; i < len(files[0])/2; i++ {
  1183  		files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i]
  1184  	}
  1185  	return NewVersion(cmp, formatKey, flushSplitBytes, files), nil
  1186  }
  1187  
// Refs returns the number of references to the version.
func (v *Version) Refs() int32 {
	return v.refs.Load()
}

// Ref increments the version refcount. See Unref and UnrefLocked for the
// corresponding release paths.
func (v *Version) Ref() {
	v.refs.Add(1)
}
  1197  
  1198  // Unref decrements the version refcount. If the last reference to the version
  1199  // was removed, the version is removed from the list of versions and the
  1200  // Deleted callback is invoked. Requires that the VersionList mutex is NOT
  1201  // locked.
  1202  func (v *Version) Unref() {
  1203  	if v.refs.Add(-1) == 0 {
  1204  		l := v.list
  1205  		l.mu.Lock()
  1206  		l.Remove(v)
  1207  		v.Deleted(v.unrefFiles())
  1208  		l.mu.Unlock()
  1209  	}
  1210  }
  1211  
  1212  // UnrefLocked decrements the version refcount. If the last reference to the
  1213  // version was removed, the version is removed from the list of versions and
  1214  // the Deleted callback is invoked. Requires that the VersionList mutex is
  1215  // already locked.
  1216  func (v *Version) UnrefLocked() {
  1217  	if v.refs.Add(-1) == 0 {
  1218  		v.list.Remove(v)
  1219  		v.Deleted(v.unrefFiles())
  1220  	}
  1221  }
  1222  
// unrefFiles releases this version's references on the files in every level
// (both regular and range-key levels), returning the FileBackings that are no
// longer referenced and have become obsolete.
func (v *Version) unrefFiles() []*FileBacking {
	var obsolete []*FileBacking
	for _, lm := range v.Levels {
		obsolete = append(obsolete, lm.release()...)
	}
	for _, lm := range v.RangeKeyLevels {
		obsolete = append(obsolete, lm.release()...)
	}
	return obsolete
}

// Next returns the next version in the list of versions.
func (v *Version) Next() *Version {
	return v.next
}
  1238  
  1239  // InitL0Sublevels initializes the L0Sublevels
  1240  func (v *Version) InitL0Sublevels(
  1241  	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64,
  1242  ) error {
  1243  	var err error
  1244  	v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
  1245  	if err == nil && v.L0Sublevels != nil {
  1246  		v.L0SublevelFiles = v.L0Sublevels.Levels
  1247  	}
  1248  	return err
  1249  }
  1250  
  1251  // Contains returns a boolean indicating whether the provided file exists in
  1252  // the version at the given level. If level is non-zero then Contains binary
  1253  // searches among the files. If level is zero, Contains scans the entire
  1254  // level.
  1255  func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
  1256  	iter := v.Levels[level].Iter()
  1257  	if level > 0 {
  1258  		overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey,
  1259  			m.Largest.IsExclusiveSentinel())
  1260  		iter = overlaps.Iter()
  1261  	}
  1262  	for f := iter.First(); f != nil; f = iter.Next() {
  1263  		if f == m {
  1264  			return true
  1265  		}
  1266  	}
  1267  	return false
  1268  }
  1269  
// Overlaps returns all elements of v.files[level] whose user key range
// intersects the given range. If level is non-zero then the user key ranges of
// v.files[level] are assumed to not overlap (although they may touch). If level
// is zero then that assumption cannot be made, and the [start, end] range is
// expanded to the union of those matching ranges so far and the computation is
// repeated until [start, end] stabilizes.
// The returned files are a subsequence of the input files, i.e., the ordering
// is not changed.
func (v *Version) Overlaps(
	level int, cmp Compare, start, end []byte, exclusiveEnd bool,
) LevelSlice {
	if level == 0 {
		// Indices that have been selected as overlapping.
		l0 := v.Levels[level]
		l0Iter := l0.Iter()
		selectedIndices := make([]bool, l0.Len())
		numSelected := 0
		var slice LevelSlice
		// Fixpoint loop: each pass may grow [start, end]; repeat until no
		// newly selected file expands the bounds.
		for {
			restart := false
			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
				selected := selectedIndices[i]
				if selected {
					continue
				}
				if !meta.Overlaps(cmp, start, end, exclusiveEnd) {
					// meta is completely outside the specified range; skip it.
					continue
				}
				// Overlaps.
				selectedIndices[i] = true
				numSelected++

				smallest := meta.Smallest.UserKey
				largest := meta.Largest.UserKey
				// Since level == 0, check if the newly added fileMetadata has
				// expanded the range. We expand the range immediately for files
				// we have remaining to check in this loop. All already checked
				// and unselected files will need to be rechecked via the
				// restart below.
				if cmp(smallest, start) < 0 {
					start = smallest
					restart = true
				}
				if v := cmp(largest, end); v > 0 {
					end = largest
					exclusiveEnd = meta.Largest.IsExclusiveSentinel()
					restart = true
				} else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() {
					// Only update the exclusivity of our existing `end`
					// bound.
					exclusiveEnd = false
					restart = true
				}
			}

			if !restart {
				// Construct a B-Tree containing only the matching items.
				var tr btree
				tr.cmp = v.Levels[level].tree.cmp
				for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
					if selectedIndices[i] {
						err := tr.Insert(meta)
						if err != nil {
							panic(err)
						}
					}
				}
				slice = newLevelSlice(tr.Iter())
				// TODO(jackson): Avoid the oddity of constructing and
				// immediately releasing a B-Tree. Make LevelSlice an
				// interface?
				tr.Release()
				break
			}
			// Continue looping to retry the files that were not selected.
		}
		return slice
	}

	// Non-zero levels are sorted and non-overlapping, so a pair of seeks
	// suffices; see overlaps.
	return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
}
  1352  
// CheckOrdering checks that the files are consistent with respect to
// increasing file numbers (for level 0 files) and increasing and non-
// overlapping internal key ranges (for level non-0 files).
func (v *Version) CheckOrdering(
	cmp Compare, format base.FormatKey, order OrderingInvariants,
) error {
	// Check L0 one sublevel at a time, from the highest sublevel down.
	for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
		sublevelIter := v.L0SublevelFiles[sublevel].Iter()
		// Sublevels have NEVER allowed split user keys, so we can pass
		// ProhibitSplitUserKeys.
		if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter, ProhibitSplitUserKeys); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}

	// Then check every level (including L0 as a whole) under the caller's
	// requested ordering invariants.
	for level, lm := range v.Levels {
		if err := CheckOrdering(cmp, format, Level(level), lm.Iter(), order); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}
	return nil
}
  1375  
// VersionList holds a list of versions. The versions are ordered from oldest
// to newest.
type VersionList struct {
	// mu guards the list links; it is supplied by the caller via Init.
	mu   *sync.Mutex
	// root is the sentinel node of the circular doubly-linked list.
	root Version
}

// Init initializes the version list.
func (l *VersionList) Init(mu *sync.Mutex) {
	l.mu = mu
	// An empty list consists of the root sentinel linked to itself.
	l.root.next = &l.root
	l.root.prev = &l.root
}
  1389  
// Empty returns true if the list is empty, and false otherwise.
func (l *VersionList) Empty() bool {
	return l.root.next == &l.root
}

// Front returns the oldest version in the list. Note that this version is only
// valid if Empty() returns false.
func (l *VersionList) Front() *Version {
	return l.root.next
}

// Back returns the newest version in the list. Note that this version is only
// valid if Empty() returns false.
func (l *VersionList) Back() *Version {
	return l.root.prev
}
  1406  
  1407  // PushBack adds a new version to the back of the list. This new version
  1408  // becomes the "newest" version in the list.
  1409  func (l *VersionList) PushBack(v *Version) {
  1410  	if v.list != nil || v.prev != nil || v.next != nil {
  1411  		panic("pebble: version list is inconsistent")
  1412  	}
  1413  	v.prev = l.root.prev
  1414  	v.prev.next = v
  1415  	v.next = &l.root
  1416  	v.next.prev = v
  1417  	v.list = l
  1418  	// Let L0Sublevels on the second newest version get GC'd, as it is no longer
  1419  	// necessary. See the comment in Version.
  1420  	v.prev.L0Sublevels = nil
  1421  }
  1422  
  1423  // Remove removes the specified version from the list.
  1424  func (l *VersionList) Remove(v *Version) {
  1425  	if v == &l.root {
  1426  		panic("pebble: cannot remove version list root node")
  1427  	}
  1428  	if v.list != l {
  1429  		panic("pebble: version list is inconsistent")
  1430  	}
  1431  	v.prev.next = v.next
  1432  	v.next.prev = v.prev
  1433  	v.next = nil // avoid memory leaks
  1434  	v.prev = nil // avoid memory leaks
  1435  	v.list = nil // avoid memory leaks
  1436  }
  1437  
// OrderingInvariants dictates the file ordering invariants active. It is
// consumed by CheckOrdering.
type OrderingInvariants int8

const (
	// ProhibitSplitUserKeys indicates that adjacent files within a level cannot
	// contain the same user key.
	ProhibitSplitUserKeys OrderingInvariants = iota
	// AllowSplitUserKeys indicates that adjacent files within a level may
	// contain the same user key. This is only allowed by historical format
	// major versions.
	//
	// TODO(jackson): Remove.
	AllowSplitUserKeys
)
  1452  
// CheckOrdering checks that the files are consistent with respect to
// seqnums (for level 0 files -- see detailed comment below) and increasing and non-
// overlapping internal key ranges (for non-level 0 files).
//
// The ordering field may be passed AllowSplitUserKeys to allow adjacent files that are both
// inclusive of the same user key. Pebble no longer creates version edits
// installing such files, and Pebble databases with sufficiently high format
// major version should no longer have any such files within their LSM.
// TODO(jackson): Remove AllowSplitUserKeys when we remove support for the
// earlier format major versions.
//
// CheckOrdering returns a corruption error describing the first violating
// pair of files, or nil if the iteration order satisfies the invariants.
func CheckOrdering(
	cmp Compare, format base.FormatKey, level Level, files LevelIterator, ordering OrderingInvariants,
) error {
	// The invariants to check for L0 sublevels are the same as the ones to
	// check for all other levels. However, if L0 is not organized into
	// sublevels, or if all L0 files are being passed in, we do the legacy L0
	// checks, defined in the detailed comment below.
	if level == Level(0) {
		// We have 2 kinds of files:
		// - Files with exactly one sequence number: these could be either ingested files
		//   or flushed files. We cannot tell the difference between them based on FileMetadata,
		//   so our consistency checking here uses the weaker checks assuming it is a narrow
		//   flushed file. We cannot error on ingested files having sequence numbers coincident
		//   with flushed files as the seemingly ingested file could just be a flushed file
		//   with just one key in it which is a truncated range tombstone sharing sequence numbers
		//   with other files in the same flush.
		// - Files with multiple sequence numbers: these are necessarily flushed files.
		//
		// Three cases of overlapping sequence numbers:
		// Case 1:
		// An ingested file contained in the sequence numbers of the flushed file -- it must be
		// fully contained (not coincident with either end of the flushed file) since the memtable
		// must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
		// num b, and the memtable got a subsequent update that was given sequence num b+1, before
		// being flushed.
		//
		// So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
		// third file are inconsistent with each other. So comparing adjacent files is insufficient
		// for consistency checking.
		//
		// Visually we have something like
		// x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
		//     y       y  y        y             (y's represent ingested files)
		// And these are ordered in increasing order of y. Note that y's must be unique.
		//
		// Case 2:
		// A flushed file that did not overlap in keys with any file in any level, but does overlap
		// in the file key intervals. This file is placed in L0 since it overlaps in the file
		// key intervals but since it has no overlapping data, it is assigned a sequence number
		// of 0 in RocksDB. We handle this case for compatibility with RocksDB.
		//
		// Case 3:
		// A sequence of flushed files that overlap in sequence numbers with one another,
		// but do not overlap in keys inside the sstables. These files correspond to
		// partitioned flushes or the results of intra-L0 compactions of partitioned
		// flushes.
		//
		// Since these types of SSTables violate most other sequence number
		// overlap invariants, and handling this case is important for compatibility
		// with future versions of pebble, this method relaxes most L0 invariant
		// checks.

		// Walk adjacent pairs: prev trails f by one iteration; the first
		// iteration (prev == nil) has nothing to compare against.
		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			if prev == nil {
				continue
			}
			// Validate that the sorting is sane.
			if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
				// Multiple files satisfying case 2 mentioned above.
				// Deliberately empty: adjacent files that both have
				// LargestSeqNum == 0 are allowed and not ordered by seqnum.
			} else if !prev.lessSeqNum(f) {
				return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
					errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
					errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
					errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
			}
		}
	} else {
		// Non-L0 levels: files must have valid individual bounds and be
		// sorted by smallest key with non-overlapping key ranges.
		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			// Check the file's own internal consistency (e.g. its bounds)
			// before comparing it against its neighbor.
			if err := f.Validate(cmp, format); err != nil {
				return errors.Wrapf(err, "%s ", level)
			}
			if prev != nil {
				if prev.cmpSmallestKey(f, cmp) >= 0 {
					return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
						f.Smallest.Pretty(format), f.Largest.Pretty(format))
				}

				// What's considered "overlapping" is dependent on the format
				// major version. If ordering=ProhibitSplitUserKeys, then both
				// files cannot contain keys with the same user keys. If the
				// bounds have the same user key, the previous file's boundary
				// must have a Trailer indicating that it's exclusive.
				switch ordering {
				case AllowSplitUserKeys:
					// Historical invariant: only full internal-key overlap
					// (including the trailer) is prohibited, so two files
					// may share a boundary user key.
					if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
						return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
							errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
							prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
							f.Smallest.Pretty(format), f.Largest.Pretty(format))
					}
				case ProhibitSplitUserKeys:
					// Stricter invariant: comparing user keys only, a shared
					// boundary key is tolerated solely when prev's largest is
					// an exclusive sentinel (prev does not actually contain it).
					if v := cmp(prev.Largest.UserKey, f.Smallest.UserKey); v > 0 || (v == 0 && !prev.Largest.IsExclusiveSentinel()) {
						return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
							errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
							prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
							f.Smallest.Pretty(format), f.Largest.Pretty(format))
					}
				default:
					// ordering is an internal enum; any other value is a
					// programmer error, not data corruption.
					panic("unreachable")
				}
			}
		}
	}
	return nil
}