github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/manifest/version.go (about)

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"sort"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"sync/atomic"
    15  	"unicode"
    16  
    17  	"github.com/cockroachdb/errors"
    18  	"github.com/zuoyebang/bitalostable/internal/base"
    19  	"github.com/zuoyebang/bitalostable/internal/invariants"
    20  	"github.com/zuoyebang/bitalostable/vfs"
    21  )
    22  
    23  // Compare is an alias for base.Compare, the key comparison function type accepted by the functions in this package.
    24  type Compare = base.Compare
    25  
    26  // InternalKey is an alias for base.InternalKey, the key type used for the table bounds stored in this package.
    27  type InternalKey = base.InternalKey
    28  
    29  // TableInfo contains the common information for table related events. FileMetadata.TableInfo builds one from a table's metadata.
    30  type TableInfo struct {
    31  	// FileNum is the internal DB identifier for the table.
    32  	FileNum base.FileNum
    33  	// Size is the size of the file in bytes.
    34  	Size uint64
    35  	// Smallest is the smallest internal key in the table (FileMetadata.TableInfo copies the overall lower bound here).
    36  	Smallest InternalKey
    37  	// Largest is the largest internal key in the table (FileMetadata.TableInfo copies the overall upper bound here).
    38  	Largest InternalKey
    39  	// SmallestSeqNum is the smallest sequence number in the table.
    40  	SmallestSeqNum uint64
    41  	// LargestSeqNum is the largest sequence number in the table.
    42  	LargestSeqNum uint64
    43  }
    44  
    45  // TableStats contains statistics on a table used for compaction heuristics. It is stored in FileMetadata.Stats.
    46  type TableStats struct {
    47  	// NumEntries is the total number of entries in the table.
    48  	NumEntries uint64
    49  	// NumDeletions is the number of point and range deletion entries in the table.
    50  	NumDeletions uint64
    51  	// NumRangeKeySets is the total number of range key sets in the table.
    52  	NumRangeKeySets uint64
    53  	// PointDeletionsBytesEstimate is an estimate of the total disk space that
    54  	// may be dropped by this table's point deletions by compacting them.
    55  	PointDeletionsBytesEstimate uint64
    56  	// RangeDeletionsBytesEstimate is an estimate of the total disk space that
    57  	// may be dropped by this table's range deletions by compacting them. This
    58  	// estimate is at data-block granularity and is not updated if compactions
    59  	// beneath the table reduce the amount of reclaimable disk space. It also
    60  	// does not account for overlapping data in L0 and ignores L0 sublevels, but
    61  	// the error that introduces is expected to be small.
    62  	//
    63  	// Tables in the bottommost level of the LSM may have a nonzero estimate
    64  	// if snapshots or move compactions prevented the elision of their range
    65  	// tombstones.
    66  	RangeDeletionsBytesEstimate uint64
    67  }
    68  
    69  // boundType represents the type of key (point or range) present as the smallest
    70  // and largest keys. The constants start at 1, so the zero value matches neither and is rejected by boundsMarker.
    71  type boundType uint8
    72  
    73  const (
    74  	boundTypePointKey boundType = iota + 1 // the bound is a point key
    75  	boundTypeRangeKey // the bound is a range key
    76  )
    77  
    78  // CompactionState is the compaction state of a file.
    79  //
    80  // The following shows the valid state transitions:
    81  //
    82  //	NotCompacting --> Compacting --> Compacted
    83  //	      ^               |
    84  //	      |               |
    85  //	      +-------<-------+
    86  //
    87  // Input files to a compaction transition to Compacting when a compaction is
    88  // picked. A file that has finished compacting typically transitions into the
    89  // Compacted state, at which point it is effectively obsolete ("zombied") and
    90  // will eventually be removed from the LSM. A file that has been move-compacted
    91  // will transition from Compacting back into the NotCompacting state, signaling
    92  // that the file may be selected for a subsequent compaction. A failed
    93  // compaction will result in all input tables transitioning from Compacting to
    94  // NotCompacting.
    95  //
    96  // This state is in-memory only. It is not persisted to the manifest. FileMetadata.SetCompactionState checks these transitions when invariants.Enabled is true.
    97  type CompactionState uint8
    98  
    99  // The possible values of CompactionState. The zero value is CompactionStateNotCompacting.
   100  const (
   101  	CompactionStateNotCompacting CompactionState = iota
   102  	CompactionStateCompacting
   103  	CompactionStateCompacted
   104  )
   105  
   106  // String implements fmt.Stringer.
   107  func (s CompactionState) String() string {
   108  	switch s {
   109  	case CompactionStateNotCompacting:
   110  		return "NotCompacting"
   111  	case CompactionStateCompacting:
   112  		return "Compacting"
   113  	case CompactionStateCompacted:
   114  		return "Compacted"
   115  	default:
   116  		panic(fmt.Sprintf("bitalostable: unknown compaction state %d", s))
   117  	}
   118  }
   119  
   120  // FileMetadata holds the metadata for an on-disk table.
   121  type FileMetadata struct {
   122  	// Atomic contains fields which are accessed atomically. Go allocations
   123  	// are guaranteed to be 64-bit aligned which we take advantage of by
   124  	// placing the 64-bit fields which we access atomically at the beginning
   125  	// of the FileMetadata struct. For more information, see
   126  	// https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
   127  	Atomic struct {
   128  		// AllowedSeeks is used to determine if a file should be picked for
   129  		// a read triggered compaction. It is decremented when read sampling
   130  		// in bitalostable.Iterator after every positioning operation
   131  		// that returns a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
   132  		AllowedSeeks int64
   133  
   134  		// statsValid is 1 if stats have been loaded for the table. The
   135  		// TableStats structure is populated only if statsValid is 1.
   136  		statsValid uint32
   137  	}
   138  
   139  	// InitAllowedSeeks is the initial value of allowed seeks. This is used
   140  	// to re-set allowed seeks on a file once it hits 0.
   141  	InitAllowedSeeks int64
   142  
   143  	// Reference count for the file: incremented when a file is added to a
   144  	// version and decremented when the version is unreferenced. The file is
   145  	// obsolete when the reference count falls to zero.
   146  	refs int32
   147  	// FileNum is the file number.
   148  	FileNum base.FileNum
   149  	// Size is the size of the file, in bytes.
   150  	Size uint64
   151  	// File creation time in seconds since the epoch (1970-01-01 00:00:00
   152  	// UTC). For ingested sstables, this corresponds to the time the file was
   153  	// ingested.
   154  	CreationTime int64
   155  	// Smallest and largest sequence numbers in the table, across both point and
   156  	// range keys.
   157  	SmallestSeqNum uint64
   158  	LargestSeqNum  uint64
   159  	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
   160  	// internal point keys stored in the table. This includes RANGEDELs, which
   161  	// alter point keys.
   162  	// NB: these fields should be set using ExtendPointKeyBounds. They are left
   163  	// exported for reads as an optimization.
   164  	SmallestPointKey InternalKey
   165  	LargestPointKey  InternalKey
   166  	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
   167  	// internal range keys stored in the table.
   168  	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
   169  	// exported for reads as an optimization.
   170  	SmallestRangeKey InternalKey
   171  	LargestRangeKey  InternalKey
   172  	// Smallest and Largest are the inclusive bounds for the internal keys stored
   173  	// in the table, across both point and range keys.
   174  	// NB: these fields are derived from their point and range key equivalents,
   175  	// and are updated via the Extend{Point,Range}KeyBounds methods.
   176  	Smallest InternalKey
   177  	Largest  InternalKey
   178  	// Stats describe table statistics. Protected by DB.mu.
   179  	Stats TableStats
   180  
   181  	SubLevel         int
   182  	L0Index          int
   183  	minIntervalIndex int
   184  	maxIntervalIndex int
   185  
   186  	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
   187  	// ensure an optimal packing.
   188  
   189  	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
   190  	// pick L0 compactions. Only accurate for the most recent Version.
   191  	//
   192  	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
   193  	// compaction. When it's true, IsCompacting must also return true. If
   194  	// IsCompacting is true and IsIntraL0Compacting is false for an L0 file, the
   195  	// file must be part of a compaction to Lbase.
   196  	IsIntraL0Compacting bool
   197  	CompactionState     CompactionState
   198  	// True if compaction of this file has been explicitly requested.
   199  	// Previously, RocksDB and earlier versions of Pebble allowed this
   200  	// flag to be set by a user table property collector. Some earlier
   201  	// versions of Pebble respected this flag, while other more recent
   202  	// versions ignored this flag.
   203  	//
   204  	// More recently this flag has been repurposed to facilitate the
   205  	// compaction of 'atomic compaction units'. Files marked for
   206  	// compaction are compacted in a rewrite compaction at the lowest
   207  	// possible compaction priority.
   208  	//
   209  	// NB: A count of files marked for compaction is maintained on
   210  	// Version, and compaction picking reads cached annotations
   211  	// determined by this field.
   212  	//
   213  	// Protected by DB.mu.
   214  	MarkedForCompaction bool
   215  	// HasPointKeys tracks whether the table contains point keys (including
   216  	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
   217  	// still true.
   218  	HasPointKeys bool
   219  	// HasRangeKeys tracks whether the table contains any range keys.
   220  	HasRangeKeys bool
   221  	// boundsSet tracks whether the overall bounds (Smallest and Largest) have been set.
   222  	boundsSet bool
   223  	// boundTypeSmallest and boundTypeLargest provide an indication as to which
   224  	// key type (point or range) corresponds to the smallest and largest overall
   225  	// table bounds.
   226  	boundTypeSmallest, boundTypeLargest boundType
   227  }
   228  
   229  // SetCompactionState transitions this file's compaction state to the given
   230  // state. Protected by DB.mu.
   231  func (m *FileMetadata) SetCompactionState(to CompactionState) {
   232  	if invariants.Enabled {
   233  		transitionErr := func() error {
   234  			return errors.Newf("bitalostable: invalid compaction state transition: %s -> %s", m.CompactionState, to)
   235  		}
   236  		switch m.CompactionState {
   237  		case CompactionStateNotCompacting:
   238  			if to != CompactionStateCompacting {
   239  				panic(transitionErr())
   240  			}
   241  		case CompactionStateCompacting:
   242  			if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
   243  				panic(transitionErr())
   244  			}
   245  		case CompactionStateCompacted:
   246  			panic(transitionErr())
   247  		default:
   248  			panic(fmt.Sprintf("bitalostable: unknown compaction state: %d", m.CompactionState))
   249  		}
   250  	}
   251  	m.CompactionState = to
   252  }
   253  
   254  // IsCompacting returns true if this file's compaction state is
   255  // CompactionStateCompacting. Protected by DB.mu.
   256  func (m *FileMetadata) IsCompacting() bool {
   257  	return m.CompactionState == CompactionStateCompacting
   258  }
   259  
   260  // StatsValid returns true if the table stats have been populated. If StatValid
   261  // returns true, the Stats field may be read (with or without holding the
   262  // database mutex).
   263  func (m *FileMetadata) StatsValid() bool {
   264  	return atomic.LoadUint32(&m.Atomic.statsValid) == 1
   265  }
   266  
   267  // StatsValidLocked returns true if the table stats have been populated.
   268  // StatsValidLocked requires DB.mu is held when it's invoked, and it avoids the
   269  // overhead of an atomic load. This is possible because table stats validity is
   270  // only set while DB.mu is held.
   271  func (m *FileMetadata) StatsValidLocked() bool {
   272  	return m.Atomic.statsValid == 1
   273  }
   274  
   275  // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
   276  // while populating TableStats and calling StatsMarkValud. Once stats are
   277  // populated, they must not be mutated.
   278  func (m *FileMetadata) StatsMarkValid() {
   279  	atomic.StoreUint32(&m.Atomic.statsValid, 1)
   280  }
   281  
   282  // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
   283  // and overall table bounds with the given smallest and largest keys. The
   284  // smallest and largest bounds may not be extended if the table already has a
   285  // bound that is smaller or larger, respectively. The receiver is returned.
   286  // NB: calling this method should be preferred to manually setting the bounds by
   287  // manipulating the fields directly, to maintain certain invariants.
   288  func (m *FileMetadata) ExtendPointKeyBounds(
   289  	cmp Compare, smallest, largest InternalKey,
   290  ) *FileMetadata {
   291  	// Update the point key bounds.
   292  	if !m.HasPointKeys {
   293  		m.SmallestPointKey, m.LargestPointKey = smallest, largest
   294  		m.HasPointKeys = true
   295  	} else {
   296  		if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
   297  			m.SmallestPointKey = smallest
   298  		}
   299  		if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
   300  			m.LargestPointKey = largest
   301  		}
   302  	}
   303  	// Update the overall bounds.
   304  	m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
   305  	return m
   306  }
   307  
   308  // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
   309  // and overall table bounds with the given smallest and largest keys. The
   310  // smallest and largest bounds may not be extended if the table already has a
   311  // bound that is smaller or larger, respectively. The receiver is returned.
   312  // NB: calling this method should be preferred to manually setting the bounds by
   313  // manipulating the fields directly, to maintain certain invariants.
   314  func (m *FileMetadata) ExtendRangeKeyBounds(
   315  	cmp Compare, smallest, largest InternalKey,
   316  ) *FileMetadata {
   317  	// Update the range key bounds.
   318  	if !m.HasRangeKeys {
   319  		m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
   320  		m.HasRangeKeys = true
   321  	} else {
   322  		if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
   323  			m.SmallestRangeKey = smallest
   324  		}
   325  		if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
   326  			m.LargestRangeKey = largest
   327  		}
   328  	}
   329  	// Update the overall bounds.
   330  	m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
   331  	return m
   332  }
   333  
   334  // extendOverallBounds attempts to extend the overall table lower and upper
   335  // bounds. The given bounds may not be used if a lower or upper bound already
   336  // exists that is smaller or larger than the given keys, respectively. The given
   337  // boundType will be used if the bounds are updated.
   338  func (m *FileMetadata) extendOverallBounds(
   339  	cmp Compare, smallest, largest InternalKey, bTyp boundType,
   340  ) {
   341  	if !m.boundsSet {
   342  		m.Smallest, m.Largest = smallest, largest
   343  		m.boundsSet = true
   344  		m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
   345  	} else {
   346  		if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
   347  			m.Smallest = smallest
   348  			m.boundTypeSmallest = bTyp
   349  		}
   350  		if base.InternalCompare(cmp, largest, m.Largest) > 0 {
   351  			m.Largest = largest
   352  			m.boundTypeLargest = bTyp
   353  		}
   354  	}
   355  }
   356  
   357  const (
   358  	maskContainsPointKeys = 1 << 0 // boundsMarker sets this bit when the table has point keys
   359  	maskSmallest          = 1 << 1 // boundsMarker sets this bit when the smallest overall key is a point key
   360  	maskLargest           = 1 << 2 // boundsMarker sets this bit when the largest overall key is a point key
   361  )
   362  
   363  // boundsMarker returns a marker byte whose bits encode the following
   364  // information (in order from least significant bit):
   365  // - if the table contains point keys
   366  // - if the table's smallest key is a point key
   367  // - if the table's largest key is a point key
   368  func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
   369  	if m.HasPointKeys {
   370  		sentinel |= maskContainsPointKeys
   371  	}
   372  	switch m.boundTypeSmallest {
   373  	case boundTypePointKey:
   374  		sentinel |= maskSmallest
   375  	case boundTypeRangeKey:
   376  		// No op - leave bit unset.
   377  	default:
   378  		return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
   379  	}
   380  	switch m.boundTypeLargest {
   381  	case boundTypePointKey:
   382  		sentinel |= maskLargest
   383  	case boundTypeRangeKey:
   384  		// No op - leave bit unset.
   385  	default:
   386  		return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
   387  	}
   388  	return
   389  }
   390  
   391  // String implements fmt.Stringer, printing the file number and the overall
   392  // table bounds.
   393  func (m *FileMetadata) String() string {
   394  	return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
   395  }
   396  
   397  // DebugString returns a verbose representation of FileMetadata, typically for
   398  // use in tests and debugging, returning the file number and the point, range
   399  // and overall bounds for the table.
   400  func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
   401  	var b bytes.Buffer
   402  	fmt.Fprintf(&b, "%s:[%s-%s]",
   403  		m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
   404  	if !verbose {
   405  		return b.String()
   406  	}
   407  	if m.HasPointKeys {
   408  		fmt.Fprintf(&b, " points:[%s-%s]",
   409  			m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
   410  	}
   411  	if m.HasRangeKeys {
   412  		fmt.Fprintf(&b, " ranges:[%s-%s]",
   413  			m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
   414  	}
   415  	return b.String()
   416  }
   417  
   418  // ParseFileMetadataDebug parses a FileMetadata from its DebugString
   419  // representation.
   420  func ParseFileMetadataDebug(s string) (m FileMetadata, err error) {
   421  	// Split lines of the form:
   422  	//  000000:[a#0,SET-z#0,SET] points:[...] ranges:[...]
   423  	fields := strings.FieldsFunc(s, func(c rune) bool {
   424  		switch c {
   425  		case ':', '[', '-', ']':
   426  			return true
   427  		default:
   428  			return unicode.IsSpace(c) // NB: also trim whitespace padding.
   429  		}
   430  	})
   431  	if len(fields)%3 != 0 {
   432  		return m, errors.Newf("malformed input: %s", s)
   433  	}
   434  	for len(fields) > 0 {
   435  		prefix := fields[0]
   436  		smallest := base.ParsePrettyInternalKey(fields[1])
   437  		largest := base.ParsePrettyInternalKey(fields[2])
   438  		switch prefix {
   439  		case "points":
   440  			m.SmallestPointKey, m.LargestPointKey = smallest, largest
   441  			m.HasPointKeys = true
   442  		case "ranges":
   443  			m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
   444  			m.HasRangeKeys = true
   445  		default:
   446  			fileNum, err := strconv.ParseUint(prefix, 10, 64)
   447  			if err != nil {
   448  				return m, errors.Newf("malformed input: %s: %s", s, err)
   449  			}
   450  			m.FileNum = base.FileNum(fileNum)
   451  			m.Smallest, m.Largest = smallest, largest
   452  			m.boundsSet = true
   453  		}
   454  		fields = fields[3:]
   455  	}
   456  	// By default, when the parser sees just the overall bounds, we set the point
   457  	// keys. This preserves backwards compatability with existing test cases that
   458  	// specify only the overall bounds.
   459  	if !m.HasPointKeys && !m.HasRangeKeys {
   460  		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
   461  		m.HasPointKeys = true
   462  	}
   463  	return
   464  }
   465  
   466  // Validate validates the metadata for consistency with itself, returning an
   467  // error if inconsistent.
   468  func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
   469  	// Combined range and point key validation.
   470  
   471  	if !m.HasPointKeys && !m.HasRangeKeys {
   472  		return base.CorruptionErrorf("file %s has neither point nor range keys",
   473  			errors.Safe(m.FileNum))
   474  	}
   475  	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
   476  		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
   477  			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
   478  			m.Largest.Pretty(formatKey))
   479  	}
   480  	if m.SmallestSeqNum > m.LargestSeqNum {
   481  		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
   482  			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
   483  	}
   484  
   485  	// Point key validation.
   486  
   487  	if m.HasPointKeys {
   488  		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
   489  			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
   490  				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
   491  				m.LargestPointKey.Pretty(formatKey))
   492  		}
   493  		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
   494  			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
   495  			return base.CorruptionErrorf(
   496  				"file %s has inconsistent point key bounds relative to overall bounds: "+
   497  					"overall = [%s-%s], point keys = [%s-%s]",
   498  				errors.Safe(m.FileNum),
   499  				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
   500  				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
   501  			)
   502  		}
   503  	}
   504  
   505  	// Range key validation.
   506  
   507  	if m.HasRangeKeys {
   508  		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
   509  			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
   510  				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
   511  				m.LargestRangeKey.Pretty(formatKey))
   512  		}
   513  		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
   514  			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
   515  			return base.CorruptionErrorf(
   516  				"file %s has inconsistent range key bounds relative to overall bounds: "+
   517  					"overall = [%s-%s], range keys = [%s-%s]",
   518  				errors.Safe(m.FileNum),
   519  				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
   520  				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
   521  			)
   522  		}
   523  	}
   524  
   525  	return nil
   526  }
   527  
   528  // TableInfo returns a subset of the FileMetadata state formatted as a
   529  // TableInfo.
   530  func (m *FileMetadata) TableInfo() TableInfo {
   531  	return TableInfo{
   532  		FileNum:        m.FileNum,
   533  		Size:           m.Size,
   534  		Smallest:       m.Smallest,
   535  		Largest:        m.Largest,
   536  		SmallestSeqNum: m.SmallestSeqNum,
   537  		LargestSeqNum:  m.LargestSeqNum,
   538  	}
   539  }
   540  
   541  func cmpUint64(a, b uint64) int {
   542  	switch {
   543  	case a < b:
   544  		return -1
   545  	case a > b:
   546  		return +1
   547  	default:
   548  		return 0
   549  	}
   550  }
   551  
   552  func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
   553  	// NB: This is the same ordering that RocksDB uses for L0 files.
   554  
   555  	// Sort first by largest sequence number.
   556  	if m.LargestSeqNum != b.LargestSeqNum {
   557  		return cmpUint64(m.LargestSeqNum, b.LargestSeqNum)
   558  	}
   559  	// Then by smallest sequence number.
   560  	if m.SmallestSeqNum != b.SmallestSeqNum {
   561  		return cmpUint64(m.SmallestSeqNum, b.SmallestSeqNum)
   562  	}
   563  	// Break ties by file number.
   564  	return cmpUint64(uint64(m.FileNum), uint64(b.FileNum))
   565  }
   566  
   567  func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
   568  	return m.cmpSeqNum(b) < 0
   569  }
   570  
   571  func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
   572  	return base.InternalCompare(cmp, m.Smallest, b.Smallest)
   573  }
   574  
   575  // KeyRange returns the minimum smallest and maximum largest internalKey for
   576  // all the FileMetadata in iters.
   577  func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
   578  	first := true
   579  	for _, iter := range iters {
   580  		for meta := iter.First(); meta != nil; meta = iter.Next() {
   581  			if first {
   582  				first = false
   583  				smallest, largest = meta.Smallest, meta.Largest
   584  				continue
   585  			}
   586  			if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
   587  				smallest = meta.Smallest
   588  			}
   589  			if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
   590  				largest = meta.Largest
   591  			}
   592  		}
   593  	}
   594  	return smallest, largest
   595  }
   596  
   597  type bySeqNum []*FileMetadata
   598  
   599  func (b bySeqNum) Len() int { return len(b) }
   600  func (b bySeqNum) Less(i, j int) bool {
   601  	return b[i].lessSeqNum(b[j])
   602  }
   603  func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
   604  
   605  // SortBySeqNum sorts the specified files by increasing sequence number.
   606  func SortBySeqNum(files []*FileMetadata) {
   607  	sort.Sort(bySeqNum(files))
   608  }
   609  
   610  type bySmallest struct {
   611  	files []*FileMetadata
   612  	cmp   Compare
   613  }
   614  
   615  func (b bySmallest) Len() int { return len(b.files) }
   616  func (b bySmallest) Less(i, j int) bool {
   617  	return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
   618  }
   619  func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }
   620  
   621  // SortBySmallest sorts the specified files by smallest key using the supplied
   622  // comparison function to order user keys.
   623  func SortBySmallest(files []*FileMetadata, cmp Compare) {
   624  	sort.Sort(bySmallest{files, cmp})
   625  }
   626  
        // overlaps returns the LevelSlice of files reachable from iter whose bounds
        // overlap the user key range from start to end. The end key is treated as
        // exclusive when exclusiveEnd is true and as inclusive otherwise. Seeking
        // is performed on clones of iter.
        // NOTE(review): the seek logic presumably relies on the files being sorted
        // by key with non-overlapping bounds — confirm against callers.
   627  func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
   628  	startIter := iter.Clone()
   629  	startIter.SeekGE(cmp, start)
   630  
   631  	// SeekGE compares user keys. The user key `start` may be equal to the
   632  	// f.Largest because f.Largest is a range deletion sentinel, indicating that
   633  	// the user key `start` is NOT contained within the file f. If that's the
   634  	// case, we can narrow the overlapping bounds to exclude the file with the
   635  	// sentinel.
   636  	if f := startIter.Current(); f != nil && f.Largest.IsExclusiveSentinel() &&
   637  		cmp(f.Largest.UserKey, start) == 0 {
   638  		startIter.Next()
   639  	}
   640  
   641  	endIter := iter.Clone()
   642  	endIter.SeekGE(cmp, end)
   643  
   644  	if !exclusiveEnd {
   645  		// endIter is now pointing at the *first* file with a largest key >= end.
   646  		// If there are multiple files including the user key `end`, we want all
   647  		// of them, so move forward.
   648  		for f := endIter.Current(); f != nil && cmp(f.Largest.UserKey, end) == 0; {
   649  			f = endIter.Next()
   650  		}
   651  	}
   652  
   653  	// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
   654  	// or nexted too far because Largest.UserKey equaled `end`, go back.
   655  	//
   656  	// Consider !exclusiveEnd and end = 'f', with the following file bounds:
   657  	//
   658  	//     [b,d] [e, f] [f, f] [g, h]
   659  	//
   660  	// the above for loop will Next until it arrives at [g, h]. We need to
   661  	// observe that g > f, and Prev to the file with bounds [f, f].
   662  	if !endIter.iter.valid() {
   663  		endIter.Prev()
   664  	} else if c := cmp(endIter.Current().Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
   665  		endIter.Prev()
   666  	}
   667  
   668  	iter = startIter.Clone() // the returned slice's iterator starts at the first overlapping file
   669  	return LevelSlice{
   670  		iter:  iter.iter,
   671  		start: &startIter.iter,
   672  		end:   &endIter.iter,
   673  	}
   674  }
   675  
   676  // NumLevels is the number of levels a Version contains. It sizes the per-level arrays Version.Levels and Version.RangeKeyLevels.
   677  const NumLevels = 7
   678  
   679  // NewVersion constructs a new Version with the provided files. It requires
   680  // the provided files are already well-ordered. It's intended for testing.
   681  func NewVersion(
   682  	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
   683  ) *Version {
   684  	var v Version
   685  	for l := range files {
   686  		// NB: We specifically insert `files` into the B-Tree in the order
   687  		// they appear within `files`. Some tests depend on this behavior in
   688  		// order to test consistency checking, etc. Once we've constructed the
   689  		// initial B-Tree, we swap out the btreeCmp for the correct one.
   690  		// TODO(jackson): Adjust or remove the tests and remove this.
   691  		v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
   692  		v.Levels[l].level = l
   693  		if l == 0 {
   694  			v.Levels[l].tree.cmp = btreeCmpSeqNum
   695  		} else {
   696  			v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp)
   697  		}
   698  	}
   699  	if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
   700  		panic(err)
   701  	}
   702  	return &v
   703  }
   704  
// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are sorted by largest sequence number. Due to file
// ingestion, there may be overlap in the ranges of sequence numbers contained
// in level 0 sstables. In particular, it is valid for one level 0 sstable to
// have the seqnum range [1,100] while an adjacent sstable has the seqnum range
// [50,50]. This occurs when the [50,50] table was ingested and given a global
// seqnum. The ingestion code will have ensured that the [50,50] sstable will
// not have any keys that overlap with the [1,100] in the seqnum range
// [1,49]. The range of internal keys [fileMetadata.smallest,
// fileMetadata.largest] in each level 0 table may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.
type Version struct {
	// refs is the reference count of the version. It is read and modified
	// atomically by Refs, Ref, Unref and UnrefLocked.
	refs int32

	// The level 0 sstables are organized in a series of sublevels. Similar to
	// the seqnum invariant in normal levels, there is no internal key in a
	// higher level table that has both the same user key and a higher sequence
	// number. Within a sublevel, tables are sorted by their internal key range
	// and any two tables at the same sublevel do not overlap. Unlike the normal
	// levels, sublevel n contains older tables (lower sequence numbers) than
	// sublevel n+1.
	//
	// The L0Sublevels struct is mostly used for compaction picking. As most
	// internal data structures in it are only necessary for compaction picking
	// and not for iterator creation, the reference to L0Sublevels is nil'd
	// after this version becomes the non-newest version, to reduce memory
	// usage (see VersionList.PushBack).
	//
	// L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
	// in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
	// a reference to that slice, as that slice is necessary for iterator
	// creation and needs to outlast L0Sublevels.
	L0Sublevels     *L0Sublevels
	L0SublevelFiles []LevelSlice

	// Levels holds the file metadata of every level, indexed by level number.
	Levels [NumLevels]LevelMetadata

	// RangeKeyLevels holds a subset of the same files as Levels that contain range
	// keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
	// duplication should be minimal, as range keys are expected to be rare.
	RangeKeyLevels [NumLevels]LevelMetadata

	// The callback to invoke when the last reference to a version is
	// removed. Will be called with list.mu held.
	Deleted func(obsolete []*FileMetadata)

	// Stats holds aggregated stats about the version maintained from
	// version to version.
	Stats struct {
		// MarkedForCompaction records the count of files marked for
		// compaction within the version.
		MarkedForCompaction int
	}

	// The list the version is linked into. It is set by VersionList.PushBack
	// and cleared by VersionList.Remove.
	list *VersionList

	// The next/prev link for the versionList doubly-linked list of versions.
	prev, next *Version
}
   778  
   779  // String implements fmt.Stringer, printing the FileMetadata for each level in
   780  // the Version.
   781  func (v *Version) String() string {
   782  	return v.string(base.DefaultFormatter, false)
   783  }
   784  
   785  // DebugString returns an alternative format to String() which includes sequence
   786  // number and kind information for the sstable boundaries.
   787  func (v *Version) DebugString(format base.FormatKey) string {
   788  	return v.string(format, true)
   789  }
   790  
   791  func (v *Version) string(format base.FormatKey, verbose bool) string {
   792  	var buf bytes.Buffer
   793  	if len(v.L0SublevelFiles) > 0 {
   794  		for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
   795  			fmt.Fprintf(&buf, "0.%d:\n", sublevel)
   796  			v.L0SublevelFiles[sublevel].Each(func(f *FileMetadata) {
   797  				fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
   798  			})
   799  		}
   800  	}
   801  	for level := 1; level < NumLevels; level++ {
   802  		if v.Levels[level].Empty() {
   803  			continue
   804  		}
   805  		fmt.Fprintf(&buf, "%d:\n", level)
   806  		iter := v.Levels[level].Iter()
   807  		for f := iter.First(); f != nil; f = iter.Next() {
   808  			fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
   809  		}
   810  	}
   811  	return buf.String()
   812  }
   813  
   814  // ParseVersionDebug parses a Version from its DebugString output.
   815  func ParseVersionDebug(
   816  	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string,
   817  ) (*Version, error) {
   818  	var level int
   819  	var files [NumLevels][]*FileMetadata
   820  	for _, l := range strings.Split(s, "\n") {
   821  		l = strings.TrimSpace(l)
   822  
   823  		switch l[:2] {
   824  		case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:":
   825  			var err error
   826  			level, err = strconv.Atoi(l[:1])
   827  			if err != nil {
   828  				return nil, err
   829  			}
   830  		default:
   831  			m, err := ParseFileMetadataDebug(l)
   832  			if err != nil {
   833  				return nil, err
   834  			}
   835  			// If we only parsed overall bounds, default to setting the point bounds.
   836  			if !m.HasPointKeys && !m.HasRangeKeys {
   837  				m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
   838  				m.HasPointKeys = true
   839  			}
   840  			files[level] = append(files[level], &m)
   841  		}
   842  	}
   843  	// Reverse the order of L0 files. This ensures we construct the same
   844  	// sublevels. (They're printed from higher sublevel to lower, which means in
   845  	// a partial order that represents newest to oldest).
   846  	for i := 0; i < len(files[0])/2; i++ {
   847  		files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i]
   848  	}
   849  	return NewVersion(cmp, formatKey, flushSplitBytes, files), nil
   850  }
   851  
   852  // Refs returns the number of references to the version.
   853  func (v *Version) Refs() int32 {
   854  	return atomic.LoadInt32(&v.refs)
   855  }
   856  
   857  // Ref increments the version refcount.
   858  func (v *Version) Ref() {
   859  	atomic.AddInt32(&v.refs, 1)
   860  }
   861  
   862  // Unref decrements the version refcount. If the last reference to the version
   863  // was removed, the version is removed from the list of versions and the
   864  // Deleted callback is invoked. Requires that the VersionList mutex is NOT
   865  // locked.
   866  func (v *Version) Unref() {
   867  	if atomic.AddInt32(&v.refs, -1) == 0 {
   868  		obsolete := v.unrefFiles()
   869  		l := v.list
   870  		l.mu.Lock()
   871  		l.Remove(v)
   872  		v.Deleted(obsolete)
   873  		l.mu.Unlock()
   874  	}
   875  }
   876  
   877  // UnrefLocked decrements the version refcount. If the last reference to the
   878  // version was removed, the version is removed from the list of versions and
   879  // the Deleted callback is invoked. Requires that the VersionList mutex is
   880  // already locked.
   881  func (v *Version) UnrefLocked() {
   882  	if atomic.AddInt32(&v.refs, -1) == 0 {
   883  		v.list.Remove(v)
   884  		v.Deleted(v.unrefFiles())
   885  	}
   886  }
   887  
   888  func (v *Version) unrefFiles() []*FileMetadata {
   889  	var obsolete []*FileMetadata
   890  	for _, lm := range v.Levels {
   891  		obsolete = append(obsolete, lm.release()...)
   892  	}
   893  	return obsolete
   894  }
   895  
   896  // Next returns the next version in the list of versions.
   897  func (v *Version) Next() *Version {
   898  	return v.next
   899  }
   900  
   901  // InitL0Sublevels initializes the L0Sublevels
   902  func (v *Version) InitL0Sublevels(
   903  	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64,
   904  ) error {
   905  	var err error
   906  	v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
   907  	if err == nil && v.L0Sublevels != nil {
   908  		v.L0SublevelFiles = v.L0Sublevels.Levels
   909  	}
   910  	return err
   911  }
   912  
   913  // Contains returns a boolean indicating whether the provided file exists in
   914  // the version at the given level. If level is non-zero then Contains binary
   915  // searches among the files. If level is zero, Contains scans the entire
   916  // level.
   917  func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
   918  	iter := v.Levels[level].Iter()
   919  	if level > 0 {
   920  		overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey,
   921  			m.Largest.IsExclusiveSentinel())
   922  		iter = overlaps.Iter()
   923  	}
   924  	for f := iter.First(); f != nil; f = iter.Next() {
   925  		if f == m {
   926  			return true
   927  		}
   928  	}
   929  	return false
   930  }
   931  
   932  // Overlaps returns all elements of v.files[level] whose user key range
   933  // intersects the inclusive range [start, end]. If level is non-zero then the
   934  // user key ranges of v.files[level] are assumed to not overlap (although they
   935  // may touch). If level is zero then that assumption cannot be made, and the
   936  // [start, end] range is expanded to the union of those matching ranges so far
   937  // and the computation is repeated until [start, end] stabilizes.
   938  // The returned files are a subsequence of the input files, i.e., the ordering
   939  // is not changed.
   940  func (v *Version) Overlaps(
   941  	level int, cmp Compare, start, end []byte, exclusiveEnd bool,
   942  ) LevelSlice {
   943  	if level == 0 {
   944  		// Indices that have been selected as overlapping.
   945  		l0 := v.Levels[level]
   946  		l0Iter := l0.Iter()
   947  		selectedIndices := make([]bool, l0.Len())
   948  		numSelected := 0
   949  		var slice LevelSlice
   950  		for {
   951  			restart := false
   952  			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
   953  				selected := selectedIndices[i]
   954  				if selected {
   955  					continue
   956  				}
   957  				smallest := meta.Smallest.UserKey
   958  				largest := meta.Largest.UserKey
   959  				if c := cmp(largest, start); c < 0 || c == 0 && meta.Largest.IsExclusiveSentinel() {
   960  					// meta is completely before the specified range; skip it.
   961  					continue
   962  				}
   963  				if c := cmp(smallest, end); c > 0 || c == 0 && exclusiveEnd {
   964  					// meta is completely after the specified range; skip it.
   965  					continue
   966  				}
   967  				// Overlaps.
   968  				selectedIndices[i] = true
   969  				numSelected++
   970  
   971  				// Since level == 0, check if the newly added fileMetadata has
   972  				// expanded the range. We expand the range immediately for files
   973  				// we have remaining to check in this loop. All already checked
   974  				// and unselected files will need to be rechecked via the
   975  				// restart below.
   976  				if cmp(smallest, start) < 0 {
   977  					start = smallest
   978  					restart = true
   979  				}
   980  				if v := cmp(largest, end); v > 0 {
   981  					end = largest
   982  					exclusiveEnd = meta.Largest.IsExclusiveSentinel()
   983  					restart = true
   984  				} else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() {
   985  					// Only update the exclusivity of our existing `end`
   986  					// bound.
   987  					exclusiveEnd = false
   988  					restart = true
   989  				}
   990  			}
   991  
   992  			if !restart {
   993  				// Construct a B-Tree containing only the matching items.
   994  				var tr btree
   995  				tr.cmp = v.Levels[level].tree.cmp
   996  				for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
   997  					if selectedIndices[i] {
   998  						err := tr.insert(meta)
   999  						if err != nil {
  1000  							panic(err)
  1001  						}
  1002  					}
  1003  				}
  1004  				slice = LevelSlice{iter: tr.iter(), length: tr.length}
  1005  				// TODO(jackson): Avoid the oddity of constructing and
  1006  				// immediately releasing a B-Tree. Make LevelSlice an
  1007  				// interface?
  1008  				tr.release()
  1009  				break
  1010  			}
  1011  			// Continue looping to retry the files that were not selected.
  1012  		}
  1013  		return slice
  1014  	}
  1015  
  1016  	return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
  1017  }
  1018  
  1019  // CheckOrdering checks that the files are consistent with respect to
  1020  // increasing file numbers (for level 0 files) and increasing and non-
  1021  // overlapping internal key ranges (for level non-0 files).
  1022  func (v *Version) CheckOrdering(cmp Compare, format base.FormatKey) error {
  1023  	for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
  1024  		sublevelIter := v.L0SublevelFiles[sublevel].Iter()
  1025  		if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter); err != nil {
  1026  			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
  1027  		}
  1028  	}
  1029  
  1030  	for level, lm := range v.Levels {
  1031  		if err := CheckOrdering(cmp, format, Level(level), lm.Iter()); err != nil {
  1032  			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
  1033  		}
  1034  	}
  1035  	return nil
  1036  }
  1037  
  1038  // CheckConsistency checks that all of the files listed in the version exist
  1039  // and their on-disk sizes match the sizes listed in the version.
  1040  func (v *Version) CheckConsistency(dirname string, fs vfs.FS) error {
  1041  	var buf bytes.Buffer
  1042  	var args []interface{}
  1043  
  1044  	for level, files := range v.Levels {
  1045  		iter := files.Iter()
  1046  		for f := iter.First(); f != nil; f = iter.Next() {
  1047  			path := base.MakeFilepath(fs, dirname, base.FileTypeTable, f.FileNum)
  1048  			info, err := fs.Stat(path)
  1049  			if err != nil {
  1050  				buf.WriteString("L%d: %s: %v\n")
  1051  				args = append(args, errors.Safe(level), errors.Safe(f.FileNum), err)
  1052  				continue
  1053  			}
  1054  			if info.Size() != int64(f.Size) {
  1055  				buf.WriteString("L%d: %s: file size mismatch (%s): %d (disk) != %d (MANIFEST)\n")
  1056  				args = append(args, errors.Safe(level), errors.Safe(f.FileNum), path,
  1057  					errors.Safe(info.Size()), errors.Safe(f.Size))
  1058  				continue
  1059  			}
  1060  		}
  1061  	}
  1062  
  1063  	if buf.Len() == 0 {
  1064  		return nil
  1065  	}
  1066  	return errors.Errorf(buf.String(), args...)
  1067  }
  1068  
// VersionList holds a list of versions. The versions are ordered from oldest
// to newest.
type VersionList struct {
	// mu is the mutex supplied to Init; Version.Unref acquires it around
	// removing a version from the list and invoking its Deleted callback.
	//
	// root is the sentinel node of the circular doubly-linked list: root.next
	// is the oldest version, root.prev is the newest, and an empty list has
	// root linked to itself.
	mu   *sync.Mutex
	root Version
}
  1075  
  1076  // Init initializes the version list.
  1077  func (l *VersionList) Init(mu *sync.Mutex) {
  1078  	l.mu = mu
  1079  	l.root.next = &l.root
  1080  	l.root.prev = &l.root
  1081  }
  1082  
  1083  // Empty returns true if the list is empty, and false otherwise.
  1084  func (l *VersionList) Empty() bool {
  1085  	return l.root.next == &l.root
  1086  }
  1087  
// Front returns the oldest version in the list. Note that this version is only
// valid if Empty() returns false: on an empty list the returned pointer is the
// list's internal root sentinel rather than a real version.
func (l *VersionList) Front() *Version {
	return l.root.next
}
  1093  
// Back returns the newest version in the list. Note that this version is only
// valid if Empty() returns false: on an empty list the returned pointer is the
// list's internal root sentinel rather than a real version.
func (l *VersionList) Back() *Version {
	return l.root.prev
}
  1099  
  1100  // PushBack adds a new version to the back of the list. This new version
  1101  // becomes the "newest" version in the list.
  1102  func (l *VersionList) PushBack(v *Version) {
  1103  	if v.list != nil || v.prev != nil || v.next != nil {
  1104  		panic("bitalostable: version list is inconsistent")
  1105  	}
  1106  	v.prev = l.root.prev
  1107  	v.prev.next = v
  1108  	v.next = &l.root
  1109  	v.next.prev = v
  1110  	v.list = l
  1111  	// Let L0Sublevels on the second newest version get GC'd, as it is no longer
  1112  	// necessary. See the comment in Version.
  1113  	v.prev.L0Sublevels = nil
  1114  }
  1115  
  1116  // Remove removes the specified version from the list.
  1117  func (l *VersionList) Remove(v *Version) {
  1118  	if v == &l.root {
  1119  		panic("bitalostable: cannot remove version list root node")
  1120  	}
  1121  	if v.list != l {
  1122  		panic("bitalostable: version list is inconsistent")
  1123  	}
  1124  	v.prev.next = v.next
  1125  	v.next.prev = v.prev
  1126  	v.next = nil // avoid memory leaks
  1127  	v.prev = nil // avoid memory leaks
  1128  	v.list = nil // avoid memory leaks
  1129  }
  1130  
  1131  // CheckOrdering checks that the files are consistent with respect to
  1132  // seqnums (for level 0 files -- see detailed comment below) and increasing and non-
  1133  // overlapping internal key ranges (for non-level 0 files).
  1134  func CheckOrdering(cmp Compare, format base.FormatKey, level Level, files LevelIterator) error {
  1135  	// The invariants to check for L0 sublevels are the same as the ones to
  1136  	// check for all other levels. However, if L0 is not organized into
  1137  	// sublevels, or if all L0 files are being passed in, we do the legacy L0
  1138  	// checks, defined in the detailed comment below.
  1139  	if level == Level(0) {
  1140  		// We have 2 kinds of files:
  1141  		// - Files with exactly one sequence number: these could be either ingested files
  1142  		//   or flushed files. We cannot tell the difference between them based on FileMetadata,
  1143  		//   so our consistency checking here uses the weaker checks assuming it is a narrow
  1144  		//   flushed file. We cannot error on ingested files having sequence numbers coincident
  1145  		//   with flushed files as the seemingly ingested file could just be a flushed file
  1146  		//   with just one key in it which is a truncated range tombstone sharing sequence numbers
  1147  		//   with other files in the same flush.
  1148  		// - Files with multiple sequence numbers: these are necessarily flushed files.
  1149  		//
  1150  		// Three cases of overlapping sequence numbers:
  1151  		// Case 1:
  1152  		// An ingested file contained in the sequence numbers of the flushed file -- it must be
  1153  		// fully contained (not coincident with either end of the flushed file) since the memtable
  1154  		// must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
  1155  		// num b, and the memtable got a subsequent update that was given sequence num b+1, before
  1156  		// being flushed.
  1157  		//
  1158  		// So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
  1159  		// third file are inconsistent with each other. So comparing adjacent files is insufficient
  1160  		// for consistency checking.
  1161  		//
  1162  		// Visually we have something like
  1163  		// x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
  1164  		//     y       y  y        y             (y's represent ingested files)
  1165  		// And these are ordered in increasing order of y. Note that y's must be unique.
  1166  		//
  1167  		// Case 2:
  1168  		// A flushed file that did not overlap in keys with any file in any level, but does overlap
  1169  		// in the file key intervals. This file is placed in L0 since it overlaps in the file
  1170  		// key intervals but since it has no overlapping data, it is assigned a sequence number
  1171  		// of 0 in RocksDB. We handle this case for compatibility with RocksDB.
  1172  		//
  1173  		// Case 3:
  1174  		// A sequence of flushed files that overlap in sequence numbers with one another,
  1175  		// but do not overlap in keys inside the sstables. These files correspond to
  1176  		// partitioned flushes or the results of intra-L0 compactions of partitioned
  1177  		// flushes.
  1178  		//
  1179  		// Since these types of SSTables violate most other sequence number
  1180  		// overlap invariants, and handling this case is important for compatibility
  1181  		// with future versions of bitalostable, this method relaxes most L0 invariant
  1182  		// checks.
  1183  
  1184  		var prev *FileMetadata
  1185  		for f := files.First(); f != nil; f, prev = files.Next(), f {
  1186  			if prev == nil {
  1187  				continue
  1188  			}
  1189  			// Validate that the sorting is sane.
  1190  			if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
  1191  				// Multiple files satisfying case 2 mentioned above.
  1192  			} else if !prev.lessSeqNum(f) {
  1193  				return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
  1194  					errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
  1195  					errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
  1196  					errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
  1197  			}
  1198  		}
  1199  	} else {
  1200  		var prev *FileMetadata
  1201  		for f := files.First(); f != nil; f, prev = files.Next(), f {
  1202  			if err := f.Validate(cmp, format); err != nil {
  1203  				return errors.Wrapf(err, "%s ", level)
  1204  			}
  1205  			if prev != nil {
  1206  				if prev.cmpSmallestKey(f, cmp) >= 0 {
  1207  					return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
  1208  						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
  1209  						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
  1210  						f.Smallest.Pretty(format), f.Largest.Pretty(format))
  1211  				}
  1212  				if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
  1213  					return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
  1214  						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
  1215  						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
  1216  						f.Smallest.Pretty(format), f.Largest.Pretty(format))
  1217  				}
  1218  			}
  1219  		}
  1220  	}
  1221  	return nil
  1222  }