github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/format.go (about)

     1  // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"github.com/cockroachdb/errors"
     9  	"github.com/cockroachdb/pebble/internal/base"
    10  )
    11  
    12  // TableFormat specifies the format version for sstables. The legacy LevelDB
    13  // format is format version 1; the zero value is TableFormatUnspecified.
    14  type TableFormat uint32
    15  
    16  // The available table formats, each representing a (magic number, version
    17  // number) tuple. Note that these values are not (and should not be) serialized
    18  // to disk. The ordering should follow the order the versions were introduced
    19  // to Pebble (i.e. the history is linear).
    20  const (
    21  	TableFormatUnspecified TableFormat = iota // Zero value; returned by ParseTableFormat alongside an error.
    22  	TableFormatLevelDB                        // Legacy LevelDB format.
    23  	TableFormatRocksDBv2                      // RocksDB format, version 2.
    24  	TableFormatPebblev1                       // Block properties.
    25  	TableFormatPebblev2                       // Range keys.
    26  	TableFormatPebblev3                       // Value blocks.
    27  	TableFormatPebblev4                       // DELSIZED tombstones.
    28  	NumTableFormats                           // Number of values declared above, including TableFormatUnspecified.
    29  
    30  	TableFormatMax = NumTableFormats - 1 // Last format declared above (currently TableFormatPebblev4).
    31  )
    32  
    33  // TableFormatPebblev4, in addition to DELSIZED, introduces the use of
    34  // InternalKeyKindSSTableInternalObsoleteBit.
    35  //
    36  // 1. Motivation
    37  //
    38  // We have various related problems caused by Pebble snapshots:
    39  //
    40  // - P1: RANGEDELs that delete points in the same sstable, but the points
    41  //   happen to not get deleted during compactions because of an open snapshot.
    42  //   This causes very expensive iteration, that has been observed in
    43  //   production deployments.
    44  //
    45  // - P2: When iterating over a foreign sstable (in disaggregated storage), we
    46  //   need to do (a) point collapsing to expose at most one point per user key,
    47  //   (b) apply RANGEDELs in the sstable to hide deleted points in the same
    48  //   sstable. This per-sstable point collapsing iteration needs to be very
    49  //   efficient (ideally as efficient from a CPU perspective as iteration over
    50  //   regular sstables) since foreign sstables can be very long-lived -- one of
    51  //   the goals of disaggregated storage is to scale compute and disk bandwidth
    52  //   resources as a function of the hot (from a write perspective) data and
    53  //   not the whole data, so we don't want to have to rewrite foreign sstables
    54  //   solely to improve read performance.
    55  //
    56  // The ideal solution for P2 would allow user-facing reads to utilize the
    57  // existing SST iterators (with slight modifications) and with no loss of
    58  // efficiency. And for P1 and P2 we would like to skip whole blocks of
    59  // overwritten/deleted points. Even when we can't skip whole blocks, avoiding
    60  // key comparisons at iteration time to discover what points are deleted is
    61  // very desirable, since keys can be long.
    62  //
    63  // We observe that:
    64  //
    65  // - Reads:
    66  //   - All user-facing reads in CockroachDB use iterators over the DB, hence
    67  //     have a higher read seqnum than all sstables (there are some rare cases
    68  //     that can violate this, but those are not important from a performance
    69  //     optimization perspective).
    70  //
    71  //   - Certain internal-facing reads in CockroachDB use snapshots, but the
    72  //     snapshots are short-lived enough that most L5 and L6 sstables will have
    73  //     all seqnums lower than the snapshot seqnum.
    74  //
    75  // - Writes:
    76  //   - We already do key comparisons between points when writing the sstable
    77  //     to ensure that the sstable invariant (monotonically increasing internal
    78  //     keys) is not violated. So we know which points share the same userkey,
    79  //     and thereby which points are obsolete because there is a more recent
    80  //     point in the same sstable.
    81  //
    82  //   - The compactionIter knows which point is deleted by a RANGEDEL even if
    83  //     the point does need to be written because of a snapshot.
    84  //
    85  //   So this known information can be encoded in the sstable at write time and
    86  //   utilized for optimized reading.
    87  //
    88  // 2. Solution
    89  //
    90  // We primarily scope the solution to the following point kinds: SET,
    91  // SETWITHDEL, DEL, DELSIZED, SINGLEDEL. These are the ones marked locally
    92  // obsolete, i.e., obsolete within the sstable, and we can guarantee that at
    93  // most one point will be exposed per user key. MERGE keys create more
    94  // complexity: MERGE followed by MERGE causes multiple keys to not be
    95  // obsolete. Same applies for MERGE followed by SET/SETWITHDEL/DEL*. Note
    96  // that:
    97  //
    98  // - For regular sst iteration, the obsolete marking is a performance
    99  //   optimization, and multiple keys for the same userkey can be handled by
   100  //   higher layers in the iterator tree (specifically pebble.Iterator).
   101  //
   102  // - For foreign sst iteration, we disallow MERGEs to be written to such
   103  //   shared ssts (details below).
   104  //
   105  // The key kinds are marked with an obsolete bit
   106  // (InternalKeyKindSSTableInternalObsoleteBit) when the key-value pair is
   107  // obsolete. This marking is done within blockWriter, based on information
   108  // passed to it by Writer. In turn, Writer uses a combination of key
   109  // comparisons, and information provided by compactionIter to decide whether a
   110  // key-value pair is obsolete. Additionally, a Pebble-internal
   111  // BlockPropertyCollector (obsoleteKeyBlockPropertyCollector) is used to mark
   112  // blocks where all key-value pairs are obsolete. Since the common case is
   113  // non-obsolete blocks, this block property collector uses the empty byte
   114  // slice to represent a non-obsolete block, which consumes no space in
   115  // BlockHandleWithProperties.Props.
   116  //
   117  // At read time, the obsolete bit is only visible to the blockIter, which can
   118  // be optionally configured to hide obsolete points. This hiding is only
   119  // configured for data block iterators for sstables being read by user-facing
   120  // iterators at a seqnum greater than the max seqnum in the sstable.
   121  // Additionally, when this hiding is configured, a Pebble-internal block
   122  // property filter (obsoleteKeyBlockPropertyFilter), is used to skip whole
   123  // blocks that are obsolete.
   124  //
   125  // 2.1 Correctness
   126  //
   127  // Due to the level invariant, the sequence of seqnums for a user key in a
   128  // sstable represents a contiguous subsequence of the seqnums for the userkey
   129  // across the whole LSM, and is more recent than the seqnums in a sstable in a
   130  // lower level. So exposing exactly one point from a sstable for a userkey
   131  // will also mask the points for the userkey in lower levels. If we expose no
   132  // point, because of RANGEDELs, that RANGEDEL will also mask the points in
   133  // lower levels.
   134  //
   135  // Note that we do not need to do anything special at write time for
   136  // SETWITHDEL and SINGLEDEL. This is because these key kinds are treated
   137  // specially only by compactions, which do not hide obsolete points. For
   138  // regular reads, SETWITHDEL behaves the same as SET and SINGLEDEL behaves the
   139  // same as DEL.
   140  //
   141  // 2.2 Strictness and MERGE
   142  //
   143  // Setting the obsolete bit on point keys is advanced usage, so we support two
   144  // modes, both of which must be truthful when setting the obsolete bit, but
   145  // vary in when they don't set the obsolete bit.
   146  //
   147  // - Non-strict: In this mode, the bit does not need to be set for keys that
   148  //   are obsolete. Additionally, any sstable containing MERGE keys can only
   149  //   use this mode. An iterator over such an sstable, when configured to
   150  //   hideObsoletePoints, can expose multiple internal keys per user key, and
   151  //   can expose keys that are deleted by rangedels in the same sstable. This
   152  //   is the mode that non-advanced users should use. Pebble without
   153  //   disaggregated storage will also use this mode and will best-effort set
   154  //   the obsolete bit, to optimize iteration when snapshots have retained many
   155  //   obsolete keys.
   156  //
   157  // - Strict: In this mode, every obsolete key must have the obsolete bit set,
   158  //   and no MERGE keys are permitted. An iterator over such an sstable, when
   159  //   configured to hideObsoletePoints, satisfies two properties:
   160  //   - S1: will expose at most one internal key per user key, which is the
   161  //     most recent one.
   162  //   - S2: will never expose keys that are deleted by rangedels in the same
   163  //     sstable.
   164  //
   165  //   This is the mode for two use cases in disaggregated storage (which will
   166  //   exclude parts of the key space that have MERGEs), for levels that contain
   167  //   sstables that can become foreign sstables:
   168  //   - Pebble compaction output to these levels that can become foreign
   169  //     sstables.
   170  //
   171  //   - CockroachDB ingest operations that can ingest into the levels that can
   172  //     become foreign sstables. Note, these are not sstables corresponding to
   173  //     copied data for CockroachDB range snapshots. This case occurs for
   174  //     operations like index backfills: these trivially satisfy the strictness
   175  //     criteria since they only write one key per userkey.
   176  //
   177  //     TODO(sumeer): this latter case is not currently supported, since only
   178  //     Writer.AddWithForceObsolete calls are permitted for writing strict
   179  //     obsolete sstables. This is done to reduce the likelihood of bugs. One
   180  //     simple way to lift this limitation would be to disallow adding any
   181  //     RANGEDELs when a Pebble-external writer is trying to construct a strict
   182  //     obsolete sstable.
   183  
   184  // ParseTableFormat parses the given magic bytes and version into its
   185  // corresponding internal TableFormat.
   186  func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) {
   187  	switch string(magic) {
   188  	case levelDBMagic:
   189  		return TableFormatLevelDB, nil
   190  	case rocksDBMagic:
   191  		if version != rocksDBFormatVersion2 {
   192  			return TableFormatUnspecified, base.CorruptionErrorf(
   193  				"pebble/table: unsupported rocksdb format version %d", errors.Safe(version),
   194  			)
   195  		}
   196  		return TableFormatRocksDBv2, nil
   197  	case pebbleDBMagic:
   198  		switch version {
   199  		case 1:
   200  			return TableFormatPebblev1, nil
   201  		case 2:
   202  			return TableFormatPebblev2, nil
   203  		case 3:
   204  			return TableFormatPebblev3, nil
   205  		case 4:
   206  			return TableFormatPebblev4, nil
   207  		default:
   208  			return TableFormatUnspecified, base.CorruptionErrorf(
   209  				"pebble/table: unsupported pebble format version %d", errors.Safe(version),
   210  			)
   211  		}
   212  	default:
   213  		return TableFormatUnspecified, base.CorruptionErrorf(
   214  			"pebble/table: invalid table (bad magic number: 0x%x)", magic,
   215  		)
   216  	}
   217  }
   218  
   219  // AsTuple returns the TableFormat's (Magic String, Version) tuple.
   220  func (f TableFormat) AsTuple() (string, uint32) {
   221  	switch f {
   222  	case TableFormatLevelDB:
   223  		return levelDBMagic, 0
   224  	case TableFormatRocksDBv2:
   225  		return rocksDBMagic, 2
   226  	case TableFormatPebblev1:
   227  		return pebbleDBMagic, 1
   228  	case TableFormatPebblev2:
   229  		return pebbleDBMagic, 2
   230  	case TableFormatPebblev3:
   231  		return pebbleDBMagic, 3
   232  	case TableFormatPebblev4:
   233  		return pebbleDBMagic, 4
   234  	default:
   235  		panic("sstable: unknown table format version tuple")
   236  	}
   237  }
   238  
   239  // String returns the TableFormat (Magic String,Version) tuple.
   240  func (f TableFormat) String() string {
   241  	switch f {
   242  	case TableFormatLevelDB:
   243  		return "(LevelDB)"
   244  	case TableFormatRocksDBv2:
   245  		return "(RocksDB,v2)"
   246  	case TableFormatPebblev1:
   247  		return "(Pebble,v1)"
   248  	case TableFormatPebblev2:
   249  		return "(Pebble,v2)"
   250  	case TableFormatPebblev3:
   251  		return "(Pebble,v3)"
   252  	case TableFormatPebblev4:
   253  		return "(Pebble,v4)"
   254  	default:
   255  		panic("sstable: unknown table format version tuple")
   256  	}
   257  }