github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/table.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  // Package sstable implements readers and writers of pebble tables.
     6  //
     7  // Tables are either opened for reading or created for writing but not both.
     8  //
     9  // A reader can create iterators, which allow seeking and next/prev
    10  // iteration. There may be multiple key/value pairs that have the same key and
    11  // different sequence numbers.
    12  //
    13  // A reader can be used concurrently. Multiple goroutines can call NewIter
    14  // concurrently, and each iterator can run concurrently with other iterators.
    15  // However, any particular iterator should not be used concurrently, and iterators
    16  // should not be used once a reader is closed.
    17  //
    18  // A writer writes key/value pairs in increasing key order, and cannot be used
    19  // concurrently. A table cannot be read until the writer has finished.
    20  //
    21  // Readers and writers can be created with various options. Passing a nil
    22  // Options pointer is valid and means to use the default values.
    23  //
    24  // One such option is to define the 'less than' ordering for keys. The default
    25  // Comparer uses the natural ordering consistent with bytes.Compare. The same
    26  // ordering should be used for reading and writing a table.
    27  //
    28  // To return the value for a key:
    29  //
    30  //	r := table.NewReader(file, options)
    31  //	defer r.Close()
    32  //	i := r.NewIter(nil, nil)
    33  //	defer i.Close()
    34  //	ikey, value := r.SeekGE(key)
    35  //	if options.Comparer.Compare(ikey.UserKey, key) != 0 {
    36  //	  // not found
    37  //	} else {
    38  //	  // value is the first record containing key
    39  //	}
    40  //
    41  // To count the number of entries in a table:
    42  //
    43  //	i, n := r.NewIter(nil, nil), 0
    44  //	for key, value := i.First(); key != nil; key, value = i.Next() {
    45  //		n++
    46  //	}
    47  //	if err := i.Close(); err != nil {
    48  //		return 0, err
    49  //	}
    50  //	return n, nil
    51  //
    52  // To write a table with three entries:
    53  //
    54  //	w := table.NewWriter(file, options)
    55  //	if err := w.Set([]byte("apple"), []byte("red")); err != nil {
    56  //		w.Close()
    57  //		return err
    58  //	}
    59  //	if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
    60  //		w.Close()
    61  //		return err
    62  //	}
    63  //	if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
    64  //		w.Close()
    65  //		return err
    66  //	}
    67  //	return w.Close()
    68  package sstable // import "github.com/cockroachdb/pebble/sstable"
    69  
    70  import (
    71  	"context"
    72  	"encoding/binary"
    73  
    74  	"github.com/cockroachdb/errors"
    75  	"github.com/cockroachdb/pebble/internal/base"
    76  	"github.com/cockroachdb/pebble/objstorage"
    77  )
    78  
    79  /*
    80  The table file format looks like:
    81  
    82  <start_of_file>
    83  [data block 0]
    84  [data block 1]
    85  ...
    86  [data block N-1]
    87  [meta filter block] (optional)
    88  [index block] (for single level index)
    89  [meta rangedel block] (optional)
    90  [meta range key block] (optional)
    91  [value block 0] (optional)
    92  [value block M-1] (optional)
    93  [meta value index block] (optional)
    94  [meta properties block]
    95  [metaindex block]
    96  [footer]
    97  <end_of_file>
    98  
    99  A Reader eagerly loads the footer, metaindex block and meta properties block,
   100  because the data contained in those blocks is needed on every read, and even
   101  before reading. For example, the meta properties block is used to verify the
   102  comparer and merger are compatible, and the metaindex block contains the
   103  location of the meta properties (and other meta blocks). In situations where
   104  file system locality matters, or one wants to minimize number of read
   105  requests when eagerly loading these blocks, having these three as a suffix
   106  of the file is convenient.
   107  
   108  The interleaving of the index block(s) between the meta blocks is done to
   109  match RocksDB/LevelDB behavior.
   110  
   111  Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
   112  4 byte checksum. The checksum is computed over the compressed data and the first
   113  byte of the trailer (i.e. the block type), and is serialized as little-endian.
   114  The block type gives the per-block compression used; each block is compressed
   115  independently. The checksum algorithm is described in the pebble/crc package.
   116  
   117  Most blocks, other than the meta filter block, value blocks and meta value
   118  index block, contain key/value pairs. The remainder of this comment refers to
   119  the decompressed block, containing key/value pairs, which has its 5 byte
   120  trailer stripped. The decompressed block data consists of a sequence of such
   121  key/value entries followed by a block suffix. Each key is encoded as a shared
   122  prefix length and a remainder string. For example, if two adjacent keys are
   123  "tweedledee" and "tweedledum", then the second key would be encoded as {8,
   124  "um"}. The shared prefix length is varint encoded. The remainder string and the
   125  value are encoded as a varint-encoded length followed by the literal contents.
   126  To continue the example, suppose that the key "tweedledum" mapped to the value
   127  "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
   128  
   129  Every block has a restart interval I. Every I'th key/value entry in that block
   130  is called a restart point, and shares no key prefix with the previous entry.
   131  Continuing the example above, if the key after "tweedledum" was "two", but was
   132  part of a restart point, then that key would be encoded as {0, "two"} instead
   133  of {2, "o"}. If a block has P restart points, then the block suffix consists
   134  of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
   135  uint32 values are the block offsets of each restart point. The final uint32
   136  value is P itself. Thus, when seeking for a particular key, one can use binary
   137  search to find the largest restart point whose key is <= the key sought.
   138  
   139  An index block is a block with N key/value entries. The i'th value is the
   140  encoded block handle of the i'th data block. The i'th key is a separator for
   141  i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
   142  is a key that is >= every key in block i and is < every key i block i+1. The
   143  successor for the final block is a key that is >= every key in block N-1. The
   144  index block restart interval is 1: every entry is a restart point.
   145  
   146  A block handle is an offset, a length, and optional block properties (for data
   147  blocks and first/lower level index blocks); the length does not include the 5
   148  byte trailer. All numbers are varint-encoded, with no padding between the two
   149  values. The maximum size of an encoded block handle without properties is 20
   150  bytes. It is not advised to have properties that accumulate to be longer than
   151  100 bytes.
   152  
   153  Instead of a single index block, the sstable can have a two-level index (this
   154  is used to prevent a single huge index block). A two-level index consists of a
   155  sequence of lower-level index blocks with block handles for data blocks
   156  followed by a single top-level index block with block handles for the
   157  lower-level index blocks.
   158  
   159  The metaindex block also contains block handles as values, with keys being
   160  the names of the meta blocks.
   161  
   162  For a description of value blocks and the meta value index block, see
   163  value_block.go.
   164  
   165  Data blocks have some additional features:
   166  - For TableFormatPebblev3 onwards:
   167    - For SETs, the value has a 1 byte value prefix, which indicates whether the
   168      value is inline, or in a separate value block, and indicates whether the
   169      prefix of the userkey (as defined by split) has changed or not. See
   170      value_block.go for details.
   171    - The most significant bit of the restart points is used to indicate whether
   172      userkey prefix has changed since the last restart point. See the detailed
   173      comment in blockWriter.
   174    - The maximum length of the "shared prefix" when encoding the key, is the
   175      length of the prefix of the userkey (as defined by split) of the previous
   176      key.
   177  
   178  - For TableFormatPebblev4 onwards:
   179    - The key kinds may be altered to set the
   180      InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete
   181      in the context of that sstable (for a reader that reads at a higher seqnum
   182      than the highest seqnum in the sstable). For details, see the comment in
   183      format.go.
   184  */
   185  
   186  const (
   187  	blockTrailerLen                    = 5
   188  	blockHandleMaxLenWithoutProperties = 10 + 10
   189  	// blockHandleLikelyMaxLen can be used for pre-allocating buffers to
   190  	// reduce memory copies. It is not guaranteed that a block handle will not
   191  	// exceed this length.
   192  	blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100
   193  
   194  	levelDBFooterLen   = 48
   195  	levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
   196  	levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)
   197  
   198  	rocksDBFooterLen             = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
   199  	rocksDBMagic                 = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
   200  	rocksDBMagicOffset           = rocksDBFooterLen - len(rocksDBMagic)
   201  	rocksDBVersionOffset         = rocksDBMagicOffset - 4
   202  	rocksDBExternalFormatVersion = 2
   203  
   204  	pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳
   205  
   206  	minFooterLen = levelDBFooterLen
   207  	maxFooterLen = rocksDBFooterLen
   208  
   209  	levelDBFormatVersion  = 0
   210  	rocksDBFormatVersion2 = 2
   211  
   212  	metaRangeKeyName   = "pebble.range_key"
   213  	metaValueIndexName = "pebble.value_index"
   214  	metaPropertiesName = "rocksdb.properties"
   215  	metaRangeDelName   = "rocksdb.range_del"
   216  	metaRangeDelV2Name = "rocksdb.range_del2"
   217  
   218  	// Index Types.
   219  	// A space efficient index block that is optimized for binary-search-based
   220  	// index.
   221  	binarySearchIndex = 0
   222  	// hashSearchIndex               = 1
   223  	// A two-level index implementation. Both levels are binary search indexes.
   224  	twoLevelIndex = 2
   225  	// binarySearchWithFirstKeyIndex = 3
   226  
   227  	// RocksDB always includes this in the properties block. Since Pebble
   228  	// doesn't use zstd compression, the string will always be the same.
   229  	// This should be removed if we ever decide to diverge from the RocksDB
   230  	// properties block.
   231  	rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
   232  )
   233  
   234  // ChecksumType specifies the checksum used for blocks.
   235  type ChecksumType byte
   236  
   237  // The available checksum types.
   238  const (
   239  	ChecksumTypeNone     ChecksumType = 0
   240  	ChecksumTypeCRC32c   ChecksumType = 1
   241  	ChecksumTypeXXHash   ChecksumType = 2
   242  	ChecksumTypeXXHash64 ChecksumType = 3
   243  )
   244  
   245  // String implements fmt.Stringer.
   246  func (t ChecksumType) String() string {
   247  	switch t {
   248  	case ChecksumTypeCRC32c:
   249  		return "crc32c"
   250  	case ChecksumTypeNone:
   251  		return "none"
   252  	case ChecksumTypeXXHash:
   253  		return "xxhash"
   254  	case ChecksumTypeXXHash64:
   255  		return "xxhash64"
   256  	default:
   257  		panic(errors.Newf("sstable: unknown checksum type: %d", t))
   258  	}
   259  }
   260  
   261  type blockType byte
   262  
   263  const (
   264  	// The block type gives the per-block compression format.
   265  	// These constants are part of the file format and should not be changed.
   266  	// They are different from the Compression constants because the latter
   267  	// are designed so that the zero value of the Compression type means to
   268  	// use the default compression (which is snappy).
   269  	// Not all compression types listed here are supported.
   270  	noCompressionBlockType     blockType = 0
   271  	snappyCompressionBlockType blockType = 1
   272  	zlibCompressionBlockType   blockType = 2
   273  	bzip2CompressionBlockType  blockType = 3
   274  	lz4CompressionBlockType    blockType = 4
   275  	lz4hcCompressionBlockType  blockType = 5
   276  	xpressCompressionBlockType blockType = 6
   277  	zstdCompressionBlockType   blockType = 7
   278  )
   279  
   280  // String implements fmt.Stringer.
   281  func (t blockType) String() string {
   282  	switch t {
   283  	case 0:
   284  		return "none"
   285  	case 1:
   286  		return "snappy"
   287  	case 2:
   288  		return "zlib"
   289  	case 3:
   290  		return "bzip2"
   291  	case 4:
   292  		return "lz4"
   293  	case 5:
   294  		return "lz4hc"
   295  	case 6:
   296  		return "xpress"
   297  	case 7:
   298  		return "zstd"
   299  	default:
   300  		panic(errors.Newf("sstable: unknown block type: %d", t))
   301  	}
   302  }
   303  
   304  // legacy (LevelDB) footer format:
   305  //
   306  //	metaindex handle (varint64 offset, varint64 size)
   307  //	index handle     (varint64 offset, varint64 size)
   308  //	<padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
   309  //	table_magic_number (8 bytes)
   310  //
   311  // new (RocksDB) footer format:
   312  //
   313  //	checksum type (char, 1 byte)
   314  //	metaindex handle (varint64 offset, varint64 size)
   315  //	index handle     (varint64 offset, varint64 size)
   316  //	<padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
   317  //	footer version (4 bytes)
   318  //	table_magic_number (8 bytes)
   319  type footer struct {
   320  	format      TableFormat
   321  	checksum    ChecksumType
   322  	metaindexBH BlockHandle
   323  	indexBH     BlockHandle
   324  	footerBH    BlockHandle
   325  }
   326  
   327  func readFooter(f objstorage.Readable) (footer, error) {
   328  	var footer footer
   329  	size := f.Size()
   330  	if size < minFooterLen {
   331  		return footer, base.CorruptionErrorf("pebble/table: invalid table (file size is too small)")
   332  	}
   333  
   334  	buf := make([]byte, maxFooterLen)
   335  	off := size - maxFooterLen
   336  	if off < 0 {
   337  		off = 0
   338  		buf = buf[:size]
   339  	}
   340  	if err := f.ReadAt(context.TODO(), buf, off); err != nil {
   341  		return footer, errors.Wrap(err, "pebble/table: invalid table (could not read footer)")
   342  	}
   343  
   344  	switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
   345  	case levelDBMagic:
   346  		if len(buf) < levelDBFooterLen {
   347  			return footer, base.CorruptionErrorf(
   348  				"pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
   349  		}
   350  		footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
   351  		buf = buf[len(buf)-levelDBFooterLen:]
   352  		footer.footerBH.Length = uint64(len(buf))
   353  		footer.format = TableFormatLevelDB
   354  		footer.checksum = ChecksumTypeCRC32c
   355  
   356  	case rocksDBMagic, pebbleDBMagic:
   357  		// NOTE: The Pebble magic string implies the same footer format as that used
   358  		// by the RocksDBv2 table format.
   359  		if len(buf) < rocksDBFooterLen {
   360  			return footer, base.CorruptionErrorf("pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
   361  		}
   362  		footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
   363  		buf = buf[len(buf)-rocksDBFooterLen:]
   364  		footer.footerBH.Length = uint64(len(buf))
   365  		version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
   366  
   367  		format, err := ParseTableFormat(magic, version)
   368  		if err != nil {
   369  			return footer, err
   370  		}
   371  		footer.format = format
   372  
   373  		switch ChecksumType(buf[0]) {
   374  		case ChecksumTypeCRC32c:
   375  			footer.checksum = ChecksumTypeCRC32c
   376  		case ChecksumTypeXXHash64:
   377  			footer.checksum = ChecksumTypeXXHash64
   378  		default:
   379  			return footer, base.CorruptionErrorf("pebble/table: unsupported checksum type %d", errors.Safe(footer.checksum))
   380  		}
   381  		buf = buf[1:]
   382  
   383  	default:
   384  		return footer, base.CorruptionErrorf("pebble/table: invalid table (bad magic number: 0x%x)", magic)
   385  	}
   386  
   387  	{
   388  		end := uint64(size)
   389  		var n int
   390  		footer.metaindexBH, n = decodeBlockHandle(buf)
   391  		if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
   392  			return footer, base.CorruptionErrorf("pebble/table: invalid table (bad metaindex block handle)")
   393  		}
   394  		buf = buf[n:]
   395  
   396  		footer.indexBH, n = decodeBlockHandle(buf)
   397  		if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
   398  			return footer, base.CorruptionErrorf("pebble/table: invalid table (bad index block handle)")
   399  		}
   400  	}
   401  
   402  	return footer, nil
   403  }
   404  
   405  func (f footer) encode(buf []byte) []byte {
   406  	switch magic, version := f.format.AsTuple(); magic {
   407  	case levelDBMagic:
   408  		buf = buf[:levelDBFooterLen]
   409  		for i := range buf {
   410  			buf[i] = 0
   411  		}
   412  		n := encodeBlockHandle(buf[0:], f.metaindexBH)
   413  		encodeBlockHandle(buf[n:], f.indexBH)
   414  		copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
   415  
   416  	case rocksDBMagic, pebbleDBMagic:
   417  		buf = buf[:rocksDBFooterLen]
   418  		for i := range buf {
   419  			buf[i] = 0
   420  		}
   421  		switch f.checksum {
   422  		case ChecksumTypeNone:
   423  			buf[0] = byte(ChecksumTypeNone)
   424  		case ChecksumTypeCRC32c:
   425  			buf[0] = byte(ChecksumTypeCRC32c)
   426  		case ChecksumTypeXXHash:
   427  			buf[0] = byte(ChecksumTypeXXHash)
   428  		case ChecksumTypeXXHash64:
   429  			buf[0] = byte(ChecksumTypeXXHash64)
   430  		default:
   431  			panic("unknown checksum type")
   432  		}
   433  		n := 1
   434  		n += encodeBlockHandle(buf[n:], f.metaindexBH)
   435  		encodeBlockHandle(buf[n:], f.indexBH)
   436  		binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
   437  		copy(buf[len(buf)-len(rocksDBMagic):], magic)
   438  
   439  	default:
   440  		panic("sstable: unspecified table format version")
   441  	}
   442  
   443  	return buf
   444  }
   445  
   446  func supportsTwoLevelIndex(format TableFormat) bool {
   447  	switch format {
   448  	case TableFormatLevelDB:
   449  		return false
   450  	case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4:
   451  		return true
   452  	default:
   453  		panic("sstable: unspecified table format version")
   454  	}
   455  }