github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/table.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  // Package sstable implements readers and writers of bitalostable tables.
     6  //
     7  // Tables are either opened for reading or created for writing but not both.
     8  //
     9  // A reader can create iterators, which allow seeking and next/prev
    10  // iteration. There may be multiple key/value pairs that have the same key and
    11  // different sequence numbers.
    12  //
    13  // A reader can be used concurrently. Multiple goroutines can call NewIter
    14  // concurrently, and each iterator can run concurrently with other iterators.
    15  // However, any particular iterator should not be used concurrently, and iterators
    16  // should not be used once a reader is closed.
    17  //
    18  // A writer writes key/value pairs in increasing key order, and cannot be used
    19  // concurrently. A table cannot be read until the writer has finished.
    20  //
    21  // Readers and writers can be created with various options. Passing a nil
    22  // Options pointer is valid and means to use the default values.
    23  //
    24  // One such option is to define the 'less than' ordering for keys. The default
    25  // Comparer uses the natural ordering consistent with bytes.Compare. The same
    26  // ordering should be used for reading and writing a table.
    27  //
    28  // To return the value for a key:
    29  //
    30  //	r := table.NewReader(file, options)
    31  //	defer r.Close()
    32  //	i := r.NewIter(nil, nil)
    33  //	defer i.Close()
//	ikey, value := i.SeekGE(key)
    35  //	if options.Comparer.Compare(ikey.UserKey, key) != 0 {
    36  //	  // not found
    37  //	} else {
    38  //	  // value is the first record containing key
    39  //	}
    40  //
    41  // To count the number of entries in a table:
    42  //
    43  //	i, n := r.NewIter(nil, nil), 0
    44  //	for key, value := i.First(); key != nil; key, value = i.Next() {
    45  //		n++
    46  //	}
    47  //	if err := i.Close(); err != nil {
    48  //		return 0, err
    49  //	}
    50  //	return n, nil
    51  //
    52  // To write a table with three entries:
    53  //
    54  //	w := table.NewWriter(file, options)
    55  //	if err := w.Set([]byte("apple"), []byte("red")); err != nil {
    56  //		w.Close()
    57  //		return err
    58  //	}
    59  //	if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
    60  //		w.Close()
    61  //		return err
    62  //	}
    63  //	if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
    64  //		w.Close()
    65  //		return err
    66  //	}
    67  //	return w.Close()
    68  package sstable // import "github.com/zuoyebang/bitalostable/sstable"
    69  
    70  import (
    71  	"encoding/binary"
    72  	"io"
    73  
    74  	"github.com/cockroachdb/errors"
    75  	"github.com/zuoyebang/bitalostable/internal/base"
    76  )
    77  
    78  /*
    79  The table file format looks like:
    80  
    81  <start_of_file>
    82  [data block 0]
    83  [data block 1]
    84  ...
    85  [data block N-1]
    86  [meta filter block] (optional)
    87  [index block] (for single level index)
    88  [meta rangedel block] (optional)
    89  [meta range key block] (optional)
    90  [meta properties block]
    91  [metaindex block]
    92  [footer]
    93  <end_of_file>
    94  
    95  A Reader eagerly loads the footer, metaindex block and meta properties block,
    96  because the data contained in those blocks is needed on every read, and even
    97  before reading. For example, the meta properties block is used to verify the
    98  comparer and merger are compatible, and the metaindex block contains the
    99  location of the meta properties (and other meta blocks). In situations where
   100  file system locality matters, or one wants to minimize number of read
   101  requests when eagerly loading these blocks, having these three as a suffix
   102  of the file is convenient.
   103  
   104  The interleaving of the index block(s) between the meta blocks is done to
   105  match RocksDB/LevelDB behavior.
   106  
   107  Each block consists of some data and a 5 byte trailer: a 1 byte block type and a
   108  4 byte checksum. The checksum is computed over the compressed data and the first
   109  byte of the trailer (i.e. the block type), and is serialized as little-endian.
   110  The block type gives the per-block compression used; each block is compressed
   111  independently. The checksum algorithm is described in the bitalostable/crc package.
   112  
   113  Most blocks, other than the meta filter block, contain key/value pairs. The
   114  remainder of this comment refers to the decompressed block, which has its 5 byte
   115  trailer stripped. The decompressed block data consists of a sequence of such
   116  key/value entries followed by a block suffix. Each key is encoded as a shared
   117  prefix length and a remainder string. For example, if two adjacent keys are
   118  "tweedledee" and "tweedledum", then the second key would be encoded as {8,
   119  "um"}. The shared prefix length is varint encoded. The remainder string and the
   120  value are encoded as a varint-encoded length followed by the literal contents.
   121  To continue the example, suppose that the key "tweedledum" mapped to the value
   122  "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
   123  
   124  Every block has a restart interval I. Every I'th key/value entry in that block
   125  is called a restart point, and shares no key prefix with the previous entry.
   126  Continuing the example above, if the key after "tweedledum" was "two", but was
   127  part of a restart point, then that key would be encoded as {0, "two"} instead
   128  of {2, "o"}. If a block has P restart points, then the block suffix consists
   129  of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
   130  uint32 values are the block offsets of each restart point. The final uint32
   131  value is P itself. Thus, when seeking for a particular key, one can use binary
   132  search to find the largest restart point whose key is <= the key sought.
   133  
   134  An index block is a block with N key/value entries. The i'th value is the
   135  encoded block handle of the i'th data block. The i'th key is a separator for
   136  i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
is a key that is >= every key in block i and is < every key in block i+1. The
   138  successor for the final block is a key that is >= every key in block N-1. The
   139  index block restart interval is 1: every entry is a restart point.
   140  
   141  A block handle is an offset, a length, and optional block properties (for data
   142  blocks and first/lower level index blocks); the length does not include the 5
   143  byte trailer. All numbers are varint-encoded, with no padding between the two
   144  values. The maximum size of an encoded block handle without properties is 20
   145  bytes. It is not advised to have properties that accumulate to be longer than
   146  100 bytes.
   147  
   148  Instead of a single index block, the sstable can have a two-level index (this
   149  is used to prevent a single huge index block). A two-level index consists of a
   150  sequence of lower-level index blocks with block handles for data blocks
   151  followed by a single top-level index block with block handles for the
   152  lower-level index blocks.
   153  
   154  The metaindex block also contains block handles as values, with keys being
   155  the names of the meta blocks.
   156  
   157  */
   158  
const (
	// Every block is followed by a 5 byte trailer (1 byte block type plus a
	// 4 byte checksum). A block handle without properties is an offset and a
	// length, each varint-encoded, for at most 20 bytes in total.
	blockTrailerLen                    = 5
	blockHandleMaxLenWithoutProperties = 10 + 10
	// blockHandleLikelyMaxLen can be used for pre-allocating buffers to
	// reduce memory copies. It is not guaranteed that a block handle will not
	// exceed this length.
	blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100

	// Legacy (LevelDB) footer: two block handles padded to 40 bytes followed
	// by the 8 byte magic number.
	levelDBFooterLen   = 48
	levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
	levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)

	// RocksDB footer: 1 byte checksum type, two padded block handles, a
	// 4 byte little-endian version and the 8 byte magic number.
	rocksDBFooterLen             = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
	rocksDBMagic                 = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
	rocksDBMagicOffset           = rocksDBFooterLen - len(rocksDBMagic)
	rocksDBVersionOffset         = rocksDBMagicOffset - 4
	rocksDBExternalFormatVersion = 2

	// bitalostableDBMagic identifies tables that use the same footer layout
	// as the RocksDB format but carry their own magic number.
	bitalostableDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳

	// readFooter rejects files smaller than minFooterLen and reads at most
	// maxFooterLen bytes from the end of the file.
	minFooterLen = levelDBFooterLen
	maxFooterLen = rocksDBFooterLen

	levelDBFormatVersion  = 0
	rocksDBFormatVersion2 = 2

	// Names of the meta blocks; these are the keys stored in the metaindex
	// block.
	metaRangeKeyName   = "bitalostable.range_key"
	metaPropertiesName = "rocksdb.properties"
	metaRangeDelName   = "rocksdb.range_del"
	metaRangeDelV2Name = "rocksdb.range_del2"

	// Index Types.
	// A space efficient index block that is optimized for binary-search-based
	// index.
	binarySearchIndex = 0
	// hashSearchIndex               = 1
	// A two-level index implementation. Both levels are binary search indexes.
	twoLevelIndex = 2
	// binarySearchWithFirstKeyIndex = 3

	// RocksDB always includes this in the properties block. Since Pebble
	// doesn't use zstd compression, the string will always be the same.
	// This should be removed if we ever decide to diverge from the RocksDB
	// properties block.
	rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
)
   205  
// ChecksumType specifies the checksum used for blocks.
//
// The numeric value is part of the file format: footer.encode writes it as the
// first byte of a RocksDB-format footer and readFooter reads it back from
// there, so the values below must not be renumbered.
type ChecksumType byte

// The available checksum types. Note that readFooter only accepts
// ChecksumTypeCRC32c and ChecksumTypeXXHash64 when opening a table; the other
// values are rejected as unsupported.
const (
	ChecksumTypeNone     ChecksumType = 0
	ChecksumTypeCRC32c   ChecksumType = 1
	ChecksumTypeXXHash   ChecksumType = 2
	ChecksumTypeXXHash64 ChecksumType = 3
)
   216  
   217  // String implements fmt.Stringer.
   218  func (t ChecksumType) String() string {
   219  	switch t {
   220  	case ChecksumTypeCRC32c:
   221  		return "crc32c"
   222  	case ChecksumTypeNone:
   223  		return "none"
   224  	case ChecksumTypeXXHash:
   225  		return "xxhash"
   226  	case ChecksumTypeXXHash64:
   227  		return "xxhash64"
   228  	default:
   229  		panic(errors.Newf("sstable: unknown checksum type: %d", t))
   230  	}
   231  }
   232  
// blockType is the 1 byte tag stored in a block's trailer. It records which
// compression format was applied to that block.
type blockType byte

const (
	// The block type gives the per-block compression format.
	// These constants are part of the file format and should not be changed.
	// They are different from the Compression constants because the latter
	// are designed so that the zero value of the Compression type means to
	// use the default compression (which is snappy).
	// Not all compression types listed here are supported.
	noCompressionBlockType     blockType = 0
	snappyCompressionBlockType blockType = 1
	zlibCompressionBlockType   blockType = 2
	bzip2CompressionBlockType  blockType = 3
	lz4CompressionBlockType    blockType = 4
	lz4hcCompressionBlockType  blockType = 5
	xpressCompressionBlockType blockType = 6
	zstdCompressionBlockType   blockType = 7
)
   251  
   252  // String implements fmt.Stringer.
   253  func (t blockType) String() string {
   254  	switch t {
   255  	case 0:
   256  		return "none"
   257  	case 1:
   258  		return "snappy"
   259  	case 2:
   260  		return "zlib"
   261  	case 3:
   262  		return "bzip2"
   263  	case 4:
   264  		return "lz4"
   265  	case 5:
   266  		return "lz4hc"
   267  	case 6:
   268  		return "xpress"
   269  	case 7:
   270  		return "zstd"
   271  	default:
   272  		panic(errors.Newf("sstable: unknown block type: %d", t))
   273  	}
   274  }
   275  
// footer is the decoded form of the fixed-size record stored at the very end
// of a table file. Two on-disk layouts exist:
//
// legacy (LevelDB) footer format:
//
//	metaindex handle (varint64 offset, varint64 size)
//	index handle     (varint64 offset, varint64 size)
//	<padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
//	table_magic_number (8 bytes)
//
// new (RocksDB) footer format:
//
//	checksum type (char, 1 byte)
//	metaindex handle (varint64 offset, varint64 size)
//	index handle     (varint64 offset, varint64 size)
//	<padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
//	footer version (4 bytes)
//	table_magic_number (8 bytes)
type footer struct {
	// format is the table format derived from the magic number and, for the
	// RocksDB layout, the footer version.
	format      TableFormat
	// checksum is the block checksum type. The legacy layout has no checksum
	// byte; readFooter sets ChecksumTypeCRC32c for it.
	checksum    ChecksumType
	// metaindexBH and indexBH locate the metaindex block and the (top-level)
	// index block.
	metaindexBH BlockHandle
	indexBH     BlockHandle
	// footerBH records where the footer itself lives in the file. It is
	// filled in by readFooter and is not serialized by encode.
	footerBH    BlockHandle
}
   298  
   299  func readFooter(f ReadableFile) (footer, error) {
   300  	var footer footer
   301  	stat, err := f.Stat()
   302  	if err != nil {
   303  		return footer, errors.Wrap(err, "bitalostable/table: invalid table (could not stat file)")
   304  	}
   305  	if stat.Size() < minFooterLen {
   306  		return footer, base.CorruptionErrorf("bitalostable/table: invalid table (file size is too small)")
   307  	}
   308  
   309  	buf := make([]byte, maxFooterLen)
   310  	off := stat.Size() - maxFooterLen
   311  	if off < 0 {
   312  		off = 0
   313  	}
   314  	n, err := f.ReadAt(buf, off)
   315  	if err != nil && err != io.EOF {
   316  		return footer, errors.Wrap(err, "bitalostable/table: invalid table (could not read footer)")
   317  	}
   318  	buf = buf[:n]
   319  
   320  	switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) {
   321  	case levelDBMagic:
   322  		if len(buf) < levelDBFooterLen {
   323  			return footer, base.CorruptionErrorf(
   324  				"bitalostable/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
   325  		}
   326  		footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
   327  		buf = buf[len(buf)-levelDBFooterLen:]
   328  		footer.footerBH.Length = uint64(len(buf))
   329  		footer.format = TableFormatLevelDB
   330  		footer.checksum = ChecksumTypeCRC32c
   331  
   332  	case rocksDBMagic, bitalostableDBMagic:
   333  		// NOTE: The Pebble magic string implies the same footer format as that used
   334  		// by the RocksDBv2 table format.
   335  		if len(buf) < rocksDBFooterLen {
   336  			return footer, base.CorruptionErrorf("bitalostable/table: invalid table (footer too short): %d", errors.Safe(len(buf)))
   337  		}
   338  		footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
   339  		buf = buf[len(buf)-rocksDBFooterLen:]
   340  		footer.footerBH.Length = uint64(len(buf))
   341  		version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
   342  
   343  		format, err := ParseTableFormat(magic, version)
   344  		if err != nil {
   345  			return footer, err
   346  		}
   347  		footer.format = format
   348  
   349  		switch ChecksumType(buf[0]) {
   350  		case ChecksumTypeCRC32c:
   351  			footer.checksum = ChecksumTypeCRC32c
   352  		case ChecksumTypeXXHash64:
   353  			footer.checksum = ChecksumTypeXXHash64
   354  		default:
   355  			return footer, base.CorruptionErrorf("bitalostable/table: unsupported checksum type %d", errors.Safe(footer.checksum))
   356  		}
   357  		buf = buf[1:]
   358  
   359  	default:
   360  		return footer, base.CorruptionErrorf("bitalostable/table: invalid table (bad magic number)")
   361  	}
   362  
   363  	{
   364  		end := uint64(stat.Size())
   365  		var n int
   366  		footer.metaindexBH, n = decodeBlockHandle(buf)
   367  		if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end {
   368  			return footer, base.CorruptionErrorf("bitalostable/table: invalid table (bad metaindex block handle)")
   369  		}
   370  		buf = buf[n:]
   371  
   372  		footer.indexBH, n = decodeBlockHandle(buf)
   373  		if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end {
   374  			return footer, base.CorruptionErrorf("bitalostable/table: invalid table (bad index block handle)")
   375  		}
   376  	}
   377  
   378  	return footer, nil
   379  }
   380  
   381  func (f footer) encode(buf []byte) []byte {
   382  	switch magic, version := f.format.AsTuple(); magic {
   383  	case levelDBMagic:
   384  		buf = buf[:levelDBFooterLen]
   385  		for i := range buf {
   386  			buf[i] = 0
   387  		}
   388  		n := encodeBlockHandle(buf[0:], f.metaindexBH)
   389  		encodeBlockHandle(buf[n:], f.indexBH)
   390  		copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
   391  
   392  	case rocksDBMagic, bitalostableDBMagic:
   393  		buf = buf[:rocksDBFooterLen]
   394  		for i := range buf {
   395  			buf[i] = 0
   396  		}
   397  		switch f.checksum {
   398  		case ChecksumTypeNone:
   399  			buf[0] = byte(ChecksumTypeNone)
   400  		case ChecksumTypeCRC32c:
   401  			buf[0] = byte(ChecksumTypeCRC32c)
   402  		case ChecksumTypeXXHash:
   403  			buf[0] = byte(ChecksumTypeXXHash)
   404  		case ChecksumTypeXXHash64:
   405  			buf[0] = byte(ChecksumTypeXXHash64)
   406  		default:
   407  			panic("unknown checksum type")
   408  		}
   409  		n := 1
   410  		n += encodeBlockHandle(buf[n:], f.metaindexBH)
   411  		encodeBlockHandle(buf[n:], f.indexBH)
   412  		binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version)
   413  		copy(buf[len(buf)-len(rocksDBMagic):], magic)
   414  
   415  	default:
   416  		panic("sstable: unspecified table format version")
   417  	}
   418  
   419  	return buf
   420  }
   421  
   422  func supportsTwoLevelIndex(format TableFormat) bool {
   423  	switch format {
   424  	case TableFormatLevelDB:
   425  		return false
   426  	case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2:
   427  		return true
   428  	default:
   429  		panic("sstable: unspecified table format version")
   430  	}
   431  }