github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/sstable/table.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  /*
     6  Package sstable implements readers and writers of pebble tables.
     7  
     8  Tables are either opened for reading or created for writing but not both.
     9  
    10  A reader can create iterators, which allow seeking and next/prev
    11  iteration. There may be multiple key/value pairs that have the same key and
    12  different sequence numbers.
    13  
    14  A reader can be used concurrently. Multiple goroutines can call NewIter
    15  concurrently, and each iterator can run concurrently with other iterators.
    16  However, any particular iterator should not be used concurrently, and iterators
    17  should not be used once a reader is closed.
    18  
    19  A writer writes key/value pairs in increasing key order, and cannot be used
    20  concurrently. A table cannot be read until the writer has finished.
    21  
    22  Readers and writers can be created with various options. Passing a nil
    23  Options pointer is valid and means to use the default values.
    24  
    25  One such option is to define the 'less than' ordering for keys. The default
    26  Comparer uses the natural ordering consistent with bytes.Compare. The same
    27  ordering should be used for reading and writing a table.
    28  
    29  To return the value for a key:
    30  
	r := sstable.NewReader(file, options)
    32  	defer r.Close()
    33  	return r.Get(key)
    34  
    35  To count the number of entries in a table:
    36  
    37  	i, n := r.NewIter(ropts), 0
    38  	for valid := i.First(); valid; valid = i.Next() {
    39  		n++
    40  	}
    41  	if err := i.Close(); err != nil {
    42  		return 0, err
    43  	}
    44  	return n, nil
    45  
    46  To write a table with three entries:
    47  
	w := sstable.NewWriter(file, options)
    49  	if err := w.Set([]byte("apple"), []byte("red"), wopts); err != nil {
    50  		w.Close()
    51  		return err
    52  	}
    53  	if err := w.Set([]byte("banana"), []byte("yellow"), wopts); err != nil {
    54  		w.Close()
    55  		return err
    56  	}
    57  	if err := w.Set([]byte("cherry"), []byte("red"), wopts); err != nil {
    58  		w.Close()
    59  		return err
    60  	}
    61  	return w.Close()
    62  */
    63  package sstable // import "github.com/petermattis/pebble/sstable"
    64  import (
    65  	"encoding/binary"
    66  	"errors"
    67  	"fmt"
    68  	"io"
    69  
    70  	"github.com/petermattis/pebble/vfs"
    71  )
    72  
    73  /*
    74  The table file format looks like:
    75  
    76  <start_of_file>
    77  [data block 0]
    78  [data block 1]
    79  ...
    80  [data block N-1]
    81  [meta block 0]
    82  [meta block 1]
    83  ...
    84  [meta block K-1]
    85  [metaindex block]
    86  [index block]
    87  [footer]
    88  <end_of_file>
    89  
    90  Each block consists of some data and a 5 byte trailer: a 1 byte block type and
    91  a 4 byte checksum of the compressed data. The block type gives the per-block
    92  compression used; each block is compressed independently. The checksum
    93  algorithm is described in the pebble/crc package.
    94  
    95  The decompressed block data consists of a sequence of key/value entries
    96  followed by a trailer. Each key is encoded as a shared prefix length and a
    97  remainder string. For example, if two adjacent keys are "tweedledee" and
    98  "tweedledum", then the second key would be encoded as {8, "um"}. The shared
    99  prefix length is varint encoded. The remainder string and the value are
   100  encoded as a varint-encoded length followed by the literal contents. To
   101  continue the example, suppose that the key "tweedledum" mapped to the value
   102  "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks".
   103  
   104  Every block has a restart interval I. Every I'th key/value entry in that block
   105  is called a restart point, and shares no key prefix with the previous entry.
   106  Continuing the example above, if the key after "tweedledum" was "two", but was
   107  part of a restart point, then that key would be encoded as {0, "two"} instead
   108  of {2, "o"}. If a block has P restart points, then the block trailer consists
   109  of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these
   110  uint32 values are the block offsets of each restart point. The final uint32
   111  value is P itself. Thus, when seeking for a particular key, one can use binary
   112  search to find the largest restart point whose key is <= the key sought.
   113  
   114  An index block is a block with N key/value entries. The i'th value is the
   115  encoded block handle of the i'th data block. The i'th key is a separator for
   116  i < N-1, and a successor for i == N-1. The separator between blocks i and i+1
is a key that is >= every key in block i and is < every key in block i+1. The
   118  successor for the final block is a key that is >= every key in block N-1. The
   119  index block restart interval is 1: every entry is a restart point.
   120  
   121  A block handle is an offset and a length; the length does not include the 5
   122  byte trailer. Both numbers are varint-encoded, with no padding between the two
   123  values. The maximum size of an encoded block handle is therefore 20 bytes.
   124  */
   125  
   126  const (
   127  	blockTrailerLen   = 5
   128  	blockHandleMaxLen = 10 + 10
   129  
   130  	levelDBFooterLen   = 48
   131  	levelDBMagic       = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
   132  	levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)
   133  
   134  	rocksDBFooterLen     = 1 + 2*blockHandleMaxLen + 4 + 8
   135  	rocksDBMagic         = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
   136  	rocksDBMagicOffset   = rocksDBFooterLen - len(rocksDBMagic)
   137  	rocksDBVersionOffset = rocksDBMagicOffset - 4
   138  
   139  	minFooterLen = levelDBFooterLen
   140  	maxFooterLen = rocksDBFooterLen
   141  
   142  	levelDBFormatVersion  = 0
   143  	rocksDBFormatVersion2 = 2
   144  
   145  	noChecksum     = 0
   146  	checksumCRC32c = 1
   147  	checksumXXHash = 2
   148  
   149  	// The block type gives the per-block compression format.
   150  	// These constants are part of the file format and should not be changed.
   151  	// They are different from the Compression constants because the latter
   152  	// are designed so that the zero value of the Compression type means to
   153  	// use the default compression (which is snappy).
   154  	noCompressionBlockType     byte = 0
   155  	snappyCompressionBlockType byte = 1
   156  
   157  	metaPropertiesName = "rocksdb.properties"
   158  	metaRangeDelName   = "rocksdb.range_del"
   159  	metaRangeDelV2Name = "rocksdb.range_del2"
   160  
   161  	// Index Types.
   162  	// A space efficient index block that is optimized for binary-search-based
   163  	// index.
   164  	binarySearchIndex = 0
   165  	// hashSearchIndex               = 1
   166  	// A two-level index implementation. Both levels are binary search indexes.
   167  	twoLevelIndex = 2
   168  	// binarySearchWithFirstKeyIndex = 3
   169  
   170  	// RocksDB always includes this in the properties block. Since Pebble
   171  	// doesn't use zstd compression, the string will always be the same.
   172  	// This should be removed if we ever decide to diverge from the RocksDB
   173  	// properties block.
   174  	rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
   175  )
   176  
// footer is the in-memory form of the trailer found at the very end of a table
// file. Two on-disk layouts exist:
//
// legacy (LevelDB) footer format:
//    metaindex handle (varint64 offset, varint64 size)
//    index handle     (varint64 offset, varint64 size)
//    <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
//    table_magic_number (8 bytes)
// new (RocksDB) footer format:
//    checksum type (char, 1 byte)
//    metaindex handle (varint64 offset, varint64 size)
//    index handle     (varint64 offset, varint64 size)
//    <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
//    footer version (4 bytes)
//    table_magic_number (8 bytes)
type footer struct {
	// format records which of the two layouts above the footer uses.
	format      TableFormat
	// checksum is the checksum type. readFooter sets it to checksumCRC32c for
	// LevelDB footers and takes it from the first footer byte for RocksDB
	// footers.
	checksum    uint8
	// metaindexBH is the handle of the metaindex block.
	metaindexBH BlockHandle
	// indexBH is the handle of the index block.
	indexBH     BlockHandle
	// footerBH is the offset and length of the footer itself within the file.
	// It is filled in by readFooter and is not written out by encode.
	footerBH    BlockHandle
}
   196  
   197  func readFooter(f vfs.File) (footer, error) {
   198  	var footer footer
   199  	stat, err := f.Stat()
   200  	if err != nil {
   201  		return footer, fmt.Errorf("pebble/table: invalid table (could not stat file): %v", err)
   202  	}
   203  	if stat.Size() < minFooterLen {
   204  		return footer, errors.New("pebble/table: invalid table (file size is too small)")
   205  	}
   206  
   207  	buf := make([]byte, maxFooterLen)
   208  	off := stat.Size() - maxFooterLen
   209  	if off < 0 {
   210  		off = 0
   211  	}
   212  	n, err := f.ReadAt(buf, off)
   213  	if err != nil && err != io.EOF {
   214  		return footer, fmt.Errorf("pebble/table: invalid table (could not read footer): %v", err)
   215  	}
   216  	buf = buf[:n]
   217  
   218  	switch string(buf[len(buf)-len(rocksDBMagic):]) {
   219  	case levelDBMagic:
   220  		if len(buf) < levelDBFooterLen {
   221  			return footer, fmt.Errorf("pebble/table: invalid table (footer too short): %d", len(buf))
   222  		}
   223  		footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen
   224  		buf = buf[len(buf)-levelDBFooterLen:]
   225  		footer.footerBH.Length = uint64(len(buf))
   226  		footer.format = TableFormatLevelDB
   227  		footer.checksum = checksumCRC32c
   228  
   229  	case rocksDBMagic:
   230  		if len(buf) < rocksDBFooterLen {
   231  			return footer, fmt.Errorf("pebble/table: invalid table (footer too short): %d", len(buf))
   232  		}
   233  		footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen
   234  		buf = buf[len(buf)-rocksDBFooterLen:]
   235  		footer.footerBH.Length = uint64(len(buf))
   236  		version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset])
   237  		if version != rocksDBFormatVersion2 {
   238  			return footer, fmt.Errorf("pebble/table: unsupported format version %d", version)
   239  		}
   240  		footer.format = TableFormatRocksDBv2
   241  		footer.checksum = uint8(buf[0])
   242  		if footer.checksum != checksumCRC32c {
   243  			return footer, fmt.Errorf("pebble/table: unsupported checksum type %d", footer.checksum)
   244  		}
   245  		buf = buf[1:]
   246  
   247  	default:
   248  		return footer, errors.New("pebble/table: invalid table (bad magic number)")
   249  	}
   250  
   251  	{
   252  		var n int
   253  		footer.metaindexBH, n = decodeBlockHandle(buf)
   254  		if n == 0 {
   255  			return footer, errors.New("pebble/table: invalid table (bad metaindex block handle)")
   256  		}
   257  		buf = buf[n:]
   258  
   259  		footer.indexBH, n = decodeBlockHandle(buf)
   260  		if n == 0 {
   261  			return footer, errors.New("pebble/table: invalid table (bad index block handle)")
   262  		}
   263  	}
   264  
   265  	return footer, nil
   266  }
   267  
   268  func (f footer) encode(buf []byte) []byte {
   269  	switch f.format {
   270  	case TableFormatLevelDB:
   271  		buf = buf[:levelDBFooterLen]
   272  		for i := range buf {
   273  			buf[i] = 0
   274  		}
   275  		n := encodeBlockHandle(buf[0:], f.metaindexBH)
   276  		n += encodeBlockHandle(buf[n:], f.indexBH)
   277  		copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic)
   278  
   279  	case TableFormatRocksDBv2:
   280  		buf = buf[:rocksDBFooterLen]
   281  		for i := range buf {
   282  			buf[i] = 0
   283  		}
   284  		buf[0] = f.checksum
   285  		n := 1
   286  		n += encodeBlockHandle(buf[n:], f.metaindexBH)
   287  		n += encodeBlockHandle(buf[n:], f.indexBH)
   288  		binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], rocksDBFormatVersion2)
   289  		copy(buf[len(buf)-len(rocksDBMagic):], rocksDBMagic)
   290  	}
   291  
   292  	return buf
   293  }