// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package ptable // import "github.com/petermattis/pebble/ptable"

// The ptable file format is similar to the sstable format except that data
// blocks are formatted differently.
//
// <start_of_file>
// [data block 0]
// [data block 1]
// ...
// [data block N-1]
// [meta block 0]
// [meta block 1]
// ...
// [meta block K-1]
// [metaindex block]
// [index block]
// [footer]
// <end_of_file>
//
// Each block consists of some data and a 5 byte trailer: a 1 byte block type
// and a 4 byte checksum of the (optionally) compressed data. The block type
// gives the per-block compression used; each block is compressed
// independently. The checksum algorithm is described in the pebble/crc
// package.
//
// The decompressed block consists of structured row data in a columnar
// layout. The schema for rows is fixed for an entire table.
//
// An index block consists of a fixed 2 column schema of keys and block
// handles. The i'th value is the encoded block handle of the i'th data
// block. The i'th key is a separator for i < N-1, and a successor for i ==
// N-1. The separator between blocks i and i+1 is a key that is >= every key
// in block i and is < every key in block i+1. The successor for the final
// block is a key that is >= every key in block N-1. Note that the keys in the
// index block are not stored as such in data blocks.
//
// A block handle is an offset and a length. In the index block, the block
// handle length is not stored directly but is instead calculated using the
// offset of the following block. In the index block, the block handle offset
// is stored as an 8-byte 64-bit integer.
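// encoding/binary is used only by the illustrative sketches below.
import "encoding/binary"

// The two functions below are illustrative sketches, not part of the package
// API: they show how the physical block layout described above might be
// decoded. The function names and the choice of little-endian byte order are
// assumptions made for the example; the real trailer and handle encodings live
// elsewhere in the package.

// exampleBlockTrailer splits a physical block into its (possibly compressed)
// data and its 5 byte trailer: a 1 byte block type followed by a 4 byte
// checksum of the compressed data. The checksum algorithm itself is described
// in the pebble/crc package and is not reproduced here.
func exampleBlockTrailer(block []byte) (data []byte, blockType byte, checksum uint32) {
	n := len(block) - 5
	data = block[:n]
	blockType = block[n]
	checksum = binary.LittleEndian.Uint32(block[n+1:])
	return data, blockType, checksum
}

// exampleIndexHandle recovers a block handle from two adjacent index entries:
// the i'th index value stores only an 8-byte offset, and the block length is
// derived from the offset stored in the (i+1)'th value.
func exampleIndexHandle(curValue, nextValue []byte) (offset, length uint64) {
	offset = binary.LittleEndian.Uint64(curValue)
	next := binary.LittleEndian.Uint64(nextValue)
	return offset, next - offset
}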
// TODO(peter):
//
// - We likely need to allow different block schemas within the same
//   table. This is needed for both interleaved tables and for L0 tables which
//   will likely hold blocks for many different SQL tables at once.
//
// - Do we need to store which columns the rows are sorted on? How to store
//   sort order? Yes, we need to be able to merge tables in order to perform
//   compactions and the fundamental operation here is comparison. We need to
//   know the key columns.
//
// - Iteration proceeds over blocks. Every row has an implicit timestamp column
//   containing the hlc timestamp. Need to be able to filter to get only the
//   desired version of a row.
//
// - How to integrate iteration with the memtable? The memtable contains
//   relatively little data. Do we convert to columnar data on the fly?
//
// - How to specify the schema for a given key? The number of schemas is the
//   number of indexes in all of the tables. The /table/index/ prefix is a
//   unique prefix. Perhaps there should be a callback from key to schema and
//   the "schema key" can be stored in each block.
//
// - Define a scan operation which takes a start and end key and an operator
//   tree composed of projections and filters and returns an iterator over the
//   data.
//
// - How to handle changes to the schema? This happens for the primary row data
//   only and is due to the addition or deletion of columns. The schema needs
//   to be stored in the table and when merging tables columns need to be added
//   and dropped appropriately.
//
// - What to do about column families where row data is spread across multiple
//   key/value pairs? Column families are logically part of the same row. In
//   some ways, they are similar to the Merge operation, allowing partial
//   updates to a row.
//
// - How to handle the MVCC metadata keys? The intent data (txn, etc) logically
//   belongs on the row, yet no other version of the row has that data. Is this
//   a hidden "intent" column that is NULL for other versions of the row?
//
// - What to do about Merge operations? The simplest approach would be to
//   disallow them on the structured data portion of the key space.
//
// - Need to add a per-row "deleted" bitmap in order to support deletion
//   tombstones.
//
// - How to handle range deletion tombstones?
//
// - How to support UNIQUE indexes with NULL-able keys? Such indexes have a
//   fixed set of columns in the key, but also a unique-ifier suffix in case
//   one of the columns is NULL. Perhaps just model such columns exactly like
//   that, with an implicit hidden column that is part of the key and only
//   non-empty if one of the key column values is NULL.
//
// - How to perform level merging iteration? This is sort of like a full-outer
//   merge-join. For equality on the key columns we keep the version in the
//   higher level.
//
// - How to perform version selection? There seems to be some similarity with a
//   sorted distinct operation, but instead of keeping the most recent value,
//   we choose the one which meets some criteria on the timestamp column. Or
//   perhaps the op first filters out versions which are newer than the read
//   timestamp, and then does a sorted distinct to only keep the most recent
//   version of the versions remaining.
//
// - How to support interleaved tables? The naive approach of switching blocks
//   whenever we switch from parent to child keys would result in a lot of
//   small blocks for the parent rows (perhaps 1 row per block). Seems better
//   to interleave at the block level, though this will make iteration through
//   a table more complex.

// RowWriter provides an interface for writing the column data for a row.
type RowWriter interface {
	PutBool(col int, v bool)
	PutInt8(col int, v int8)
	PutInt16(col int, v int16)
	PutInt32(col int, v int32)
	PutInt64(col int, v int64)
	PutFloat32(col int, v float32)
	PutFloat64(col int, v float64)
	PutBytes(col int, v []byte)
	PutNull(col int)
}

// RowReader provides an interface for reading the column data from a row.
type RowReader interface {
	Null(col int) bool
	Bool(col int) bool
	Int8(col int) int8
	Int16(col int) int16
	Int32(col int) int32
	Int64(col int) int64
	Float32(col int) float32
	Float64(col int) float64
	Bytes(col int) []byte
}
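// exampleCopyRow is an illustrative sketch, not part of the package API. It
// shows how a caller might move a row between the two interfaces for a
// hypothetical three-column schema (an int64 key column, a bytes-valued
// column, and a nullable float64 column), and demonstrates that NULLs are
// surfaced through Null/PutNull rather than through the typed accessors.
func exampleCopyRow(src RowReader, dst RowWriter) {
	// Column 0: int64 key column.
	dst.PutInt64(0, src.Int64(0))
	// Column 1: bytes-valued column.
	dst.PutBytes(1, src.Bytes(1))
	// Column 2: nullable float64 column.
	if src.Null(2) {
		dst.PutNull(2)
	} else {
		dst.PutFloat64(2, src.Float64(2))
	}
}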
// Env holds a set of functions used to convert key/value data to and from
// structured column data.
type Env struct {
	// Schema specifies the columns for a table. The order of the columns in
	// the schema matters. Columns that are part of the key need to occur in
	// the same order as they are present in the key, and all key columns need
	// to specify a direction.
	Schema []ColumnDef
	// Decode the columns from a key/value pair, outputting the column data to
	// writer. Buf can be used for temporary storage during decoding. The
	// column data written to writer is copied.
	Decode func(key, value, buf []byte, writer RowWriter)
	// Encode the columns from the specified row into a key/value pair. Buf can
	// be used to store the encoded key/value data or for temporary storage.
	Encode func(row RowReader, buf []byte) (key, value []byte)
}
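// The functions below are an illustrative sketch of what the Decode and
// Encode callbacks might look like for a hypothetical two-column schema in
// which column 0 is a bytes-valued key column and column 1 is a bytes-valued
// value column. The names and the trivial encoding are assumptions made for
// the example; real schemas would decode typed columns out of the key and
// value encodings.

func exampleDecode(key, value, buf []byte, writer RowWriter) {
	// The raw key bytes form column 0 and the raw value bytes form column 1.
	// Since the column data written to writer is copied, key and value need
	// not remain stable after Decode returns.
	writer.PutBytes(0, key)
	writer.PutBytes(1, value)
}

func exampleEncode(row RowReader, buf []byte) (key, value []byte) {
	// Buf is used as scratch space for the encoded key; the value is returned
	// directly from the reader.
	key = append(buf[:0], row.Bytes(0)...)
	value = row.Bytes(1)
	return key, value
}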