github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/sstable/table.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 /* 6 Package sstable implements readers and writers of pebble tables. 7 8 Tables are either opened for reading or created for writing but not both. 9 10 A reader can create iterators, which allow seeking and next/prev 11 iteration. There may be multiple key/value pairs that have the same key and 12 different sequence numbers. 13 14 A reader can be used concurrently. Multiple goroutines can call NewIter 15 concurrently, and each iterator can run concurrently with other iterators. 16 However, any particular iterator should not be used concurrently, and iterators 17 should not be used once a reader is closed. 18 19 A writer writes key/value pairs in increasing key order, and cannot be used 20 concurrently. A table cannot be read until the writer has finished. 21 22 Readers and writers can be created with various options. Passing a nil 23 Options pointer is valid and means to use the default values. 24 25 One such option is to define the 'less than' ordering for keys. The default 26 Comparer uses the natural ordering consistent with bytes.Compare. The same 27 ordering should be used for reading and writing a table. 
28 29 To return the value for a key: 30 31 r := table.NewReader(file, options) 32 defer r.Close() 33 return r.Get(key) 34 35 To count the number of entries in a table: 36 37 i, n := r.NewIter(ropts), 0 38 for valid := i.First(); valid; valid = i.Next() { 39 n++ 40 } 41 if err := i.Close(); err != nil { 42 return 0, err 43 } 44 return n, nil 45 46 To write a table with three entries: 47 48 w := table.NewWriter(file, options) 49 if err := w.Set([]byte("apple"), []byte("red"), wopts); err != nil { 50 w.Close() 51 return err 52 } 53 if err := w.Set([]byte("banana"), []byte("yellow"), wopts); err != nil { 54 w.Close() 55 return err 56 } 57 if err := w.Set([]byte("cherry"), []byte("red"), wopts); err != nil { 58 w.Close() 59 return err 60 } 61 return w.Close() 62 */ 63 package sstable // import "github.com/petermattis/pebble/sstable" 64 import ( 65 "encoding/binary" 66 "errors" 67 "fmt" 68 "io" 69 70 "github.com/petermattis/pebble/vfs" 71 ) 72 73 /* 74 The table file format looks like: 75 76 <start_of_file> 77 [data block 0] 78 [data block 1] 79 ... 80 [data block N-1] 81 [meta block 0] 82 [meta block 1] 83 ... 84 [meta block K-1] 85 [metaindex block] 86 [index block] 87 [footer] 88 <end_of_file> 89 90 Each block consists of some data and a 5 byte trailer: a 1 byte block type and 91 a 4 byte checksum of the compressed data. The block type gives the per-block 92 compression used; each block is compressed independently. The checksum 93 algorithm is described in the pebble/crc package. 94 95 The decompressed block data consists of a sequence of key/value entries 96 followed by a trailer. Each key is encoded as a shared prefix length and a 97 remainder string. For example, if two adjacent keys are "tweedledee" and 98 "tweedledum", then the second key would be encoded as {8, "um"}. The shared 99 prefix length is varint encoded. The remainder string and the value are 100 encoded as a varint-encoded length followed by the literal contents. 
To 101 continue the example, suppose that the key "tweedledum" mapped to the value 102 "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks". 103 104 Every block has a restart interval I. Every I'th key/value entry in that block 105 is called a restart point, and shares no key prefix with the previous entry. 106 Continuing the example above, if the key after "tweedledum" was "two", but was 107 part of a restart point, then that key would be encoded as {0, "two"} instead 108 of {2, "o"}. If a block has P restart points, then the block trailer consists 109 of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these 110 uint32 values are the block offsets of each restart point. The final uint32 111 value is P itself. Thus, when seeking for a particular key, one can use binary 112 search to find the largest restart point whose key is <= the key sought. 113 114 An index block is a block with N key/value entries. The i'th value is the 115 encoded block handle of the i'th data block. The i'th key is a separator for 116 i < N-1, and a successor for i == N-1. The separator between blocks i and i+1 117 is a key that is >= every key in block i and is < every key in block i+1. The 118 successor for the final block is a key that is >= every key in block N-1. The 119 index block restart interval is 1: every entry is a restart point. 120 121 A block handle is an offset and a length; the length does not include the 5 122 byte trailer. Both numbers are varint-encoded, with no padding between the two 123 values. The maximum size of an encoded block handle is therefore 20 bytes.
124 */ 125 126 const ( 127 blockTrailerLen = 5 128 blockHandleMaxLen = 10 + 10 129 130 levelDBFooterLen = 48 131 levelDBMagic = "\x57\xfb\x80\x8b\x24\x75\x47\xdb" 132 levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic) 133 134 rocksDBFooterLen = 1 + 2*blockHandleMaxLen + 4 + 8 135 rocksDBMagic = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88" 136 rocksDBMagicOffset = rocksDBFooterLen - len(rocksDBMagic) 137 rocksDBVersionOffset = rocksDBMagicOffset - 4 138 139 minFooterLen = levelDBFooterLen 140 maxFooterLen = rocksDBFooterLen 141 142 levelDBFormatVersion = 0 143 rocksDBFormatVersion2 = 2 144 145 noChecksum = 0 146 checksumCRC32c = 1 147 checksumXXHash = 2 148 149 // The block type gives the per-block compression format. 150 // These constants are part of the file format and should not be changed. 151 // They are different from the Compression constants because the latter 152 // are designed so that the zero value of the Compression type means to 153 // use the default compression (which is snappy). 154 noCompressionBlockType byte = 0 155 snappyCompressionBlockType byte = 1 156 157 metaPropertiesName = "rocksdb.properties" 158 metaRangeDelName = "rocksdb.range_del" 159 metaRangeDelV2Name = "rocksdb.range_del2" 160 161 // Index Types. 162 // A space efficient index block that is optimized for binary-search-based 163 // index. 164 binarySearchIndex = 0 165 // hashSearchIndex = 1 166 // A two-level index implementation. Both levels are binary search indexes. 167 twoLevelIndex = 2 168 // binarySearchWithFirstKeyIndex = 3 169 170 // RocksDB always includes this in the properties block. Since Pebble 171 // doesn't use zstd compression, the string will always be the same. 172 // This should be removed if we ever decide to diverge from the RocksDB 173 // properties block. 
174 rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; " 175 ) 176 177 // legacy (LevelDB) footer format: 178 // metaindex handle (varint64 offset, varint64 size) 179 // index handle (varint64 offset, varint64 size) 180 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength 181 // table_magic_number (8 bytes) 182 // new (RocksDB) footer format: 183 // checksum type (char, 1 byte) 184 // metaindex handle (varint64 offset, varint64 size) 185 // index handle (varint64 offset, varint64 size) 186 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 187 // footer version (4 bytes) 188 // table_magic_number (8 bytes) 189 type footer struct { 190 format TableFormat 191 checksum uint8 192 metaindexBH BlockHandle 193 indexBH BlockHandle 194 footerBH BlockHandle 195 } 196 197 func readFooter(f vfs.File) (footer, error) { 198 var footer footer 199 stat, err := f.Stat() 200 if err != nil { 201 return footer, fmt.Errorf("pebble/table: invalid table (could not stat file): %v", err) 202 } 203 if stat.Size() < minFooterLen { 204 return footer, errors.New("pebble/table: invalid table (file size is too small)") 205 } 206 207 buf := make([]byte, maxFooterLen) 208 off := stat.Size() - maxFooterLen 209 if off < 0 { 210 off = 0 211 } 212 n, err := f.ReadAt(buf, off) 213 if err != nil && err != io.EOF { 214 return footer, fmt.Errorf("pebble/table: invalid table (could not read footer): %v", err) 215 } 216 buf = buf[:n] 217 218 switch string(buf[len(buf)-len(rocksDBMagic):]) { 219 case levelDBMagic: 220 if len(buf) < levelDBFooterLen { 221 return footer, fmt.Errorf("pebble/table: invalid table (footer too short): %d", len(buf)) 222 } 223 footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen 224 buf = buf[len(buf)-levelDBFooterLen:] 225 footer.footerBH.Length = uint64(len(buf)) 226 footer.format = TableFormatLevelDB 227 footer.checksum = checksumCRC32c 228 229 
case rocksDBMagic: 230 if len(buf) < rocksDBFooterLen { 231 return footer, fmt.Errorf("pebble/table: invalid table (footer too short): %d", len(buf)) 232 } 233 footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen 234 buf = buf[len(buf)-rocksDBFooterLen:] 235 footer.footerBH.Length = uint64(len(buf)) 236 version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset]) 237 if version != rocksDBFormatVersion2 { 238 return footer, fmt.Errorf("pebble/table: unsupported format version %d", version) 239 } 240 footer.format = TableFormatRocksDBv2 241 footer.checksum = uint8(buf[0]) 242 if footer.checksum != checksumCRC32c { 243 return footer, fmt.Errorf("pebble/table: unsupported checksum type %d", footer.checksum) 244 } 245 buf = buf[1:] 246 247 default: 248 return footer, errors.New("pebble/table: invalid table (bad magic number)") 249 } 250 251 { 252 var n int 253 footer.metaindexBH, n = decodeBlockHandle(buf) 254 if n == 0 { 255 return footer, errors.New("pebble/table: invalid table (bad metaindex block handle)") 256 } 257 buf = buf[n:] 258 259 footer.indexBH, n = decodeBlockHandle(buf) 260 if n == 0 { 261 return footer, errors.New("pebble/table: invalid table (bad index block handle)") 262 } 263 } 264 265 return footer, nil 266 } 267 268 func (f footer) encode(buf []byte) []byte { 269 switch f.format { 270 case TableFormatLevelDB: 271 buf = buf[:levelDBFooterLen] 272 for i := range buf { 273 buf[i] = 0 274 } 275 n := encodeBlockHandle(buf[0:], f.metaindexBH) 276 n += encodeBlockHandle(buf[n:], f.indexBH) 277 copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic) 278 279 case TableFormatRocksDBv2: 280 buf = buf[:rocksDBFooterLen] 281 for i := range buf { 282 buf[i] = 0 283 } 284 buf[0] = f.checksum 285 n := 1 286 n += encodeBlockHandle(buf[n:], f.metaindexBH) 287 n += encodeBlockHandle(buf[n:], f.indexBH) 288 binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], rocksDBFormatVersion2) 289 copy(buf[len(buf)-len(rocksDBMagic):], 
rocksDBMagic) 290 } 291 292 return buf 293 }