github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/table.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 // Package sstable implements readers and writers of pebble tables. 6 // 7 // Tables are either opened for reading or created for writing but not both. 8 // 9 // A reader can create iterators, which allow seeking and next/prev 10 // iteration. There may be multiple key/value pairs that have the same key and 11 // different sequence numbers. 12 // 13 // A reader can be used concurrently. Multiple goroutines can call NewIter 14 // concurrently, and each iterator can run concurrently with other iterators. 15 // However, any particular iterator should not be used concurrently, and iterators 16 // should not be used once a reader is closed. 17 // 18 // A writer writes key/value pairs in increasing key order, and cannot be used 19 // concurrently. A table cannot be read until the writer has finished. 20 // 21 // Readers and writers can be created with various options. Passing a nil 22 // Options pointer is valid and means to use the default values. 23 // 24 // One such option is to define the 'less than' ordering for keys. The default 25 // Comparer uses the natural ordering consistent with bytes.Compare. The same 26 // ordering should be used for reading and writing a table. 
27 // 28 // To return the value for a key: 29 // 30 // r := table.NewReader(file, options) 31 // defer r.Close() 32 // i := r.NewIter(nil, nil) 33 // defer i.Close() 34 // ikey, value := i.SeekGE(key) 35 // if options.Comparer.Compare(ikey.UserKey, key) != 0 { 36 // // not found 37 // } else { 38 // // value is the first record containing key 39 // } 40 // 41 // To count the number of entries in a table: 42 // 43 // i, n := r.NewIter(nil, nil), 0 44 // for key, value := i.First(); key != nil; key, value = i.Next() { 45 // n++ 46 // } 47 // if err := i.Close(); err != nil { 48 // return 0, err 49 // } 50 // return n, nil 51 // 52 // To write a table with three entries: 53 // 54 // w := table.NewWriter(file, options) 55 // if err := w.Set([]byte("apple"), []byte("red")); err != nil { 56 // w.Close() 57 // return err 58 // } 59 // if err := w.Set([]byte("banana"), []byte("yellow")); err != nil { 60 // w.Close() 61 // return err 62 // } 63 // if err := w.Set([]byte("cherry"), []byte("red")); err != nil { 64 // w.Close() 65 // return err 66 // } 67 // return w.Close() 68 package sstable // import "github.com/cockroachdb/pebble/sstable" 69 70 import ( 71 "context" 72 "encoding/binary" 73 74 "github.com/cockroachdb/errors" 75 "github.com/cockroachdb/pebble/internal/base" 76 "github.com/cockroachdb/pebble/objstorage" 77 ) 78 79 /* 80 The table file format looks like: 81 82 <start_of_file> 83 [data block 0] 84 [data block 1] 85 ... 86 [data block N-1] 87 [meta filter block] (optional) 88 [index block] (for single level index) 89 [meta rangedel block] (optional) 90 [meta range key block] (optional) 91 [value block 0] (optional) 92 [value block M-1] (optional) 93 [meta value index block] (optional) 94 [meta properties block] 95 [metaindex block] 96 [footer] 97 <end_of_file> 98 99 A Reader eagerly loads the footer, metaindex block and meta properties block, 100 because the data contained in those blocks is needed on every read, and even 101 before reading. 
For example, the meta properties block is used to verify the 102 comparer and merger are compatible, and the metaindex block contains the 103 location of the meta properties (and other meta blocks). In situations where 104 file system locality matters, or one wants to minimize number of read 105 requests when eagerly loading these blocks, having these three as a suffix 106 of the file is convenient. 107 108 The interleaving of the index block(s) between the meta blocks is done to 109 match RocksDB/LevelDB behavior. 110 111 Each block consists of some data and a 5 byte trailer: a 1 byte block type and a 112 4 byte checksum. The checksum is computed over the compressed data and the first 113 byte of the trailer (i.e. the block type), and is serialized as little-endian. 114 The block type gives the per-block compression used; each block is compressed 115 independently. The checksum algorithm is described in the pebble/crc package. 116 117 Most blocks, other than the meta filter block, value blocks and meta value 118 index block, contain key/value pairs. The remainder of this comment refers to 119 the decompressed block, containing key/value pairs, which has its 5 byte 120 trailer stripped. The decompressed block data consists of a sequence of such 121 key/value entries followed by a block suffix. Each key is encoded as a shared 122 prefix length and a remainder string. For example, if two adjacent keys are 123 "tweedledee" and "tweedledum", then the second key would be encoded as {8, 124 "um"}. The shared prefix length is varint encoded. The remainder string and the 125 value are encoded as a varint-encoded length followed by the literal contents. 126 To continue the example, suppose that the key "tweedledum" mapped to the value 127 "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks". 128 129 Every block has a restart interval I. Every I'th key/value entry in that block 130 is called a restart point, and shares no key prefix with the previous entry. 
131 Continuing the example above, if the key after "tweedledum" was "two", but was 132 part of a restart point, then that key would be encoded as {0, "two"} instead 133 of {2, "o"}. If a block has P restart points, then the block suffix consists 134 of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these 135 uint32 values are the block offsets of each restart point. The final uint32 136 value is P itself. Thus, when seeking for a particular key, one can use binary 137 search to find the largest restart point whose key is <= the key sought. 138 139 An index block is a block with N key/value entries. The i'th value is the 140 encoded block handle of the i'th data block. The i'th key is a separator for 141 i < N-1, and a successor for i == N-1. The separator between blocks i and i+1 142 is a key that is >= every key in block i and is < every key in block i+1. The 143 successor for the final block is a key that is >= every key in block N-1. The 144 index block restart interval is 1: every entry is a restart point. 145 146 A block handle is an offset, a length, and optional block properties (for data 147 blocks and first/lower level index blocks); the length does not include the 5 148 byte trailer. All numbers are varint-encoded, with no padding between the two 149 values. The maximum size of an encoded block handle without properties is 20 150 bytes. It is not advised to have properties that accumulate to be longer than 151 100 bytes. 152 153 Instead of a single index block, the sstable can have a two-level index (this 154 is used to prevent a single huge index block). A two-level index consists of a 155 sequence of lower-level index blocks with block handles for data blocks 156 followed by a single top-level index block with block handles for the 157 lower-level index blocks. 158 159 The metaindex block also contains block handles as values, with keys being 160 the names of the meta blocks. 
For a description of value blocks and the meta value index block, see
value_block.go.

Data blocks have some additional features:
- For TableFormatPebblev3 onwards:
  - For SETs, the value has a 1 byte value prefix, which indicates whether the
    value is inline, or in a separate value block, and indicates whether the
    prefix of the userkey (as defined by split) has changed or not. See
    value_block.go for details.
  - The most significant bit of the restart points is used to indicate whether
    userkey prefix has changed since the last restart point. See the detailed
    comment in blockWriter.
  - The maximum length of the "shared prefix" when encoding the key, is the
    length of the prefix of the userkey (as defined by split) of the previous
    key.

- For TableFormatPebblev4 onwards:
  - The key kinds may be altered to set the
    InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete
    in the context of that sstable (for a reader that reads at a higher seqnum
    than the highest seqnum in the sstable). For details, see the comment in
    format.go.
*/

const (
	// blockTrailerLen is the size in bytes of the trailer that follows every
	// block: a 1 byte block type plus a 4 byte checksum.
	blockTrailerLen = 5
	// blockHandleMaxLenWithoutProperties is the maximum encoded size of a
	// block handle that carries no properties: two varint-encoded values
	// (offset and length) of at most 10 bytes each.
	blockHandleMaxLenWithoutProperties = 10 + 10
	// blockHandleLikelyMaxLen can be used for pre-allocating buffers to
	// reduce memory copies. It is not guaranteed that a block handle will not
	// exceed this length.
	blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100

	// levelDBFooterLen is the size of the legacy (LevelDB) footer: the two
	// block handles padded to 2*blockHandleMaxLenWithoutProperties bytes,
	// followed by the 8 byte magic number.
	levelDBFooterLen = 48
	// levelDBMagic is the magic number that terminates a LevelDB footer.
	levelDBMagic = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"
	// levelDBMagicOffset is the offset of the magic number within a LevelDB
	// footer.
	levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic)

	// rocksDBFooterLen is the size of the new (RocksDB) footer: a 1 byte
	// checksum type, the two padded block handles, a 4 byte footer version
	// and the 8 byte magic number.
	rocksDBFooterLen = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8
	// rocksDBMagic is the magic number that terminates a RocksDB footer.
	rocksDBMagic = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88"
	// rocksDBMagicOffset is the offset of the magic number within a RocksDB
	// footer.
	rocksDBMagicOffset = rocksDBFooterLen - len(rocksDBMagic)
	// rocksDBVersionOffset is the offset of the 4 byte footer version, which
	// immediately precedes the magic number.
	rocksDBVersionOffset         = rocksDBMagicOffset - 4
	rocksDBExternalFormatVersion = 2

	// pebbleDBMagic is the Pebble magic number. A footer ending in it uses the
	// same layout as the RocksDB footer (see readFooter).
	pebbleDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳

	// minFooterLen and maxFooterLen bound the footer size across all formats.
	// readFooter rejects files smaller than minFooterLen and reads at most the
	// trailing maxFooterLen bytes of a file.
	minFooterLen = levelDBFooterLen
	maxFooterLen = rocksDBFooterLen

	levelDBFormatVersion  = 0
	rocksDBFormatVersion2 = 2

	// Names of meta blocks. The metaindex block maps meta block names to
	// block handles.
	metaRangeKeyName   = "pebble.range_key"
	metaValueIndexName = "pebble.value_index"
	metaPropertiesName = "rocksdb.properties"
	metaRangeDelName   = "rocksdb.range_del"
	metaRangeDelV2Name = "rocksdb.range_del2"

	// Index Types.
	// A space efficient index block that is optimized for binary-search-based
	// index.
	binarySearchIndex = 0
	// hashSearchIndex               = 1
	// A two-level index implementation. Both levels are binary search indexes.
	twoLevelIndex = 2
	// binarySearchWithFirstKeyIndex = 3

	// RocksDB always includes this in the properties block. Since Pebble
	// doesn't use zstd compression, the string will always be the same.
	// This should be removed if we ever decide to diverge from the RocksDB
	// properties block.
	rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; "
)

// ChecksumType specifies the checksum used for blocks.
type ChecksumType byte

// The available checksum types. The numeric value is what footer.encode
// writes as the first byte of a RocksDB/Pebble footer. Note that readFooter
// accepts only ChecksumTypeCRC32c and ChecksumTypeXXHash64 there.
const (
	ChecksumTypeNone     ChecksumType = 0
	ChecksumTypeCRC32c   ChecksumType = 1
	ChecksumTypeXXHash   ChecksumType = 2
	ChecksumTypeXXHash64 ChecksumType = 3
)

// String implements fmt.Stringer.
246 func (t ChecksumType) String() string { 247 switch t { 248 case ChecksumTypeCRC32c: 249 return "crc32c" 250 case ChecksumTypeNone: 251 return "none" 252 case ChecksumTypeXXHash: 253 return "xxhash" 254 case ChecksumTypeXXHash64: 255 return "xxhash64" 256 default: 257 panic(errors.Newf("sstable: unknown checksum type: %d", t)) 258 } 259 } 260 261 type blockType byte 262 263 const ( 264 // The block type gives the per-block compression format. 265 // These constants are part of the file format and should not be changed. 266 // They are different from the Compression constants because the latter 267 // are designed so that the zero value of the Compression type means to 268 // use the default compression (which is snappy). 269 // Not all compression types listed here are supported. 270 noCompressionBlockType blockType = 0 271 snappyCompressionBlockType blockType = 1 272 zlibCompressionBlockType blockType = 2 273 bzip2CompressionBlockType blockType = 3 274 lz4CompressionBlockType blockType = 4 275 lz4hcCompressionBlockType blockType = 5 276 xpressCompressionBlockType blockType = 6 277 zstdCompressionBlockType blockType = 7 278 ) 279 280 // String implements fmt.Stringer. 
281 func (t blockType) String() string { 282 switch t { 283 case 0: 284 return "none" 285 case 1: 286 return "snappy" 287 case 2: 288 return "zlib" 289 case 3: 290 return "bzip2" 291 case 4: 292 return "lz4" 293 case 5: 294 return "lz4hc" 295 case 6: 296 return "xpress" 297 case 7: 298 return "zstd" 299 default: 300 panic(errors.Newf("sstable: unknown block type: %d", t)) 301 } 302 } 303 304 // legacy (LevelDB) footer format: 305 // 306 // metaindex handle (varint64 offset, varint64 size) 307 // index handle (varint64 offset, varint64 size) 308 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength 309 // table_magic_number (8 bytes) 310 // 311 // new (RocksDB) footer format: 312 // 313 // checksum type (char, 1 byte) 314 // metaindex handle (varint64 offset, varint64 size) 315 // index handle (varint64 offset, varint64 size) 316 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 317 // footer version (4 bytes) 318 // table_magic_number (8 bytes) 319 type footer struct { 320 format TableFormat 321 checksum ChecksumType 322 metaindexBH BlockHandle 323 indexBH BlockHandle 324 footerBH BlockHandle 325 } 326 327 func readFooter(f objstorage.Readable) (footer, error) { 328 var footer footer 329 size := f.Size() 330 if size < minFooterLen { 331 return footer, base.CorruptionErrorf("pebble/table: invalid table (file size is too small)") 332 } 333 334 buf := make([]byte, maxFooterLen) 335 off := size - maxFooterLen 336 if off < 0 { 337 off = 0 338 buf = buf[:size] 339 } 340 if err := f.ReadAt(context.TODO(), buf, off); err != nil { 341 return footer, errors.Wrap(err, "pebble/table: invalid table (could not read footer)") 342 } 343 344 switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) { 345 case levelDBMagic: 346 if len(buf) < levelDBFooterLen { 347 return footer, base.CorruptionErrorf( 348 "pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf))) 349 } 350 footer.footerBH.Offset = 
uint64(off+int64(len(buf))) - levelDBFooterLen 351 buf = buf[len(buf)-levelDBFooterLen:] 352 footer.footerBH.Length = uint64(len(buf)) 353 footer.format = TableFormatLevelDB 354 footer.checksum = ChecksumTypeCRC32c 355 356 case rocksDBMagic, pebbleDBMagic: 357 // NOTE: The Pebble magic string implies the same footer format as that used 358 // by the RocksDBv2 table format. 359 if len(buf) < rocksDBFooterLen { 360 return footer, base.CorruptionErrorf("pebble/table: invalid table (footer too short): %d", errors.Safe(len(buf))) 361 } 362 footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen 363 buf = buf[len(buf)-rocksDBFooterLen:] 364 footer.footerBH.Length = uint64(len(buf)) 365 version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset]) 366 367 format, err := ParseTableFormat(magic, version) 368 if err != nil { 369 return footer, err 370 } 371 footer.format = format 372 373 switch ChecksumType(buf[0]) { 374 case ChecksumTypeCRC32c: 375 footer.checksum = ChecksumTypeCRC32c 376 case ChecksumTypeXXHash64: 377 footer.checksum = ChecksumTypeXXHash64 378 default: 379 return footer, base.CorruptionErrorf("pebble/table: unsupported checksum type %d", errors.Safe(footer.checksum)) 380 } 381 buf = buf[1:] 382 383 default: 384 return footer, base.CorruptionErrorf("pebble/table: invalid table (bad magic number: 0x%x)", magic) 385 } 386 387 { 388 end := uint64(size) 389 var n int 390 footer.metaindexBH, n = decodeBlockHandle(buf) 391 if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end { 392 return footer, base.CorruptionErrorf("pebble/table: invalid table (bad metaindex block handle)") 393 } 394 buf = buf[n:] 395 396 footer.indexBH, n = decodeBlockHandle(buf) 397 if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end { 398 return footer, base.CorruptionErrorf("pebble/table: invalid table (bad index block handle)") 399 } 400 } 401 402 return footer, nil 403 } 404 405 func (f footer) encode(buf []byte) []byte { 
406 switch magic, version := f.format.AsTuple(); magic { 407 case levelDBMagic: 408 buf = buf[:levelDBFooterLen] 409 for i := range buf { 410 buf[i] = 0 411 } 412 n := encodeBlockHandle(buf[0:], f.metaindexBH) 413 encodeBlockHandle(buf[n:], f.indexBH) 414 copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic) 415 416 case rocksDBMagic, pebbleDBMagic: 417 buf = buf[:rocksDBFooterLen] 418 for i := range buf { 419 buf[i] = 0 420 } 421 switch f.checksum { 422 case ChecksumTypeNone: 423 buf[0] = byte(ChecksumTypeNone) 424 case ChecksumTypeCRC32c: 425 buf[0] = byte(ChecksumTypeCRC32c) 426 case ChecksumTypeXXHash: 427 buf[0] = byte(ChecksumTypeXXHash) 428 case ChecksumTypeXXHash64: 429 buf[0] = byte(ChecksumTypeXXHash64) 430 default: 431 panic("unknown checksum type") 432 } 433 n := 1 434 n += encodeBlockHandle(buf[n:], f.metaindexBH) 435 encodeBlockHandle(buf[n:], f.indexBH) 436 binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version) 437 copy(buf[len(buf)-len(rocksDBMagic):], magic) 438 439 default: 440 panic("sstable: unspecified table format version") 441 } 442 443 return buf 444 } 445 446 func supportsTwoLevelIndex(format TableFormat) bool { 447 switch format { 448 case TableFormatLevelDB: 449 return false 450 case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4: 451 return true 452 default: 453 panic("sstable: unspecified table format version") 454 } 455 }