github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/table.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 // Package sstable implements readers and writers of bitalostable tables. 6 // 7 // Tables are either opened for reading or created for writing but not both. 8 // 9 // A reader can create iterators, which allow seeking and next/prev 10 // iteration. There may be multiple key/value pairs that have the same key and 11 // different sequence numbers. 12 // 13 // A reader can be used concurrently. Multiple goroutines can call NewIter 14 // concurrently, and each iterator can run concurrently with other iterators. 15 // However, any particular iterator should not be used concurrently, and iterators 16 // should not be used once a reader is closed. 17 // 18 // A writer writes key/value pairs in increasing key order, and cannot be used 19 // concurrently. A table cannot be read until the writer has finished. 20 // 21 // Readers and writers can be created with various options. Passing a nil 22 // Options pointer is valid and means to use the default values. 23 // 24 // One such option is to define the 'less than' ordering for keys. The default 25 // Comparer uses the natural ordering consistent with bytes.Compare. The same 26 // ordering should be used for reading and writing a table. 27 // 28 // To return the value for a key: 29 // 30 // r := table.NewReader(file, options) 31 // defer r.Close() 32 // i := r.NewIter(nil, nil) 33 // defer i.Close() 34 // ikey, value := r.SeekGE(key) 35 // if options.Comparer.Compare(ikey.UserKey, key) != 0 { 36 // // not found 37 // } else { 38 // // value is the first record containing key 39 // } 40 // 41 // To count the number of entries in a table: 42 // 43 // i, n := r.NewIter(nil, nil), 0 44 // for key, value := i.First(); key != nil; key, value = i.Next() { 45 // n++ 46 // } 47 // if err := i.Close(); err != nil { 48 // return 0, err 49 // } 50 // return n, nil 51 // 52 // To write a table with three entries: 53 // 54 // w := table.NewWriter(file, options) 55 // if err := w.Set([]byte("apple"), []byte("red")); err != nil { 56 // w.Close() 57 // return err 58 // } 59 // if err := w.Set([]byte("banana"), []byte("yellow")); err != nil { 60 // w.Close() 61 // return err 62 // } 63 // if err := w.Set([]byte("cherry"), []byte("red")); err != nil { 64 // w.Close() 65 // return err 66 // } 67 // return w.Close() 68 package sstable // import "github.com/zuoyebang/bitalostable/sstable" 69 70 import ( 71 "encoding/binary" 72 "io" 73 74 "github.com/cockroachdb/errors" 75 "github.com/zuoyebang/bitalostable/internal/base" 76 ) 77 78 /* 79 The table file format looks like: 80 81 <start_of_file> 82 [data block 0] 83 [data block 1] 84 ... 85 [data block N-1] 86 [meta filter block] (optional) 87 [index block] (for single level index) 88 [meta rangedel block] (optional) 89 [meta range key block] (optional) 90 [meta properties block] 91 [metaindex block] 92 [footer] 93 <end_of_file> 94 95 A Reader eagerly loads the footer, metaindex block and meta properties block, 96 because the data contained in those blocks is needed on every read, and even 97 before reading. For example, the meta properties block is used to verify the 98 comparer and merger are compatible, and the metaindex block contains the 99 location of the meta properties (and other meta blocks). In situations where 100 file system locality matters, or one wants to minimize number of read 101 requests when eagerly loading these blocks, having these three as a suffix 102 of the file is convenient. 103 104 The interleaving of the index block(s) between the meta blocks is done to 105 match RocksDB/LevelDB behavior. 106 107 Each block consists of some data and a 5 byte trailer: a 1 byte block type and a 108 4 byte checksum. The checksum is computed over the compressed data and the first 109 byte of the trailer (i.e. the block type), and is serialized as little-endian. 110 The block type gives the per-block compression used; each block is compressed 111 independently. The checksum algorithm is described in the bitalostable/crc package. 112 113 Most blocks, other than the meta filter block, contain key/value pairs. The 114 remainder of this comment refers to the decompressed block, which has its 5 byte 115 trailer stripped. The decompressed block data consists of a sequence of such 116 key/value entries followed by a block suffix. Each key is encoded as a shared 117 prefix length and a remainder string. For example, if two adjacent keys are 118 "tweedledee" and "tweedledum", then the second key would be encoded as {8, 119 "um"}. The shared prefix length is varint encoded. The remainder string and the 120 value are encoded as a varint-encoded length followed by the literal contents. 121 To continue the example, suppose that the key "tweedledum" mapped to the value 122 "socks". The encoded key/value entry would be: "\x08\x02\x05umsocks". 123 124 Every block has a restart interval I. Every I'th key/value entry in that block 125 is called a restart point, and shares no key prefix with the previous entry. 126 Continuing the example above, if the key after "tweedledum" was "two", but was 127 part of a restart point, then that key would be encoded as {0, "two"} instead 128 of {2, "o"}. If a block has P restart points, then the block suffix consists 129 of (P+1)*4 bytes: (P+1) little-endian uint32 values. The first P of these 130 uint32 values are the block offsets of each restart point. The final uint32 131 value is P itself. Thus, when seeking for a particular key, one can use binary 132 search to find the largest restart point whose key is <= the key sought. 133 134 An index block is a block with N key/value entries. The i'th value is the 135 encoded block handle of the i'th data block. The i'th key is a separator for 136 i < N-1, and a successor for i == N-1. The separator between blocks i and i+1 137 is a key that is >= every key in block i and is < every key i block i+1. The 138 successor for the final block is a key that is >= every key in block N-1. The 139 index block restart interval is 1: every entry is a restart point. 140 141 A block handle is an offset, a length, and optional block properties (for data 142 blocks and first/lower level index blocks); the length does not include the 5 143 byte trailer. All numbers are varint-encoded, with no padding between the two 144 values. The maximum size of an encoded block handle without properties is 20 145 bytes. It is not advised to have properties that accumulate to be longer than 146 100 bytes. 147 148 Instead of a single index block, the sstable can have a two-level index (this 149 is used to prevent a single huge index block). A two-level index consists of a 150 sequence of lower-level index blocks with block handles for data blocks 151 followed by a single top-level index block with block handles for the 152 lower-level index blocks. 153 154 The metaindex block also contains block handles as values, with keys being 155 the names of the meta blocks. 156 157 */ 158 159 const ( 160 blockTrailerLen = 5 161 blockHandleMaxLenWithoutProperties = 10 + 10 162 // blockHandleLikelyMaxLen can be used for pre-allocating buffers to 163 // reduce memory copies. It is not guaranteed that a block handle will not 164 // exceed this length. 165 blockHandleLikelyMaxLen = blockHandleMaxLenWithoutProperties + 100 166 167 levelDBFooterLen = 48 168 levelDBMagic = "\x57\xfb\x80\x8b\x24\x75\x47\xdb" 169 levelDBMagicOffset = levelDBFooterLen - len(levelDBMagic) 170 171 rocksDBFooterLen = 1 + 2*blockHandleMaxLenWithoutProperties + 4 + 8 172 rocksDBMagic = "\xf7\xcf\xf4\x85\xb7\x41\xe2\x88" 173 rocksDBMagicOffset = rocksDBFooterLen - len(rocksDBMagic) 174 rocksDBVersionOffset = rocksDBMagicOffset - 4 175 rocksDBExternalFormatVersion = 2 176 177 bitalostableDBMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3" // 🪳🪳 178 179 minFooterLen = levelDBFooterLen 180 maxFooterLen = rocksDBFooterLen 181 182 levelDBFormatVersion = 0 183 rocksDBFormatVersion2 = 2 184 185 metaRangeKeyName = "bitalostable.range_key" 186 metaPropertiesName = "rocksdb.properties" 187 metaRangeDelName = "rocksdb.range_del" 188 metaRangeDelV2Name = "rocksdb.range_del2" 189 190 // Index Types. 191 // A space efficient index block that is optimized for binary-search-based 192 // index. 193 binarySearchIndex = 0 194 // hashSearchIndex = 1 195 // A two-level index implementation. Both levels are binary search indexes. 196 twoLevelIndex = 2 197 // binarySearchWithFirstKeyIndex = 3 198 199 // RocksDB always includes this in the properties block. Since Pebble 200 // doesn't use zstd compression, the string will always be the same. 201 // This should be removed if we ever decide to diverge from the RocksDB 202 // properties block. 203 rocksDBCompressionOptions = "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; " 204 ) 205 206 // ChecksumType specifies the checksum used for blocks. 207 type ChecksumType byte 208 209 // The available checksum types. 210 const ( 211 ChecksumTypeNone ChecksumType = 0 212 ChecksumTypeCRC32c ChecksumType = 1 213 ChecksumTypeXXHash ChecksumType = 2 214 ChecksumTypeXXHash64 ChecksumType = 3 215 ) 216 217 // String implements fmt.Stringer. 218 func (t ChecksumType) String() string { 219 switch t { 220 case ChecksumTypeCRC32c: 221 return "crc32c" 222 case ChecksumTypeNone: 223 return "none" 224 case ChecksumTypeXXHash: 225 return "xxhash" 226 case ChecksumTypeXXHash64: 227 return "xxhash64" 228 default: 229 panic(errors.Newf("sstable: unknown checksum type: %d", t)) 230 } 231 } 232 233 type blockType byte 234 235 const ( 236 // The block type gives the per-block compression format. 237 // These constants are part of the file format and should not be changed. 238 // They are different from the Compression constants because the latter 239 // are designed so that the zero value of the Compression type means to 240 // use the default compression (which is snappy). 241 // Not all compression types listed here are supported. 242 noCompressionBlockType blockType = 0 243 snappyCompressionBlockType blockType = 1 244 zlibCompressionBlockType blockType = 2 245 bzip2CompressionBlockType blockType = 3 246 lz4CompressionBlockType blockType = 4 247 lz4hcCompressionBlockType blockType = 5 248 xpressCompressionBlockType blockType = 6 249 zstdCompressionBlockType blockType = 7 250 ) 251 252 // String implements fmt.Stringer. 253 func (t blockType) String() string { 254 switch t { 255 case 0: 256 return "none" 257 case 1: 258 return "snappy" 259 case 2: 260 return "zlib" 261 case 3: 262 return "bzip2" 263 case 4: 264 return "lz4" 265 case 5: 266 return "lz4hc" 267 case 6: 268 return "xpress" 269 case 7: 270 return "zstd" 271 default: 272 panic(errors.Newf("sstable: unknown block type: %d", t)) 273 } 274 } 275 276 // legacy (LevelDB) footer format: 277 // 278 // metaindex handle (varint64 offset, varint64 size) 279 // index handle (varint64 offset, varint64 size) 280 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength 281 // table_magic_number (8 bytes) 282 // 283 // new (RocksDB) footer format: 284 // 285 // checksum type (char, 1 byte) 286 // metaindex handle (varint64 offset, varint64 size) 287 // index handle (varint64 offset, varint64 size) 288 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 289 // footer version (4 bytes) 290 // table_magic_number (8 bytes) 291 type footer struct { 292 format TableFormat 293 checksum ChecksumType 294 metaindexBH BlockHandle 295 indexBH BlockHandle 296 footerBH BlockHandle 297 } 298 299 func readFooter(f ReadableFile) (footer, error) { 300 var footer footer 301 stat, err := f.Stat() 302 if err != nil { 303 return footer, errors.Wrap(err, "bitalostable/table: invalid table (could not stat file)") 304 } 305 if stat.Size() < minFooterLen { 306 return footer, base.CorruptionErrorf("bitalostable/table: invalid table (file size is too small)") 307 } 308 309 buf := make([]byte, maxFooterLen) 310 off := stat.Size() - maxFooterLen 311 if off < 0 { 312 off = 0 313 } 314 n, err := f.ReadAt(buf, off) 315 if err != nil && err != io.EOF { 316 return footer, errors.Wrap(err, "bitalostable/table: invalid table (could not read footer)") 317 } 318 buf = buf[:n] 319 320 switch magic := buf[len(buf)-len(rocksDBMagic):]; string(magic) { 321 case levelDBMagic: 322 if len(buf) < levelDBFooterLen { 323 return footer, base.CorruptionErrorf( 324 "bitalostable/table: invalid table (footer too short): %d", errors.Safe(len(buf))) 325 } 326 footer.footerBH.Offset = uint64(off+int64(len(buf))) - levelDBFooterLen 327 buf = buf[len(buf)-levelDBFooterLen:] 328 footer.footerBH.Length = uint64(len(buf)) 329 footer.format = TableFormatLevelDB 330 footer.checksum = ChecksumTypeCRC32c 331 332 case rocksDBMagic, bitalostableDBMagic: 333 // NOTE: The Pebble magic string implies the same footer format as that used 334 // by the RocksDBv2 table format. 335 if len(buf) < rocksDBFooterLen { 336 return footer, base.CorruptionErrorf("bitalostable/table: invalid table (footer too short): %d", errors.Safe(len(buf))) 337 } 338 footer.footerBH.Offset = uint64(off+int64(len(buf))) - rocksDBFooterLen 339 buf = buf[len(buf)-rocksDBFooterLen:] 340 footer.footerBH.Length = uint64(len(buf)) 341 version := binary.LittleEndian.Uint32(buf[rocksDBVersionOffset:rocksDBMagicOffset]) 342 343 format, err := ParseTableFormat(magic, version) 344 if err != nil { 345 return footer, err 346 } 347 footer.format = format 348 349 switch ChecksumType(buf[0]) { 350 case ChecksumTypeCRC32c: 351 footer.checksum = ChecksumTypeCRC32c 352 case ChecksumTypeXXHash64: 353 footer.checksum = ChecksumTypeXXHash64 354 default: 355 return footer, base.CorruptionErrorf("bitalostable/table: unsupported checksum type %d", errors.Safe(footer.checksum)) 356 } 357 buf = buf[1:] 358 359 default: 360 return footer, base.CorruptionErrorf("bitalostable/table: invalid table (bad magic number)") 361 } 362 363 { 364 end := uint64(stat.Size()) 365 var n int 366 footer.metaindexBH, n = decodeBlockHandle(buf) 367 if n == 0 || footer.metaindexBH.Offset+footer.metaindexBH.Length > end { 368 return footer, base.CorruptionErrorf("bitalostable/table: invalid table (bad metaindex block handle)") 369 } 370 buf = buf[n:] 371 372 footer.indexBH, n = decodeBlockHandle(buf) 373 if n == 0 || footer.indexBH.Offset+footer.indexBH.Length > end { 374 return footer, base.CorruptionErrorf("bitalostable/table: invalid table (bad index block handle)") 375 } 376 } 377 378 return footer, nil 379 } 380 381 func (f footer) encode(buf []byte) []byte { 382 switch magic, version := f.format.AsTuple(); magic { 383 case levelDBMagic: 384 buf = buf[:levelDBFooterLen] 385 for i := range buf { 386 buf[i] = 0 387 } 388 n := encodeBlockHandle(buf[0:], f.metaindexBH) 389 encodeBlockHandle(buf[n:], f.indexBH) 390 copy(buf[len(buf)-len(levelDBMagic):], levelDBMagic) 391 392 case rocksDBMagic, bitalostableDBMagic: 393 buf = buf[:rocksDBFooterLen] 394 for i := range buf { 395 buf[i] = 0 396 } 397 switch f.checksum { 398 case ChecksumTypeNone: 399 buf[0] = byte(ChecksumTypeNone) 400 case ChecksumTypeCRC32c: 401 buf[0] = byte(ChecksumTypeCRC32c) 402 case ChecksumTypeXXHash: 403 buf[0] = byte(ChecksumTypeXXHash) 404 case ChecksumTypeXXHash64: 405 buf[0] = byte(ChecksumTypeXXHash64) 406 default: 407 panic("unknown checksum type") 408 } 409 n := 1 410 n += encodeBlockHandle(buf[n:], f.metaindexBH) 411 encodeBlockHandle(buf[n:], f.indexBH) 412 binary.LittleEndian.PutUint32(buf[rocksDBVersionOffset:], version) 413 copy(buf[len(buf)-len(rocksDBMagic):], magic) 414 415 default: 416 panic("sstable: unspecified table format version") 417 } 418 419 return buf 420 } 421 422 func supportsTwoLevelIndex(format TableFormat) bool { 423 switch format { 424 case TableFormatLevelDB: 425 return false 426 case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2: 427 return true 428 default: 429 panic("sstable: unspecified table format version") 430 } 431 }