// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
)

// TableFormat specifies the format version for sstables. The legacy LevelDB
// format is format version 1.
//
// Values are converted from and to their (magic string, version) pair by
// ParseTableFormat and TableFormat.AsTuple respectively.
type TableFormat uint32

// The available table formats, representing the tuple (magic number, version
// number). Note that these values are not (and should not) be serialized to
// disk. The ordering should follow the order the versions were introduced to
// Pebble (i.e. the history is linear).
const (
	// TableFormatUnspecified is the zero value. ParseTableFormat returns it
	// alongside every error; AsTuple and String panic on it.
	TableFormatUnspecified TableFormat = iota
	TableFormatLevelDB   // LevelDB magic; AsTuple reports version 0.
	TableFormatRocksDBv2 // RocksDB magic; AsTuple reports version 2.
	TableFormatPebblev1 // Block properties.
	TableFormatPebblev2 // Range keys.
	TableFormatPebblev3 // Value blocks.
	TableFormatPebblev4 // DELSIZED tombstones.
	// NumTableFormats is the number of values declared above, counting
	// TableFormatUnspecified.
	NumTableFormats

	// TableFormatMax is the last format declared above (currently
	// TableFormatPebblev4).
	TableFormatMax = NumTableFormats - 1
)

// TableFormatPebblev4, in addition to DELSIZED, introduces the use of
// InternalKeyKindSSTableInternalObsoleteBit.
//
// 1. Motivation
//
// We have various related problems caused by Pebble snapshots:
//
//   - P1: RANGEDELs that delete points in the same sstable, but the points
//     happen to not get deleted during compactions because of an open snapshot.
//     This causes very expensive iteration, that has been observed in
//     production deployments.
//
//   - P2: When iterating over a foreign sstable (in disaggregated storage), we
//     need to do (a) point collapsing to expose at most one point per user key,
//     (b) apply RANGEDELs in the sstable to hide deleted points in the same
//     sstable.
//     This per-sstable point collapsing iteration needs to be very
//     efficient (ideally as efficient from a CPU perspective as iteration over
//     regular sstables) since foreign sstables can be very long-lived -- one of
//     the goals of disaggregated storage is to scale compute and disk bandwidth
//     resources as a function of the hot (from a write perspective) data and
//     not the whole data, so we don't want to have to rewrite foreign sstables
//     solely to improve read performance.
//
// The ideal solution for P2 would allow user-facing reads to utilize the
// existing SST iterators (with slight modifications) and with no loss of
// efficiency. And for P1 and P2 we would like to skip whole blocks of
// overwritten/deleted points. Even when we can't skip whole blocks, avoiding
// key comparisons at iteration time to discover what points are deleted is
// very desirable, since keys can be long.
//
// We observe that:
//
//   - Reads:
//     - All user-facing reads in CockroachDB use iterators over the DB, hence
//       have a higher read seqnum than all sstables (there are some rare cases
//       that can violate this, but those are not important from a performance
//       optimization perspective).
//
//     - Certain internal-facing reads in CockroachDB use snapshots, but the
//       snapshots are short-lived enough that most L5 and L6 sstables will have
//       all seqnums lower than the snapshot seqnum.
//
//   - Writes:
//     - We already do key comparisons between points when writing the sstable
//       to ensure that the sstable invariant (monotonically increasing internal
//       keys) is not violated. So we know which points share the same userkey,
//       and thereby which points are obsolete because there is a more recent
//       point in the same sstable.
//
//     - The compactionIter knows which point is deleted by a RANGEDEL even if
//       the point does need to be written because of a snapshot.
//
// So this known information can be encoded in the sstable at write time and
// utilized for optimized reading.
//
// 2. Solution
//
// We primarily scope the solution to the following point kinds: SET,
// SETWITHDEL, DEL, DELSIZED, SINGLEDEL. These are the ones marked locally
// obsolete, i.e., obsolete within the sstable, and we can guarantee that at
// most one point will be exposed per user key. MERGE keys create more
// complexity: MERGE followed by MERGE causes multiple keys to not be
// obsolete. The same applies to MERGE followed by SET/SETWITHDEL/DEL*. Note
// that:
//
//   - For regular sst iteration, the obsolete marking is a performance
//     optimization, and multiple keys for the same userkey can be handled by
//     higher layers in the iterator tree (specifically pebble.Iterator).
//
//   - For foreign sst iteration, we disallow MERGEs to be written to such
//     shared ssts (details below).
//
// The key kinds are marked with an obsolete bit
// (InternalKeyKindSSTableInternalObsoleteBit) when the key-value pair is
// obsolete. This marking is done within blockWriter, based on information
// passed to it by Writer. In turn, Writer uses a combination of key
// comparisons and information provided by compactionIter to decide whether a
// key-value pair is obsolete. Additionally, a Pebble-internal
// BlockPropertyCollector (obsoleteKeyBlockPropertyCollector) is used to mark
// blocks where all key-value pairs are obsolete. Since the common case is
// non-obsolete blocks, this block property collector uses the empty byte
// slice to represent a non-obsolete block, which consumes no space in
// BlockHandleWithProperties.Props.
//
// At read time, the obsolete bit is only visible to the blockIter, which can
// be optionally configured to hide obsolete points.
// This hiding is only
// configured for data block iterators for sstables being read by user-facing
// iterators at a seqnum greater than the max seqnum in the sstable.
// Additionally, when this hiding is configured, a Pebble-internal block
// property filter (obsoleteKeyBlockPropertyFilter) is used to skip whole
// blocks that are obsolete.
//
// 2.1 Correctness
//
// Due to the level invariant, the sequence of seqnums for a user key in an
// sstable represents a contiguous subsequence of the seqnums for the userkey
// across the whole LSM, and is more recent than the seqnums in an sstable in a
// lower level. So exposing exactly one point from an sstable for a userkey
// will also mask the points for the userkey in lower levels. If we expose no
// point, because of a RANGEDEL, that RANGEDEL will also mask the points in
// lower levels.
//
// Note that we do not need to do anything special at write time for
// SETWITHDEL and SINGLEDEL. This is because these key kinds are treated
// specially only by compactions, which do not hide obsolete points. For
// regular reads, SETWITHDEL behaves the same as SET and SINGLEDEL behaves the
// same as DEL.
//
// 2.2 Strictness and MERGE
//
// Setting the obsolete bit on point keys is advanced usage, so we support two
// modes, both of which must be truthful when setting the obsolete bit, but
// vary in when they don't set the obsolete bit.
//
//   - Non-strict: In this mode, the bit does not need to be set for keys that
//     are obsolete. Additionally, any sstable containing MERGE keys can only
//     use this mode. An iterator over such an sstable, when configured to
//     hideObsoletePoints, can expose multiple internal keys per user key, and
//     can expose keys that are deleted by rangedels in the same sstable. This
//     is the mode that non-advanced users should use.
//     Pebble without
//     disaggregated storage will also use this mode and will best-effort set
//     the obsolete bit, to optimize iteration when snapshots have retained many
//     obsolete keys.
//
//   - Strict: In this mode, every obsolete key must have the obsolete bit set,
//     and no MERGE keys are permitted. An iterator over such an sstable, when
//     configured to hideObsoletePoints, satisfies two properties:
//     - S1: will expose at most one internal key per user key, which is the
//       most recent one.
//     - S2: will never expose keys that are deleted by rangedels in the same
//       sstable.
//
//     This is the mode for two use cases in disaggregated storage (which will
//     exclude parts of the key space that have MERGEs), for levels that contain
//     sstables that can become foreign sstables:
//     - Pebble compaction output to these levels that can become foreign
//       sstables.
//
//     - CockroachDB ingest operations that can ingest into the levels that can
//       become foreign sstables. Note, these are not sstables corresponding to
//       copied data for CockroachDB range snapshots. This case occurs for
//       operations like index backfills: these trivially satisfy the strictness
//       criteria since they only write one key per userkey.
//
//     TODO(sumeer): this latter case is not currently supported, since only
//     Writer.AddWithForceObsolete calls are permitted for writing strict
//     obsolete sstables. This is done to reduce the likelihood of bugs. One
//     simple way to lift this limitation would be to disallow adding any
//     RANGEDELs when a Pebble-external writer is trying to construct a strict
//     obsolete sstable.

// ParseTableFormat parses the given magic bytes and version into its
// corresponding internal TableFormat.
186 func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) { 187 switch string(magic) { 188 case levelDBMagic: 189 return TableFormatLevelDB, nil 190 case rocksDBMagic: 191 if version != rocksDBFormatVersion2 { 192 return TableFormatUnspecified, base.CorruptionErrorf( 193 "pebble/table: unsupported rocksdb format version %d", errors.Safe(version), 194 ) 195 } 196 return TableFormatRocksDBv2, nil 197 case pebbleDBMagic: 198 switch version { 199 case 1: 200 return TableFormatPebblev1, nil 201 case 2: 202 return TableFormatPebblev2, nil 203 case 3: 204 return TableFormatPebblev3, nil 205 case 4: 206 return TableFormatPebblev4, nil 207 default: 208 return TableFormatUnspecified, base.CorruptionErrorf( 209 "pebble/table: unsupported pebble format version %d", errors.Safe(version), 210 ) 211 } 212 default: 213 return TableFormatUnspecified, base.CorruptionErrorf( 214 "pebble/table: invalid table (bad magic number: 0x%x)", magic, 215 ) 216 } 217 } 218 219 // AsTuple returns the TableFormat's (Magic String, Version) tuple. 220 func (f TableFormat) AsTuple() (string, uint32) { 221 switch f { 222 case TableFormatLevelDB: 223 return levelDBMagic, 0 224 case TableFormatRocksDBv2: 225 return rocksDBMagic, 2 226 case TableFormatPebblev1: 227 return pebbleDBMagic, 1 228 case TableFormatPebblev2: 229 return pebbleDBMagic, 2 230 case TableFormatPebblev3: 231 return pebbleDBMagic, 3 232 case TableFormatPebblev4: 233 return pebbleDBMagic, 4 234 default: 235 panic("sstable: unknown table format version tuple") 236 } 237 } 238 239 // String returns the TableFormat (Magic String,Version) tuple. 
240 func (f TableFormat) String() string { 241 switch f { 242 case TableFormatLevelDB: 243 return "(LevelDB)" 244 case TableFormatRocksDBv2: 245 return "(RocksDB,v2)" 246 case TableFormatPebblev1: 247 return "(Pebble,v1)" 248 case TableFormatPebblev2: 249 return "(Pebble,v2)" 250 case TableFormatPebblev3: 251 return "(Pebble,v3)" 252 case TableFormatPebblev4: 253 return "(Pebble,v4)" 254 default: 255 panic("sstable: unknown table format version tuple") 256 } 257 }