github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/options.go

// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/cache"
)

// Compression is the per-block compression algorithm to use.
type Compression int

// The available compression types.
const (
	DefaultCompression Compression = iota
	NoCompression
	SnappyCompression
	ZstdCompression
	NCompression
)

var ignoredInternalProperties = map[string]struct{}{
	"rocksdb.column.family.id":             {},
	"rocksdb.fixed.key.length":             {},
	"rocksdb.index.key.is.user.key":        {},
	"rocksdb.index.value.is.delta.encoded": {},
	"rocksdb.oldest.key.time":              {},
	"rocksdb.creation.time":                {},
	"rocksdb.file.creation.time":           {},
	"rocksdb.format.version":               {},
}

func (c Compression) String() string {
	switch c {
	case DefaultCompression:
		return "Default"
	case NoCompression:
		return "NoCompression"
	case SnappyCompression:
		return "Snappy"
	case ZstdCompression:
		return "ZSTD"
	default:
		return "Unknown"
	}
}

// FilterType exports the base.FilterType type.
type FilterType = base.FilterType

// Exported TableFilter constants.
const (
	TableFilter = base.TableFilter
)

// FilterWriter exports the base.FilterWriter type.
type FilterWriter = base.FilterWriter

// FilterPolicy exports the base.FilterPolicy type.
type FilterPolicy = base.FilterPolicy

// TablePropertyCollector provides a hook for collecting user-defined
// properties based on the keys and values stored in an sstable. A new
// TablePropertyCollector is created for an sstable when the sstable is being
// written.
type TablePropertyCollector interface {
	// Add is called with each new entry added to the sstable. While the sstable
	// is itself sorted by key, do not assume that the entries are added in any
	// particular order. In particular, the ordering of point entries and range
	// tombstones is unspecified.
	Add(key InternalKey, value []byte) error

	// Finish is called when all entries have been added to the sstable. The
	// collected properties (if any) should be added to the specified map. Note
	// that in case of an error during sstable construction, Finish may not be
	// called.
	Finish(userProps map[string]string) error

	// The name of the property collector.
	Name() string
}
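// As an illustrative sketch (not part of this package's API), a minimal
// collector that records how many entries were added to a table could look
// like the following. The entryCountCollector type and the
// "example.entry-count" property key are hypothetical, and strconv would
// need to be imported:
//
//	type entryCountCollector struct {
//		count int
//	}
//
//	func (c *entryCountCollector) Add(key InternalKey, value []byte) error {
//		c.count++
//		return nil
//	}
//
//	func (c *entryCountCollector) Finish(userProps map[string]string) error {
//		userProps[c.Name()] = strconv.Itoa(c.count)
//		return nil
//	}
//
//	func (c *entryCountCollector) Name() string { return "example.entry-count" }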
// SuffixReplaceableTableCollector is an extension to the TablePropertyCollector
// interface that allows a table property collector to indicate that it supports
// being *updated* during suffix replacement, i.e. when an existing SST in which
// all keys have the same key suffix is updated to have a new suffix.
//
// A collector which supports being updated in such cases must be able to derive
// its updated value from its old value and the change being made to the suffix,
// without needing to be passed each updated K/V.
//
// For example, a collector that only inspects values can simply copy its
// previously computed property as-is, since key-suffix replacement does not
// change values, while a collector that depends only on key suffixes, like one
// which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just
// set its new bounds from the new suffix, as it is common to all keys, without
// needing to recompute it from every key.
type SuffixReplaceableTableCollector interface {
	// UpdateKeySuffixes is called when a table is updated to change the suffix
	// of all keys in the table, and is passed the table's old value for that
	// property, if any, as well as the old and new suffix.
	UpdateKeySuffixes(oldProps map[string]string, oldSuffix, newSuffix []byte) error
}
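// A hypothetical sketch of the update path: a collector whose property is
// the shared key suffix itself can satisfy SuffixReplaceableTableCollector
// by recording the new suffix directly, with no per-key work. The
// suffixCollector type is illustrative only, and it would also need to
// implement the TablePropertyCollector methods (elided here):
//
//	type suffixCollector struct {
//		suffix []byte
//	}
//
//	// UpdateKeySuffixes derives the new property purely from the new
//	// suffix, since it is common to all keys in the table.
//	func (c *suffixCollector) UpdateKeySuffixes(
//		oldProps map[string]string, oldSuffix, newSuffix []byte,
//	) error {
//		c.suffix = append(c.suffix[:0], newSuffix...)
//		return nil
//	}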
// ReaderOptions holds the parameters needed for reading an sstable.
type ReaderOptions struct {
	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default is a zero-size cache.
	Cache *cache.Cache

	// DeniedUserProperties contains the names of user properties that will
	// not be added to sst.Properties.UserProperties.
	DeniedUserProperties map[string]struct{}

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// Merge defines the Merge function in use for this keyspace.
	Merge base.Merge

	// Filters is a map from filter policy name to filter policy. It is used for
	// debugging tools which may be used on multiple databases configured with
	// different filter policies. It is not necessary to populate this filters
	// map during normal usage of a DB.
	Filters map[string]FilterPolicy

	// Merger defines the associative merge operation to use for merging values
	// written with {Batch,DB}.Merge. The MergerName is checked for consistency
	// with the value stored in the sstable when it was written.
	MergerName string

	// LoggerAndTracer is an optional logger and tracer.
	LoggerAndTracer base.LoggerAndTracer
}

func (o ReaderOptions) ensureDefaults() ReaderOptions {
	if o.Comparer == nil {
		o.Comparer = base.DefaultComparer
	}
	if o.Merge == nil {
		o.Merge = base.DefaultMerger.Merge
	}
	if o.MergerName == "" {
		o.MergerName = base.DefaultMerger.Name
	}
	if o.LoggerAndTracer == nil {
		o.LoggerAndTracer = base.NoopLoggerAndTracer{}
	}
	if o.DeniedUserProperties == nil {
		o.DeniedUserProperties = ignoredInternalProperties
	}
	return o
}

// WriterOptions holds the parameters used to control building an sstable.
type WriterOptions struct {
	// BlockRestartInterval is the number of keys between restart points
	// for delta encoding of keys.
	//
	// The default value is 16.
	BlockRestartInterval int

	// BlockSize is the target uncompressed size in bytes of each table block.
	//
	// The default value is 4096.
	BlockSize int

	// BlockSizeThreshold finishes a block if the block size is larger than the
	// specified percentage of the target block size and adding the next entry
	// would cause the block to be larger than the target block size.
	//
	// The default value is 90.
	BlockSizeThreshold int

	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default is a nil cache.
	Cache *cache.Cache

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// Compression defines the per-block compression to use.
	//
	// The default value (DefaultCompression) uses snappy compression.
	Compression Compression

	// FilterPolicy defines a filter algorithm (such as a Bloom filter) that can
	// reduce disk reads for Get calls.
	//
	// One such implementation is bloom.FilterPolicy(10) from the pebble/bloom
	// package.
	//
	// The default value means to use no filter.
	FilterPolicy FilterPolicy

	// FilterType defines whether an existing filter policy is applied at a
	// block-level or table-level. Block-level filters use less memory to create,
	// but are slower to access as a check for the key in the index must first be
	// performed to locate the filter block. A table-level filter will require
	// memory proportional to the number of keys in an sstable to create, but
	// avoids the index lookup when determining if a key is present. Table-level
	// filters should be preferred except under constrained memory situations.
	FilterType FilterType

	// IndexBlockSize is the target uncompressed size in bytes of each index
	// block. When the index block size is larger than this target, two-level
	// indexes are automatically enabled. Setting this option to a large value
	// (such as math.MaxInt32) disables the automatic creation of two-level
	// indexes.
	//
	// The default value is the value of BlockSize.
	IndexBlockSize int

	// Merger defines the associative merge operation to use for merging values
	// written with {Batch,DB}.Merge. The MergerName is checked for consistency
	// with the value stored in the sstable when it was written.
	MergerName string

	// TableFormat specifies the format version for writing sstables. The default
	// is TableFormatRocksDBv2 which creates RocksDB-compatible sstables. Use
	// TableFormatLevelDB to create LevelDB-compatible sstables which can be used
	// by a wider range of tools and libraries.
	TableFormat TableFormat

	// IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment
	// in format.go. Must be false if format < TableFormatPebblev4.
	//
	// TODO(bilal): set this when writing shared ssts.
	IsStrictObsolete bool

	// WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is
	// used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the
	// youngest for a userkey.
	WritingToLowestLevel bool

	// TablePropertyCollectors is a list of TablePropertyCollector creation
	// functions. A new TablePropertyCollector is created for each sstable built
	// and lives for the lifetime of the table.
	TablePropertyCollectors []func() TablePropertyCollector

	// BlockPropertyCollectors is a list of BlockPropertyCollector creation
	// functions. A new BlockPropertyCollector is created for each sstable
	// built and lives for the lifetime of writing that table.
	BlockPropertyCollectors []func() BlockPropertyCollector

	// Checksum specifies which checksum to use.
	Checksum ChecksumType

	// Parallelism indicates that the sstable Writer is allowed to compress
	// data blocks and write data blocks to disk in parallel with the Writer
	// client goroutine.
	Parallelism bool

	// ShortAttributeExtractor mirrors
	// Options.Experimental.ShortAttributeExtractor.
	ShortAttributeExtractor base.ShortAttributeExtractor

	// RequiredInPlaceValueBound mirrors
	// Options.Experimental.RequiredInPlaceValueBound.
	RequiredInPlaceValueBound UserKeyPrefixBound
}
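// A hypothetical construction sketch: callers typically set only the fields
// they care about and rely on ensureDefaults (below) for the rest. The
// specific values here are illustrative, not recommendations, and the bloom
// package (github.com/cockroachdb/pebble/bloom) is assumed to be imported:
//
//	opts := WriterOptions{
//		BlockSize:    32 << 10,               // 32 KiB blocks instead of the 4096-byte default
//		Compression:  ZstdCompression,        // override the snappy default
//		FilterPolicy: bloom.FilterPolicy(10), // per the FilterPolicy doc comment above
//		TableFormat:  TableFormatPebblev3,
//	}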
func (o WriterOptions) ensureDefaults() WriterOptions {
	if o.BlockRestartInterval <= 0 {
		o.BlockRestartInterval = base.DefaultBlockRestartInterval
	}
	if o.BlockSize <= 0 {
		o.BlockSize = base.DefaultBlockSize
	}
	if o.BlockSizeThreshold <= 0 {
		o.BlockSizeThreshold = base.DefaultBlockSizeThreshold
	}
	if o.Comparer == nil {
		o.Comparer = base.DefaultComparer
	}
	if o.Compression <= DefaultCompression || o.Compression >= NCompression {
		o.Compression = SnappyCompression
	}
	if o.IndexBlockSize <= 0 {
		o.IndexBlockSize = o.BlockSize
	}
	if o.MergerName == "" {
		o.MergerName = base.DefaultMerger.Name
	}
	if o.Checksum == ChecksumTypeNone {
		o.Checksum = ChecksumTypeCRC32c
	}
	// By default, if the table format is not specified, fall back to using the
	// most compatible format.
	if o.TableFormat == TableFormatUnspecified {
		o.TableFormat = TableFormatRocksDBv2
	}
	return o
}
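// For illustration, within this package (ensureDefaults is unexported),
// applying it to a zero-value WriterOptions resolves every default named in
// the field comments above:
//
//	o := WriterOptions{}.ensureDefaults()
//	// o.Compression == SnappyCompression
//	// o.Checksum == ChecksumTypeCRC32c
//	// o.TableFormat == TableFormatRocksDBv2
//	// o.IndexBlockSize == o.BlockSize == base.DefaultBlockSize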