github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/options.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"fmt"
	"io"
	"runtime"
	"strconv"
	"strings"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/cache"
	"github.com/cockroachdb/pebble/internal/humanize"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/rangekey"
	"github.com/cockroachdb/pebble/sstable"
	"github.com/cockroachdb/pebble/vfs"
)

const (
	cacheDefaultSize       = 8 << 20 // 8 MB
	defaultLevelMultiplier = 10
)

// Compression exports the base.Compression type.
type Compression = sstable.Compression

// Exported Compression constants.
const (
	DefaultCompression = sstable.DefaultCompression
	NoCompression      = sstable.NoCompression
	SnappyCompression  = sstable.SnappyCompression
	ZstdCompression    = sstable.ZstdCompression
)

// FilterType exports the base.FilterType type.
type FilterType = base.FilterType

// Exported TableFilter constants.
const (
	TableFilter = base.TableFilter
)

// FilterWriter exports the base.FilterWriter type.
type FilterWriter = base.FilterWriter

// FilterPolicy exports the base.FilterPolicy type.
type FilterPolicy = base.FilterPolicy

// TablePropertyCollector exports the sstable.TablePropertyCollector type.
type TablePropertyCollector = sstable.TablePropertyCollector

// BlockPropertyCollector exports the sstable.BlockPropertyCollector type.
type BlockPropertyCollector = sstable.BlockPropertyCollector

// BlockPropertyFilter exports the sstable.BlockPropertyFilter type.
type BlockPropertyFilter = base.BlockPropertyFilter

// ShortAttributeExtractor exports the base.ShortAttributeExtractor type.
type ShortAttributeExtractor = base.ShortAttributeExtractor

// UserKeyPrefixBound exports the sstable.UserKeyPrefixBound type.
type UserKeyPrefixBound = sstable.UserKeyPrefixBound

// IterKeyType configures which types of keys an iterator should surface.
type IterKeyType int8

const (
	// IterKeyTypePointsOnly configures an iterator to iterate over point keys
	// only.
	IterKeyTypePointsOnly IterKeyType = iota
	// IterKeyTypeRangesOnly configures an iterator to iterate over range keys
	// only.
	IterKeyTypeRangesOnly
	// IterKeyTypePointsAndRanges configures an iterator to iterate over both
	// point keys and range keys simultaneously.
	IterKeyTypePointsAndRanges
)

// String implements fmt.Stringer.
func (t IterKeyType) String() string {
	switch t {
	case IterKeyTypePointsOnly:
		return "points-only"
	case IterKeyTypeRangesOnly:
		return "ranges-only"
	case IterKeyTypePointsAndRanges:
		return "points-and-ranges"
	default:
		panic(fmt.Sprintf("unknown key type %d", t))
	}
}

// IterOptions hold the optional per-query parameters for NewIter.
//
// Like Options, a nil *IterOptions is valid and means to use the default
// values.
type IterOptions struct {
	// LowerBound specifies the smallest key (inclusive) that the iterator will
	// return during iteration. If the iterator is seeked or iterated past this
	// boundary the iterator will return Valid()==false. Setting LowerBound
	// effectively truncates the key space visible to the iterator.
	LowerBound []byte
	// UpperBound specifies the largest key (exclusive) that the iterator will
	// return during iteration. If the iterator is seeked or iterated past this
	// boundary the iterator will return Valid()==false. Setting UpperBound
	// effectively truncates the key space visible to the iterator.
	UpperBound []byte
	// TableFilter can be used to filter the tables that are scanned during
	// iteration based on the user properties. Return true to scan the table and
	// false to skip scanning. This function must be thread-safe since the same
	// function can be used by multiple iterators, if the iterator is cloned.
	TableFilter func(userProps map[string]string) bool
	// SkipPoint may be used to skip over point keys that don't match an
	// arbitrary predicate during iteration. If set, the Iterator invokes
	// SkipPoint for keys encountered. If SkipPoint returns true, the iterator
	// will skip the key without yielding it to the iterator operation in
	// progress.
	//
	// SkipPoint must be a pure function and always return the same result when
	// provided the same arguments. The iterator may call SkipPoint multiple
	// times for the same user key.
	SkipPoint func(userKey []byte) bool
	// PointKeyFilters can be used to avoid scanning tables and blocks in tables
	// when iterating over point keys. This slice represents an intersection
	// across all filters, i.e., all filters must indicate that the block is
	// relevant.
	//
	// Performance note: When len(PointKeyFilters) > 0, the caller should ensure
	// that cap(PointKeyFilters) is at least len(PointKeyFilters)+1. This helps
	// avoid allocations in Pebble internal code that mutates the slice.
	PointKeyFilters []BlockPropertyFilter
	// RangeKeyFilters can be used to avoid scanning tables and blocks in tables
	// when iterating over range keys. The same requirements that apply to
	// PointKeyFilters apply here too.
	RangeKeyFilters []BlockPropertyFilter
	// KeyTypes configures which types of keys to iterate over: point keys,
	// range keys, or both.
	KeyTypes IterKeyType
	// RangeKeyMasking can be used to enable automatic masking of point keys by
	// range keys. Range key masking is only supported during combined range key
	// and point key iteration mode (IterKeyTypePointsAndRanges).
	RangeKeyMasking RangeKeyMasking

	// OnlyReadGuaranteedDurable is an advanced option that is only supported by
	// the Reader implemented by DB. When set to true, only the guaranteed to be
	// durable state is visible in the iterator.
	// - This definition is made under the assumption that the FS implementation
	//   is providing a durability guarantee when data is synced.
	// - The visible state represents a consistent point in the history of the
	//   DB.
	// - The implementation is free to choose a conservative definition of what
	//   is guaranteed durable. For simplicity, the current implementation
	//   ignores memtables. A more sophisticated implementation could track the
	//   highest seqnum that is synced to the WAL and published and use that as
	//   the visible seqnum for an iterator. Note that the latter approach is
	//   not strictly better than the former since we can have DBs that are (a)
	//   synced more rarely than memtable flushes, (b) have no WAL. (a) is
	//   likely to be true in a future CockroachDB context where the DB
	//   containing the state machine may be rarely synced.
	// NB: this current implementation relies on the fact that memtables are
	// flushed in seqnum order, and any ingested sstables that happen to have a
	// lower seqnum than a non-flushed memtable don't have any overlapping keys.
	// This is the fundamental level invariant used in other code too, like when
	// merging iterators.
	//
	// Semantically, using this option provides the caller a "snapshot" as of
	// the time the most recent memtable was flushed. An alternate interface
	// would be to add a NewSnapshot variant. Creating a snapshot is heavier
	// weight than creating an iterator, so we have opted to support this
	// iterator option.
	OnlyReadGuaranteedDurable bool
	// UseL6Filters allows the caller to opt into reading filter blocks for L6
	// sstables. Helpful if a lot of SeekPrefixGEs are expected in quick
	// succession that are also unlikely to yield a single key. Filter blocks in
	// L6 can be relatively large, often larger than data blocks, so the benefit
	// of loading them in the cache is minimized if the probability of the key
	// existing is not low or if we just expect a one-time Seek (where loading
	// the data block directly is better).
	UseL6Filters bool

	// Internal options.

	logger Logger
	// Level corresponding to this file. Only passed in if constructed by a
	// levelIter.
	level manifest.Level
	// disableLazyCombinedIteration is an internal testing option.
	disableLazyCombinedIteration bool
	// snapshotForHideObsoletePoints is specified for/by levelIter when opening
	// files and is used to decide whether to hide obsolete points. A value of 0
	// implies obsolete points should not be hidden.
	snapshotForHideObsoletePoints uint64

	// NB: If adding new Options, you must account for them in iterator
	// construction and Iterator.SetOptions.
}

// GetLowerBound returns the LowerBound or nil if the receiver is nil.
func (o *IterOptions) GetLowerBound() []byte {
	if o == nil {
		return nil
	}
	return o.LowerBound
}

// GetUpperBound returns the UpperBound or nil if the receiver is nil.
func (o *IterOptions) GetUpperBound() []byte {
	if o == nil {
		return nil
	}
	return o.UpperBound
}

func (o *IterOptions) pointKeys() bool {
	if o == nil {
		return true
	}
	return o.KeyTypes == IterKeyTypePointsOnly || o.KeyTypes == IterKeyTypePointsAndRanges
}

func (o *IterOptions) rangeKeys() bool {
	if o == nil {
		return false
	}
	return o.KeyTypes == IterKeyTypeRangesOnly || o.KeyTypes == IterKeyTypePointsAndRanges
}

func (o *IterOptions) getLogger() Logger {
	if o == nil || o.logger == nil {
		return DefaultLogger
	}
	return o.logger
}

// SpanIterOptions creates a SpanIterOptions from this IterOptions.
func (o *IterOptions) SpanIterOptions() keyspan.SpanIterOptions {
	if o == nil {
		return keyspan.SpanIterOptions{}
	}
	return keyspan.SpanIterOptions{
		RangeKeyFilters: o.RangeKeyFilters,
	}
}
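
// exampleBoundedIterOptions is an illustrative sketch, not part of the Pebble
// API: it shows one way the IterOptions fields above are commonly combined
// for a bounded scan over both point and range keys with a caller-supplied
// SkipPoint predicate. The bounds and the trailing-marker predicate are
// assumptions chosen purely for demonstration.
func exampleBoundedIterOptions(lower, upper []byte) *IterOptions {
	return &IterOptions{
		LowerBound: lower,
		UpperBound: upper,
		KeyTypes:   IterKeyTypePointsAndRanges,
		// SkipPoint must be a pure function of the user key; here we skip
		// keys carrying a hypothetical trailing marker byte.
		SkipPoint: func(userKey []byte) bool {
			return len(userKey) > 0 && userKey[len(userKey)-1] == '!'
		},
	}
}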

// scanInternalOptions is similar to IterOptions, meant for use with
// scanInternalIterator.
type scanInternalOptions struct {
	IterOptions

	visitPointKey   func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error
	visitRangeDel   func(start, end []byte, seqNum uint64) error
	visitRangeKey   func(start, end []byte, keys []rangekey.Key) error
	visitSharedFile func(sst *SharedSSTMeta) error

	// skipSharedLevels skips levels that are shareable (level >=
	// sharedLevelStart).
	skipSharedLevels bool

	// includeObsoleteKeys specifies whether keys shadowed by newer internal keys
	// are exposed. If false, only one internal key per user key is exposed.
	includeObsoleteKeys bool

	// rateLimitFunc is used to limit the amount of bytes read per second.
	rateLimitFunc func(key *InternalKey, value LazyValue) error
}

// RangeKeyMasking configures automatic hiding of point keys by range keys. A
// non-nil Suffix enables range-key masking. When enabled, range keys with
// suffixes ≥ Suffix behave as masks. All point keys that are contained within a
// masking range key's bounds and have suffixes greater than the range key's
// suffix are automatically skipped.
//
// Specifically, when configured with a RangeKeyMasking.Suffix _s_, and there
// exists a range key with suffix _r_ covering a point key with suffix _p_, and
//
//	_s_ ≤ _r_ < _p_
//
// then the point key is elided.
//
// Range-key masking may only be used when iterating over both point keys and
// range keys with IterKeyTypePointsAndRanges.
type RangeKeyMasking struct {
	// Suffix configures which range keys may mask point keys. Only range keys
	// that are defined at suffixes greater than or equal to Suffix will mask
	// point keys.
	Suffix []byte
	// Filter is an optional field that may be used to improve performance of
	// range-key masking through a block-property filter defined over key
	// suffixes. If non-nil, Filter is called by Pebble to construct a
	// block-property filter mask at iterator creation. The filter is used to
	// skip whole point-key blocks containing point keys with suffixes greater
	// than a covering range-key's suffix.
	//
	// To use this functionality, the caller must create and configure (through
	// Options.BlockPropertyCollectors) a block-property collector that records
	// the maximum suffix contained within a block. The caller then must write
	// and provide a BlockPropertyFilterMask implementation on that same
	// property. See the BlockPropertyFilterMask type for more information.
	Filter func() BlockPropertyFilterMask
}

// BlockPropertyFilterMask extends the BlockPropertyFilter interface for use
// with range-key masking. Unlike an ordinary block property filter, a
// BlockPropertyFilterMask's filtering criteria is allowed to change when Pebble
// invokes its SetSuffix method.
//
// When a Pebble iterator steps into a range key's bounds and the range key has
// a suffix greater than or equal to RangeKeyMasking.Suffix, the range key acts
// as a mask. The masking range key hides all point keys that fall within the
// range key's bounds and have suffixes > the range key's suffix. Without a
// filter mask configured, Pebble performs this hiding by stepping through point
// keys and comparing suffixes. If large numbers of point keys are masked, this
// requires Pebble to load, iterate through and discard a large number of
// sstable blocks containing masked point keys.
//
// If a block-property collector and a filter mask are configured, Pebble may
// skip loading some point-key blocks altogether. If a block's keys are known to
// all fall within the bounds of the masking range key and the block was
// annotated by a block-property collector with the maximal suffix, Pebble can
// ask the filter mask to compare the property to the current masking range
// key's suffix. If the mask reports no intersection, the block may be skipped.
//
// If unsuffixed and suffixed keys are written to the database, care must be
// taken to avoid unintentionally masking un-suffixed keys located in the same
// block as suffixed keys. One solution is to interpret unsuffixed keys as
// containing the maximal suffix value, ensuring that blocks containing
// unsuffixed keys are always loaded.
type BlockPropertyFilterMask interface {
	BlockPropertyFilter

	// SetSuffix configures the mask with the suffix of a range key. The filter
	// should return false from Intersects whenever it's provided with a
	// property encoding a block's minimum suffix that's greater (according to
	// Compare) than the provided suffix.
	SetSuffix(suffix []byte) error
}
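
// exampleMaskedIterOptions is an illustrative sketch, not part of the Pebble
// API, of enabling range-key masking as documented above: range keys with
// suffixes ≥ maskSuffix act as masks and hide covered point keys whose
// suffixes are greater than the masking range key's suffix. The optional
// Filter field is omitted; supplying it requires a matching block-property
// collector registered via Options.BlockPropertyCollectors.
func exampleMaskedIterOptions(maskSuffix []byte) *IterOptions {
	return &IterOptions{
		// Masking is only supported in combined point-and-range iteration.
		KeyTypes: IterKeyTypePointsAndRanges,
		RangeKeyMasking: RangeKeyMasking{
			Suffix: maskSuffix,
		},
	}
}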

// WriteOptions hold the optional per-query parameters for Set and Delete
// operations.
//
// Like Options, a nil *WriteOptions is valid and means to use the default
// values.
type WriteOptions struct {
	// Sync is whether to sync writes through the OS buffer cache and down onto
	// the actual disk, if applicable. Setting Sync is required for durability of
	// individual write operations but can result in slower writes.
	//
	// If false, and the process or machine crashes, then a recent write may be
	// lost. This is due to the recently written data being buffered inside the
	// process running Pebble. This differs from the semantics of a write system
	// call in which the data is buffered in the OS buffer cache and would thus
	// survive a process crash.
	//
	// The default value is true.
	Sync bool
}

// Sync specifies the default write options for writes which synchronize to
// disk.
var Sync = &WriteOptions{Sync: true}

// NoSync specifies the default write options for writes which do not
// synchronize to disk.
var NoSync = &WriteOptions{Sync: false}

// GetSync returns the Sync value or true if the receiver is nil.
func (o *WriteOptions) GetSync() bool {
	return o == nil || o.Sync
}
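
// exampleWrites is an illustrative sketch, not part of the Pebble API, showing
// how the package-level Sync and NoSync values above are passed to write
// operations. The choice of a Set followed by a Delete is an assumption made
// purely for demonstration.
func exampleWrites(db *DB, key, value []byte) error {
	// Durable write: not acknowledged until the WAL has been synced.
	if err := db.Set(key, value, Sync); err != nil {
		return err
	}
	// Faster write: may be lost if the process or machine crashes before the
	// buffered WAL data reaches disk.
	return db.Delete(key, NoSync)
}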

// LevelOptions holds the optional per-level parameters.
type LevelOptions struct {
	// BlockRestartInterval is the number of keys between restart points
	// for delta encoding of keys.
	//
	// The default value is 16.
	BlockRestartInterval int

	// BlockSize is the target uncompressed size in bytes of each table block.
	//
	// The default value is 4096.
	BlockSize int

	// BlockSizeThreshold finishes a block if the block size is larger than the
	// specified percentage of the target block size and adding the next entry
	// would cause the block to be larger than the target block size.
	//
	// The default value is 90.
	BlockSizeThreshold int

	// Compression defines the per-block compression to use.
	//
	// The default value (DefaultCompression) uses snappy compression.
	Compression Compression

	// FilterPolicy defines a filter algorithm (such as a Bloom filter) that can
	// reduce disk reads for Get calls.
	//
	// One such implementation is bloom.FilterPolicy(10) from the pebble/bloom
	// package.
	//
	// The default value means to use no filter.
	FilterPolicy FilterPolicy

	// FilterType defines whether an existing filter policy is applied at a
	// block-level or table-level. Block-level filters use less memory to create,
	// but are slower to access as a check for the key in the index must first be
	// performed to locate the filter block. A table-level filter will require
	// memory proportional to the number of keys in an sstable to create, but
	// avoids the index lookup when determining if a key is present. Table-level
	// filters should be preferred except under constrained memory situations.
	FilterType FilterType

	// IndexBlockSize is the target uncompressed size in bytes of each index
	// block. When the index block size is larger than this target, two-level
	// indexes are automatically enabled. Setting this option to a large value
	// (such as math.MaxInt32) disables the automatic creation of two-level
	// indexes.
	//
	// The default value is the value of BlockSize.
	IndexBlockSize int

	// The target file size for the level.
	TargetFileSize int64
}

// EnsureDefaults ensures that the default values for all of the options have
// been initialized. It is valid to call EnsureDefaults on a nil receiver. A
// non-nil result will always be returned.
func (o *LevelOptions) EnsureDefaults() *LevelOptions {
	if o == nil {
		o = &LevelOptions{}
	}
	if o.BlockRestartInterval <= 0 {
		o.BlockRestartInterval = base.DefaultBlockRestartInterval
	}
	if o.BlockSize <= 0 {
		o.BlockSize = base.DefaultBlockSize
	} else if o.BlockSize > sstable.MaximumBlockSize {
		panic(errors.Errorf("BlockSize %d exceeds MaximumBlockSize", o.BlockSize))
	}
	if o.BlockSizeThreshold <= 0 {
		o.BlockSizeThreshold = base.DefaultBlockSizeThreshold
	}
	if o.Compression <= DefaultCompression || o.Compression >= sstable.NCompression {
		o.Compression = SnappyCompression
	}
	if o.IndexBlockSize <= 0 {
		o.IndexBlockSize = o.BlockSize
	}
	if o.TargetFileSize <= 0 {
		o.TargetFileSize = 2 << 20 // 2 MB
	}
	return o
}
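
// exampleLevelOptions is an illustrative sketch, not part of the Pebble API,
// of a LevelOptions literal that overrides the compression algorithm while
// relying on EnsureDefaults above to fill in every zero-valued field. The
// choice of Zstandard compression is an assumption for demonstration.
func exampleLevelOptions() LevelOptions {
	l := LevelOptions{
		Compression:    ZstdCompression,
		TargetFileSize: 2 << 20, // 2 MB, the documented default
	}
	// BlockSize, BlockRestartInterval, IndexBlockSize, etc. pick up defaults.
	l.EnsureDefaults()
	return l
}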

// Options holds the optional parameters for configuring pebble. These options
// apply to the DB at large; per-query options are defined by the IterOptions
// and WriteOptions types.
type Options struct {
	// Sync sstables periodically in order to smooth out writes to disk. This
	// option does not provide any persistence guarantee, but is used to avoid
	// latency spikes if the OS automatically decides to write out a large chunk
	// of dirty filesystem buffers. This option only controls SSTable syncs; WAL
	// syncs are controlled by WALBytesPerSync.
	//
	// The default value is 512KB.
	BytesPerSync int

	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default cache size is 8 MB.
	Cache *cache.Cache

	// Cleaner cleans obsolete files.
	//
	// The default cleaner uses the DeleteCleaner.
	Cleaner Cleaner

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// DebugCheck is invoked, if non-nil, whenever a new version is being
	// installed. Typically, this is set to pebble.DebugCheckLevels in tests
	// or tools only, to check invariants over all the data in the database.
	DebugCheck func(*DB) error

	// Disable the write-ahead log (WAL). Disabling the write-ahead log prohibits
	// crash recovery, but can improve performance if crash recovery is not
	// needed (e.g. when only temporary state is being stored in the database).
	//
	// TODO(peter): untested
	DisableWAL bool

	// ErrorIfExists causes an error on Open if the database already exists.
	// The error can be checked with errors.Is(err, ErrDBAlreadyExists).
	//
	// The default value is false.
	ErrorIfExists bool

	// ErrorIfNotExists causes an error on Open if the database does not already
	// exist. The error can be checked with errors.Is(err, ErrDBDoesNotExist).
	//
	// The default value is false which will cause a database to be created if it
	// does not already exist.
	ErrorIfNotExists bool

	// ErrorIfNotPristine causes an error on Open if the database already exists
	// and any operations have been performed on the database. The error can be
	// checked with errors.Is(err, ErrDBNotPristine).
	//
	// Note that a database that contained keys that were all subsequently deleted
	// may or may not trigger the error. Currently, we check if there are any live
	// SSTs or log records to replay.
	ErrorIfNotPristine bool

	// EventListener provides hooks for listening to significant DB events such as
	// flushes, compactions, and table deletion.
	EventListener *EventListener

	// Experimental contains experimental options which are off by default.
	// These options are temporary and will eventually either be deleted, moved
	// out of the experimental group, or made the non-adjustable default. These
	// options may change at any time, so do not rely on them.
	Experimental struct {
		// The threshold of L0 read-amplification at which compaction concurrency
		// is enabled (if CompactionDebtConcurrency was not already exceeded).
		// Every multiple of this value enables another concurrent
		// compaction up to MaxConcurrentCompactions.
		L0CompactionConcurrency int

		// CompactionDebtConcurrency controls the threshold of compaction debt
		// at which additional compaction concurrency slots are added. For every
		// multiple of this value in compaction debt bytes, an additional
		// concurrent compaction is added. This works "on top" of
		// L0CompactionConcurrency, so the higher of the count of compaction
		// concurrency slots as determined by the two options is chosen.
		CompactionDebtConcurrency uint64

		// IngestSplit, if it returns true, allows for ingest-time splitting of
		// existing sstables into two virtual sstables to allow ingestion sstables to
		// slot into a lower level than they otherwise would have.
		IngestSplit func() bool

		// ReadCompactionRate controls the frequency of read triggered
		// compactions by adjusting `AllowedSeeks` in manifest.FileMetadata:
		//
		// AllowedSeeks = FileSize / ReadCompactionRate
		//
		// From LevelDB:
		// ```
		// We arrange to automatically compact this file after
		// a certain number of seeks. Let's assume:
		//   (1) One seek costs 10ms
		//   (2) Writing or reading 1MB costs 10ms (100MB/s)
		//   (3) A compaction of 1MB does 25MB of IO:
		//         1MB read from this level
		//         10-12MB read from next level (boundaries may be misaligned)
		//         10-12MB written to next level
		// This implies that 25 seeks cost the same as the compaction
		// of 1MB of data. I.e., one seek costs approximately the
		// same as the compaction of 40KB of data. We are a little
		// conservative and allow approximately one seek for every 16KB
		// of data before triggering a compaction.
		// ```
		ReadCompactionRate int64

		// ReadSamplingMultiplier is a multiplier for the readSamplingPeriod in
		// iterator.maybeSampleRead() to control the frequency of read sampling
		// to trigger a read triggered compaction. A value of -1 prevents sampling
		// and disables read triggered compactions. The default is 1 << 4, which
		// gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB).
		ReadSamplingMultiplier int64

		// TableCacheShards is the number of shards per table cache.
		// Reducing the value can reduce the number of idle goroutines per DB
		// instance which can be useful in scenarios with a lot of DB instances
		// and a large number of CPUs, but doing so can lead to higher contention
		// in the table cache and reduced performance.
		//
		// The default value is the number of logical CPUs, which can be
		// limited by runtime.GOMAXPROCS.
		TableCacheShards int

		// KeyValidationFunc is a function to validate a user key in an SSTable.
		//
		// Currently, this function is used to validate the smallest and largest
		// keys in an SSTable undergoing compaction. In this case, returning an
		// error from the validation function will result in a panic at runtime,
		// given that there is rarely any way of recovering from malformed keys
		// present in compacted files. By default, validation is not performed.
		//
		// Additional use-cases may be added in the future.
		//
		// NOTE: callers should take care to not mutate the key being validated.
		KeyValidationFunc func(userKey []byte) error

		// ValidateOnIngest schedules validation of sstables after they have
		// been ingested.
		//
		// By default, this value is false.
		ValidateOnIngest bool

		// LevelMultiplier configures the size multiplier used to determine the
		// desired size of each level of the LSM. Defaults to 10.
		LevelMultiplier int

		// MultiLevelCompactionHeuristic determines whether to add an additional
		// level to a conventional two level compaction. If nil, a multilevel
		// compaction will never get triggered.
		MultiLevelCompactionHeuristic MultiLevelHeuristic

		// MaxWriterConcurrency is used to indicate the maximum number of
		// compression workers the compression queue is allowed to use. If
		// MaxWriterConcurrency > 0, then the Writer will use parallelism, to
		// compress and write blocks to disk. Otherwise, the writer will
		// compress and write blocks to disk synchronously.
		MaxWriterConcurrency int

		// ForceWriterParallelism is used to force parallelism in the sstable
		// Writer for the metamorphic tests. Even with the MaxWriterConcurrency
		// option set, we only enable parallelism in the sstable Writer if there
		// is enough CPU available, and this option bypasses that.
		ForceWriterParallelism bool

		// CPUWorkPermissionGranter should be set if Pebble should be given the
		// ability to optionally schedule additional CPU. See the documentation
		// for CPUWorkPermissionGranter for more details.
		CPUWorkPermissionGranter CPUWorkPermissionGranter

		// EnableValueBlocks is used to decide whether to enable writing
		// TableFormatPebblev3 sstables. This setting is only respected by a
		// specific subset of format major versions: FormatSSTableValueBlocks,
		// FormatFlushableIngest and FormatPrePebblev1MarkedCompacted. In lower
		// format major versions, value blocks are never enabled. In higher
		// format major versions, value blocks are always enabled.
		EnableValueBlocks func() bool

		// ShortAttributeExtractor is used iff EnableValueBlocks() returns true
		// (else ignored). If non-nil, a ShortAttribute can be extracted from the
		// value and stored with the key, when the value is stored elsewhere.
		ShortAttributeExtractor ShortAttributeExtractor

		// RequiredInPlaceValueBound specifies an optional span of user key
		// prefixes that are not-MVCC, but have a suffix. For these the values
		// must be stored with the key, since the concept of "older versions" is
		// not defined. It is also useful for statically known exclusions to value
		// separation. In CockroachDB, this will be used for the lock table key
		// space that has non-empty suffixes, but those locks don't represent
		// actual MVCC versions (the suffix ordering is arbitrary). We will also
		// need to add support for dynamically configured exclusions (we want the
		// default to be to allow Pebble to decide whether to separate the value
		// or not, hence this is structured as exclusions), for example, for users
		// of CockroachDB to dynamically exclude certain tables.
		//
		// Any change in exclusion behavior takes effect only on future written
		// sstables, and does not start rewriting existing sstables.
		//
		// Even ignoring changes in this setting, exclusions are interpreted as a
		// guidance by Pebble, and not necessarily honored. Specifically, user
		// keys with multiple Pebble-versions *may* have the older versions stored
		// in value blocks.
		RequiredInPlaceValueBound UserKeyPrefixBound

		// DisableIngestAsFlushable disables lazy ingestion of sstables through
		// a WAL write and memtable rotation. Only effectual if the format
		// major version is at least `FormatFlushableIngest`.
		DisableIngestAsFlushable func() bool

		// RemoteStorage enables use of remote storage (e.g. S3) for storing
		// sstables. Setting this option enables use of CreateOnShared option and
		// allows ingestion of external files.
		RemoteStorage remote.StorageFactory

		// If CreateOnShared is non-zero, new sstables are created on remote storage
		// (using CreateOnSharedLocator and with the appropriate
		// CreateOnSharedStrategy). These sstables can be shared between different
		// Pebble instances; the lifecycle of such objects is managed by the
		// remote.Storage constructed by options.RemoteStorage.
		//
		// Can only be used when RemoteStorage is set (and recognizes
		// CreateOnSharedLocator).
		CreateOnShared        remote.CreateOnSharedStrategy
		CreateOnSharedLocator remote.Locator

		// SecondaryCacheSizeBytes is the size of the on-disk block cache for
		// objects on shared storage in bytes. If it is 0, no cache is used.
		SecondaryCacheSizeBytes int64

		// IneffectualSingleDeleteCallback is called in compactions/flushes if any
		// single delete is being elided without deleting a point set/merge.
		IneffectualSingleDeleteCallback func(userKey []byte)

		// SingleDeleteInvariantViolationCallback is called in compactions/flushes if any
		// single delete has consumed a Set/Merge, and there is another immediately older
		// Set/SetWithDelete/Merge. The user of Pebble has violated the invariant under
		// which SingleDelete can be used correctly.
		//
		// Consider the sequence SingleDelete#3, Set#2, Set#1. There are three
		// ways some of these keys can first meet in a compaction.
		//
		// - All 3 keys in the same compaction: this callback will detect the
		//   violation.
		//
		// - SingleDelete#3, Set#2 meet in a compaction first: Both keys will
		//   disappear. The violation will not be detected, and the DB will have
		//   Set#1 which is likely incorrect (from the user's perspective).
		//
		// - Set#2, Set#1 meet in a compaction first: The output will be Set#2,
		//   which will later be consumed by SingleDelete#3. The violation will
		//   not be detected and the DB will be correct.
		SingleDeleteInvariantViolationCallback func(userKey []byte)
	}

	// Filters is a map from filter policy name to filter policy. It is used for
	// debugging tools which may be used on multiple databases configured with
	// different filter policies. It is not necessary to populate this filters
	// map during normal usage of a DB.
	Filters map[string]FilterPolicy

	// FlushDelayDeleteRange configures how long the database should wait before
	// forcing a flush of a memtable that contains a range deletion. Disk space
	// cannot be reclaimed until the range deletion is flushed. No automatic
	// flush occurs if zero.
	FlushDelayDeleteRange time.Duration

	// FlushDelayRangeKey configures how long the database should wait before
	// forcing a flush of a memtable that contains a range key. Range keys in
	// the memtable prevent lazy combined iteration, so it's desirable to flush
	// range keys promptly. No automatic flush occurs if zero.
	FlushDelayRangeKey time.Duration

	// FlushSplitBytes denotes the target number of bytes per sublevel in
	// each flush split interval (i.e. range between two flush split keys)
	// in L0 sstables. When set to zero, only a single sstable is generated
	// by each flush. When set to a non-zero value, flushes are split at
	// points to meet L0's TargetFileSize, any grandparent-related overlap
	// options, and at boundary keys of L0 flush split intervals (which are
	// targeted to contain around FlushSplitBytes bytes in each sublevel
	// between pairs of boundary keys). Splitting sstables during flush
	// allows increased compaction flexibility and concurrency when those
	// tables are compacted to lower levels.
	FlushSplitBytes int64

	// FormatMajorVersion sets the format of on-disk files. It is
	// recommended to set the format major version to an explicit
	// version, as the default may change over time.
	//
	// At Open if the existing database is formatted using a later
	// format major version that is known to this version of Pebble,
	// Pebble will continue to use the later format major version. If
	// the existing database's version is unknown, the caller may use
	// FormatMostCompatible and will be able to open the database
	// regardless of its actual version.
	//
	// If the existing database is formatted using a format major
	// version earlier than the one specified, Open will automatically
	// ratchet the database to the specified format major version.
	FormatMajorVersion FormatMajorVersion

	// FS provides the interface for persistent file storage.
	//
	// The default value uses the underlying operating system's file system.
	FS vfs.FS

	// Lock, if set, must be a database lock acquired through LockDirectory for
	// the same directory passed to Open. If provided, Open will skip locking
	// the directory. Closing the database will not release the lock, and it's
	// the responsibility of the caller to release the lock after closing the
	// database.
	//
	// Open will enforce that the Lock passed locks the same directory passed to
	// Open. Concurrent calls to Open using the same Lock are detected and
	// prohibited.
	Lock *Lock

	// The count of L0 files necessary to trigger an L0 compaction.
	L0CompactionFileThreshold int

	// The amount of L0 read-amplification necessary to trigger an L0 compaction.
	L0CompactionThreshold int

	// Hard limit on L0 read-amplification, computed as the number of L0
	// sublevels. Writes are stopped when this threshold is reached.
	L0StopWritesThreshold int

	// The maximum number of bytes for LBase. The base level is the level which
	// L0 is compacted into. The base level is determined dynamically based on
	// the existing data in the LSM. The maximum number of bytes for other levels
	// is computed dynamically based on the base level's maximum size. When the
	// maximum number of bytes for a level is exceeded, compaction is requested.
	LBaseMaxBytes int64

	// Per-level options. Options for at least one level must be specified. The
	// options for the last level are used for all subsequent levels.
	Levels []LevelOptions

	// LoggerAndTracer will be used, if non-nil, else Logger will be used and
	// tracing will be a noop.

	// Logger used to write log messages.
	//
	// The default logger uses the Go standard library log package.
	Logger Logger
	// LoggerAndTracer is used for writing log messages and traces.
	LoggerAndTracer LoggerAndTracer

	// MaxManifestFileSize is the maximum size the MANIFEST file is allowed to
	// become. When the MANIFEST exceeds this size it is rolled over and a new
	// MANIFEST is created.
	MaxManifestFileSize int64

	// MaxOpenFiles is a soft limit on the number of open files that can be
	// used by the DB.
	//
	// The default value is 1000.
	MaxOpenFiles int

	// The size of a MemTable in steady state. The actual MemTable size starts at
	// min(256KB, MemTableSize) and doubles for each subsequent MemTable up to
	// MemTableSize. This reduces the memory pressure caused by MemTables for
	// short lived (test) DB instances. Note that more than one MemTable can be
	// in existence since flushing a MemTable involves creating a new one and
	// writing the contents of the old one in the
	// background. MemTableStopWritesThreshold places a hard limit on the size of
	// the queued MemTables.
	//
	// The default value is 4MB.
	MemTableSize uint64

	// Hard limit on the number of queued MemTables. Writes are stopped when
	// the sum of the queued memtable sizes exceeds:
	//   MemTableStopWritesThreshold * MemTableSize.
	//
	// This value should be at least 2 or writes will stop whenever a MemTable is
	// being flushed.
	//
	// The default value is 2.
	MemTableStopWritesThreshold int

	// Merger defines the associative merge operation to use for merging values
	// written with {Batch,DB}.Merge.
	//
	// The default merger concatenates values.
	Merger *Merger

	// MaxConcurrentCompactions specifies the maximum number of concurrent
	// compactions. The default is 1. Concurrent compactions are performed
	// - when L0 read-amplification passes the L0CompactionConcurrency threshold
	// - for automatic background compactions
	// - when a manual compaction for a level is split and parallelized
	// MaxConcurrentCompactions must be greater than 0.
	MaxConcurrentCompactions func() int

	// DisableAutomaticCompactions dictates whether automatic compactions are
	// scheduled or not. The default is false (enabled). This option is only used
	// externally when running a manual compaction, and internally for tests.
	DisableAutomaticCompactions bool

	// NoSyncOnClose decides whether the Pebble instance will enforce a
	// close-time synchronization (e.g., fdatasync() or sync_file_range())
	// on files it writes to. Setting this to true removes the guarantee for a
	// sync on close. Some implementations can still issue a non-blocking sync.
	NoSyncOnClose bool

	// NumPrevManifest is the number of non-current or older manifests which
	// we want to keep around for debugging purposes. By default, we're going
	// to keep one older manifest.
	NumPrevManifest int

	// ReadOnly indicates that the DB should be opened in read-only mode. Writes
	// to the DB will return an error, background compactions are disabled, and
	// the flush that normally occurs after replaying the WAL at startup is
	// disabled.
	ReadOnly bool

	// TableCache is an initialized TableCache which should be set as an
	// option if the DB needs to be initialized with a pre-existing table cache.
	// If TableCache is nil, then a table cache which is unique to the DB instance
	// is created. TableCache can be shared between db instances by setting it here.
	// The TableCache set here must use the same underlying cache as Options.Cache
	// and pebble will panic otherwise.
	TableCache *TableCache

	// TablePropertyCollectors is a list of TablePropertyCollector creation
	// functions. A new TablePropertyCollector is created for each sstable built
	// and lives for the lifetime of the table.
	TablePropertyCollectors []func() TablePropertyCollector

	// BlockPropertyCollectors is a list of BlockPropertyCollector creation
	// functions. A new BlockPropertyCollector is created for each sstable
	// built and lives for the lifetime of writing that table.
	BlockPropertyCollectors []func() BlockPropertyCollector

	// WALBytesPerSync sets the number of bytes to write to a WAL before calling
	// Sync on it in the background. Just like with BytesPerSync above, this
	// helps smooth out disk write latencies, and avoids cases where the OS
	// writes a lot of buffered data to disk at once. However, this is less
	// necessary with WALs, as many write operations already pass in
	// Sync = true.
	//
	// The default value is 0, i.e. no background syncing. This matches the
	// default behaviour in RocksDB.
	WALBytesPerSync int

	// WALDir specifies the directory to store write-ahead logs (WALs) in. If
	// empty (the default), WALs will be stored in the same directory as sstables
	// (i.e. the directory passed to pebble.Open).
	WALDir string

	// WALMinSyncInterval is the minimum duration between syncs of the WAL. If
	// WAL syncs are requested faster than this interval, they will be
	// artificially delayed. Introducing a small artificial delay (500us) between
	// WAL syncs can allow more operations to arrive and reduce IO operations
	// while having a minimal impact on throughput. This option is supplied as a
	// closure in order to allow the value to be changed dynamically. The default
	// value is 0.
	//
	// TODO(peter): rather than a closure, should there be another mechanism for
	// changing options dynamically?
	WALMinSyncInterval func() time.Duration

	// TargetByteDeletionRate is the rate (in bytes per second) to which sstable
	// file deletions are limited (under normal circumstances).
	//
	// Deletion pacing is used to slow down deletions when compactions finish up
	// or readers close and newly-obsolete files need cleaning up. Deleting lots
	// of files at once can cause disk latency to go up on some SSDs, which this
	// functionality guards against.
	//
	// This value is only a best-effort target; the effective rate can be
	// higher if deletions are falling behind or disk space is running low.
	//
	// Setting this to 0 disables deletion pacing, which is also the default.
	TargetByteDeletionRate int

	// private options are only used by internal tests or are used internally
	// for facilitating upgrade paths of unconfigurable functionality.
	private struct {
		// strictWALTail configures whether or not a database's WALs created
		// prior to the most recent one should be interpreted strictly,
		// requiring a clean EOF. RocksDB 6.2.1 and the version of Pebble
		// included in CockroachDB 20.1 do not guarantee that closed WALs end
		// cleanly. If this option is set within an OPTIONS file, Pebble
		// interprets previous WALs strictly, requiring a clean EOF.
		// Otherwise, it interprets them permissively in the same manner as
		// RocksDB 6.2.1.
		strictWALTail bool

		// disableDeleteOnlyCompactions prevents the scheduling of delete-only
		// compactions that drop sstables wholly covered by range tombstones or
		// range key tombstones.
		disableDeleteOnlyCompactions bool

		// disableElisionOnlyCompactions prevents the scheduling of elision-only
		// compactions that rewrite sstables in place in order to elide obsolete
		// keys.
		disableElisionOnlyCompactions bool

		// disableLazyCombinedIteration is a private option used by the
		// metamorphic tests to test equivalence between lazy-combined iteration
		// and constructing the range-key iterator upfront. It's a private
		// option to avoid littering the public interface with options that we
		// do not want to allow users to actually configure.
		disableLazyCombinedIteration bool

		// A private option to disable stats collection.
		disableTableStats bool

		// testingAlwaysWaitForCleanup is set by some tests to force waiting for
		// obsolete file deletion (to make events deterministic).
		testingAlwaysWaitForCleanup bool

		// fsCloser holds a closer that should be invoked after a DB using these
		// Options is closed. This is used to automatically stop the
		// long-running goroutine associated with the disk-health-checking FS.
		// See the initialization of FS in EnsureDefaults. Note that care has
		// been taken to ensure that it is still safe to continue using the FS
		// after this closer has been invoked. However, if write operations
		// against the FS are made after the DB is closed, the FS may leak a
		// goroutine indefinitely.
		fsCloser io.Closer
	}
}

// DebugCheckLevels calls CheckLevels on the provided database.
// It may be set in the DebugCheck field of Options to check
// level invariants whenever a new version is installed.
func DebugCheckLevels(db *DB) error {
	return db.CheckLevels(nil)
}

// EnsureDefaults ensures that the default values for all options are set if a
// valid value was not already specified. Returns the new options.
func (o *Options) EnsureDefaults() *Options {
	if o == nil {
		o = &Options{}
	}
	if o.BytesPerSync <= 0 {
		o.BytesPerSync = 512 << 10 // 512 KB
	}
	if o.Cleaner == nil {
		o.Cleaner = DeleteCleaner{}
	}
	if o.Comparer == nil {
		o.Comparer = DefaultComparer
	}
	if o.Experimental.DisableIngestAsFlushable == nil {
		o.Experimental.DisableIngestAsFlushable = func() bool { return false }
	}
	if o.Experimental.L0CompactionConcurrency <= 0 {
		o.Experimental.L0CompactionConcurrency = 10
	}
	if o.Experimental.CompactionDebtConcurrency <= 0 {
		o.Experimental.CompactionDebtConcurrency = 1 << 30 // 1 GB
	}
	if o.Experimental.KeyValidationFunc == nil {
		o.Experimental.KeyValidationFunc = func([]byte) error { return nil }
	}
	if o.L0CompactionThreshold <= 0 {
		o.L0CompactionThreshold = 4
	}
	if o.L0CompactionFileThreshold <= 0 {
		// Some justification for the default of 500:
		// Why not smaller?:
		// - The default target file size for L0 is 2MB, so 500 files is <= 1GB
		//   of data. At observed compaction speeds of > 20MB/s, L0 can be
		//   cleared of all files in < 1min, so this backlog is not huge.
		// - 500 files is low overhead for instantiating L0 sublevels from
		//   scratch.
		// - Lower values were observed to cause excessive and inefficient
		//   compactions out of L0 in a TPCC import benchmark.
		// Why not larger?:
		// - More than 1min to compact everything out of L0.
		// - CockroachDB's admission control system uses a threshold of 1000
		//   files to start throttling writes to Pebble. Using 500 here gives
		//   us headroom between when Pebble should start compacting L0 and
		//   when the admission control threshold is reached.
		//
		// We can revisit this default in the future based on better
		// experimental understanding.
		//
		// TODO(jackson): Experiment with slightly lower thresholds [or higher
		// admission control thresholds] to see whether a higher L0 score at the
		// threshold (currently 2.0) is necessary for some workloads to avoid
		// starving L0 in favor of lower-level compactions.
		o.L0CompactionFileThreshold = 500
	}
	if o.L0StopWritesThreshold <= 0 {
		o.L0StopWritesThreshold = 12
	}
	if o.LBaseMaxBytes <= 0 {
		o.LBaseMaxBytes = 64 << 20 // 64 MB
	}
	if o.Levels == nil {
		o.Levels = make([]LevelOptions, 1)
		for i := range o.Levels {
			if i > 0 {
				l := &o.Levels[i]
				if l.TargetFileSize <= 0 {
					l.TargetFileSize = o.Levels[i-1].TargetFileSize * 2
				}
			}
			o.Levels[i].EnsureDefaults()
		}
	} else {
		for i := range o.Levels {
			o.Levels[i].EnsureDefaults()
		}
	}
	if o.Logger == nil {
		o.Logger = DefaultLogger
	}
	if o.EventListener == nil {
		o.EventListener = &EventListener{}
	}
	o.EventListener.EnsureDefaults(o.Logger)
	if o.MaxManifestFileSize == 0 {
		o.MaxManifestFileSize = 128 << 20 // 128 MB
	}
	if o.MaxOpenFiles == 0 {
		o.MaxOpenFiles = 1000
	}
	if o.MemTableSize <= 0 {
		o.MemTableSize = 4 << 20 // 4 MB
	}
	if o.MemTableStopWritesThreshold <= 0 {
		o.MemTableStopWritesThreshold = 2
	}
	if o.Merger == nil {
		o.Merger = DefaultMerger
	}
	o.private.strictWALTail = true
	if o.MaxConcurrentCompactions == nil {
		o.MaxConcurrentCompactions = func() int { return 1 }
	}
	if o.NumPrevManifest <= 0 {
		o.NumPrevManifest = 1
	}

	if o.FormatMajorVersion == FormatDefault {
		o.FormatMajorVersion = FormatMostCompatible
	}

	if o.FS == nil {
		o.WithFSDefaults()
	}
	if o.FlushSplitBytes <= 0 {
		o.FlushSplitBytes = 2 * o.Levels[0].TargetFileSize
	}
	if o.Experimental.LevelMultiplier <= 0 {
		o.Experimental.LevelMultiplier = defaultLevelMultiplier
	}
	if o.Experimental.ReadCompactionRate == 0 {
		o.Experimental.ReadCompactionRate = 16000
	}
	if o.Experimental.ReadSamplingMultiplier == 0 {
		o.Experimental.ReadSamplingMultiplier = 1 << 4
	}
	if o.Experimental.TableCacheShards <= 0 {
		o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0)
	}
	if o.Experimental.CPUWorkPermissionGranter == nil {
		o.Experimental.CPUWorkPermissionGranter = defaultCPUWorkGranter{}
	}
	if o.Experimental.MultiLevelCompactionHeuristic == nil {
		o.Experimental.MultiLevelCompactionHeuristic = WriteAmpHeuristic{}
	}

	o.initMaps()
	return o
}
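
// exampleOptions is an illustrative sketch, not part of the Pebble API, of
// constructing Options before Open: only a few fields are set explicitly and
// EnsureDefaults above fills in the rest (cache size, level options, logger,
// and so on). The specific values are assumptions chosen for demonstration.
func exampleOptions() *Options {
	o := &Options{
		// Check LSM level invariants whenever a new version is installed;
		// intended for tests and tools.
		DebugCheck:               DebugCheckLevels,
		L0CompactionThreshold:    4,
		MemTableSize:             64 << 20, // 64 MB
		MaxConcurrentCompactions: func() int { return 2 },
		Levels:                   []LevelOptions{{Compression: SnappyCompression}},
	}
	return o.EnsureDefaults()
}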

// WithFSDefaults configures the Options to wrap the configured filesystem with
// the default virtual file system middleware, like disk-health checking.
func (o *Options) WithFSDefaults() *Options {
	if o.FS == nil {
		o.FS = vfs.Default
	}
	o.FS, o.private.fsCloser = vfs.WithDiskHealthChecks(o.FS, 5*time.Second,
		func(info vfs.DiskSlowInfo) {
			o.EventListener.DiskSlow(info)
		})
	return o
}
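
// exampleInMemoryOptions is an illustrative sketch, not part of the Pebble
// API: it supplies an in-memory filesystem (handy for tests) and then wraps it
// with the default disk-health-checking middleware via WithFSDefaults. The use
// of vfs.NewMem and this particular call order are assumptions made for
// demonstration.
func exampleInMemoryOptions() *Options {
	o := &Options{FS: vfs.NewMem()}
	o.EnsureDefaults() // leaves the caller-provided FS in place
	return o.WithFSDefaults()
}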

// AddEventListener adds the provided event listener to the Options, in addition
// to any existing event listener.
func (o *Options) AddEventListener(l EventListener) {
	if o.EventListener != nil {
		l = TeeEventListener(l, *o.EventListener)
	}
	o.EventListener = &l
}

func (o *Options) equal() Equal {
	if o.Comparer.Equal == nil {
		return bytes.Equal
	}
	return o.Comparer.Equal
}

// initMaps initializes the Comparers, Filters, and Mergers maps.
func (o *Options) initMaps() {
	for i := range o.Levels {
		l := &o.Levels[i]
		if l.FilterPolicy != nil {
			if o.Filters == nil {
				o.Filters = make(map[string]FilterPolicy)
			}
			name := l.FilterPolicy.Name()
			if _, ok := o.Filters[name]; !ok {
				o.Filters[name] = l.FilterPolicy
			}
		}
	}
}

// Level returns the LevelOptions for the specified level.
func (o *Options) Level(level int) LevelOptions {
	if level < len(o.Levels) {
		return o.Levels[level]
	}
	n := len(o.Levels) - 1
	l := o.Levels[n]
	for i := n; i < level; i++ {
		l.TargetFileSize *= 2
	}
	return l
}

// Clone creates a shallow-copy of the supplied options.
func (o *Options) Clone() *Options {
	n := &Options{}
	if o != nil {
		*n = *o
	}
	return n
}

func filterPolicyName(p FilterPolicy) string {
	if p == nil {
		return "none"
	}
	return p.Name()
}

func (o *Options) String() string {
	var buf bytes.Buffer

	cacheSize := int64(cacheDefaultSize)
	if o.Cache != nil {
		cacheSize = o.Cache.MaxSize()
	}

	fmt.Fprintf(&buf, "[Version]\n")
	fmt.Fprintf(&buf, "  pebble_version=0.1\n")
	fmt.Fprintf(&buf, "\n")
	fmt.Fprintf(&buf, "[Options]\n")
	fmt.Fprintf(&buf, "  bytes_per_sync=%d\n", o.BytesPerSync)
	fmt.Fprintf(&buf, "  cache_size=%d\n", cacheSize)
	fmt.Fprintf(&buf, "  cleaner=%s\n", o.Cleaner)
	fmt.Fprintf(&buf, "  compaction_debt_concurrency=%d\n", o.Experimental.CompactionDebtConcurrency)
	fmt.Fprintf(&buf, "  comparer=%s\n", o.Comparer.Name)
	fmt.Fprintf(&buf, "  disable_wal=%t\n", o.DisableWAL)
	if o.Experimental.DisableIngestAsFlushable != nil && o.Experimental.DisableIngestAsFlushable() {
		fmt.Fprintf(&buf, "  disable_ingest_as_flushable=%t\n", true)
	}
	fmt.Fprintf(&buf, "  flush_delay_delete_range=%s\n", o.FlushDelayDeleteRange)
	fmt.Fprintf(&buf, "  flush_delay_range_key=%s\n", o.FlushDelayRangeKey)
	fmt.Fprintf(&buf, "  flush_split_bytes=%d\n", o.FlushSplitBytes)
	fmt.Fprintf(&buf, "  format_major_version=%d\n", o.FormatMajorVersion)
	fmt.Fprintf(&buf, "  l0_compaction_concurrency=%d\n", o.Experimental.L0CompactionConcurrency)
	fmt.Fprintf(&buf, "  l0_compaction_file_threshold=%d\n", o.L0CompactionFileThreshold)
	fmt.Fprintf(&buf, "  l0_compaction_threshold=%d\n", o.L0CompactionThreshold)
	fmt.Fprintf(&buf, "  l0_stop_writes_threshold=%d\n", o.L0StopWritesThreshold)
	fmt.Fprintf(&buf, "  lbase_max_bytes=%d\n", o.LBaseMaxBytes)
	if o.Experimental.LevelMultiplier != defaultLevelMultiplier {
		fmt.Fprintf(&buf, "  level_multiplier=%d\n", o.Experimental.LevelMultiplier)
	}
	fmt.Fprintf(&buf, "  max_concurrent_compactions=%d\n", o.MaxConcurrentCompactions())
	fmt.Fprintf(&buf, "  max_manifest_file_size=%d\n", o.MaxManifestFileSize)
	fmt.Fprintf(&buf, "  max_open_files=%d\n", o.MaxOpenFiles)
	fmt.Fprintf(&buf, "  mem_table_size=%d\n", o.MemTableSize)
	fmt.Fprintf(&buf, "  mem_table_stop_writes_threshold=%d\n",
		o.MemTableStopWritesThreshold)
	fmt.Fprintf(&buf, "  min_deletion_rate=%d\n", o.TargetByteDeletionRate)
	fmt.Fprintf(&buf, "  merger=%s\n", o.Merger.Name)
	fmt.Fprintf(&buf, "  read_compaction_rate=%d\n", o.Experimental.ReadCompactionRate)
	fmt.Fprintf(&buf, "  read_sampling_multiplier=%d\n", o.Experimental.ReadSamplingMultiplier)
	fmt.Fprintf(&buf, "  strict_wal_tail=%t\n", o.private.strictWALTail)
	fmt.Fprintf(&buf, "  table_cache_shards=%d\n", o.Experimental.TableCacheShards)
	fmt.Fprintf(&buf, "  table_property_collectors=[")
	for i := range o.TablePropertyCollectors {
		if i > 0 {
			fmt.Fprintf(&buf, ",")
		}
		// NB: This creates a new TablePropertyCollector, but Options.String() is
		// called rarely so the overhead of doing so is not consequential.
		fmt.Fprintf(&buf, "%s", o.TablePropertyCollectors[i]().Name())
	}
	fmt.Fprintf(&buf, "]\n")
	fmt.Fprintf(&buf, "  validate_on_ingest=%t\n", o.Experimental.ValidateOnIngest)
	fmt.Fprintf(&buf, "  wal_dir=%s\n", o.WALDir)
	fmt.Fprintf(&buf, "  wal_bytes_per_sync=%d\n", o.WALBytesPerSync)
	fmt.Fprintf(&buf, "  max_writer_concurrency=%d\n", o.Experimental.MaxWriterConcurrency)
	fmt.Fprintf(&buf, "  force_writer_parallelism=%t\n", o.Experimental.ForceWriterParallelism)
	fmt.Fprintf(&buf, "  secondary_cache_size_bytes=%d\n", o.Experimental.SecondaryCacheSizeBytes)
	fmt.Fprintf(&buf, "  create_on_shared=%d\n", o.Experimental.CreateOnShared)

	// Private options.
	//
	// These options are only encoded if true, because we do not want them to
	// appear in production serialized Options files, since they're testing-only
	// options. They're only serialized when true, which still ensures that the
	// metamorphic tests may propagate them to subprocesses.
	if o.private.disableDeleteOnlyCompactions {
		fmt.Fprintln(&buf, "  disable_delete_only_compactions=true")
	}
	if o.private.disableElisionOnlyCompactions {
		fmt.Fprintln(&buf, "  disable_elision_only_compactions=true")
	}
	if o.private.disableLazyCombinedIteration {
		fmt.Fprintln(&buf, "  disable_lazy_combined_iteration=true")
	}

	for i := range o.Levels {
		l := &o.Levels[i]
		fmt.Fprintf(&buf, "\n")
		fmt.Fprintf(&buf, "[Level \"%d\"]\n", i)
		fmt.Fprintf(&buf, "  block_restart_interval=%d\n", l.BlockRestartInterval)
		fmt.Fprintf(&buf, "  block_size=%d\n", l.BlockSize)
		fmt.Fprintf(&buf, "  block_size_threshold=%d\n", l.BlockSizeThreshold)
		fmt.Fprintf(&buf, "  compression=%s\n", l.Compression)
		fmt.Fprintf(&buf, "  filter_policy=%s\n", filterPolicyName(l.FilterPolicy))
		fmt.Fprintf(&buf, "  filter_type=%s\n", l.FilterType)
		fmt.Fprintf(&buf, "  index_block_size=%d\n", l.IndexBlockSize)
		fmt.Fprintf(&buf, "  target_file_size=%d\n", l.TargetFileSize)
	}

	return buf.String()
}
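
// exampleParseOptionsString is an illustrative sketch, not part of the Pebble
// API, of the round trip between Options.String above and the INI-style parser
// below: it walks the serialized form and collects the key/value pairs in the
// [Options] section. The map accumulator is an assumption for demonstration.
func exampleParseOptionsString(o *Options) (map[string]string, error) {
	seen := make(map[string]string)
	err := parseOptions(o.String(), func(section, key, value string) error {
		if section == "Options" {
			seen[key] = value
		}
		return nil
	})
	return seen, err
}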
1340 } 1341 return base.CorruptionErrorf("invalid key=value syntax: %q", errors.Safe(line)) 1342 } 1343 1344 key := strings.TrimSpace(line[:pos]) 1345 value := strings.TrimSpace(line[pos+1:]) 1346 1347 // RocksDB uses a similar (INI-style) syntax for the OPTIONS file, but 1348 // different section names and keys. The "CFOptions ..." paths are the 1349 // RocksDB versions which we map to the Pebble paths. 1350 mappedSection := section 1351 if section == `CFOptions "default"` { 1352 mappedSection = "Options" 1353 switch key { 1354 case "comparator": 1355 key = "comparer" 1356 case "merge_operator": 1357 key = "merger" 1358 } 1359 } 1360 1361 if err := fn(mappedSection, key, value); err != nil { 1362 return err 1363 } 1364 } 1365 return nil 1366 } 1367 1368 // ParseHooks contains callbacks to create options fields which can have 1369 // user-defined implementations. 1370 type ParseHooks struct { 1371 NewCache func(size int64) *Cache 1372 NewCleaner func(name string) (Cleaner, error) 1373 NewComparer func(name string) (*Comparer, error) 1374 NewFilterPolicy func(name string) (FilterPolicy, error) 1375 NewMerger func(name string) (*Merger, error) 1376 SkipUnknown func(name, value string) bool 1377 } 1378 1379 // Parse parses the options from the specified string. Note that certain 1380 // options cannot be parsed into populated fields. For example, comparer and 1381 // merger. 1382 func (o *Options) Parse(s string, hooks *ParseHooks) error { 1383 return parseOptions(s, func(section, key, value string) error { 1384 // WARNING: DO NOT remove entries from the switches below because doing so 1385 // causes a key previously written to the OPTIONS file to be considered unknown, 1386 // a backwards incompatible change. Instead, leave in support for parsing the 1387 // key but simply don't parse the value. 1388 1389 switch { 1390 case section == "Version": 1391 switch key { 1392 case "pebble_version": 1393 default: 1394 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { 1395 return nil 1396 } 1397 return errors.Errorf("pebble: unknown option: %s.%s", 1398 errors.Safe(section), errors.Safe(key)) 1399 } 1400 return nil 1401 1402 case section == "Options": 1403 var err error 1404 switch key { 1405 case "bytes_per_sync": 1406 o.BytesPerSync, err = strconv.Atoi(value) 1407 case "cache_size": 1408 var n int64 1409 n, err = strconv.ParseInt(value, 10, 64) 1410 if err == nil && hooks != nil && hooks.NewCache != nil { 1411 if o.Cache != nil { 1412 o.Cache.Unref() 1413 } 1414 o.Cache = hooks.NewCache(n) 1415 } 1416 // We avoid calling cache.New in parsing because it makes it 1417 // too easy to leak a cache. 1418 case "cleaner": 1419 switch value { 1420 case "archive": 1421 o.Cleaner = ArchiveCleaner{} 1422 case "delete": 1423 o.Cleaner = DeleteCleaner{} 1424 default: 1425 if hooks != nil && hooks.NewCleaner != nil { 1426 o.Cleaner, err = hooks.NewCleaner(value) 1427 } 1428 } 1429 case "comparer": 1430 switch value { 1431 case "leveldb.BytewiseComparator": 1432 o.Comparer = DefaultComparer 1433 default: 1434 if hooks != nil && hooks.NewComparer != nil { 1435 o.Comparer, err = hooks.NewComparer(value) 1436 } 1437 } 1438 case "compaction_debt_concurrency": 1439 o.Experimental.CompactionDebtConcurrency, err = strconv.ParseUint(value, 10, 64) 1440 case "delete_range_flush_delay": 1441 // NB: This is a deprecated serialization of the 1442 // `flush_delay_delete_range`. 
1443 o.FlushDelayDeleteRange, err = time.ParseDuration(value) 1444 case "disable_delete_only_compactions": 1445 o.private.disableDeleteOnlyCompactions, err = strconv.ParseBool(value) 1446 case "disable_elision_only_compactions": 1447 o.private.disableElisionOnlyCompactions, err = strconv.ParseBool(value) 1448 case "disable_ingest_as_flushable": 1449 var v bool 1450 v, err = strconv.ParseBool(value) 1451 if err == nil { 1452 o.Experimental.DisableIngestAsFlushable = func() bool { return v } 1453 } 1454 case "disable_lazy_combined_iteration": 1455 o.private.disableLazyCombinedIteration, err = strconv.ParseBool(value) 1456 case "disable_wal": 1457 o.DisableWAL, err = strconv.ParseBool(value) 1458 case "flush_delay_delete_range": 1459 o.FlushDelayDeleteRange, err = time.ParseDuration(value) 1460 case "flush_delay_range_key": 1461 o.FlushDelayRangeKey, err = time.ParseDuration(value) 1462 case "flush_split_bytes": 1463 o.FlushSplitBytes, err = strconv.ParseInt(value, 10, 64) 1464 case "format_major_version": 1465 // NB: The version written here may be stale. Open does 1466 // not use the format major version encoded in the 1467 // OPTIONS file other than to validate that the encoded 1468 // version is valid right here. 1469 var v uint64 1470 v, err = strconv.ParseUint(value, 10, 64) 1471 if vers := FormatMajorVersion(v); vers > internalFormatNewest || vers == FormatDefault { 1472 err = errors.Newf("unknown format major version %d", vers) 1473 } 1474 if err == nil { 1475 o.FormatMajorVersion = FormatMajorVersion(v) 1476 } 1477 case "l0_compaction_concurrency": 1478 o.Experimental.L0CompactionConcurrency, err = strconv.Atoi(value) 1479 case "l0_compaction_file_threshold": 1480 o.L0CompactionFileThreshold, err = strconv.Atoi(value) 1481 case "l0_compaction_threshold": 1482 o.L0CompactionThreshold, err = strconv.Atoi(value) 1483 case "l0_stop_writes_threshold": 1484 o.L0StopWritesThreshold, err = strconv.Atoi(value) 1485 case "l0_sublevel_compactions": 1486 // Do nothing; option existed in older versions of pebble. 1487 case "lbase_max_bytes": 1488 o.LBaseMaxBytes, err = strconv.ParseInt(value, 10, 64) 1489 case "level_multiplier": 1490 o.Experimental.LevelMultiplier, err = strconv.Atoi(value) 1491 case "max_concurrent_compactions": 1492 var concurrentCompactions int 1493 concurrentCompactions, err = strconv.Atoi(value) 1494 if concurrentCompactions <= 0 { 1495 err = errors.New("max_concurrent_compactions cannot be <= 0") 1496 } else { 1497 o.MaxConcurrentCompactions = func() int { return concurrentCompactions } 1498 } 1499 case "max_manifest_file_size": 1500 o.MaxManifestFileSize, err = strconv.ParseInt(value, 10, 64) 1501 case "max_open_files": 1502 o.MaxOpenFiles, err = strconv.Atoi(value) 1503 case "mem_table_size": 1504 o.MemTableSize, err = strconv.ParseUint(value, 10, 64) 1505 case "mem_table_stop_writes_threshold": 1506 o.MemTableStopWritesThreshold, err = strconv.Atoi(value) 1507 case "min_compaction_rate": 1508 // Do nothing; option existed in older versions of pebble, and 1509 // may be meaningful again eventually. 1510 case "min_deletion_rate": 1511 o.TargetByteDeletionRate, err = strconv.Atoi(value) 1512 case "min_flush_rate": 1513 // Do nothing; option existed in older versions of pebble, and 1514 // may be meaningful again eventually. 1515 case "point_tombstone_weight": 1516 // Do nothing; deprecated.
1517 case "strict_wal_tail": 1518 o.private.strictWALTail, err = strconv.ParseBool(value) 1519 case "merger": 1520 switch value { 1521 case "nullptr": 1522 o.Merger = nil 1523 case "pebble.concatenate": 1524 o.Merger = DefaultMerger 1525 default: 1526 if hooks != nil && hooks.NewMerger != nil { 1527 o.Merger, err = hooks.NewMerger(value) 1528 } 1529 } 1530 case "read_compaction_rate": 1531 o.Experimental.ReadCompactionRate, err = strconv.ParseInt(value, 10, 64) 1532 case "read_sampling_multiplier": 1533 o.Experimental.ReadSamplingMultiplier, err = strconv.ParseInt(value, 10, 64) 1534 case "table_cache_shards": 1535 o.Experimental.TableCacheShards, err = strconv.Atoi(value) 1536 case "table_format": 1537 switch value { 1538 case "leveldb": 1539 case "rocksdbv2": 1540 default: 1541 return errors.Errorf("pebble: unknown table format: %q", errors.Safe(value)) 1542 } 1543 case "table_property_collectors": 1544 // TODO(peter): set o.TablePropertyCollectors 1545 case "validate_on_ingest": 1546 o.Experimental.ValidateOnIngest, err = strconv.ParseBool(value) 1547 case "wal_dir": 1548 o.WALDir = value 1549 case "wal_bytes_per_sync": 1550 o.WALBytesPerSync, err = strconv.Atoi(value) 1551 case "max_writer_concurrency": 1552 o.Experimental.MaxWriterConcurrency, err = strconv.Atoi(value) 1553 case "force_writer_parallelism": 1554 o.Experimental.ForceWriterParallelism, err = strconv.ParseBool(value) 1555 case "secondary_cache_size_bytes": 1556 o.Experimental.SecondaryCacheSizeBytes, err = strconv.ParseInt(value, 10, 64) 1557 case "create_on_shared": 1558 var createOnSharedInt int64 1559 createOnSharedInt, err = strconv.ParseInt(value, 10, 64) 1560 o.Experimental.CreateOnShared = remote.CreateOnSharedStrategy(createOnSharedInt) 1561 default: 1562 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { 1563 return nil 1564 } 1565 return errors.Errorf("pebble: unknown option: %s.%s", 1566 errors.Safe(section), errors.Safe(key)) 1567 } 1568 return err 1569 1570 case strings.HasPrefix(section, "Level "): 1571 var index int 1572 if n, err := fmt.Sscanf(section, `Level "%d"`, &index); err != nil { 1573 return err 1574 } else if n != 1 { 1575 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section, value) { 1576 return nil 1577 } 1578 return errors.Errorf("pebble: unknown section: %q", errors.Safe(section)) 1579 } 1580 1581 if len(o.Levels) <= index { 1582 newLevels := make([]LevelOptions, index+1) 1583 copy(newLevels, o.Levels) 1584 o.Levels = newLevels 1585 } 1586 l := &o.Levels[index] 1587 1588 var err error 1589 switch key { 1590 case "block_restart_interval": 1591 l.BlockRestartInterval, err = strconv.Atoi(value) 1592 case "block_size": 1593 l.BlockSize, err = strconv.Atoi(value) 1594 case "block_size_threshold": 1595 l.BlockSizeThreshold, err = strconv.Atoi(value) 1596 case "compression": 1597 switch value { 1598 case "Default": 1599 l.Compression = DefaultCompression 1600 case "NoCompression": 1601 l.Compression = NoCompression 1602 case "Snappy": 1603 l.Compression = SnappyCompression 1604 case "ZSTD": 1605 l.Compression = ZstdCompression 1606 default: 1607 return errors.Errorf("pebble: unknown compression: %q", errors.Safe(value)) 1608 } 1609 case "filter_policy": 1610 if hooks != nil && hooks.NewFilterPolicy != nil { 1611 l.FilterPolicy, err = hooks.NewFilterPolicy(value) 1612 } 1613 case "filter_type": 1614 switch value { 1615 case "table": 1616 l.FilterType = TableFilter 1617 default: 1618 return errors.Errorf("pebble: unknown filter type: %q", 
errors.Safe(value)) 1619 } 1620 case "index_block_size": 1621 l.IndexBlockSize, err = strconv.Atoi(value) 1622 case "target_file_size": 1623 l.TargetFileSize, err = strconv.ParseInt(value, 10, 64) 1624 default: 1625 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { 1626 return nil 1627 } 1628 return errors.Errorf("pebble: unknown option: %s.%s", errors.Safe(section), errors.Safe(key)) 1629 } 1630 return err 1631 } 1632 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { 1633 return nil 1634 } 1635 return errors.Errorf("pebble: unknown section: %q", errors.Safe(section)) 1636 }) 1637 } 1638 1639 func (o *Options) checkOptions(s string) (strictWALTail bool, err error) { 1640 // TODO(jackson): Refactor to avoid awkwardness of the strictWALTail return value. 1641 return strictWALTail, parseOptions(s, func(section, key, value string) error { 1642 switch section + "." + key { 1643 case "Options.comparer": 1644 if value != o.Comparer.Name { 1645 return errors.Errorf("pebble: comparer name from file %q != comparer name from options %q", 1646 errors.Safe(value), errors.Safe(o.Comparer.Name)) 1647 } 1648 case "Options.merger": 1649 // RocksDB allows the merge operator to be unspecified, in which case it 1650 // shows up as "nullptr". 1651 if value != "nullptr" && value != o.Merger.Name { 1652 return errors.Errorf("pebble: merger name from file %q != merger name from options %q", 1653 errors.Safe(value), errors.Safe(o.Merger.Name)) 1654 } 1655 case "Options.strict_wal_tail": 1656 strictWALTail, err = strconv.ParseBool(value) 1657 if err != nil { 1658 return errors.Errorf("pebble: error parsing strict_wal_tail value %q: %w", value, err) 1659 } 1660 } 1661 return nil 1662 }) 1663 } 1664 1665 // Check verifies the options are compatible with the previous options 1666 // serialized by Options.String(). For example, the Comparer and Merger must be 1667 // the same, or data will not be able to be properly read from the DB. 1668 func (o *Options) Check(s string) error { 1669 _, err := o.checkOptions(s) 1670 return err 1671 } 1672 1673 // Validate verifies that the options are mutually consistent. For example, 1674 // L0StopWritesThreshold must be >= L0CompactionThreshold, otherwise a write 1675 // stall would persist indefinitely. 1676 func (o *Options) Validate() error { 1677 // Note that we can presume Options.EnsureDefaults has been called, so there 1678 // is no need to check for zero values. 
1679 1680 var buf strings.Builder 1681 if o.Experimental.L0CompactionConcurrency < 1 { 1682 fmt.Fprintf(&buf, "L0CompactionConcurrency (%d) must be >= 1\n", 1683 o.Experimental.L0CompactionConcurrency) 1684 } 1685 if o.L0StopWritesThreshold < o.L0CompactionThreshold { 1686 fmt.Fprintf(&buf, "L0StopWritesThreshold (%d) must be >= L0CompactionThreshold (%d)\n", 1687 o.L0StopWritesThreshold, o.L0CompactionThreshold) 1688 } 1689 if uint64(o.MemTableSize) >= maxMemTableSize { 1690 fmt.Fprintf(&buf, "MemTableSize (%s) must be < %s\n", 1691 humanize.Bytes.Uint64(uint64(o.MemTableSize)), humanize.Bytes.Uint64(maxMemTableSize)) 1692 } 1693 if o.MemTableStopWritesThreshold < 2 { 1694 fmt.Fprintf(&buf, "MemTableStopWritesThreshold (%d) must be >= 2\n", 1695 o.MemTableStopWritesThreshold) 1696 } 1697 if o.FormatMajorVersion > internalFormatNewest { 1698 fmt.Fprintf(&buf, "FormatMajorVersion (%d) must be <= %d\n", 1699 o.FormatMajorVersion, internalFormatNewest) 1700 } 1701 if o.TableCache != nil && o.Cache != o.TableCache.cache { 1702 fmt.Fprintf(&buf, "underlying cache in the TableCache and the Cache don't match\n") 1703 } 1704 if buf.Len() == 0 { 1705 return nil 1706 } 1707 return errors.New(buf.String()) 1708 } 1709 1710 // MakeReaderOptions constructs sstable.ReaderOptions from the corresponding 1711 // options in the receiver. 1712 func (o *Options) MakeReaderOptions() sstable.ReaderOptions { 1713 var readerOpts sstable.ReaderOptions 1714 if o != nil { 1715 readerOpts.Cache = o.Cache 1716 readerOpts.Comparer = o.Comparer 1717 readerOpts.Filters = o.Filters 1718 if o.Merger != nil { 1719 readerOpts.Merge = o.Merger.Merge 1720 readerOpts.MergerName = o.Merger.Name 1721 } 1722 readerOpts.LoggerAndTracer = o.LoggerAndTracer 1723 } 1724 return readerOpts 1725 } 1726 1727 // MakeWriterOptions constructs sstable.WriterOptions for the specified level 1728 // from the corresponding options in the receiver. 1729 func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstable.WriterOptions { 1730 var writerOpts sstable.WriterOptions 1731 writerOpts.TableFormat = format 1732 if o != nil { 1733 writerOpts.Cache = o.Cache 1734 writerOpts.Comparer = o.Comparer 1735 if o.Merger != nil { 1736 writerOpts.MergerName = o.Merger.Name 1737 } 1738 writerOpts.TablePropertyCollectors = o.TablePropertyCollectors 1739 writerOpts.BlockPropertyCollectors = o.BlockPropertyCollectors 1740 } 1741 if format >= sstable.TableFormatPebblev3 { 1742 writerOpts.ShortAttributeExtractor = o.Experimental.ShortAttributeExtractor 1743 writerOpts.RequiredInPlaceValueBound = o.Experimental.RequiredInPlaceValueBound 1744 if format >= sstable.TableFormatPebblev4 && level == numLevels-1 { 1745 writerOpts.WritingToLowestLevel = true 1746 } 1747 } 1748 levelOpts := o.Level(level) 1749 writerOpts.BlockRestartInterval = levelOpts.BlockRestartInterval 1750 writerOpts.BlockSize = levelOpts.BlockSize 1751 writerOpts.BlockSizeThreshold = levelOpts.BlockSizeThreshold 1752 writerOpts.Compression = levelOpts.Compression 1753 writerOpts.FilterPolicy = levelOpts.FilterPolicy 1754 writerOpts.FilterType = levelOpts.FilterType 1755 writerOpts.IndexBlockSize = levelOpts.IndexBlockSize 1756 return writerOpts 1757 }
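// The helper below is an illustrative sketch rather than part of options.go:
// it shows how the pieces above typically compose when reopening a store with
// a previously persisted OPTIONS string. The function name and the `stored`
// argument are hypothetical; only Options.Check, Options.Parse, ParseHooks,
// and Options.Validate are taken from this file. Passing a non-nil *ParseHooks
// would let callers supply user-defined comparers, mergers, cleaners, and
// filter policies that cannot be reconstructed from the serialized form.
func reopenWithStoredOptions(o *Options, stored string) error {
	// Refuse to proceed if the persisted Comparer or Merger names differ
	// from the ones configured on o.
	if err := o.Check(stored); err != nil {
		return err
	}
	// Re-apply the persisted settings on top of o. A nil *ParseHooks means
	// options that require user-defined implementations are left unparsed.
	if err := o.Parse(stored, nil); err != nil {
		return err
	}
	// Verify the resulting options are mutually consistent (for example,
	// L0StopWritesThreshold >= L0CompactionThreshold).
	return o.Validate()
}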