github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/options.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package bitalostable 6 7 import ( 8 "bytes" 9 "fmt" 10 "io" 11 "runtime" 12 "strconv" 13 "strings" 14 "time" 15 16 "github.com/cockroachdb/errors" 17 "github.com/zuoyebang/bitalostable/internal/base" 18 "github.com/zuoyebang/bitalostable/internal/cache" 19 "github.com/zuoyebang/bitalostable/internal/humanize" 20 "github.com/zuoyebang/bitalostable/internal/manifest" 21 "github.com/zuoyebang/bitalostable/sstable" 22 "github.com/zuoyebang/bitalostable/vfs" 23 ) 24 25 const ( 26 cacheDefaultSize = 8 << 20 // 8 MB 27 ) 28 29 // Compression exports the base.Compression type. 30 type Compression = sstable.Compression 31 32 // Exported Compression constants. 33 const ( 34 DefaultCompression = sstable.DefaultCompression 35 NoCompression = sstable.NoCompression 36 SnappyCompression = sstable.SnappyCompression 37 ZstdCompression = sstable.ZstdCompression 38 ) 39 40 // FilterType exports the base.FilterType type. 41 type FilterType = base.FilterType 42 43 // Exported TableFilter constants. 44 const ( 45 TableFilter = base.TableFilter 46 ) 47 48 // FilterWriter exports the base.FilterWriter type. 49 type FilterWriter = base.FilterWriter 50 51 // FilterPolicy exports the base.FilterPolicy type. 52 type FilterPolicy = base.FilterPolicy 53 54 // TablePropertyCollector exports the sstable.TablePropertyCollector type. 55 type TablePropertyCollector = sstable.TablePropertyCollector 56 57 // BlockPropertyCollector exports the sstable.BlockPropertyCollector type. 58 type BlockPropertyCollector = sstable.BlockPropertyCollector 59 60 // BlockPropertyFilter exports the base.BlockPropertyFilter type. 61 type BlockPropertyFilter = base.BlockPropertyFilter 62 63 // IterKeyType configures which types of keys an iterator should surface. 64 type IterKeyType int8 65 66 const ( 67 // IterKeyTypePointsOnly configures an iterator to iterate over point keys 68 // only. 69 IterKeyTypePointsOnly IterKeyType = iota 70 // IterKeyTypeRangesOnly configures an iterator to iterate over range keys 71 // only. 72 IterKeyTypeRangesOnly 73 // IterKeyTypePointsAndRanges configures an iterator to iterate over both point 74 // keys and range keys simultaneously. 75 IterKeyTypePointsAndRanges 76 ) 77 78 // String implements fmt.Stringer. 79 func (t IterKeyType) String() string { 80 switch t { 81 case IterKeyTypePointsOnly: 82 return "points-only" 83 case IterKeyTypeRangesOnly: 84 return "ranges-only" 85 case IterKeyTypePointsAndRanges: 86 return "points-and-ranges" 87 default: 88 panic(fmt.Sprintf("unknown key type %d", t)) 89 } 90 } 91 92 // IterOptions hold the optional per-query parameters for NewIter. 93 // 94 // Like Options, a nil *IterOptions is valid and means to use the default 95 // values. 96 type IterOptions struct { 97 // LowerBound specifies the smallest key (inclusive) that the iterator will 98 // return during iteration. If the iterator is seeked or iterated past this 99 // boundary the iterator will return Valid()==false. Setting LowerBound 100 // effectively truncates the key space visible to the iterator. 101 LowerBound []byte 102 // UpperBound specifies the largest key (exclusive) that the iterator will 103 // return during iteration.
If the iterator is seeked or iterated past this 104 // boundary the iterator will return Valid()==false. Setting UpperBound 105 // effectively truncates the key space visible to the iterator. 106 UpperBound []byte 107 // TableFilter can be used to filter the tables that are scanned during 108 // iteration based on the user properties. Return true to scan the table and 109 // false to skip scanning. This function must be thread-safe since the same 110 // function can be used by multiple iterators, if the iterator is cloned. 111 TableFilter func(userProps map[string]string) bool 112 // PointKeyFilters can be used to avoid scanning tables and blocks in tables 113 // when iterating over point keys. It is required that this slice is sorted in 114 // increasing order of the BlockPropertyFilter.ShortID. This slice represents 115 // an intersection across all filters, i.e., all filters must indicate that the 116 // block is relevant. 117 PointKeyFilters []BlockPropertyFilter 118 // RangeKeyFilters can be used to avoid scanning tables and blocks in tables 119 // when iterating over range keys. The same requirements that apply to 120 // PointKeyFilters apply here too. 121 RangeKeyFilters []BlockPropertyFilter 122 // KeyTypes configures which types of keys to iterate over: point keys, 123 // range keys, or both. 124 KeyTypes IterKeyType 125 // RangeKeyMasking can be used to enable automatic masking of point keys by 126 // range keys. Range key masking is only supported during combined range key 127 // and point key iteration mode (IterKeyTypePointsAndRanges). 128 RangeKeyMasking RangeKeyMasking 129 130 // OnlyReadGuaranteedDurable is an advanced option that is only supported by 131 // the Reader implemented by DB. When set to true, only the guaranteed to be 132 // durable state is visible in the iterator. 133 // - This definition is made under the assumption that the FS implementation 134 // is providing a durability guarantee when data is synced. 135 // - The visible state represents a consistent point in the history of the 136 // DB. 137 // - The implementation is free to choose a conservative definition of what 138 // is guaranteed durable. For simplicity, the current implementation 139 // ignores memtables. A more sophisticated implementation could track the 140 // highest seqnum that is synced to the WAL and published and use that as 141 // the visible seqnum for an iterator. Note that the latter approach is 142 // not strictly better than the former since we can have DBs that are (a) 143 // synced more rarely than memtable flushes, (b) have no WAL. (a) is 144 // likely to be true in a future CockroachDB context where the DB 145 // containing the state machine may be rarely synced. 146 // NB: this current implementation relies on the fact that memtables are 147 // flushed in seqnum order, and any ingested sstables that happen to have a 148 // lower seqnum than a non-flushed memtable don't have any overlapping keys. 149 // This is the fundamental level invariant used in other code too, like when 150 // merging iterators. 151 // 152 // Semantically, using this option provides the caller a "snapshot" as of 153 // the time the most recent memtable was flushed. An alternate interface 154 // would be to add a NewSnapshot variant. Creating a snapshot is heavier 155 // weight than creating an iterator, so we have opted to support this 156 // iterator option. 157 OnlyReadGuaranteedDurable bool 158 // UseL6Filters allows the caller to opt into reading filter blocks for L6 159 // sstables.
Helpful if a lot of SeekPrefixGEs are expected in quick 160 // succession that are unlikely to yield even a single key. Filter blocks in 161 // L6 can be relatively large, often larger than data blocks, so the benefit of 162 // loading them in the cache is minimized if the probability of the key 163 // existing is not low or if we just expect a one-time Seek (where loading the 164 // data block directly is better). 165 UseL6Filters bool 166 // Internal options. 167 logger Logger 168 // Level corresponding to this file. Only passed in if constructed by a 169 // levelIter. 170 level manifest.Level 171 172 // NB: If adding new Options, you must account for them in iterator 173 // construction and Iterator.SetOptions. 174 } 175 176 // GetLowerBound returns the LowerBound or nil if the receiver is nil. 177 func (o *IterOptions) GetLowerBound() []byte { 178 if o == nil { 179 return nil 180 } 181 return o.LowerBound 182 } 183 184 // GetUpperBound returns the UpperBound or nil if the receiver is nil. 185 func (o *IterOptions) GetUpperBound() []byte { 186 if o == nil { 187 return nil 188 } 189 return o.UpperBound 190 } 191 192 func (o *IterOptions) pointKeys() bool { 193 if o == nil { 194 return true 195 } 196 return o.KeyTypes == IterKeyTypePointsOnly || o.KeyTypes == IterKeyTypePointsAndRanges 197 } 198 199 func (o *IterOptions) rangeKeys() bool { 200 if o == nil { 201 return false 202 } 203 return o.KeyTypes == IterKeyTypeRangesOnly || o.KeyTypes == IterKeyTypePointsAndRanges 204 } 205 206 func (o *IterOptions) getLogger() Logger { 207 if o == nil || o.logger == nil { 208 return DefaultLogger 209 } 210 return o.logger 211 } 212 213 // RangeKeyMasking configures automatic hiding of point keys by range keys. A 214 // non-nil Suffix enables range-key masking. When enabled, range keys with 215 // suffixes ≥ Suffix behave as masks. All point keys that are contained within a 216 // masking range key's bounds and have suffixes greater than the range key's 217 // suffix are automatically skipped. 218 // 219 // Specifically, when configured with a RangeKeyMasking.Suffix _s_, and there 220 // exists a range key with suffix _r_ covering a point key with suffix _p_, and 221 // 222 // _s_ ≤ _r_ < _p_ 223 // 224 // then the point key is elided. 225 // 226 // Range-key masking may only be used when iterating over both point keys and 227 // range keys with IterKeyTypePointsAndRanges. 228 type RangeKeyMasking struct { 229 // Suffix configures which range keys may mask point keys. Only range keys 230 // that are defined at suffixes greater than or equal to Suffix will mask 231 // point keys. 232 Suffix []byte 233 // Filter is an optional field that may be used to improve performance of 234 // range-key masking through a block-property filter defined over key 235 // suffixes. If non-nil, Filter is called by Pebble to construct a 236 // block-property filter mask at iterator creation. The filter is used to 237 // skip whole point-key blocks containing point keys with suffixes greater 238 // than a covering range-key's suffix. 239 // 240 // To use this functionality, the caller must create and configure (through 241 // Options.BlockPropertyCollectors) a block-property collector that records 242 // the maximum suffix contained within a block. The caller then must write 243 // and provide a BlockPropertyFilterMask implementation on that same 244 // property. See the BlockPropertyFilterMask type for more information.
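//
// An illustrative wiring of Suffix and Filter follows (a hedged sketch: the
// suffix value and the mask constructor are hypothetical, not helpers
// provided by this package):
//
//	masking := RangeKeyMasking{
//		Suffix: mySuffixEncoding, // hypothetical suffix encoding, e.g. an MVCC timestamp
//		Filter: func() BlockPropertyFilterMask { return newMySuffixMask() }, // hypothetical mask
//	}
//	iterOpts := &IterOptions{
//		LowerBound:      []byte("a"),
//		UpperBound:      []byte("z"),
//		KeyTypes:        IterKeyTypePointsAndRanges,
//		RangeKeyMasking: masking,
//	}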
245 Filter func() BlockPropertyFilterMask 246 } 247 248 // BlockPropertyFilterMask extends the BlockPropertyFilter interface for use 249 // with range-key masking. Unlike an ordinary block property filter, a 250 // BlockPropertyFilterMask's filtering criteria is allowed to change when Pebble 251 // invokes its SetSuffix method. 252 // 253 // When a Pebble iterator steps into a range key's bounds and the range key has 254 // a suffix greater than or equal to RangeKeyMasking.Suffix, the range key acts 255 // as a mask. The masking range key hides all point keys that fall within the 256 // range key's bounds and have suffixes > the range key's suffix. Without a 257 // filter mask configured, Pebble performs this hiding by stepping through point 258 // keys and comparing suffixes. If large numbers of point keys are masked, this 259 // requires Pebble to load, iterate through and discard a large number of 260 // sstable blocks containing masked point keys. 261 // 262 // If a block-property collector and a filter mask are configured, Pebble may 263 // skip loading some point-key blocks altogether. If a block's keys are known to 264 // all fall within the bounds of the masking range key and the block was 265 // annotated by a block-property collector with the maximal suffix, Pebble can 266 // ask the filter mask to compare the property to the current masking range 267 // key's suffix. If the mask reports no intersection, the block may be skipped. 268 // 269 // If unsuffixed and suffixed keys are written to the database, care must be 270 // taken to avoid unintentionally masking un-suffixed keys located in the same 271 // block as suffixed keys. One solution is to interpret unsuffixed keys as 272 // containing the maximal suffix value, ensuring that blocks containing 273 // unsuffixed keys are always loaded. 274 type BlockPropertyFilterMask interface { 275 BlockPropertyFilter 276 277 // SetSuffix configures the mask with the suffix of a range key. The filter 278 // should return false from Intersects whenever it's provided with a 279 // property encoding a block's minimum suffix that's greater (according to 280 // Compare) than the provided suffix. 281 SetSuffix(suffix []byte) error 282 } 283 284 // WriteOptions hold the optional per-query parameters for Set and Delete 285 // operations. 286 // 287 // Like Options, a nil *WriteOptions is valid and means to use the default 288 // values. 289 type WriteOptions struct { 290 // Sync is whether to sync writes through the OS buffer cache and down onto 291 // the actual disk, if applicable. Setting Sync is required for durability of 292 // individual write operations but can result in slower writes. 293 // 294 // If false, and the process or machine crashes, then a recent write may be 295 // lost. This is due to the recently written data being buffered inside the 296 // process running Pebble. This differs from the semantics of a write system 297 // call in which the data is buffered in the OS buffer cache and would thus 298 // survive a process crash. 299 // 300 // The default value is true. 301 Sync bool 302 } 303 304 // Sync specifies the default write options for writes which synchronize to 305 // disk. 306 var Sync = &WriteOptions{Sync: true} 307 308 // NoSync specifies the default write options for writes which do not 309 // synchronize to disk. 310 var NoSync = &WriteOptions{Sync: false} 311 312 // GetSync returns the Sync value or true if the receiver is nil. 
313 func (o *WriteOptions) GetSync() bool { 314 return o == nil || o.Sync 315 } 316 317 // LevelOptions holds the optional per-level parameters. 318 type LevelOptions struct { 319 // BlockRestartInterval is the number of keys between restart points 320 // for delta encoding of keys. 321 // 322 // The default value is 16. 323 BlockRestartInterval int 324 325 // BlockSize is the target uncompressed size in bytes of each table block. 326 // 327 // The default value is 4096. 328 BlockSize int 329 330 // BlockSizeThreshold finishes a block if the block size is larger than the 331 // specified percentage of the target block size and adding the next entry 332 // would cause the block to be larger than the target block size. 333 // 334 // The default value is 90 335 BlockSizeThreshold int 336 337 // Compression defines the per-block compression to use. 338 // 339 // The default value (DefaultCompression) uses snappy compression. 340 Compression Compression 341 342 // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can 343 // reduce disk reads for Get calls. 344 // 345 // One such implementation is bloom.FilterPolicy(10) from the bitalostable/bloom 346 // package. 347 // 348 // The default value means to use no filter. 349 FilterPolicy FilterPolicy 350 351 // FilterType defines whether an existing filter policy is applied at a 352 // block-level or table-level. Block-level filters use less memory to create, 353 // but are slower to access as a check for the key in the index must first be 354 // performed to locate the filter block. A table-level filter will require 355 // memory proportional to the number of keys in an sstable to create, but 356 // avoids the index lookup when determining if a key is present. Table-level 357 // filters should be preferred except under constrained memory situations. 358 FilterType FilterType 359 360 // IndexBlockSize is the target uncompressed size in bytes of each index 361 // block. When the index block size is larger than this target, two-level 362 // indexes are automatically enabled. Setting this option to a large value 363 // (such as math.MaxInt32) disables the automatic creation of two-level 364 // indexes. 365 // 366 // The default value is the value of BlockSize. 367 IndexBlockSize int 368 369 // The target file size for the level. 370 TargetFileSize int64 371 } 372 373 // EnsureDefaults ensures that the default values for all of the options have 374 // been initialized. It is valid to call EnsureDefaults on a nil receiver. A 375 // non-nil result will always be returned. 376 func (o *LevelOptions) EnsureDefaults() *LevelOptions { 377 if o == nil { 378 o = &LevelOptions{} 379 } 380 if o.BlockRestartInterval <= 0 { 381 o.BlockRestartInterval = base.DefaultBlockRestartInterval 382 } 383 if o.BlockSize <= 0 { 384 o.BlockSize = base.DefaultBlockSize 385 } 386 if o.BlockSizeThreshold <= 0 { 387 o.BlockSizeThreshold = base.DefaultBlockSizeThreshold 388 } 389 if o.Compression <= DefaultCompression || o.Compression >= sstable.NCompression { 390 o.Compression = SnappyCompression 391 } 392 if o.IndexBlockSize <= 0 { 393 o.IndexBlockSize = o.BlockSize 394 } 395 if o.TargetFileSize <= 0 { 396 o.TargetFileSize = 2 << 20 // 2 MB 397 } 398 return o 399 } 400 401 // Options holds the optional parameters for configuring bitalostable. These options 402 // apply to the DB at large; per-query options are defined by the IterOptions 403 // and WriteOptions types. 
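//
// A minimal construction sketch. The values are purely illustrative, and
// NewCache, Open, Set, and Close are assumed to follow the Pebble-style
// signatures this package inherits:
//
//	opts := &bitalostable.Options{
//		Cache:                 bitalostable.NewCache(128 << 20), // 128 MB block cache
//		MemTableSize:          64 << 20,                         // 64 MB memtables
//		L0CompactionThreshold: 4,
//	}
//	db, err := bitalostable.Open("/path/to/db", opts)
//	if err != nil {
//		// handle the error
//	}
//	defer db.Close()
//	// WriteOptions are passed per operation, e.g. a synchronous write:
//	if err := db.Set([]byte("key"), []byte("value"), bitalostable.Sync); err != nil {
//		// handle the error
//	}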
404 type Options struct { 405 // Sync sstables periodically in order to smooth out writes to disk. This 406 // option does not provide any persistency guarantee, but is used to avoid 407 // latency spikes if the OS automatically decides to write out a large chunk 408 // of dirty filesystem buffers. This option only controls SSTable syncs; WAL 409 // syncs are controlled by WALBytesPerSync. 410 // 411 // The default value is 512KB. 412 BytesPerSync int 413 414 // Cache is used to cache uncompressed blocks from sstables. 415 // 416 // The default cache size is 8 MB. 417 Cache *cache.Cache 418 419 // Cleaner cleans obsolete files. 420 // 421 // The default cleaner uses the DeleteCleaner. 422 Cleaner Cleaner 423 424 // Comparer defines a total ordering over the space of []byte keys: a 'less 425 // than' relationship. The same comparison algorithm must be used for reads 426 // and writes over the lifetime of the DB. 427 // 428 // The default value uses the same ordering as bytes.Compare. 429 Comparer *Comparer 430 431 // DebugCheck is invoked, if non-nil, whenever a new version is being 432 // installed. Typically, this is set to bitalostable.DebugCheckLevels in tests 433 // or tools only, to check invariants over all the data in the database. 434 DebugCheck func(*DB) error 435 436 // Disable the write-ahead log (WAL). Disabling the write-ahead log prohibits 437 // crash recovery, but can improve performance if crash recovery is not 438 // needed (e.g. when only temporary state is being stored in the database). 439 // 440 // TODO(peter): untested 441 DisableWAL bool 442 443 // ErrorIfExists is whether it is an error if the database already exists. 444 // 445 // The default value is false. 446 ErrorIfExists bool 447 448 // ErrorIfNotExists is whether it is an error if the database does not 449 // already exist. 450 // 451 // The default value is false which will cause a database to be created if it 452 // does not already exist. 453 ErrorIfNotExists bool 454 455 // EventListener provides hooks to listening to significant DB events such as 456 // flushes, compactions, and table deletion. 457 EventListener EventListener 458 459 // Experimental contains experimental options which are off by default. 460 // These options are temporary and will eventually either be deleted, moved 461 // out of the experimental group, or made the non-adjustable default. These 462 // options may change at any time, so do not rely on them. 463 Experimental struct { 464 // The threshold of L0 read-amplification at which compaction concurrency 465 // is enabled (if CompactionDebtConcurrency was not already exceeded). 466 // Every multiple of this value enables another concurrent 467 // compaction up to MaxConcurrentCompactions. 468 L0CompactionConcurrency int 469 470 // CompactionDebtConcurrency controls the threshold of compaction debt 471 // at which additional compaction concurrency slots are added. For every 472 // multiple of this value in compaction debt bytes, an additional 473 // concurrent compaction is added. This works "on top" of 474 // L0CompactionConcurrency, so the higher of the count of compaction 475 // concurrency slots as determined by the two options is chosen. 476 CompactionDebtConcurrency int 477 478 // MinDeletionRate is the minimum number of bytes per second that would 479 // be deleted. Deletion pacing is used to slow down deletions when 480 // compactions finish up or readers close, and newly-obsolete files need 481 // cleaning up. 
Deleting lots of files at once can cause disk latency to 482 // go up on some SSDs, which this functionality guards against. This is a 483 // minimum as the maximum is theoretically unlimited; pacing is disabled 484 // when there are too many obsolete files relative to live bytes, or 485 // there isn't enough disk space available. Setting this to 0 disables 486 // deletion pacing, which is also the default. 487 MinDeletionRate int 488 489 // ReadCompactionRate controls the frequency of read triggered 490 // compactions by adjusting `AllowedSeeks` in manifest.FileMetadata: 491 // 492 // AllowedSeeks = FileSize / ReadCompactionRate 493 // 494 // From LevelDB: 495 // ``` 496 // We arrange to automatically compact this file after 497 // a certain number of seeks. Let's assume: 498 // (1) One seek costs 10ms 499 // (2) Writing or reading 1MB costs 10ms (100MB/s) 500 // (3) A compaction of 1MB does 25MB of IO: 501 // 1MB read from this level 502 // 10-12MB read from next level (boundaries may be misaligned) 503 // 10-12MB written to next level 504 // This implies that 25 seeks cost the same as the compaction 505 // of 1MB of data. I.e., one seek costs approximately the 506 // same as the compaction of 40KB of data. We are a little 507 // conservative and allow approximately one seek for every 16KB 508 // of data before triggering a compaction. 509 // ``` 510 ReadCompactionRate int64 511 512 // ReadSamplingMultiplier is a multiplier for the readSamplingPeriod in 513 // iterator.maybeSampleRead() to control the frequency of read sampling 514 // to trigger a read triggered compaction. A value of -1 prevents sampling 515 // and disables read triggered compactions. The default is 1 << 4. which 516 // gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB). 517 ReadSamplingMultiplier int64 518 519 // TableCacheShards is the number of shards per table cache. 520 // Reducing the value can reduce the number of idle goroutines per DB 521 // instance which can be useful in scenarios with a lot of DB instances 522 // and a large number of CPUs, but doing so can lead to higher contention 523 // in the table cache and reduced performance. 524 // 525 // The default value is the number of logical CPUs, which can be 526 // limited by runtime.GOMAXPROCS. 527 TableCacheShards int 528 529 // KeyValidationFunc is a function to validate a user key in an SSTable. 530 // 531 // Currently, this function is used to validate the smallest and largest 532 // keys in an SSTable undergoing compaction. In this case, returning an 533 // error from the validation function will result in a panic at runtime, 534 // given that there is rarely any way of recovering from malformed keys 535 // present in compacted files. By default, validation is not performed. 536 // 537 // Additional use-cases may be added in the future. 538 // 539 // NOTE: callers should take care to not mutate the key being validated. 540 KeyValidationFunc func(userKey []byte) error 541 542 // ValidateOnIngest schedules validation of sstables after they have 543 // been ingested. 544 // 545 // By default, this value is false. 546 ValidateOnIngest bool 547 548 // MultiLevelCompaction allows the compaction of SSTs from more than two 549 // levels iff a conventional two level compaction will quickly trigger a 550 // compaction in the output level. 551 MultiLevelCompaction bool 552 553 // MaxWriterConcurrency is used to indicate the maximum number of 554 // compression workers the compression queue is allowed to use. 
If 555 // MaxWriterConcurrency > 0, then the Writer will use parallelism, to 556 // compress and write blocks to disk. Otherwise, the writer will 557 // compress and write blocks to disk synchronously. 558 MaxWriterConcurrency int 559 560 // ForceWriterParallelism is used to force parallelism in the sstable 561 // Writer for the metamorphic tests. Even with the MaxWriterConcurrency 562 // option set, we only enable parallelism in the sstable Writer if there 563 // is enough CPU available, and this option bypasses that. 564 ForceWriterParallelism bool 565 566 // CPUWorkPermissionGranter should be set if Pebble should be given the 567 // ability to optionally schedule additional CPU. See the documentation 568 // for CPUWorkPermissionGranter for more details. 569 CPUWorkPermissionGranter CPUWorkPermissionGranter 570 } 571 572 // Filters is a map from filter policy name to filter policy. It is used for 573 // debugging tools which may be used on multiple databases configured with 574 // different filter policies. It is not necessary to populate this filters 575 // map during normal usage of a DB. 576 Filters map[string]FilterPolicy 577 578 // FlushDelayDeleteRange configures how long the database should wait before 579 // forcing a flush of a memtable that contains a range deletion. Disk space 580 // cannot be reclaimed until the range deletion is flushed. No automatic 581 // flush occurs if zero. 582 FlushDelayDeleteRange time.Duration 583 584 // FlushDelayRangeKey configures how long the database should wait before 585 // forcing a flush of a memtable that contains a range key. Range keys in 586 // the memtable prevent lazy combined iteration, so it's desirable to flush 587 // range keys promptly. No automatic flush occurs if zero. 588 FlushDelayRangeKey time.Duration 589 590 // FlushSplitBytes denotes the target number of bytes per sublevel in 591 // each flush split interval (i.e. range between two flush split keys) 592 // in L0 sstables. When set to zero, only a single sstable is generated 593 // by each flush. When set to a non-zero value, flushes are split at 594 // points to meet L0's TargetFileSize, any grandparent-related overlap 595 // options, and at boundary keys of L0 flush split intervals (which are 596 // targeted to contain around FlushSplitBytes bytes in each sublevel 597 // between pairs of boundary keys). Splitting sstables during flush 598 // allows increased compaction flexibility and concurrency when those 599 // tables are compacted to lower levels. 600 FlushSplitBytes int64 601 602 // FormatMajorVersion sets the format of on-disk files. It is 603 // recommended to set the format major version to an explicit 604 // version, as the default may change over time. 605 // 606 // At Open if the existing database is formatted using a later 607 // format major version that is known to this version of Pebble, 608 // Pebble will continue to use the later format major version. If 609 // the existing database's version is unknown, the caller may use 610 // FormatMostCompatible and will be able to open the database 611 // regardless of its actual version. 612 // 613 // If the existing database is formatted using a format major 614 // version earlier than the one specified, Open will automatically 615 // ratchet the database to the specified format major version. 616 FormatMajorVersion FormatMajorVersion 617 618 // FS provides the interface for persistent file storage. 619 // 620 // The default value uses the underlying operating system's file system. 
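//
// For example, tests frequently substitute an in-memory filesystem (a
// sketch, assuming the memory-backed implementation exposed by the vfs
// package):
//
//	opts := &bitalostable.Options{FS: vfs.NewMem()}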
621 FS vfs.FS 622 623 // The count of L0 files necessary to trigger an L0 compaction. 624 L0CompactionFileThreshold int 625 626 // The amount of L0 read-amplification necessary to trigger an L0 compaction. 627 L0CompactionThreshold int 628 629 // Hard limit on L0 read-amplification, computed as the number of L0 630 // sublevels. Writes are stopped when this threshold is reached. 631 L0StopWritesThreshold int 632 633 // The maximum number of bytes for LBase. The base level is the level which 634 // L0 is compacted into. The base level is determined dynamically based on 635 // the existing data in the LSM. The maximum number of bytes for other levels 636 // is computed dynamically based on the base level's maximum size. When the 637 // maximum number of bytes for a level is exceeded, compaction is requested. 638 LBaseMaxBytes int64 639 640 // Per-level options. Options for at least one level must be specified. The 641 // options for the last level are used for all subsequent levels. 642 Levels []LevelOptions 643 644 // Logger used to write log messages. 645 // 646 // The default logger uses the Go standard library log package. 647 Logger Logger 648 LogTag string 649 Verbose bool 650 651 // MaxManifestFileSize is the maximum size the MANIFEST file is allowed to 652 // become. When the MANIFEST exceeds this size it is rolled over and a new 653 // MANIFEST is created. 654 MaxManifestFileSize int64 655 656 // MaxOpenFiles is a soft limit on the number of open files that can be 657 // used by the DB. 658 // 659 // The default value is 1000. 660 MaxOpenFiles int 661 662 // The size of a MemTable in steady state. The actual MemTable size starts at 663 // min(256KB, MemTableSize) and doubles for each subsequent MemTable up to 664 // MemTableSize. This reduces the memory pressure caused by MemTables for 665 // short lived (test) DB instances. Note that more than one MemTable can be 666 // in existence since flushing a MemTable involves creating a new one and 667 // writing the contents of the old one in the 668 // background. MemTableStopWritesThreshold places a hard limit on the size of 669 // the queued MemTables. 670 MemTableSize int 671 672 // Hard limit on the size of queued of MemTables. Writes are stopped when the 673 // sum of the queued memtable sizes exceeds 674 // MemTableStopWritesThreshold*MemTableSize. This value should be at least 2 675 // or writes will stop whenever a MemTable is being flushed. 676 MemTableStopWritesThreshold int 677 678 // Merger defines the associative merge operation to use for merging values 679 // written with {Batch,DB}.Merge. 680 // 681 // The default merger concatenates values. 682 Merger *Merger 683 684 // MaxConcurrentCompactions specifies the maximum number of concurrent 685 // compactions. The default is 1. Concurrent compactions are performed 686 // - when L0 read-amplification passes the L0CompactionConcurrency threshold 687 // - for automatic background compactions 688 // - when a manual compaction for a level is split and parallelized 689 // MaxConcurrentCompactions must be greater than 0. 690 MaxConcurrentCompactions func() int 691 692 // DisableAutomaticCompactions dictates whether automatic compactions are 693 // scheduled or not. The default is false (enabled). This option is only used 694 // externally when running a manual compaction, and internally for tests. 
695 DisableAutomaticCompactions bool 696 697 // NoSyncOnClose decides whether the Pebble instance will enforce a 698 // close-time synchronization (e.g., fdatasync() or sync_file_range()) 699 // on files it writes to. Setting this to true removes the guarantee for a 700 // sync on close. Some implementations can still issue a non-blocking sync. 701 NoSyncOnClose bool 702 703 // NumPrevManifest is the number of non-current or older manifests which 704 // we want to keep around for debugging purposes. By default, we're going 705 // to keep one older manifest. 706 NumPrevManifest int 707 708 // ReadOnly indicates that the DB should be opened in read-only mode. Writes 709 // to the DB will return an error, background compactions are disabled, and 710 // the flush that normally occurs after replaying the WAL at startup is 711 // disabled. 712 ReadOnly bool 713 714 // TableCache is an initialized TableCache which should be set as an 715 // option if the DB needs to be initialized with a pre-existing table cache. 716 // If TableCache is nil, then a table cache which is unique to the DB instance 717 // is created. TableCache can be shared between db instances by setting it here. 718 // The TableCache set here must use the same underlying cache as Options.Cache 719 // and bitalostable will panic otherwise. 720 TableCache *TableCache 721 722 // TablePropertyCollectors is a list of TablePropertyCollector creation 723 // functions. A new TablePropertyCollector is created for each sstable built 724 // and lives for the lifetime of the table. 725 TablePropertyCollectors []func() TablePropertyCollector 726 727 // BlockPropertyCollectors is a list of BlockPropertyCollector creation 728 // functions. A new BlockPropertyCollector is created for each sstable 729 // built and lives for the lifetime of writing that table. 730 BlockPropertyCollectors []func() BlockPropertyCollector 731 732 // WALBytesPerSync sets the number of bytes to write to a WAL before calling 733 // Sync on it in the background. Just like with BytesPerSync above, this 734 // helps smooth out disk write latencies, and avoids cases where the OS 735 // writes a lot of buffered data to disk at once. However, this is less 736 // necessary with WALs, as many write operations already pass in 737 // Sync = true. 738 // 739 // The default value is 0, i.e. no background syncing. This matches the 740 // default behaviour in RocksDB. 741 WALBytesPerSync int 742 743 // WALDir specifies the directory to store write-ahead logs (WALs) in. If 744 // empty (the default), WALs will be stored in the same directory as sstables 745 // (i.e. the directory passed to bitalostable.Open). 746 WALDir string 747 748 // WALMinSyncInterval is the minimum duration between syncs of the WAL. If 749 // WAL syncs are requested faster than this interval, they will be 750 // artificially delayed. Introducing a small artificial delay (500us) between 751 // WAL syncs can allow more operations to arrive and reduce IO operations 752 // while having a minimal impact on throughput. This option is supplied as a 753 // closure in order to allow the value to be changed dynamically. The default 754 // value is 0. 755 // 756 // TODO(peter): rather than a closure, should there be another mechanism for 757 // changing options dynamically? 
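//
// For example, to impose a fixed 500-microsecond floor between WAL syncs
// (the value is illustrative, not a recommendation):
//
//	opts.WALMinSyncInterval = func() time.Duration {
//		return 500 * time.Microsecond
//	}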
758 WALMinSyncInterval func() time.Duration 759 760 Id int 761 762 FlushReporter func(int) 763 764 KvCheckExpireFunc func([]byte, []byte) bool 765 766 // private options are only used by internal tests or are used internally 767 // for facilitating upgrade paths of unconfigurable functionality. 768 private struct { 769 // strictWALTail configures whether or not a database's WALs created 770 // prior to the most recent one should be interpreted strictly, 771 // requiring a clean EOF. RocksDB 6.2.1 and the version of Pebble 772 // included in CockroachDB 20.1 do not guarantee that closed WALs end 773 // cleanly. If this option is set within an OPTIONS file, Pebble 774 // interprets previous WALs strictly, requiring a clean EOF. 775 // Otherwise, it interprets them permissively in the same manner as 776 // RocksDB 6.2.1. 777 strictWALTail bool 778 779 // A private option to disable stats collection. 780 disableTableStats bool 781 782 // fsCloser holds a closer that should be invoked after a DB using these 783 // Options is closed. This is used to automatically stop the 784 // long-running goroutine associated with the disk-health-checking FS. 785 // See the initialization of FS in EnsureDefaults. Note that care has 786 // been taken to ensure that it is still safe to continue using the FS 787 // after this closer has been invoked. However, if write operations 788 // against the FS are made after the DB is closed, the FS may leak a 789 // goroutine indefinitely. 790 fsCloser io.Closer 791 792 logInit bool 793 } 794 } 795 796 // DebugCheckLevels calls CheckLevels on the provided database. 797 // It may be set in the DebugCheck field of Options to check 798 // level invariants whenever a new version is installed. 799 func DebugCheckLevels(db *DB) error { 800 return db.CheckLevels(nil) 801 } 802 803 // EnsureDefaults ensures that the default values for all options are set if a 804 // valid value was not already specified. Returns the new options. 805 func (o *Options) EnsureDefaults() *Options { 806 if o == nil { 807 o = &Options{} 808 } 809 if o.BytesPerSync <= 0 { 810 o.BytesPerSync = 512 << 10 // 512 KB 811 } 812 if o.Cleaner == nil { 813 o.Cleaner = DeleteCleaner{} 814 } 815 if o.Comparer == nil { 816 o.Comparer = DefaultComparer 817 } 818 if o.Experimental.L0CompactionConcurrency <= 0 { 819 o.Experimental.L0CompactionConcurrency = 10 820 } 821 if o.Experimental.CompactionDebtConcurrency <= 0 { 822 o.Experimental.CompactionDebtConcurrency = 1 << 30 // 1 GB 823 } 824 if o.Experimental.KeyValidationFunc == nil { 825 o.Experimental.KeyValidationFunc = func([]byte) error { return nil } 826 } 827 if o.L0CompactionThreshold <= 0 { 828 o.L0CompactionThreshold = 4 829 } 830 if o.L0CompactionFileThreshold <= 0 { 831 // Some justification for the default of 500: 832 // Why not smaller?: 833 // - The default target file size for L0 is 2MB, so 500 files is <= 1GB 834 // of data. At observed compaction speeds of > 20MB/s, L0 can be 835 // cleared of all files in < 1min, so this backlog is not huge. 836 // - 500 files is low overhead for instantiating L0 sublevels from 837 // scratch. 838 // - Lower values were observed to cause excessive and inefficient 839 // compactions out of L0 in a TPCC import benchmark. 840 // Why not larger?: 841 // - More than 1min to compact everything out of L0. 842 // - CockroachDB's admission control system uses a threshold of 1000 843 // files to start throttling writes to Pebble. 
Using 500 here gives 844 // us headroom between when Pebble should start compacting L0 and 845 // when the admission control threshold is reached. 846 // 847 // We can revisit this default in the future based on better 848 // experimental understanding. 849 // 850 // TODO(jackson): Experiment with slightly lower thresholds [or higher 851 // admission control thresholds] to see whether a higher L0 score at the 852 // threshold (currently 2.0) is necessary for some workloads to avoid 853 // starving L0 in favor of lower-level compactions. 854 o.L0CompactionFileThreshold = 500 855 } 856 if o.L0StopWritesThreshold <= 0 { 857 o.L0StopWritesThreshold = 12 858 } 859 if o.LBaseMaxBytes <= 0 { 860 o.LBaseMaxBytes = 64 << 20 // 64 MB 861 } 862 if o.Levels == nil { 863 o.Levels = make([]LevelOptions, 1) 864 for i := range o.Levels { 865 if i > 0 { 866 l := &o.Levels[i] 867 if l.TargetFileSize <= 0 { 868 l.TargetFileSize = o.Levels[i-1].TargetFileSize * 2 869 } 870 } 871 o.Levels[i].EnsureDefaults() 872 } 873 } else { 874 for i := range o.Levels { 875 o.Levels[i].EnsureDefaults() 876 } 877 } 878 if o.Logger == nil { 879 o.Logger = DefaultLogger 880 } 881 o.EventListener.EnsureDefaults(o.Logger) 882 if o.MaxManifestFileSize == 0 { 883 o.MaxManifestFileSize = 128 << 20 // 128 MB 884 } 885 if o.MaxOpenFiles == 0 { 886 o.MaxOpenFiles = 1000 887 } 888 if o.MemTableSize <= 0 { 889 o.MemTableSize = 4 << 20 890 } 891 if o.MemTableStopWritesThreshold <= 0 { 892 o.MemTableStopWritesThreshold = 2 893 } 894 if o.Merger == nil { 895 o.Merger = DefaultMerger 896 } 897 if !o.private.logInit { 898 o.Logger = base.NewLogger(o.Logger, o.LogTag) 899 if o.Verbose { 900 o.EventListener = MakeLoggingEventListener(o.Logger) 901 } else { 902 o.EventListener.EnsureDefaults(o.Logger) 903 } 904 o.private.logInit = true 905 } 906 o.private.strictWALTail = true 907 if o.MaxConcurrentCompactions == nil { 908 o.MaxConcurrentCompactions = func() int { return 1 } 909 } 910 if o.NumPrevManifest <= 0 { 911 o.NumPrevManifest = 1 912 } 913 914 if o.FormatMajorVersion == FormatDefault { 915 o.FormatMajorVersion = FormatMostCompatible 916 } 917 918 if o.FS == nil { 919 o.FS, o.private.fsCloser = vfs.WithDiskHealthChecks(vfs.Default, 5*time.Second, 920 func(name string, duration time.Duration) { 921 o.EventListener.DiskSlow(DiskSlowInfo{ 922 Path: name, 923 Duration: duration, 924 }) 925 }) 926 } 927 if o.FlushSplitBytes <= 0 { 928 o.FlushSplitBytes = 2 * o.Levels[0].TargetFileSize 929 } 930 if o.Experimental.ReadCompactionRate == 0 { 931 o.Experimental.ReadCompactionRate = 16000 932 } 933 if o.Experimental.ReadSamplingMultiplier == 0 { 934 o.Experimental.ReadSamplingMultiplier = 1 << 4 935 } 936 if o.Experimental.TableCacheShards <= 0 { 937 o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0) 938 } 939 if o.KvCheckExpireFunc == nil { 940 o.KvCheckExpireFunc = func([]byte, []byte) bool { return false } 941 } 942 943 o.initMaps() 944 return o 945 } 946 947 func (o *Options) equal() Equal { 948 if o.Comparer.Equal == nil { 949 return bytes.Equal 950 } 951 return o.Comparer.Equal 952 } 953 954 // initMaps initializes the Comparers, Filters, and Mergers maps. 
955 func (o *Options) initMaps() { 956 for i := range o.Levels { 957 l := &o.Levels[i] 958 if l.FilterPolicy != nil { 959 if o.Filters == nil { 960 o.Filters = make(map[string]FilterPolicy) 961 } 962 name := l.FilterPolicy.Name() 963 if _, ok := o.Filters[name]; !ok { 964 o.Filters[name] = l.FilterPolicy 965 } 966 } 967 } 968 } 969 970 // Level returns the LevelOptions for the specified level. 971 func (o *Options) Level(level int) LevelOptions { 972 if level < len(o.Levels) { 973 return o.Levels[level] 974 } 975 n := len(o.Levels) - 1 976 l := o.Levels[n] 977 for i := n; i < level; i++ { 978 l.TargetFileSize *= 2 979 } 980 return l 981 } 982 983 // Clone creates a shallow-copy of the supplied options. 984 func (o *Options) Clone() *Options { 985 n := &Options{} 986 if o != nil { 987 *n = *o 988 } 989 return n 990 } 991 992 func filterPolicyName(p FilterPolicy) string { 993 if p == nil { 994 return "none" 995 } 996 return p.Name() 997 } 998 999 func (o *Options) String() string { 1000 var buf bytes.Buffer 1001 1002 cacheSize := int64(cacheDefaultSize) 1003 if o.Cache != nil { 1004 cacheSize = o.Cache.MaxSize() 1005 } 1006 1007 fmt.Fprintf(&buf, "[Version]\n") 1008 fmt.Fprintf(&buf, " bitalostable_version=0.1\n") 1009 fmt.Fprintf(&buf, "\n") 1010 fmt.Fprintf(&buf, "[Options]\n") 1011 fmt.Fprintf(&buf, " bytes_per_sync=%d\n", o.BytesPerSync) 1012 fmt.Fprintf(&buf, " cache_size=%d\n", cacheSize) 1013 fmt.Fprintf(&buf, " cleaner=%s\n", o.Cleaner) 1014 fmt.Fprintf(&buf, " compaction_debt_concurrency=%d\n", o.Experimental.CompactionDebtConcurrency) 1015 fmt.Fprintf(&buf, " comparer=%s\n", o.Comparer.Name) 1016 fmt.Fprintf(&buf, " disable_wal=%t\n", o.DisableWAL) 1017 fmt.Fprintf(&buf, " flush_delay_delete_range=%s\n", o.FlushDelayDeleteRange) 1018 fmt.Fprintf(&buf, " flush_delay_range_key=%s\n", o.FlushDelayRangeKey) 1019 fmt.Fprintf(&buf, " flush_split_bytes=%d\n", o.FlushSplitBytes) 1020 fmt.Fprintf(&buf, " format_major_version=%d\n", o.FormatMajorVersion) 1021 fmt.Fprintf(&buf, " l0_compaction_concurrency=%d\n", o.Experimental.L0CompactionConcurrency) 1022 fmt.Fprintf(&buf, " l0_compaction_file_threshold=%d\n", o.L0CompactionFileThreshold) 1023 fmt.Fprintf(&buf, " l0_compaction_threshold=%d\n", o.L0CompactionThreshold) 1024 fmt.Fprintf(&buf, " l0_stop_writes_threshold=%d\n", o.L0StopWritesThreshold) 1025 fmt.Fprintf(&buf, " lbase_max_bytes=%d\n", o.LBaseMaxBytes) 1026 fmt.Fprintf(&buf, " max_concurrent_compactions=%d\n", o.MaxConcurrentCompactions()) 1027 fmt.Fprintf(&buf, " max_manifest_file_size=%d\n", o.MaxManifestFileSize) 1028 fmt.Fprintf(&buf, " max_open_files=%d\n", o.MaxOpenFiles) 1029 fmt.Fprintf(&buf, " mem_table_size=%d\n", o.MemTableSize) 1030 fmt.Fprintf(&buf, " mem_table_stop_writes_threshold=%d\n", o.MemTableStopWritesThreshold) 1031 fmt.Fprintf(&buf, " min_deletion_rate=%d\n", o.Experimental.MinDeletionRate) 1032 fmt.Fprintf(&buf, " merger=%s\n", o.Merger.Name) 1033 fmt.Fprintf(&buf, " read_compaction_rate=%d\n", o.Experimental.ReadCompactionRate) 1034 fmt.Fprintf(&buf, " read_sampling_multiplier=%d\n", o.Experimental.ReadSamplingMultiplier) 1035 fmt.Fprintf(&buf, " strict_wal_tail=%t\n", o.private.strictWALTail) 1036 fmt.Fprintf(&buf, " table_cache_shards=%d\n", o.Experimental.TableCacheShards) 1037 fmt.Fprintf(&buf, " table_property_collectors=[") 1038 for i := range o.TablePropertyCollectors { 1039 if i > 0 { 1040 fmt.Fprintf(&buf, ",") 1041 } 1042 // NB: This creates a new TablePropertyCollector, but Options.String() is 1043 // called rarely so the overhead of doing so is 
not consequential. 1044 fmt.Fprintf(&buf, "%s", o.TablePropertyCollectors[i]().Name()) 1045 } 1046 fmt.Fprintf(&buf, "]\n") 1047 fmt.Fprintf(&buf, " validate_on_ingest=%t\n", o.Experimental.ValidateOnIngest) 1048 fmt.Fprintf(&buf, " wal_dir=%s\n", o.WALDir) 1049 fmt.Fprintf(&buf, " wal_bytes_per_sync=%d\n", o.WALBytesPerSync) 1050 fmt.Fprintf(&buf, " max_writer_concurrency=%d\n", o.Experimental.MaxWriterConcurrency) 1051 fmt.Fprintf(&buf, " force_writer_parallelism=%t\n", o.Experimental.ForceWriterParallelism) 1052 1053 for i := range o.Levels { 1054 l := &o.Levels[i] 1055 fmt.Fprintf(&buf, "\n") 1056 fmt.Fprintf(&buf, "[Level \"%d\"]\n", i) 1057 fmt.Fprintf(&buf, " block_restart_interval=%d\n", l.BlockRestartInterval) 1058 fmt.Fprintf(&buf, " block_size=%d\n", l.BlockSize) 1059 fmt.Fprintf(&buf, " compression=%s\n", l.Compression) 1060 fmt.Fprintf(&buf, " filter_policy=%s\n", filterPolicyName(l.FilterPolicy)) 1061 fmt.Fprintf(&buf, " filter_type=%s\n", l.FilterType) 1062 fmt.Fprintf(&buf, " index_block_size=%d\n", l.IndexBlockSize) 1063 fmt.Fprintf(&buf, " target_file_size=%d\n", l.TargetFileSize) 1064 } 1065 1066 return buf.String() 1067 } 1068 1069 func parseOptions(s string, fn func(section, key, value string) error) error { 1070 var section string 1071 for _, line := range strings.Split(s, "\n") { 1072 line = strings.TrimSpace(line) 1073 if len(line) == 0 { 1074 // Skip blank lines. 1075 continue 1076 } 1077 if line[0] == ';' || line[0] == '#' { 1078 // Skip comments. 1079 continue 1080 } 1081 n := len(line) 1082 if line[0] == '[' && line[n-1] == ']' { 1083 // Parse section. 1084 section = line[1 : n-1] 1085 continue 1086 } 1087 1088 pos := strings.Index(line, "=") 1089 if pos < 0 { 1090 return errors.Errorf("bitalostable: invalid key=value syntax: %s", errors.Safe(line)) 1091 } 1092 1093 key := strings.TrimSpace(line[:pos]) 1094 value := strings.TrimSpace(line[pos+1:]) 1095 1096 // RocksDB uses a similar (INI-style) syntax for the OPTIONS file, but 1097 // different section names and keys. The "CFOptions ..." paths are the 1098 // RocksDB versions which we map to the Pebble paths. 1099 mappedSection := section 1100 if section == `CFOptions "default"` { 1101 mappedSection = "Options" 1102 switch key { 1103 case "comparator": 1104 key = "comparer" 1105 case "merge_operator": 1106 key = "merger" 1107 } 1108 } 1109 1110 if err := fn(mappedSection, key, value); err != nil { 1111 return err 1112 } 1113 } 1114 return nil 1115 } 1116 1117 // ParseHooks contains callbacks to create options fields which can have 1118 // user-defined implementations. 1119 type ParseHooks struct { 1120 NewCache func(size int64) *Cache 1121 NewCleaner func(name string) (Cleaner, error) 1122 NewComparer func(name string) (*Comparer, error) 1123 NewFilterPolicy func(name string) (FilterPolicy, error) 1124 NewMerger func(name string) (*Merger, error) 1125 SkipUnknown func(name, value string) bool 1126 } 1127 1128 // Parse parses the options from the specified string. Note that certain 1129 // options cannot be parsed into populated fields. For example, comparer and 1130 // merger. 1131 func (o *Options) Parse(s string, hooks *ParseHooks) error { 1132 return parseOptions(s, func(section, key, value string) error { 1133 // WARNING: DO NOT remove entries from the switches below because doing so 1134 // causes a key previously written to the OPTIONS file to be considered unknown, 1135 // a backwards incompatible change. Instead, leave in support for parsing the 1136 // key but simply don't parse the value. 
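// The input is the INI-style text emitted by Options.String. A short
// illustrative fragment (all values are examples only):
//
//	[Options]
//	  max_open_files=1000
//	  mem_table_size=4194304
//
//	[Level "0"]
//	  block_size=4096
//	  target_file_size=2097152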
1137 1138 switch { 1139 case section == "Version": 1140 switch key { 1141 case "bitalostable_version": 1142 default: 1143 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { 1144 return nil 1145 } 1146 return errors.Errorf("bitalostable: unknown option: %s.%s", 1147 errors.Safe(section), errors.Safe(key)) 1148 } 1149 return nil 1150 1151 case section == "Options": 1152 var err error 1153 switch key { 1154 case "bytes_per_sync": 1155 o.BytesPerSync, err = strconv.Atoi(value) 1156 case "cache_size": 1157 var n int64 1158 n, err = strconv.ParseInt(value, 10, 64) 1159 if err == nil && hooks != nil && hooks.NewCache != nil { 1160 if o.Cache != nil { 1161 o.Cache.Unref() 1162 } 1163 o.Cache = hooks.NewCache(n) 1164 } 1165 // We avoid calling cache.New in parsing because it makes it 1166 // too easy to leak a cache. 1167 case "cleaner": 1168 switch value { 1169 case "archive": 1170 o.Cleaner = ArchiveCleaner{} 1171 case "delete": 1172 o.Cleaner = DeleteCleaner{} 1173 default: 1174 if hooks != nil && hooks.NewCleaner != nil { 1175 o.Cleaner, err = hooks.NewCleaner(value) 1176 } 1177 } 1178 case "comparer": 1179 switch value { 1180 case "leveldb.BytewiseComparator": 1181 o.Comparer = DefaultComparer 1182 default: 1183 if hooks != nil && hooks.NewComparer != nil { 1184 o.Comparer, err = hooks.NewComparer(value) 1185 } 1186 } 1187 case "compaction_debt_concurrency": 1188 o.Experimental.CompactionDebtConcurrency, err = strconv.Atoi(value) 1189 case "delete_range_flush_delay": 1190 // NB: This is a deprecated serialization of the 1191 // `flush_delay_delete_range`. 1192 o.FlushDelayDeleteRange, err = time.ParseDuration(value) 1193 case "disable_wal": 1194 o.DisableWAL, err = strconv.ParseBool(value) 1195 case "flush_delay_delete_range": 1196 o.FlushDelayDeleteRange, err = time.ParseDuration(value) 1197 case "flush_delay_range_key": 1198 o.FlushDelayRangeKey, err = time.ParseDuration(value) 1199 case "flush_split_bytes": 1200 o.FlushSplitBytes, err = strconv.ParseInt(value, 10, 64) 1201 case "format_major_version": 1202 // NB: The version written here may be stale. Open does 1203 // not use the format major version encoded in the 1204 // OPTIONS file other than to validate that the encoded 1205 // version is valid right here. 1206 var v uint64 1207 v, err = strconv.ParseUint(value, 10, 64) 1208 if vers := FormatMajorVersion(v); vers > FormatNewest || vers == FormatDefault { 1209 err = errors.Newf("unknown format major version %d", o.FormatMajorVersion) 1210 } 1211 if err == nil { 1212 o.FormatMajorVersion = FormatMajorVersion(v) 1213 } 1214 case "l0_compaction_concurrency": 1215 o.Experimental.L0CompactionConcurrency, err = strconv.Atoi(value) 1216 case "l0_compaction_file_threshold": 1217 o.L0CompactionFileThreshold, err = strconv.Atoi(value) 1218 case "l0_compaction_threshold": 1219 o.L0CompactionThreshold, err = strconv.Atoi(value) 1220 case "l0_stop_writes_threshold": 1221 o.L0StopWritesThreshold, err = strconv.Atoi(value) 1222 case "l0_sublevel_compactions": 1223 // Do nothing; option existed in older versions of bitalostable. 
1224 case "lbase_max_bytes": 1225 o.LBaseMaxBytes, err = strconv.ParseInt(value, 10, 64) 1226 case "max_concurrent_compactions": 1227 var concurrentCompactions int 1228 concurrentCompactions, err = strconv.Atoi(value) 1229 if concurrentCompactions <= 0 { 1230 err = errors.New("max_concurrent_compactions cannot be <= 0") 1231 } else { 1232 o.MaxConcurrentCompactions = func() int { return concurrentCompactions } 1233 } 1234 case "max_manifest_file_size": 1235 o.MaxManifestFileSize, err = strconv.ParseInt(value, 10, 64) 1236 case "max_open_files": 1237 o.MaxOpenFiles, err = strconv.Atoi(value) 1238 case "mem_table_size": 1239 o.MemTableSize, err = strconv.Atoi(value) 1240 case "mem_table_stop_writes_threshold": 1241 o.MemTableStopWritesThreshold, err = strconv.Atoi(value) 1242 case "min_compaction_rate": 1243 // Do nothing; option existed in older versions of bitalostable, and 1244 // may be meaningful again eventually. 1245 case "min_deletion_rate": 1246 o.Experimental.MinDeletionRate, err = strconv.Atoi(value) 1247 case "min_flush_rate": 1248 // Do nothing; option existed in older versions of bitalostable, and 1249 // may be meaningful again eventually. 1250 case "strict_wal_tail": 1251 o.private.strictWALTail, err = strconv.ParseBool(value) 1252 case "merger": 1253 switch value { 1254 case "nullptr": 1255 o.Merger = nil 1256 case "bitalostable.concatenate": 1257 o.Merger = DefaultMerger 1258 default: 1259 if hooks != nil && hooks.NewMerger != nil { 1260 o.Merger, err = hooks.NewMerger(value) 1261 } 1262 } 1263 case "read_compaction_rate": 1264 o.Experimental.ReadCompactionRate, err = strconv.ParseInt(value, 10, 64) 1265 case "read_sampling_multiplier": 1266 o.Experimental.ReadSamplingMultiplier, err = strconv.ParseInt(value, 10, 64) 1267 case "table_cache_shards": 1268 o.Experimental.TableCacheShards, err = strconv.Atoi(value) 1269 case "table_format": 1270 switch value { 1271 case "leveldb": 1272 case "rocksdbv2": 1273 default: 1274 return errors.Errorf("bitalostable: unknown table format: %q", errors.Safe(value)) 1275 } 1276 case "table_property_collectors": 1277 // TODO(peter): set o.TablePropertyCollectors 1278 case "validate_on_ingest": 1279 o.Experimental.ValidateOnIngest, err = strconv.ParseBool(value) 1280 case "wal_dir": 1281 o.WALDir = value 1282 case "wal_bytes_per_sync": 1283 o.WALBytesPerSync, err = strconv.Atoi(value) 1284 case "max_writer_concurrency": 1285 o.Experimental.MaxWriterConcurrency, err = strconv.Atoi(value) 1286 case "force_writer_parallelism": 1287 o.Experimental.ForceWriterParallelism, err = strconv.ParseBool(value) 1288 default: 1289 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { 1290 return nil 1291 } 1292 return errors.Errorf("bitalostable: unknown option: %s.%s", 1293 errors.Safe(section), errors.Safe(key)) 1294 } 1295 return err 1296 1297 case strings.HasPrefix(section, "Level "): 1298 var index int 1299 if n, err := fmt.Sscanf(section, `Level "%d"`, &index); err != nil { 1300 return err 1301 } else if n != 1 { 1302 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section, value) { 1303 return nil 1304 } 1305 return errors.Errorf("bitalostable: unknown section: %q", errors.Safe(section)) 1306 } 1307 1308 if len(o.Levels) <= index { 1309 newLevels := make([]LevelOptions, index+1) 1310 copy(newLevels, o.Levels) 1311 o.Levels = newLevels 1312 } 1313 l := &o.Levels[index] 1314 1315 var err error 1316 switch key { 1317 case "block_restart_interval": 1318 l.BlockRestartInterval, err = 
strconv.Atoi(value) 1319 case "block_size": 1320 l.BlockSize, err = strconv.Atoi(value) 1321 case "compression": 1322 switch value { 1323 case "Default": 1324 l.Compression = DefaultCompression 1325 case "NoCompression": 1326 l.Compression = NoCompression 1327 case "Snappy": 1328 l.Compression = SnappyCompression 1329 case "ZSTD": 1330 l.Compression = ZstdCompression 1331 default: 1332 return errors.Errorf("bitalostable: unknown compression: %q", errors.Safe(value)) 1333 } 1334 case "filter_policy": 1335 if hooks != nil && hooks.NewFilterPolicy != nil { 1336 l.FilterPolicy, err = hooks.NewFilterPolicy(value) 1337 } 1338 case "filter_type": 1339 switch value { 1340 case "table": 1341 l.FilterType = TableFilter 1342 default: 1343 return errors.Errorf("bitalostable: unknown filter type: %q", errors.Safe(value)) 1344 } 1345 case "index_block_size": 1346 l.IndexBlockSize, err = strconv.Atoi(value) 1347 case "target_file_size": 1348 l.TargetFileSize, err = strconv.ParseInt(value, 10, 64) 1349 default: 1350 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { 1351 return nil 1352 } 1353 return errors.Errorf("bitalostable: unknown option: %s.%s", errors.Safe(section), errors.Safe(key)) 1354 } 1355 return err 1356 } 1357 if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { 1358 return nil 1359 } 1360 return errors.Errorf("bitalostable: unknown section: %q", errors.Safe(section)) 1361 }) 1362 } 1363 1364 func (o *Options) checkOptions(s string) (strictWALTail bool, err error) { 1365 // TODO(jackson): Refactor to avoid awkwardness of the strictWALTail return value. 1366 return strictWALTail, parseOptions(s, func(section, key, value string) error { 1367 switch section + "." + key { 1368 case "Options.comparer": 1369 if value != o.Comparer.Name { 1370 return errors.Errorf("bitalostable: comparer name from file %q != comparer name from options %q", 1371 errors.Safe(value), errors.Safe(o.Comparer.Name)) 1372 } 1373 case "Options.merger": 1374 // RocksDB allows the merge operator to be unspecified, in which case it 1375 // shows up as "nullptr". 1376 if value != "nullptr" && value != o.Merger.Name { 1377 return errors.Errorf("bitalostable: merger name from file %q != merger name from options %q", 1378 errors.Safe(value), errors.Safe(o.Merger.Name)) 1379 } 1380 case "Options.strict_wal_tail": 1381 strictWALTail, err = strconv.ParseBool(value) 1382 if err != nil { 1383 return errors.Errorf("bitalostable: error parsing strict_wal_tail value %q: %w", value, err) 1384 } 1385 } 1386 return nil 1387 }) 1388 } 1389 1390 // Check verifies the options are compatible with the previous options 1391 // serialized by Options.String(). For example, the Comparer and Merger must be 1392 // the same, or data will not be able to be properly read from the DB. 1393 func (o *Options) Check(s string) error { 1394 _, err := o.checkOptions(s) 1395 return err 1396 } 1397 1398 // Validate verifies that the options are mutually consistent. For example, 1399 // L0StopWritesThreshold must be >= L0CompactionThreshold, otherwise a write 1400 // stall would persist indefinitely. 1401 func (o *Options) Validate() error { 1402 // Note that we can presume Options.EnsureDefaults has been called, so there 1403 // is no need to check for zero values. 
1404 1405 var buf strings.Builder 1406 if o.Experimental.L0CompactionConcurrency < 1 { 1407 fmt.Fprintf(&buf, "L0CompactionConcurrency (%d) must be >= 1\n", 1408 o.Experimental.L0CompactionConcurrency) 1409 } 1410 if o.L0StopWritesThreshold < o.L0CompactionThreshold { 1411 fmt.Fprintf(&buf, "L0StopWritesThreshold (%d) must be >= L0CompactionThreshold (%d)\n", 1412 o.L0StopWritesThreshold, o.L0CompactionThreshold) 1413 } 1414 if uint64(o.MemTableSize) >= maxMemTableSize { 1415 fmt.Fprintf(&buf, "MemTableSize (%s) must be < %s\n", 1416 humanize.Uint64(uint64(o.MemTableSize)), humanize.Uint64(maxMemTableSize)) 1417 } 1418 if o.MemTableStopWritesThreshold < 2 { 1419 fmt.Fprintf(&buf, "MemTableStopWritesThreshold (%d) must be >= 2\n", 1420 o.MemTableStopWritesThreshold) 1421 } 1422 if o.FormatMajorVersion > FormatNewest { 1423 fmt.Fprintf(&buf, "FormatMajorVersion (%d) must be <= %d\n", 1424 o.FormatMajorVersion, FormatNewest) 1425 } 1426 if o.TableCache != nil && o.Cache != o.TableCache.cache { 1427 fmt.Fprintf(&buf, "underlying cache in the TableCache and the Cache don't match\n") 1428 } 1429 if buf.Len() == 0 { 1430 return nil 1431 } 1432 return errors.New(buf.String()) 1433 } 1434 1435 // MakeReaderOptions constructs sstable.ReaderOptions from the corresponding 1436 // options in the receiver. 1437 func (o *Options) MakeReaderOptions() sstable.ReaderOptions { 1438 var readerOpts sstable.ReaderOptions 1439 if o != nil { 1440 readerOpts.Cache = o.Cache 1441 readerOpts.Comparer = o.Comparer 1442 readerOpts.Filters = o.Filters 1443 if o.Merger != nil { 1444 readerOpts.MergerName = o.Merger.Name 1445 } 1446 } 1447 return readerOpts 1448 } 1449 1450 // MakeWriterOptions constructs sstable.WriterOptions for the specified level 1451 // from the corresponding options in the receiver. 1452 func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstable.WriterOptions { 1453 var writerOpts sstable.WriterOptions 1454 writerOpts.TableFormat = format 1455 if o != nil { 1456 writerOpts.Cache = o.Cache 1457 writerOpts.Comparer = o.Comparer 1458 if o.Merger != nil { 1459 writerOpts.MergerName = o.Merger.Name 1460 } 1461 writerOpts.TablePropertyCollectors = o.TablePropertyCollectors 1462 writerOpts.BlockPropertyCollectors = o.BlockPropertyCollectors 1463 } 1464 levelOpts := o.Level(level) 1465 writerOpts.BlockRestartInterval = levelOpts.BlockRestartInterval 1466 writerOpts.BlockSize = levelOpts.BlockSize 1467 writerOpts.BlockSizeThreshold = levelOpts.BlockSizeThreshold 1468 writerOpts.Compression = levelOpts.Compression 1469 writerOpts.FilterPolicy = levelOpts.FilterPolicy 1470 writerOpts.FilterType = levelOpts.FilterType 1471 writerOpts.IndexBlockSize = levelOpts.IndexBlockSize 1472 return writerOpts 1473 }
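
// exampleOptionsRoundTrip is an illustrative sketch, not referenced by the
// package, showing how the OPTIONS text produced by Options.String can be
// re-parsed with Parse and verified with Check and Validate. The in-memory
// filesystem and the chosen sizes are assumptions made for the example.
func exampleOptionsRoundTrip() error {
	// Start from a small set of overrides and fill in the remaining defaults.
	opts := (&Options{
		FS:           vfs.NewMem(), // assumed memory-backed vfs, as commonly used in tests
		MemTableSize: 8 << 20,
	}).EnsureDefaults()

	// Serialize the options in the same INI-style format that is written to
	// the OPTIONS file when a DB is opened.
	s := opts.String()

	// A fresh Options value can be populated from that serialization. A nil
	// *ParseHooks leaves user-defined comparers, mergers, etc. unresolved.
	var parsed Options
	if err := parsed.Parse(s, nil); err != nil {
		return err
	}

	// Check confirms the serialized options are compatible with the receiver
	// (e.g. the Comparer and Merger names match); Validate checks that the
	// options are mutually consistent.
	if err := opts.Check(s); err != nil {
		return err
	}
	return opts.Validate()
}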