github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/base/options.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package base

import (
	"bytes"
	"fmt"
	"strings"

	"github.com/petermattis/pebble/cache"
	"github.com/petermattis/pebble/vfs"
)

// Compression is the per-block compression algorithm to use.
type Compression int

// The available compression types.
const (
	DefaultCompression Compression = iota
	NoCompression
	SnappyCompression
	nCompression
)

func (c Compression) String() string {
	switch c {
	case DefaultCompression:
		return "Default"
	case NoCompression:
		return "NoCompression"
	case SnappyCompression:
		return "Snappy"
	default:
		return "Unknown"
	}
}

// FilterType is the level at which to apply a filter: block or table.
type FilterType int

// The available filter types.
const (
	TableFilter FilterType = iota
)

func (t FilterType) String() string {
	switch t {
	case TableFilter:
		return "table"
	}
	return "unknown"
}

// FilterWriter provides an interface for creating filter blocks. See
// FilterPolicy for more details about filters.
type FilterWriter interface {
	// AddKey adds a key to the current filter block.
	AddKey(key []byte)

	// Finish appends to dst an encoded filter that holds the current set of
	// keys. The writer state is reset after the call to Finish, allowing the
	// writer to be reused for the creation of additional filters.
	Finish(dst []byte) []byte
}

// FilterPolicy is an algorithm for probabilistically encoding a set of keys.
// The canonical implementation is a Bloom filter.
//
// Every FilterPolicy has a name. This names the algorithm itself, not any one
// particular instance. Aspects specific to a particular instance, such as the
// set of keys or any other parameters, will be encoded in the []byte filter
// returned by NewWriter.
//
// The name may be written to files on disk, along with the filter data. To use
// these filters, the FilterPolicy name at the time of writing must equal the
// name at the time of reading. If they do not match, the filters will be
// ignored, which will not affect correctness but may affect performance.
type FilterPolicy interface {
	// Name names the filter policy.
	Name() string

	// MayContain returns whether the encoded filter may contain the given key.
	// False positives are possible, where it returns true for keys not in the
	// original set.
	MayContain(ftype FilterType, filter, key []byte) bool

	// NewWriter creates a new FilterWriter.
	NewWriter(ftype FilterType) FilterWriter
}

func filterPolicyName(p FilterPolicy) string {
	if p == nil {
		return "none"
	}
	return p.Name()
}
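// The following is an illustrative sketch and not part of the original file:
// a toy FilterPolicy that encodes the key set exactly (length-prefixed) rather
// than probabilistically. It exists only to make the FilterWriter/FilterPolicy
// contract above concrete; a real implementation would be something like
// bloom.FilterPolicy(10) from the pebble/bloom package. The "example.exact"
// name and example* identifiers are made up for this sketch.
type exampleExactFilterPolicy struct{}

func (exampleExactFilterPolicy) Name() string { return "example.exact" }

func (exampleExactFilterPolicy) MayContain(ftype FilterType, filter, key []byte) bool {
	// Decode the length-prefixed keys written by Finish and compare each one.
	for len(filter) >= 4 {
		n := int(filter[0])<<24 | int(filter[1])<<16 | int(filter[2])<<8 | int(filter[3])
		filter = filter[4:]
		if n > len(filter) {
			return true // malformed filter: err on the side of a false positive
		}
		if bytes.Equal(filter[:n], key) {
			return true
		}
		filter = filter[n:]
	}
	return false
}

func (exampleExactFilterPolicy) NewWriter(ftype FilterType) FilterWriter {
	return &exampleExactFilterWriter{}
}

type exampleExactFilterWriter struct {
	keys [][]byte
}

func (w *exampleExactFilterWriter) AddKey(key []byte) {
	w.keys = append(w.keys, append([]byte(nil), key...))
}

func (w *exampleExactFilterWriter) Finish(dst []byte) []byte {
	// Append a trivially decodable encoding: a 4-byte big-endian length followed
	// by the key bytes. A real filter would be far more compact.
	for _, k := range w.keys {
		n := len(k)
		dst = append(dst, byte(n>>24), byte(n>>16), byte(n>>8), byte(n))
		dst = append(dst, k...)
	}
	w.keys = w.keys[:0] // reset so the writer can be reused, per the contract
	return dst
}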
// TableFormat specifies the format version for sstables. The legacy LevelDB
// format is format version 0.
type TableFormat uint32

// The available table formats. Note that these values are not (and should not
// be) serialized to disk. TableFormatRocksDBv2 is the default if otherwise
// unspecified.
const (
	TableFormatRocksDBv2 TableFormat = iota
	TableFormatLevelDB
)

// TablePropertyCollector provides a hook for collecting user-defined
// properties based on the keys and values stored in an sstable. A new
// TablePropertyCollector is created for an sstable when the sstable is being
// written.
type TablePropertyCollector interface {
	// Add is called with each new entry added to the sstable. While the sstable
	// is itself sorted by key, do not assume that the entries are added in any
	// order. In particular, the ordering of point entries and range tombstones
	// is unspecified.
	Add(key InternalKey, value []byte) error

	// Finish is called when all entries have been added to the sstable. The
	// collected properties (if any) should be added to the specified map. Note
	// that in case of an error during sstable construction, Finish may not be
	// called.
	Finish(userProps map[string]string) error

	// Name returns the name of the property collector.
	Name() string
}
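// Illustrative sketch, not part of the original file: a minimal
// TablePropertyCollector that records the number of entries added to an
// sstable. The "example.entry-count" property name and the example* identifier
// are made up for this sketch.
type exampleEntryCountCollector struct {
	count int
}

func (c *exampleEntryCountCollector) Add(key InternalKey, value []byte) error {
	// Entries may arrive in any order, so only accumulate an order-independent
	// statistic here.
	c.count++
	return nil
}

func (c *exampleEntryCountCollector) Finish(userProps map[string]string) error {
	userProps["example.entry-count"] = fmt.Sprint(c.count)
	return nil
}

func (c *exampleEntryCountCollector) Name() string { return "example.entry-count" }

// It could be registered via Options.TablePropertyCollectors:
//
//	opts.TablePropertyCollectors = append(opts.TablePropertyCollectors,
//		func() TablePropertyCollector { return &exampleEntryCountCollector{} })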
// LevelOptions holds the optional per-level parameters.
type LevelOptions struct {
	// BlockRestartInterval is the number of keys between restart points
	// for delta encoding of keys.
	//
	// The default value is 16.
	BlockRestartInterval int

	// BlockSize is the target uncompressed size in bytes of each table block.
	//
	// The default value is 4096.
	BlockSize int

	// BlockSizeThreshold finishes a block if the block size is larger than the
	// specified percentage of the target block size and adding the next entry
	// would cause the block to be larger than the target block size.
	//
	// The default value is 90.
	BlockSizeThreshold int

	// Compression defines the per-block compression to use.
	//
	// The default value (DefaultCompression) uses snappy compression.
	Compression Compression

	// FilterPolicy defines a filter algorithm (such as a Bloom filter) that can
	// reduce disk reads for Get calls.
	//
	// One such implementation is bloom.FilterPolicy(10) from the pebble/bloom
	// package.
	//
	// The default value means to use no filter.
	FilterPolicy FilterPolicy

	// FilterType defines whether an existing filter policy is applied at a
	// block-level or table-level. Block-level filters use less memory to create,
	// but are slower to access as a check for the key in the index must first be
	// performed to locate the filter block. A table-level filter will require
	// memory proportional to the number of keys in an sstable to create, but
	// avoids the index lookup when determining if a key is present. Table-level
	// filters should be preferred except under constrained memory situations.
	FilterType FilterType

	// IndexBlockSize is the target uncompressed size in bytes of each index
	// block. When the index block size is larger than this target, two-level
	// indexes are automatically enabled. Setting this option to a large value
	// (such as math.MaxInt32) disables the automatic creation of two-level
	// indexes.
	//
	// The default value is the value of BlockSize.
	IndexBlockSize int

	// The target file size for the level.
	TargetFileSize int64
}

// EnsureDefaults ensures that the default values for all of the options have
// been initialized. It is valid to call EnsureDefaults on a nil receiver. A
// non-nil result will always be returned.
func (o *LevelOptions) EnsureDefaults() *LevelOptions {
	if o == nil {
		o = &LevelOptions{}
	}
	if o.BlockRestartInterval <= 0 {
		o.BlockRestartInterval = 16
	}
	if o.BlockSize <= 0 {
		o.BlockSize = 4096
	}
	if o.BlockSizeThreshold <= 0 {
		o.BlockSizeThreshold = 90
	}
	if o.Compression <= DefaultCompression || o.Compression >= nCompression {
		o.Compression = SnappyCompression
	}
	if o.IndexBlockSize <= 0 {
		o.IndexBlockSize = o.BlockSize
	}
	if o.TargetFileSize <= 0 {
		o.TargetFileSize = 2 << 20 // 2 MB
	}
	return o
}
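// Illustrative sketch, not part of the original file: EnsureDefaults can be
// called on a nil *LevelOptions, which is convenient when a level has no
// explicit configuration. The example* identifier is made up for this sketch.
func exampleLevelOptionsDefaults() {
	var unset *LevelOptions
	l := unset.EnsureDefaults()                // returns a fully populated &LevelOptions{}
	fmt.Println(l.BlockSize, l.TargetFileSize) // 4096, 2097152 (2 MB)
}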
// Options holds the optional parameters for configuring pebble. These options
// apply to the DB at large; per-query options are defined by the IterOptions
// and WriteOptions types.
type Options struct {
	// Sync sstables and the WAL periodically in order to smooth out writes to
	// disk. This option does not provide any persistency guarantee, but is used
	// to avoid latency spikes if the OS automatically decides to write out a
	// large chunk of dirty filesystem buffers.
	//
	// The default value is 512KB.
	BytesPerSync int

	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default cache size is 8 MB.
	Cache *cache.Cache

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// Disable the write-ahead log (WAL). Disabling the write-ahead log prohibits
	// crash recovery, but can improve performance if crash recovery is not
	// needed (e.g. when only temporary state is being stored in the database).
	//
	// TODO(peter): untested
	DisableWAL bool

	// ErrorIfDBExists is whether it is an error if the database already exists.
	//
	// The default value is false.
	ErrorIfDBExists bool

	// EventListener provides hooks for listening to significant DB events such
	// as flushes, compactions, and table deletion.
	EventListener EventListener

	// Filters is a map from filter policy name to filter policy. It is used for
	// debugging tools which may be used on multiple databases configured with
	// different filter policies. It is not necessary to populate this map
	// during normal usage of a DB.
	Filters map[string]FilterPolicy

	// FS provides the interface for persistent file storage.
	//
	// The default value uses the underlying operating system's file system.
	FS vfs.FS

	// The number of files necessary to trigger an L0 compaction.
	L0CompactionThreshold int

	// Hard limit on the number of L0 files. Writes are stopped when this
	// threshold is reached.
	L0StopWritesThreshold int

	// The maximum number of bytes for LBase. The base level is the level which
	// L0 is compacted into. The base level is determined dynamically based on
	// the existing data in the LSM. The maximum number of bytes for other levels
	// is computed dynamically based on the base level's maximum size. When the
	// maximum number of bytes for a level is exceeded, compaction is requested.
	LBaseMaxBytes int64

	// Per-level options. Options for at least one level must be specified. The
	// options for the last level are used for all subsequent levels.
	Levels []LevelOptions

	// Logger used to write log messages.
	//
	// The default logger uses the Go standard library log package.
	Logger Logger

	// MaxManifestFileSize is the maximum size the MANIFEST file is allowed to
	// become. When the MANIFEST exceeds this size it is rolled over and a new
	// MANIFEST is created.
	MaxManifestFileSize int64

	// MaxOpenFiles is a soft limit on the number of open files that can be
	// used by the DB.
	//
	// The default value is 1000.
	MaxOpenFiles int

	// The size of a MemTable. Note that more than one MemTable can be in
	// existence since flushing a MemTable involves creating a new one and
	// writing the contents of the old one in the
	// background. MemTableStopWritesThreshold places a hard limit on the number
	// of MemTables allowed at once.
	MemTableSize int

	// Hard limit on the number of MemTables. Writes are stopped when this number
	// is reached. This value should be at least 2 or writes will stop whenever
	// the MemTable is being flushed.
	MemTableStopWritesThreshold int

	// Merger defines the associative merge operation to use for merging values
	// written with {Batch,DB}.Merge.
	//
	// The default merger concatenates values.
	Merger *Merger

	// MinCompactionRate sets the minimum rate at which compactions occur. The
	// default is 4 MB/s.
	MinCompactionRate int

	// MinFlushRate sets the minimum rate at which the MemTables are flushed. The
	// default is 1 MB/s.
	MinFlushRate int

	// ReadOnly indicates that the DB should be opened in read-only mode. Writes
	// to the DB will return an error, background compactions are disabled, and
	// the flush that normally occurs after replaying the WAL at startup is
	// disabled.
	ReadOnly bool

	// TableFormat specifies the format version for sstables. The default is
	// TableFormatRocksDBv2 which creates RocksDB compatible sstables. Use
	// TableFormatLevelDB to create LevelDB compatible sstables which can be used
	// by a wider range of tools and libraries.
	//
	// TODO(peter): TableFormatLevelDB does not support all of the functionality
	// of TableFormatRocksDBv2. We should ensure it is only used when writing an
	// sstable directly, and not used when opening a database.
	TableFormat TableFormat

	// TablePropertyCollectors is a list of TablePropertyCollector creation
	// functions. A new TablePropertyCollector is created for each sstable built
	// and lives for the lifetime of the table.
	TablePropertyCollectors []func() TablePropertyCollector

	// WALDir specifies the directory to store write-ahead logs (WALs) in. If
	// empty (the default), WALs will be stored in the same directory as sstables
	// (i.e. the directory passed to pebble.Open).
	WALDir string
}
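// Illustrative sketch, not part of the original file: a typical Options value
// before EnsureDefaults is applied. Only fields that differ from the defaults
// need to be specified; the remaining fields are filled in by EnsureDefaults
// below. The particular sizes chosen here are arbitrary examples.
func exampleOptions() *Options {
	opts := &Options{
		Cache:        cache.New(64 << 20), // 64 MB block cache instead of the 8 MB default
		MemTableSize: 16 << 20,            // 16 MB memtables
		Levels: []LevelOptions{
			{BlockSize: 32 << 10, TargetFileSize: 4 << 20},
		},
		WALDir: "", // store WALs alongside the sstables (the default)
	}
	return opts.EnsureDefaults()
}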
// EnsureDefaults ensures that the default values for all options are set if a
// valid value was not already specified. Returns the new options.
func (o *Options) EnsureDefaults() *Options {
	if o == nil {
		o = &Options{}
	}
	if o.BytesPerSync <= 0 {
		o.BytesPerSync = 512 << 10 // 512 KB
	}
	if o.Cache == nil {
		o.Cache = cache.New(8 << 20) // 8 MB
	}
	if o.Comparer == nil {
		o.Comparer = DefaultComparer
	}
	if o.FS == nil {
		o.FS = vfs.Default
	}
	if o.L0CompactionThreshold <= 0 {
		o.L0CompactionThreshold = 4
	}
	if o.L0StopWritesThreshold <= 0 {
		o.L0StopWritesThreshold = 12
	}
	if o.LBaseMaxBytes <= 0 {
		o.LBaseMaxBytes = 64 << 20 // 64 MB
	}
	if o.Levels == nil {
		o.Levels = make([]LevelOptions, 1)
		for i := range o.Levels {
			if i > 0 {
				l := &o.Levels[i]
				if l.TargetFileSize <= 0 {
					l.TargetFileSize = o.Levels[i-1].TargetFileSize * 2
				}
			}
			o.Levels[i].EnsureDefaults()
		}
	} else {
		for i := range o.Levels {
			o.Levels[i].EnsureDefaults()
		}
	}
	if o.Logger == nil {
		o.Logger = defaultLogger{}
	}
	o.EventListener.EnsureDefaults(o.Logger)
	if o.MaxManifestFileSize == 0 {
		o.MaxManifestFileSize = 128 << 20 // 128 MB
	}
	if o.MaxOpenFiles == 0 {
		o.MaxOpenFiles = 1000
	}
	if o.MemTableSize <= 0 {
		o.MemTableSize = 4 << 20 // 4 MB
	}
	if o.MemTableStopWritesThreshold <= 0 {
		o.MemTableStopWritesThreshold = 2
	}
	if o.Merger == nil {
		o.Merger = DefaultMerger
	}
	if o.MinCompactionRate == 0 {
		o.MinCompactionRate = 4 << 20 // 4 MB/s
	}
	if o.MinFlushRate == 0 {
		o.MinFlushRate = 1 << 20 // 1 MB/s
	}

	o.initMaps()
	return o
}

// initMaps initializes the Filters map from the per-level filter policies.
func (o *Options) initMaps() {
	for i := range o.Levels {
		l := &o.Levels[i]
		if l.FilterPolicy != nil {
			if o.Filters == nil {
				o.Filters = make(map[string]FilterPolicy)
			}
			name := l.FilterPolicy.Name()
			if _, ok := o.Filters[name]; !ok {
				o.Filters[name] = l.FilterPolicy
			}
		}
	}
}

// Level returns the LevelOptions for the specified level.
func (o *Options) Level(level int) LevelOptions {
	if level < len(o.Levels) {
		return o.Levels[level]
	}
	n := len(o.Levels) - 1
	l := o.Levels[n]
	for i := n; i < level; i++ {
		l.TargetFileSize *= 2
	}
	return l
}
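// Illustrative sketch, not part of the original file: for levels beyond the
// configured ones, Level returns the last configured LevelOptions with its
// TargetFileSize doubled once per additional level. The example* identifier is
// made up for this sketch.
func exampleLevelDoubling() {
	opts := (&Options{}).EnsureDefaults()     // one configured level, 2 MB target file size
	fmt.Println(opts.Level(0).TargetFileSize) // 2097152 (2 MB)
	fmt.Println(opts.Level(3).TargetFileSize) // 16777216 (16 MB): doubled three times
}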
// Clone creates a shallow-copy of the supplied options.
func (o *Options) Clone() *Options {
	n := &Options{}
	if o != nil {
		*n = *o
	}
	return n
}

func (o *Options) String() string {
	var buf bytes.Buffer

	fmt.Fprintf(&buf, "[Version]\n")
	fmt.Fprintf(&buf, "  pebble_version=0.1\n")
	fmt.Fprintf(&buf, "\n")
	fmt.Fprintf(&buf, "[Options]\n")
	fmt.Fprintf(&buf, "  bytes_per_sync=%d\n", o.BytesPerSync)
	fmt.Fprintf(&buf, "  cache_size=%d\n", o.Cache.MaxSize())
	fmt.Fprintf(&buf, "  comparer=%s\n", o.Comparer.Name)
	fmt.Fprintf(&buf, "  disable_wal=%t\n", o.DisableWAL)
	fmt.Fprintf(&buf, "  l0_compaction_threshold=%d\n", o.L0CompactionThreshold)
	fmt.Fprintf(&buf, "  l0_stop_writes_threshold=%d\n", o.L0StopWritesThreshold)
	fmt.Fprintf(&buf, "  lbase_max_bytes=%d\n", o.LBaseMaxBytes)
	fmt.Fprintf(&buf, "  max_manifest_file_size=%d\n", o.MaxManifestFileSize)
	fmt.Fprintf(&buf, "  max_open_files=%d\n", o.MaxOpenFiles)
	fmt.Fprintf(&buf, "  mem_table_size=%d\n", o.MemTableSize)
	fmt.Fprintf(&buf, "  mem_table_stop_writes_threshold=%d\n", o.MemTableStopWritesThreshold)
	fmt.Fprintf(&buf, "  min_compaction_rate=%d\n", o.MinCompactionRate)
	fmt.Fprintf(&buf, "  min_flush_rate=%d\n", o.MinFlushRate)
	fmt.Fprintf(&buf, "  merger=%s\n", o.Merger.Name)
	fmt.Fprintf(&buf, "  table_property_collectors=[")
	for i := range o.TablePropertyCollectors {
		if i > 0 {
			fmt.Fprintf(&buf, ",")
		}
		// NB: This creates a new TablePropertyCollector, but Options.String() is
		// called rarely so the overhead of doing so is not consequential.
		fmt.Fprintf(&buf, "%s", o.TablePropertyCollectors[i]().Name())
	}
	fmt.Fprintf(&buf, "]\n")
	fmt.Fprintf(&buf, "  wal_dir=%s\n", o.WALDir)

	for i := range o.Levels {
		l := &o.Levels[i]
		fmt.Fprintf(&buf, "\n")
		fmt.Fprintf(&buf, "[Level \"%d\"]\n", i)
		fmt.Fprintf(&buf, "  block_restart_interval=%d\n", l.BlockRestartInterval)
		fmt.Fprintf(&buf, "  block_size=%d\n", l.BlockSize)
		fmt.Fprintf(&buf, "  compression=%s\n", l.Compression)
		fmt.Fprintf(&buf, "  filter_policy=%s\n", filterPolicyName(l.FilterPolicy))
		fmt.Fprintf(&buf, "  filter_type=%s\n", l.FilterType)
		fmt.Fprintf(&buf, "  index_block_size=%d\n", l.IndexBlockSize)
		fmt.Fprintf(&buf, "  target_file_size=%d\n", l.TargetFileSize)
	}

	return buf.String()
}
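// Illustrative sketch, not part of the original file: Clone is a shallow copy,
// so slice- and pointer-valued fields such as Levels, Cache, and Filters are
// shared between the original and the clone. The example* identifier is made
// up for this sketch.
func exampleCloneIsShallow() {
	orig := (&Options{}).EnsureDefaults()
	clone := orig.Clone()
	clone.Levels[0].BlockSize = 64 << 10
	// orig.Levels[0].BlockSize is now also 64 << 10 because only the slice
	// header was copied, not its backing array.
	fmt.Println(orig.Levels[0].BlockSize == clone.Levels[0].BlockSize) // true
}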
// Check verifies the options are compatible with the previous options
// serialized by Options.String(). For example, the Comparer and Merger must be
// the same, or data will not be read correctly from the DB.
func (o *Options) Check(s string) error {
	var section string
	for _, line := range strings.Split(s, "\n") {
		line = strings.TrimSpace(line)
		if len(line) == 0 {
			// Skip blank lines.
			continue
		}
		if line[0] == ';' || line[0] == '#' {
			// Skip comments.
			continue
		}
		n := len(line)
		if line[0] == '[' && line[n-1] == ']' {
			// Parse section.
			section = line[1 : n-1]
			continue
		}

		pos := strings.Index(line, "=")
		if pos < 0 {
			return fmt.Errorf("pebble: invalid key=value syntax: %s", line)
		}

		key := strings.TrimSpace(line[:pos])
		value := strings.TrimSpace(line[pos+1:])
		path := section + "." + key

		// RocksDB uses a similar (INI-style) syntax for the OPTIONS file, but
		// different section names and keys. The "CFOptions ..." paths below are
		// the RocksDB versions.
		switch path {
		case "Options.comparer", `CFOptions "default".comparator`:
			if value != o.Comparer.Name {
				return fmt.Errorf("pebble: comparer name from file %q != comparer name from options %q",
					value, o.Comparer.Name)
			}
		case "Options.merger", `CFOptions "default".merge_operator`:
			// RocksDB allows the merge operator to be unspecified, in which case it
			// shows up as "nullptr".
			if value != "nullptr" && value != o.Merger.Name {
				return fmt.Errorf("pebble: merger name from file %q != merger name from options %q",
					value, o.Merger.Name)
			}
		}
	}
	return nil
}
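// Illustrative sketch, not part of the original file: the serialized form
// produced by String can later be checked against a new Options value with
// Check, which is how a reopened DB can detect an incompatible Comparer or
// Merger. The example* identifier is made up for this sketch.
func exampleOptionsCheck() error {
	oldOpts := (&Options{}).EnsureDefaults()
	serialized := oldOpts.String() // e.g. written to an OPTIONS file at DB creation

	newOpts := (&Options{}).EnsureDefaults()
	// Succeeds because both use the default Comparer and Merger; it would fail
	// if newOpts were configured with a differently named Comparer or Merger.
	return newOpts.Check(serialized)
}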