github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/base/options.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package base

import (
	"bytes"
	"fmt"
	"strings"

	"github.com/petermattis/pebble/cache"
	"github.com/petermattis/pebble/vfs"
)

// Compression is the per-block compression algorithm to use.
type Compression int

// The available compression types.
const (
	DefaultCompression Compression = iota
	NoCompression
	SnappyCompression
	nCompression
)

func (c Compression) String() string {
	switch c {
	case DefaultCompression:
		return "Default"
	case NoCompression:
		return "NoCompression"
	case SnappyCompression:
		return "Snappy"
	default:
		return "Unknown"
	}
}

    39  // FilterType is the level at which to apply a filter: block or table.
    40  type FilterType int
    41  
    42  // The available filter types.
    43  const (
    44  	TableFilter FilterType = iota
    45  )
    46  
    47  func (t FilterType) String() string {
    48  	switch t {
    49  	case TableFilter:
    50  		return "table"
    51  	}
    52  	return "unknown"
    53  }
    54  
// FilterWriter provides an interface for creating filter blocks. See
// FilterPolicy for more details about filters.
type FilterWriter interface {
	// AddKey adds a key to the current filter block.
	AddKey(key []byte)

	// Finish appends to dst an encoded filter that holds the current set of
	// keys. The writer state is reset after the call to Finish, allowing the
	// writer to be reused for the creation of additional filters.
	Finish(dst []byte) []byte
}

// FilterPolicy is an algorithm for probabilistically encoding a set of keys.
// The canonical implementation is a Bloom filter.
//
// Every FilterPolicy has a name. This names the algorithm itself, not any one
// particular instance. Aspects specific to a particular instance, such as the
// set of keys or any other parameters, will be encoded in the []byte filter
// returned by NewWriter.
//
// The name may be written to files on disk, along with the filter data. To use
// these filters, the FilterPolicy name at the time of writing must equal the
// name at the time of reading. If they do not match, the filters will be
// ignored, which will not affect correctness but may affect performance.
type FilterPolicy interface {
	// Name names the filter policy.
	Name() string

	// MayContain returns whether the encoded filter may contain the given key.
	// False positives are possible, where it returns true for keys not in the
	// original set.
	MayContain(ftype FilterType, filter, key []byte) bool

	// NewWriter creates a new FilterWriter.
	NewWriter(ftype FilterType) FilterWriter
}

func filterPolicyName(p FilterPolicy) string {
	if p == nil {
		return "none"
	}
	return p.Name()
}
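
// What follows is a minimal, hypothetical sketch (not part of the original
// pebble source) illustrating the FilterPolicy contract: because only false
// positives are allowed, a policy whose MayContain always returns true is
// trivially correct, though it provides no read savings.
type exampleAlwaysFilterPolicy struct{}

// Name names the filter policy.
func (exampleAlwaysFilterPolicy) Name() string { return "example.always" }

// MayContain reports that any key may be present, which is always safe.
func (exampleAlwaysFilterPolicy) MayContain(ftype FilterType, filter, key []byte) bool {
	return true
}

// NewWriter creates a writer that emits an empty filter block.
func (exampleAlwaysFilterPolicy) NewWriter(ftype FilterType) FilterWriter {
	return exampleAlwaysFilterWriter{}
}

type exampleAlwaysFilterWriter struct{}

// AddKey ignores the key; no per-key state is needed.
func (exampleAlwaysFilterWriter) AddKey(key []byte) {}

// Finish appends nothing: the empty filter encodes "may contain anything".
func (exampleAlwaysFilterWriter) Finish(dst []byte) []byte { return dst }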

// TableFormat specifies the format version for sstables. The legacy LevelDB
// format is format version 0.
type TableFormat uint32

// The available table formats. Note that these values are not (and should not
// be) serialized to disk. TableFormatRocksDBv2 is the default if otherwise
// unspecified.
const (
	TableFormatRocksDBv2 TableFormat = iota
	TableFormatLevelDB
)

// TablePropertyCollector provides a hook for collecting user-defined
// properties based on the keys and values stored in an sstable. A new
// TablePropertyCollector is created for an sstable when the sstable is being
// written.
type TablePropertyCollector interface {
	// Add is called with each new entry added to the sstable. While the sstable
	// is itself sorted by key, do not assume that the entries are added in any
	// order. In particular, the ordering of point entries and range tombstones
	// is unspecified.
	Add(key InternalKey, value []byte) error

	// Finish is called when all entries have been added to the sstable. The
	// collected properties (if any) should be added to the specified map. Note
	// that in case of an error during sstable construction, Finish may not be
	// called.
	Finish(userProps map[string]string) error

	// Name returns the name of the property collector.
	Name() string
}
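
// exampleKeyCountCollector is a hypothetical sketch (not part of the original
// pebble source) of a TablePropertyCollector that records how many entries
// were added to an sstable and publishes the count as a user property.
type exampleKeyCountCollector struct {
	count int
}

// Add increments the entry count; it deliberately makes no assumptions about
// the order in which entries arrive.
func (c *exampleKeyCountCollector) Add(key InternalKey, value []byte) error {
	c.count++
	return nil
}

// Finish publishes the collected count under a made-up property name.
func (c *exampleKeyCountCollector) Finish(userProps map[string]string) error {
	userProps["example.key-count"] = fmt.Sprint(c.count)
	return nil
}

// Name returns the name of the property collector.
func (c *exampleKeyCountCollector) Name() string { return "example.key-count" }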

// LevelOptions holds the optional per-level parameters.
type LevelOptions struct {
	// BlockRestartInterval is the number of keys between restart points
	// for delta encoding of keys.
	//
	// The default value is 16.
	BlockRestartInterval int

	// BlockSize is the target uncompressed size in bytes of each table block.
	//
	// The default value is 4096.
	BlockSize int

	// BlockSizeThreshold finishes a block if the block size is larger than the
	// specified percentage of the target block size and adding the next entry
	// would cause the block to be larger than the target block size.
	//
	// The default value is 90.
	BlockSizeThreshold int

	// Compression defines the per-block compression to use.
	//
	// The default value (DefaultCompression) uses snappy compression.
	Compression Compression

	// FilterPolicy defines a filter algorithm (such as a Bloom filter) that can
	// reduce disk reads for Get calls.
	//
	// One such implementation is bloom.FilterPolicy(10) from the pebble/bloom
	// package.
	//
	// The default value (nil) means to use no filter.
	FilterPolicy FilterPolicy

	// FilterType defines whether an existing filter policy is applied at the
	// block level or the table level. Block-level filters use less memory to
	// create, but are slower to access as a check for the key in the index must
	// first be performed to locate the filter block. A table-level filter will
	// require memory proportional to the number of keys in an sstable to
	// create, but avoids the index lookup when determining if a key is present.
	// Table-level filters should be preferred except in memory-constrained
	// situations.
	FilterType FilterType

	// IndexBlockSize is the target uncompressed size in bytes of each index
	// block. When the index block size is larger than this target, two-level
	// indexes are automatically enabled. Setting this option to a large value
	// (such as math.MaxInt32) disables the automatic creation of two-level
	// indexes.
	//
	// The default value is the value of BlockSize.
	IndexBlockSize int

	// TargetFileSize is the target file size for the level.
	//
	// The default value is 2 MB.
	TargetFileSize int64
}

// EnsureDefaults ensures that the default values for all of the options have
// been initialized. It is valid to call EnsureDefaults on a nil receiver. A
// non-nil result will always be returned.
func (o *LevelOptions) EnsureDefaults() *LevelOptions {
	if o == nil {
		o = &LevelOptions{}
	}
	if o.BlockRestartInterval <= 0 {
		o.BlockRestartInterval = 16
	}
	if o.BlockSize <= 0 {
		o.BlockSize = 4096
	}
	if o.BlockSizeThreshold <= 0 {
		o.BlockSizeThreshold = 90
	}
	if o.Compression <= DefaultCompression || o.Compression >= nCompression {
		o.Compression = SnappyCompression
	}
	if o.IndexBlockSize <= 0 {
		o.IndexBlockSize = o.BlockSize
	}
	if o.TargetFileSize <= 0 {
		o.TargetFileSize = 2 << 20 // 2 MB
	}
	return o
}
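
// For illustration (hypothetical usage, not part of the original source), a
// partially specified LevelOptions picks up the remaining defaults, and
// IndexBlockSize follows BlockSize when unset:
//
//	l := (&LevelOptions{BlockSize: 32 << 10}).EnsureDefaults()
//	// l.IndexBlockSize == 32 << 10 (inherits BlockSize)
//	// l.BlockRestartInterval == 16
//	// l.TargetFileSize == 2 << 20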

// Options holds the optional parameters for configuring pebble. These options
// apply to the DB at large; per-query options are defined by the IterOptions
// and WriteOptions types.
type Options struct {
	// Sync sstables and the WAL periodically in order to smooth out writes to
	// disk. This option does not provide any persistence guarantee, but is
	// used to avoid latency spikes if the OS automatically decides to write
	// out a large chunk of dirty filesystem buffers.
	//
	// The default value is 512 KB.
	BytesPerSync int

	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default cache size is 8 MB.
	Cache *cache.Cache

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// Disable the write-ahead log (WAL). Disabling the write-ahead log prohibits
	// crash recovery, but can improve performance if crash recovery is not
	// needed (e.g. when only temporary state is being stored in the database).
	//
	// TODO(peter): untested
	DisableWAL bool

	// ErrorIfDBExists is whether it is an error if the database already exists.
	//
	// The default value is false.
	ErrorIfDBExists bool

	// EventListener provides hooks for listening to significant DB events such
	// as flushes, compactions, and table deletion.
	EventListener EventListener

	// Filters is a map from filter policy name to filter policy. It is used for
	// debugging tools which may be used on multiple databases configured with
	// different filter policies. It is not necessary to populate this map
	// during normal usage of a DB.
	Filters map[string]FilterPolicy

	// FS provides the interface for persistent file storage.
	//
	// The default value uses the underlying operating system's file system.
	FS vfs.FS

	// The number of files necessary to trigger an L0 compaction.
	L0CompactionThreshold int

	// Hard limit on the number of L0 files. Writes are stopped when this
	// threshold is reached.
	L0StopWritesThreshold int

	// The maximum number of bytes for LBase. The base level is the level which
	// L0 is compacted into. The base level is determined dynamically based on
	// the existing data in the LSM. The maximum number of bytes for other levels
	// is computed dynamically based on the base level's maximum size. When the
	// maximum number of bytes for a level is exceeded, compaction is requested.
	LBaseMaxBytes int64

	// Per-level options. Options for at least one level must be specified. The
	// options for the last level are used for all subsequent levels.
	Levels []LevelOptions

	// Logger used to write log messages.
	//
	// The default logger uses the Go standard library log package.
	Logger Logger

	// MaxManifestFileSize is the maximum size the MANIFEST file is allowed to
	// become. When the MANIFEST exceeds this size it is rolled over and a new
	// MANIFEST is created.
	MaxManifestFileSize int64

	// MaxOpenFiles is a soft limit on the number of open files that can be
	// used by the DB.
	//
	// The default value is 1000.
	MaxOpenFiles int

	// The size of a MemTable. Note that more than one MemTable can be in
	// existence since flushing a MemTable involves creating a new one and
	// writing the contents of the old one in the background.
	// MemTableStopWritesThreshold places a hard limit on the number of
	// MemTables allowed at once.
	//
	// The default value is 4 MB.
	MemTableSize int

	// Hard limit on the number of MemTables. Writes are stopped when this number
	// is reached. This value should be at least 2 or writes will stop whenever
	// the MemTable is being flushed.
	MemTableStopWritesThreshold int

	// Merger defines the associative merge operation to use for merging values
	// written with {Batch,DB}.Merge.
	//
	// The default merger concatenates values.
	Merger *Merger

	// MinCompactionRate sets the minimum rate at which compactions occur. The
	// default is 4 MB/s.
	MinCompactionRate int

	// MinFlushRate sets the minimum rate at which the MemTables are flushed. The
	// default is 1 MB/s.
	MinFlushRate int

	// ReadOnly indicates that the DB should be opened in read-only mode. Writes
	// to the DB will return an error, background compactions are disabled, and
	// the flush that normally occurs after replaying the WAL at startup is
	// disabled.
	ReadOnly bool

	// TableFormat specifies the format version for sstables. The default is
	// TableFormatRocksDBv2 which creates RocksDB compatible sstables. Use
	// TableFormatLevelDB to create LevelDB compatible sstables, which can be
	// used by a wider range of tools and libraries.
	//
	// TODO(peter): TableFormatLevelDB does not support all of the functionality
	// of TableFormatRocksDBv2. We should ensure it is only used when writing an
	// sstable directly, and not used when opening a database.
	TableFormat TableFormat

	// TablePropertyCollectors is a list of TablePropertyCollector creation
	// functions. A new TablePropertyCollector is created for each sstable built
	// and lives for the lifetime of the table.
	TablePropertyCollectors []func() TablePropertyCollector

	// WALDir specifies the directory to store write-ahead logs (WALs) in. If
	// empty (the default), WALs will be stored in the same directory as sstables
	// (i.e. the directory passed to pebble.Open).
	WALDir string
}

// EnsureDefaults ensures that the default values for all options are set if a
// valid value was not already specified. Returns the new options.
func (o *Options) EnsureDefaults() *Options {
	if o == nil {
		o = &Options{}
	}
	if o.BytesPerSync <= 0 {
		o.BytesPerSync = 512 << 10 // 512 KB
	}
	if o.Cache == nil {
		o.Cache = cache.New(8 << 20) // 8 MB
	}
	if o.Comparer == nil {
		o.Comparer = DefaultComparer
	}
	if o.FS == nil {
		o.FS = vfs.Default
	}
	if o.L0CompactionThreshold <= 0 {
		o.L0CompactionThreshold = 4
	}
	if o.L0StopWritesThreshold <= 0 {
		o.L0StopWritesThreshold = 12
	}
	if o.LBaseMaxBytes <= 0 {
		o.LBaseMaxBytes = 64 << 20 // 64 MB
	}
	if o.Levels == nil {
		// Create a single level; Options.Level derives the options for any
		// level beyond the last configured one.
		o.Levels = make([]LevelOptions, 1)
	}
	for i := range o.Levels {
		o.Levels[i].EnsureDefaults()
	}
	if o.Logger == nil {
		o.Logger = defaultLogger{}
	}
	o.EventListener.EnsureDefaults(o.Logger)
	if o.MaxManifestFileSize == 0 {
		o.MaxManifestFileSize = 128 << 20 // 128 MB
	}
	if o.MaxOpenFiles == 0 {
		o.MaxOpenFiles = 1000
	}
	if o.MemTableSize <= 0 {
		o.MemTableSize = 4 << 20 // 4 MB
	}
	if o.MemTableStopWritesThreshold <= 0 {
		o.MemTableStopWritesThreshold = 2
	}
	if o.Merger == nil {
		o.Merger = DefaultMerger
	}
	if o.MinCompactionRate == 0 {
		o.MinCompactionRate = 4 << 20 // 4 MB/s
	}
	if o.MinFlushRate == 0 {
		o.MinFlushRate = 1 << 20 // 1 MB/s
	}

	o.initMaps()
	return o
}
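
// exampleOptions is a hypothetical sketch (not part of the original source)
// showing how explicit settings compose with EnsureDefaults: only the fields
// set here differ from the defaults filled in above.
func exampleOptions() *Options {
	opts := &Options{
		MemTableSize: 64 << 20, // 64 MB memtables instead of the 4 MB default
		Levels: []LevelOptions{
			{BlockSize: 32 << 10}, // 32 KB blocks; other level fields default
		},
	}
	// All unset fields (Cache, Comparer, FS, thresholds, ...) receive the
	// defaults documented on the Options struct.
	return opts.EnsureDefaults()
}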

// initMaps initializes the Filters map from the per-level filter policies.
func (o *Options) initMaps() {
	for i := range o.Levels {
		l := &o.Levels[i]
		if l.FilterPolicy != nil {
			if o.Filters == nil {
				o.Filters = make(map[string]FilterPolicy)
			}
			name := l.FilterPolicy.Name()
			if _, ok := o.Filters[name]; !ok {
				o.Filters[name] = l.FilterPolicy
			}
		}
	}
}

// Level returns the LevelOptions for the specified level. For levels beyond
// the last configured entry in Levels, the last entry is used with its
// TargetFileSize doubled for each additional level.
func (o *Options) Level(level int) LevelOptions {
	if level < len(o.Levels) {
		return o.Levels[level]
	}
	n := len(o.Levels) - 1
	l := o.Levels[n]
	for i := n; i < level; i++ {
		l.TargetFileSize *= 2
	}
	return l
}
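
// A short worked example (hypothetical values, not part of the original
// source): with a single configured level whose TargetFileSize is 2 MB,
// Level doubles the size once per level beyond the configured ones.
//
//	o := (&Options{}).EnsureDefaults() // one level, TargetFileSize = 2 << 20
//	_ = o.Level(0).TargetFileSize      // 2 << 20 (2 MB)
//	_ = o.Level(1).TargetFileSize      // 4 << 20 (4 MB)
//	_ = o.Level(3).TargetFileSize      // 16 << 20 (16 MB)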

// Clone creates a shallow copy of the supplied options.
func (o *Options) Clone() *Options {
	n := &Options{}
	if o != nil {
		*n = *o
	}
	return n
}

// String serializes the options into an INI-style form that can later be
// validated with Check.
func (o *Options) String() string {
	var buf bytes.Buffer

	fmt.Fprintf(&buf, "[Version]\n")
	fmt.Fprintf(&buf, "  pebble_version=0.1\n")
	fmt.Fprintf(&buf, "\n")
	fmt.Fprintf(&buf, "[Options]\n")
	fmt.Fprintf(&buf, "  bytes_per_sync=%d\n", o.BytesPerSync)
	fmt.Fprintf(&buf, "  cache_size=%d\n", o.Cache.MaxSize())
	fmt.Fprintf(&buf, "  comparer=%s\n", o.Comparer.Name)
	fmt.Fprintf(&buf, "  disable_wal=%t\n", o.DisableWAL)
	fmt.Fprintf(&buf, "  l0_compaction_threshold=%d\n", o.L0CompactionThreshold)
	fmt.Fprintf(&buf, "  l0_stop_writes_threshold=%d\n", o.L0StopWritesThreshold)
	fmt.Fprintf(&buf, "  lbase_max_bytes=%d\n", o.LBaseMaxBytes)
	fmt.Fprintf(&buf, "  max_manifest_file_size=%d\n", o.MaxManifestFileSize)
	fmt.Fprintf(&buf, "  max_open_files=%d\n", o.MaxOpenFiles)
	fmt.Fprintf(&buf, "  mem_table_size=%d\n", o.MemTableSize)
	fmt.Fprintf(&buf, "  mem_table_stop_writes_threshold=%d\n", o.MemTableStopWritesThreshold)
	fmt.Fprintf(&buf, "  min_compaction_rate=%d\n", o.MinCompactionRate)
	fmt.Fprintf(&buf, "  min_flush_rate=%d\n", o.MinFlushRate)
	fmt.Fprintf(&buf, "  merger=%s\n", o.Merger.Name)
	fmt.Fprintf(&buf, "  table_property_collectors=[")
	for i := range o.TablePropertyCollectors {
		if i > 0 {
			fmt.Fprintf(&buf, ",")
		}
		// NB: This creates a new TablePropertyCollector, but Options.String() is
		// called rarely so the overhead of doing so is not consequential.
		fmt.Fprintf(&buf, "%s", o.TablePropertyCollectors[i]().Name())
	}
	fmt.Fprintf(&buf, "]\n")
	fmt.Fprintf(&buf, "  wal_dir=%s\n", o.WALDir)

	for i := range o.Levels {
		l := &o.Levels[i]
		fmt.Fprintf(&buf, "\n")
		fmt.Fprintf(&buf, "[Level \"%d\"]\n", i)
		fmt.Fprintf(&buf, "  block_restart_interval=%d\n", l.BlockRestartInterval)
		fmt.Fprintf(&buf, "  block_size=%d\n", l.BlockSize)
		fmt.Fprintf(&buf, "  compression=%s\n", l.Compression)
		fmt.Fprintf(&buf, "  filter_policy=%s\n", filterPolicyName(l.FilterPolicy))
		fmt.Fprintf(&buf, "  filter_type=%s\n", l.FilterType)
		fmt.Fprintf(&buf, "  index_block_size=%d\n", l.IndexBlockSize)
		fmt.Fprintf(&buf, "  target_file_size=%d\n", l.TargetFileSize)
	}

	return buf.String()
}

// Check verifies the options are compatible with the previous options
// serialized by Options.String(). For example, the Comparer and Merger must be
// the same, or the data in the DB cannot be properly read.
func (o *Options) Check(s string) error {
	var section string
	for _, line := range strings.Split(s, "\n") {
		line = strings.TrimSpace(line)
		if len(line) == 0 {
			// Skip blank lines.
			continue
		}
		if line[0] == ';' || line[0] == '#' {
			// Skip comments.
			continue
		}
		n := len(line)
		if line[0] == '[' && line[n-1] == ']' {
			// Parse section.
			section = line[1 : n-1]
			continue
		}

		pos := strings.Index(line, "=")
		if pos < 0 {
			return fmt.Errorf("pebble: invalid key=value syntax: %s", line)
		}

		key := strings.TrimSpace(line[:pos])
		value := strings.TrimSpace(line[pos+1:])
		path := section + "." + key

		// RocksDB uses a similar (INI-style) syntax for the OPTIONS file, but
		// different section names and keys. The "CFOptions ..." paths below are
		// the RocksDB versions.
		switch path {
		case "Options.comparer", `CFOptions "default".comparator`:
			if value != o.Comparer.Name {
				return fmt.Errorf("pebble: comparer name from file %q != comparer name from options %q",
					value, o.Comparer.Name)
			}
		case "Options.merger", `CFOptions "default".merge_operator`:
			// RocksDB allows the merge operator to be unspecified, in which case it
			// shows up as "nullptr".
			if value != "nullptr" && value != o.Merger.Name {
				return fmt.Errorf("pebble: merger name from file %q != merger name from options %q",
					value, o.Merger.Name)
			}
		}
	}
	return nil
}
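
// exampleCheckRoundTrip is a hypothetical sketch (not part of the original
// source) of the intended round-trip: a string produced by String should
// always pass Check against the same options.
func exampleCheckRoundTrip() error {
	o := (&Options{}).EnsureDefaults()
	// Serialize, then verify compatibility; a mismatched Comparer or Merger
	// name would surface here as an error.
	return o.Check(o.String())
}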