github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/engine.go (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package storage
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"path/filepath"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/base"
    20  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    21  	"github.com/cockroachdb/cockroach/pkg/settings"
    22  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    23  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    24  	"github.com/cockroachdb/cockroach/pkg/storage/fs"
    25  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    26  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    27  	"github.com/cockroachdb/cockroach/pkg/util/log"
    28  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    29  	"github.com/cockroachdb/errors"
    30  	"github.com/cockroachdb/pebble"
    31  )
    32  
    33  // DefaultStorageEngine represents the default storage engine to use.
    34  var DefaultStorageEngine enginepb.EngineType
    35  
    36  func init() {
    37  	_ = DefaultStorageEngine.Set(envutil.EnvOrDefaultString("COCKROACH_STORAGE_ENGINE", "default"))
    38  }
    39  
// SimpleIterator is an interface for iterating over key/value pairs in an
// engine. SimpleIterator implementations are thread safe unless otherwise
// noted. SimpleIterator is a subset of the functionality offered by Iterator:
// it exposes forward positioning (SeekGE, Next, NextKey) and the Unsafe*
// accessors only.
type SimpleIterator interface {
	// Close frees up resources held by the iterator.
	Close()
	// SeekGE advances the iterator to the first key in the engine which
	// is >= the provided key.
	SeekGE(key MVCCKey)
	// Valid must be called after any call to SeekGE(), Next(), NextKey(), or
	// similar positioning methods. It returns (true, nil) if the iterator
	// points to a valid key (it is undefined to call UnsafeKey(),
	// UnsafeValue(), or similar methods unless Valid() has returned
	// (true, nil)). It returns (false, nil) if the iterator has moved past the
	// end of the valid range, or (false, err) if an error has occurred.
	// Valid() will never return true with a non-nil error.
	Valid() (bool, error)
	// Next advances the iterator to the next key/value in the
	// iteration. After this call, Valid() will be true if the
	// iterator was not positioned at the last key.
	Next()
	// NextKey advances the iterator to the next MVCC key. This operation is
	// distinct from Next which advances to the next version of the current key
	// or the next key if the iterator is currently located at the last version
	// for a key.
	NextKey()
	// UnsafeKey returns the same value as Key (see Iterator.Key), but the
	// memory is invalidated on the next call to
	// {Next,Prev,SeekGE,SeekLT,Close}.
	UnsafeKey() MVCCKey
	// UnsafeValue returns the same value as Value (see Iterator.Value), but the
	// memory is invalidated on the next call to
	// {Next,Prev,SeekGE,SeekLT,Close}.
	UnsafeValue() []byte
}
    73  
// IteratorStats is returned from (Iterator).Stats. See IterOptions.WithStats
// for how an iterator is asked to accumulate these counters.
type IteratorStats struct {
	// NOTE(review): presumably the number of internal deletion entries the
	// iterator skipped over — confirm against the implementations.
	InternalDeleteSkippedCount int
	// NOTE(review): presumably the number of SSTs considered when time bound
	// hints are in use — confirm against the implementations.
	TimeBoundNumSSTs           int
}
    79  
// Iterator is an interface for iterating over key/value pairs in an
// engine. Iterator implementations are thread safe unless otherwise
// noted. It extends SimpleIterator with reverse positioning, copying
// accessors, and a handful of engine-level helpers.
type Iterator interface {
	SimpleIterator

	// SeekLT advances the iterator to the first key in the engine which
	// is < the provided key.
	//
	// NOTE(review): "first" presumably means the largest key that sorts
	// strictly before the provided key — confirm.
	SeekLT(key MVCCKey)
	// Prev moves the iterator backward to the previous key/value
	// in the iteration. After this call, Valid() will be true if the
	// iterator was not positioned at the first key.
	Prev()
	// Key returns the current key.
	Key() MVCCKey
	// Value returns the current value as a byte slice.
	Value() []byte
	// ValueProto unmarshals the value the iterator is currently
	// pointing to using a protobuf decoder.
	ValueProto(msg protoutil.Message) error
	// ComputeStats scans the underlying engine from start to end keys and
	// computes stats counters based on the values. This method is used after a
	// range is split to recompute stats for each subrange. The start key is
	// always adjusted to avoid counting local keys in the event stats are being
	// recomputed for the first range (i.e. the one with start key == KeyMin).
	// The nowNanos arg specifies the wall time in nanoseconds since the
	// epoch and is used to compute the total age of all intents.
	ComputeStats(start, end roachpb.Key, nowNanos int64) (enginepb.MVCCStats, error)
	// FindSplitKey finds a key from the given span such that the left side of
	// the split is roughly targetSize bytes. The returned key will never be
	// chosen from the key ranges listed in keys.NoSplitSpans and will always
	// sort equal to or after minSplitKey.
	//
	// DO NOT CALL directly (except in wrapper Iterator implementations). Use the
	// package-level MVCCFindSplitKey instead. For correct operation, the caller
	// must set the upper bound on the iterator before calling this method.
	FindSplitKey(start, end, minSplitKey roachpb.Key, targetSize int64) (MVCCKey, error)
	// CheckForKeyCollisions checks whether any keys collide between the iterator
	// and the encoded SST data specified, within the provided key range. Returns
	// stats on skipped KVs, or an error if a collision is found.
	CheckForKeyCollisions(sstData []byte, start, end roachpb.Key) (enginepb.MVCCStats, error)
	// SetUpperBound installs a new upper bound for this iterator.
	SetUpperBound(roachpb.Key)
	// Stats returns statistics about the iterator.
	Stats() IteratorStats
}
   126  
// MVCCIterator is an interface that extends Iterator and provides concrete
// implementations for MVCCGet and MVCCScan operations. It is used by instances
// of the interface backed by RocksDB iterators to avoid cgo hops.
type MVCCIterator interface {
	Iterator
	// MVCCOpsSpecialized returns whether the iterator has a specialized
	// implementation of MVCCGet and MVCCScan. This is exposed as a method
	// so that wrapper types can defer to their wrapped iterators.
	MVCCOpsSpecialized() bool
	// MVCCGet is the internal implementation of the family of package-level
	// MVCCGet functions.
	MVCCGet(
		key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions,
	) (*roachpb.Value, *roachpb.Intent, error)
	// MVCCScan is the internal implementation of the family of package-level
	// MVCCScan functions. The notable difference is that key/value pairs are
	// returned raw, as a series of buffers of length-prefixed slices,
	// alternating from key to value, where numKVs specifies the number of pairs
	// in the buffer.
	//
	// NOTE(review): numKVs is not part of this signature; presumably it is
	// carried by the returned MVCCScanResult — confirm.
	MVCCScan(
		start, end roachpb.Key, timestamp hlc.Timestamp, opts MVCCScanOptions,
	) (MVCCScanResult, error)
}
   150  
// IterOptions contains options used to create an Iterator.
//
// For performance, every Iterator must specify either Prefix or UpperBound.
type IterOptions struct {
	// If Prefix is true, SeekGE will use the user-key prefix of
	// the supplied MVCC key to restrict which sstables are searched,
	// but iteration (using Next) over keys without the same user-key
	// prefix will not work correctly (keys may be skipped).
	Prefix bool
	// LowerBound gives this iterator an inclusive lower bound. Attempts to
	// SeekLT or Prev to a key that is strictly less than the bound will
	// invalidate the iterator.
	LowerBound roachpb.Key
	// UpperBound gives this iterator an exclusive upper bound. Attempts to
	// SeekGE or Next to a key that is greater than or equal to the bound will
	// invalidate the iterator. UpperBound must be provided unless Prefix is
	// true, in which case the end of the prefix will be used as the upper
	// bound.
	UpperBound roachpb.Key
	// If WithStats is true, the iterator accumulates RocksDB performance
	// counters over its lifetime which can be queried via `Stats()`.
	WithStats bool
	// MinTimestampHint and MaxTimestampHint, if set, indicate that keys outside
	// of the time range formed by [MinTimestampHint, MaxTimestampHint] do not
	// need to be presented by the iterator. The underlying iterator may be able
	// to efficiently skip over keys outside of the hinted time range, e.g., when
	// an SST indicates that it contains no keys within the time range.
	//
	// Note that time bound hints are strictly a performance optimization, and
	// iterators with time bounds hints will frequently return keys outside of the
	// [start, end] time range. If you must guarantee that you never see a key
	// outside of the time bounds, perform your own filtering.
	MinTimestampHint, MaxTimestampHint hlc.Timestamp
}
   184  
// Reader is the read interface to an engine's data.
type Reader interface {
	// Close closes the reader, freeing up any outstanding resources. Note that
	// various implementations have slightly different behaviors. In particular,
	// Distinct() batches release their parent batch for future use while
	// Engines, Snapshots and Batches free the associated C++ resources.
	Close()
	// Closed returns true if the reader has been closed or is not usable.
	// Objects backed by this reader (e.g. Iterators) can check this to ensure
	// that they are not using a closed engine. Intended for use within package
	// engine; exported to enable wrappers to exist in other packages.
	Closed() bool
	// ExportToSst exports changes to the keyrange [startKey, endKey) over the
	// interval (startTS, endTS]. Passing exportAllRevisions exports
	// every revision of a key for the interval, otherwise only the latest value
	// within the interval is exported. Deletions are included if all revisions are
	// requested or if startTS is non-zero. Returns the bytes of an
	// SSTable containing the exported keys, the size of exported data, or an error.
	//
	// If targetSize is positive, it indicates that the export should produce SSTs
	// which are roughly target size. Specifically, it will return an SST such that
	// the last key is responsible for meeting or exceeding the targetSize. If the
	// resumeKey is non-nil then the data size of the returned sst will be greater
	// than or equal to the targetSize.
	//
	// If maxSize is positive, it is an absolute maximum on byte size for the
	// returned sst. If it is the case that the versions of the last key will lead
	// to an SST that exceeds maxSize, an error will be returned. This parameter
	// exists to prevent creating SSTs which are too large to be used.
	//
	// NOTE(review): io presumably configures the iterator used to perform the
	// export — confirm against the implementations.
	ExportToSst(
		startKey, endKey roachpb.Key, startTS, endTS hlc.Timestamp,
		exportAllRevisions bool, targetSize uint64, maxSize uint64,
		io IterOptions,
	) (sst []byte, _ roachpb.BulkOpSummary, resumeKey roachpb.Key, _ error)
	// Get returns the value for the given key, nil otherwise.
	//
	// Deprecated: use MVCCGet instead.
	Get(key MVCCKey) ([]byte, error)
	// GetProto fetches the value at the specified key and unmarshals it
	// using a protobuf decoder. Returns true on success or false if the
	// key was not found. On success, returns the length in bytes of the
	// key and the value.
	//
	// Deprecated: use Iterator.ValueProto instead.
	GetProto(key MVCCKey, msg protoutil.Message) (ok bool, keyBytes, valBytes int64, err error)
	// Iterate scans from the start key to the end key (exclusive), invoking the
	// function f on each key value pair. If f returns an error or if the scan
	// itself encounters an error, the iteration will stop and return the error.
	// If the first result of f is true, the iteration stops and returns a nil
	// error. Note that this method is not expected to take into account the
	// timestamp of the end key; all MVCCKeys at end.Key are considered excluded
	// in the iteration.
	Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (stop bool, err error)) error
	// NewIterator returns a new instance of an Iterator over this
	// engine. The caller must invoke Iterator.Close() when finished
	// with the iterator to free resources.
	NewIterator(opts IterOptions) Iterator
}
   243  
// Writer is the write interface to an engine's data.
type Writer interface {
	// ApplyBatchRepr atomically applies a set of batched updates. Created by
	// calling Repr() on a batch. Using this method is equivalent to constructing
	// and committing a batch whose Repr() equals repr. If sync is true, the
	// batch is synchronously written to disk. It is an error to specify
	// sync=true if the Writer is a Batch.
	//
	// It is safe to modify the contents of the arguments after ApplyBatchRepr
	// returns.
	ApplyBatchRepr(repr []byte, sync bool) error
	// Clear removes the item from the db with the given key. Note that clear
	// actually removes entries from the storage engine, rather than inserting
	// tombstones.
	//
	// It is safe to modify the contents of the arguments after Clear returns.
	Clear(key MVCCKey) error
	// SingleClear removes the most recent write to the item from the db with
	// the given key. Whether older versions of the item will come back to life
	// if not also removed with SingleClear is undefined. See the following:
	//   https://github.com/facebook/rocksdb/wiki/Single-Delete
	// for details on the SingleDelete operation that this method invokes. Note
	// that clear actually removes entries from the storage engine, rather than
	// inserting tombstones.
	//
	// It is safe to modify the contents of the arguments after SingleClear
	// returns.
	SingleClear(key MVCCKey) error
	// ClearRange removes a set of entries, from start (inclusive) to end
	// (exclusive). Similar to Clear, this method actually removes entries from
	// the storage engine.
	//
	// Note that when used on batches, subsequent reads may not reflect the result
	// of the ClearRange.
	//
	// It is safe to modify the contents of the arguments after ClearRange
	// returns.
	//
	// TODO(peter): Most callers want to pass roachpb.Key, except for
	// MVCCClearTimeRange. That function actually does want to clear records
	// between specific versions.
	ClearRange(start, end MVCCKey) error
	// ClearIterRange removes a set of entries, from start (inclusive) to end
	// (exclusive). Similar to Clear and ClearRange, this method actually removes
	// entries from the storage engine. Unlike ClearRange, the entries to remove
	// are determined by iterating over iter and per-key tombstones are
	// generated.
	//
	// It is safe to modify the contents of the arguments after ClearIterRange
	// returns.
	ClearIterRange(iter Iterator, start, end roachpb.Key) error
	// Merge is a high-performance write operation used for values which are
	// accumulated over several writes. Multiple values can be merged
	// sequentially into a single key; a subsequent read will return a "merged"
	// value which is computed from the original merged values.
	//
	// Merge currently provides specialized behavior for three data types:
	// integers, byte slices, and time series observations. Merged integers are
	// summed, acting as a high-performance accumulator.  Byte slices are simply
	// concatenated in the order they are merged. Time series observations
	// (stored as byte slices with a special tag on the roachpb.Value) are
	// combined with specialized logic beyond that of simple byte slices.
	//
	// The logic for merges is written in db.cc in order to be compatible with
	// RocksDB.
	//
	// It is safe to modify the contents of the arguments after Merge returns.
	Merge(key MVCCKey, value []byte) error
	// Put sets the given key to the value provided.
	//
	// It is safe to modify the contents of the arguments after Put returns.
	Put(key MVCCKey, value []byte) error
	// LogData adds the specified data to the RocksDB WAL. The data is
	// uninterpreted by RocksDB (i.e. not added to the memtable or sstables).
	//
	// It is safe to modify the contents of the arguments after LogData returns.
	LogData(data []byte) error
	// LogLogicalOp logs the specified logical mvcc operation with the provided
	// details to the writer, if it has logical op logging enabled. For most
	// Writer implementations, this is a no-op.
	LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails)
}
   326  
// ReadWriter is the read/write interface to an engine's data. It is the
// union of the Reader and Writer method sets and adds no methods of its own.
type ReadWriter interface {
	Reader
	Writer
}
   332  
// Engine is the interface that wraps the core operations of a key/value store.
// In addition to the ReadWriter methods it exposes maintenance operations
// (compaction, flushing, stats), constructors for batches and snapshots, and
// filesystem access through the embedded fs.FS.
type Engine interface {
	ReadWriter
	// Attrs returns the engine/store attributes.
	Attrs() roachpb.Attributes
	// Capacity returns capacity details for the engine's available storage.
	Capacity() (roachpb.StoreCapacity, error)
	// Compact forces compaction over the entire database.
	Compact() error
	// Flush causes the engine to write all in-memory data to disk
	// immediately.
	Flush() error
	// GetSSTables retrieves metadata about this engine's live sstables.
	GetSSTables() SSTableInfos
	// GetCompactionStats returns the internal RocksDB compaction stats. See
	// https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide#rocksdb-statistics.
	GetCompactionStats() string
	// GetStats retrieves stats from the engine.
	GetStats() (*Stats, error)
	// GetEncryptionRegistries returns the file and key registries when encryption is enabled
	// on the store.
	GetEncryptionRegistries() (*EncryptionRegistries, error)
	// GetEnvStats retrieves stats about the engine's environment.
	// For RocksDB, this includes details of at-rest encryption.
	GetEnvStats() (*EnvStats, error)
	// GetAuxiliaryDir returns a path under which files can be stored
	// persistently, and from which data can be ingested by the engine.
	//
	// Not thread safe.
	GetAuxiliaryDir() string
	// NewBatch returns a new instance of a batched engine which wraps
	// this engine. Batched engines accumulate all mutations and apply
	// them atomically on a call to Commit().
	NewBatch() Batch
	// NewReadOnly returns a new instance of a ReadWriter that wraps this
	// engine. This wrapper panics when unexpected operations (e.g., write
	// operations) are executed on it and caches iterators to avoid the overhead
	// of creating multiple iterators for batched reads.
	//
	// All iterators created from a read-only engine with the same "Prefix"
	// option are guaranteed to provide a consistent snapshot of the underlying
	// engine. For instance, two prefix iterators created from a read-only
	// engine will provide a consistent snapshot. Similarly, two non-prefix
	// iterators created from a read-only engine will provide a consistent
	// snapshot. However, a prefix iterator and a non-prefix iterator created
	// from a read-only engine are not guaranteed to provide a consistent view
	// of the underlying engine.
	//
	// TODO(nvanbenschoten): remove this complexity when we're fully on Pebble
	// and can guarantee that all iterators created from a read-only engine are
	// consistent. To do this, we will want to add an Iterator.Clone method.
	NewReadOnly() ReadWriter
	// NewWriteOnlyBatch returns a new instance of a batched engine which wraps
	// this engine. A write-only batch accumulates all mutations and applies them
	// atomically on a call to Commit(). Read operations return an error.
	//
	// Note that a distinct write-only batch allows reads. Distinct batches are a
	// means of indicating that the user does not need to read its own writes.
	//
	// TODO(peter): This should return a WriteBatch interface, but there are mild
	// complications in both defining that interface and implementing it. In
	// particular, Batch.Close would no longer come from Reader and we'd need to
	// refactor a bunch of code in rocksDBBatch.
	NewWriteOnlyBatch() Batch
	// NewSnapshot returns a new instance of a read-only snapshot
	// engine. Snapshots are instantaneous and, as long as they're
	// released relatively quickly, inexpensive. Snapshots are released
	// by invoking Close(). Note that snapshots must not be used after the
	// original engine has been stopped.
	NewSnapshot() Reader
	// Type returns engine type.
	Type() enginepb.EngineType
	// IngestExternalFiles atomically links a slice of files into the RocksDB
	// log-structured merge-tree.
	IngestExternalFiles(ctx context.Context, paths []string) error
	// PreIngestDelay offers an engine the chance to backpressure ingestions.
	// When called, it may choose to block if the engine determines that it is in
	// or approaching a state where further ingestions may risk its health.
	PreIngestDelay(ctx context.Context)
	// ApproximateDiskBytes returns an approximation of the on-disk size for the given key span.
	ApproximateDiskBytes(from, to roachpb.Key) (uint64, error)
	// CompactRange ensures that the specified range of key value pairs is
	// optimized for space efficiency. The forceBottommost parameter ensures
	// that the key range is compacted all the way to the bottommost level of
	// SSTables, which is necessary to pick up changes to bloom filters.
	CompactRange(start, end roachpb.Key, forceBottommost bool) error
	// InMem returns true if the receiver is an in-memory engine and false
	// otherwise.
	//
	// TODO(peter): This is a bit of a wart in the interface. It is used by
	// addSSTablePreApply to select alternate code paths, but really there should
	// be a unified code path there.
	InMem() bool

	// Filesystem functionality.
	fs.FS
	// ReadFile reads the content from the file with the given filename in this RocksDB's env.
	ReadFile(filename string) ([]byte, error)
	// WriteFile writes data to a file in this RocksDB's env.
	WriteFile(filename string, data []byte) error
	// CreateCheckpoint creates a checkpoint of the engine in the given directory,
	// which must not exist. The directory should be on the same file system so
	// that hard links can be used.
	CreateCheckpoint(dir string) error
}
   438  
// Batch is the interface for batch specific operations.
type Batch interface {
	ReadWriter
	// Commit atomically applies any batched updates to the underlying
	// engine. This is a noop unless the batch was created via NewBatch(). If
	// sync is true, the batch is synchronously committed to disk.
	Commit(sync bool) error
	// Distinct returns a view of the existing batch which only sees writes that
	// were performed before the Distinct batch was created. That is, the
	// returned batch will not read its own writes, but it will read writes to
	// the parent batch performed before the call to Distinct(), except if the
	// parent batch is a WriteOnlyBatch, in which case the Distinct() batch will
	// read from the underlying engine.
	//
	// The returned batch needs to be closed before using the parent batch
	// again. This is used as an optimization to avoid flushing mutations
	// buffered by the batch in situations where we know all of the batched
	// operations are for distinct keys.
	//
	// TODO(tbg): it seems insane that you cannot read from a WriteOnlyBatch but
	// you can read from a Distinct on top of a WriteOnlyBatch but randomly don't
	// see the batch at all. I was personally just bitten by this.
	//
	// TODO(itsbilal): Improve comments around how/why distinct batches are an
	// optimization in the rocksdb write path.
	Distinct() ReadWriter
	// Empty returns whether the batch has been written to or not.
	Empty() bool
	// Len returns the size of the underlying representation of the batch.
	// Because of the batch header, the size of the batch is never 0 and should
	// not be used interchangeably with Empty. The method avoids the memory copy
	// that Repr imposes, but it still may require flushing the batch's mutations.
	Len() int
	// Repr returns the underlying representation of the batch and can be used to
	// reconstitute the batch on a remote node using Writer.ApplyBatchRepr().
	Repr() []byte
}
   477  
// Stats is a set of Engine stats, as returned by Engine.GetStats. Most are
// described in RocksDB. Some stats (e.g., `IngestedBytes`) are only exposed
// by Pebble.
//
// Currently, we collect stats from the following sources:
// 1. RocksDB's internal "tickers" (i.e. counters). They're defined in
//    rocksdb/statistics.h
// 2. DBEventListener, which implements RocksDB's EventListener interface.
// 3. rocksdb::DB::GetProperty().
//
// This is a good resource describing RocksDB's memory-related stats:
// https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB
type Stats struct {
	BlockCacheHits                 int64
	BlockCacheMisses               int64
	BlockCacheUsage                int64
	BlockCachePinnedUsage          int64
	BloomFilterPrefixChecked       int64
	BloomFilterPrefixUseful        int64
	MemtableTotalSize              int64
	Flushes                        int64
	FlushedBytes                   int64
	Compactions                    int64
	IngestedBytes                  int64 // Pebble only
	CompactedBytesRead             int64
	CompactedBytesWritten          int64
	TableReadersMemEstimate        int64
	PendingCompactionBytesEstimate int64
	L0FileCount                    int64
}
   507  
// EnvStats is a set of RocksDB env stats, including encryption status. It is
// the value returned by Engine.GetEnvStats.
type EnvStats struct {
	// TotalFiles is the total number of files reported by rocksdb.
	TotalFiles uint64
	// TotalBytes is the total size of files reported by rocksdb.
	TotalBytes uint64
	// ActiveKeyFiles is the number of files using the active data key.
	ActiveKeyFiles uint64
	// ActiveKeyBytes is the size of files using the active data key.
	ActiveKeyBytes uint64
	// EncryptionType is an enum describing the active encryption algorithm.
	// See: ccl/storageccl/engineccl/enginepbccl/key_registry.proto
	EncryptionType int32
	// EncryptionStatus is a serialized enginepbccl/stats.proto::EncryptionStatus protobuf.
	EncryptionStatus []byte
}
   524  
// EncryptionRegistries contains the encryption-related registries, as
// returned by Engine.GetEncryptionRegistries. Both fields are serialized
// protobufs.
type EncryptionRegistries struct {
	// FileRegistry is the list of files with encryption status.
	// serialized storage/engine/enginepb/file_registry.proto::FileRegistry
	FileRegistry []byte
	// KeyRegistry is the list of keys, scrubbed of actual key data.
	// serialized ccl/storageccl/engineccl/enginepbccl/key_registry.proto::DataKeysRegistry
	KeyRegistry []byte
}
   535  
   536  // NewEngine creates a new storage engine.
   537  func NewEngine(
   538  	engine enginepb.EngineType, cacheSize int64, storageConfig base.StorageConfig,
   539  ) (Engine, error) {
   540  	switch engine {
   541  	case enginepb.EngineTypeTeePebbleRocksDB:
   542  		pebbleConfig := PebbleConfig{
   543  			StorageConfig: storageConfig,
   544  			Opts:          DefaultPebbleOptions(),
   545  		}
   546  		pebbleConfig.Opts.Cache = pebble.NewCache(cacheSize)
   547  		defer pebbleConfig.Opts.Cache.Unref()
   548  
   549  		pebbleConfig.Dir = filepath.Join(pebbleConfig.Dir, "pebble")
   550  		cache := NewRocksDBCache(cacheSize)
   551  		defer cache.Release()
   552  
   553  		ctx := context.Background()
   554  		pebbleDB, err := NewPebble(ctx, pebbleConfig)
   555  		if err != nil {
   556  			return nil, err
   557  		}
   558  
   559  		rocksDBConfig := RocksDBConfig{StorageConfig: storageConfig}
   560  		rocksDBConfig.Dir = filepath.Join(rocksDBConfig.Dir, "rocksdb")
   561  		rocksDB, err := NewRocksDB(rocksDBConfig, cache)
   562  		if err != nil {
   563  			return nil, err
   564  		}
   565  
   566  		return NewTee(ctx, rocksDB, pebbleDB), nil
   567  	case enginepb.EngineTypeDefault, enginepb.EngineTypePebble:
   568  		pebbleConfig := PebbleConfig{
   569  			StorageConfig: storageConfig,
   570  			Opts:          DefaultPebbleOptions(),
   571  		}
   572  		pebbleConfig.Opts.Cache = pebble.NewCache(cacheSize)
   573  		defer pebbleConfig.Opts.Cache.Unref()
   574  
   575  		return NewPebble(context.Background(), pebbleConfig)
   576  	case enginepb.EngineTypeRocksDB:
   577  		cache := NewRocksDBCache(cacheSize)
   578  		defer cache.Release()
   579  
   580  		return NewRocksDB(
   581  			RocksDBConfig{StorageConfig: storageConfig},
   582  			cache)
   583  	}
   584  	panic(fmt.Sprintf("unknown engine type: %d", engine))
   585  }
   586  
   587  // NewDefaultEngine allocates and returns a new, opened engine with the default configuration.
   588  // The caller must call the engine's Close method when the engine is no longer needed.
   589  func NewDefaultEngine(cacheSize int64, storageConfig base.StorageConfig) (Engine, error) {
   590  	return NewEngine(DefaultStorageEngine, cacheSize, storageConfig)
   591  }
   592  
   593  // PutProto sets the given key to the protobuf-serialized byte string
   594  // of msg and the provided timestamp. Returns the length in bytes of
   595  // key and the value.
   596  //
   597  // Deprecated: use MVCCPutProto instead.
   598  func PutProto(
   599  	writer Writer, key MVCCKey, msg protoutil.Message,
   600  ) (keyBytes, valBytes int64, err error) {
   601  	bytes, err := protoutil.Marshal(msg)
   602  	if err != nil {
   603  		return 0, 0, err
   604  	}
   605  
   606  	if err := writer.Put(key, bytes); err != nil {
   607  		return 0, 0, err
   608  	}
   609  
   610  	return int64(key.EncodedSize()), int64(len(bytes)), nil
   611  }
   612  
   613  // Scan returns up to max key/value objects starting from
   614  // start (inclusive) and ending at end (non-inclusive).
   615  // Specify max=0 for unbounded scans.
   616  func Scan(reader Reader, start, end roachpb.Key, max int64) ([]MVCCKeyValue, error) {
   617  	var kvs []MVCCKeyValue
   618  	err := reader.Iterate(start, end, func(kv MVCCKeyValue) (bool, error) {
   619  		if max != 0 && int64(len(kvs)) >= max {
   620  			return true, nil
   621  		}
   622  		kvs = append(kvs, kv)
   623  		return false, nil
   624  	})
   625  	return kvs, err
   626  }
   627  
   628  // WriteSyncNoop carries out a synchronous no-op write to the engine.
   629  func WriteSyncNoop(ctx context.Context, eng Engine) error {
   630  	batch := eng.NewBatch()
   631  	defer batch.Close()
   632  
   633  	if err := batch.LogData(nil); err != nil {
   634  		return err
   635  	}
   636  
   637  	if err := batch.Commit(true /* sync */); err != nil {
   638  		return err
   639  	}
   640  	return nil
   641  }
   642  
   643  // ClearRangeWithHeuristic clears the keys from start (inclusive) to end
   644  // (exclusive). Depending on the number of keys, it will either use ClearRange
   645  // or ClearIterRange.
   646  func ClearRangeWithHeuristic(reader Reader, writer Writer, start, end roachpb.Key) error {
   647  	iter := reader.NewIterator(IterOptions{UpperBound: end})
   648  	defer iter.Close()
   649  
   650  	// It is expensive for there to be many range deletion tombstones in the same
   651  	// sstable because all of the tombstones in an sstable are loaded whenever the
   652  	// sstable is accessed. So we avoid using range deletion unless there is some
   653  	// minimum number of keys. The value here was pulled out of thin air. It might
   654  	// be better to make this dependent on the size of the data being deleted. Or
   655  	// perhaps we should fix RocksDB to handle large numbers of tombstones in an
   656  	// sstable better.
   657  	const clearRangeMinKeys = 64
   658  	// Peek into the range to see whether it's large enough to justify
   659  	// ClearRange. Note that the work done here is bounded by
   660  	// clearRangeMinKeys, so it will be fairly cheap even for large
   661  	// ranges.
   662  	//
   663  	// TODO(bdarnell): Move this into ClearIterRange so we don't have
   664  	// to do this scan twice.
   665  	count := 0
   666  	iter.SeekGE(MakeMVCCMetadataKey(start))
   667  	for {
   668  		valid, err := iter.Valid()
   669  		if err != nil {
   670  			return err
   671  		}
   672  		if !valid {
   673  			break
   674  		}
   675  		count++
   676  		if count > clearRangeMinKeys {
   677  			break
   678  		}
   679  		iter.Next()
   680  	}
   681  	var err error
   682  	if count > clearRangeMinKeys {
   683  		err = writer.ClearRange(MakeMVCCMetadataKey(start), MakeMVCCMetadataKey(end))
   684  	} else {
   685  		err = writer.ClearIterRange(iter, start, end)
   686  	}
   687  	if err != nil {
   688  		return err
   689  	}
   690  	return nil
   691  }
   692  
   693  var ingestDelayL0Threshold = settings.RegisterIntSetting(
   694  	"rocksdb.ingest_backpressure.l0_file_count_threshold",
   695  	"number of L0 files after which to backpressure SST ingestions",
   696  	20,
   697  )
   698  
   699  var ingestDelayTime = settings.RegisterDurationSetting(
   700  	"rocksdb.ingest_backpressure.max_delay",
   701  	"maximum amount of time to backpressure a single SST ingestion",
   702  	time.Second*5,
   703  )
   704  
   705  // PreIngestDelay may choose to block for some duration if L0 has an excessive
   706  // number of files in it or if PendingCompactionBytesEstimate is elevated. This
   707  // it is intended to be called before ingesting a new SST, since we'd rather
   708  // backpressure the bulk operation adding SSTs than slow down the whole RocksDB
   709  // instance and impact all forground traffic by adding too many files to it.
   710  // After the number of L0 files exceeds the configured limit, it gradually
   711  // begins delaying more for each additional file in L0 over the limit until
   712  // hitting its configured (via settings) maximum delay. If the pending
   713  // compaction limit is exceeded, it waits for the maximum delay.
   714  func preIngestDelay(ctx context.Context, eng Engine, settings *cluster.Settings) {
   715  	if settings == nil {
   716  		return
   717  	}
   718  	stats, err := eng.GetStats()
   719  	if err != nil {
   720  		log.Warningf(ctx, "failed to read stats: %+v", err)
   721  		return
   722  	}
   723  	targetDelay := calculatePreIngestDelay(settings, stats)
   724  
   725  	if targetDelay == 0 {
   726  		return
   727  	}
   728  	log.VEventf(ctx, 2, "delaying SST ingestion %s. %d L0 files", targetDelay, stats.L0FileCount)
   729  
   730  	select {
   731  	case <-time.After(targetDelay):
   732  	case <-ctx.Done():
   733  	}
   734  }
   735  
   736  func calculatePreIngestDelay(settings *cluster.Settings, stats *Stats) time.Duration {
   737  	maxDelay := ingestDelayTime.Get(&settings.SV)
   738  	l0Filelimit := ingestDelayL0Threshold.Get(&settings.SV)
   739  
   740  	const ramp = 10
   741  	if stats.L0FileCount > l0Filelimit {
   742  		delayPerFile := maxDelay / time.Duration(ramp)
   743  		targetDelay := time.Duration(stats.L0FileCount-l0Filelimit) * delayPerFile
   744  		if targetDelay > maxDelay {
   745  			return maxDelay
   746  		}
   747  		return targetDelay
   748  	}
   749  	return 0
   750  }
   751  
   752  // Helper function to implement Reader.Iterate().
   753  func iterateOnReader(
   754  	reader Reader, start, end roachpb.Key, f func(MVCCKeyValue) (stop bool, err error),
   755  ) error {
   756  	if reader.Closed() {
   757  		return errors.New("cannot call Iterate on a closed batch")
   758  	}
   759  	if start.Compare(end) >= 0 {
   760  		return nil
   761  	}
   762  
   763  	it := reader.NewIterator(IterOptions{UpperBound: end})
   764  	defer it.Close()
   765  
   766  	it.SeekGE(MakeMVCCMetadataKey(start))
   767  	for ; ; it.Next() {
   768  		ok, err := it.Valid()
   769  		if err != nil {
   770  			return err
   771  		} else if !ok {
   772  			break
   773  		}
   774  		if done, err := f(MVCCKeyValue{Key: it.Key(), Value: it.Value()}); done || err != nil {
   775  			return err
   776  		}
   777  	}
   778  	return nil
   779  }