github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/pebble.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package storage
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"io"
    18  	"io/ioutil"
    19  	"os"
    20  	"sort"
    21  	"strconv"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/cockroachdb/cockroach/pkg/base"
    26  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    27  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    28  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    29  	"github.com/cockroachdb/cockroach/pkg/storage/fs"
    30  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    31  	"github.com/cockroachdb/cockroach/pkg/util/log"
    32  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    33  	"github.com/cockroachdb/errors"
    34  	"github.com/cockroachdb/logtags"
    35  	"github.com/cockroachdb/pebble"
    36  	"github.com/cockroachdb/pebble/bloom"
    37  	"github.com/cockroachdb/pebble/vfs"
    38  )
    39  
    40  // MVCCKeyCompare compares cockroach keys, including the MVCC timestamps.
    41  func MVCCKeyCompare(a, b []byte) int {
    42  	// NB: For performance, this routine manually splits the key into the
    43  	// user-key and timestamp components rather than using SplitMVCCKey. Don't
    44  	// try this at home, kids: use SplitMVCCKey.
    45  
    46  	aEnd := len(a) - 1
    47  	bEnd := len(b) - 1
    48  	if aEnd < 0 || bEnd < 0 {
    49  		// This should never happen unless there is some sort of corruption of
    50  		// the keys. This is a little bizarre, but the behavior exactly matches
    51  		// engine/db.cc:DBComparator.
    52  		return bytes.Compare(a, b)
    53  	}
    54  
    55  	// Compute the index of the separator between the key and the timestamp.
    56  	aSep := aEnd - int(a[aEnd])
    57  	bSep := bEnd - int(b[bEnd])
    58  	if aSep < 0 || bSep < 0 {
    59  		// This should never happen unless there is some sort of corruption of
    60  		// the keys. This is a little bizarre, but the behavior exactly matches
    61  		// engine/db.cc:DBComparator.
    62  		return bytes.Compare(a, b)
    63  	}
    64  
    65  	// Compare the "user key" part of the key.
    66  	if c := bytes.Compare(a[:aSep], b[:bSep]); c != 0 {
    67  		return c
    68  	}
    69  
    70  	// Compare the timestamp part of the key.
    71  	aTS := a[aSep:aEnd]
    72  	bTS := b[bSep:bEnd]
    73  	if len(aTS) == 0 {
    74  		if len(bTS) == 0 {
    75  			return 0
    76  		}
    77  		return -1
    78  	} else if len(bTS) == 0 {
    79  		return 1
    80  	}
    81  	return bytes.Compare(bTS, aTS)
    82  }
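
// exampleMVCCKeyOrdering is a minimal illustrative sketch (the function name
// and the key values are made up) of the ordering produced by MVCCKeyCompare:
// for a given user key, the bare (timestamp-less) key sorts first, followed by
// versions in descending timestamp order, because the timestamp bytes are
// compared in reverse above.
func exampleMVCCKeyOrdering() {
	k := roachpb.Key("a")
	bare := EncodeKey(MVCCKey{Key: k})                                      // a with no timestamp
	t1 := EncodeKey(MVCCKey{Key: k, Timestamp: hlc.Timestamp{WallTime: 1}}) // a@1
	t2 := EncodeKey(MVCCKey{Key: k, Timestamp: hlc.Timestamp{WallTime: 2}}) // a@2
	fmt.Println(MVCCKeyCompare(bare, t2) < 0) // true: the bare key sorts first
	fmt.Println(MVCCKeyCompare(t2, t1) < 0)   // true: newer versions sort before older ones
}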
    83  
    84  // MVCCComparer is a pebble.Comparer object that implements MVCC-specific
    85  // comparator settings for use with Pebble.
    86  var MVCCComparer = &pebble.Comparer{
    87  	Compare: MVCCKeyCompare,
    88  
    89  	AbbreviatedKey: func(k []byte) uint64 {
    90  		key, _, ok := enginepb.SplitMVCCKey(k)
    91  		if !ok {
    92  			return 0
    93  		}
    94  		return pebble.DefaultComparer.AbbreviatedKey(key)
    95  	},
    96  
    97  	FormatKey: func(k []byte) fmt.Formatter {
    98  		decoded, err := DecodeMVCCKey(k)
    99  		if err != nil {
   100  			return mvccKeyFormatter{err: err}
   101  		}
   102  		return mvccKeyFormatter{key: decoded}
   103  	},
   104  
   105  	Separator: func(dst, a, b []byte) []byte {
   106  		aKey, _, ok := enginepb.SplitMVCCKey(a)
   107  		if !ok {
   108  			return append(dst, a...)
   109  		}
   110  		bKey, _, ok := enginepb.SplitMVCCKey(b)
   111  		if !ok {
   112  			return append(dst, a...)
   113  		}
   114  	// If the keys are the same, just return a.
   115  		if bytes.Equal(aKey, bKey) {
   116  			return append(dst, a...)
   117  		}
   118  		n := len(dst)
   119  		// MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as
   120  		// pebble.DefaultComparer, so reuse the latter's Separator implementation.
   121  		dst = pebble.DefaultComparer.Separator(dst, aKey, bKey)
   122  	// Did it pick a separator different from aKey? If it did not, we can't do better than a.
   123  		buf := dst[n:]
   124  		if bytes.Equal(aKey, buf) {
   125  			return append(dst[:n], a...)
   126  		}
   127  		// The separator is > aKey, so we only need to add the timestamp sentinel.
   128  		return append(dst, 0)
   129  	},
   130  
   131  	Successor: func(dst, a []byte) []byte {
   132  		aKey, _, ok := enginepb.SplitMVCCKey(a)
   133  		if !ok {
   134  			return append(dst, a...)
   135  		}
   136  		n := len(dst)
   137  		// MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as
   138  		// pebble.DefaultComparer, so reuse the latter's Successor implementation.
   139  		dst = pebble.DefaultComparer.Successor(dst, aKey)
   140  	// Did it pick a successor different from aKey? If it did not, we can't do better than a.
   141  		buf := dst[n:]
   142  		if bytes.Equal(aKey, buf) {
   143  			return append(dst[:n], a...)
   144  		}
   145  		// The successor is > aKey, so we only need to add the timestamp sentinel.
   146  		return append(dst, 0)
   147  	},
   148  
   149  	Split: func(k []byte) int {
   150  		key, _, ok := enginepb.SplitMVCCKey(k)
   151  		if !ok {
   152  			return len(k)
   153  		}
   154  		// This matches the behavior of libroach/KeyPrefix. RocksDB requires that
   155  		// keys generated via a SliceTransform be comparable with normal encoded
   156  		// MVCC keys. Encoded MVCC keys have a suffix indicating the number of
   157  		// bytes of timestamp data. MVCC keys without a timestamp have a suffix of
   158  		// 0. We're careful in EncodeKey to make sure that the user-key always has
   159  		// a trailing 0. If there is no timestamp this falls out naturally. If
   160  		// there is a timestamp we prepend a 0 to the encoded timestamp data.
   161  		return len(key) + 1
   162  	},
   163  
   164  	Name: "cockroach_comparator",
   165  }
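
// exampleMVCCSplit is a small illustrative sketch (the name and value are
// arbitrary) of the Split contract described above: the returned prefix length
// covers the user key plus the trailing 0x00 sentinel, keeping prefix bloom
// filter keys comparable with fully encoded MVCC keys.
func exampleMVCCSplit() []byte {
	ek := EncodeKey(MVCCKey{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 1}})
	n := MVCCComparer.Split(ek)
	return ek[:n] // "a" followed by the 0x00 sentinel
}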
   166  
   167  // MVCCMerger is a pebble.Merger object that implements the merge operator used
   168  // by Cockroach.
   169  var MVCCMerger = &pebble.Merger{
   170  	Name: "cockroach_merge_operator",
   171  	Merge: func(_, value []byte) (pebble.ValueMerger, error) {
   172  		res := &MVCCValueMerger{}
   173  		err := res.MergeNewer(value)
   174  		if err != nil {
   175  			return nil, err
   176  		}
   177  		return res, nil
   178  	},
   179  }
   180  
   181  // pebbleTimeBoundPropCollector implements a property collector for MVCC
   182  // Timestamps. Its behavior matches TimeBoundTblPropCollector in
   183  // table_props.cc.
   184  //
   185  // The handling of timestamps in intents is mildly complicated. Consider:
   186  //
   187  //   a@<meta>   -> <MVCCMetadata: Timestamp=t2>
   188  //   a@t2       -> <value>
   189  //   a@t1       -> <value>
   190  //
   191  // The metadata record (a.k.a. the intent) for a key always sorts first. The
   192  // timestamp field always points to the next record. In this case, the meta
   193  // record contains t2 and the next record is t2. Because of this duplication of
   194  // the timestamp both in the intent and in the timestamped record that
   195  // immediately follows it, we only need to unmarshal the MVCCMetadata if it is
   196  // the last key in the sstable.
   197  type pebbleTimeBoundPropCollector struct {
   198  	min, max  []byte
   199  	lastValue []byte
   200  }
   201  
   202  func (t *pebbleTimeBoundPropCollector) Add(key pebble.InternalKey, value []byte) error {
   203  	_, ts, ok := enginepb.SplitMVCCKey(key.UserKey)
   204  	if !ok {
   205  		return errors.Errorf("failed to split MVCC key")
   206  	}
   207  	if len(ts) > 0 {
   208  		t.lastValue = t.lastValue[:0]
   209  		t.updateBounds(ts)
   210  	} else {
   211  		t.lastValue = append(t.lastValue[:0], value...)
   212  	}
   213  	return nil
   214  }
   215  
   216  func (t *pebbleTimeBoundPropCollector) Finish(userProps map[string]string) error {
   217  	if len(t.lastValue) > 0 {
   218  		// The last record in the sstable was an intent. Unmarshal the metadata and
   219  		// update the bounds with the timestamp it contains.
   220  		meta := &enginepb.MVCCMetadata{}
   221  		if err := protoutil.Unmarshal(t.lastValue, meta); err != nil {
   222  			// We're unable to parse the MVCCMetadata. Fail open by not setting the
   223  			// min/max timestamp properties. This mimics the behavior of
   224  			// TimeBoundTblPropCollector.
   225  			// TODO(petermattis): Return the error here and in C++, see #43422.
   226  			return nil //nolint:returnerrcheck
   227  		}
   228  		if meta.Txn != nil {
   229  			ts := encodeTimestamp(hlc.Timestamp(meta.Timestamp))
   230  			t.updateBounds(ts)
   231  		}
   232  	}
   233  
   234  	userProps["crdb.ts.min"] = string(t.min)
   235  	userProps["crdb.ts.max"] = string(t.max)
   236  	return nil
   237  }
   238  
   239  func (t *pebbleTimeBoundPropCollector) updateBounds(ts []byte) {
   240  	if len(t.min) == 0 || bytes.Compare(ts, t.min) < 0 {
   241  		t.min = append(t.min[:0], ts...)
   242  	}
   243  	if len(t.max) == 0 || bytes.Compare(ts, t.max) > 0 {
   244  		t.max = append(t.max[:0], ts...)
   245  	}
   246  }
   247  
   248  func (t *pebbleTimeBoundPropCollector) Name() string {
   249  	// This constant needs to match the one used by the RocksDB version of this
   250  	// table property collector. DO NOT CHANGE.
   251  	return "TimeBoundTblPropCollectorFactory"
   252  }
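
// exampleTimeBoundIter is an illustrative sketch of how the crdb.ts.min/max
// properties written above get used: a caller requests a time-bound iterator
// by setting timestamp hints on IterOptions, which allows sstables whose
// timestamp range falls entirely outside the hints to be skipped. Only
// MinTimestampHint is referenced elsewhere in this file; the other field names
// are assumed from the rest of this package.
func exampleTimeBoundIter(eng Engine, start, end roachpb.Key, since, until hlc.Timestamp) Iterator {
	return eng.NewIterator(IterOptions{
		LowerBound:       start,
		UpperBound:       end,
		MinTimestampHint: since,
		MaxTimestampHint: until,
	})
}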
   253  
   254  var _ pebble.NeedCompacter = &pebbleDeleteRangeCollector{}
   255  
   256  // pebbleDeleteRangeCollector marks an sstable that contains a range
   257  // tombstone as requiring a compaction.
   258  type pebbleDeleteRangeCollector struct {
   259  	numRangeTombstones int
   260  }
   261  
   262  func (c *pebbleDeleteRangeCollector) Add(key pebble.InternalKey, value []byte) error {
   263  	if key.Kind() == pebble.InternalKeyKindRangeDelete {
   264  		c.numRangeTombstones++
   265  	}
   266  	return nil
   267  }
   268  
   269  // NeedCompact implements the pebble.NeedCompacter interface.
   270  func (c *pebbleDeleteRangeCollector) NeedCompact() bool {
   271  	// NB: Mark any file containing range deletions as requiring a
   272  	// compaction. This ensures that range deletions are quickly compacted out
   273  	// of existence.
   274  	return c.numRangeTombstones > 0
   275  }
   276  
   277  func (*pebbleDeleteRangeCollector) Finish(userProps map[string]string) error {
   278  	return nil
   279  }
   280  
   281  func (*pebbleDeleteRangeCollector) Name() string {
   282  	// This constant needs to match the one used by the RocksDB version of this
   283  	// table property collector. DO NOT CHANGE.
   284  	return "DeleteRangeTblPropCollectorFactory"
   285  }
   286  
   287  // PebbleTablePropertyCollectors is the list of Pebble TablePropertyCollectors.
   288  var PebbleTablePropertyCollectors = []func() pebble.TablePropertyCollector{
   289  	func() pebble.TablePropertyCollector { return &pebbleTimeBoundPropCollector{} },
   290  	func() pebble.TablePropertyCollector { return &pebbleDeleteRangeCollector{} },
   291  }
   292  
   293  // DefaultPebbleOptions returns the default pebble options.
   294  func DefaultPebbleOptions() *pebble.Options {
   295  	// In RocksDB, the concurrency setting corresponds to both flushes and
   296  	// compactions. In Pebble, there is always a slot for a flush, and
   297  	// compactions are counted separately.
   298  	maxConcurrentCompactions := rocksdbConcurrency - 1
   299  	if maxConcurrentCompactions < 1 {
   300  		maxConcurrentCompactions = 1
   301  	}
   302  
   303  	opts := &pebble.Options{
   304  		Comparer:                    MVCCComparer,
   305  		L0CompactionThreshold:       2,
   306  		L0StopWritesThreshold:       1000,
   307  		LBaseMaxBytes:               64 << 20, // 64 MB
   308  		Levels:                      make([]pebble.LevelOptions, 7),
   309  		MaxConcurrentCompactions:    maxConcurrentCompactions,
   310  		MemTableSize:                64 << 20, // 64 MB
   311  		MemTableStopWritesThreshold: 4,
   312  		Merger:                      MVCCMerger,
   313  		MinFlushRate:                4 << 20, // 4 MB/sec
   314  		TablePropertyCollectors:     PebbleTablePropertyCollectors,
   315  	}
   316  
   317  	for i := 0; i < len(opts.Levels); i++ {
   318  		l := &opts.Levels[i]
   319  		l.BlockSize = 32 << 10       // 32 KB
   320  		l.IndexBlockSize = 256 << 10 // 256 KB
   321  		l.FilterPolicy = bloom.FilterPolicy(10)
   322  		l.FilterType = pebble.TableFilter
   323  		if i > 0 {
   324  			l.TargetFileSize = opts.Levels[i-1].TargetFileSize * 2
   325  		}
   326  		l.EnsureDefaults()
   327  	}
   328  
   329  	// Do not create bloom filters for the last level (i.e. the largest level
   330  	// which contains data in the LSM store). This configuration reduces the size
   331  	// of the bloom filters by 10x. This is significant given that bloom filters
   332  	// require 1.25 bytes (10 bits) per key which can translate into gigabytes of
   333  	// memory given typical key and value sizes. The downside is that bloom
   334  	// filters will only be usable on the higher levels, but that seems
   335  	// acceptable. We typically see read amplification of 5-6x on clusters
   336  	// (i.e. there are 5-6 levels of sstables) which means we'll achieve 80-90%
   337  	// of the benefit of having bloom filters on every level for only 10% of the
   338  	// memory cost.
   339  	opts.Levels[6].FilterPolicy = nil
   340  	return opts
   341  }
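
// exampleTunedOptions is an illustrative sketch (the name and the specific
// values are arbitrary) of starting from DefaultPebbleOptions and adjusting a
// few knobs before handing the options to NewPebble.
func exampleTunedOptions(cacheSize int64) *pebble.Options {
	opts := DefaultPebbleOptions()
	opts.Cache = pebble.NewCache(cacheSize) // shared block cache; callers Unref it when done
	opts.MemTableSize = 128 << 20           // double the 64 MB default above
	opts.MaxConcurrentCompactions = 4
	return opts
}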
   342  
   343  var pebbleLog *log.SecondaryLogger
   344  
   345  // InitPebbleLogger initializes the logger to use for Pebble log messages. If
   346  // not called, WARNING, ERROR, and FATAL logs will be output to the normal
   347  // CockroachDB log. The caller is responsible for ensuring the
   348  // Close() method is eventually called on the new logger.
   349  func InitPebbleLogger(ctx context.Context) *log.SecondaryLogger {
   350  	pebbleLog = log.NewSecondaryLogger(ctx, nil, "pebble",
   351  		true /* enableGC */, false /* forceSyncWrites */, false /* enableMsgCount */)
   352  	return pebbleLog
   353  }
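
// examplePebbleLoggerSetup is an illustrative sketch of the contract above:
// initialize the secondary Pebble log once and make sure it is eventually
// closed.
func examplePebbleLoggerSetup(ctx context.Context) {
	l := InitPebbleLogger(ctx)
	defer l.Close()
	// Engines opened while the logger is live will route pebbleLogger.Infof
	// output to the secondary log rather than the main CockroachDB log.
}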
   354  
   355  type pebbleLogger struct {
   356  	ctx   context.Context
   357  	depth int
   358  }
   359  
   360  func (l pebbleLogger) Infof(format string, args ...interface{}) {
   361  	if pebbleLog != nil {
   362  		pebbleLog.LogfDepth(l.ctx, l.depth, format, args...)
   363  		return
   364  	}
   365  	// Only log INFO logs to the normal CockroachDB log at --v=3 and above.
   366  	if log.V(3) {
   367  		log.InfofDepth(l.ctx, l.depth, format, args...)
   368  	}
   369  }
   370  
   371  func (l pebbleLogger) Fatalf(format string, args ...interface{}) {
   372  	log.FatalfDepth(l.ctx, l.depth, format, args...)
   373  }
   374  
   375  // PebbleConfig holds all configuration parameters and knobs used in setting up
   376  // a new Pebble instance.
   377  type PebbleConfig struct {
   378  	// StorageConfig contains storage configs for all storage engines.
   379  	base.StorageConfig
   380  	// Pebble specific options.
   381  	Opts *pebble.Options
   382  }
   383  
   384  // EncryptionStatsHandler provides encryption related stats.
   385  type EncryptionStatsHandler interface {
   386  	// Returns a serialized enginepbccl.EncryptionStatus.
   387  	GetEncryptionStatus() ([]byte, error)
   388  	// Returns a serialized enginepbccl.DataKeysRegistry, scrubbed of key contents.
   389  	GetDataKeysRegistry() ([]byte, error)
   390  	// Returns the ID of the active data key, or "plain" if none.
   391  	GetActiveDataKeyID() (string, error)
   392  	// Returns the enum value of the encryption type.
   393  	GetActiveStoreKeyType() int32
   394  	// Returns the KeyID embedded in the serialized EncryptionSettings.
   395  	GetKeyIDFromSettings(settings []byte) (string, error)
   396  }
   397  
   398  // Pebble is a wrapper around a Pebble database instance.
   399  type Pebble struct {
   400  	db *pebble.DB
   401  
   402  	closed       bool
   403  	path         string
   404  	auxDir       string
   405  	maxSize      int64
   406  	attrs        roachpb.Attributes
   407  	settings     *cluster.Settings
   408  	statsHandler EncryptionStatsHandler
   409  	fileRegistry *PebbleFileRegistry
   410  
   411  	// Relevant options copied over from pebble.Options.
   412  	fs     vfs.FS
   413  	logger pebble.Logger
   414  }
   415  
   416  var _ Engine = &Pebble{}
   417  
   418  // NewEncryptedEnvFunc creates an encrypted environment and returns the vfs.FS to use for reading
   419  // and writing data. This should be initialized by calling engineccl.Init() before calling
   420  // NewPebble(). The optionBytes is a binary serialized baseccl.EncryptionOptions, so that non-CCL
   421  // code does not depend on CCL code.
   422  var NewEncryptedEnvFunc func(fs vfs.FS, fr *PebbleFileRegistry, dbDir string, readOnly bool, optionBytes []byte) (vfs.FS, EncryptionStatsHandler, error)
   423  
   424  // ResolveEncryptedEnvOptions fills in cfg.Opts.FS with an encrypted vfs if this
   425  // store has encryption-at-rest enabled. Also returns the associated file
   426  // registry and EncryptionStatsHandler.
   427  func ResolveEncryptedEnvOptions(
   428  	cfg *PebbleConfig,
   429  ) (*PebbleFileRegistry, EncryptionStatsHandler, error) {
   430  	fileRegistry := &PebbleFileRegistry{FS: cfg.Opts.FS, DBDir: cfg.Dir, ReadOnly: cfg.Opts.ReadOnly}
   431  	if cfg.UseFileRegistry {
   432  		if err := fileRegistry.Load(); err != nil {
   433  			return nil, nil, err
   434  		}
   435  	} else {
   436  		if err := fileRegistry.checkNoRegistryFile(); err != nil {
   437  			return nil, nil, fmt.Errorf("encryption was used on this store before, but no encryption flags " +
   438  				"specified. You need a CCL build and must fully specify the --enterprise-encryption flag")
   439  		}
   440  		fileRegistry = nil
   441  	}
   442  
   443  	var statsHandler EncryptionStatsHandler
   444  	if len(cfg.ExtraOptions) > 0 {
   445  		// Encryption is enabled.
   446  		if !cfg.UseFileRegistry {
   447  			return nil, nil, fmt.Errorf("file registry is needed to support encryption")
   448  		}
   449  		if NewEncryptedEnvFunc == nil {
   450  			return nil, nil, fmt.Errorf("encryption is enabled but no function to create the encrypted env")
   451  		}
   452  		var err error
   453  		cfg.Opts.FS, statsHandler, err =
   454  			NewEncryptedEnvFunc(cfg.Opts.FS, fileRegistry, cfg.Dir, cfg.Opts.ReadOnly, cfg.ExtraOptions)
   455  		if err != nil {
   456  			return nil, nil, err
   457  		}
   458  	}
   459  	return fileRegistry, statsHandler, nil
   460  }
   461  
   462  // NewPebble creates a new Pebble instance, at the specified path.
   463  func NewPebble(ctx context.Context, cfg PebbleConfig) (*Pebble, error) {
   464  	// pebble.Open also calls EnsureDefaults, but only after doing a clone. Call
   465  	// EnsureDefaults beforehand so we have a matching cfg here for when we save
   466  	// cfg.FS and cfg.ReadOnly later on.
   467  	cfg.Opts.EnsureDefaults()
   468  	cfg.Opts.ErrorIfNotExists = cfg.MustExist
   469  	if settings := cfg.Settings; settings != nil {
   470  		cfg.Opts.WALMinSyncInterval = func() time.Duration {
   471  			return minWALSyncInterval.Get(&settings.SV)
   472  		}
   473  	}
   474  
   475  	var auxDir string
   476  	if cfg.Dir == "" {
   477  		// TODO(peter): This is horribly hacky but matches what RocksDB does. For
   478  		// in-memory instances, we create an on-disk auxiliary directory. This is
   479  		// necessary because various tests expect the auxiliary directory to
   480  		// actually exist on disk even though they don't actually write files to
   481  		// the directory. See SSTSnapshotStorage for one example of this bad
   482  		// behavior.
   483  		var err error
   484  		auxDir, err = ioutil.TempDir(os.TempDir(), "cockroach-auxiliary")
   485  		if err != nil {
   486  			return nil, err
   487  		}
   488  	} else {
   489  		auxDir = cfg.Opts.FS.PathJoin(cfg.Dir, base.AuxiliaryDir)
   490  		if err := cfg.Opts.FS.MkdirAll(auxDir, 0755); err != nil {
   491  			return nil, err
   492  		}
   493  	}
   494  
   495  	fileRegistry, statsHandler, err := ResolveEncryptedEnvOptions(&cfg)
   496  	if err != nil {
   497  		return nil, err
   498  	}
   499  
   500  	// The context dance here is done so that we have a clean context without
   501  	// timeouts that has a copy of the log tags.
   502  	logCtx := logtags.WithTags(context.Background(), logtags.FromContext(ctx))
   503  	cfg.Opts.Logger = pebbleLogger{
   504  		ctx:   logCtx,
   505  		depth: 1,
   506  	}
   507  	cfg.Opts.EventListener = pebble.MakeLoggingEventListener(pebbleLogger{
   508  		ctx:   logCtx,
   509  		depth: 2, // skip over the EventListener stack frame
   510  	})
   511  
   512  	db, err := pebble.Open(cfg.StorageConfig.Dir, cfg.Opts)
   513  	if err != nil {
   514  		return nil, err
   515  	}
   516  
   517  	return &Pebble{
   518  		db:           db,
   519  		path:         cfg.Dir,
   520  		auxDir:       auxDir,
   521  		maxSize:      cfg.MaxSize,
   522  		attrs:        cfg.Attrs,
   523  		settings:     cfg.Settings,
   524  		statsHandler: statsHandler,
   525  		fileRegistry: fileRegistry,
   526  		fs:           cfg.Opts.FS,
   527  		logger:       cfg.Opts.Logger,
   528  	}, nil
   529  }
   530  
   531  func newTeeInMem(ctx context.Context, attrs roachpb.Attributes, cacheSize int64) *TeeEngine {
   532  	// Note that we use the same unmodified directories for both pebble and
   533  	// rocksdb. This is to make sure the file paths match up, and that we're
   534  	// able to write to both and ingest from both memory filesystems.
   535  	pebbleInMem := newPebbleInMem(ctx, attrs, cacheSize)
   536  	rocksDBInMem := newRocksDBInMem(attrs, cacheSize)
   537  	tee := NewTee(ctx, rocksDBInMem, pebbleInMem)
   538  	return tee
   539  }
   540  
   541  func newPebbleInMem(ctx context.Context, attrs roachpb.Attributes, cacheSize int64) *Pebble {
   542  	opts := DefaultPebbleOptions()
   543  	opts.Cache = pebble.NewCache(cacheSize)
   544  	defer opts.Cache.Unref()
   545  
   546  	opts.FS = vfs.NewMem()
   547  	db, err := NewPebble(
   548  		ctx,
   549  		PebbleConfig{
   550  			StorageConfig: base.StorageConfig{
   551  				Attrs: attrs,
   552  				// TODO(bdarnell): The hard-coded 512 MiB is wrong; see
   553  				// https://github.com/cockroachdb/cockroach/issues/16750
   554  				MaxSize: 512 << 20, /* 512 MiB */
   555  			},
   556  			Opts: opts,
   557  		})
   558  	if err != nil {
   559  		panic(err)
   560  	}
   561  	return db
   562  }
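
// exampleOpenOnDisk is an illustrative sketch (the name, cache size, and max
// size are arbitrary) mirroring newPebbleInMem above, but opening an on-disk
// engine rooted at dir and returning errors instead of panicking.
func exampleOpenOnDisk(ctx context.Context, dir string) (*Pebble, error) {
	opts := DefaultPebbleOptions()
	opts.Cache = pebble.NewCache(1 << 30) // 1 GiB block cache
	defer opts.Cache.Unref()
	return NewPebble(ctx, PebbleConfig{
		StorageConfig: base.StorageConfig{
			Dir:     dir,
			MaxSize: 512 << 20,
		},
		Opts: opts,
	})
}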
   563  
   564  func (p *Pebble) String() string {
   565  	dir := p.path
   566  	if dir == "" {
   567  		dir = "<in-mem>"
   568  	}
   569  	attrs := p.attrs.String()
   570  	if attrs == "" {
   571  		attrs = "<no-attributes>"
   572  	}
   573  	return fmt.Sprintf("%s=%s", attrs, dir)
   574  }
   575  
   576  // Close implements the Engine interface.
   577  func (p *Pebble) Close() {
   578  	if p.closed {
   579  		p.logger.Infof("closing an already-closed pebble instance")
   580  		return
   581  	}
   582  	p.closed = true
   583  
   584  	if p.path == "" {
   585  		// Remove the temporary directory when the engine is in-memory. This
   586  		// matches the RocksDB behavior.
   587  		//
   588  		// TODO(peter): The aux-dir shouldn't be on-disk for in-memory
   589  		// engines. This is just a wart that needs to be removed.
   590  		if err := os.RemoveAll(p.auxDir); err != nil {
   591  			p.logger.Infof("%v", err)
   592  		}
   593  	}
   594  
   595  	_ = p.db.Close()
   596  }
   597  
   598  // Closed implements the Engine interface.
   599  func (p *Pebble) Closed() bool {
   600  	return p.closed
   601  }
   602  
   603  // ExportToSst is part of the engine.Reader interface.
   604  func (p *Pebble) ExportToSst(
   605  	startKey, endKey roachpb.Key,
   606  	startTS, endTS hlc.Timestamp,
   607  	exportAllRevisions bool,
   608  	targetSize, maxSize uint64,
   609  	io IterOptions,
   610  ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) {
   611  	return pebbleExportToSst(p, startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io)
   612  }
   613  
   614  // Get implements the Engine interface.
   615  func (p *Pebble) Get(key MVCCKey) ([]byte, error) {
   616  	if len(key.Key) == 0 {
   617  		return nil, emptyKeyError()
   618  	}
   619  	ret, closer, err := p.db.Get(EncodeKey(key))
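	// The slice returned by Pebble is only valid until closer is closed, so
	// copy the value out before releasing it.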
   620  	if closer != nil {
   621  		retCopy := make([]byte, len(ret))
   622  		copy(retCopy, ret)
   623  		ret = retCopy
   624  		closer.Close()
   625  	}
   626  	if errors.Is(err, pebble.ErrNotFound) || len(ret) == 0 {
   627  		return nil, nil
   628  	}
   629  	return ret, err
   630  }
   631  
   632  // GetCompactionStats implements the Engine interface.
   633  func (p *Pebble) GetCompactionStats() string {
   634  	// NB: The initial blank line matches the formatting used by RocksDB and
   635  	// ensures that compaction stats display will not contain the log prefix
   636  	// (this method is only used for logging purposes).
   637  	return "\n" + p.db.Metrics().String()
   638  }
   639  
   640  // GetProto implements the Engine interface.
   641  func (p *Pebble) GetProto(
   642  	key MVCCKey, msg protoutil.Message,
   643  ) (ok bool, keyBytes, valBytes int64, err error) {
   644  	if len(key.Key) == 0 {
   645  		return false, 0, 0, emptyKeyError()
   646  	}
   647  	encodedKey := EncodeKey(key)
   648  	val, closer, err := p.db.Get(encodedKey)
   649  	if closer != nil {
   650  		if msg != nil {
   651  			err = protoutil.Unmarshal(val, msg)
   652  		}
   653  		keyBytes = int64(len(encodedKey))
   654  		valBytes = int64(len(val))
   655  		closer.Close()
   656  		return true, keyBytes, valBytes, err
   657  	}
   658  	if errors.Is(err, pebble.ErrNotFound) {
   659  		return false, 0, 0, nil
   660  	}
   661  	return false, 0, 0, err
   662  }
   663  
   664  // Iterate implements the Engine interface.
   665  func (p *Pebble) Iterate(
   666  	start, end roachpb.Key, f func(MVCCKeyValue) (stop bool, err error),
   667  ) error {
   668  	return iterateOnReader(p, start, end, f)
   669  }
   670  
   671  // NewIterator implements the Engine interface.
   672  func (p *Pebble) NewIterator(opts IterOptions) Iterator {
   673  	iter := newPebbleIterator(p.db, opts)
   674  	if iter == nil {
   675  		panic("couldn't create a new iterator")
   676  	}
   677  	return iter
   678  }
   679  
   680  // ApplyBatchRepr implements the Engine interface.
   681  func (p *Pebble) ApplyBatchRepr(repr []byte, sync bool) error {
   682  	// batch.SetRepr takes ownership of the underlying slice, so make a copy.
   683  	reprCopy := make([]byte, len(repr))
   684  	copy(reprCopy, repr)
   685  
   686  	batch := p.db.NewBatch()
   687  	if err := batch.SetRepr(reprCopy); err != nil {
   688  		return err
   689  	}
   690  
   691  	opts := pebble.NoSync
   692  	if sync {
   693  		opts = pebble.Sync
   694  	}
   695  	return batch.Commit(opts)
   696  }
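
// exampleApplyRepr is an illustrative sketch of shipping one batch's
// serialized contents to another engine via ApplyBatchRepr. It assumes the
// Batch interface exposes a Repr() accessor, as it does elsewhere in this
// package.
func exampleApplyRepr(src Batch, dst Engine) error {
	return dst.ApplyBatchRepr(src.Repr(), true /* sync */)
}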
   697  
   698  // Clear implements the Engine interface.
   699  func (p *Pebble) Clear(key MVCCKey) error {
   700  	if len(key.Key) == 0 {
   701  		return emptyKeyError()
   702  	}
   703  	return p.db.Delete(EncodeKey(key), pebble.Sync)
   704  }
   705  
   706  // SingleClear implements the Engine interface.
   707  func (p *Pebble) SingleClear(key MVCCKey) error {
   708  	if len(key.Key) == 0 {
   709  		return emptyKeyError()
   710  	}
   711  	return p.db.SingleDelete(EncodeKey(key), pebble.Sync)
   712  }
   713  
   714  // ClearRange implements the Engine interface.
   715  func (p *Pebble) ClearRange(start, end MVCCKey) error {
   716  	bufStart := EncodeKey(start)
   717  	bufEnd := EncodeKey(end)
   718  	return p.db.DeleteRange(bufStart, bufEnd, pebble.Sync)
   719  }
   720  
   721  // ClearIterRange implements the Engine interface.
   722  func (p *Pebble) ClearIterRange(iter Iterator, start, end roachpb.Key) error {
   723  	// Write all the tombstones in one batch.
   724  	batch := p.NewWriteOnlyBatch()
   725  	defer batch.Close()
   726  
   727  	if err := batch.ClearIterRange(iter, start, end); err != nil {
   728  		return err
   729  	}
   730  	return batch.Commit(true)
   731  }
   732  
   733  // Merge implements the Engine interface.
   734  func (p *Pebble) Merge(key MVCCKey, value []byte) error {
   735  	if len(key.Key) == 0 {
   736  		return emptyKeyError()
   737  	}
   738  	return p.db.Merge(EncodeKey(key), value, pebble.Sync)
   739  }
   740  
   741  // Put implements the Engine interface.
   742  func (p *Pebble) Put(key MVCCKey, value []byte) error {
   743  	if len(key.Key) == 0 {
   744  		return emptyKeyError()
   745  	}
   746  	return p.db.Set(EncodeKey(key), value, pebble.Sync)
   747  }
   748  
   749  // LogData implements the Engine interface.
   750  func (p *Pebble) LogData(data []byte) error {
   751  	return p.db.LogData(data, pebble.Sync)
   752  }
   753  
   754  // LogLogicalOp implements the Engine interface.
   755  func (p *Pebble) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) {
   756  	// No-op. Logical logging disabled.
   757  }
   758  
   759  // Attrs implements the Engine interface.
   760  func (p *Pebble) Attrs() roachpb.Attributes {
   761  	return p.attrs
   762  }
   763  
   764  // Capacity implements the Engine interface.
   765  func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) {
   766  	return computeCapacity(p.path, p.maxSize)
   767  }
   768  
   769  // Flush implements the Engine interface.
   770  func (p *Pebble) Flush() error {
   771  	return p.db.Flush()
   772  }
   773  
   774  // GetStats implements the Engine interface.
   775  func (p *Pebble) GetStats() (*Stats, error) {
   776  	m := p.db.Metrics()
   777  
   778  	// Aggregate compaction stats across levels.
   779  	var ingestedBytes, compactedBytesRead, compactedBytesWritten int64
   780  	for _, lm := range m.Levels {
   781  		ingestedBytes += int64(lm.BytesIngested)
   782  		compactedBytesRead += int64(lm.BytesRead)
   783  		compactedBytesWritten += int64(lm.BytesCompacted)
   784  	}
   785  
   786  	return &Stats{
   787  		BlockCacheHits:                 m.BlockCache.Hits,
   788  		BlockCacheMisses:               m.BlockCache.Misses,
   789  		BlockCacheUsage:                m.BlockCache.Size,
   790  		BlockCachePinnedUsage:          0,
   791  		BloomFilterPrefixChecked:       m.Filter.Hits + m.Filter.Misses,
   792  		BloomFilterPrefixUseful:        m.Filter.Hits,
   793  		MemtableTotalSize:              int64(m.MemTable.Size),
   794  		Flushes:                        m.Flush.Count,
   795  		FlushedBytes:                   int64(m.Levels[0].BytesFlushed),
   796  		Compactions:                    m.Compact.Count,
   797  		IngestedBytes:                  ingestedBytes,
   798  		CompactedBytesRead:             compactedBytesRead,
   799  		CompactedBytesWritten:          compactedBytesWritten,
   800  		TableReadersMemEstimate:        m.TableCache.Size,
   801  		PendingCompactionBytesEstimate: int64(m.Compact.EstimatedDebt),
   802  		L0FileCount:                    m.Levels[0].NumFiles,
   803  	}, nil
   804  }
   805  
   806  // GetEncryptionRegistries implements the Engine interface.
   807  func (p *Pebble) GetEncryptionRegistries() (*EncryptionRegistries, error) {
   808  	rv := &EncryptionRegistries{}
   809  	var err error
   810  	if p.statsHandler != nil {
   811  		rv.KeyRegistry, err = p.statsHandler.GetDataKeysRegistry()
   812  		if err != nil {
   813  			return nil, err
   814  		}
   815  	}
   816  	if p.fileRegistry != nil {
   817  		rv.FileRegistry, err = protoutil.Marshal(p.fileRegistry.getRegistryCopy())
   818  		if err != nil {
   819  			return nil, err
   820  		}
   821  	}
   822  	return rv, nil
   823  }
   824  
   825  // GetEnvStats implements the Engine interface.
   826  func (p *Pebble) GetEnvStats() (*EnvStats, error) {
   827  	// TODO(sumeer): make the stats complete. There are no bytes stats. The TotalFiles is missing
   828  	// files that are not in the registry (from before encryption was enabled).
   829  	stats := &EnvStats{}
   830  	if p.statsHandler == nil {
   831  		return stats, nil
   832  	}
   833  	stats.EncryptionType = p.statsHandler.GetActiveStoreKeyType()
   834  	var err error
   835  	stats.EncryptionStatus, err = p.statsHandler.GetEncryptionStatus()
   836  	if err != nil {
   837  		return nil, err
   838  	}
   839  	fr := p.fileRegistry.getRegistryCopy()
   840  	activeKeyID, err := p.statsHandler.GetActiveDataKeyID()
   841  	if err != nil {
   842  		return nil, err
   843  	}
   844  
   845  	m := p.db.Metrics()
   846  	stats.TotalFiles = 3 /* CURRENT, MANIFEST, OPTIONS */
   847  	stats.TotalFiles += uint64(m.WAL.Files + m.Table.ZombieCount + m.WAL.ObsoleteFiles)
   848  	stats.TotalBytes = m.WAL.Size + m.Table.ZombieSize
   849  	for _, l := range m.Levels {
   850  		stats.TotalFiles += uint64(l.NumFiles)
   851  		stats.TotalBytes += l.Size
   852  	}
   853  
   854  	sstSizes := make(map[pebble.FileNum]uint64)
   855  	for _, ssts := range p.db.SSTables() {
   856  		for _, sst := range ssts {
   857  			sstSizes[sst.FileNum] = sst.Size
   858  		}
   859  	}
   860  
   861  	for filePath, entry := range fr.Files {
   862  		keyID, err := p.statsHandler.GetKeyIDFromSettings(entry.EncryptionSettings)
   863  		if err != nil {
   864  			return nil, err
   865  		}
   866  		if len(keyID) == 0 {
   867  			keyID = "plain"
   868  		}
   869  		if keyID != activeKeyID {
   870  			continue
   871  		}
   872  		stats.ActiveKeyFiles++
   873  
   874  		filename := p.fs.PathBase(filePath)
   875  		numStr := strings.TrimSuffix(filename, ".sst")
   876  		if len(numStr) == len(filename) {
   877  			continue // not an sstable
   878  		}
   879  		u, err := strconv.ParseUint(numStr, 10, 64)
   880  		if err != nil {
   881  			return nil, errors.Wrapf(err, "parsing filename %q", errors.Safe(filename))
   882  		}
   883  		stats.ActiveKeyBytes += sstSizes[pebble.FileNum(u)]
   884  	}
   885  	return stats, nil
   886  }
   887  
   888  // GetAuxiliaryDir implements the Engine interface.
   889  func (p *Pebble) GetAuxiliaryDir() string {
   890  	return p.auxDir
   891  }
   892  
   893  // NewBatch implements the Engine interface.
   894  func (p *Pebble) NewBatch() Batch {
   895  	return newPebbleBatch(p.db, p.db.NewIndexedBatch())
   896  }
   897  
   898  // NewReadOnly implements the Engine interface.
   899  func (p *Pebble) NewReadOnly() ReadWriter {
   900  	return &pebbleReadOnly{
   901  		parent: p,
   902  	}
   903  }
   904  
   905  // NewWriteOnlyBatch implements the Engine interface.
   906  func (p *Pebble) NewWriteOnlyBatch() Batch {
   907  	return newPebbleBatch(p.db, p.db.NewBatch())
   908  }
   909  
   910  // NewSnapshot implements the Engine interface.
   911  func (p *Pebble) NewSnapshot() Reader {
   912  	return &pebbleSnapshot{
   913  		snapshot: p.db.NewSnapshot(),
   914  	}
   915  }
   916  
   917  // Type implements the Engine interface.
   918  func (p *Pebble) Type() enginepb.EngineType {
   919  	return enginepb.EngineTypePebble
   920  }
   921  
   922  // IngestExternalFiles implements the Engine interface.
   923  func (p *Pebble) IngestExternalFiles(ctx context.Context, paths []string) error {
   924  	return p.db.Ingest(paths)
   925  }
   926  
   927  // PreIngestDelay implements the Engine interface.
   928  func (p *Pebble) PreIngestDelay(ctx context.Context) {
   929  	preIngestDelay(ctx, p, p.settings)
   930  }
   931  
   932  // ApproximateDiskBytes implements the Engine interface.
   933  func (p *Pebble) ApproximateDiskBytes(from, to roachpb.Key) (uint64, error) {
   934  	count, err := p.db.EstimateDiskUsage(from, to)
   935  	if err != nil {
   936  		return 0, err
   937  	}
   938  	return count, nil
   939  }
   940  
   941  // Compact implements the Engine interface.
   942  func (p *Pebble) Compact() error {
   943  	return p.db.Compact(nil, EncodeKey(MVCCKeyMax))
   944  }
   945  
   946  // CompactRange implements the Engine interface.
   947  func (p *Pebble) CompactRange(start, end roachpb.Key, forceBottommost bool) error {
   948  	bufStart := EncodeKey(MVCCKey{start, hlc.Timestamp{}})
   949  	bufEnd := EncodeKey(MVCCKey{end, hlc.Timestamp{}})
   950  	return p.db.Compact(bufStart, bufEnd)
   951  }
   952  
   953  // InMem returns true if the receiver is an in-memory engine and false
   954  // otherwise.
   955  func (p *Pebble) InMem() bool {
   956  	return p.path == ""
   957  }
   958  
   959  // ReadFile implements the Engine interface.
   960  func (p *Pebble) ReadFile(filename string) ([]byte, error) {
   961  	file, err := p.fs.Open(filename)
   962  	if err != nil {
   963  		return nil, err
   964  	}
   965  	defer file.Close()
   966  
   967  	return ioutil.ReadAll(file)
   968  }
   969  
   970  // WriteFile writes data to a file in this Pebble instance's env.
   971  func (p *Pebble) WriteFile(filename string, data []byte) error {
   972  	file, err := p.fs.Create(filename)
   973  	if err != nil {
   974  		return err
   975  	}
   976  	defer file.Close()
   977  
   978  	_, err = io.Copy(file, bytes.NewReader(data))
   979  	return err
   980  }
   981  
   982  // Remove implements the FS interface.
   983  func (p *Pebble) Remove(filename string) error {
   984  	return p.fs.Remove(filename)
   985  }
   986  
   987  // RemoveAll implements the Engine interface.
   988  func (p *Pebble) RemoveAll(dir string) error {
   989  	return p.fs.RemoveAll(dir)
   990  }
   991  
   992  // Link implements the FS interface.
   993  func (p *Pebble) Link(oldname, newname string) error {
   994  	return p.fs.Link(oldname, newname)
   995  }
   996  
   997  var _ fs.FS = &Pebble{}
   998  
   999  // Create implements the FS interface.
  1000  func (p *Pebble) Create(name string) (fs.File, error) {
  1001  	// TODO(peter): On RocksDB, the MemEnv allows creating a file when the parent
  1002  	// directory does not exist. Various tests in the storage package depend on
  1003  	// this because they are accidentally creating the required directory on the
  1004  	// actual filesystem instead of in the memory filesystem. See
  1005  	// diskSideloadedStorage and SSTSnapshotStrategy.
  1006  	if p.InMem() {
  1007  		_ = p.fs.MkdirAll(p.fs.PathDir(name), 0755)
  1008  	}
  1009  	return p.fs.Create(name)
  1010  }
  1011  
  1012  // CreateWithSync implements the FS interface.
  1013  func (p *Pebble) CreateWithSync(name string, bytesPerSync int) (fs.File, error) {
  1014  	// TODO(peter): On RocksDB, the MemEnv allows creating a file when the parent
  1015  	// directory does not exist. Various tests in the storage package depend on
  1016  	// this because they are accidentally creating the required directory on the
  1017  	// actual filesystem instead of in the memory filesystem. See
  1018  	// diskSideloadedStorage and SSTSnapshotStrategy.
  1019  	if p.InMem() {
  1020  		_ = p.fs.MkdirAll(p.fs.PathDir(name), 0755)
  1021  	}
  1022  	f, err := p.fs.Create(name)
  1023  	if err != nil {
  1024  		return nil, err
  1025  	}
  1026  	return vfs.NewSyncingFile(f, vfs.SyncingFileOptions{BytesPerSync: bytesPerSync}), nil
  1027  }
  1028  
  1029  // Open implements the FS interface.
  1030  func (p *Pebble) Open(name string) (fs.File, error) {
  1031  	return p.fs.Open(name)
  1032  }
  1033  
  1034  // OpenDir implements the FS interface.
  1035  func (p *Pebble) OpenDir(name string) (fs.File, error) {
  1036  	return p.fs.OpenDir(name)
  1037  }
  1038  
  1039  // Rename implements the FS interface.
  1040  func (p *Pebble) Rename(oldname, newname string) error {
  1041  	return p.fs.Rename(oldname, newname)
  1042  }
  1043  
  1044  // MkdirAll implements the FS interface.
  1045  func (p *Pebble) MkdirAll(name string) error {
  1046  	return p.fs.MkdirAll(name, 0755)
  1047  }
  1048  
  1049  // RemoveDir implements the FS interface.
  1050  func (p *Pebble) RemoveDir(name string) error {
  1051  	return p.fs.Remove(name)
  1052  }
  1053  
  1054  // List implements the FS interface.
  1055  func (p *Pebble) List(name string) ([]string, error) {
  1056  	dirents, err := p.fs.List(name)
  1057  	sort.Strings(dirents)
  1058  	return dirents, err
  1059  }
  1060  
  1061  // CreateCheckpoint implements the Engine interface.
  1062  func (p *Pebble) CreateCheckpoint(dir string) error {
  1063  	return p.db.Checkpoint(dir)
  1064  }
  1065  
  1066  // GetSSTables implements the WithSSTables interface.
  1067  func (p *Pebble) GetSSTables() (sstables SSTableInfos) {
  1068  	for level, tables := range p.db.SSTables() {
  1069  		for _, table := range tables {
  1070  			startKey, _ := DecodeMVCCKey(table.Smallest.UserKey)
  1071  			endKey, _ := DecodeMVCCKey(table.Largest.UserKey)
  1072  			info := SSTableInfo{
  1073  				Level: level,
  1074  				Size:  int64(table.Size),
  1075  				Start: startKey,
  1076  				End:   endKey,
  1077  			}
  1078  			sstables = append(sstables, info)
  1079  		}
  1080  	}
  1081  
  1082  	sort.Sort(sstables)
  1083  	return sstables
  1084  }
  1085  
  1086  type pebbleReadOnly struct {
  1087  	parent     *Pebble
  1088  	prefixIter pebbleIterator
  1089  	normalIter pebbleIterator
  1090  	closed     bool
  1091  }
  1092  
  1093  var _ ReadWriter = &pebbleReadOnly{}
  1094  
  1095  func (p *pebbleReadOnly) Close() {
  1096  	if p.closed {
  1097  		panic("closing an already-closed pebbleReadOnly")
  1098  	}
  1099  	p.closed = true
  1100  	p.prefixIter.destroy()
  1101  	p.normalIter.destroy()
  1102  }
  1103  
  1104  func (p *pebbleReadOnly) Closed() bool {
  1105  	return p.closed
  1106  }
  1107  
  1108  // ExportToSst is part of the engine.Reader interface.
  1109  func (p *pebbleReadOnly) ExportToSst(
  1110  	startKey, endKey roachpb.Key,
  1111  	startTS, endTS hlc.Timestamp,
  1112  	exportAllRevisions bool,
  1113  	targetSize, maxSize uint64,
  1114  	io IterOptions,
  1115  ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) {
  1116  	return pebbleExportToSst(p, startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io)
  1117  }
  1118  
  1119  func (p *pebbleReadOnly) Get(key MVCCKey) ([]byte, error) {
  1120  	if p.closed {
  1121  		panic("using a closed pebbleReadOnly")
  1122  	}
  1123  	return p.parent.Get(key)
  1124  }
  1125  
  1126  func (p *pebbleReadOnly) GetProto(
  1127  	key MVCCKey, msg protoutil.Message,
  1128  ) (ok bool, keyBytes, valBytes int64, err error) {
  1129  	if p.closed {
  1130  		panic("using a closed pebbleReadOnly")
  1131  	}
  1132  	return p.parent.GetProto(key, msg)
  1133  }
  1134  
  1135  func (p *pebbleReadOnly) Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (bool, error)) error {
  1136  	if p.closed {
  1137  		panic("using a closed pebbleReadOnly")
  1138  	}
  1139  	return iterateOnReader(p, start, end, f)
  1140  }
  1141  
  1142  func (p *pebbleReadOnly) NewIterator(opts IterOptions) Iterator {
  1143  	if p.closed {
  1144  		panic("using a closed pebbleReadOnly")
  1145  	}
  1146  
  1147  	if opts.MinTimestampHint != (hlc.Timestamp{}) {
  1148  		// Iterators that specify timestamp bounds cannot be cached.
  1149  		return newPebbleIterator(p.parent.db, opts)
  1150  	}
  1151  
  1152  	iter := &p.normalIter
  1153  	if opts.Prefix {
  1154  		iter = &p.prefixIter
  1155  	}
  1156  	if iter.inuse {
  1157  		panic("iterator already in use")
  1158  	}
  1159  
  1160  	if iter.iter != nil {
  1161  		iter.setOptions(opts)
  1162  	} else {
  1163  		iter.init(p.parent.db, opts)
  1164  		iter.reusable = true
  1165  	}
  1166  
  1167  	iter.inuse = true
  1168  	return iter
  1169  }
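
// exampleReadOnlyScan is an illustrative sketch of the intended usage of
// pebbleReadOnly: take a short-lived read-only handle, iterate, and close it
// so that the cached prefix/normal iterators above are destroyed.
func exampleReadOnlyScan(eng Engine, start, end roachpb.Key) error {
	ro := eng.NewReadOnly()
	defer ro.Close()
	return ro.Iterate(start, end, func(kv MVCCKeyValue) (bool, error) {
		return false, nil // visit every key-value pair; never stop early
	})
}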
  1170  
  1171  // Writer methods are not implemented for pebbleReadOnly (Writer is the write
  1172  // interface to an engine's data). Ideally, the code could be refactored so
  1173  // that a Reader could be supplied to evaluateBatch.
  1174  
  1175  func (p *pebbleReadOnly) ApplyBatchRepr(repr []byte, sync bool) error {
  1176  	panic("not implemented")
  1177  }
  1178  
  1179  func (p *pebbleReadOnly) Clear(key MVCCKey) error {
  1180  	panic("not implemented")
  1181  }
  1182  
  1183  func (p *pebbleReadOnly) SingleClear(key MVCCKey) error {
  1184  	panic("not implemented")
  1185  }
  1186  
  1187  func (p *pebbleReadOnly) ClearRange(start, end MVCCKey) error {
  1188  	panic("not implemented")
  1189  }
  1190  
  1191  func (p *pebbleReadOnly) ClearIterRange(iter Iterator, start, end roachpb.Key) error {
  1192  	panic("not implemented")
  1193  }
  1194  
  1195  func (p *pebbleReadOnly) Merge(key MVCCKey, value []byte) error {
  1196  	panic("not implemented")
  1197  }
  1198  
  1199  func (p *pebbleReadOnly) Put(key MVCCKey, value []byte) error {
  1200  	panic("not implemented")
  1201  }
  1202  
  1203  func (p *pebbleReadOnly) LogData(data []byte) error {
  1204  	panic("not implemented")
  1205  }
  1206  
  1207  func (p *pebbleReadOnly) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) {
  1208  	panic("not implemented")
  1209  }
  1210  
  1211  // pebbleSnapshot represents a snapshot created using Pebble.NewSnapshot().
  1212  type pebbleSnapshot struct {
  1213  	snapshot *pebble.Snapshot
  1214  	closed   bool
  1215  }
  1216  
  1217  var _ Reader = &pebbleSnapshot{}
  1218  
  1219  // Close implements the Reader interface.
  1220  func (p *pebbleSnapshot) Close() {
  1221  	_ = p.snapshot.Close()
  1222  	p.closed = true
  1223  }
  1224  
  1225  // Closed implements the Reader interface.
  1226  func (p *pebbleSnapshot) Closed() bool {
  1227  	return p.closed
  1228  }
  1229  
  1230  // ExportToSst is part of the engine.Reader interface.
  1231  func (p *pebbleSnapshot) ExportToSst(
  1232  	startKey, endKey roachpb.Key,
  1233  	startTS, endTS hlc.Timestamp,
  1234  	exportAllRevisions bool,
  1235  	targetSize, maxSize uint64,
  1236  	io IterOptions,
  1237  ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) {
  1238  	return pebbleExportToSst(p, startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io)
  1239  }
  1240  
  1241  // Get implements the Reader interface.
  1242  func (p *pebbleSnapshot) Get(key MVCCKey) ([]byte, error) {
  1243  	if len(key.Key) == 0 {
  1244  		return nil, emptyKeyError()
  1245  	}
  1246  
  1247  	ret, closer, err := p.snapshot.Get(EncodeKey(key))
  1248  	if closer != nil {
  1249  		retCopy := make([]byte, len(ret))
  1250  		copy(retCopy, ret)
  1251  		ret = retCopy
  1252  		closer.Close()
  1253  	}
  1254  	if errors.Is(err, pebble.ErrNotFound) || len(ret) == 0 {
  1255  		return nil, nil
  1256  	}
  1257  	return ret, err
  1258  }
  1259  
  1260  // GetProto implements the Reader interface.
  1261  func (p *pebbleSnapshot) GetProto(
  1262  	key MVCCKey, msg protoutil.Message,
  1263  ) (ok bool, keyBytes, valBytes int64, err error) {
  1264  	if len(key.Key) == 0 {
  1265  		return false, 0, 0, emptyKeyError()
  1266  	}
  1267  	encodedKey := EncodeKey(key)
  1268  	val, closer, err := p.snapshot.Get(encodedKey)
  1269  	if closer != nil {
  1270  		if msg != nil {
  1271  			err = protoutil.Unmarshal(val, msg)
  1272  		}
  1273  		keyBytes = int64(len(encodedKey))
  1274  		valBytes = int64(len(val))
  1275  		closer.Close()
  1276  		return true, keyBytes, valBytes, err
  1277  	}
  1278  	if errors.Is(err, pebble.ErrNotFound) {
  1279  		return false, 0, 0, nil
  1280  	}
  1281  	return false, 0, 0, err
  1282  }
  1283  
  1284  // Iterate implements the Reader interface.
  1285  func (p *pebbleSnapshot) Iterate(
  1286  	start, end roachpb.Key, f func(MVCCKeyValue) (stop bool, err error),
  1287  ) error {
  1288  	return iterateOnReader(p, start, end, f)
  1289  }
  1290  
  1291  // NewIterator implements the Reader interface.
  1292  func (p pebbleSnapshot) NewIterator(opts IterOptions) Iterator {
  1293  	return newPebbleIterator(p.snapshot, opts)
  1294  }
  1295  
  1296  func pebbleExportToSst(
  1297  	reader Reader,
  1298  	startKey, endKey roachpb.Key,
  1299  	startTS, endTS hlc.Timestamp,
  1300  	exportAllRevisions bool,
  1301  	targetSize, maxSize uint64,
  1302  	io IterOptions,
  1303  ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) {
  1304  	sstFile := &MemFile{}
  1305  	sstWriter := MakeBackupSSTWriter(sstFile)
  1306  	defer sstWriter.Close()
  1307  
  1308  	var rows RowCounter
  1309  	iter := NewMVCCIncrementalIterator(
  1310  		reader,
  1311  		MVCCIncrementalIterOptions{
  1312  			IterOptions: io,
  1313  			StartTime:   startTS,
  1314  			EndTime:     endTS,
  1315  		})
  1316  	defer iter.Close()
  1317  	var curKey roachpb.Key // only used if exportAllRevisions
  1318  	var resumeKey roachpb.Key
  1319  	paginated := targetSize > 0
  1320  	for iter.SeekGE(MakeMVCCMetadataKey(startKey)); ; {
  1321  		ok, err := iter.Valid()
  1322  		if err != nil {
  1323  			// The error may be a WriteIntentError, in which case returning it
  1324  			// will cause this command to be retried.
  1325  			return nil, roachpb.BulkOpSummary{}, nil, err
  1326  		}
  1327  		if !ok {
  1328  			break
  1329  		}
  1330  		unsafeKey := iter.UnsafeKey()
  1331  		if unsafeKey.Key.Compare(endKey) >= 0 {
  1332  			break
  1333  		}
  1334  		unsafeValue := iter.UnsafeValue()
  1335  		isNewKey := !exportAllRevisions || !unsafeKey.Key.Equal(curKey)
  1336  		if paginated && exportAllRevisions && isNewKey {
  1337  			curKey = append(curKey[:0], unsafeKey.Key...)
  1338  		}
  1339  
  1340  		// Skip tombstone (len=0) records when start time is zero (non-incremental)
  1341  		// and we are not exporting all versions.
  1342  		skipTombstones := !exportAllRevisions && startTS.IsEmpty()
  1343  		if len(unsafeValue) > 0 || !skipTombstones {
  1344  			if err := rows.Count(unsafeKey.Key); err != nil {
  1345  				return nil, roachpb.BulkOpSummary{}, nil, errors.Wrapf(err, "decoding %s", unsafeKey)
  1346  			}
  1347  			curSize := rows.BulkOpSummary.DataSize
  1348  			reachedTargetSize := curSize > 0 && uint64(curSize) >= targetSize
  1349  			if paginated && isNewKey && reachedTargetSize {
  1350  				// Allocate the right size for resumeKey rather than using curKey.
  1351  				resumeKey = append(make(roachpb.Key, 0, len(unsafeKey.Key)), unsafeKey.Key...)
  1352  				break
  1353  			}
  1354  			if err := sstWriter.Put(unsafeKey, unsafeValue); err != nil {
  1355  				return nil, roachpb.BulkOpSummary{}, nil, errors.Wrapf(err, "adding key %s", unsafeKey)
  1356  			}
  1357  			newSize := curSize + int64(len(unsafeKey.Key)+len(unsafeValue))
  1358  			if maxSize > 0 && newSize > int64(maxSize) {
  1359  				return nil, roachpb.BulkOpSummary{}, nil,
  1360  					errors.Errorf("export size (%d bytes) exceeds max size (%d bytes)", newSize, maxSize)
  1361  			}
  1362  			rows.BulkOpSummary.DataSize = newSize
  1363  		}
  1364  
  1365  		if exportAllRevisions {
  1366  			iter.Next()
  1367  		} else {
  1368  			iter.NextKey()
  1369  		}
  1370  	}
  1371  
  1372  	if rows.BulkOpSummary.DataSize == 0 {
  1373  		// If no records were added to the sstable, skip completing it and return a
  1374  		// nil slice – the export code will discard it anyway (based on 0 DataSize).
  1375  		return nil, roachpb.BulkOpSummary{}, nil, nil
  1376  	}
  1377  
  1378  	if err := sstWriter.Finish(); err != nil {
  1379  		return nil, roachpb.BulkOpSummary{}, nil, err
  1380  	}
  1381  
  1382  	return sstFile.Data(), rows.BulkOpSummary, resumeKey, nil
  1383  }
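
// exampleExportLoop is an illustrative sketch of driving the paginated export
// above: with a non-zero targetSize, a non-nil resume key means the caller
// should continue exporting from that key. The IterOptions field name
// (UpperBound) and the concrete sizes are assumptions rather than something
// this file spells out.
func exampleExportLoop(eng Engine, start, end roachpb.Key, until hlc.Timestamp) ([][]byte, error) {
	var ssts [][]byte
	for start != nil {
		sst, _, resume, err := eng.ExportToSst(
			start, end,
			hlc.Timestamp{} /* startTS */, until,
			true /* exportAllRevisions */,
			16<<20 /* targetSize */, 0 /* maxSize */,
			IterOptions{UpperBound: end},
		)
		if err != nil {
			return nil, err
		}
		if len(sst) > 0 {
			ssts = append(ssts, sst)
		}
		start = resume // nil once the range has been fully exported
	}
	return ssts, nil
}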