github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/rocksdb.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package storage 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "io/ioutil" 18 "math" 19 "os" 20 "path/filepath" 21 "runtime" 22 "runtime/debug" 23 "sort" 24 "strings" 25 "sync" 26 "time" 27 "unsafe" 28 29 "github.com/cockroachdb/cockroach/pkg/base" 30 "github.com/cockroachdb/cockroach/pkg/roachpb" 31 "github.com/cockroachdb/cockroach/pkg/settings" 32 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 33 "github.com/cockroachdb/cockroach/pkg/storage/fs" 34 "github.com/cockroachdb/cockroach/pkg/util" 35 "github.com/cockroachdb/cockroach/pkg/util/envutil" 36 "github.com/cockroachdb/cockroach/pkg/util/hlc" 37 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 38 "github.com/cockroachdb/cockroach/pkg/util/log" 39 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 40 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 41 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 42 "github.com/cockroachdb/errors" 43 "github.com/cockroachdb/logtags" 44 ) 45 46 // TODO(tamird): why does rocksdb not link jemalloc,snappy statically? 47 48 // #cgo CPPFLAGS: -I../../c-deps/libroach/include 49 // #cgo LDFLAGS: -lroach 50 // #cgo LDFLAGS: -lprotobuf 51 // #cgo LDFLAGS: -lrocksdb 52 // #cgo LDFLAGS: -lsnappy 53 // #cgo linux LDFLAGS: -lrt -lpthread 54 // #cgo windows LDFLAGS: -lshlwapi -lrpcrt4 55 // 56 // #include <stdlib.h> 57 // #include <libroach.h> 58 import "C" 59 60 var minWALSyncInterval = settings.RegisterDurationSetting( 61 "rocksdb.min_wal_sync_interval", 62 "minimum duration between syncs of the RocksDB WAL", 63 0*time.Millisecond, 64 ) 65 66 var rocksdbConcurrency = envutil.EnvOrDefaultInt( 67 "COCKROACH_ROCKSDB_CONCURRENCY", func() int { 68 // Use up to min(numCPU, 4) threads for background RocksDB compactions per 69 // store. 70 const max = 4 71 if n := runtime.NumCPU(); n <= max { 72 return n 73 } 74 return max 75 }()) 76 77 // Set to true to perform expensive iterator debug leak checking. In normal 78 // operation, we perform inexpensive iterator leak checking but those checks do 79 // not indicate where the leak arose. The expensive checking tracks the stack 80 // traces of every iterator allocated. DO NOT ENABLE in production code. 81 const debugIteratorLeak = false 82 83 var rocksdbLogger *log.SecondaryLogger 84 85 // InitRocksDBLogger initializes the logger to use for RocksDB log messages. If 86 // not called, WARNING, ERROR, and FATAL logs will be output to the normal 87 // CockroachDB log. The caller is responsible for ensuring the 88 // Close() method is eventually called on the new logger. 
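//
// A minimal usage sketch (ctx is assumed to be a long-lived server context;
// the variable name is illustrative):
//
//	logger := InitRocksDBLogger(ctx)
//	defer logger.Close()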
89 func InitRocksDBLogger(ctx context.Context) *log.SecondaryLogger { 90 rocksdbLogger = log.NewSecondaryLogger(ctx, nil, "rocksdb", 91 true /* enableGC */, false /* forceSyncWrites */, false /* enableMsgCount */) 92 return rocksdbLogger 93 } 94 95 //export rocksDBLog 96 func rocksDBLog(usePrimaryLog C.bool, sevLvl C.int, s *C.char, n C.int) { 97 sev := log.Severity(sevLvl) 98 if !usePrimaryLog { 99 if rocksdbLogger != nil { 100 // NB: No need for the rocksdb tag if we're logging to a rocksdb specific 101 // file. 102 rocksdbLogger.LogSev(context.Background(), sev, C.GoStringN(s, n)) 103 return 104 } 105 106 // Only log INFO logs to the normal CockroachDB log at --v=3 and 107 // above. This only applies when we're not using the primary log for 108 // RocksDB generated messages (which is utilized by the encryption-at-rest 109 // code). 110 if sev == log.Severity_INFO && !log.V(3) { 111 return 112 } 113 } 114 115 ctx := logtags.AddTag(context.Background(), "rocksdb", nil) 116 switch sev { 117 case log.Severity_WARNING: 118 log.Warningf(ctx, "%v", C.GoStringN(s, n)) 119 case log.Severity_ERROR: 120 log.Errorf(ctx, "%v", C.GoStringN(s, n)) 121 case log.Severity_FATAL: 122 log.Fatalf(ctx, "%v", C.GoStringN(s, n)) 123 default: 124 log.Infof(ctx, "%v", C.GoStringN(s, n)) 125 } 126 } 127 128 //export prettyPrintKey 129 func prettyPrintKey(cKey C.DBKey) *C.char { 130 mvccKey := MVCCKey{ 131 Key: gobytes(unsafe.Pointer(cKey.key.data), int(cKey.key.len)), 132 Timestamp: hlc.Timestamp{ 133 WallTime: int64(cKey.wall_time), 134 Logical: int32(cKey.logical), 135 }, 136 } 137 return C.CString(mvccKey.String()) 138 } 139 140 const ( 141 // RecommendedMaxOpenFiles is the recommended value for RocksDB's 142 // max_open_files option. 143 RecommendedMaxOpenFiles = 10000 144 // MinimumMaxOpenFiles is the minimum value that RocksDB's max_open_files 145 // option can be set to. While this should be set as high as possible, the 146 // minimum total for a single store node must be under 2048 for Windows 147 // compatibility. See: 148 // https://wpdev.uservoice.com/forums/266908-command-prompt-console-bash-on-ubuntu-on-windo/suggestions/17310124-add-ability-to-change-max-number-of-open-files-for 149 MinimumMaxOpenFiles = 1700 150 ) 151 152 // SSTableInfo contains metadata about a single sstable. Note this mirrors 153 // the C.DBSSTable struct contents. 154 type SSTableInfo struct { 155 Level int 156 Size int64 157 Start MVCCKey 158 End MVCCKey 159 } 160 161 // SSTableInfos is a slice of SSTableInfo structures. 162 type SSTableInfos []SSTableInfo 163 164 func (s SSTableInfos) Len() int { 165 return len(s) 166 } 167 168 func (s SSTableInfos) Swap(i, j int) { 169 s[i], s[j] = s[j], s[i] 170 } 171 172 func (s SSTableInfos) Less(i, j int) bool { 173 switch { 174 case s[i].Level < s[j].Level: 175 return true 176 case s[i].Level > s[j].Level: 177 return false 178 case s[i].Size > s[j].Size: 179 return true 180 case s[i].Size < s[j].Size: 181 return false 182 default: 183 return s[i].Start.Less(s[j].Start) 184 } 185 } 186 187 func (s SSTableInfos) String() string { 188 const ( 189 KB = 1 << 10 190 MB = 1 << 20 191 GB = 1 << 30 192 TB = 1 << 40 193 ) 194 195 roundTo := func(val, to int64) int64 { 196 return (val + to/2) / to 197 } 198 199 // We're intentionally not using humanizeutil here as we want a slightly more 200 // compact representation. 
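// For example, roundTo(size, MB) below computes (size+MB/2)/MB, so a ~1.6 MiB
// sstable (1677722 bytes) prints as "2M", while a 100 KiB sstable prints as
// "100K".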
201 humanize := func(size int64) string { 202 switch { 203 case size < MB: 204 return fmt.Sprintf("%dK", roundTo(size, KB)) 205 case size < GB: 206 return fmt.Sprintf("%dM", roundTo(size, MB)) 207 case size < TB: 208 return fmt.Sprintf("%dG", roundTo(size, GB)) 209 default: 210 return fmt.Sprintf("%dT", roundTo(size, TB)) 211 } 212 } 213 214 type levelInfo struct { 215 size int64 216 count int 217 } 218 219 var levels []*levelInfo 220 for _, t := range s { 221 for i := len(levels); i <= t.Level; i++ { 222 levels = append(levels, &levelInfo{}) 223 } 224 info := levels[t.Level] 225 info.size += t.Size 226 info.count++ 227 } 228 229 var maxSize int 230 var maxLevelCount int 231 for _, info := range levels { 232 size := len(humanize(info.size)) 233 if maxSize < size { 234 maxSize = size 235 } 236 count := 1 + int(math.Log10(float64(info.count))) 237 if maxLevelCount < count { 238 maxLevelCount = count 239 } 240 } 241 levelFormat := fmt.Sprintf("%%d [ %%%ds %%%dd ]:", maxSize, maxLevelCount) 242 243 level := -1 244 var buf bytes.Buffer 245 var lastSize string 246 var lastSizeCount int 247 248 flushLastSize := func() { 249 if lastSizeCount > 0 { 250 fmt.Fprintf(&buf, " %s", lastSize) 251 if lastSizeCount > 1 { 252 fmt.Fprintf(&buf, "[%d]", lastSizeCount) 253 } 254 lastSizeCount = 0 255 } 256 } 257 258 maybeFlush := func(newLevel, i int) { 259 if level == newLevel { 260 return 261 } 262 flushLastSize() 263 if buf.Len() > 0 { 264 buf.WriteString("\n") 265 } 266 level = newLevel 267 if level >= 0 { 268 info := levels[level] 269 fmt.Fprintf(&buf, levelFormat, level, humanize(info.size), info.count) 270 } 271 } 272 273 for i, t := range s { 274 maybeFlush(t.Level, i) 275 size := humanize(t.Size) 276 if size == lastSize { 277 lastSizeCount++ 278 } else { 279 flushLastSize() 280 lastSize = size 281 lastSizeCount = 1 282 } 283 } 284 285 maybeFlush(-1, 0) 286 return buf.String() 287 } 288 289 // ReadAmplification returns RocksDB's worst case read amplification, which is 290 // the number of level-0 sstables plus the number of levels, other than level 0, 291 // with at least one sstable. 292 // 293 // This definition comes from here: 294 // https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide#level-style-compaction 295 func (s SSTableInfos) ReadAmplification() int { 296 var readAmp int 297 seenLevel := make(map[int]bool) 298 for _, t := range s { 299 if t.Level == 0 { 300 readAmp++ 301 } else if !seenLevel[t.Level] { 302 readAmp++ 303 seenLevel[t.Level] = true 304 } 305 } 306 return readAmp 307 } 308 309 // SSTableInfosByLevel maintains slices of SSTableInfo objects, one 310 // per level. The slice for each level contains the SSTableInfo 311 // objects for SSTables at that level, sorted by start key. 312 type SSTableInfosByLevel struct { 313 // Each level is a slice of SSTableInfos. 314 levels [][]SSTableInfo 315 } 316 317 // NewSSTableInfosByLevel returns a new SSTableInfosByLevel object 318 // based on the supplied SSTableInfos slice. 319 func NewSSTableInfosByLevel(s SSTableInfos) SSTableInfosByLevel { 320 var result SSTableInfosByLevel 321 for _, t := range s { 322 for i := len(result.levels); i <= t.Level; i++ { 323 result.levels = append(result.levels, []SSTableInfo{}) 324 } 325 result.levels[t.Level] = append(result.levels[t.Level], t) 326 } 327 // Sort each level by start key. 
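// Levels greater than zero hold non-overlapping sstables, so after this sort
// each of those slices is both ordered and disjoint; that is the property
// MaxLevelSpanOverlapsContiguousSSTables relies on when it binary-searches a
// level with sort.Search.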
328 for _, l := range result.levels { 329 sort.Slice(l, func(i, j int) bool { return l[i].Start.Less(l[j].Start) }) 330 } 331 return result 332 } 333 334 // MaxLevel returns the maximum level for which there are SSTables. 335 func (s *SSTableInfosByLevel) MaxLevel() int { 336 return len(s.levels) - 1 337 } 338 339 // MaxLevelSpanOverlapsContiguousSSTables returns the maximum level at 340 // which the specified key span overlaps either none, one, or at most 341 // two contiguous SSTables. Level 0 is returned if no level qualifies. 342 // 343 // This is useful when considering when to merge two compactions. In 344 // this case, the method is called with the "gap" between the two 345 // spans to be compacted. When the result is that the gap span touches 346 // at most two SSTables at a high level, it suggests that merging the 347 // two compactions is a good idea (as the up to two SSTables touched 348 // by the gap span, due to containing endpoints of the existing 349 // compactions, would be rewritten anyway). 350 // 351 // As an example, consider the following sstables in a small database: 352 // 353 // Level 0. 354 // {Level: 0, Size: 20, Start: key("a"), End: key("z")}, 355 // {Level: 0, Size: 15, Start: key("a"), End: key("k")}, 356 // Level 2. 357 // {Level: 2, Size: 200, Start: key("a"), End: key("j")}, 358 // {Level: 2, Size: 100, Start: key("k"), End: key("o")}, 359 // {Level: 2, Size: 100, Start: key("r"), End: key("t")}, 360 // Level 6. 361 // {Level: 6, Size: 201, Start: key("a"), End: key("c")}, 362 // {Level: 6, Size: 200, Start: key("d"), End: key("f")}, 363 // {Level: 6, Size: 300, Start: key("h"), End: key("r")}, 364 // {Level: 6, Size: 405, Start: key("s"), End: key("z")}, 365 // 366 // - The span "a"-"c" overlaps only a single SSTable at the max level 367 // (L6). That's great, so we definitely want to compact that. 368 // - The span "s"-"t" overlaps zero SSTables at the max level (L6). 369 // Again, great! That means we're going to compact the 3rd L2 370 // SSTable and maybe push that directly to L6. 371 func (s *SSTableInfosByLevel) MaxLevelSpanOverlapsContiguousSSTables(span roachpb.Span) int { 372 // Note overlapsMoreTHanTwo should not be called on level 0, where 373 // the SSTables are not guaranteed disjoint. 374 overlapsMoreThanTwo := func(tables []SSTableInfo) bool { 375 // Search to find the first sstable which might overlap the span. 376 i := sort.Search(len(tables), func(i int) bool { return span.Key.Compare(tables[i].End.Key) < 0 }) 377 // If no SSTable is overlapped, return false. 378 if i == -1 || i == len(tables) || span.EndKey.Compare(tables[i].Start.Key) < 0 { 379 return false 380 } 381 // Return true if the span is not subsumed by the combination of 382 // this sstable and the next. This logic is complicated and is 383 // covered in the unittest. There are three successive conditions 384 // which together ensure the span doesn't overlap > 2 SSTables. 385 // 386 // - If the first overlapped SSTable is the last. 387 // - If the span does not exceed the end of the next SSTable. 388 // - If the span does not overlap the start of the next next SSTable. 389 if i >= len(tables)-1 { 390 // First overlapped SSTable is the last (right-most) SSTable. 391 // Span: [c-----f) 392 // SSTs: [a---d) 393 // or 394 // SSTs: [a-----------q) 395 return false 396 } 397 if span.EndKey.Compare(tables[i+1].End.Key) <= 0 { 398 // Span does not reach outside of this SSTable's right neighbor. 399 // Span: [c------f) 400 // SSTs: [a---d) [e-f) ... 
401 return false 402 } 403 if i >= len(tables)-2 { 404 // Span reaches outside of this SSTable's right neighbor, but 405 // there are no more SSTables to the right. 406 // Span: [c-------------x) 407 // SSTs: [a---d) [e---q) 408 return false 409 } 410 if span.EndKey.Compare(tables[i+2].Start.Key) <= 0 { 411 // There's another SSTable two to the right, but the span doesn't 412 // reach into it. 413 // Span: [c------------x) 414 // SSTs: [a---d) [e---q) [x--z) ... 415 return false 416 } 417 418 // Touching at least three SSTables. 419 // Span: [c-------------y) 420 // SSTs: [a---d) [e---q) [x--z) ... 421 return true 422 } 423 // Note that we never consider level 0, where SSTables can overlap. 424 // Level 0 is instead returned as a catch-all which means that there 425 // is no level where the span overlaps only two or fewer SSTables. 426 for i := len(s.levels) - 1; i > 0; i-- { 427 if !overlapsMoreThanTwo(s.levels[i]) { 428 return i 429 } 430 } 431 return 0 432 } 433 434 // RocksDBCache is a wrapper around C.DBCache 435 type RocksDBCache struct { 436 cache *C.DBCache 437 } 438 439 // NewRocksDBCache creates a new cache of the specified size. Note that the 440 // cache is refcounted internally and starts out with a refcount of one (i.e. 441 // Release() should be called after having used the cache). 442 func NewRocksDBCache(cacheSize int64) RocksDBCache { 443 return RocksDBCache{cache: C.DBNewCache(C.uint64_t(cacheSize))} 444 } 445 446 func (c RocksDBCache) ref() RocksDBCache { 447 if c.cache != nil { 448 c.cache = C.DBRefCache(c.cache) 449 } 450 return c 451 } 452 453 // Release releases the cache. Note that the cache will continue to be used 454 // until all of the RocksDB engines it was attached to have been closed, and 455 // that RocksDB engines which use it auto-release when they close. 456 func (c RocksDBCache) Release() { 457 if c.cache != nil { 458 C.DBReleaseCache(c.cache) 459 } 460 } 461 462 // RocksDBConfig holds all configuration parameters and knobs used in setting 463 // up a new RocksDB instance. 464 type RocksDBConfig struct { 465 // StorageConfig contains storage configs for all storage engines. 466 base.StorageConfig 467 // ReadOnly will open the database in read only mode if set to true. 468 ReadOnly bool 469 // MaxOpenFiles controls the maximum number of file descriptors RocksDB 470 // creates. If MaxOpenFiles is zero, this is set to DefaultMaxOpenFiles. 471 MaxOpenFiles uint64 472 // WarnLargeBatchThreshold controls if a log message is printed when a 473 // WriteBatch takes longer than WarnLargeBatchThreshold. If it is set to 474 // zero, no log messages are ever printed. 475 WarnLargeBatchThreshold time.Duration 476 // RocksDBOptions contains RocksDB specific options using a semicolon 477 // separated key-value syntax ("key1=value1; key2=value2"). 478 RocksDBOptions string 479 } 480 481 // RocksDB is a wrapper around a RocksDB database instance. 482 type RocksDB struct { 483 cfg RocksDBConfig 484 rdb *C.DBEngine 485 cache RocksDBCache // Shared cache. 486 // auxDir is used for storing auxiliary files. Ideally it is a subdirectory of Dir. 
487 auxDir string 488 489 commit struct { 490 syncutil.Mutex 491 cond sync.Cond 492 committing bool 493 groupSize int 494 pending []*rocksDBBatch 495 } 496 497 syncer struct { 498 syncutil.Mutex 499 cond sync.Cond 500 closed bool 501 pending []*rocksDBBatch 502 } 503 504 iters struct { 505 syncutil.Mutex 506 m map[*rocksDBIterator][]byte 507 } 508 } 509 510 var _ Engine = &RocksDB{} 511 512 // SetRocksDBOpenHook sets the DBOpenHook function that will be called during 513 // RocksDB initialization. It is intended to be called by CCL code. 514 func SetRocksDBOpenHook(fn unsafe.Pointer) { 515 C.DBSetOpenHook(fn) 516 } 517 518 // NewRocksDB allocates and returns a new RocksDB object. 519 // This creates options and opens the database. If the database 520 // doesn't yet exist at the specified directory, one is initialized 521 // from scratch. 522 // The caller must call the engine's Close method when the engine is no longer 523 // needed. 524 func NewRocksDB(cfg RocksDBConfig, cache RocksDBCache) (*RocksDB, error) { 525 if cfg.Dir == "" { 526 return nil, errors.New("dir must be non-empty") 527 } 528 529 r := &RocksDB{ 530 cfg: cfg, 531 cache: cache.ref(), 532 } 533 534 if err := r.setAuxiliaryDir(filepath.Join(cfg.Dir, base.AuxiliaryDir)); err != nil { 535 return nil, err 536 } 537 538 if err := r.open(); err != nil { 539 return nil, err 540 } 541 return r, nil 542 } 543 544 func newRocksDBInMem(attrs roachpb.Attributes, cacheSize int64) *RocksDB { 545 cache := NewRocksDBCache(cacheSize) 546 // The cache starts out with a refcount of one, and creating the engine 547 // from it adds another refcount, at which point we release one of them. 548 defer cache.Release() 549 550 // TODO(bdarnell): The hard-coded 512 MiB is wrong; see 551 // https://github.com/cockroachdb/cockroach/issues/16750 552 db, err := newMemRocksDB(attrs, cache, 512<<20 /* MaxSize: 512 MiB */) 553 if err != nil { 554 panic(err) 555 } 556 return db 557 } 558 559 func newMemRocksDB(attrs roachpb.Attributes, cache RocksDBCache, maxSize int64) (*RocksDB, error) { 560 r := &RocksDB{ 561 cfg: RocksDBConfig{ 562 StorageConfig: base.StorageConfig{ 563 Attrs: attrs, 564 MaxSize: maxSize, 565 }, 566 }, 567 // dir: empty dir == "mem" RocksDB instance. 568 cache: cache.ref(), 569 } 570 571 // TODO(peter): This is bizarre. We're creating on on-disk temporary 572 // directory for an in-memory filesystem. The reason this is done is because 573 // various users of the auxiliary directory use the os.* routines (which is 574 // invalid!). This needs to be cleaned up. 575 auxDir, err := ioutil.TempDir(os.TempDir(), "cockroach-auxiliary") 576 if err != nil { 577 return nil, err 578 } 579 if err := r.setAuxiliaryDir(auxDir); err != nil { 580 return nil, err 581 } 582 583 if err := r.open(); err != nil { 584 return nil, err 585 } 586 587 return r, nil 588 } 589 590 // String formatter. 591 func (r *RocksDB) String() string { 592 dir := r.cfg.Dir 593 if r.cfg.Dir == "" { 594 dir = "<in-mem>" 595 } 596 attrs := r.Attrs().String() 597 if attrs == "" { 598 attrs = "<no-attributes>" 599 } 600 return fmt.Sprintf("%s=%s", attrs, dir) 601 } 602 603 func (r *RocksDB) open() error { 604 var existingVersion, newVersion storageVersion 605 if len(r.cfg.Dir) != 0 { 606 log.Infof(context.TODO(), "opening rocksdb instance at %q", r.cfg.Dir) 607 608 // Check the version number. 
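// A version outside the supported range [versionMinimum, versionCurrent]
// fails the open with the "incompatible rocksdb data version" error below
// rather than attempting a migration.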
609 var err error 610 if existingVersion, err = getVersion(r.cfg.Dir); err != nil { 611 return err 612 } 613 if existingVersion < versionMinimum || existingVersion > versionCurrent { 614 // Instead of an error, we should call a migration if possible when 615 // one is needed immediately following the DBOpen call. 616 return fmt.Errorf("incompatible rocksdb data version, current:%d, on disk:%d, minimum:%d", 617 versionCurrent, existingVersion, versionMinimum) 618 } 619 620 newVersion = existingVersion 621 if newVersion == versionNoFile { 622 // We currently set the default store version one before the file registry 623 // to allow downgrades to older binaries as long as encryption is not in use. 624 // TODO(mberhault): once enough releases supporting versionFileRegistry have passed, we can upgrade 625 // to it without worry. 626 newVersion = versionBeta20160331 627 } 628 629 // Using the file registry forces the latest version. We can't downgrade! 630 if r.cfg.UseFileRegistry { 631 newVersion = versionCurrent 632 } 633 } else { 634 if log.V(2) { 635 log.Infof(context.TODO(), "opening in memory rocksdb instance") 636 } 637 638 // In memory dbs are always current. 639 existingVersion = versionCurrent 640 } 641 642 maxOpenFiles := uint64(RecommendedMaxOpenFiles) 643 if r.cfg.MaxOpenFiles != 0 { 644 maxOpenFiles = r.cfg.MaxOpenFiles 645 } 646 647 status := C.DBOpen(&r.rdb, goToCSlice([]byte(r.cfg.Dir)), 648 C.DBOptions{ 649 cache: r.cache.cache, 650 num_cpu: C.int(rocksdbConcurrency), 651 max_open_files: C.int(maxOpenFiles), 652 use_file_registry: C.bool(newVersion == versionCurrent), 653 must_exist: C.bool(r.cfg.MustExist), 654 read_only: C.bool(r.cfg.ReadOnly), 655 rocksdb_options: goToCSlice([]byte(r.cfg.RocksDBOptions)), 656 extra_options: goToCSlice(r.cfg.ExtraOptions), 657 }) 658 if err := statusToError(status); err != nil { 659 return errors.Wrap(err, "could not open rocksdb instance") 660 } 661 662 // Update or add the version file if needed and if on-disk. 663 if len(r.cfg.Dir) != 0 && existingVersion < newVersion { 664 if err := writeVersionFile(r.cfg.Dir, newVersion); err != nil { 665 return err 666 } 667 } 668 669 r.commit.cond.L = &r.commit.Mutex 670 r.syncer.cond.L = &r.syncer.Mutex 671 r.iters.m = make(map[*rocksDBIterator][]byte) 672 673 // NB: The sync goroutine acts as a check that the RocksDB instance was 674 // properly closed as the goroutine will leak otherwise. 675 go r.syncLoop() 676 return nil 677 } 678 679 func (r *RocksDB) syncLoop() { 680 s := &r.syncer 681 s.Lock() 682 683 var lastSync time.Time 684 var err error 685 686 for { 687 for len(s.pending) == 0 && !s.closed { 688 s.cond.Wait() 689 } 690 if s.closed { 691 s.Unlock() 692 return 693 } 694 695 var min time.Duration 696 if r.cfg.Settings != nil { 697 min = minWALSyncInterval.Get(&r.cfg.Settings.SV) 698 } 699 if delta := timeutil.Since(lastSync); delta < min { 700 s.Unlock() 701 time.Sleep(min - delta) 702 s.Lock() 703 } 704 705 pending := s.pending 706 s.pending = nil 707 708 s.Unlock() 709 710 // Linux only guarantees we'll be notified of a writeback error once 711 // during a sync call. After sync fails once, we cannot rely on any 712 // future data written to WAL being crash-recoverable. That's because 713 // any future writes will be appended after a potential corruption in 714 // the WAL, and RocksDB's recovery terminates upon encountering any 715 // corruption. So, we must not call `DBSyncWAL` again after it has 716 // failed once. 
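// The err == nil guard below enforces that policy: after the first DBSyncWAL
// failure the sticky error is handed to every subsequent pending batch
// without attempting another sync. In-memory engines (empty Dir) skip WAL
// syncing entirely.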
717 if r.cfg.Dir != "" && err == nil { 718 err = statusToError(C.DBSyncWAL(r.rdb)) 719 lastSync = timeutil.Now() 720 } 721 722 for _, b := range pending { 723 b.commitErr = err 724 b.commitWG.Done() 725 } 726 727 s.Lock() 728 } 729 } 730 731 // Close closes the database by deallocating the underlying handle. 732 func (r *RocksDB) Close() { 733 if r.rdb == nil { 734 log.Errorf(context.TODO(), "closing unopened rocksdb instance") 735 return 736 } 737 if len(r.cfg.Dir) == 0 { 738 if log.V(1) { 739 log.Infof(context.TODO(), "closing in-memory rocksdb instance") 740 } 741 // Remove the temporary directory when the engine is in-memory. 742 if err := os.RemoveAll(r.auxDir); err != nil { 743 log.Warningf(context.TODO(), "%v", err) 744 } 745 } else { 746 log.Infof(context.TODO(), "closing rocksdb instance at %q", r.cfg.Dir) 747 } 748 if r.rdb != nil { 749 if err := statusToError(C.DBClose(r.rdb)); err != nil { 750 if debugIteratorLeak { 751 r.iters.Lock() 752 for _, stack := range r.iters.m { 753 fmt.Printf("%s\n", stack) 754 } 755 r.iters.Unlock() 756 } 757 panic(err) 758 } 759 r.rdb = nil 760 } 761 r.cache.Release() 762 r.syncer.Lock() 763 r.syncer.closed = true 764 r.syncer.cond.Signal() 765 r.syncer.Unlock() 766 } 767 768 // CreateCheckpoint creates a RocksDB checkpoint in the given directory (which 769 // must not exist). This directory should be located on the same file system, or 770 // copies of all data are used instead of hard links, which is very expensive. 771 func (r *RocksDB) CreateCheckpoint(dir string) error { 772 status := C.DBCreateCheckpoint(r.rdb, goToCSlice([]byte(dir))) 773 return errors.Wrap(statusToError(status), "unable to take RocksDB checkpoint") 774 } 775 776 // Closed returns true if the engine is closed. 777 func (r *RocksDB) Closed() bool { 778 return r.rdb == nil 779 } 780 781 // ExportToSst is part of the engine.Reader interface. 782 func (r *RocksDB) ExportToSst( 783 startKey, endKey roachpb.Key, 784 startTS, endTS hlc.Timestamp, 785 exportAllRevisions bool, 786 targetSize, maxSize uint64, 787 io IterOptions, 788 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 789 start := MVCCKey{Key: startKey, Timestamp: startTS} 790 end := MVCCKey{Key: endKey, Timestamp: endTS} 791 792 var data C.DBString 793 var intentErr C.DBString 794 var bulkopSummary C.DBString 795 var resumeKey C.DBString 796 797 err := statusToError(C.DBExportToSst(goToCKey(start), goToCKey(end), 798 C.bool(exportAllRevisions), 799 C.uint64_t(targetSize), C.uint64_t(maxSize), 800 goToCIterOptions(io), r.rdb, &data, &intentErr, &bulkopSummary, &resumeKey)) 801 802 if err != nil { 803 if err.Error() == "WriteIntentError" { 804 var e roachpb.WriteIntentError 805 if err := protoutil.Unmarshal(cStringToGoBytes(intentErr), &e); err != nil { 806 return nil, roachpb.BulkOpSummary{}, nil, errors.Wrap(err, "failed to decode write intent error") 807 } 808 809 return nil, roachpb.BulkOpSummary{}, nil, &e 810 } 811 return nil, roachpb.BulkOpSummary{}, nil, err 812 } 813 814 var summary roachpb.BulkOpSummary 815 if err := protoutil.Unmarshal(cStringToGoBytes(bulkopSummary), &summary); err != nil { 816 return nil, roachpb.BulkOpSummary{}, nil, errors.Wrap(err, "failed to decode BulkopSummary") 817 } 818 819 return cStringToGoBytes(data), summary, roachpb.Key(cStringToGoBytes(resumeKey)), nil 820 } 821 822 // Attrs returns the list of attributes describing this engine. This 823 // may include a specification of disk type (e.g. hdd, ssd, fio, etc.) 
824 // and potentially other labels to identify important attributes of 825 // the engine. 826 func (r *RocksDB) Attrs() roachpb.Attributes { 827 return r.cfg.Attrs 828 } 829 830 // Put sets the given key to the value provided. 831 // 832 // It is safe to modify the contents of the arguments after Put returns. 833 func (r *RocksDB) Put(key MVCCKey, value []byte) error { 834 return dbPut(r.rdb, key, value) 835 } 836 837 // Merge implements the RocksDB merge operator using the function goMergeInit 838 // to initialize missing values and goMerge to merge the old and the given 839 // value into a new value, which is then stored under key. 840 // Currently 64-bit counter logic is implemented. See the documentation of 841 // goMerge and goMergeInit for details. 842 // 843 // It is safe to modify the contents of the arguments after Merge returns. 844 func (r *RocksDB) Merge(key MVCCKey, value []byte) error { 845 return dbMerge(r.rdb, key, value) 846 } 847 848 // LogData is part of the Writer interface. 849 // 850 // It is safe to modify the contents of the arguments after LogData returns. 851 func (r *RocksDB) LogData(data []byte) error { 852 panic("unimplemented") 853 } 854 855 // LogLogicalOp is part of the Writer interface. 856 func (r *RocksDB) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 857 // No-op. Logical logging disabled. 858 } 859 860 // ApplyBatchRepr atomically applies a set of batched updates. Created by 861 // calling Repr() on a batch. Using this method is equivalent to constructing 862 // and committing a batch whose Repr() equals repr. 863 // 864 // It is safe to modify the contents of the arguments after ApplyBatchRepr 865 // returns. 866 func (r *RocksDB) ApplyBatchRepr(repr []byte, sync bool) error { 867 return dbApplyBatchRepr(r.rdb, repr, sync) 868 } 869 870 // Get returns the value for the given key. 871 func (r *RocksDB) Get(key MVCCKey) ([]byte, error) { 872 return dbGet(r.rdb, key) 873 } 874 875 // GetProto fetches the value at the specified key and unmarshals it. 876 func (r *RocksDB) GetProto( 877 key MVCCKey, msg protoutil.Message, 878 ) (ok bool, keyBytes, valBytes int64, err error) { 879 return dbGetProto(r.rdb, key, msg) 880 } 881 882 // Clear removes the item from the db with the given key. 883 // 884 // It is safe to modify the contents of the arguments after Clear returns. 885 func (r *RocksDB) Clear(key MVCCKey) error { 886 return dbClear(r.rdb, key) 887 } 888 889 // SingleClear removes the most recent item from the db with the given key. 890 // 891 // It is safe to modify the contents of the arguments after SingleClear returns. 892 func (r *RocksDB) SingleClear(key MVCCKey) error { 893 return dbSingleClear(r.rdb, key) 894 } 895 896 // ClearRange removes a set of entries, from start (inclusive) to end 897 // (exclusive). 898 // 899 // It is safe to modify the contents of the arguments after ClearRange returns. 900 func (r *RocksDB) ClearRange(start, end MVCCKey) error { 901 return dbClearRange(r.rdb, start, end) 902 } 903 904 // ClearIterRange removes a set of entries, from start (inclusive) to end 905 // (exclusive). 906 // 907 // It is safe to modify the contents of the arguments after ClearIterRange 908 // returns. 909 func (r *RocksDB) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 910 return dbClearIterRange(r.rdb, iter, start, end) 911 } 912 913 // Iterate iterates from start to end keys, invoking f on each 914 // key/value pair. See engine.Iterate for details. 
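//
// A minimal usage sketch (the bounds are illustrative; per engine.Iterate,
// returning true from f stops the iteration early):
//
//	var count int
//	err := r.Iterate(roachpb.KeyMin, roachpb.KeyMax, func(kv MVCCKeyValue) (bool, error) {
//		count++
//		return false, nil // keep iterating
//	})
//	// use count, handle err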
915 func (r *RocksDB) Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (bool, error)) error { 916 return iterateOnReader(r, start, end, f) 917 } 918 919 // Capacity queries the underlying file system for disk capacity information. 920 func (r *RocksDB) Capacity() (roachpb.StoreCapacity, error) { 921 return computeCapacity(r.cfg.Dir, r.cfg.MaxSize) 922 } 923 924 // Compact forces compaction over the entire database. 925 func (r *RocksDB) Compact() error { 926 return statusToError(C.DBCompact(r.rdb)) 927 } 928 929 // CompactRange forces compaction over a specified range of keys in the database. 930 func (r *RocksDB) CompactRange(start, end roachpb.Key, forceBottommost bool) error { 931 return statusToError(C.DBCompactRange(r.rdb, goToCSlice(start), goToCSlice(end), C.bool(forceBottommost))) 932 } 933 934 // disableAutoCompaction disables automatic compactions. For testing use only. 935 func (r *RocksDB) disableAutoCompaction() error { 936 return statusToError(C.DBDisableAutoCompaction(r.rdb)) 937 } 938 939 // ApproximateDiskBytes returns the approximate on-disk size of the specified key range. 940 func (r *RocksDB) ApproximateDiskBytes(from, to roachpb.Key) (uint64, error) { 941 start := MVCCKey{Key: from} 942 end := MVCCKey{Key: to} 943 var result C.uint64_t 944 err := statusToError(C.DBApproximateDiskBytes(r.rdb, goToCKey(start), goToCKey(end), &result)) 945 return uint64(result), err 946 } 947 948 // Flush causes RocksDB to write all in-memory data to disk immediately. 949 func (r *RocksDB) Flush() error { 950 return statusToError(C.DBFlush(r.rdb)) 951 } 952 953 // NewIterator returns an iterator over this rocksdb engine. 954 func (r *RocksDB) NewIterator(opts IterOptions) Iterator { 955 return newRocksDBIterator(r.rdb, opts, r, r) 956 } 957 958 // NewSnapshot creates a snapshot handle from engine and returns a 959 // read-only rocksDBSnapshot engine. 960 func (r *RocksDB) NewSnapshot() Reader { 961 if r.rdb == nil { 962 panic("RocksDB is not initialized yet") 963 } 964 return &rocksDBSnapshot{ 965 parent: r, 966 handle: C.DBNewSnapshot(r.rdb), 967 } 968 } 969 970 // Type implements the Engine interface. 971 func (r *RocksDB) Type() enginepb.EngineType { 972 return enginepb.EngineTypeRocksDB 973 } 974 975 // NewReadOnly returns a new ReadWriter wrapping this rocksdb engine. 976 func (r *RocksDB) NewReadOnly() ReadWriter { 977 return &rocksDBReadOnly{ 978 parent: r, 979 isClosed: false, 980 } 981 } 982 983 type rocksDBReadOnly struct { 984 parent *RocksDB 985 prefixIter reusableIterator 986 normalIter reusableIterator 987 isClosed bool 988 } 989 990 func (r *rocksDBReadOnly) Close() { 991 if r.isClosed { 992 panic("closing an already-closed rocksDBReadOnly") 993 } 994 r.isClosed = true 995 if i := &r.prefixIter.rocksDBIterator; i.iter != nil { 996 i.destroy() 997 } 998 if i := &r.normalIter.rocksDBIterator; i.iter != nil { 999 i.destroy() 1000 } 1001 } 1002 1003 // Read-only batches are not committed 1004 func (r *rocksDBReadOnly) Closed() bool { 1005 return r.isClosed 1006 } 1007 1008 // ExportToSst is part of the engine.Reader interface. 
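//
// A hypothetical usage sketch for the read-only wrapper above (eng stands in
// for a *RocksDB; the key is illustrative):
//
//	ro := eng.NewReadOnly()
//	defer ro.Close()
//	val, err := ro.Get(MVCCKey{Key: roachpb.Key("a")})
//	// use val, handle err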
1009 func (r *rocksDBReadOnly) ExportToSst( 1010 startKey, endKey roachpb.Key, 1011 startTS, endTS hlc.Timestamp, 1012 exportAllRevisions bool, 1013 targetSize, maxSize uint64, 1014 io IterOptions, 1015 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1016 return r.parent.ExportToSst(startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io) 1017 } 1018 1019 func (r *rocksDBReadOnly) Get(key MVCCKey) ([]byte, error) { 1020 if r.isClosed { 1021 panic("using a closed rocksDBReadOnly") 1022 } 1023 return dbGet(r.parent.rdb, key) 1024 } 1025 1026 func (r *rocksDBReadOnly) GetProto( 1027 key MVCCKey, msg protoutil.Message, 1028 ) (ok bool, keyBytes, valBytes int64, err error) { 1029 if r.isClosed { 1030 panic("using a closed rocksDBReadOnly") 1031 } 1032 return dbGetProto(r.parent.rdb, key, msg) 1033 } 1034 1035 func (r *rocksDBReadOnly) Iterate( 1036 start, end roachpb.Key, f func(MVCCKeyValue) (bool, error), 1037 ) error { 1038 if r.isClosed { 1039 panic("using a closed rocksDBReadOnly") 1040 } 1041 return iterateOnReader(r, start, end, f) 1042 } 1043 1044 // NewIterator returns an iterator over the underlying engine. Note 1045 // that the returned iterator is cached and re-used for the lifetime of the 1046 // rocksDBReadOnly. A panic will be thrown if multiple prefix or normal (non-prefix) 1047 // iterators are used simultaneously on the same rocksDBReadOnly. 1048 func (r *rocksDBReadOnly) NewIterator(opts IterOptions) Iterator { 1049 if r.isClosed { 1050 panic("using a closed rocksDBReadOnly") 1051 } 1052 if opts.MinTimestampHint != (hlc.Timestamp{}) { 1053 // Iterators that specify timestamp bounds cannot be cached. 1054 return newRocksDBIterator(r.parent.rdb, opts, r, r.parent) 1055 } 1056 iter := &r.normalIter 1057 if opts.Prefix { 1058 iter = &r.prefixIter 1059 } 1060 if iter.rocksDBIterator.iter == nil { 1061 iter.rocksDBIterator.init(r.parent.rdb, opts, r, r.parent) 1062 } else { 1063 iter.rocksDBIterator.setOptions(opts) 1064 } 1065 if iter.inuse { 1066 panic("iterator already in use") 1067 } 1068 iter.inuse = true 1069 return iter 1070 } 1071 1072 // Writer methods are not implemented for rocksDBReadOnly. Ideally, the code 1073 // could be refactored so that a Reader could be supplied to evaluateBatch 1074 1075 // Writer is the write interface to an engine's data. 1076 func (r *rocksDBReadOnly) ApplyBatchRepr(repr []byte, sync bool) error { 1077 panic("not implemented") 1078 } 1079 1080 func (r *rocksDBReadOnly) Clear(key MVCCKey) error { 1081 panic("not implemented") 1082 } 1083 1084 func (r *rocksDBReadOnly) SingleClear(key MVCCKey) error { 1085 panic("not implemented") 1086 } 1087 1088 func (r *rocksDBReadOnly) ClearRange(start, end MVCCKey) error { 1089 panic("not implemented") 1090 } 1091 1092 func (r *rocksDBReadOnly) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 1093 panic("not implemented") 1094 } 1095 1096 func (r *rocksDBReadOnly) Merge(key MVCCKey, value []byte) error { 1097 panic("not implemented") 1098 } 1099 1100 func (r *rocksDBReadOnly) Put(key MVCCKey, value []byte) error { 1101 panic("not implemented") 1102 } 1103 1104 func (r *rocksDBReadOnly) LogData(data []byte) error { 1105 panic("not implemented") 1106 } 1107 1108 func (r *rocksDBReadOnly) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 1109 panic("not implemented") 1110 } 1111 1112 // NewBatch returns a new batch wrapping this rocksdb engine. 
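//
// A hypothetical write-then-commit sketch (the key and value are
// illustrative):
//
//	b := r.NewBatch()
//	defer b.Close()
//	if err := b.Put(MVCCKey{Key: roachpb.Key("a")}, []byte("v")); err != nil {
//		return err
//	}
//	if err := b.Commit(true /* syncCommit */); err != nil {
//		return err
//	}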
1113 func (r *RocksDB) NewBatch() Batch { 1114 b := newRocksDBBatch(r, false /* writeOnly */) 1115 return b 1116 } 1117 1118 // NewWriteOnlyBatch returns a new write-only batch wrapping this rocksdb 1119 // engine. 1120 func (r *RocksDB) NewWriteOnlyBatch() Batch { 1121 return newRocksDBBatch(r, true /* writeOnly */) 1122 } 1123 1124 // GetSSTables retrieves metadata about this engine's live sstables. 1125 func (r *RocksDB) GetSSTables() SSTableInfos { 1126 var n C.int 1127 tables := C.DBGetSSTables(r.rdb, &n) 1128 // We can't index into tables because it is a pointer, not a slice. The 1129 // hackery below treats the pointer as an array and then constructs a slice 1130 // from it. 1131 1132 tableSize := unsafe.Sizeof(C.DBSSTable{}) 1133 tableVal := func(i int) C.DBSSTable { 1134 return *(*C.DBSSTable)(unsafe.Pointer(uintptr(unsafe.Pointer(tables)) + uintptr(i)*tableSize)) 1135 } 1136 1137 res := make(SSTableInfos, n) 1138 for i := range res { 1139 r := &res[i] 1140 tv := tableVal(i) 1141 r.Level = int(tv.level) 1142 r.Size = int64(tv.size) 1143 r.Start = cToGoKey(tv.start_key) 1144 r.End = cToGoKey(tv.end_key) 1145 if ptr := tv.start_key.key.data; ptr != nil { 1146 C.free(unsafe.Pointer(ptr)) 1147 } 1148 if ptr := tv.end_key.key.data; ptr != nil { 1149 C.free(unsafe.Pointer(ptr)) 1150 } 1151 } 1152 C.free(unsafe.Pointer(tables)) 1153 1154 sort.Sort(res) 1155 return res 1156 } 1157 1158 // WALFileInfo contains metadata about a single write-ahead log file. Note this 1159 // mirrors the C.DBWALFile struct. 1160 type WALFileInfo struct { 1161 LogNumber int64 1162 Size int64 1163 } 1164 1165 // GetSortedWALFiles retrievews information about all of the write-ahead log 1166 // files in this engine in order from oldest to newest. 1167 func (r *RocksDB) GetSortedWALFiles() ([]WALFileInfo, error) { 1168 var n C.int 1169 var files *C.DBWALFile 1170 status := C.DBGetSortedWALFiles(r.rdb, &files, &n) 1171 if err := statusToError(status); err != nil { 1172 return nil, errors.Wrap(err, "could not get sorted WAL files") 1173 } 1174 defer C.free(unsafe.Pointer(files)) 1175 1176 // We can't index into files because it is a pointer, not a slice. The hackery 1177 // below treats the pointer as an array and then constructs a slice from it. 1178 1179 structSize := unsafe.Sizeof(C.DBWALFile{}) 1180 getWALFile := func(i int) *C.DBWALFile { 1181 return (*C.DBWALFile)(unsafe.Pointer(uintptr(unsafe.Pointer(files)) + uintptr(i)*structSize)) 1182 } 1183 1184 res := make([]WALFileInfo, n) 1185 for i := range res { 1186 wf := getWALFile(i) 1187 res[i].LogNumber = int64(wf.log_number) 1188 res[i].Size = int64(wf.size) 1189 } 1190 return res, nil 1191 } 1192 1193 // GetUserProperties fetches the user properties stored in each sstable's 1194 // metadata. 1195 func (r *RocksDB) GetUserProperties() (enginepb.SSTUserPropertiesCollection, error) { 1196 buf := cStringToGoBytes(C.DBGetUserProperties(r.rdb)) 1197 var ssts enginepb.SSTUserPropertiesCollection 1198 if err := protoutil.Unmarshal(buf, &ssts); err != nil { 1199 return enginepb.SSTUserPropertiesCollection{}, err 1200 } 1201 if ssts.Error != "" { 1202 return enginepb.SSTUserPropertiesCollection{}, errors.Newf("%s", ssts.Error) 1203 } 1204 return ssts, nil 1205 } 1206 1207 // GetStats retrieves stats from this engine's RocksDB instance and 1208 // returns it in a new instance of Stats. 
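//
// A hypothetical sketch deriving the block cache hit rate from the returned
// Stats (field names as populated below):
//
//	stats, err := r.GetStats()
//	if err == nil && stats.BlockCacheHits+stats.BlockCacheMisses > 0 {
//		hitRate := float64(stats.BlockCacheHits) /
//			float64(stats.BlockCacheHits+stats.BlockCacheMisses)
//		_ = hitRate
//	}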
1209 func (r *RocksDB) GetStats() (*Stats, error) { 1210 var s C.DBStatsResult 1211 if err := statusToError(C.DBGetStats(r.rdb, &s)); err != nil { 1212 return nil, err 1213 } 1214 return &Stats{ 1215 BlockCacheHits: int64(s.block_cache_hits), 1216 BlockCacheMisses: int64(s.block_cache_misses), 1217 BlockCacheUsage: int64(s.block_cache_usage), 1218 BlockCachePinnedUsage: int64(s.block_cache_pinned_usage), 1219 BloomFilterPrefixChecked: int64(s.bloom_filter_prefix_checked), 1220 BloomFilterPrefixUseful: int64(s.bloom_filter_prefix_useful), 1221 MemtableTotalSize: int64(s.memtable_total_size), 1222 Flushes: int64(s.flushes), 1223 FlushedBytes: int64(s.flush_bytes), 1224 Compactions: int64(s.compactions), 1225 IngestedBytes: 0, // Not exposed by RocksDB. 1226 CompactedBytesRead: int64(s.compact_read_bytes), 1227 CompactedBytesWritten: int64(s.compact_write_bytes), 1228 TableReadersMemEstimate: int64(s.table_readers_mem_estimate), 1229 PendingCompactionBytesEstimate: int64(s.pending_compaction_bytes_estimate), 1230 L0FileCount: int64(s.l0_file_count), 1231 }, nil 1232 } 1233 1234 // GetTickersAndHistograms retrieves maps of all RocksDB tickers and histograms. 1235 // It differs from `GetStats` by getting _every_ ticker and histogram, and by not 1236 // getting anything else (DB properties, for example). 1237 func (r *RocksDB) GetTickersAndHistograms() (*enginepb.TickersAndHistograms, error) { 1238 res := new(enginepb.TickersAndHistograms) 1239 var s C.DBTickersAndHistogramsResult 1240 if err := statusToError(C.DBGetTickersAndHistograms(r.rdb, &s)); err != nil { 1241 return nil, err 1242 } 1243 1244 tickers := (*[MaxArrayLen / C.sizeof_TickerInfo]C.TickerInfo)( 1245 unsafe.Pointer(s.tickers))[:s.tickers_len:s.tickers_len] 1246 res.Tickers = make(map[string]uint64) 1247 for _, ticker := range tickers { 1248 name := cStringToGoString(ticker.name) 1249 value := uint64(ticker.value) 1250 res.Tickers[name] = value 1251 } 1252 C.free(unsafe.Pointer(s.tickers)) 1253 1254 res.Histograms = make(map[string]enginepb.HistogramData) 1255 histograms := (*[MaxArrayLen / C.sizeof_HistogramInfo]C.HistogramInfo)( 1256 unsafe.Pointer(s.histograms))[:s.histograms_len:s.histograms_len] 1257 for _, histogram := range histograms { 1258 name := cStringToGoString(histogram.name) 1259 value := enginepb.HistogramData{ 1260 Mean: float64(histogram.mean), 1261 P50: float64(histogram.p50), 1262 P95: float64(histogram.p95), 1263 P99: float64(histogram.p99), 1264 Max: float64(histogram.max), 1265 Count: uint64(histogram.count), 1266 Sum: uint64(histogram.sum), 1267 } 1268 res.Histograms[name] = value 1269 } 1270 C.free(unsafe.Pointer(s.histograms)) 1271 return res, nil 1272 } 1273 1274 // GetCompactionStats returns the internal RocksDB compaction stats. See 1275 // https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide#rocksdb-statistics. 1276 func (r *RocksDB) GetCompactionStats() string { 1277 s := cStringToGoString(C.DBGetCompactionStats(r.rdb)) + 1278 "estimated_pending_compaction_bytes: " 1279 stats, err := r.GetStats() 1280 if err != nil { 1281 return s + err.Error() 1282 } 1283 return s + humanizeutil.IBytes(stats.PendingCompactionBytesEstimate) 1284 } 1285 1286 // GetEnvStats returns stats for the RocksDB env. This may include encryption stats. 
1287 func (r *RocksDB) GetEnvStats() (*EnvStats, error) { 1288 var s C.DBEnvStatsResult 1289 if err := statusToError(C.DBGetEnvStats(r.rdb, &s)); err != nil { 1290 return nil, err 1291 } 1292 1293 return &EnvStats{ 1294 TotalFiles: uint64(s.total_files), 1295 TotalBytes: uint64(s.total_bytes), 1296 ActiveKeyFiles: uint64(s.active_key_files), 1297 ActiveKeyBytes: uint64(s.active_key_bytes), 1298 EncryptionType: int32(s.encryption_type), 1299 EncryptionStatus: cStringToGoBytes(s.encryption_status), 1300 }, nil 1301 } 1302 1303 // GetEncryptionRegistries returns the file and key registries when encryption is enabled 1304 // on the store. 1305 func (r *RocksDB) GetEncryptionRegistries() (*EncryptionRegistries, error) { 1306 var s C.DBEncryptionRegistries 1307 if err := statusToError(C.DBGetEncryptionRegistries(r.rdb, &s)); err != nil { 1308 return nil, err 1309 } 1310 1311 return &EncryptionRegistries{ 1312 FileRegistry: cStringToGoBytes(s.file_registry), 1313 KeyRegistry: cStringToGoBytes(s.key_registry), 1314 }, nil 1315 } 1316 1317 type rocksDBSnapshot struct { 1318 parent *RocksDB 1319 handle *C.DBEngine 1320 } 1321 1322 // Close releases the snapshot handle. 1323 func (r *rocksDBSnapshot) Close() { 1324 C.DBClose(r.handle) 1325 r.handle = nil 1326 } 1327 1328 // Closed returns true if the engine is closed. 1329 func (r *rocksDBSnapshot) Closed() bool { 1330 return r.handle == nil 1331 } 1332 1333 // ExportToSst is part of the engine.Reader interface. 1334 func (r *rocksDBSnapshot) ExportToSst( 1335 startKey, endKey roachpb.Key, 1336 startTS, endTS hlc.Timestamp, 1337 exportAllRevisions bool, 1338 targetSize, maxSize uint64, 1339 io IterOptions, 1340 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1341 return r.parent.ExportToSst(startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io) 1342 } 1343 1344 // Get returns the value for the given key, nil otherwise using 1345 // the snapshot handle. 1346 func (r *rocksDBSnapshot) Get(key MVCCKey) ([]byte, error) { 1347 return dbGet(r.handle, key) 1348 } 1349 1350 func (r *rocksDBSnapshot) GetProto( 1351 key MVCCKey, msg protoutil.Message, 1352 ) (ok bool, keyBytes, valBytes int64, err error) { 1353 return dbGetProto(r.handle, key, msg) 1354 } 1355 1356 // Iterate iterates over the keys between start inclusive and end 1357 // exclusive, invoking f() on each key/value pair using the snapshot 1358 // handle. 1359 func (r *rocksDBSnapshot) Iterate( 1360 start, end roachpb.Key, f func(MVCCKeyValue) (bool, error), 1361 ) error { 1362 return iterateOnReader(r, start, end, f) 1363 } 1364 1365 // NewIterator returns a new instance of an Iterator over the 1366 // engine using the snapshot handle. 1367 func (r *rocksDBSnapshot) NewIterator(opts IterOptions) Iterator { 1368 return newRocksDBIterator(r.handle, opts, r, r.parent) 1369 } 1370 1371 // reusableIterator wraps rocksDBIterator and allows reuse of an iterator 1372 // for the lifetime of a batch. 1373 type reusableIterator struct { 1374 rocksDBIterator 1375 inuse bool 1376 } 1377 1378 func (r *reusableIterator) Close() { 1379 // reusableIterator.Close() leaves the underlying rocksdb iterator open until 1380 // the associated batch is closed. 
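// Clearing inuse below only hands the cached iterator back to its owner
// (a rocksDBReadOnly or distinctBatch); the underlying RocksDB iterator is
// destroyed when the owner itself is closed.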
1381 if !r.inuse { 1382 panic("closing idle iterator") 1383 } 1384 r.inuse = false 1385 } 1386 1387 type distinctBatch struct { 1388 *rocksDBBatch 1389 prefixIter reusableIterator 1390 normalIter reusableIterator 1391 } 1392 1393 func (r *distinctBatch) Close() { 1394 if !r.distinctOpen { 1395 panic("distinct batch not open") 1396 } 1397 r.distinctOpen = false 1398 } 1399 1400 // NewIterator returns an iterator over the batch and underlying engine. Note 1401 // that the returned iterator is cached and re-used for the lifetime of the 1402 // batch. A panic will be thrown if multiple prefix or normal (non-prefix) 1403 // iterators are used simultaneously on the same batch. 1404 func (r *distinctBatch) NewIterator(opts IterOptions) Iterator { 1405 if opts.MinTimestampHint != (hlc.Timestamp{}) { 1406 // Iterators that specify timestamp bounds cannot be cached. 1407 if r.writeOnly { 1408 return newRocksDBIterator(r.parent.rdb, opts, r, r.parent) 1409 } 1410 r.ensureBatch() 1411 return newRocksDBIterator(r.batch, opts, r, r.parent) 1412 } 1413 1414 // Use the cached iterator, creating it on first access. 1415 iter := &r.normalIter 1416 if opts.Prefix { 1417 iter = &r.prefixIter 1418 } 1419 if iter.rocksDBIterator.iter == nil { 1420 if r.writeOnly { 1421 iter.rocksDBIterator.init(r.parent.rdb, opts, r, r.parent) 1422 } else { 1423 r.ensureBatch() 1424 iter.rocksDBIterator.init(r.batch, opts, r, r.parent) 1425 } 1426 } else { 1427 iter.rocksDBIterator.setOptions(opts) 1428 } 1429 if iter.inuse { 1430 panic("iterator already in use") 1431 } 1432 iter.inuse = true 1433 return iter 1434 } 1435 1436 func (r *distinctBatch) Get(key MVCCKey) ([]byte, error) { 1437 if r.writeOnly { 1438 return dbGet(r.parent.rdb, key) 1439 } 1440 r.ensureBatch() 1441 return dbGet(r.batch, key) 1442 } 1443 1444 func (r *distinctBatch) GetProto( 1445 key MVCCKey, msg protoutil.Message, 1446 ) (ok bool, keyBytes, valBytes int64, err error) { 1447 if r.writeOnly { 1448 return dbGetProto(r.parent.rdb, key, msg) 1449 } 1450 r.ensureBatch() 1451 return dbGetProto(r.batch, key, msg) 1452 } 1453 1454 func (r *distinctBatch) Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (bool, error)) error { 1455 r.ensureBatch() 1456 return iterateOnReader(r, start, end, f) 1457 } 1458 1459 func (r *distinctBatch) Put(key MVCCKey, value []byte) error { 1460 r.builder.Put(key, value) 1461 return nil 1462 } 1463 1464 func (r *distinctBatch) Merge(key MVCCKey, value []byte) error { 1465 r.builder.Merge(key, value) 1466 return nil 1467 } 1468 1469 func (r *distinctBatch) LogData(data []byte) error { 1470 r.builder.LogData(data) 1471 return nil 1472 } 1473 1474 func (r *distinctBatch) Clear(key MVCCKey) error { 1475 r.builder.Clear(key) 1476 return nil 1477 } 1478 1479 func (r *distinctBatch) SingleClear(key MVCCKey) error { 1480 r.builder.SingleClear(key) 1481 return nil 1482 } 1483 1484 func (r *distinctBatch) ClearRange(start, end MVCCKey) error { 1485 if !r.writeOnly { 1486 panic("readable batch") 1487 } 1488 r.flushMutations() 1489 r.flushes++ // make sure that Repr() doesn't take a shortcut 1490 r.ensureBatch() 1491 return dbClearRange(r.batch, start, end) 1492 } 1493 1494 func (r *distinctBatch) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 1495 r.flushMutations() 1496 r.flushes++ // make sure that Repr() doesn't take a shortcut 1497 r.ensureBatch() 1498 return dbClearIterRange(r.batch, iter, start, end) 1499 } 1500 1501 func (r *distinctBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 1502 // 
No-op. Logical logging disabled. 1503 } 1504 1505 func (r *distinctBatch) close() { 1506 if r.prefixIter.inuse { 1507 panic("iterator still inuse") 1508 } 1509 if r.normalIter.inuse { 1510 panic("iterator still inuse") 1511 } 1512 if i := &r.prefixIter.rocksDBIterator; i.iter != nil { 1513 i.destroy() 1514 } 1515 if i := &r.normalIter.rocksDBIterator; i.iter != nil { 1516 i.destroy() 1517 } 1518 } 1519 1520 // batchIterator wraps rocksDBIterator and ensures that the buffered mutations 1521 // in a batch are flushed before performing read operations. 1522 type batchIterator struct { 1523 iter rocksDBIterator 1524 batch *rocksDBBatch 1525 } 1526 1527 func (r *batchIterator) Stats() IteratorStats { 1528 return r.iter.Stats() 1529 } 1530 1531 func (r *batchIterator) Close() { 1532 if r.batch == nil { 1533 panic("closing idle iterator") 1534 } 1535 r.batch = nil 1536 r.iter.destroy() 1537 } 1538 1539 func (r *batchIterator) SeekGE(key MVCCKey) { 1540 r.batch.flushMutations() 1541 r.iter.SeekGE(key) 1542 } 1543 1544 func (r *batchIterator) SeekLT(key MVCCKey) { 1545 r.batch.flushMutations() 1546 r.iter.SeekLT(key) 1547 } 1548 1549 func (r *batchIterator) Valid() (bool, error) { 1550 return r.iter.Valid() 1551 } 1552 1553 func (r *batchIterator) Next() { 1554 r.batch.flushMutations() 1555 r.iter.Next() 1556 } 1557 1558 func (r *batchIterator) Prev() { 1559 r.batch.flushMutations() 1560 r.iter.Prev() 1561 } 1562 1563 func (r *batchIterator) NextKey() { 1564 r.batch.flushMutations() 1565 r.iter.NextKey() 1566 } 1567 1568 func (r *batchIterator) ComputeStats( 1569 start, end roachpb.Key, nowNanos int64, 1570 ) (enginepb.MVCCStats, error) { 1571 r.batch.flushMutations() 1572 return r.iter.ComputeStats(start, end, nowNanos) 1573 } 1574 1575 func (r *batchIterator) FindSplitKey( 1576 start, end, minSplitKey roachpb.Key, targetSize int64, 1577 ) (MVCCKey, error) { 1578 r.batch.flushMutations() 1579 return r.iter.FindSplitKey(start, end, minSplitKey, targetSize) 1580 } 1581 1582 func (r *batchIterator) MVCCOpsSpecialized() bool { 1583 return r.iter.MVCCOpsSpecialized() 1584 } 1585 1586 func (r *batchIterator) MVCCGet( 1587 key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions, 1588 ) (*roachpb.Value, *roachpb.Intent, error) { 1589 r.batch.flushMutations() 1590 return r.iter.MVCCGet(key, timestamp, opts) 1591 } 1592 1593 func (r *batchIterator) MVCCScan( 1594 start, end roachpb.Key, timestamp hlc.Timestamp, opts MVCCScanOptions, 1595 ) (MVCCScanResult, error) { 1596 r.batch.flushMutations() 1597 return r.iter.MVCCScan(start, end, timestamp, opts) 1598 } 1599 1600 func (r *batchIterator) SetUpperBound(key roachpb.Key) { 1601 r.iter.SetUpperBound(key) 1602 } 1603 1604 func (r *batchIterator) Key() MVCCKey { 1605 return r.iter.Key() 1606 } 1607 1608 func (r *batchIterator) Value() []byte { 1609 return r.iter.Value() 1610 } 1611 1612 func (r *batchIterator) ValueProto(msg protoutil.Message) error { 1613 return r.iter.ValueProto(msg) 1614 } 1615 1616 func (r *batchIterator) UnsafeKey() MVCCKey { 1617 return r.iter.UnsafeKey() 1618 } 1619 1620 func (r *batchIterator) UnsafeValue() []byte { 1621 return r.iter.UnsafeValue() 1622 } 1623 1624 func (r *batchIterator) getIter() *C.DBIterator { 1625 return r.iter.iter 1626 } 1627 1628 func (r *batchIterator) CheckForKeyCollisions( 1629 sstData []byte, start, end roachpb.Key, 1630 ) (enginepb.MVCCStats, error) { 1631 return r.iter.CheckForKeyCollisions(sstData, start, end) 1632 } 1633 1634 // reusableBatchIterator wraps batchIterator and makes the Close method 
a no-op 1635 // to allow reuse of the iterator for the lifetime of the batch. The batch must 1636 // call iter.destroy() when it closes itself. 1637 type reusableBatchIterator struct { 1638 batchIterator 1639 } 1640 1641 func (r *reusableBatchIterator) Close() { 1642 // reusableBatchIterator.Close() leaves the underlying rocksdb iterator open 1643 // until the associated batch is closed. 1644 if r.batch == nil { 1645 panic("closing idle iterator") 1646 } 1647 r.batch = nil 1648 } 1649 1650 type rocksDBBatch struct { 1651 parent *RocksDB 1652 batch *C.DBEngine 1653 flushes int 1654 flushedCount int 1655 flushedSize int 1656 prefixIter reusableBatchIterator 1657 normalIter reusableBatchIterator 1658 builder RocksDBBatchBuilder 1659 distinct distinctBatch 1660 distinctOpen bool 1661 distinctNeedsFlush bool 1662 writeOnly bool 1663 syncCommit bool 1664 closed bool 1665 committed bool 1666 commitErr error 1667 commitWG sync.WaitGroup 1668 } 1669 1670 var batchPool = sync.Pool{ 1671 New: func() interface{} { 1672 return &rocksDBBatch{} 1673 }, 1674 } 1675 1676 func newRocksDBBatch(parent *RocksDB, writeOnly bool) *rocksDBBatch { 1677 // Get a new batch from the pool. Batches in the pool may have their closed 1678 // fields set to true to facilitate some sanity check assertions. Reset this 1679 // field and set others. 1680 r := batchPool.Get().(*rocksDBBatch) 1681 r.closed = false 1682 r.parent = parent 1683 r.writeOnly = writeOnly 1684 r.distinct.rocksDBBatch = r 1685 return r 1686 } 1687 1688 func (r *rocksDBBatch) ensureBatch() { 1689 if r.batch == nil { 1690 r.batch = C.DBNewBatch(r.parent.rdb, C.bool(r.writeOnly)) 1691 } 1692 } 1693 1694 func (r *rocksDBBatch) Close() { 1695 if r.closed { 1696 panic("this batch was already closed") 1697 } 1698 r.distinct.close() 1699 if r.prefixIter.batch != nil { 1700 panic("iterator still inuse") 1701 } 1702 if r.normalIter.batch != nil { 1703 panic("iterator still inuse") 1704 } 1705 if i := &r.prefixIter.iter; i.iter != nil { 1706 i.destroy() 1707 } 1708 if i := &r.normalIter.iter; i.iter != nil { 1709 i.destroy() 1710 } 1711 if r.batch != nil { 1712 C.DBClose(r.batch) 1713 r.batch = nil 1714 } 1715 r.builder.reset() 1716 r.closed = true 1717 1718 // Zero all the remaining fields individually. We can't just copy a new 1719 // struct onto r, since r.builder has a sync.NoCopy. 1720 r.batch = nil 1721 r.parent = nil 1722 r.flushes = 0 1723 r.flushedCount = 0 1724 r.flushedSize = 0 1725 r.prefixIter = reusableBatchIterator{} 1726 r.normalIter = reusableBatchIterator{} 1727 r.distinctOpen = false 1728 r.distinctNeedsFlush = false 1729 r.writeOnly = false 1730 r.syncCommit = false 1731 r.committed = false 1732 r.commitErr = nil 1733 r.commitWG = sync.WaitGroup{} 1734 1735 batchPool.Put(r) 1736 } 1737 1738 // Closed returns true if the engine is closed. 1739 func (r *rocksDBBatch) Closed() bool { 1740 return r.closed || r.committed 1741 } 1742 1743 // ExportToSst is part of the engine.Reader interface. 
1744 func (r *rocksDBBatch) ExportToSst( 1745 startKey, endKey roachpb.Key, 1746 startTS, endTS hlc.Timestamp, 1747 exportAllRevisions bool, 1748 targetSize, maxSize uint64, 1749 io IterOptions, 1750 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1751 panic("unimplemented") 1752 } 1753 1754 func (r *rocksDBBatch) Put(key MVCCKey, value []byte) error { 1755 if r.distinctOpen { 1756 panic("distinct batch open") 1757 } 1758 r.distinctNeedsFlush = true 1759 r.builder.Put(key, value) 1760 return nil 1761 } 1762 1763 func (r *rocksDBBatch) Merge(key MVCCKey, value []byte) error { 1764 if r.distinctOpen { 1765 panic("distinct batch open") 1766 } 1767 r.distinctNeedsFlush = true 1768 r.builder.Merge(key, value) 1769 return nil 1770 } 1771 1772 func (r *rocksDBBatch) LogData(data []byte) error { 1773 if r.distinctOpen { 1774 panic("distinct batch open") 1775 } 1776 r.distinctNeedsFlush = true 1777 r.builder.LogData(data) 1778 return nil 1779 } 1780 1781 // ApplyBatchRepr atomically applies a set of batched updates to the current 1782 // batch (the receiver). 1783 func (r *rocksDBBatch) ApplyBatchRepr(repr []byte, sync bool) error { 1784 if r.distinctOpen { 1785 panic("distinct batch open") 1786 } 1787 r.distinctNeedsFlush = true 1788 return r.builder.ApplyRepr(repr) 1789 } 1790 1791 func (r *rocksDBBatch) Get(key MVCCKey) ([]byte, error) { 1792 if r.writeOnly { 1793 panic("write-only batch") 1794 } 1795 if r.distinctOpen { 1796 panic("distinct batch open") 1797 } 1798 r.flushMutations() 1799 r.ensureBatch() 1800 return dbGet(r.batch, key) 1801 } 1802 1803 func (r *rocksDBBatch) GetProto( 1804 key MVCCKey, msg protoutil.Message, 1805 ) (ok bool, keyBytes, valBytes int64, err error) { 1806 if r.writeOnly { 1807 panic("write-only batch") 1808 } 1809 if r.distinctOpen { 1810 panic("distinct batch open") 1811 } 1812 r.flushMutations() 1813 r.ensureBatch() 1814 return dbGetProto(r.batch, key, msg) 1815 } 1816 1817 func (r *rocksDBBatch) Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (bool, error)) error { 1818 if r.writeOnly { 1819 panic("write-only batch") 1820 } 1821 if r.distinctOpen { 1822 panic("distinct batch open") 1823 } 1824 r.flushMutations() 1825 r.ensureBatch() 1826 return iterateOnReader(r, start, end, f) 1827 } 1828 1829 func (r *rocksDBBatch) Clear(key MVCCKey) error { 1830 if r.distinctOpen { 1831 panic("distinct batch open") 1832 } 1833 r.distinctNeedsFlush = true 1834 r.builder.Clear(key) 1835 return nil 1836 } 1837 1838 func (r *rocksDBBatch) SingleClear(key MVCCKey) error { 1839 if r.distinctOpen { 1840 panic("distinct batch open") 1841 } 1842 r.distinctNeedsFlush = true 1843 r.builder.SingleClear(key) 1844 return nil 1845 } 1846 1847 func (r *rocksDBBatch) ClearRange(start, end MVCCKey) error { 1848 if r.distinctOpen { 1849 panic("distinct batch open") 1850 } 1851 r.flushMutations() 1852 r.flushes++ // make sure that Repr() doesn't take a shortcut 1853 r.ensureBatch() 1854 return dbClearRange(r.batch, start, end) 1855 } 1856 1857 func (r *rocksDBBatch) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 1858 if r.distinctOpen { 1859 panic("distinct batch open") 1860 } 1861 r.flushMutations() 1862 r.flushes++ // make sure that Repr() doesn't take a shortcut 1863 r.ensureBatch() 1864 return dbClearIterRange(r.batch, iter, start, end) 1865 } 1866 1867 func (r *rocksDBBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 1868 // No-op. Logical logging disabled. 
1869 }
1870
1871 // NewIterator returns an iterator over the batch and underlying engine. Note
1872 // that the returned iterator is cached and re-used for the lifetime of the
1873 // batch. A panic will be thrown if multiple prefix or normal (non-prefix)
1874 // iterators are used simultaneously on the same batch.
1875 func (r *rocksDBBatch) NewIterator(opts IterOptions) Iterator {
1876 if r.writeOnly {
1877 panic("write-only batch")
1878 }
1879 if r.distinctOpen {
1880 panic("distinct batch open")
1881 }
1882
1883 if opts.MinTimestampHint != (hlc.Timestamp{}) {
1884 // Iterators that specify timestamp bounds cannot be cached.
1885 r.ensureBatch()
1886 iter := &batchIterator{batch: r}
1887 iter.iter.init(r.batch, opts, r, r.parent)
1888 return iter
1889 }
1890
1891 // Use the cached iterator, creating it on first access.
1892 iter := &r.normalIter
1893 if opts.Prefix {
1894 iter = &r.prefixIter
1895 }
1896 if iter.iter.iter == nil {
1897 r.ensureBatch()
1898 iter.iter.init(r.batch, opts, r, r.parent)
1899 } else {
1900 iter.iter.setOptions(opts)
1901 }
1902 if iter.batch != nil {
1903 panic("iterator already in use")
1904 }
1905 iter.batch = r
1906 return iter
1907 }
1908
1909 const maxBatchGroupSize = 1 << 20 // 1 MiB
1910
1911 // makeBatchGroup adds the specified batch to the pending list of batches to
1912 // commit. Groups are delimited by a nil batch in the pending list. Group
1913 // leaders are the first batch in the pending list and the first batch after a
1914 // nil batch. The size of a group is limited by the maxSize parameter which is
1915 // measured as the number of bytes in the group's batches. The groupSize
1916 // parameter is the size of the current group being formed. Returns the new
1917 // list of pending batches, the new size of the current group and whether the
1918 // batch that was added is the leader of its group.
1919 func makeBatchGroup(
1920 pending []*rocksDBBatch, b *rocksDBBatch, groupSize, maxSize int,
1921 ) (_ []*rocksDBBatch, _ int, leader bool) {
1922 leader = len(pending) == 0
1923 if n := len(b.unsafeRepr()); leader {
1924 groupSize = n
1925 } else if groupSize+n > maxSize {
1926 leader = true
1927 groupSize = n
1928 pending = append(pending, nil)
1929 } else {
1930 groupSize += n
1931 }
1932 pending = append(pending, b)
1933 return pending, groupSize, leader
1934 }
1935
1936 // nextBatchGroup extracts the group of batches from the pending list. See
1937 // makeBatchGroup for an explanation of how groups are encoded into the pending
1938 // list. Returns the next group in the prefix return value, and the remaining
1939 // groups in the suffix return value (the next group is always a prefix of the
1940 // pending argument).
1941 func nextBatchGroup(pending []*rocksDBBatch) (prefix []*rocksDBBatch, suffix []*rocksDBBatch) {
1942 for i := 1; i < len(pending); i++ {
1943 if pending[i] == nil {
1944 return pending[:i], pending[i+1:]
1945 }
1946 }
1947 return pending, pending[len(pending):]
1948 }
1949
1950 func (r *rocksDBBatch) Commit(syncCommit bool) error {
1951 if r.Closed() {
1952 panic("this batch was already committed")
1953 }
1954 r.distinctOpen = false
1955
1956 if r.Empty() {
1957 // Nothing was written to this batch. Fast path.
1958 r.committed = true
1959 return nil
1960 }
1961
1962 // Combine multiple write-only batch commits into a single call to
1963 // RocksDB. RocksDB is supposed to be performing such batching internally,
1964 // but whether Cgo or something else, it isn't achieving the same degree of
1965 // batching.
Instrumentation shows that internally RocksDB almost never
1966 // batches commits together, while the batching below can often batch 20 or
1967 // 30 concurrent commits.
1968 c := &r.parent.commit
1969 r.commitWG.Add(1)
1970 r.syncCommit = syncCommit
1971
1972 // The leader for the commit is the first batch to be added to the pending
1973 // slice. Every batch has an associated wait group which is signaled when
1974 // the commit is complete.
1975 c.Lock()
1976
1977 var leader bool
1978 c.pending, c.groupSize, leader = makeBatchGroup(c.pending, r, c.groupSize, maxBatchGroupSize)
1979
1980 if leader {
1981 // We're the leader of our group. Wait for any running commit to finish and
1982 // for our batch to make it to the head of the pending queue.
1983 for c.committing || c.pending[0] != r {
1984 c.cond.Wait()
1985 }
1986
1987 var pending []*rocksDBBatch
1988 pending, c.pending = nextBatchGroup(c.pending)
1989 c.committing = true
1990 c.Unlock()
1991
1992 // We want the batch that is performing the commit to be write-only in
1993 // order to avoid the (significant) overhead of indexing the operations in
1994 // the other batches when they are applied.
1995 committer := r
1996 merge := pending[1:]
1997 if !r.writeOnly && len(merge) > 0 {
1998 committer = newRocksDBBatch(r.parent, true /* writeOnly */)
1999 defer committer.Close()
2000 merge = pending
2001 }
2002
2003 // Bundle all of the batches together.
2004 var err error
2005 for _, b := range merge {
2006 if err = committer.ApplyBatchRepr(b.unsafeRepr(), false /* sync */); err != nil {
2007 break
2008 }
2009 }
2010
2011 if err == nil {
2012 err = committer.commitInternal(false /* sync */)
2013 }
2014
2015 // We're done committing the batch, let the next group of batches
2016 // proceed.
2017 c.Lock()
2018 c.committing = false
2019 // NB: Multiple leaders can be waiting.
2020 c.cond.Broadcast()
2021 c.Unlock()
2022
2023 // Propagate the error to all of the batches involved in the commit. If a
2024 // batch requires syncing and the commit was successful, add it to the
2025 // syncing list. Note that we're reusing the pending list here for the
2026 // syncing list. We need to be careful to cap the capacity so that
2027 // extending this slice past the length of the pending list will result in
2028 // reallocation. Otherwise we have a race between appending to this list
2029 // while holding the sync lock below, and appending to the commit pending
2030 // list while holding the commit lock above.
2031 syncing := pending[:0:len(pending)]
2032 for _, b := range pending {
2033 if err != nil || !b.syncCommit {
2034 b.commitErr = err
2035 b.commitWG.Done()
2036 } else {
2037 syncing = append(syncing, b)
2038 }
2039 }
2040
2041 if len(syncing) > 0 {
2042 // The commit was successful and one or more of the batches requires
2043 // syncing: notify the sync goroutine.
2044 s := &r.parent.syncer
2045 s.Lock()
2046 if len(s.pending) == 0 {
2047 s.pending = syncing
2048 } else {
2049 s.pending = append(s.pending, syncing...)
2050 }
2051 s.cond.Signal()
2052 s.Unlock()
2053 }
2054 } else {
2055 c.Unlock()
2056 }
2057 // Wait for the commit/sync to finish.
2058 r.commitWG.Wait()
2059 return r.commitErr
2060 }
2061
2062 func (r *rocksDBBatch) commitInternal(sync bool) error {
2063 start := timeutil.Now()
2064 var count, size int
2065
2066 if r.flushes > 0 {
2067 // We've previously flushed mutations to the C++ batch, so we have to flush
2068 // any remaining mutations as well and then commit the batch.
2069 r.flushMutations() 2070 r.ensureBatch() 2071 if err := statusToError(C.DBCommitAndCloseBatch(r.batch, C.bool(sync))); err != nil { 2072 return err 2073 } 2074 r.batch = nil 2075 count, size = r.flushedCount, r.flushedSize 2076 } else if r.builder.Len() > 0 { 2077 count, size = int(r.builder.Count()), r.builder.Len() 2078 2079 // Fast-path which avoids flushing mutations to the C++ batch. Instead, we 2080 // directly apply the mutations to the database. 2081 if err := dbApplyBatchRepr(r.parent.rdb, r.builder.Finish(), sync); err != nil { 2082 return err 2083 } 2084 if r.batch != nil { 2085 C.DBClose(r.batch) 2086 r.batch = nil 2087 } 2088 } else { 2089 panic("commitInternal called on empty batch") 2090 } 2091 r.committed = true 2092 2093 warnLargeBatches := r.parent.cfg.WarnLargeBatchThreshold > 0 2094 if elapsed := timeutil.Since(start); warnLargeBatches && (elapsed >= r.parent.cfg.WarnLargeBatchThreshold) { 2095 log.Warningf(context.TODO(), "batch [%d/%d/%d] commit took %s (>= warning threshold %s)", 2096 count, size, r.flushes, elapsed, r.parent.cfg.WarnLargeBatchThreshold) 2097 } 2098 2099 return nil 2100 } 2101 2102 func (r *rocksDBBatch) Empty() bool { 2103 return r.flushes == 0 && r.builder.Count() == 0 && !r.builder.logData 2104 } 2105 2106 func (r *rocksDBBatch) Len() int { 2107 return len(r.unsafeRepr()) 2108 } 2109 2110 func (r *rocksDBBatch) unsafeRepr() []byte { 2111 if r.flushes == 0 { 2112 // We've never flushed to C++. Return the mutations only. 2113 return r.builder.getRepr() 2114 } 2115 r.flushMutations() 2116 return cSliceToUnsafeGoBytes(C.DBBatchRepr(r.batch)) 2117 } 2118 2119 func (r *rocksDBBatch) Repr() []byte { 2120 if r.flushes == 0 { 2121 // We've never flushed to C++. Return the mutations only. We make a copy 2122 // of the builder's byte slice so that the return []byte is valid even 2123 // if the builder is reset or finished. 2124 repr := r.builder.getRepr() 2125 cpy := make([]byte, len(repr)) 2126 copy(cpy, repr) 2127 return cpy 2128 } 2129 r.flushMutations() 2130 return cSliceToGoBytes(C.DBBatchRepr(r.batch)) 2131 } 2132 2133 func (r *rocksDBBatch) Distinct() ReadWriter { 2134 if r.distinctNeedsFlush { 2135 r.flushMutations() 2136 } 2137 if r.distinctOpen { 2138 panic("distinct batch already open") 2139 } 2140 r.distinctOpen = true 2141 return &r.distinct 2142 } 2143 2144 func (r *rocksDBBatch) flushMutations() { 2145 if r.builder.Count() == 0 { 2146 return 2147 } 2148 r.ensureBatch() 2149 r.distinctNeedsFlush = false 2150 r.flushes++ 2151 r.flushedCount += int(r.builder.Count()) 2152 r.flushedSize += r.builder.Len() 2153 if err := dbApplyBatchRepr(r.batch, r.builder.Finish(), false); err != nil { 2154 panic(err) 2155 } 2156 // Force a seek of the underlying iterator on the next Seek/ReverseSeek. 2157 r.prefixIter.iter.reseek = true 2158 r.normalIter.iter.reseek = true 2159 } 2160 2161 type dbIteratorGetter interface { 2162 getIter() *C.DBIterator 2163 } 2164 2165 type rocksDBIterator struct { 2166 parent *RocksDB 2167 reader Reader 2168 iter *C.DBIterator 2169 valid bool 2170 reseek bool 2171 prefix bool 2172 err error 2173 key C.DBKey 2174 value C.DBSlice 2175 } 2176 2177 // TODO(peter): Is this pool useful now that rocksDBBatch.NewIterator doesn't 2178 // allocate by returning internal pointers? 2179 var iterPool = sync.Pool{ 2180 New: func() interface{} { 2181 return &rocksDBIterator{} 2182 }, 2183 } 2184 2185 // newRocksDBIterator returns a new iterator over the supplied RocksDB 2186 // instance. 
2187 // The caller must call rocksDBIterator.Close() when finished with the
2188 // iterator to free up resources.
2189 func newRocksDBIterator(
2190 rdb *C.DBEngine, opts IterOptions, reader Reader, parent *RocksDB,
2191 ) MVCCIterator {
2192 // In order to prevent content displacement, caching is disabled
2193 // when performing scans. Any options set within the shared read
2194 // options field that should be carried over need to be set here
2195 // as well.
2196 r := iterPool.Get().(*rocksDBIterator)
2197 r.init(rdb, opts, reader, parent)
2198 return r
2199 }
2200
2201 func (r *rocksDBIterator) getIter() *C.DBIterator {
2202 return r.iter
2203 }
2204
2205 func (r *rocksDBIterator) init(rdb *C.DBEngine, opts IterOptions, reader Reader, parent *RocksDB) {
2206 r.parent = parent
2207 if debugIteratorLeak && r.parent != nil {
2208 r.parent.iters.Lock()
2209 r.parent.iters.m[r] = debug.Stack()
2210 r.parent.iters.Unlock()
2211 }
2212
2213 if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
2214 panic("iterator must set prefix or upper bound or lower bound")
2215 }
2216
2217 r.iter = C.DBNewIter(rdb, goToCIterOptions(opts))
2218 if r.iter == nil {
2219 panic("unable to create iterator")
2220 }
2221 r.reader = reader
2222 r.prefix = opts.Prefix
2223 }
2224
2225 func (r *rocksDBIterator) setOptions(opts IterOptions) {
2226 if opts.MinTimestampHint != (hlc.Timestamp{}) || opts.MaxTimestampHint != (hlc.Timestamp{}) {
2227 panic("iterator with timestamp hints cannot be reused")
2228 }
2229 if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
2230 panic("iterator must set prefix or upper bound or lower bound")
2231 }
2232 C.DBIterSetLowerBound(r.iter, goToCKey(MakeMVCCMetadataKey(opts.LowerBound)))
2233 C.DBIterSetUpperBound(r.iter, goToCKey(MakeMVCCMetadataKey(opts.UpperBound)))
2234 }
2235
2236 func (r *rocksDBIterator) checkEngineOpen() {
2237 if r.reader.Closed() {
2238 panic("iterator used after backing engine closed")
2239 }
2240 }
2241
2242 func (r *rocksDBIterator) destroy() {
2243 if debugIteratorLeak && r.parent != nil {
2244 r.parent.iters.Lock()
2245 delete(r.parent.iters.m, r)
2246 r.parent.iters.Unlock()
2247 }
2248 C.DBIterDestroy(r.iter)
2249 *r = rocksDBIterator{}
2250 }
2251
2252 // The following methods implement the Iterator interface.
2253
2254 func (r *rocksDBIterator) Stats() IteratorStats {
2255 stats := C.DBIterStats(r.iter)
2256 return IteratorStats{
2257 TimeBoundNumSSTs: int(stats.timebound_num_ssts),
2258 InternalDeleteSkippedCount: int(stats.internal_delete_skipped_count),
2259 }
2260 }
2261
2262 func (r *rocksDBIterator) Close() {
2263 r.destroy()
2264 iterPool.Put(r)
2265 }
2266
2267 func (r *rocksDBIterator) SeekGE(key MVCCKey) {
2268 r.checkEngineOpen()
2269 if len(key.Key) == 0 {
2270 // start=Key("") needs special treatment since we need
2271 // to access start[0] in an explicit seek.
2272 r.setState(C.DBIterSeekToFirst(r.iter))
2273 } else {
2274 // We can avoid seeking if we're already at the key we seek.
2275 if r.valid && !r.reseek && key.Equal(r.UnsafeKey()) {
2276 return
2277 }
2278 r.setState(C.DBIterSeek(r.iter, goToCKey(key)))
2279 }
2280 }
2281
2282 func (r *rocksDBIterator) SeekLT(key MVCCKey) {
2283 r.checkEngineOpen()
2284 if len(key.Key) == 0 {
2285 r.setState(C.DBIterSeekToLast(r.iter))
2286 } else {
2287 // SeekForPrev positions the iterator at the last key that is less
2288 // than or equal to key, so we may need to iterate backwards once.
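// For example, with keys a, b, and d present, SeekLT(b) first lands on b
// (SeekForPrev returns the last key <= b) and the Prev() below steps back to
// a, while SeekLT(c) lands on b directly and needs no extra step.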
2289 r.setState(C.DBIterSeekForPrev(r.iter, goToCKey(key))) 2290 if r.valid && key.Equal(r.UnsafeKey()) { 2291 r.Prev() 2292 } 2293 } 2294 } 2295 2296 func (r *rocksDBIterator) Valid() (bool, error) { 2297 return r.valid, r.err 2298 } 2299 2300 func (r *rocksDBIterator) Next() { 2301 r.checkEngineOpen() 2302 r.setState(C.DBIterNext(r.iter, C.bool(false) /* skip_current_key_versions */)) 2303 } 2304 2305 var errReversePrefixIteration = fmt.Errorf("unsupported reverse prefix iteration") 2306 2307 func (r *rocksDBIterator) Prev() { 2308 r.checkEngineOpen() 2309 if r.prefix { 2310 r.valid = false 2311 r.err = errReversePrefixIteration 2312 return 2313 } 2314 r.setState(C.DBIterPrev(r.iter, C.bool(false) /* skip_current_key_versions */)) 2315 } 2316 2317 func (r *rocksDBIterator) NextKey() { 2318 r.checkEngineOpen() 2319 r.setState(C.DBIterNext(r.iter, C.bool(true) /* skip_current_key_versions */)) 2320 } 2321 2322 func (r *rocksDBIterator) Key() MVCCKey { 2323 // The data returned by rocksdb_iter_{key,value} is not meant to be 2324 // freed by the client. It is a direct reference to the data managed 2325 // by the iterator, so it is copied instead of freed. 2326 return cToGoKey(r.key) 2327 } 2328 2329 func (r *rocksDBIterator) Value() []byte { 2330 return cSliceToGoBytes(r.value) 2331 } 2332 2333 func (r *rocksDBIterator) ValueProto(msg protoutil.Message) error { 2334 if r.value.len == 0 { 2335 return nil 2336 } 2337 return protoutil.Unmarshal(r.UnsafeValue(), msg) 2338 } 2339 2340 func (r *rocksDBIterator) UnsafeKey() MVCCKey { 2341 return cToUnsafeGoKey(r.key) 2342 } 2343 2344 func (r *rocksDBIterator) UnsafeValue() []byte { 2345 return cSliceToUnsafeGoBytes(r.value) 2346 } 2347 2348 func (r *rocksDBIterator) clearState() { 2349 r.valid = false 2350 r.reseek = true 2351 r.key = C.DBKey{} 2352 r.value = C.DBSlice{} 2353 r.err = nil 2354 } 2355 2356 func (r *rocksDBIterator) setState(state C.DBIterState) { 2357 r.valid = bool(state.valid) 2358 r.reseek = false 2359 r.key = state.key 2360 r.value = state.value 2361 r.err = statusToError(state.status) 2362 } 2363 2364 func (r *rocksDBIterator) ComputeStats( 2365 start, end roachpb.Key, nowNanos int64, 2366 ) (enginepb.MVCCStats, error) { 2367 r.clearState() 2368 result := C.MVCCComputeStats(r.iter, 2369 goToCKey(MakeMVCCMetadataKey(start)), 2370 goToCKey(MakeMVCCMetadataKey(end)), 2371 C.int64_t(nowNanos)) 2372 stats, err := cStatsToGoStats(result, nowNanos) 2373 if util.RaceEnabled { 2374 // If we've come here via batchIterator, then flushMutations (which forces 2375 // reseek) was called just before C.MVCCComputeStats. Set it here as well 2376 // to match. 2377 r.reseek = true 2378 // C.MVCCComputeStats and ComputeStatsGo must behave identically. 2379 // There are unit tests to ensure that they return the same result, but 2380 // as an additional check, use the race builds to check any edge cases 2381 // that the tests may miss. 
2382 verifyStats, verifyErr := ComputeStatsGo(r, start, end, nowNanos) 2383 if (err != nil) != (verifyErr != nil) { 2384 panic(fmt.Sprintf("C.MVCCComputeStats differed from ComputeStatsGo: err %v vs %v", err, verifyErr)) 2385 } 2386 if !stats.Equal(verifyStats) { 2387 panic(fmt.Sprintf("C.MVCCComputeStats differed from ComputeStatsGo: stats %+v vs %+v", stats, verifyStats)) 2388 } 2389 } 2390 return stats, err 2391 } 2392 2393 func (r *rocksDBIterator) FindSplitKey( 2394 start, end, minSplitKey roachpb.Key, targetSize int64, 2395 ) (MVCCKey, error) { 2396 var splitKey C.DBString 2397 r.clearState() 2398 status := C.MVCCFindSplitKey(r.iter, 2399 goToCKey(MakeMVCCMetadataKey(start)), 2400 goToCKey(MakeMVCCMetadataKey(minSplitKey)), 2401 C.int64_t(targetSize), &splitKey) 2402 if err := statusToError(status); err != nil { 2403 return MVCCKey{}, err 2404 } 2405 return MVCCKey{Key: cStringToGoBytes(splitKey)}, nil 2406 } 2407 2408 func (r *rocksDBIterator) MVCCOpsSpecialized() bool { 2409 // rocksDBIterator provides specialized implementations of MVCCGet and 2410 // MVCCScan. 2411 return true 2412 } 2413 2414 func (r *rocksDBIterator) MVCCGet( 2415 key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions, 2416 ) (*roachpb.Value, *roachpb.Intent, error) { 2417 if opts.Inconsistent && opts.Txn != nil { 2418 return nil, nil, errors.Errorf("cannot allow inconsistent reads within a transaction") 2419 } 2420 if len(key) == 0 { 2421 return nil, nil, emptyKeyError() 2422 } 2423 2424 r.clearState() 2425 state := C.MVCCGet( 2426 r.iter, goToCSlice(key), goToCTimestamp(timestamp), goToCTxn(opts.Txn), 2427 C.bool(opts.Inconsistent), C.bool(opts.Tombstones), C.bool(opts.FailOnMoreRecent), 2428 ) 2429 2430 if err := statusToError(state.status); err != nil { 2431 return nil, nil, err 2432 } 2433 if err := writeTooOldToError(timestamp, state.write_too_old_timestamp); err != nil { 2434 return nil, nil, err 2435 } 2436 if err := uncertaintyToError(timestamp, state.uncertainty_timestamp, opts.Txn); err != nil { 2437 return nil, nil, err 2438 } 2439 2440 intents, err := buildScanIntents(cSliceToGoBytes(state.intents)) 2441 if err != nil { 2442 return nil, nil, err 2443 } 2444 if !opts.Inconsistent && len(intents) > 0 { 2445 return nil, nil, &roachpb.WriteIntentError{Intents: intents} 2446 } 2447 2448 var intent *roachpb.Intent 2449 if len(intents) > 1 { 2450 return nil, nil, errors.Errorf("expected 0 or 1 intents, got %d", len(intents)) 2451 } else if len(intents) == 1 { 2452 intent = &intents[0] 2453 } 2454 if state.data.len == 0 { 2455 return nil, intent, nil 2456 } 2457 2458 count := state.data.count 2459 if count > 1 { 2460 return nil, nil, errors.Errorf("expected 0 or 1 result, found %d", count) 2461 } 2462 if count == 0 { 2463 return nil, intent, nil 2464 } 2465 2466 // Extract the value from the batch data. 
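// The copied buffer holds a single key/value pair in the MVCCScan batch
// encoding; MVCCScanDecodeKeyValue below splits it back into the MVCC key
// (whose timestamp becomes the value's timestamp) and the raw value bytes.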
2467 repr := copyFromSliceVector(state.data.bufs, state.data.len)
2468 mvccKey, rawValue, _, err := MVCCScanDecodeKeyValue(repr)
2469 if err != nil {
2470 return nil, nil, err
2471 }
2472 value := &roachpb.Value{
2473 RawBytes: rawValue,
2474 Timestamp: mvccKey.Timestamp,
2475 }
2476 return value, intent, nil
2477 }
2478
2479 func (r *rocksDBIterator) MVCCScan(
2480 start, end roachpb.Key, timestamp hlc.Timestamp, opts MVCCScanOptions,
2481 ) (MVCCScanResult, error) {
2482 if opts.Inconsistent && opts.Txn != nil {
2483 return MVCCScanResult{}, errors.Errorf("cannot allow inconsistent reads within a transaction")
2484 }
2485 if len(end) == 0 {
2486 return MVCCScanResult{}, emptyKeyError()
2487 }
2488 if opts.MaxKeys < 0 {
2489 resumeSpan := &roachpb.Span{Key: start, EndKey: end}
2490 return MVCCScanResult{ResumeSpan: resumeSpan}, nil
2491 }
2492
2493 r.clearState()
2494 state := C.MVCCScan(
2495 r.iter, goToCSlice(start), goToCSlice(end), goToCTimestamp(timestamp),
2496 C.int64_t(opts.MaxKeys), C.int64_t(opts.TargetBytes),
2497 goToCTxn(opts.Txn), C.bool(opts.Inconsistent),
2498 C.bool(opts.Reverse), C.bool(opts.Tombstones),
2499 C.bool(opts.FailOnMoreRecent),
2500 )
2501
2502 if err := statusToError(state.status); err != nil {
2503 return MVCCScanResult{}, err
2504 }
2505 if err := writeTooOldToError(timestamp, state.write_too_old_timestamp); err != nil {
2506 return MVCCScanResult{}, err
2507 }
2508 if err := uncertaintyToError(timestamp, state.uncertainty_timestamp, opts.Txn); err != nil {
2509 return MVCCScanResult{}, err
2510 }
2511
2512 kvData := [][]byte{copyFromSliceVector(state.data.bufs, state.data.len)}
2513 numKVs := int64(state.data.count)
2514 numBytes := int64(state.data.bytes)
2515
2516 var resumeSpan *roachpb.Span
2517 if resumeKey := cSliceToGoBytes(state.resume_key); resumeKey != nil {
2518 if opts.Reverse {
2519 resumeSpan = &roachpb.Span{Key: start, EndKey: roachpb.Key(resumeKey).Next()}
2520 } else {
2521 resumeSpan = &roachpb.Span{Key: resumeKey, EndKey: end}
2522 }
2523 }
2524
2525 intents, err := buildScanIntents(cSliceToGoBytes(state.intents))
2526 if err != nil {
2527 return MVCCScanResult{}, err
2528 }
2529 if !opts.Inconsistent && len(intents) > 0 {
2530 return MVCCScanResult{}, &roachpb.WriteIntentError{Intents: intents}
2531 }
2532
2533 return MVCCScanResult{
2534 KVData: kvData,
2535 NumKeys: numKVs,
2536 NumBytes: numBytes,
2537 ResumeSpan: resumeSpan,
2538 Intents: intents,
2539 }, nil
2540 }
2541
2542 func (r *rocksDBIterator) SetUpperBound(key roachpb.Key) {
2543 C.DBIterSetUpperBound(r.iter, goToCKey(MakeMVCCMetadataKey(key)))
2544 }
2545
2546 // CheckForKeyCollisions indicates if the provided SST data collides with this
2547 // iterator in the specified range.
2548 func (r *rocksDBIterator) CheckForKeyCollisions(
2549 sstData []byte, start, end roachpb.Key,
2550 ) (enginepb.MVCCStats, error) {
2551 // Create a C++ iterator over the SST being added. This iterator is used to
2552 // perform a check for key collisions between the SST being ingested, and the
2553 // existing data. As the collision check is in C++ we are unable to use a
2554 // pure go iterator as in verifySSTable.
2555 sst := MakeRocksDBSstFileReader() 2556 defer sst.Close() 2557 emptyStats := enginepb.MVCCStats{} 2558 2559 if err := sst.IngestExternalFile(sstData); err != nil { 2560 return emptyStats, err 2561 } 2562 sstIterator := sst.NewIterator(IterOptions{UpperBound: end}).(*rocksDBIterator) 2563 defer sstIterator.Close() 2564 sstIterator.SeekGE(MakeMVCCMetadataKey(start)) 2565 if ok, err := sstIterator.Valid(); err != nil || !ok { 2566 return emptyStats, errors.Wrap(err, "checking for key collisions") 2567 } 2568 2569 var intentErr C.DBString 2570 var skippedKVStats C.MVCCStatsResult 2571 2572 state := C.DBCheckForKeyCollisions(r.iter, sstIterator.iter, &skippedKVStats, &intentErr) 2573 2574 err := statusToError(state.status) 2575 if err != nil { 2576 if err.Error() == "WriteIntentError" { 2577 var e roachpb.WriteIntentError 2578 if err := protoutil.Unmarshal(cStringToGoBytes(intentErr), &e); err != nil { 2579 return emptyStats, errors.Wrap(err, "failed to decode write intent error") 2580 } 2581 return emptyStats, &e 2582 } else if err.Error() == "InlineError" { 2583 return emptyStats, errors.Errorf("inline values are unsupported when checking for key collisions") 2584 } 2585 err = errors.Wrap(&Error{msg: cToGoKey(state.key).String()}, "ingested key collides with an existing one") 2586 return emptyStats, err 2587 } 2588 2589 skippedStats, err := cStatsToGoStats(skippedKVStats, 0) 2590 return skippedStats, err 2591 } 2592 2593 func copyFromSliceVector(bufs *C.DBSlice, len C.int32_t) []byte { 2594 if bufs == nil { 2595 return nil 2596 } 2597 2598 // Interpret the C pointer as a pointer to a Go array, then slice. 2599 slices := (*[1 << 20]C.DBSlice)(unsafe.Pointer(bufs))[:len:len] 2600 neededBytes := 0 2601 for i := range slices { 2602 neededBytes += int(slices[i].len) 2603 } 2604 data := nonZeroingMakeByteSlice(neededBytes)[:0] 2605 for i := range slices { 2606 data = append(data, cSliceToUnsafeGoBytes(slices[i])...) 2607 } 2608 return data 2609 } 2610 2611 func cStatsToGoStats(stats C.MVCCStatsResult, nowNanos int64) (enginepb.MVCCStats, error) { 2612 ms := enginepb.MVCCStats{} 2613 if err := statusToError(stats.status); err != nil { 2614 return ms, err 2615 } 2616 2617 ms.ContainsEstimates = 0 2618 ms.LiveBytes = int64(stats.live_bytes) 2619 ms.KeyBytes = int64(stats.key_bytes) 2620 ms.ValBytes = int64(stats.val_bytes) 2621 ms.IntentBytes = int64(stats.intent_bytes) 2622 ms.LiveCount = int64(stats.live_count) 2623 ms.KeyCount = int64(stats.key_count) 2624 ms.ValCount = int64(stats.val_count) 2625 ms.IntentCount = int64(stats.intent_count) 2626 ms.IntentAge = int64(stats.intent_age) 2627 ms.GCBytesAge = int64(stats.gc_bytes_age) 2628 ms.SysBytes = int64(stats.sys_bytes) 2629 ms.SysCount = int64(stats.sys_count) 2630 ms.LastUpdateNanos = nowNanos 2631 return ms, nil 2632 } 2633 2634 // goToCSlice converts a go byte slice to a DBSlice. Note that this is 2635 // potentially dangerous as the DBSlice holds a reference to the go 2636 // byte slice memory that the Go GC does not know about. This method 2637 // is only intended for use in converting arguments to C 2638 // functions. The C function must copy any data that it wishes to 2639 // retain once the function returns. 
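// For example, the key and value slices passed to C.DBPut via goToCSlice in
// dbPut below remain owned by Go; the call is safe only because DBPut copies
// the data (by way of MemTable::Add) before returning.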
2640 func goToCSlice(b []byte) C.DBSlice { 2641 if len(b) == 0 { 2642 return C.DBSlice{data: nil, len: 0} 2643 } 2644 return C.DBSlice{ 2645 data: (*C.char)(unsafe.Pointer(&b[0])), 2646 len: C.size_t(len(b)), 2647 } 2648 } 2649 2650 func goToCIgnoredSeqNums(b []enginepb.IgnoredSeqNumRange) C.DBIgnoredSeqNums { 2651 if len(b) == 0 { 2652 return C.DBIgnoredSeqNums{ranges: nil, len: 0} 2653 } 2654 return C.DBIgnoredSeqNums{ 2655 ranges: (*C.DBIgnoredSeqNumRange)(unsafe.Pointer(&b[0])), 2656 len: C.int(len(b)), 2657 } 2658 } 2659 2660 func goToCKey(key MVCCKey) C.DBKey { 2661 return C.DBKey{ 2662 key: goToCSlice(key.Key), 2663 wall_time: C.int64_t(key.Timestamp.WallTime), 2664 logical: C.int32_t(key.Timestamp.Logical), 2665 } 2666 } 2667 2668 func cToGoKey(key C.DBKey) MVCCKey { 2669 // When converting a C.DBKey to an MVCCKey, give the underlying slice an 2670 // extra byte of capacity in anticipation of roachpb.Key.Next() being 2671 // called. The extra byte is trivial extra space, but allows callers to avoid 2672 // an allocation and copy when calling roachpb.Key.Next(). Note that it is 2673 // important that the extra byte contain the value 0 in order for the 2674 // roachpb.Key.Next() fast-path to be invoked. This is true for the code 2675 // below because make() zero initializes all of the bytes. 2676 unsafeKey := cSliceToUnsafeGoBytes(key.key) 2677 safeKey := make([]byte, len(unsafeKey), len(unsafeKey)+1) 2678 copy(safeKey, unsafeKey) 2679 2680 return MVCCKey{ 2681 Key: safeKey, 2682 Timestamp: hlc.Timestamp{ 2683 WallTime: int64(key.wall_time), 2684 Logical: int32(key.logical), 2685 }, 2686 } 2687 } 2688 2689 func cToUnsafeGoKey(key C.DBKey) MVCCKey { 2690 return MVCCKey{ 2691 Key: cSliceToUnsafeGoBytes(key.key), 2692 Timestamp: hlc.Timestamp{ 2693 WallTime: int64(key.wall_time), 2694 Logical: int32(key.logical), 2695 }, 2696 } 2697 } 2698 2699 func cStringToGoString(s C.DBString) string { 2700 if s.data == nil { 2701 return "" 2702 } 2703 // Reinterpret the string as a slice, then cast to string which does a copy. 2704 result := string(cSliceToUnsafeGoBytes(C.DBSlice{s.data, s.len})) 2705 C.free(unsafe.Pointer(s.data)) 2706 return result 2707 } 2708 2709 func cStringToGoBytes(s C.DBString) []byte { 2710 if s.data == nil { 2711 return nil 2712 } 2713 result := gobytes(unsafe.Pointer(s.data), int(s.len)) 2714 C.free(unsafe.Pointer(s.data)) 2715 return result 2716 } 2717 2718 func cSliceToGoBytes(s C.DBSlice) []byte { 2719 if s.data == nil { 2720 return nil 2721 } 2722 return gobytes(unsafe.Pointer(s.data), int(s.len)) 2723 } 2724 2725 func cSliceToUnsafeGoBytes(s C.DBSlice) []byte { 2726 if s.data == nil { 2727 return nil 2728 } 2729 // Interpret the C pointer as a pointer to a Go array, then slice. 
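// The resulting slice aliases the C-owned memory, so it is only valid while
// that memory remains live; callers that need the bytes to outlive the
// DBSlice must copy them (as cSliceToGoBytes does via gobytes).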
2730 return (*[MaxArrayLen]byte)(unsafe.Pointer(s.data))[:s.len:s.len]
2731 }
2732
2733 func goToCTimestamp(ts hlc.Timestamp) C.DBTimestamp {
2734 return C.DBTimestamp{
2735 wall_time: C.int64_t(ts.WallTime),
2736 logical: C.int32_t(ts.Logical),
2737 }
2738 }
2739
2740 func cToGoTimestamp(ts C.DBTimestamp) hlc.Timestamp {
2741 return hlc.Timestamp{
2742 WallTime: int64(ts.wall_time),
2743 Logical: int32(ts.logical),
2744 }
2745 }
2746
2747 func goToCTxn(txn *roachpb.Transaction) C.DBTxn {
2748 var r C.DBTxn
2749 if txn != nil {
2750 r.id = goToCSlice(txn.ID.GetBytesMut())
2751 r.epoch = C.uint32_t(txn.Epoch)
2752 r.sequence = C.int32_t(txn.Sequence)
2753 r.max_timestamp = goToCTimestamp(txn.MaxTimestamp)
2754 r.ignored_seqnums = goToCIgnoredSeqNums(txn.IgnoredSeqNums)
2755 }
2756 return r
2757 }
2758
2759 func goToCIterOptions(opts IterOptions) C.DBIterOptions {
2760 return C.DBIterOptions{
2761 prefix: C.bool(opts.Prefix),
2762 lower_bound: goToCKey(MakeMVCCMetadataKey(opts.LowerBound)),
2763 upper_bound: goToCKey(MakeMVCCMetadataKey(opts.UpperBound)),
2764 min_timestamp_hint: goToCTimestamp(opts.MinTimestampHint),
2765 max_timestamp_hint: goToCTimestamp(opts.MaxTimestampHint),
2766 with_stats: C.bool(opts.WithStats),
2767 }
2768 }
2769
2770 func statusToError(s C.DBStatus) error {
2771 if s.data == nil {
2772 return nil
2773 }
2774 return &Error{msg: cStringToGoString(s)}
2775 }
2776
2777 func writeTooOldToError(readTS hlc.Timestamp, existingCTS C.DBTimestamp) error {
2778 existingTS := cToGoTimestamp(existingCTS)
2779 if !existingTS.IsEmpty() {
2780 // The txn can't write at the existing timestamp, so we provide the
2781 // error with the timestamp immediately after it.
2782 return roachpb.NewWriteTooOldError(readTS, existingTS.Next())
2783 }
2784 return nil
2785 }
2786
2787 func uncertaintyToError(
2788 readTS hlc.Timestamp, existingCTS C.DBTimestamp, txn *roachpb.Transaction,
2789 ) error {
2790 existingTS := cToGoTimestamp(existingCTS)
2791 if !existingTS.IsEmpty() {
2792 return roachpb.NewReadWithinUncertaintyIntervalError(readTS, existingTS, txn)
2793 }
2794 return nil
2795 }
2796
2797 // goMerge takes existing and update byte slices that are expected to
2798 // be marshaled roachpb.Values and merges the two values returning a
2799 // marshaled roachpb.Value or an error.
2800 func goMerge(existing, update []byte) ([]byte, error) {
2801 var result C.DBString
2802 status := C.DBMergeOne(goToCSlice(existing), goToCSlice(update), &result)
2803 if status.data != nil {
2804 return nil, errors.Errorf("%s: existing=%q, update=%q",
2805 cStringToGoString(status), existing, update)
2806 }
2807 return cStringToGoBytes(result), nil
2808 }
2809
2810 // goPartialMerge takes existing and update byte slices that are expected to
2811 // be marshaled roachpb.Values and performs a partial merge using C++ code,
2812 // returning a marshaled roachpb.Value or an error.
2813 func goPartialMerge(existing, update []byte) ([]byte, error) { 2814 var result C.DBString 2815 status := C.DBPartialMergeOne(goToCSlice(existing), goToCSlice(update), &result) 2816 if status.data != nil { 2817 return nil, errors.Errorf("%s: existing=%q, update=%q", 2818 cStringToGoString(status), existing, update) 2819 } 2820 return cStringToGoBytes(result), nil 2821 } 2822 2823 func emptyKeyError() error { 2824 return errors.Errorf("attempted access to empty key") 2825 } 2826 2827 func dbPut(rdb *C.DBEngine, key MVCCKey, value []byte) error { 2828 if len(key.Key) == 0 { 2829 return emptyKeyError() 2830 } 2831 2832 // *Put, *Get, and *Delete call memcpy() (by way of MemTable::Add) 2833 // when called, so we do not need to worry about these byte slices 2834 // being reclaimed by the GC. 2835 return statusToError(C.DBPut(rdb, goToCKey(key), goToCSlice(value))) 2836 } 2837 2838 func dbMerge(rdb *C.DBEngine, key MVCCKey, value []byte) error { 2839 if len(key.Key) == 0 { 2840 return emptyKeyError() 2841 } 2842 2843 // DBMerge calls memcpy() (by way of MemTable::Add) 2844 // when called, so we do not need to worry about these byte slices being 2845 // reclaimed by the GC. 2846 return statusToError(C.DBMerge(rdb, goToCKey(key), goToCSlice(value))) 2847 } 2848 2849 func dbApplyBatchRepr(rdb *C.DBEngine, repr []byte, sync bool) error { 2850 return statusToError(C.DBApplyBatchRepr(rdb, goToCSlice(repr), C.bool(sync))) 2851 } 2852 2853 // dbGet returns the value for the given key. 2854 func dbGet(rdb *C.DBEngine, key MVCCKey) ([]byte, error) { 2855 if len(key.Key) == 0 { 2856 return nil, emptyKeyError() 2857 } 2858 var result C.DBString 2859 err := statusToError(C.DBGet(rdb, goToCKey(key), &result)) 2860 if err != nil { 2861 return nil, err 2862 } 2863 return cStringToGoBytes(result), nil 2864 } 2865 2866 func dbGetProto( 2867 rdb *C.DBEngine, key MVCCKey, msg protoutil.Message, 2868 ) (ok bool, keyBytes, valBytes int64, err error) { 2869 if len(key.Key) == 0 { 2870 err = emptyKeyError() 2871 return 2872 } 2873 var result C.DBString 2874 if err = statusToError(C.DBGet(rdb, goToCKey(key), &result)); err != nil { 2875 return 2876 } 2877 if result.len == 0 { 2878 msg.Reset() 2879 return 2880 } 2881 ok = true 2882 if msg != nil { 2883 // Make a byte slice that is backed by result.data. This slice 2884 // cannot live past the lifetime of this method, but we're only 2885 // using it to unmarshal the roachpb. 2886 data := cSliceToUnsafeGoBytes(C.DBSlice{data: result.data, len: result.len}) 2887 err = protoutil.Unmarshal(data, msg) 2888 } 2889 C.free(unsafe.Pointer(result.data)) 2890 keyBytes = int64(key.EncodedSize()) 2891 valBytes = int64(result.len) 2892 return 2893 } 2894 2895 func dbClear(rdb *C.DBEngine, key MVCCKey) error { 2896 if len(key.Key) == 0 { 2897 return emptyKeyError() 2898 } 2899 return statusToError(C.DBDelete(rdb, goToCKey(key))) 2900 } 2901 2902 func dbSingleClear(rdb *C.DBEngine, key MVCCKey) error { 2903 if len(key.Key) == 0 { 2904 return emptyKeyError() 2905 } 2906 return statusToError(C.DBSingleDelete(rdb, goToCKey(key))) 2907 } 2908 2909 func dbClearRange(rdb *C.DBEngine, start, end MVCCKey) error { 2910 if err := statusToError(C.DBDeleteRange(rdb, goToCKey(start), goToCKey(end))); err != nil { 2911 return err 2912 } 2913 // This is a serious hack. RocksDB generates sstables which cover an 2914 // excessively large amount of the key space when range tombstones are 2915 // present. 
The crux of the problem is that the logic for determining sstable
2916 // boundaries depends on actual keys being present. So we help that logic
2917 // along by adding deletions of the first key covered by the range tombstone,
2918 // and a key near the end of the range (computing the exact previous key is difficult). See
2919 // TestRocksDBDeleteRangeCompaction which verifies that either this hack is
2920 // working, or the upstream problem was fixed in RocksDB.
2921 if err := dbClear(rdb, start); err != nil {
2922 return err
2923 }
2924 prev := make(roachpb.Key, len(end.Key))
2925 copy(prev, end.Key)
2926 if n := len(prev) - 1; prev[n] > 0 {
2927 prev[n]--
2928 } else {
2929 prev = prev[:n]
2930 }
2931 if start.Key.Compare(prev) < 0 {
2932 if err := dbClear(rdb, MakeMVCCMetadataKey(prev)); err != nil {
2933 return err
2934 }
2935 }
2936 return nil
2937 }
2938
2939 func dbClearIterRange(rdb *C.DBEngine, iter Iterator, start, end roachpb.Key) error {
2940 getter, ok := iter.(dbIteratorGetter)
2941 if !ok {
2942 return errors.Errorf("%T is not a RocksDB iterator", iter)
2943 }
2944 return statusToError(C.DBDeleteIterRange(rdb, getter.getIter(),
2945 goToCKey(MakeMVCCMetadataKey(start)), goToCKey(MakeMVCCMetadataKey(end))))
2946 }
2947
2948 // TODO(dan): Rename this to RocksDBSSTFileReader and RocksDBSSTFileWriter.
2949
2950 // RocksDBSstFileReader allows iteration over a number of non-overlapping
2951 // sstables exported by `RocksDBSstFileWriter`.
2952 type RocksDBSstFileReader struct {
2953 rocksDB *RocksDB
2954 filenameCounter int
2955 }
2956
2957 // MakeRocksDBSstFileReader creates a RocksDBSstFileReader backed by an
2958 // in-memory RocksDB instance.
2959 func MakeRocksDBSstFileReader() RocksDBSstFileReader {
2960 // cacheSize was selected because it's used for almost all other newRocksDBInMem
2961 // calls. It has seemed to work well so far, but there's probably more tuning
2962 // to be done here.
2963 const cacheSize = 1 << 20
2964 return RocksDBSstFileReader{rocksDB: newRocksDBInMem(roachpb.Attributes{}, cacheSize)}
2965 }
2966
2967 // IngestExternalFile links a file with the given contents into a database. See
2968 // the RocksDB documentation on `IngestExternalFile` for the various
2969 // restrictions on what can be added.
2970 func (fr *RocksDBSstFileReader) IngestExternalFile(data []byte) error {
2971 if fr.rocksDB == nil {
2972 return errors.New("cannot call IngestExternalFile on a closed reader")
2973 }
2974
2975 filename := fmt.Sprintf("ingest-%d", fr.filenameCounter)
2976 fr.filenameCounter++
2977 if err := fr.rocksDB.WriteFile(filename, data); err != nil {
2978 return err
2979 }
2980
2981 cPaths := make([]*C.char, 1)
2982 cPaths[0] = C.CString(filename)
2983 cPathLen := C.size_t(len(cPaths))
2984 defer C.free(unsafe.Pointer(cPaths[0]))
2985
2986 const noMove = false
2987 return statusToError(C.DBIngestExternalFiles(fr.rocksDB.rdb, &cPaths[0], cPathLen, noMove))
2988 }
2989
2990 // Iterate iterates over the keys between start inclusive and end
2991 // exclusive, invoking f() on each key/value pair.
2992 func (fr *RocksDBSstFileReader) Iterate(
2993 start, end roachpb.Key, f func(MVCCKeyValue) (bool, error),
2994 ) error {
2995 if fr.rocksDB == nil {
2996 return errors.New("cannot call Iterate on a closed reader")
2997 }
2998 return fr.rocksDB.Iterate(start, end, f)
2999 }
3000
3001 // NewIterator returns an iterator over this sst reader.
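// A minimal usage sketch, mirroring CheckForKeyCollisions above (data, start,
// and end are placeholders for the caller's SST bytes and key bounds):
//
//	sst := MakeRocksDBSstFileReader()
//	defer sst.Close()
//	if err := sst.IngestExternalFile(data); err != nil { /* handle err */ }
//	it := sst.NewIterator(IterOptions{UpperBound: end})
//	defer it.Close()
//	it.SeekGE(MakeMVCCMetadataKey(start))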
3002 func (fr *RocksDBSstFileReader) NewIterator(opts IterOptions) Iterator { 3003 return newRocksDBIterator(fr.rocksDB.rdb, opts, fr.rocksDB, fr.rocksDB) 3004 } 3005 3006 // Close finishes the reader. 3007 func (fr *RocksDBSstFileReader) Close() { 3008 if fr.rocksDB == nil { 3009 return 3010 } 3011 fr.rocksDB.Close() 3012 fr.rocksDB = nil 3013 } 3014 3015 // RocksDBSstFileWriter creates a file suitable for importing with 3016 // RocksDBSstFileReader. It implements the Writer interface. 3017 type RocksDBSstFileWriter struct { 3018 fw *C.DBSstFileWriter 3019 // dataSize tracks the total key and value bytes added so far. 3020 dataSize int64 3021 } 3022 3023 var _ Writer = &RocksDBSstFileWriter{} 3024 3025 // MakeRocksDBSstFileWriter creates a new RocksDBSstFileWriter with the default 3026 // configuration. 3027 // 3028 // NOTE: This is deprecated - and should only be used in tests to check for 3029 // equivalence with engine.SSTWriter. 3030 // 3031 // TODO(itsbilal): Move all tests to SSTWriter and then delete this function 3032 // and struct. 3033 func MakeRocksDBSstFileWriter() (RocksDBSstFileWriter, error) { 3034 fw := C.DBSstFileWriterNew() 3035 err := statusToError(C.DBSstFileWriterOpen(fw)) 3036 return RocksDBSstFileWriter{fw: fw}, err 3037 } 3038 3039 // ApplyBatchRepr implements the Writer interface. 3040 func (fw *RocksDBSstFileWriter) ApplyBatchRepr(repr []byte, sync bool) error { 3041 panic("unimplemented") 3042 } 3043 3044 // Clear implements the Writer interface. Note that it inserts a tombstone 3045 // rather than actually remove the entry from the storage engine. An error is 3046 // returned if it is not greater than any previous key used in Put or Clear 3047 // (according to the comparator configured during writer creation). Close 3048 // cannot have been called. 3049 func (fw *RocksDBSstFileWriter) Clear(key MVCCKey) error { 3050 if fw.fw == nil { 3051 return errors.New("cannot call Clear on a closed writer") 3052 } 3053 fw.dataSize += int64(len(key.Key)) 3054 return statusToError(C.DBSstFileWriterDelete(fw.fw, goToCKey(key))) 3055 } 3056 3057 // DataSize returns the total key and value bytes added so far. 3058 func (fw *RocksDBSstFileWriter) DataSize() int64 { 3059 return fw.dataSize 3060 } 3061 3062 // SingleClear implements the Writer interface. 3063 func (fw *RocksDBSstFileWriter) SingleClear(key MVCCKey) error { 3064 panic("unimplemented") 3065 } 3066 3067 // ClearRange implements the Writer interface. Note that it inserts a range deletion 3068 // tombstone rather than actually remove the entries from the storage engine. 3069 // It can be called at any time with respect to Put and Clear. 3070 func (fw *RocksDBSstFileWriter) ClearRange(start, end MVCCKey) error { 3071 if fw.fw == nil { 3072 return errors.New("cannot call ClearRange on a closed writer") 3073 } 3074 fw.dataSize += int64(len(start.Key)) + int64(len(end.Key)) 3075 return statusToError(C.DBSstFileWriterDeleteRange(fw.fw, goToCKey(start), goToCKey(end))) 3076 } 3077 3078 // ClearIterRange implements the Writer interface. 3079 // 3080 // NOTE: This method is fairly expensive as it performs a Cgo call for every 3081 // key deleted. 
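// Each visited key turns into a separate DBSstFileWriterDelete call (via
// fw.Clear), which is why large spans are better served by ClearRange, which
// writes a single range tombstone.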
3082 func (fw *RocksDBSstFileWriter) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 3083 if fw.fw == nil { 3084 return errors.New("cannot call ClearIterRange on a closed writer") 3085 } 3086 mvccEndKey := MakeMVCCMetadataKey(end) 3087 iter.SeekGE(MakeMVCCMetadataKey(start)) 3088 for { 3089 valid, err := iter.Valid() 3090 if err != nil { 3091 return err 3092 } 3093 if !valid || !iter.Key().Less(mvccEndKey) { 3094 break 3095 } 3096 if err := fw.Clear(iter.Key()); err != nil { 3097 return err 3098 } 3099 iter.Next() 3100 } 3101 return nil 3102 } 3103 3104 // Merge implements the Writer interface. 3105 func (fw *RocksDBSstFileWriter) Merge(key MVCCKey, value []byte) error { 3106 panic("unimplemented") 3107 } 3108 3109 // Put implements the Writer interface. It puts a kv entry into the sstable 3110 // being built. An error is returned if it is not greater than any previous key 3111 // used in Put or Clear (according to the comparator configured during writer 3112 // creation). Close cannot have been called. 3113 func (fw *RocksDBSstFileWriter) Put(key MVCCKey, value []byte) error { 3114 if fw.fw == nil { 3115 return errors.New("cannot call Put on a closed writer") 3116 } 3117 fw.dataSize += int64(len(key.Key)) + int64(len(value)) 3118 return statusToError(C.DBSstFileWriterAdd(fw.fw, goToCKey(key), goToCSlice(value))) 3119 } 3120 3121 // LogData implements the Writer interface. 3122 func (fw *RocksDBSstFileWriter) LogData(data []byte) error { 3123 panic("unimplemented") 3124 } 3125 3126 // LogLogicalOp implements the Writer interface. 3127 func (fw *RocksDBSstFileWriter) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 3128 // No-op. Logical logging disabled. 3129 } 3130 3131 // Truncate truncates the writer's current memory buffer and returns the 3132 // contents it contained. May be called multiple times. The function may not 3133 // truncate and return all keys if the underlying RocksDB blocks have not been 3134 // flushed. Close cannot have been called. 3135 func (fw *RocksDBSstFileWriter) Truncate() ([]byte, error) { 3136 if fw.fw == nil { 3137 return nil, errors.New("cannot call Truncate on a closed writer") 3138 } 3139 var contents C.DBString 3140 if err := statusToError(C.DBSstFileWriterTruncate(fw.fw, &contents)); err != nil { 3141 return nil, err 3142 } 3143 return cStringToGoBytes(contents), nil 3144 } 3145 3146 // Finish finalizes the writer and returns the constructed file's contents. At 3147 // least one kv entry must have been added. 3148 func (fw *RocksDBSstFileWriter) Finish() ([]byte, error) { 3149 if fw.fw == nil { 3150 return nil, errors.New("cannot call Finish on a closed writer") 3151 } 3152 var contents C.DBString 3153 if err := statusToError(C.DBSstFileWriterFinish(fw.fw, &contents)); err != nil { 3154 return nil, err 3155 } 3156 return cStringToGoBytes(contents), nil 3157 } 3158 3159 // Close finishes and frees memory and other resources. Close is idempotent. 3160 func (fw *RocksDBSstFileWriter) Close() { 3161 if fw.fw == nil { 3162 return 3163 } 3164 C.DBSstFileWriterClose(fw.fw) 3165 fw.fw = nil 3166 } 3167 3168 // RunLDB runs RocksDB's ldb command-line tool. The passed 3169 // command-line arguments should not include argv[0]. 3170 func RunLDB(args []string) { 3171 // Prepend "ldb" as argv[0]. 3172 args = append([]string{"ldb"}, args...) 
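// Convert the arguments to NUL-terminated C strings for DBRunLDB, freeing
// them once the call returns.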
3173 argv := make([]*C.char, len(args)) 3174 for i := range args { 3175 argv[i] = C.CString(args[i]) 3176 } 3177 defer func() { 3178 for i := range argv { 3179 C.free(unsafe.Pointer(argv[i])) 3180 } 3181 }() 3182 3183 C.DBRunLDB(C.int(len(argv)), &argv[0]) 3184 } 3185 3186 // RunSSTDump runs RocksDB's sst_dump command-line tool. The passed 3187 // command-line arguments should not include argv[0]. 3188 func RunSSTDump(args []string) { 3189 // Prepend "sst_dump" as argv[0]. 3190 args = append([]string{"sst_dump"}, args...) 3191 argv := make([]*C.char, len(args)) 3192 for i := range args { 3193 argv[i] = C.CString(args[i]) 3194 } 3195 defer func() { 3196 for i := range argv { 3197 C.free(unsafe.Pointer(argv[i])) 3198 } 3199 }() 3200 3201 C.DBRunSSTDump(C.int(len(argv)), &argv[0]) 3202 } 3203 3204 // GetAuxiliaryDir returns the auxiliary storage path for this engine. 3205 func (r *RocksDB) GetAuxiliaryDir() string { 3206 return r.auxDir 3207 } 3208 3209 func (r *RocksDB) setAuxiliaryDir(d string) error { 3210 if !r.cfg.ReadOnly { 3211 if err := os.MkdirAll(d, 0755); err != nil { 3212 return err 3213 } 3214 } 3215 r.auxDir = d 3216 return nil 3217 } 3218 3219 // PreIngestDelay implements the Engine interface. 3220 func (r *RocksDB) PreIngestDelay(ctx context.Context) { 3221 preIngestDelay(ctx, r, r.cfg.Settings) 3222 } 3223 3224 // IngestExternalFiles atomically links a slice of files into the RocksDB 3225 // log-structured merge-tree. 3226 func (r *RocksDB) IngestExternalFiles(ctx context.Context, paths []string) error { 3227 cPaths := make([]*C.char, len(paths)) 3228 for i := range paths { 3229 cPaths[i] = C.CString(paths[i]) 3230 } 3231 defer func() { 3232 for i := range cPaths { 3233 C.free(unsafe.Pointer(cPaths[i])) 3234 } 3235 }() 3236 3237 return statusToError(C.DBIngestExternalFiles( 3238 r.rdb, 3239 &cPaths[0], 3240 C.size_t(len(cPaths)), 3241 C._Bool(true), // move_files 3242 )) 3243 } 3244 3245 // InMem returns true if the receiver is an in-memory engine and false 3246 // otherwise. 3247 func (r *RocksDB) InMem() bool { 3248 return r.cfg.Dir == "" 3249 } 3250 3251 // ReadFile reads the content from a file with the given filename. The file 3252 // must have been opened through Engine.OpenFile. Otherwise an error will be 3253 // returned. 3254 func (r *RocksDB) ReadFile(filename string) ([]byte, error) { 3255 var data C.DBSlice 3256 if err := statusToError(C.DBEnvReadFile(r.rdb, goToCSlice([]byte(filename)), &data)); err != nil { 3257 return nil, notFoundErrOrDefault(err) 3258 } 3259 defer C.free(unsafe.Pointer(data.data)) 3260 return cSliceToGoBytes(data), nil 3261 } 3262 3263 // WriteFile writes data to a file in this RocksDB's env. 3264 func (r *RocksDB) WriteFile(filename string, data []byte) error { 3265 return statusToError(C.DBEnvWriteFile(r.rdb, goToCSlice([]byte(filename)), goToCSlice(data))) 3266 } 3267 3268 // Remove deletes the file with the given filename from this RocksDB's env. 3269 // If the file with given filename doesn't exist, return os.ErrNotExist. 3270 func (r *RocksDB) Remove(filename string) error { 3271 if err := statusToError(C.DBEnvDeleteFile(r.rdb, goToCSlice([]byte(filename)))); err != nil { 3272 return notFoundErrOrDefault(err) 3273 } 3274 return nil 3275 } 3276 3277 // RemoveAll removes path and any children it contains from this RocksDB's 3278 // env. If the path does not exist, RemoveAll returns nil (no error). 
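// Since the RocksDB Env interface cannot distinguish files from directories,
// the implementation first tries to list and recursively remove children,
// then falls back to removing the path as a regular file when listing fails.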
3279 func (r *RocksDB) RemoveAll(path string) error {
3280 // We don't have a reliable way of telling whether a path is a directory
3281 // or a file from the RocksDB Env interface. Assume it's a directory,
3282 // ignoring any resulting error, and delete any of its children.
3283 dirents, listErr := r.List(path)
3284 if listErr == nil {
3285 for _, dirent := range dirents {
3286 err := r.RemoveAll(filepath.Join(path, dirent))
3287 if err != nil {
3288 return err
3289 }
3290 }
3291
3292 // Path should exist, point to a directory and have no children.
3293 return r.RemoveDir(path)
3294 }
3295
3296 // Path might be a file, non-existent, or a directory for which List
3297 // errored for some other reason.
3298 err := r.Remove(path)
3299 if err == nil {
3300 return nil
3301 }
3302 if os.IsNotExist(err) && os.IsNotExist(listErr) {
3303 return nil
3304 }
3305 return listErr
3306 }
3307
3308 // Link creates 'newname' as a hard link to 'oldname'. This uses the Env
3309 // responsible for the file, which may handle extra logic (e.g. copying encryption
3310 // settings for EncryptedEnv).
3311 func (r *RocksDB) Link(oldname, newname string) error {
3312 if err := statusToError(C.DBEnvLinkFile(r.rdb, goToCSlice([]byte(oldname)), goToCSlice([]byte(newname)))); err != nil {
3313 return &os.LinkError{
3314 Op: "link",
3315 Old: oldname,
3316 New: newname,
3317 Err: err,
3318 }
3319 }
3320 return nil
3321 }
3322
3323 // IsValidSplitKey returns whether the key is a valid split key. Certain key
3324 // ranges cannot be split (the meta1 span and the system DB span); split keys
3325 // chosen within any of these ranges are considered invalid. A split key
3326 // equal to Meta2KeyMax (\x03\xff\xff) is also considered invalid.
3327 func IsValidSplitKey(key roachpb.Key) bool {
3328 return bool(C.MVCCIsValidSplitKey(goToCSlice(key)))
3329 }
3330
3331 // lockFile sets a lock on the specified file using RocksDB's file locking interface.
3332 func lockFile(filename string) (C.DBFileLock, error) {
3333 var lock C.DBFileLock
3334 // C.DBLockFile mutates its argument. `lock, statusToError(...)`
3335 // happens to work in gc, but does not work in gccgo.
3336 //
3337 // See https://github.com/golang/go/issues/23188.
3338 err := statusToError(C.DBLockFile(goToCSlice([]byte(filename)), &lock))
3339 return lock, err
3340 }
3341
3342 // unlockFile unlocks the file associated with the specified lock and GCs any allocated memory for the lock.
3343 func unlockFile(lock C.DBFileLock) error {
3344 return statusToError(C.DBUnlockFile(lock))
3345 }
3346
3347 // MVCCScanDecodeKeyValue decodes a key/value pair returned in an MVCCScan
3348 // "batch" (this is not the RocksDB batch repr format), returning both the
3349 // key/value and the suffix of data remaining in the batch.
3350 func MVCCScanDecodeKeyValue(repr []byte) (key MVCCKey, value []byte, orepr []byte, err error) {
3351 k, ts, value, orepr, err := enginepb.ScanDecodeKeyValue(repr)
3352 return MVCCKey{k, ts}, value, orepr, err
3353 }
3354
3355 // MVCCScanDecodeKeyValues decodes all key/value pairs returned in one or more
3356 // MVCCScan "batches" (this is not the RocksDB batch repr format). The provided
3357 // function is called for each key/value pair.
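// For example, each of the KVData buffers returned by rocksDBIterator.MVCCScan
// can be passed here to visit every decoded key/value pair it contains.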
3358 func MVCCScanDecodeKeyValues(repr [][]byte, fn func(key MVCCKey, rawBytes []byte) error) error { 3359 var k MVCCKey 3360 var rawBytes []byte 3361 var err error 3362 for _, data := range repr { 3363 for len(data) > 0 { 3364 k, rawBytes, data, err = MVCCScanDecodeKeyValue(data) 3365 if err != nil { 3366 return err 3367 } 3368 if err = fn(k, rawBytes); err != nil { 3369 return err 3370 } 3371 } 3372 } 3373 return nil 3374 } 3375 3376 func notFoundErrOrDefault(err error) error { 3377 errStr := err.Error() 3378 if strings.Contains(errStr, "No such") || 3379 strings.Contains(errStr, "not found") || 3380 strings.Contains(errStr, "does not exist") || 3381 strings.Contains(errStr, "NotFound:") || 3382 strings.Contains(errStr, "cannot find") { 3383 return os.ErrNotExist 3384 } 3385 return err 3386 } 3387 3388 // rocksdbWritableFile implements the File interface. It is used to interact with the 3389 // DBWritableFile in the corresponding RocksDB env. 3390 type rocksdbWritableFile struct { 3391 file C.DBWritableFile 3392 rdb *C.DBEngine 3393 } 3394 3395 var _ fs.File = &rocksdbWritableFile{} 3396 3397 // Write implements the File interface. 3398 func (f *rocksdbWritableFile) Write(data []byte) (int, error) { 3399 err := statusToError(C.DBEnvAppendFile(f.rdb, f.file, goToCSlice(data))) 3400 return len(data), err 3401 } 3402 3403 // Close implements the File interface. 3404 func (f *rocksdbWritableFile) Close() error { 3405 return statusToError(C.DBEnvCloseFile(f.rdb, f.file)) 3406 } 3407 3408 // Sync implements the File interface. 3409 func (f *rocksdbWritableFile) Sync() error { 3410 return statusToError(C.DBEnvSyncFile(f.rdb, f.file)) 3411 } 3412 3413 // Read implements the File interface. 3414 func (f *rocksdbWritableFile) Read(p []byte) (n int, err error) { 3415 return 0, fmt.Errorf("cannot read file opened for writing") 3416 } 3417 3418 // ReadAt implements the File interface. 3419 func (f *rocksdbWritableFile) ReadAt(p []byte, off int64) (n int, err error) { 3420 return 0, fmt.Errorf("cannot read file opened for writing") 3421 } 3422 3423 // rocksdbReadableFile implements the File interface. It is used to interact with the 3424 // DBReadableFile in the corresponding RocksDB env. 3425 type rocksdbReadableFile struct { 3426 file C.DBReadableFile 3427 rdb *C.DBEngine 3428 offset int64 3429 } 3430 3431 var _ fs.File = &rocksdbReadableFile{} 3432 3433 // Write implements the File interface. 3434 func (f *rocksdbReadableFile) Write(data []byte) (int, error) { 3435 return 0, fmt.Errorf("cannot write file opened for reading") 3436 } 3437 3438 // Close implements the File interface. 3439 func (f *rocksdbReadableFile) Close() error { 3440 return statusToError(C.DBEnvCloseReadableFile(f.rdb, f.file)) 3441 } 3442 3443 // Sync implements the File interface. 3444 func (f *rocksdbReadableFile) Sync() error { 3445 return fmt.Errorf("cannot sync file opened for reading") 3446 } 3447 3448 // Read implements the File interface. 3449 func (f *rocksdbReadableFile) Read(p []byte) (n int, err error) { 3450 n, err = f.ReadAt(p, f.offset) 3451 f.offset += int64(n) 3452 return 3453 } 3454 3455 // ReadAt implements the File interface. 
3456 func (f *rocksdbReadableFile) ReadAt(p []byte, off int64) (int, error) { 3457 var n C.int 3458 err := statusToError(C.DBEnvReadAtFile(f.rdb, f.file, goToCSlice(p), C.int64_t(off), &n)) 3459 return int(n), err 3460 } 3461 3462 type rocksdbDirectory struct { 3463 file C.DBDirectory 3464 rdb *C.DBEngine 3465 } 3466 3467 var _ fs.File = &rocksdbDirectory{} 3468 3469 // Write implements the File interface. 3470 func (f *rocksdbDirectory) Write(data []byte) (int, error) { 3471 return 0, fmt.Errorf("cannot write to directory") 3472 } 3473 3474 // Close implements the File interface. 3475 func (f *rocksdbDirectory) Close() error { 3476 return statusToError(C.DBEnvCloseDirectory(f.rdb, f.file)) 3477 } 3478 3479 // Sync implements the File interface. 3480 func (f *rocksdbDirectory) Sync() error { 3481 return statusToError(C.DBEnvSyncDirectory(f.rdb, f.file)) 3482 } 3483 3484 // Read implements the File interface. 3485 func (f *rocksdbDirectory) Read(p []byte) (n int, err error) { 3486 return 0, fmt.Errorf("cannot read directory") 3487 } 3488 3489 // ReadAt implements the File interface. 3490 func (f *rocksdbDirectory) ReadAt(p []byte, off int64) (n int, err error) { 3491 return 0, fmt.Errorf("cannot read directory") 3492 } 3493 3494 var _ fs.FS = &RocksDB{} 3495 3496 // Create implements the FS interface. 3497 func (r *RocksDB) Create(name string) (fs.File, error) { 3498 return r.CreateWithSync(name, 0) 3499 } 3500 3501 // CreateWithSync implements the FS interface. 3502 func (r *RocksDB) CreateWithSync(name string, bytesPerSync int) (fs.File, error) { 3503 var file C.DBWritableFile 3504 if err := statusToError(C.DBEnvOpenFile( 3505 r.rdb, goToCSlice([]byte(name)), C.uint64_t(bytesPerSync), &file)); err != nil { 3506 return nil, notFoundErrOrDefault(err) 3507 } 3508 return &rocksdbWritableFile{file: file, rdb: r.rdb}, nil 3509 } 3510 3511 // Open implements the FS interface. 3512 func (r *RocksDB) Open(name string) (fs.File, error) { 3513 var file C.DBReadableFile 3514 if err := statusToError(C.DBEnvOpenReadableFile(r.rdb, goToCSlice([]byte(name)), &file)); err != nil { 3515 return nil, notFoundErrOrDefault(err) 3516 } 3517 return &rocksdbReadableFile{file: file, rdb: r.rdb}, nil 3518 } 3519 3520 // OpenDir implements the FS interface. 3521 func (r *RocksDB) OpenDir(name string) (fs.File, error) { 3522 var file C.DBDirectory 3523 if err := statusToError(C.DBEnvOpenDirectory(r.rdb, goToCSlice([]byte(name)), &file)); err != nil { 3524 return nil, notFoundErrOrDefault(err) 3525 } 3526 return &rocksdbDirectory{file: file, rdb: r.rdb}, nil 3527 } 3528 3529 // Rename implements the FS interface. 3530 func (r *RocksDB) Rename(oldname, newname string) error { 3531 return statusToError(C.DBEnvRenameFile(r.rdb, goToCSlice([]byte(oldname)), goToCSlice([]byte(newname)))) 3532 } 3533 3534 // MkdirAll implements the FS interface. 3535 func (r *RocksDB) MkdirAll(path string) error { 3536 path = filepath.Clean(path) 3537 3538 // Skip trailing path separators. 3539 for len(path) > 0 && path[len(path)-1] == filepath.Separator { 3540 path = path[:len(path)-1] 3541 } 3542 // The path may be empty after cleaning and trimming tailing path 3543 // separators. 3544 if path == "" { 3545 return nil 3546 } 3547 3548 // Ensure the parent exists first. 3549 parent, _ := filepath.Split(path) 3550 if parent != "" { 3551 if err := r.MkdirAll(parent); err != nil { 3552 return err 3553 } 3554 } 3555 return statusToError(C.DBEnvCreateDir(r.rdb, goToCSlice([]byte(path)))) 3556 } 3557 3558 // RemoveDir implements the FS interface. 
3559 func (r *RocksDB) RemoveDir(name string) error { 3560 return statusToError(C.DBEnvDeleteDir(r.rdb, goToCSlice([]byte(name)))) 3561 } 3562 3563 // List implements the FS interface. 3564 func (r *RocksDB) List(name string) ([]string, error) { 3565 list := C.DBEnvListDir(r.rdb, goToCSlice([]byte(name))) 3566 n := list.n 3567 names := list.names 3568 // We can't index into names because it is a pointer, not a slice. The 3569 // hackery below treats the pointer as an array and then constructs 3570 // a slice from it. 3571 nameSize := unsafe.Sizeof(C.DBString{}) 3572 nameVal := func(i int) C.DBString { 3573 return *(*C.DBString)(unsafe.Pointer(uintptr(unsafe.Pointer(names)) + uintptr(i)*nameSize)) 3574 } 3575 err := statusToError(list.status) 3576 if err != nil { 3577 err = notFoundErrOrDefault(err) 3578 } 3579 3580 result := make([]string, n) 3581 j := 0 3582 for i := range result { 3583 str := cStringToGoString(nameVal(i)) 3584 if str == "." || str == ".." { 3585 continue 3586 } 3587 result[j] = str 3588 j++ 3589 } 3590 C.free(unsafe.Pointer(names)) 3591 3592 result = result[:j] 3593 sort.Strings(result) 3594 return result, err 3595 } 3596 3597 // ThreadStacks returns the stacks for all threads. The stacks are raw 3598 // addresses, and do not contain symbols. Use addr2line (or atos on Darwin) to 3599 // symbolize. 3600 func ThreadStacks() string { 3601 return cStringToGoString(C.DBDumpThreadStacks()) 3602 }