github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/rocksdb_test.go (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package storage
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"encoding/json"
    17  	"fmt"
    18  	"io/ioutil"
    19  	"math/rand"
    20  	"os"
    21  	"path/filepath"
    22  	"reflect"
    23  	"sort"
    24  	"strconv"
    25  	"testing"
    26  	"time"
    27  
    28  	"github.com/cockroachdb/cockroach/pkg/base"
    29  	"github.com/cockroachdb/cockroach/pkg/keys"
    30  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    31  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    32  	"github.com/cockroachdb/cockroach/pkg/testutils"
    33  	"github.com/cockroachdb/cockroach/pkg/util"
    34  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    35  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    36  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    37  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    38  	"github.com/cockroachdb/cockroach/pkg/util/log"
    39  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    40  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    41  )
    42  
// testCacheSize is 1 << 30, i.e. 1 GiB when read as a byte count. It is not
// referenced in the visible part of this file; presumably it sizes the cache
// of test engines created elsewhere -- confirm against its callers.
const testCacheSize = 1 << 30 // 1 GiB
    44  
    45  // TestBatchReadLaterWrite demonstrates that reading from a batch is not like
    46  // reading from a snapshot: writes that occur after opening the batch will be
    47  // visible to reads from the batch (whereas using a snapshot, they would not).
    48  func TestBatchReadLaterWrite(t *testing.T) {
    49  	defer leaktest.AfterTest(t)()
    50  
    51  	ctx := context.Background()
    52  	key := roachpb.Key("a")
    53  
    54  	eng := setupMVCCInMemRocksDB(t, "unused")
    55  	defer eng.Close()
    56  
    57  	batch := eng.NewBatch()
    58  	defer batch.Close()
    59  	snap := eng.NewSnapshot()
    60  	defer snap.Close()
    61  
    62  	v := roachpb.MakeValueFromString("foo")
    63  
    64  	if err := MVCCPut(ctx, eng, nil, key, hlc.Timestamp{}, v, nil); err != nil {
    65  		t.Fatal(err)
    66  	}
    67  
    68  	// Read from a batch that was opened before the value was written to the
    69  	// underlying engine. The batch will see the write.
    70  	{
    71  		rv, _, err := MVCCGet(ctx, batch, key, hlc.Timestamp{}, MVCCGetOptions{})
    72  		if err != nil {
    73  			t.Fatal(err)
    74  		}
    75  		if rv == nil {
    76  			t.Fatal("value not found")
    77  		}
    78  
    79  		if !rv.Equal(&v) {
    80  			t.Fatalf("values not equal: put %v, read %v", v, *rv)
    81  		}
    82  	}
    83  
    84  	// Read from a snapshot opened prior to the write. The snapshot won't see the
    85  	// write.
    86  	{
    87  		rv, _, err := MVCCGet(ctx, snap, key, hlc.Timestamp{}, MVCCGetOptions{})
    88  		if err != nil {
    89  			t.Fatal(err)
    90  		}
    91  		if rv != nil {
    92  			t.Fatalf("value unexpectedly found: %v", *rv)
    93  		}
    94  	}
    95  }
    96  
    97  func TestBatchIterReadOwnWrite(t *testing.T) {
    98  	defer leaktest.AfterTest(t)()
    99  
   100  	db := setupMVCCInMemRocksDB(t, "iter_read_own_write")
   101  	defer db.Close()
   102  
   103  	b := db.NewBatch()
   104  	defer b.Close()
   105  
   106  	k := MakeMVCCMetadataKey(testKey1)
   107  
   108  	before := b.NewIterator(IterOptions{UpperBound: roachpb.KeyMax})
   109  	defer before.Close()
   110  
   111  	nonBatchBefore := db.NewIterator(IterOptions{UpperBound: roachpb.KeyMax})
   112  	defer nonBatchBefore.Close()
   113  
   114  	if err := b.Put(k, []byte("abc")); err != nil {
   115  		t.Fatal(err)
   116  	}
   117  
   118  	// We use a prefix iterator for after in order to workaround the restriction
   119  	// on concurrent use of more than 1 prefix or normal (non-prefix) iterator on
   120  	// a batch.
   121  	after := b.NewIterator(IterOptions{Prefix: true})
   122  	defer after.Close()
   123  
   124  	after.SeekGE(k)
   125  	if ok, err := after.Valid(); !ok {
   126  		t.Fatalf("write missing on batch iter created after write, err=%v", err)
   127  	}
   128  	before.SeekGE(k)
   129  	if ok, err := before.Valid(); !ok {
   130  		t.Fatalf("write missing on batch iter created before write, err=%v", err)
   131  	}
   132  	nonBatchBefore.SeekGE(k)
   133  	if ok, err := nonBatchBefore.Valid(); err != nil {
   134  		t.Fatal(err)
   135  	} else if ok {
   136  		t.Fatal("uncommitted write seen by non-batch iter")
   137  	}
   138  
   139  	if err := b.Commit(false /* sync */); err != nil {
   140  		t.Fatal(err)
   141  	}
   142  
   143  	nonBatchAfter := db.NewIterator(IterOptions{UpperBound: roachpb.KeyMax})
   144  	defer nonBatchAfter.Close()
   145  
   146  	nonBatchBefore.SeekGE(k)
   147  	if ok, err := nonBatchBefore.Valid(); err != nil {
   148  		t.Fatal(err)
   149  	} else if ok {
   150  		t.Fatal("committed write seen by non-batch iter created before commit")
   151  	}
   152  	nonBatchAfter.SeekGE(k)
   153  	if ok, err := nonBatchAfter.Valid(); !ok {
   154  		t.Fatalf("committed write missing by non-batch iter created after commit, err=%v", err)
   155  	}
   156  
   157  	// `Commit` frees the batch, so iterators backed by it should panic.
   158  	func() {
   159  		defer func() {
   160  			if err, expected := recover(), "iterator used after backing engine closed"; err != expected {
   161  				t.Fatalf("Unexpected panic: expected %q, got %q", expected, err)
   162  			}
   163  		}()
   164  		after.SeekGE(k)
   165  		t.Fatalf(`Seek on batch-backed iter after batched closed should panic.
   166  			iter.engine: %T, iter.engine.Closed: %v, batch.Closed %v`,
   167  			after.(*rocksDBIterator).reader,
   168  			after.(*rocksDBIterator).reader.Closed(),
   169  			b.Closed(),
   170  		)
   171  	}()
   172  }
   173  
   174  func TestBatchPrefixIter(t *testing.T) {
   175  	defer leaktest.AfterTest(t)()
   176  
   177  	db := setupMVCCInMemRocksDB(t, "iter_read_own_write")
   178  	defer db.Close()
   179  
   180  	b := db.NewBatch()
   181  	defer b.Close()
   182  
   183  	// Set up a batch with: delete("a"), put("b"). We'll then prefix seek for "b"
   184  	// which should succeed and then prefix seek for "a" which should fail. Note
   185  	// that order of operations is important here to stress the C++ code paths.
   186  	if err := b.Clear(mvccKey("a")); err != nil {
   187  		t.Fatal(err)
   188  	}
   189  	if err := b.Put(mvccKey("b"), []byte("b")); err != nil {
   190  		t.Fatal(err)
   191  	}
   192  
   193  	iter := b.NewIterator(IterOptions{Prefix: true})
   194  	defer iter.Close()
   195  
   196  	iter.SeekGE(mvccKey("b"))
   197  	if ok, err := iter.Valid(); !ok {
   198  		t.Fatalf("expected to find \"b\", err=%v", err)
   199  	}
   200  	iter.SeekGE(mvccKey("a"))
   201  	if ok, err := iter.Valid(); err != nil {
   202  		t.Fatal(err)
   203  	} else if ok {
   204  		t.Fatalf("expected to not find anything, found %s -> %q", iter.Key(), iter.Value())
   205  	}
   206  }
   207  
   208  func TestIterBounds(t *testing.T) {
   209  	defer leaktest.AfterTest(t)()
   210  
   211  	db := setupMVCCInMemRocksDB(t, "iter_bounds")
   212  	defer db.Close()
   213  
   214  	if err := db.Put(mvccKey("0"), []byte("val")); err != nil {
   215  		t.Fatal(err)
   216  	}
   217  	if err := db.Put(mvccKey("a"), []byte("val")); err != nil {
   218  		t.Fatal(err)
   219  	}
   220  	if err := db.Put(mvccKey("b"), []byte("val")); err != nil {
   221  		t.Fatal(err)
   222  	}
   223  
   224  	testCases := []struct {
   225  		name         string
   226  		createEngine func() Reader
   227  	}{
   228  		{"batch", func() Reader { return db.NewBatch() }},
   229  		{"readonly", func() Reader { return db.NewReadOnly() }},
   230  		{"snapshot", func() Reader { return db.NewSnapshot() }},
   231  		{"engine", func() Reader { return db }},
   232  	}
   233  	for _, tc := range testCases {
   234  		t.Run(tc.name, func(t *testing.T) {
   235  			e := tc.createEngine()
   236  			defer e.Close()
   237  
   238  			if _, ok := e.(*rocksDBBatch); !ok { // batches do not support reverse iteration
   239  				// Test that a new iterator's lower bound is applied.
   240  				func() {
   241  					iter := e.NewIterator(IterOptions{LowerBound: roachpb.Key("b")})
   242  					defer iter.Close()
   243  					iter.SeekLT(mvccKey("c"))
   244  					if ok, err := iter.Valid(); err != nil {
   245  						t.Fatal(err)
   246  					} else if !ok {
   247  						t.Fatalf("expected iterator to be valid, but was invalid")
   248  					}
   249  					iter.SeekLT(mvccKey("b"))
   250  					if ok, err := iter.Valid(); err != nil {
   251  						t.Fatal(err)
   252  					} else if ok {
   253  						t.Fatalf("expected iterator to be invalid, but was valid")
   254  					}
   255  					iter.SeekLT(mvccKey("a"))
   256  					if ok, err := iter.Valid(); err != nil {
   257  						t.Fatal(err)
   258  					} else if ok {
   259  						t.Fatalf("expected iterator to be invalid, but was valid")
   260  					}
   261  				}()
   262  
   263  				// Test that the cached iterator, if the underlying engine implementation
   264  				// caches iterators, can take on a new lower bound.
   265  				func() {
   266  					iter := e.NewIterator(IterOptions{LowerBound: roachpb.Key("a")})
   267  					defer iter.Close()
   268  
   269  					iter.SeekLT(mvccKey("b"))
   270  					if ok, err := iter.Valid(); !ok {
   271  						t.Fatal(err)
   272  					}
   273  					if !mvccKey("a").Equal(iter.Key()) {
   274  						t.Fatalf("expected key a, but got %q", iter.Key())
   275  					}
   276  					iter.Prev()
   277  					if ok, err := iter.Valid(); err != nil {
   278  						t.Fatal(err)
   279  					} else if ok {
   280  						t.Fatalf("expected iterator to be invalid, but was valid")
   281  					}
   282  				}()
   283  			}
   284  
   285  			// Test that a new iterator's upper bound is applied.
   286  			func() {
   287  				iter := e.NewIterator(IterOptions{UpperBound: roachpb.Key("a")})
   288  				defer iter.Close()
   289  				iter.SeekGE(mvccKey("a"))
   290  				if ok, err := iter.Valid(); err != nil {
   291  					t.Fatal(err)
   292  				} else if ok {
   293  					t.Fatalf("expected iterator to be invalid, but was valid")
   294  				}
   295  			}()
   296  
   297  			// Test that the cached iterator, if the underlying engine implementation
   298  			// caches iterators, can take on a new upper bound.
   299  			func() {
   300  				iter := e.NewIterator(IterOptions{UpperBound: roachpb.Key("b")})
   301  				defer iter.Close()
   302  
   303  				iter.SeekGE(mvccKey("a"))
   304  				if ok, err := iter.Valid(); !ok {
   305  					t.Fatal(err)
   306  				}
   307  				if !mvccKey("a").Equal(iter.Key()) {
   308  					t.Fatalf("expected key a, but got %q", iter.Key())
   309  				}
   310  				iter.Next()
   311  				if ok, err := iter.Valid(); err != nil {
   312  					t.Fatal(err)
   313  				} else if ok {
   314  					t.Fatalf("expected iterator to be invalid, but was valid")
   315  				}
   316  			}()
   317  
   318  			// Perform additional tests if the engine supports writes.
   319  			w, isReadWriter := e.(ReadWriter)
   320  			if _, isSecretlyReadOnly := e.(*rocksDBReadOnly); !isReadWriter || isSecretlyReadOnly {
   321  				return
   322  			}
   323  			if err := w.Put(mvccKey("c"), []byte("val")); err != nil {
   324  				t.Fatal(err)
   325  			}
   326  			func() {
   327  				iter := w.NewIterator(IterOptions{UpperBound: roachpb.Key("c")})
   328  				defer iter.Close()
   329  				iter.SeekGE(mvccKey("c"))
   330  				if ok, err := iter.Valid(); err != nil {
   331  					t.Fatal(err)
   332  				} else if ok {
   333  					t.Fatalf("expected iterator to be invalid, but was valid")
   334  				}
   335  			}()
   336  		})
   337  	}
   338  }
   339  
   340  func makeKey(i int) MVCCKey {
   341  	return MakeMVCCMetadataKey(roachpb.Key(strconv.Itoa(i)))
   342  }
   343  
   344  func benchmarkIterOnBatch(ctx context.Context, b *testing.B, writes int) {
   345  	engine := createTestRocksDBEngine()
   346  	defer engine.Close()
   347  
   348  	for i := 0; i < writes; i++ {
   349  		if err := engine.Put(makeKey(i), []byte(strconv.Itoa(i))); err != nil {
   350  			b.Fatal(err)
   351  		}
   352  	}
   353  
   354  	batch := engine.NewBatch()
   355  	defer batch.Close()
   356  
   357  	for i := 0; i < writes; i++ {
   358  		if err := batch.Clear(makeKey(i)); err != nil {
   359  			b.Fatal(err)
   360  		}
   361  	}
   362  
   363  	r := rand.New(rand.NewSource(5))
   364  
   365  	b.ResetTimer()
   366  	for i := 0; i < b.N; i++ {
   367  		key := makeKey(r.Intn(writes))
   368  		iter := batch.NewIterator(IterOptions{Prefix: true})
   369  		iter.SeekGE(key)
   370  		iter.Close()
   371  	}
   372  }
   373  
   374  func benchmarkIterOnReadWriter(
   375  	b *testing.B, writes int, f func(Engine) ReadWriter, closeReadWriter bool,
   376  ) {
   377  	engine := createTestRocksDBEngine()
   378  	defer engine.Close()
   379  
   380  	for i := 0; i < writes; i++ {
   381  		if err := engine.Put(makeKey(i), []byte(strconv.Itoa(i))); err != nil {
   382  			b.Fatal(err)
   383  		}
   384  	}
   385  
   386  	readWriter := f(engine)
   387  	if closeReadWriter {
   388  		defer readWriter.Close()
   389  	}
   390  
   391  	r := rand.New(rand.NewSource(5))
   392  
   393  	b.ResetTimer()
   394  	for i := 0; i < b.N; i++ {
   395  		key := makeKey(r.Intn(writes))
   396  		iter := readWriter.NewIterator(IterOptions{Prefix: true})
   397  		iter.SeekGE(key)
   398  		iter.Close()
   399  	}
   400  }
   401  
   402  // TestRocksDBOpenWithVersions verifies the version checking in Open()
   403  // functions correctly.
   404  func TestRocksDBOpenWithVersions(t *testing.T) {
   405  	defer leaktest.AfterTest(t)()
   406  
   407  	testCases := []struct {
   408  		hasFile     bool
   409  		ver         Version
   410  		expectedErr string
   411  	}{
   412  		{false, Version{}, ""},
   413  		{true, Version{versionCurrent}, ""},
   414  		{true, Version{versionMinimum}, ""},
   415  		{true, Version{-1}, "incompatible rocksdb data version, current:2, on disk:-1, minimum:0"},
   416  		{true, Version{3}, "incompatible rocksdb data version, current:2, on disk:3, minimum:0"},
   417  	}
   418  
   419  	for i, testCase := range testCases {
   420  		err := openRocksDBWithVersion(t, testCase.hasFile, testCase.ver)
   421  		if !testutils.IsError(err, testCase.expectedErr) {
   422  			t.Errorf("%d: expected error '%s', actual '%v'", i, testCase.expectedErr, err)
   423  		}
   424  	}
   425  }
   426  
   427  // openRocksDBWithVersion attempts to open a rocks db instance, optionally with
   428  // the supplied Version struct.
   429  func openRocksDBWithVersion(t *testing.T, hasVersionFile bool, ver Version) error {
   430  	dir, err := ioutil.TempDir("", "testing")
   431  	if err != nil {
   432  		t.Fatal(err)
   433  	}
   434  	defer func() {
   435  		if err := os.RemoveAll(dir); err != nil {
   436  			t.Fatal(err)
   437  		}
   438  	}()
   439  
   440  	if hasVersionFile {
   441  		b, err := json.Marshal(ver)
   442  		if err != nil {
   443  			t.Fatal(err)
   444  		}
   445  		if err := ioutil.WriteFile(getVersionFilename(dir), b, 0644); err != nil {
   446  			t.Fatal(err)
   447  		}
   448  	}
   449  
   450  	rocksdb, err := NewRocksDB(
   451  		RocksDBConfig{
   452  			StorageConfig: base.StorageConfig{
   453  				Settings: cluster.MakeTestingClusterSettings(),
   454  				Dir:      dir,
   455  			},
   456  		},
   457  		RocksDBCache{},
   458  	)
   459  	if err == nil {
   460  		rocksdb.Close()
   461  	}
   462  	return err
   463  }
   464  
   465  func TestRocksDBApproximateDiskBytes(t *testing.T) {
   466  	defer leaktest.AfterTest(t)()
   467  
   468  	dir, cleanup := testutils.TempDir(t)
   469  	defer cleanup()
   470  
   471  	rocksdb, err := NewRocksDB(
   472  		RocksDBConfig{
   473  			StorageConfig: base.StorageConfig{
   474  				Settings: cluster.MakeTestingClusterSettings(),
   475  				Dir:      dir,
   476  			},
   477  		},
   478  		RocksDBCache{},
   479  	)
   480  	if err != nil {
   481  		t.Fatal(err)
   482  	}
   483  	defer rocksdb.Close()
   484  
   485  	rnd, seed := randutil.NewPseudoRand()
   486  
   487  	log.Infof(context.Background(), "seed is %d", seed)
   488  
   489  	for i := 0; i < 10; i++ {
   490  		ts := hlc.Timestamp{WallTime: rnd.Int63()}
   491  		key := roachpb.Key(randutil.RandBytes(rnd, 1<<10))
   492  		key = append(key, []byte(fmt.Sprintf("#%d", i))...) // make unique
   493  		value := roachpb.MakeValueFromBytes(randutil.RandBytes(rnd, 1<<20))
   494  		value.InitChecksum(key)
   495  		if err := MVCCPut(context.Background(), rocksdb, nil, key, ts, value, nil); err != nil {
   496  			t.Fatal(err)
   497  		}
   498  		if err := rocksdb.Flush(); err != nil {
   499  			t.Fatal(err)
   500  		}
   501  		keyOnlySize, err := rocksdb.ApproximateDiskBytes(key, key.Next())
   502  		if err != nil {
   503  			t.Fatal(err)
   504  		}
   505  		const mb = int64(1 << 20)
   506  		if min, max, act := mb/2, 2*mb, int64(keyOnlySize); act < min || act > max {
   507  			t.Fatalf("iteration %d: new kv pair estimated at %s; expected between %s and %s",
   508  				i+1, humanizeutil.IBytes(act), humanizeutil.IBytes(min), humanizeutil.IBytes(max))
   509  		}
   510  
   511  		allSize, err := rocksdb.ApproximateDiskBytes(roachpb.KeyMin, roachpb.KeyMax)
   512  		if err != nil {
   513  			t.Fatal(err)
   514  		}
   515  
   516  		if min, max, act := int64(i)*mb, int64(i+2)*mb, int64(allSize); act < min || act > max {
   517  			t.Fatalf("iteration %d: total size estimated at %s; expected between %s and %s",
   518  				i+1, humanizeutil.IBytes(act), humanizeutil.IBytes(min), humanizeutil.IBytes(max))
   519  		}
   520  
   521  	}
   522  }
   523  
   524  func TestSSTableInfosString(t *testing.T) {
   525  	defer leaktest.AfterTest(t)()
   526  
   527  	info := func(level int, size int64) SSTableInfo {
   528  		return SSTableInfo{
   529  			Level: level,
   530  			Size:  size,
   531  		}
   532  	}
   533  	tables := SSTableInfos{
   534  		info(1, 7<<20),
   535  		info(1, 1<<20),
   536  		info(1, 63<<10),
   537  		info(2, 10<<20),
   538  		info(2, 8<<20),
   539  		info(2, 13<<20),
   540  		info(2, 31<<20),
   541  		info(2, 13<<20),
   542  		info(2, 30<<20),
   543  		info(2, 5<<20),
   544  		info(3, 129<<20),
   545  		info(3, 129<<20),
   546  		info(3, 129<<20),
   547  		info(3, 9<<20),
   548  		info(3, 129<<20),
   549  		info(3, 129<<20),
   550  		info(3, 129<<20),
   551  		info(3, 93<<20),
   552  		info(3, 129<<20),
   553  		info(3, 129<<20),
   554  		info(3, 122<<20),
   555  		info(3, 129<<20),
   556  		info(3, 129<<20),
   557  		info(3, 129<<20),
   558  		info(3, 129<<20),
   559  		info(3, 129<<20),
   560  		info(3, 129<<20),
   561  		info(3, 24<<20),
   562  		info(3, 18<<20),
   563  	}
   564  	expected := `1 [   8M  3 ]: 7M 1M 63K
   565  2 [ 110M  7 ]: 31M 30M 13M[2] 10M 8M 5M
   566  3 [   2G 19 ]: 129M[14] 122M 93M 24M 18M 9M
   567  `
   568  	sort.Sort(tables)
   569  	s := tables.String()
   570  	if expected != s {
   571  		t.Fatalf("expected\n%s\ngot\n%s", expected, s)
   572  	}
   573  }
   574  
   575  func TestReadAmplification(t *testing.T) {
   576  	defer leaktest.AfterTest(t)()
   577  
   578  	info := func(level int, size int64) SSTableInfo {
   579  		return SSTableInfo{
   580  			Level: level,
   581  			Size:  size,
   582  		}
   583  	}
   584  
   585  	tables1 := SSTableInfos{
   586  		info(0, 0),
   587  		info(0, 0),
   588  		info(0, 0),
   589  		info(1, 0),
   590  	}
   591  	if a, e := tables1.ReadAmplification(), 4; a != e {
   592  		t.Errorf("got %d, expected %d", a, e)
   593  	}
   594  
   595  	tables2 := SSTableInfos{
   596  		info(0, 0),
   597  		info(1, 0),
   598  		info(2, 0),
   599  		info(3, 0),
   600  	}
   601  	if a, e := tables2.ReadAmplification(), 4; a != e {
   602  		t.Errorf("got %d, expected %d", a, e)
   603  	}
   604  
   605  	tables3 := SSTableInfos{
   606  		info(1, 0),
   607  		info(0, 0),
   608  		info(0, 0),
   609  		info(0, 0),
   610  		info(1, 0),
   611  		info(1, 0),
   612  		info(2, 0),
   613  		info(3, 0),
   614  		info(6, 0),
   615  	}
   616  	if a, e := tables3.ReadAmplification(), 7; a != e {
   617  		t.Errorf("got %d, expected %d", a, e)
   618  	}
   619  }
   620  
   621  func TestInMemIllegalOption(t *testing.T) {
   622  	defer leaktest.AfterTest(t)()
   623  
   624  	cache := NewRocksDBCache(10 << 20 /* 10mb */)
   625  	defer cache.Release()
   626  
   627  	r := &RocksDB{
   628  		cfg: RocksDBConfig{
   629  			StorageConfig: base.StorageConfig{
   630  				MustExist: true,
   631  			},
   632  		},
   633  		// dir: empty dir == "mem" RocksDB instance.
   634  		cache: cache.ref(),
   635  	}
   636  	err := r.open()
   637  	const expErr = `could not open rocksdb instance: Invalid argument: ` +
   638  		`: does not exist \(create_if_missing is false\)`
   639  	if !testutils.IsError(err, expErr) {
   640  		t.Error(err)
   641  	}
   642  }
   643  
   644  func TestConcurrentBatch(t *testing.T) {
   645  	defer leaktest.AfterTest(t)()
   646  
   647  	if testutils.NightlyStress() || util.RaceEnabled {
   648  		t.Skip()
   649  	}
   650  
   651  	dir, err := ioutil.TempDir("", t.Name())
   652  	if err != nil {
   653  		t.Fatal(err)
   654  	}
   655  	defer func() {
   656  		if err := os.RemoveAll(dir); err != nil {
   657  			t.Fatal(err)
   658  		}
   659  	}()
   660  
   661  	db, err := NewRocksDB(
   662  		RocksDBConfig{
   663  			StorageConfig: base.StorageConfig{
   664  				Settings: cluster.MakeTestingClusterSettings(),
   665  				Dir:      dir,
   666  			},
   667  		},
   668  		RocksDBCache{},
   669  	)
   670  	if err != nil {
   671  		t.Fatalf("could not create new rocksdb db instance at %s: %+v", dir, err)
   672  	}
   673  	defer db.Close()
   674  
   675  	// Prepare 16 4 MB batches containing non-overlapping contents.
   676  	var batches []Batch
   677  	for i := 0; i < 16; i++ {
   678  		batch := db.NewBatch()
   679  		for j := 0; true; j++ {
   680  			key := encoding.EncodeUvarintAscending([]byte("bar"), uint64(i))
   681  			key = encoding.EncodeUvarintAscending(key, uint64(j))
   682  			if err := batch.Put(MakeMVCCMetadataKey(key), nil); err != nil {
   683  				t.Fatal(err)
   684  			}
   685  			const targetSize = 4 << 20
   686  			if targetSize < maxBatchGroupSize {
   687  				t.Fatalf("target size (%d) should be larger than the max batch group size (%d)",
   688  					targetSize, maxBatchGroupSize)
   689  			}
   690  			if batch.Len() >= targetSize {
   691  				break
   692  			}
   693  		}
   694  		batches = append(batches, batch)
   695  	}
   696  
   697  	errChan := make(chan error, len(batches))
   698  
   699  	// Concurrently write all the batches.
   700  	for _, batch := range batches {
   701  		go func(batch Batch) {
   702  			errChan <- batch.Commit(false /* sync */)
   703  		}(batch)
   704  	}
   705  
   706  	// While the batch writes are in progress, try to write another key.
   707  	time.Sleep(100 * time.Millisecond)
   708  	remainingBatches := len(batches)
   709  	for i := 0; remainingBatches > 0; i++ {
   710  		select {
   711  		case err := <-errChan:
   712  			if err != nil {
   713  				t.Fatal(err)
   714  			}
   715  			remainingBatches--
   716  		default:
   717  		}
   718  
   719  		// This write can get delayed excessively if we hit the max memtable count
   720  		// or the L0 stop writes threshold.
   721  		start := timeutil.Now()
   722  		key := encoding.EncodeUvarintAscending([]byte("foo"), uint64(i))
   723  		if err := db.Put(MakeMVCCMetadataKey(key), nil); err != nil {
   724  			t.Fatal(err)
   725  		}
   726  		if elapsed := timeutil.Since(start); elapsed >= 10*time.Second {
   727  			t.Fatalf("write took %0.1fs\n", elapsed.Seconds())
   728  		}
   729  	}
   730  }
   731  
   732  // TestRocksDBSstFileWriterTruncate ensures that sum of the chunks created by
   733  // calling Truncate on a RocksDBSstFileWriter is equivalent to an SST built
   734  // without ever calling Truncate.
   735  func TestRocksDBSstFileWriterTruncate(t *testing.T) {
   736  	defer leaktest.AfterTest(t)()
   737  
   738  	// Truncate will be used on this writer.
   739  	sst1, err := MakeRocksDBSstFileWriter()
   740  	if err != nil {
   741  		t.Fatal(err)
   742  	}
   743  	defer sst1.Close()
   744  
   745  	// Truncate will not be used on this writer.
   746  	sst2, err := MakeRocksDBSstFileWriter()
   747  	if err != nil {
   748  		t.Fatal(err)
   749  	}
   750  	defer sst2.Close()
   751  
   752  	const keyLen = 10
   753  	const valLen = 950
   754  	ts := hlc.Timestamp{WallTime: 1}
   755  	key := MVCCKey{Key: roachpb.Key(make([]byte, keyLen)), Timestamp: ts}
   756  	value := make([]byte, valLen)
   757  
   758  	var resBuf1, resBuf2 []byte
   759  	const entries = 100000
   760  	const truncateChunk = entries / 10
   761  	for i := 0; i < entries; i++ {
   762  		key.Key = []byte(fmt.Sprintf("%09d", i))
   763  		copy(value, key.Key)
   764  
   765  		if err := sst1.Put(key, value); err != nil {
   766  			t.Fatal(err)
   767  		}
   768  		if err := sst2.Put(key, value); err != nil {
   769  			t.Fatal(err)
   770  		}
   771  
   772  		if i > 0 && i%truncateChunk == 0 {
   773  			sst1Chunk, err := sst1.Truncate()
   774  			if err != nil {
   775  				t.Fatal(err)
   776  			}
   777  			t.Logf("iteration %d, truncate chunk\tlen=%d", i, len(sst1Chunk))
   778  
   779  			// Even though we added keys, it is not guaranteed strictly by the
   780  			// contract of Truncate that a byte slice will be returned. This is
   781  			// because the keys may be in un-flushed blocks. This test had been tuned
   782  			// such that every other batch chunk is always large enough to require at
   783  			// least one block to be flushed.
   784  			empty := len(sst1Chunk) == 0
   785  			if i%(2*truncateChunk) == 0 {
   786  				if empty {
   787  					t.Fatalf("expected non-empty SST chunk during iteration %d", i)
   788  				}
   789  				resBuf1 = append(resBuf1, sst1Chunk...)
   790  			} else {
   791  				if !empty {
   792  					t.Fatalf("expected empty SST chunk during iteration %d", i)
   793  				}
   794  			}
   795  		}
   796  	}
   797  
   798  	sst1FinishBuf, err := sst1.Finish()
   799  	if err != nil {
   800  		t.Fatal(err)
   801  	}
   802  	resBuf1 = append(resBuf1, sst1FinishBuf...)
   803  	t.Logf("truncated sst final chunk\t\tlen=%d", len(sst1FinishBuf))
   804  
   805  	resBuf2, err = sst2.Finish()
   806  	if err != nil {
   807  		t.Fatal(err)
   808  	}
   809  	t.Logf("non-truncated sst final chunk\tlen=%d", len(resBuf2))
   810  
   811  	if !bytes.Equal(resBuf1, resBuf2) {
   812  		t.Errorf("expected SST made up of truncate chunks (len=%d) to be equivalent to SST that "+
   813  			"was not (len=%d)", len(sst1FinishBuf), len(resBuf2))
   814  	}
   815  }
   816  
   817  func BenchmarkRocksDBSstFileWriter(b *testing.B) {
   818  	dir, err := ioutil.TempDir("", "BenchmarkRocksDBSstFileWriter")
   819  	if err != nil {
   820  		b.Fatal(err)
   821  	}
   822  	defer func() {
   823  		if err := os.RemoveAll(dir); err != nil {
   824  			b.Fatal(err)
   825  		}
   826  	}()
   827  
   828  	const maxEntries = 100000
   829  	const keyLen = 10
   830  	const valLen = 100
   831  	ts := hlc.Timestamp{WallTime: timeutil.Now().UnixNano()}
   832  	kv := MVCCKeyValue{
   833  		Key:   MVCCKey{Key: roachpb.Key(make([]byte, keyLen)), Timestamp: ts},
   834  		Value: make([]byte, valLen),
   835  	}
   836  
   837  	b.ResetTimer()
   838  	sst, err := MakeRocksDBSstFileWriter()
   839  	if err != nil {
   840  		b.Fatal(sst)
   841  	}
   842  	defer sst.Close()
   843  	for i := 1; i <= b.N; i++ {
   844  		if i%maxEntries == 0 {
   845  			if _, err := sst.Finish(); err != nil {
   846  				b.Fatal(err)
   847  			}
   848  			sst, err = MakeRocksDBSstFileWriter()
   849  			if err != nil {
   850  				b.Fatal(sst)
   851  			}
   852  			defer sst.Close()
   853  		}
   854  
   855  		b.StopTimer()
   856  		kv.Key.Key = []byte(fmt.Sprintf("%09d", i))
   857  		copy(kv.Value, kv.Key.Key)
   858  		b.StartTimer()
   859  		if err := sst.Put(kv.Key, kv.Value); err != nil {
   860  			b.Fatal(err)
   861  		}
   862  	}
   863  	b.SetBytes(keyLen + valLen)
   864  }
   865  
   866  func BenchmarkRocksDBSstFileReader(b *testing.B) {
   867  	dir, err := ioutil.TempDir("", "BenchmarkRocksDBSstFileReader")
   868  	if err != nil {
   869  		b.Fatal(err)
   870  	}
   871  	defer func() {
   872  		if err := os.RemoveAll(dir); err != nil {
   873  			b.Fatal(err)
   874  		}
   875  	}()
   876  
   877  	var sstContents []byte
   878  	{
   879  		const maxEntries = 100000
   880  		const keyLen = 10
   881  		const valLen = 100
   882  		b.SetBytes(keyLen + valLen)
   883  
   884  		ts := hlc.Timestamp{WallTime: timeutil.Now().UnixNano()}
   885  		kv := MVCCKeyValue{
   886  			Key:   MVCCKey{Key: roachpb.Key(make([]byte, keyLen)), Timestamp: ts},
   887  			Value: make([]byte, valLen),
   888  		}
   889  
   890  		sst, err := MakeRocksDBSstFileWriter()
   891  		if err != nil {
   892  			b.Fatal(sst)
   893  		}
   894  		defer sst.Close()
   895  		var entries = b.N
   896  		if entries > maxEntries {
   897  			entries = maxEntries
   898  		}
   899  		for i := 0; i < entries; i++ {
   900  			kv.Key.Key = []byte(fmt.Sprintf("%09d", i))
   901  			copy(kv.Value, kv.Key.Key)
   902  			if err := sst.Put(kv.Key, kv.Value); err != nil {
   903  				b.Fatal(err)
   904  			}
   905  		}
   906  		sstContents, err = sst.Finish()
   907  		if err != nil {
   908  			b.Fatal(err)
   909  		}
   910  	}
   911  
   912  	b.ResetTimer()
   913  	sst := MakeRocksDBSstFileReader()
   914  	defer sst.Close()
   915  
   916  	if err := sst.IngestExternalFile(sstContents); err != nil {
   917  		b.Fatal(err)
   918  	}
   919  	count := 0
   920  	iterateFn := func(kv MVCCKeyValue) (bool, error) {
   921  		count++
   922  		if count >= b.N {
   923  			return true, nil
   924  		}
   925  		return false, nil
   926  	}
   927  	for {
   928  		if err := sst.Iterate(keys.MinKey, keys.MaxKey, iterateFn); err != nil {
   929  			b.Fatal(err)
   930  		}
   931  		if count >= b.N {
   932  			break
   933  		}
   934  	}
   935  }
   936  
   937  func key(s string) MVCCKey {
   938  	return MakeMVCCMetadataKey([]byte(s))
   939  }
   940  
   941  // Regression test for https://github.com/facebook/rocksdb/issues/2752. Range
   942  // deletion tombstones between different snapshot stripes are not stored in
   943  // order, so the first tombstone of each snapshot stripe should be checked as a
   944  // smallest candidate.
   945  func TestRocksDBDeleteRangeBug(t *testing.T) {
   946  	defer leaktest.AfterTest(t)()
   947  	dir, dirCleanup := testutils.TempDir(t)
   948  	defer dirCleanup()
   949  
   950  	db, err := NewRocksDB(
   951  		RocksDBConfig{
   952  			StorageConfig: base.StorageConfig{
   953  				Settings: cluster.MakeTestingClusterSettings(),
   954  				Dir:      dir,
   955  			},
   956  		},
   957  		RocksDBCache{},
   958  	)
   959  	if err != nil {
   960  		t.Fatalf("could not create new rocksdb db instance at %s: %+v", dir, err)
   961  	}
   962  	defer db.Close()
   963  
   964  	if err := db.Put(key("a"), []byte("a")); err != nil {
   965  		t.Fatal(err)
   966  	}
   967  	if err := db.Flush(); err != nil {
   968  		t.Fatal(err)
   969  	}
   970  	if err := db.Compact(); err != nil {
   971  		t.Fatal(err)
   972  	}
   973  
   974  	func() {
   975  		if err := db.ClearRange(key("b"), key("c")); err != nil {
   976  			t.Fatal(err)
   977  		}
   978  		// Hold a snapshot to separate these two delete ranges.
   979  		snap := db.NewSnapshot()
   980  		defer snap.Close()
   981  		if err := db.ClearRange(key("a"), key("b")); err != nil {
   982  			t.Fatal(err)
   983  		}
   984  		if err := db.Flush(); err != nil {
   985  			t.Fatal(err)
   986  		}
   987  	}()
   988  
   989  	if err := db.Compact(); err != nil {
   990  		t.Fatal(err)
   991  	}
   992  
   993  	iter := db.NewIterator(IterOptions{UpperBound: roachpb.KeyMax})
   994  	iter.SeekGE(key("a"))
   995  	if ok, _ := iter.Valid(); ok {
   996  		t.Fatalf("unexpected key: %s", iter.Key())
   997  	}
   998  	iter.Close()
   999  }
  1000  
  1001  func createTestSSTableInfos() SSTableInfos {
  1002  	ssti := SSTableInfos{
  1003  		// Level 0.
  1004  		{Level: 0, Size: 20, Start: key("a"), End: key("z")},
  1005  		{Level: 0, Size: 15, Start: key("a"), End: key("k")},
  1006  		// Level 1.
  1007  		{Level: 1, Size: 200, Start: key("a"), End: key("j")},
  1008  		{Level: 1, Size: 100, Start: key("k"), End: key("o")},
  1009  		{Level: 1, Size: 100, Start: key("r"), End: key("t")},
  1010  		// Level 2.
  1011  		{Level: 2, Size: 201, Start: key("a"), End: key("c")},
  1012  		{Level: 2, Size: 200, Start: key("d"), End: key("f")},
  1013  		{Level: 2, Size: 300, Start: key("h"), End: key("r")},
  1014  		{Level: 2, Size: 405, Start: key("s"), End: key("z")},
  1015  		// Level 3.
  1016  		{Level: 3, Size: 667, Start: key("a"), End: key("c")},
  1017  		{Level: 3, Size: 230, Start: key("d"), End: key("f")},
  1018  		{Level: 3, Size: 332, Start: key("h"), End: key("i")},
  1019  		{Level: 3, Size: 923, Start: key("k"), End: key("n")},
  1020  		{Level: 3, Size: 143, Start: key("n"), End: key("o")},
  1021  		{Level: 3, Size: 621, Start: key("p"), End: key("s")},
  1022  		{Level: 3, Size: 411, Start: key("u"), End: key("x")},
  1023  		// Level 4.
  1024  		{Level: 4, Size: 215, Start: key("a"), End: key("b")},
  1025  		{Level: 4, Size: 211, Start: key("b"), End: key("d")},
  1026  		{Level: 4, Size: 632, Start: key("e"), End: key("f")},
  1027  		{Level: 4, Size: 813, Start: key("f"), End: key("h")},
  1028  		{Level: 4, Size: 346, Start: key("h"), End: key("j")},
  1029  		{Level: 4, Size: 621, Start: key("j"), End: key("l")},
  1030  		{Level: 4, Size: 681, Start: key("m"), End: key("o")},
  1031  		{Level: 4, Size: 521, Start: key("o"), End: key("r")},
  1032  		{Level: 4, Size: 135, Start: key("r"), End: key("t")},
  1033  		{Level: 4, Size: 622, Start: key("t"), End: key("v")},
  1034  		{Level: 4, Size: 672, Start: key("x"), End: key("z")},
  1035  	}
  1036  	sort.Sort(ssti)
  1037  	return ssti
  1038  }
  1039  
  1040  func TestSSTableInfosByLevel(t *testing.T) {
  1041  	defer leaktest.AfterTest(t)()
  1042  	ssti := NewSSTableInfosByLevel(createTestSSTableInfos())
  1043  
  1044  	// First, verify that each level is sorted by start key, not size.
  1045  	for level, l := range ssti.levels {
  1046  		if level == 0 {
  1047  			continue
  1048  		}
  1049  		lastInfo := l[0]
  1050  		for _, info := range l[1:] {
  1051  			if !lastInfo.Start.Less(info.Start) {
  1052  				t.Errorf("sort failed (%s >= %s) for level %d", lastInfo.Start, info.Start, level)
  1053  			}
  1054  		}
  1055  	}
  1056  	if a, e := ssti.MaxLevel(), 4; a != e {
  1057  		t.Errorf("expected MaxLevel() == %d; got %d", e, a)
  1058  	}
  1059  
  1060  	// Next, verify various contiguous overlap scenarios.
  1061  	testCases := []struct {
  1062  		span        roachpb.Span
  1063  		expMaxLevel int
  1064  	}{
  1065  		// The full a-z span overlaps more than two SSTables at all levels L1-L4
  1066  		{span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")}, expMaxLevel: 0},
  1067  		// The a-j span overlaps the first three SSTables in L2, so max level is L1.
  1068  		{span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("j")}, expMaxLevel: 1},
  1069  		// The k-o span overlaps only two adjacent L4 SSTs: j-l & m-o.
  1070  		{span: roachpb.Span{Key: roachpb.Key("k"), EndKey: roachpb.Key("o")}, expMaxLevel: 4},
  1071  		// The K0-o0 span hits three SSTs in L4: j-l, m-o, & o-r.
  1072  		{span: roachpb.Span{Key: roachpb.Key("k0"), EndKey: roachpb.Key("o0")}, expMaxLevel: 3},
  1073  		// The k-z span overlaps the last 4 SSTs in L3.
  1074  		{span: roachpb.Span{Key: roachpb.Key("k"), EndKey: roachpb.Key("z")}, expMaxLevel: 2},
  1075  		// The c-c0 span overlaps only the second L4 SST.
  1076  		{span: roachpb.Span{Key: roachpb.Key("c"), EndKey: roachpb.Key("c0")}, expMaxLevel: 4},
  1077  		// The a-f span full overlaps the first three L4 SSTs.
  1078  		{span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("f")}, expMaxLevel: 3},
  1079  		// The a-d0 span only overlaps the first two L4 SSTs.
  1080  		{span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("d0")}, expMaxLevel: 4},
  1081  		// The a-e span only overlaps the first two L4 SSTs. It only is adjacent to the 3rd.
  1082  		{span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("e")}, expMaxLevel: 4},
  1083  		// The a-d span overlaps fully the first two L4 SSTs.
  1084  		{span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("d")}, expMaxLevel: 4},
  1085  		// The a-a0 span overlaps only the first L4 SST.
  1086  		{span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("a0")}, expMaxLevel: 4},
  1087  		// The 0-1 span doesn't overlap any L4 SSTs.
  1088  		{span: roachpb.Span{Key: roachpb.Key("0"), EndKey: roachpb.Key("1")}, expMaxLevel: 4},
  1089  		// The Z-a span doesn't overlap any L4 SSTs, just touches the start of the first.
  1090  		{span: roachpb.Span{Key: roachpb.Key("Z"), EndKey: roachpb.Key("a")}, expMaxLevel: 4},
  1091  		// The Z-a0 span overlaps only the first L4 SST.
  1092  		{span: roachpb.Span{Key: roachpb.Key("Z"), EndKey: roachpb.Key("a0")}, expMaxLevel: 4},
  1093  		// The z-z0 span doesn't overlap any L4 SSTs, just touches the end of the last.
  1094  		{span: roachpb.Span{Key: roachpb.Key("z"), EndKey: roachpb.Key("z0")}, expMaxLevel: 4},
  1095  		// The y-z0 span overlaps the last L4 SST.
  1096  		{span: roachpb.Span{Key: roachpb.Key("y"), EndKey: roachpb.Key("z0")}, expMaxLevel: 4},
  1097  	}
  1098  
  1099  	for _, test := range testCases {
  1100  		t.Run(fmt.Sprintf("%s-%s", test.span.Key, test.span.EndKey), func(t *testing.T) {
  1101  			maxLevel := ssti.MaxLevelSpanOverlapsContiguousSSTables(test.span)
  1102  			if test.expMaxLevel != maxLevel {
  1103  				t.Errorf("expected max level %d; got %d", test.expMaxLevel, maxLevel)
  1104  			}
  1105  		})
  1106  	}
  1107  }
  1108  
  1109  func TestRocksDBOptions(t *testing.T) {
  1110  	defer leaktest.AfterTest(t)()
  1111  
  1112  	dir, err := ioutil.TempDir("", "testing")
  1113  	if err != nil {
  1114  		t.Fatal(err)
  1115  	}
  1116  	defer func() {
  1117  		if err := os.RemoveAll(dir); err != nil {
  1118  			t.Fatal(err)
  1119  		}
  1120  	}()
  1121  	rocksdb, err := NewRocksDB(
  1122  		RocksDBConfig{
  1123  			StorageConfig: base.StorageConfig{
  1124  				Settings: cluster.MakeTestingClusterSettings(),
  1125  				Dir:      dir,
  1126  			},
  1127  			RocksDBOptions: "use_fsync=true;" +
  1128  				"min_write_buffer_number_to_merge=2;" +
  1129  				"block_based_table_factory={block_size=4k}",
  1130  		},
  1131  		RocksDBCache{},
  1132  	)
  1133  	if err != nil {
  1134  		t.Fatal(err)
  1135  	}
  1136  	rocksdb.Close()
  1137  
  1138  	paths, err := filepath.Glob(dir + "/OPTIONS-*")
  1139  	if err != nil {
  1140  		t.Fatal(err)
  1141  	}
  1142  	for _, p := range paths {
  1143  		data, err := ioutil.ReadFile(p)
  1144  		if err != nil {
  1145  			t.Fatal(err)
  1146  		}
  1147  
  1148  		options := []string{
  1149  			"use_fsync=true",
  1150  			"min_write_buffer_number_to_merge=2",
  1151  			"block_size=4096",
  1152  		}
  1153  		for _, o := range options {
  1154  			fullOption := fmt.Sprintf("  %s\n", o)
  1155  			if !bytes.Contains(data, []byte(fullOption)) {
  1156  				t.Errorf("unable to find %s in %s", o, p)
  1157  			}
  1158  		}
  1159  	}
  1160  }
  1161  
  1162  // Verify that range tombstones do not result in sstables that cover an
  1163  // exessively large portion of the key space.
  1164  func TestRocksDBDeleteRangeCompaction(t *testing.T) {
  1165  	defer leaktest.AfterTest(t)()
  1166  
  1167  	db := setupMVCCInMemRocksDB(t, "delrange")
  1168  	defer db.Close()
  1169  
  1170  	// Disable automatic compactions which interfere with test expectations
  1171  	// below.
  1172  	if err := db.(*RocksDB).disableAutoCompaction(); err != nil {
  1173  		t.Fatal(err)
  1174  	}
  1175  
  1176  	makeKey := func(prefix string, i int) roachpb.Key {
  1177  		return roachpb.Key(fmt.Sprintf("%s%09d", prefix, i))
  1178  	}
  1179  
  1180  	rnd, _ := randutil.NewPseudoRand()
  1181  
  1182  	// Create sstables in L6 that are half the L6 target size. Any smaller and
  1183  	// RocksDB might choose to compact them.
  1184  	const targetSize = 64 << 20
  1185  	const numEntries = 10000
  1186  	const keySize = 10
  1187  	const valueSize = (targetSize / numEntries) - keySize
  1188  
  1189  	for _, p := range "abc" {
  1190  		sst, err := MakeRocksDBSstFileWriter()
  1191  		if err != nil {
  1192  			t.Fatal(sst)
  1193  		}
  1194  		defer sst.Close()
  1195  
  1196  		for i := 0; i < numEntries; i++ {
  1197  			if err := sst.Put(MVCCKey{Key: makeKey(string(p), i)}, randutil.RandBytes(rnd, valueSize)); err != nil {
  1198  				t.Fatal(err)
  1199  			}
  1200  		}
  1201  
  1202  		sstContents, err := sst.Finish()
  1203  		if err != nil {
  1204  			t.Fatal(err)
  1205  		}
  1206  
  1207  		filename := fmt.Sprintf("ingest")
  1208  		if err := db.WriteFile(filename, sstContents); err != nil {
  1209  			t.Fatal(err)
  1210  		}
  1211  
  1212  		if err := db.IngestExternalFiles(context.Background(), []string{filename}); err != nil {
  1213  			t.Fatal(err)
  1214  		}
  1215  		if testing.Verbose() {
  1216  			fmt.Printf("ingested %s\n", string(p))
  1217  		}
  1218  	}
  1219  
  1220  	getSSTables := func() string {
  1221  		ssts := db.GetSSTables()
  1222  		sort.Slice(ssts, func(i, j int) bool {
  1223  			a, b := ssts[i], ssts[j]
  1224  			if a.Level < b.Level {
  1225  				return true
  1226  			}
  1227  			if a.Level > b.Level {
  1228  				return false
  1229  			}
  1230  			return a.Start.Less(b.Start)
  1231  		})
  1232  		var buf bytes.Buffer
  1233  		fmt.Fprintf(&buf, "\n")
  1234  		for i := range ssts {
  1235  			fmt.Fprintf(&buf, "%d: %s - %s\n",
  1236  				ssts[i].Level, ssts[i].Start.Key, ssts[i].End.Key)
  1237  		}
  1238  		return buf.String()
  1239  	}
  1240  
  1241  	verifySSTables := func(expected string) {
  1242  		actual := getSSTables()
  1243  		if expected != actual {
  1244  			t.Fatalf("expected%sgot%s", expected, actual)
  1245  		}
  1246  		if testing.Verbose() {
  1247  			fmt.Printf("%s", actual)
  1248  		}
  1249  	}
  1250  
  1251  	// After setup there should be 3 sstables.
  1252  	verifySSTables(`
  1253  6: "a000000000" - "a000009999"
  1254  6: "b000000000" - "b000009999"
  1255  6: "c000000000" - "c000009999"
  1256  `)
  1257  
  1258  	// Generate a batch which writes to the very first key, and then deletes the
  1259  	// range of keys covered by the last sstable.
  1260  	batch := db.NewBatch()
  1261  	if err := batch.Put(MakeMVCCMetadataKey(makeKey("a", 0)), []byte("hello")); err != nil {
  1262  		t.Fatal(err)
  1263  	}
  1264  	if err := batch.ClearRange(MakeMVCCMetadataKey(makeKey("c", 0)),
  1265  		MakeMVCCMetadataKey(makeKey("c", numEntries))); err != nil {
  1266  		t.Fatal(err)
  1267  	}
  1268  	if err := batch.Commit(true); err != nil {
  1269  		t.Fatal(err)
  1270  	}
  1271  	batch.Close()
  1272  	if err := db.Flush(); err != nil {
  1273  		t.Fatal(err)
  1274  	}
  1275  
  1276  	// After flushing, there is a single additional L0 table that covers the
  1277  	// entire key range.
  1278  	verifySSTables(`
  1279  0: "a000000000" - "c000010000"
  1280  6: "a000000000" - "a000009999"
  1281  6: "b000000000" - "b000009999"
  1282  6: "c000000000" - "c000009999"
  1283  `)
  1284  
  1285  	// Compacting the key range covering the last sstable should result in that
  1286  	// sstable being deleted. Prior to the hack in dbClearRange, all of the
  1287  	// sstables would be compacted resulting in 2 L6 sstables with different
  1288  	// boundaries than the ones below.
  1289  	_ = db.CompactRange(makeKey("c", 0), makeKey("c", numEntries), false)
  1290  	verifySSTables(`
  1291  5: "a000000000" - "a000000000"
  1292  6: "a000000000" - "a000009999"
  1293  6: "b000000000" - "b000009999"
  1294  `)
  1295  }
  1296  
  1297  func BenchmarkRocksDBDeleteRangeIterate(b *testing.B) {
  1298  	for _, entries := range []int{10, 1000, 100000} {
  1299  		b.Run(fmt.Sprintf("entries=%d", entries), func(b *testing.B) {
  1300  			for _, deleted := range []int{entries, entries - 1} {
  1301  				b.Run(fmt.Sprintf("deleted=%d", deleted), func(b *testing.B) {
  1302  					db := setupMVCCInMemRocksDB(b, "unused")
  1303  					defer db.Close()
  1304  
  1305  					makeKey := func(i int) roachpb.Key {
  1306  						return roachpb.Key(fmt.Sprintf("%09d", i))
  1307  					}
  1308  
  1309  					// Create an SST with N entries and ingest it. This is a fast way to get a
  1310  					// lot of entries into RocksDB.
  1311  					{
  1312  						sst, err := MakeRocksDBSstFileWriter()
  1313  						if err != nil {
  1314  							b.Fatal(sst)
  1315  						}
  1316  						defer sst.Close()
  1317  
  1318  						for i := 0; i < entries; i++ {
  1319  							if err := sst.Put(MVCCKey{Key: makeKey(i)}, nil); err != nil {
  1320  								b.Fatal(err)
  1321  							}
  1322  						}
  1323  
  1324  						sstContents, err := sst.Finish()
  1325  						if err != nil {
  1326  							b.Fatal(err)
  1327  						}
  1328  
  1329  						filename := fmt.Sprintf("ingest")
  1330  						if err := db.WriteFile(filename, sstContents); err != nil {
  1331  							b.Fatal(err)
  1332  						}
  1333  
  1334  						err = db.IngestExternalFiles(context.Background(), []string{filename})
  1335  						if err != nil {
  1336  							b.Fatal(err)
  1337  						}
  1338  					}
  1339  
  1340  					// Create a range tombstone that deletes most (or all) of those entries.
  1341  					from := makeKey(0)
  1342  					to := makeKey(deleted)
  1343  					if err := db.ClearRange(MakeMVCCMetadataKey(from), MakeMVCCMetadataKey(to)); err != nil {
  1344  						b.Fatal(err)
  1345  					}
  1346  
  1347  					b.ResetTimer()
  1348  					for i := 0; i < b.N; i++ {
  1349  						iter := db.NewIterator(IterOptions{UpperBound: roachpb.KeyMax})
  1350  						iter.SeekGE(MakeMVCCMetadataKey(from))
  1351  						ok, err := iter.Valid()
  1352  						if err != nil {
  1353  							b.Fatal(err)
  1354  						}
  1355  						if deleted < entries {
  1356  							if !ok {
  1357  								b.Fatal("key not found")
  1358  							}
  1359  						} else if ok {
  1360  							b.Fatal("unexpected key found")
  1361  						}
  1362  						iter.Close()
  1363  					}
  1364  				})
  1365  			}
  1366  		})
  1367  	}
  1368  }
  1369  
  1370  func TestMakeBatchGroup(t *testing.T) {
  1371  	defer leaktest.AfterTest(t)()
  1372  
  1373  	// Assume every newly instantiated batch has size 12 (header only).
  1374  	testCases := []struct {
  1375  		maxSize   int
  1376  		groupSize []int
  1377  		leader    []bool
  1378  		groups    []int
  1379  	}{
  1380  		{1, []int{12, 12, 12}, []bool{true, true, true}, []int{1, 1, 1}},
  1381  		{23, []int{12, 12, 12}, []bool{true, true, true}, []int{1, 1, 1}},
  1382  		{24, []int{12, 24, 12}, []bool{true, false, true}, []int{2, 1}},
  1383  		{35, []int{12, 24, 12}, []bool{true, false, true}, []int{2, 1}},
  1384  		{36, []int{12, 24, 36}, []bool{true, false, false}, []int{3}},
  1385  		{
  1386  			48,
  1387  			[]int{12, 24, 36, 48, 12},
  1388  			[]bool{true, false, false, false, true},
  1389  			[]int{4, 1},
  1390  		},
  1391  	}
  1392  	for _, c := range testCases {
  1393  		t.Run("", func(t *testing.T) {
  1394  			var pending []*rocksDBBatch
  1395  			var groupSize int
  1396  			for i := range c.groupSize {
  1397  				b := &rocksDBBatch{}
  1398  				var leader bool
  1399  				pending, groupSize, leader = makeBatchGroup(pending, b, groupSize, c.maxSize)
  1400  				if c.groupSize[i] != groupSize {
  1401  					t.Fatalf("expected group size %d, but found %d", c.groupSize[i], groupSize)
  1402  				}
  1403  				if c.leader[i] != leader {
  1404  					t.Fatalf("expected leader %t, but found %t", c.leader[i], leader)
  1405  				}
  1406  			}
  1407  			var groups []int
  1408  			for len(pending) > 0 {
  1409  				var group []*rocksDBBatch
  1410  				group, pending = nextBatchGroup(pending)
  1411  				groups = append(groups, len(group))
  1412  			}
  1413  			if !reflect.DeepEqual(c.groups, groups) {
  1414  				t.Fatalf("expected %d, but found %d", c.groups, groups)
  1415  			}
  1416  		})
  1417  	}
  1418  }
  1419  
  1420  // Verify that RocksDBSstFileWriter works with time bounded iterators.
  1421  func TestSstFileWriterTimeBound(t *testing.T) {
  1422  	defer leaktest.AfterTest(t)()
  1423  
  1424  	ctx := context.Background()
  1425  	db := setupMVCCInMemRocksDB(t, "sstwriter-timebound")
  1426  	defer db.Close()
  1427  
  1428  	for walltime := int64(1); walltime < 5; walltime++ {
  1429  		sst, err := MakeRocksDBSstFileWriter()
  1430  		if err != nil {
  1431  			t.Fatal(sst)
  1432  		}
  1433  		defer sst.Close()
  1434  		if err := sst.Put(
  1435  			MVCCKey{Key: []byte("key"), Timestamp: hlc.Timestamp{WallTime: walltime}},
  1436  			[]byte("value"),
  1437  		); err != nil {
  1438  			t.Fatal(err)
  1439  		}
  1440  		sstContents, err := sst.Finish()
  1441  		if err != nil {
  1442  			t.Fatal(err)
  1443  		}
  1444  		if err := db.WriteFile(`ingest`, sstContents); err != nil {
  1445  			t.Fatal(err)
  1446  		}
  1447  		if err := db.IngestExternalFiles(ctx, []string{`ingest`}); err != nil {
  1448  			t.Fatal(err)
  1449  		}
  1450  	}
  1451  
  1452  	it := db.NewIterator(IterOptions{
  1453  		UpperBound:       keys.MaxKey,
  1454  		MinTimestampHint: hlc.Timestamp{WallTime: 2},
  1455  		MaxTimestampHint: hlc.Timestamp{WallTime: 3},
  1456  		WithStats:        true,
  1457  	})
  1458  	defer it.Close()
  1459  	for it.SeekGE(MVCCKey{Key: keys.MinKey}); ; it.Next() {
  1460  		ok, err := it.Valid()
  1461  		if err != nil {
  1462  			t.Fatal(err)
  1463  		}
  1464  		if !ok {
  1465  			break
  1466  		}
  1467  	}
  1468  	if s := it.Stats(); s.TimeBoundNumSSTs != 2 {
  1469  		t.Errorf(`expected 2 sstables got %d`, s.TimeBoundNumSSTs)
  1470  	}
  1471  }
  1472  
  1473  // TestRocksDBWALFileEmptyBatch verifies that committing an empty batch does
  1474  // not write an entry to RocksDB's write-ahead log.
  1475  func TestRocksDBWALFileEmptyBatch(t *testing.T) {
  1476  	defer leaktest.AfterTest(t)()
  1477  
  1478  	dir, cleanup := testutils.TempDir(t)
  1479  	defer cleanup()
  1480  
  1481  	// NB: The in-mem RocksDB instance doesn't support syncing the WAL which is
  1482  	// necessary for this test.
  1483  	e, err := NewRocksDB(
  1484  		RocksDBConfig{
  1485  			StorageConfig: base.StorageConfig{
  1486  				Settings: cluster.MakeTestingClusterSettings(),
  1487  				Dir:      dir,
  1488  			},
  1489  		},
  1490  		RocksDBCache{},
  1491  	)
  1492  	if err != nil {
  1493  		t.Fatal(err)
  1494  	}
  1495  	defer e.Close()
  1496  
  1497  	// Commit a batch with one key.
  1498  	b := e.NewBatch()
  1499  	defer b.Close()
  1500  	if err := b.Put(mvccKey("foo"), []byte{'b', 'a', 'r'}); err != nil {
  1501  		t.Fatal(err)
  1502  	}
  1503  	if err := b.Commit(true /* sync */); err != nil {
  1504  		t.Fatal(err)
  1505  	}
  1506  
  1507  	// Verify that RocksDB has created a non-empty WAL.
  1508  	walsBefore, err := e.GetSortedWALFiles()
  1509  	if err != nil {
  1510  		t.Fatal(err)
  1511  	}
  1512  	if len(walsBefore) != 1 {
  1513  		t.Fatalf("expected exactly one WAL file, but got %d", len(walsBefore))
  1514  	}
  1515  	if walsBefore[0].Size == 0 {
  1516  		t.Fatalf("expected non-empty WAL file")
  1517  	}
  1518  
  1519  	// Commit an empty batch.
  1520  	b = e.NewBatch()
  1521  	defer b.Close()
  1522  	if err := b.Commit(true /* sync */); err != nil {
  1523  		t.Fatal(err)
  1524  	}
  1525  
  1526  	// Verify that the WAL has not changed in size.
  1527  	walsAfter, err := e.GetSortedWALFiles()
  1528  	if err != nil {
  1529  		t.Fatal(err)
  1530  	}
  1531  	if !reflect.DeepEqual(walsBefore, walsAfter) {
  1532  		t.Fatalf("expected wal files %#v after committing empty batch, but got %#v",
  1533  			walsBefore, walsAfter)
  1534  	}
  1535  
  1536  	// Regression test a bug that would accidentally make Commit a no-op (via an
  1537  	// errant fast-path) when a batch contained only LogData.
  1538  	testutils.RunTrueAndFalse(t, "distinct", func(t *testing.T, distinct bool) {
  1539  		walsBefore, err := e.GetSortedWALFiles()
  1540  		if err != nil {
  1541  			t.Fatal(err)
  1542  		}
  1543  		if len(walsBefore) != 1 {
  1544  			t.Fatalf("expected one WAL file, got %d", len(walsBefore))
  1545  		}
  1546  
  1547  		batch := e.NewBatch()
  1548  		defer batch.Close()
  1549  
  1550  		var rw ReadWriter = batch
  1551  		if distinct {
  1552  			// NB: we can't actually close this distinct batch because it auto-
  1553  			// closes when the batch commits.
  1554  			rw = batch.Distinct()
  1555  		}
  1556  
  1557  		if err := rw.LogData([]byte("foo")); err != nil {
  1558  			t.Fatal(err)
  1559  		}
  1560  		if batch.Empty() {
  1561  			t.Error("batch is not empty")
  1562  		}
  1563  
  1564  		if err := batch.Commit(true /* sync */); err != nil {
  1565  			t.Fatal(err)
  1566  		}
  1567  
  1568  		// Verify that the WAL has grown.
  1569  		walsAfter, err := e.GetSortedWALFiles()
  1570  		if err != nil {
  1571  			t.Fatal(err)
  1572  		}
  1573  
  1574  		if len(walsAfter) != 1 {
  1575  			t.Fatalf("expected one WAL file, got %+v", walsAfter)
  1576  		}
  1577  
  1578  		if after, before := walsAfter[0].Size, walsBefore[0].Size; after <= before {
  1579  			t.Fatalf("wal size was expected to increase, got %d -> %d", before, after)
  1580  		}
  1581  	})
  1582  }
  1583  
  1584  // Regression test for https://github.com/facebook/rocksdb/issues/6666.
  1585  func TestRocksDBGlobalSeqnumIssue(t *testing.T) {
  1586  	defer leaktest.AfterTest(t)()
  1587  
  1588  	tempDir, cleanup := testutils.TempDir(t)
  1589  	defer cleanup()
  1590  	db := setupMVCCRocksDB(t, tempDir)
  1591  	defer db.Close()
  1592  
  1593  	keyBase := []byte("ab")
  1594  	valBase := []byte("foobar")
  1595  	valBase2 := []byte("barfoo")
  1596  	key0 := MVCCKey{Key: []byte("aa")}
  1597  
  1598  	// When encoded, this MVCC key is 0x616200, trailer 0x0000000000000001 (seqnum
  1599  	// 0, key type 1 or SET), which gets encoded as little endian.
  1600  	// Including the trailer, this key is encoded internally as
  1601  	// 0x6162000100000000000000
  1602  	key1 := MVCCKey{Key: keyBase}
  1603  	// When encoded, this MVCC key is 0x616200010000000000000009, trailer
  1604  	// 0x0000000000000001 (same as before).
  1605  	// Including the trailer, the internal key is encoded as
  1606  	// 0x6162000100000000000000090100000000000000.
  1607  	// Note that it has a prefix matching the earlier key's full internal key.
  1608  	key2 := MVCCKey{Key: keyBase, Timestamp: hlc.Timestamp{WallTime: 0x0100000000000000}}
  1609  
  1610  	// Bump up the global sequence number to a non-zero number. Also lay down
  1611  	// keys around key1 and key2.
  1612  	if err := db.Put(key0, valBase); err != nil {
  1613  		t.Fatal(err)
  1614  	}
  1615  	for i := 0; i < 50; i++ {
  1616  		key := make([]byte, len(keyBase)+1)
  1617  		copy(key, keyBase)
  1618  		// Make keys of the format ac0, ac1, ...
  1619  		key[1] = 'c'
  1620  		key[2] = byte(i)
  1621  		err := db.Put(MVCCKey{Key: key}, valBase)
  1622  		if err != nil {
  1623  			t.Fatal(err)
  1624  		}
  1625  	}
  1626  	// A flush + compact is necessary to push down the writes above into L6.
  1627  	if err := db.Flush(); err != nil {
  1628  		t.Fatal(err)
  1629  	}
  1630  	if err := db.Compact(); err != nil {
  1631  		t.Fatal(err)
  1632  	}
  1633  	// An open snapshot ensures RocksDB assigns a nonzero global sequence number
  1634  	// to the SSTable we're about to ingest.
  1635  	snapshot := db.NewSnapshot()
  1636  
  1637  	sstFilePath := filepath.Join(db.GetAuxiliaryDir(), "test1.sst")
  1638  	_ = os.MkdirAll(db.GetAuxiliaryDir(), 0755)
  1639  	sstFile, err := os.Create(sstFilePath)
  1640  	if err != nil {
  1641  		t.Fatal(err)
  1642  	}
  1643  	writer := MakeIngestionSSTWriter(sstFile)
  1644  	if err := writer.Put(key1, valBase2); err != nil {
  1645  		t.Fatal(err)
  1646  	}
  1647  	if err := writer.Put(key2, valBase2); err != nil {
  1648  		t.Fatal(err)
  1649  	}
  1650  	if err := writer.Finish(); err != nil {
  1651  		t.Fatal(err)
  1652  	}
  1653  	writer.Close()
  1654  	sstFile.Close()
  1655  
  1656  	// When this file is ingested, it'll be added to L0, since it overlaps in key
  1657  	// bounds (but not actual keys) with the SSTable flushed earlier.
  1658  	if err := db.IngestExternalFiles(context.Background(), []string{sstFilePath}); err != nil {
  1659  		t.Fatal(err)
  1660  	}
  1661  	snapshot.Close()
  1662  	val, err := db.Get(key1)
  1663  	if err != nil {
  1664  		t.Fatal(err)
  1665  	}
  1666  	val2, err := db.Get(key2)
  1667  	if err != nil {
  1668  		t.Fatal(err)
  1669  	}
  1670  	if !bytes.Equal(val, valBase2) || !bytes.Equal(val2, valBase2) {
  1671  		t.Fatalf("expected values to match: %v != %v != 'barfoo'", val, val2)
  1672  	}
  1673  }