github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/rocksdb_test.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package storage 12 13 import ( 14 "bytes" 15 "context" 16 "encoding/json" 17 "fmt" 18 "io/ioutil" 19 "math/rand" 20 "os" 21 "path/filepath" 22 "reflect" 23 "sort" 24 "strconv" 25 "testing" 26 "time" 27 28 "github.com/cockroachdb/cockroach/pkg/base" 29 "github.com/cockroachdb/cockroach/pkg/keys" 30 "github.com/cockroachdb/cockroach/pkg/roachpb" 31 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 32 "github.com/cockroachdb/cockroach/pkg/testutils" 33 "github.com/cockroachdb/cockroach/pkg/util" 34 "github.com/cockroachdb/cockroach/pkg/util/encoding" 35 "github.com/cockroachdb/cockroach/pkg/util/hlc" 36 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 37 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 38 "github.com/cockroachdb/cockroach/pkg/util/log" 39 "github.com/cockroachdb/cockroach/pkg/util/randutil" 40 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 41 ) 42 43 const testCacheSize = 1 << 30 // 1 GB 44 45 // TestBatchReadLaterWrite demonstrates that reading from a batch is not like 46 // reading from a snapshot: writes that occur after opening the batch will be 47 // visible to reads from the batch (whereas using a snapshot, they would not). 
48 func TestBatchReadLaterWrite(t *testing.T) { 49 defer leaktest.AfterTest(t)() 50 51 ctx := context.Background() 52 key := roachpb.Key("a") 53 54 eng := setupMVCCInMemRocksDB(t, "unused") 55 defer eng.Close() 56 57 batch := eng.NewBatch() 58 defer batch.Close() 59 snap := eng.NewSnapshot() 60 defer snap.Close() 61 62 v := roachpb.MakeValueFromString("foo") 63 64 if err := MVCCPut(ctx, eng, nil, key, hlc.Timestamp{}, v, nil); err != nil { 65 t.Fatal(err) 66 } 67 68 // Read from a batch that was opened before the value was written to the 69 // underlying engine. The batch will see the write. 70 { 71 rv, _, err := MVCCGet(ctx, batch, key, hlc.Timestamp{}, MVCCGetOptions{}) 72 if err != nil { 73 t.Fatal(err) 74 } 75 if rv == nil { 76 t.Fatal("value not found") 77 } 78 79 if !rv.Equal(&v) { 80 t.Fatalf("values not equal: put %v, read %v", v, *rv) 81 } 82 } 83 84 // Read from a snapshot opened prior to the write. The snapshot won't see the 85 // write. 86 { 87 rv, _, err := MVCCGet(ctx, snap, key, hlc.Timestamp{}, MVCCGetOptions{}) 88 if err != nil { 89 t.Fatal(err) 90 } 91 if rv != nil { 92 t.Fatalf("value unexpectedly found: %v", *rv) 93 } 94 } 95 } 96 97 func TestBatchIterReadOwnWrite(t *testing.T) { 98 defer leaktest.AfterTest(t)() 99 100 db := setupMVCCInMemRocksDB(t, "iter_read_own_write") 101 defer db.Close() 102 103 b := db.NewBatch() 104 defer b.Close() 105 106 k := MakeMVCCMetadataKey(testKey1) 107 108 before := b.NewIterator(IterOptions{UpperBound: roachpb.KeyMax}) 109 defer before.Close() 110 111 nonBatchBefore := db.NewIterator(IterOptions{UpperBound: roachpb.KeyMax}) 112 defer nonBatchBefore.Close() 113 114 if err := b.Put(k, []byte("abc")); err != nil { 115 t.Fatal(err) 116 } 117 118 // We use a prefix iterator for after in order to workaround the restriction 119 // on concurrent use of more than 1 prefix or normal (non-prefix) iterator on 120 // a batch. 
121 after := b.NewIterator(IterOptions{Prefix: true}) 122 defer after.Close() 123 124 after.SeekGE(k) 125 if ok, err := after.Valid(); !ok { 126 t.Fatalf("write missing on batch iter created after write, err=%v", err) 127 } 128 before.SeekGE(k) 129 if ok, err := before.Valid(); !ok { 130 t.Fatalf("write missing on batch iter created before write, err=%v", err) 131 } 132 nonBatchBefore.SeekGE(k) 133 if ok, err := nonBatchBefore.Valid(); err != nil { 134 t.Fatal(err) 135 } else if ok { 136 t.Fatal("uncommitted write seen by non-batch iter") 137 } 138 139 if err := b.Commit(false /* sync */); err != nil { 140 t.Fatal(err) 141 } 142 143 nonBatchAfter := db.NewIterator(IterOptions{UpperBound: roachpb.KeyMax}) 144 defer nonBatchAfter.Close() 145 146 nonBatchBefore.SeekGE(k) 147 if ok, err := nonBatchBefore.Valid(); err != nil { 148 t.Fatal(err) 149 } else if ok { 150 t.Fatal("committed write seen by non-batch iter created before commit") 151 } 152 nonBatchAfter.SeekGE(k) 153 if ok, err := nonBatchAfter.Valid(); !ok { 154 t.Fatalf("committed write missing by non-batch iter created after commit, err=%v", err) 155 } 156 157 // `Commit` frees the batch, so iterators backed by it should panic. 158 func() { 159 defer func() { 160 if err, expected := recover(), "iterator used after backing engine closed"; err != expected { 161 t.Fatalf("Unexpected panic: expected %q, got %q", expected, err) 162 } 163 }() 164 after.SeekGE(k) 165 t.Fatalf(`Seek on batch-backed iter after batched closed should panic. 166 iter.engine: %T, iter.engine.Closed: %v, batch.Closed %v`, 167 after.(*rocksDBIterator).reader, 168 after.(*rocksDBIterator).reader.Closed(), 169 b.Closed(), 170 ) 171 }() 172 } 173 174 func TestBatchPrefixIter(t *testing.T) { 175 defer leaktest.AfterTest(t)() 176 177 db := setupMVCCInMemRocksDB(t, "iter_read_own_write") 178 defer db.Close() 179 180 b := db.NewBatch() 181 defer b.Close() 182 183 // Set up a batch with: delete("a"), put("b"). 
We'll then prefix seek for "b" 184 // which should succeed and then prefix seek for "a" which should fail. Note 185 // that order of operations is important here to stress the C++ code paths. 186 if err := b.Clear(mvccKey("a")); err != nil { 187 t.Fatal(err) 188 } 189 if err := b.Put(mvccKey("b"), []byte("b")); err != nil { 190 t.Fatal(err) 191 } 192 193 iter := b.NewIterator(IterOptions{Prefix: true}) 194 defer iter.Close() 195 196 iter.SeekGE(mvccKey("b")) 197 if ok, err := iter.Valid(); !ok { 198 t.Fatalf("expected to find \"b\", err=%v", err) 199 } 200 iter.SeekGE(mvccKey("a")) 201 if ok, err := iter.Valid(); err != nil { 202 t.Fatal(err) 203 } else if ok { 204 t.Fatalf("expected to not find anything, found %s -> %q", iter.Key(), iter.Value()) 205 } 206 } 207 208 func TestIterBounds(t *testing.T) { 209 defer leaktest.AfterTest(t)() 210 211 db := setupMVCCInMemRocksDB(t, "iter_bounds") 212 defer db.Close() 213 214 if err := db.Put(mvccKey("0"), []byte("val")); err != nil { 215 t.Fatal(err) 216 } 217 if err := db.Put(mvccKey("a"), []byte("val")); err != nil { 218 t.Fatal(err) 219 } 220 if err := db.Put(mvccKey("b"), []byte("val")); err != nil { 221 t.Fatal(err) 222 } 223 224 testCases := []struct { 225 name string 226 createEngine func() Reader 227 }{ 228 {"batch", func() Reader { return db.NewBatch() }}, 229 {"readonly", func() Reader { return db.NewReadOnly() }}, 230 {"snapshot", func() Reader { return db.NewSnapshot() }}, 231 {"engine", func() Reader { return db }}, 232 } 233 for _, tc := range testCases { 234 t.Run(tc.name, func(t *testing.T) { 235 e := tc.createEngine() 236 defer e.Close() 237 238 if _, ok := e.(*rocksDBBatch); !ok { // batches do not support reverse iteration 239 // Test that a new iterator's lower bound is applied. 
240 func() { 241 iter := e.NewIterator(IterOptions{LowerBound: roachpb.Key("b")}) 242 defer iter.Close() 243 iter.SeekLT(mvccKey("c")) 244 if ok, err := iter.Valid(); err != nil { 245 t.Fatal(err) 246 } else if !ok { 247 t.Fatalf("expected iterator to be valid, but was invalid") 248 } 249 iter.SeekLT(mvccKey("b")) 250 if ok, err := iter.Valid(); err != nil { 251 t.Fatal(err) 252 } else if ok { 253 t.Fatalf("expected iterator to be invalid, but was valid") 254 } 255 iter.SeekLT(mvccKey("a")) 256 if ok, err := iter.Valid(); err != nil { 257 t.Fatal(err) 258 } else if ok { 259 t.Fatalf("expected iterator to be invalid, but was valid") 260 } 261 }() 262 263 // Test that the cached iterator, if the underlying engine implementation 264 // caches iterators, can take on a new lower bound. 265 func() { 266 iter := e.NewIterator(IterOptions{LowerBound: roachpb.Key("a")}) 267 defer iter.Close() 268 269 iter.SeekLT(mvccKey("b")) 270 if ok, err := iter.Valid(); !ok { 271 t.Fatal(err) 272 } 273 if !mvccKey("a").Equal(iter.Key()) { 274 t.Fatalf("expected key a, but got %q", iter.Key()) 275 } 276 iter.Prev() 277 if ok, err := iter.Valid(); err != nil { 278 t.Fatal(err) 279 } else if ok { 280 t.Fatalf("expected iterator to be invalid, but was valid") 281 } 282 }() 283 } 284 285 // Test that a new iterator's upper bound is applied. 286 func() { 287 iter := e.NewIterator(IterOptions{UpperBound: roachpb.Key("a")}) 288 defer iter.Close() 289 iter.SeekGE(mvccKey("a")) 290 if ok, err := iter.Valid(); err != nil { 291 t.Fatal(err) 292 } else if ok { 293 t.Fatalf("expected iterator to be invalid, but was valid") 294 } 295 }() 296 297 // Test that the cached iterator, if the underlying engine implementation 298 // caches iterators, can take on a new upper bound. 
299 func() { 300 iter := e.NewIterator(IterOptions{UpperBound: roachpb.Key("b")}) 301 defer iter.Close() 302 303 iter.SeekGE(mvccKey("a")) 304 if ok, err := iter.Valid(); !ok { 305 t.Fatal(err) 306 } 307 if !mvccKey("a").Equal(iter.Key()) { 308 t.Fatalf("expected key a, but got %q", iter.Key()) 309 } 310 iter.Next() 311 if ok, err := iter.Valid(); err != nil { 312 t.Fatal(err) 313 } else if ok { 314 t.Fatalf("expected iterator to be invalid, but was valid") 315 } 316 }() 317 318 // Perform additional tests if the engine supports writes. 319 w, isReadWriter := e.(ReadWriter) 320 if _, isSecretlyReadOnly := e.(*rocksDBReadOnly); !isReadWriter || isSecretlyReadOnly { 321 return 322 } 323 if err := w.Put(mvccKey("c"), []byte("val")); err != nil { 324 t.Fatal(err) 325 } 326 func() { 327 iter := w.NewIterator(IterOptions{UpperBound: roachpb.Key("c")}) 328 defer iter.Close() 329 iter.SeekGE(mvccKey("c")) 330 if ok, err := iter.Valid(); err != nil { 331 t.Fatal(err) 332 } else if ok { 333 t.Fatalf("expected iterator to be invalid, but was valid") 334 } 335 }() 336 }) 337 } 338 } 339 340 func makeKey(i int) MVCCKey { 341 return MakeMVCCMetadataKey(roachpb.Key(strconv.Itoa(i))) 342 } 343 344 func benchmarkIterOnBatch(ctx context.Context, b *testing.B, writes int) { 345 engine := createTestRocksDBEngine() 346 defer engine.Close() 347 348 for i := 0; i < writes; i++ { 349 if err := engine.Put(makeKey(i), []byte(strconv.Itoa(i))); err != nil { 350 b.Fatal(err) 351 } 352 } 353 354 batch := engine.NewBatch() 355 defer batch.Close() 356 357 for i := 0; i < writes; i++ { 358 if err := batch.Clear(makeKey(i)); err != nil { 359 b.Fatal(err) 360 } 361 } 362 363 r := rand.New(rand.NewSource(5)) 364 365 b.ResetTimer() 366 for i := 0; i < b.N; i++ { 367 key := makeKey(r.Intn(writes)) 368 iter := batch.NewIterator(IterOptions{Prefix: true}) 369 iter.SeekGE(key) 370 iter.Close() 371 } 372 } 373 374 func benchmarkIterOnReadWriter( 375 b *testing.B, writes int, f func(Engine) ReadWriter, 
closeReadWriter bool, 376 ) { 377 engine := createTestRocksDBEngine() 378 defer engine.Close() 379 380 for i := 0; i < writes; i++ { 381 if err := engine.Put(makeKey(i), []byte(strconv.Itoa(i))); err != nil { 382 b.Fatal(err) 383 } 384 } 385 386 readWriter := f(engine) 387 if closeReadWriter { 388 defer readWriter.Close() 389 } 390 391 r := rand.New(rand.NewSource(5)) 392 393 b.ResetTimer() 394 for i := 0; i < b.N; i++ { 395 key := makeKey(r.Intn(writes)) 396 iter := readWriter.NewIterator(IterOptions{Prefix: true}) 397 iter.SeekGE(key) 398 iter.Close() 399 } 400 } 401 402 // TestRocksDBOpenWithVersions verifies the version checking in Open() 403 // functions correctly. 404 func TestRocksDBOpenWithVersions(t *testing.T) { 405 defer leaktest.AfterTest(t)() 406 407 testCases := []struct { 408 hasFile bool 409 ver Version 410 expectedErr string 411 }{ 412 {false, Version{}, ""}, 413 {true, Version{versionCurrent}, ""}, 414 {true, Version{versionMinimum}, ""}, 415 {true, Version{-1}, "incompatible rocksdb data version, current:2, on disk:-1, minimum:0"}, 416 {true, Version{3}, "incompatible rocksdb data version, current:2, on disk:3, minimum:0"}, 417 } 418 419 for i, testCase := range testCases { 420 err := openRocksDBWithVersion(t, testCase.hasFile, testCase.ver) 421 if !testutils.IsError(err, testCase.expectedErr) { 422 t.Errorf("%d: expected error '%s', actual '%v'", i, testCase.expectedErr, err) 423 } 424 } 425 } 426 427 // openRocksDBWithVersion attempts to open a rocks db instance, optionally with 428 // the supplied Version struct. 
429 func openRocksDBWithVersion(t *testing.T, hasVersionFile bool, ver Version) error { 430 dir, err := ioutil.TempDir("", "testing") 431 if err != nil { 432 t.Fatal(err) 433 } 434 defer func() { 435 if err := os.RemoveAll(dir); err != nil { 436 t.Fatal(err) 437 } 438 }() 439 440 if hasVersionFile { 441 b, err := json.Marshal(ver) 442 if err != nil { 443 t.Fatal(err) 444 } 445 if err := ioutil.WriteFile(getVersionFilename(dir), b, 0644); err != nil { 446 t.Fatal(err) 447 } 448 } 449 450 rocksdb, err := NewRocksDB( 451 RocksDBConfig{ 452 StorageConfig: base.StorageConfig{ 453 Settings: cluster.MakeTestingClusterSettings(), 454 Dir: dir, 455 }, 456 }, 457 RocksDBCache{}, 458 ) 459 if err == nil { 460 rocksdb.Close() 461 } 462 return err 463 } 464 465 func TestRocksDBApproximateDiskBytes(t *testing.T) { 466 defer leaktest.AfterTest(t)() 467 468 dir, cleanup := testutils.TempDir(t) 469 defer cleanup() 470 471 rocksdb, err := NewRocksDB( 472 RocksDBConfig{ 473 StorageConfig: base.StorageConfig{ 474 Settings: cluster.MakeTestingClusterSettings(), 475 Dir: dir, 476 }, 477 }, 478 RocksDBCache{}, 479 ) 480 if err != nil { 481 t.Fatal(err) 482 } 483 defer rocksdb.Close() 484 485 rnd, seed := randutil.NewPseudoRand() 486 487 log.Infof(context.Background(), "seed is %d", seed) 488 489 for i := 0; i < 10; i++ { 490 ts := hlc.Timestamp{WallTime: rnd.Int63()} 491 key := roachpb.Key(randutil.RandBytes(rnd, 1<<10)) 492 key = append(key, []byte(fmt.Sprintf("#%d", i))...) 
// make unique 493 value := roachpb.MakeValueFromBytes(randutil.RandBytes(rnd, 1<<20)) 494 value.InitChecksum(key) 495 if err := MVCCPut(context.Background(), rocksdb, nil, key, ts, value, nil); err != nil { 496 t.Fatal(err) 497 } 498 if err := rocksdb.Flush(); err != nil { 499 t.Fatal(err) 500 } 501 keyOnlySize, err := rocksdb.ApproximateDiskBytes(key, key.Next()) 502 if err != nil { 503 t.Fatal(err) 504 } 505 const mb = int64(1 << 20) 506 if min, max, act := mb/2, 2*mb, int64(keyOnlySize); act < min || act > max { 507 t.Fatalf("iteration %d: new kv pair estimated at %s; expected between %s and %s", 508 i+1, humanizeutil.IBytes(act), humanizeutil.IBytes(min), humanizeutil.IBytes(max)) 509 } 510 511 allSize, err := rocksdb.ApproximateDiskBytes(roachpb.KeyMin, roachpb.KeyMax) 512 if err != nil { 513 t.Fatal(err) 514 } 515 516 if min, max, act := int64(i)*mb, int64(i+2)*mb, int64(allSize); act < min || act > max { 517 t.Fatalf("iteration %d: total size estimated at %s; expected between %s and %s", 518 i+1, humanizeutil.IBytes(act), humanizeutil.IBytes(min), humanizeutil.IBytes(max)) 519 } 520 521 } 522 } 523 524 func TestSSTableInfosString(t *testing.T) { 525 defer leaktest.AfterTest(t)() 526 527 info := func(level int, size int64) SSTableInfo { 528 return SSTableInfo{ 529 Level: level, 530 Size: size, 531 } 532 } 533 tables := SSTableInfos{ 534 info(1, 7<<20), 535 info(1, 1<<20), 536 info(1, 63<<10), 537 info(2, 10<<20), 538 info(2, 8<<20), 539 info(2, 13<<20), 540 info(2, 31<<20), 541 info(2, 13<<20), 542 info(2, 30<<20), 543 info(2, 5<<20), 544 info(3, 129<<20), 545 info(3, 129<<20), 546 info(3, 129<<20), 547 info(3, 9<<20), 548 info(3, 129<<20), 549 info(3, 129<<20), 550 info(3, 129<<20), 551 info(3, 93<<20), 552 info(3, 129<<20), 553 info(3, 129<<20), 554 info(3, 122<<20), 555 info(3, 129<<20), 556 info(3, 129<<20), 557 info(3, 129<<20), 558 info(3, 129<<20), 559 info(3, 129<<20), 560 info(3, 129<<20), 561 info(3, 24<<20), 562 info(3, 18<<20), 563 } 564 expected 
:= `1 [ 8M 3 ]: 7M 1M 63K 565 2 [ 110M 7 ]: 31M 30M 13M[2] 10M 8M 5M 566 3 [ 2G 19 ]: 129M[14] 122M 93M 24M 18M 9M 567 ` 568 sort.Sort(tables) 569 s := tables.String() 570 if expected != s { 571 t.Fatalf("expected\n%s\ngot\n%s", expected, s) 572 } 573 } 574 575 func TestReadAmplification(t *testing.T) { 576 defer leaktest.AfterTest(t)() 577 578 info := func(level int, size int64) SSTableInfo { 579 return SSTableInfo{ 580 Level: level, 581 Size: size, 582 } 583 } 584 585 tables1 := SSTableInfos{ 586 info(0, 0), 587 info(0, 0), 588 info(0, 0), 589 info(1, 0), 590 } 591 if a, e := tables1.ReadAmplification(), 4; a != e { 592 t.Errorf("got %d, expected %d", a, e) 593 } 594 595 tables2 := SSTableInfos{ 596 info(0, 0), 597 info(1, 0), 598 info(2, 0), 599 info(3, 0), 600 } 601 if a, e := tables2.ReadAmplification(), 4; a != e { 602 t.Errorf("got %d, expected %d", a, e) 603 } 604 605 tables3 := SSTableInfos{ 606 info(1, 0), 607 info(0, 0), 608 info(0, 0), 609 info(0, 0), 610 info(1, 0), 611 info(1, 0), 612 info(2, 0), 613 info(3, 0), 614 info(6, 0), 615 } 616 if a, e := tables3.ReadAmplification(), 7; a != e { 617 t.Errorf("got %d, expected %d", a, e) 618 } 619 } 620 621 func TestInMemIllegalOption(t *testing.T) { 622 defer leaktest.AfterTest(t)() 623 624 cache := NewRocksDBCache(10 << 20 /* 10mb */) 625 defer cache.Release() 626 627 r := &RocksDB{ 628 cfg: RocksDBConfig{ 629 StorageConfig: base.StorageConfig{ 630 MustExist: true, 631 }, 632 }, 633 // dir: empty dir == "mem" RocksDB instance. 
634 cache: cache.ref(), 635 } 636 err := r.open() 637 const expErr = `could not open rocksdb instance: Invalid argument: ` + 638 `: does not exist \(create_if_missing is false\)` 639 if !testutils.IsError(err, expErr) { 640 t.Error(err) 641 } 642 } 643 644 func TestConcurrentBatch(t *testing.T) { 645 defer leaktest.AfterTest(t)() 646 647 if testutils.NightlyStress() || util.RaceEnabled { 648 t.Skip() 649 } 650 651 dir, err := ioutil.TempDir("", t.Name()) 652 if err != nil { 653 t.Fatal(err) 654 } 655 defer func() { 656 if err := os.RemoveAll(dir); err != nil { 657 t.Fatal(err) 658 } 659 }() 660 661 db, err := NewRocksDB( 662 RocksDBConfig{ 663 StorageConfig: base.StorageConfig{ 664 Settings: cluster.MakeTestingClusterSettings(), 665 Dir: dir, 666 }, 667 }, 668 RocksDBCache{}, 669 ) 670 if err != nil { 671 t.Fatalf("could not create new rocksdb db instance at %s: %+v", dir, err) 672 } 673 defer db.Close() 674 675 // Prepare 16 4 MB batches containing non-overlapping contents. 676 var batches []Batch 677 for i := 0; i < 16; i++ { 678 batch := db.NewBatch() 679 for j := 0; true; j++ { 680 key := encoding.EncodeUvarintAscending([]byte("bar"), uint64(i)) 681 key = encoding.EncodeUvarintAscending(key, uint64(j)) 682 if err := batch.Put(MakeMVCCMetadataKey(key), nil); err != nil { 683 t.Fatal(err) 684 } 685 const targetSize = 4 << 20 686 if targetSize < maxBatchGroupSize { 687 t.Fatalf("target size (%d) should be larger than the max batch group size (%d)", 688 targetSize, maxBatchGroupSize) 689 } 690 if batch.Len() >= targetSize { 691 break 692 } 693 } 694 batches = append(batches, batch) 695 } 696 697 errChan := make(chan error, len(batches)) 698 699 // Concurrently write all the batches. 700 for _, batch := range batches { 701 go func(batch Batch) { 702 errChan <- batch.Commit(false /* sync */) 703 }(batch) 704 } 705 706 // While the batch writes are in progress, try to write another key. 
707 time.Sleep(100 * time.Millisecond) 708 remainingBatches := len(batches) 709 for i := 0; remainingBatches > 0; i++ { 710 select { 711 case err := <-errChan: 712 if err != nil { 713 t.Fatal(err) 714 } 715 remainingBatches-- 716 default: 717 } 718 719 // This write can get delayed excessively if we hit the max memtable count 720 // or the L0 stop writes threshold. 721 start := timeutil.Now() 722 key := encoding.EncodeUvarintAscending([]byte("foo"), uint64(i)) 723 if err := db.Put(MakeMVCCMetadataKey(key), nil); err != nil { 724 t.Fatal(err) 725 } 726 if elapsed := timeutil.Since(start); elapsed >= 10*time.Second { 727 t.Fatalf("write took %0.1fs\n", elapsed.Seconds()) 728 } 729 } 730 } 731 732 // TestRocksDBSstFileWriterTruncate ensures that sum of the chunks created by 733 // calling Truncate on a RocksDBSstFileWriter is equivalent to an SST built 734 // without ever calling Truncate. 735 func TestRocksDBSstFileWriterTruncate(t *testing.T) { 736 defer leaktest.AfterTest(t)() 737 738 // Truncate will be used on this writer. 739 sst1, err := MakeRocksDBSstFileWriter() 740 if err != nil { 741 t.Fatal(err) 742 } 743 defer sst1.Close() 744 745 // Truncate will not be used on this writer. 
746 sst2, err := MakeRocksDBSstFileWriter() 747 if err != nil { 748 t.Fatal(err) 749 } 750 defer sst2.Close() 751 752 const keyLen = 10 753 const valLen = 950 754 ts := hlc.Timestamp{WallTime: 1} 755 key := MVCCKey{Key: roachpb.Key(make([]byte, keyLen)), Timestamp: ts} 756 value := make([]byte, valLen) 757 758 var resBuf1, resBuf2 []byte 759 const entries = 100000 760 const truncateChunk = entries / 10 761 for i := 0; i < entries; i++ { 762 key.Key = []byte(fmt.Sprintf("%09d", i)) 763 copy(value, key.Key) 764 765 if err := sst1.Put(key, value); err != nil { 766 t.Fatal(err) 767 } 768 if err := sst2.Put(key, value); err != nil { 769 t.Fatal(err) 770 } 771 772 if i > 0 && i%truncateChunk == 0 { 773 sst1Chunk, err := sst1.Truncate() 774 if err != nil { 775 t.Fatal(err) 776 } 777 t.Logf("iteration %d, truncate chunk\tlen=%d", i, len(sst1Chunk)) 778 779 // Even though we added keys, it is not guaranteed strictly by the 780 // contract of Truncate that a byte slice will be returned. This is 781 // because the keys may be in un-flushed blocks. This test had been tuned 782 // such that every other batch chunk is always large enough to require at 783 // least one block to be flushed. 784 empty := len(sst1Chunk) == 0 785 if i%(2*truncateChunk) == 0 { 786 if empty { 787 t.Fatalf("expected non-empty SST chunk during iteration %d", i) 788 } 789 resBuf1 = append(resBuf1, sst1Chunk...) 790 } else { 791 if !empty { 792 t.Fatalf("expected empty SST chunk during iteration %d", i) 793 } 794 } 795 } 796 } 797 798 sst1FinishBuf, err := sst1.Finish() 799 if err != nil { 800 t.Fatal(err) 801 } 802 resBuf1 = append(resBuf1, sst1FinishBuf...) 
803 t.Logf("truncated sst final chunk\t\tlen=%d", len(sst1FinishBuf)) 804 805 resBuf2, err = sst2.Finish() 806 if err != nil { 807 t.Fatal(err) 808 } 809 t.Logf("non-truncated sst final chunk\tlen=%d", len(resBuf2)) 810 811 if !bytes.Equal(resBuf1, resBuf2) { 812 t.Errorf("expected SST made up of truncate chunks (len=%d) to be equivalent to SST that "+ 813 "was not (len=%d)", len(sst1FinishBuf), len(resBuf2)) 814 } 815 } 816 817 func BenchmarkRocksDBSstFileWriter(b *testing.B) { 818 dir, err := ioutil.TempDir("", "BenchmarkRocksDBSstFileWriter") 819 if err != nil { 820 b.Fatal(err) 821 } 822 defer func() { 823 if err := os.RemoveAll(dir); err != nil { 824 b.Fatal(err) 825 } 826 }() 827 828 const maxEntries = 100000 829 const keyLen = 10 830 const valLen = 100 831 ts := hlc.Timestamp{WallTime: timeutil.Now().UnixNano()} 832 kv := MVCCKeyValue{ 833 Key: MVCCKey{Key: roachpb.Key(make([]byte, keyLen)), Timestamp: ts}, 834 Value: make([]byte, valLen), 835 } 836 837 b.ResetTimer() 838 sst, err := MakeRocksDBSstFileWriter() 839 if err != nil { 840 b.Fatal(sst) 841 } 842 defer sst.Close() 843 for i := 1; i <= b.N; i++ { 844 if i%maxEntries == 0 { 845 if _, err := sst.Finish(); err != nil { 846 b.Fatal(err) 847 } 848 sst, err = MakeRocksDBSstFileWriter() 849 if err != nil { 850 b.Fatal(sst) 851 } 852 defer sst.Close() 853 } 854 855 b.StopTimer() 856 kv.Key.Key = []byte(fmt.Sprintf("%09d", i)) 857 copy(kv.Value, kv.Key.Key) 858 b.StartTimer() 859 if err := sst.Put(kv.Key, kv.Value); err != nil { 860 b.Fatal(err) 861 } 862 } 863 b.SetBytes(keyLen + valLen) 864 } 865 866 func BenchmarkRocksDBSstFileReader(b *testing.B) { 867 dir, err := ioutil.TempDir("", "BenchmarkRocksDBSstFileReader") 868 if err != nil { 869 b.Fatal(err) 870 } 871 defer func() { 872 if err := os.RemoveAll(dir); err != nil { 873 b.Fatal(err) 874 } 875 }() 876 877 var sstContents []byte 878 { 879 const maxEntries = 100000 880 const keyLen = 10 881 const valLen = 100 882 b.SetBytes(keyLen + valLen) 883 884 ts 
:= hlc.Timestamp{WallTime: timeutil.Now().UnixNano()} 885 kv := MVCCKeyValue{ 886 Key: MVCCKey{Key: roachpb.Key(make([]byte, keyLen)), Timestamp: ts}, 887 Value: make([]byte, valLen), 888 } 889 890 sst, err := MakeRocksDBSstFileWriter() 891 if err != nil { 892 b.Fatal(sst) 893 } 894 defer sst.Close() 895 var entries = b.N 896 if entries > maxEntries { 897 entries = maxEntries 898 } 899 for i := 0; i < entries; i++ { 900 kv.Key.Key = []byte(fmt.Sprintf("%09d", i)) 901 copy(kv.Value, kv.Key.Key) 902 if err := sst.Put(kv.Key, kv.Value); err != nil { 903 b.Fatal(err) 904 } 905 } 906 sstContents, err = sst.Finish() 907 if err != nil { 908 b.Fatal(err) 909 } 910 } 911 912 b.ResetTimer() 913 sst := MakeRocksDBSstFileReader() 914 defer sst.Close() 915 916 if err := sst.IngestExternalFile(sstContents); err != nil { 917 b.Fatal(err) 918 } 919 count := 0 920 iterateFn := func(kv MVCCKeyValue) (bool, error) { 921 count++ 922 if count >= b.N { 923 return true, nil 924 } 925 return false, nil 926 } 927 for { 928 if err := sst.Iterate(keys.MinKey, keys.MaxKey, iterateFn); err != nil { 929 b.Fatal(err) 930 } 931 if count >= b.N { 932 break 933 } 934 } 935 } 936 937 func key(s string) MVCCKey { 938 return MakeMVCCMetadataKey([]byte(s)) 939 } 940 941 // Regression test for https://github.com/facebook/rocksdb/issues/2752. Range 942 // deletion tombstones between different snapshot stripes are not stored in 943 // order, so the first tombstone of each snapshot stripe should be checked as a 944 // smallest candidate. 
945 func TestRocksDBDeleteRangeBug(t *testing.T) { 946 defer leaktest.AfterTest(t)() 947 dir, dirCleanup := testutils.TempDir(t) 948 defer dirCleanup() 949 950 db, err := NewRocksDB( 951 RocksDBConfig{ 952 StorageConfig: base.StorageConfig{ 953 Settings: cluster.MakeTestingClusterSettings(), 954 Dir: dir, 955 }, 956 }, 957 RocksDBCache{}, 958 ) 959 if err != nil { 960 t.Fatalf("could not create new rocksdb db instance at %s: %+v", dir, err) 961 } 962 defer db.Close() 963 964 if err := db.Put(key("a"), []byte("a")); err != nil { 965 t.Fatal(err) 966 } 967 if err := db.Flush(); err != nil { 968 t.Fatal(err) 969 } 970 if err := db.Compact(); err != nil { 971 t.Fatal(err) 972 } 973 974 func() { 975 if err := db.ClearRange(key("b"), key("c")); err != nil { 976 t.Fatal(err) 977 } 978 // Hold a snapshot to separate these two delete ranges. 979 snap := db.NewSnapshot() 980 defer snap.Close() 981 if err := db.ClearRange(key("a"), key("b")); err != nil { 982 t.Fatal(err) 983 } 984 if err := db.Flush(); err != nil { 985 t.Fatal(err) 986 } 987 }() 988 989 if err := db.Compact(); err != nil { 990 t.Fatal(err) 991 } 992 993 iter := db.NewIterator(IterOptions{UpperBound: roachpb.KeyMax}) 994 iter.SeekGE(key("a")) 995 if ok, _ := iter.Valid(); ok { 996 t.Fatalf("unexpected key: %s", iter.Key()) 997 } 998 iter.Close() 999 } 1000 1001 func createTestSSTableInfos() SSTableInfos { 1002 ssti := SSTableInfos{ 1003 // Level 0. 1004 {Level: 0, Size: 20, Start: key("a"), End: key("z")}, 1005 {Level: 0, Size: 15, Start: key("a"), End: key("k")}, 1006 // Level 1. 1007 {Level: 1, Size: 200, Start: key("a"), End: key("j")}, 1008 {Level: 1, Size: 100, Start: key("k"), End: key("o")}, 1009 {Level: 1, Size: 100, Start: key("r"), End: key("t")}, 1010 // Level 2. 
1011 {Level: 2, Size: 201, Start: key("a"), End: key("c")}, 1012 {Level: 2, Size: 200, Start: key("d"), End: key("f")}, 1013 {Level: 2, Size: 300, Start: key("h"), End: key("r")}, 1014 {Level: 2, Size: 405, Start: key("s"), End: key("z")}, 1015 // Level 3. 1016 {Level: 3, Size: 667, Start: key("a"), End: key("c")}, 1017 {Level: 3, Size: 230, Start: key("d"), End: key("f")}, 1018 {Level: 3, Size: 332, Start: key("h"), End: key("i")}, 1019 {Level: 3, Size: 923, Start: key("k"), End: key("n")}, 1020 {Level: 3, Size: 143, Start: key("n"), End: key("o")}, 1021 {Level: 3, Size: 621, Start: key("p"), End: key("s")}, 1022 {Level: 3, Size: 411, Start: key("u"), End: key("x")}, 1023 // Level 4. 1024 {Level: 4, Size: 215, Start: key("a"), End: key("b")}, 1025 {Level: 4, Size: 211, Start: key("b"), End: key("d")}, 1026 {Level: 4, Size: 632, Start: key("e"), End: key("f")}, 1027 {Level: 4, Size: 813, Start: key("f"), End: key("h")}, 1028 {Level: 4, Size: 346, Start: key("h"), End: key("j")}, 1029 {Level: 4, Size: 621, Start: key("j"), End: key("l")}, 1030 {Level: 4, Size: 681, Start: key("m"), End: key("o")}, 1031 {Level: 4, Size: 521, Start: key("o"), End: key("r")}, 1032 {Level: 4, Size: 135, Start: key("r"), End: key("t")}, 1033 {Level: 4, Size: 622, Start: key("t"), End: key("v")}, 1034 {Level: 4, Size: 672, Start: key("x"), End: key("z")}, 1035 } 1036 sort.Sort(ssti) 1037 return ssti 1038 } 1039 1040 func TestSSTableInfosByLevel(t *testing.T) { 1041 defer leaktest.AfterTest(t)() 1042 ssti := NewSSTableInfosByLevel(createTestSSTableInfos()) 1043 1044 // First, verify that each level is sorted by start key, not size. 
1045 for level, l := range ssti.levels { 1046 if level == 0 { 1047 continue 1048 } 1049 lastInfo := l[0] 1050 for _, info := range l[1:] { 1051 if !lastInfo.Start.Less(info.Start) { 1052 t.Errorf("sort failed (%s >= %s) for level %d", lastInfo.Start, info.Start, level) 1053 } 1054 } 1055 } 1056 if a, e := ssti.MaxLevel(), 4; a != e { 1057 t.Errorf("expected MaxLevel() == %d; got %d", e, a) 1058 } 1059 1060 // Next, verify various contiguous overlap scenarios. 1061 testCases := []struct { 1062 span roachpb.Span 1063 expMaxLevel int 1064 }{ 1065 // The full a-z span overlaps more than two SSTables at all levels L1-L4 1066 {span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")}, expMaxLevel: 0}, 1067 // The a-j span overlaps the first three SSTables in L2, so max level is L1. 1068 {span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("j")}, expMaxLevel: 1}, 1069 // The k-o span overlaps only two adjacent L4 SSTs: j-l & m-o. 1070 {span: roachpb.Span{Key: roachpb.Key("k"), EndKey: roachpb.Key("o")}, expMaxLevel: 4}, 1071 // The K0-o0 span hits three SSTs in L4: j-l, m-o, & o-r. 1072 {span: roachpb.Span{Key: roachpb.Key("k0"), EndKey: roachpb.Key("o0")}, expMaxLevel: 3}, 1073 // The k-z span overlaps the last 4 SSTs in L3. 1074 {span: roachpb.Span{Key: roachpb.Key("k"), EndKey: roachpb.Key("z")}, expMaxLevel: 2}, 1075 // The c-c0 span overlaps only the second L4 SST. 1076 {span: roachpb.Span{Key: roachpb.Key("c"), EndKey: roachpb.Key("c0")}, expMaxLevel: 4}, 1077 // The a-f span full overlaps the first three L4 SSTs. 1078 {span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("f")}, expMaxLevel: 3}, 1079 // The a-d0 span only overlaps the first two L4 SSTs. 1080 {span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("d0")}, expMaxLevel: 4}, 1081 // The a-e span only overlaps the first two L4 SSTs. It only is adjacent to the 3rd. 
1082 {span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("e")}, expMaxLevel: 4}, 1083 // The a-d span overlaps fully the first two L4 SSTs. 1084 {span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("d")}, expMaxLevel: 4}, 1085 // The a-a0 span overlaps only the first L4 SST. 1086 {span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("a0")}, expMaxLevel: 4}, 1087 // The 0-1 span doesn't overlap any L4 SSTs. 1088 {span: roachpb.Span{Key: roachpb.Key("0"), EndKey: roachpb.Key("1")}, expMaxLevel: 4}, 1089 // The Z-a span doesn't overlap any L4 SSTs, just touches the start of the first. 1090 {span: roachpb.Span{Key: roachpb.Key("Z"), EndKey: roachpb.Key("a")}, expMaxLevel: 4}, 1091 // The Z-a0 span overlaps only the first L4 SST. 1092 {span: roachpb.Span{Key: roachpb.Key("Z"), EndKey: roachpb.Key("a0")}, expMaxLevel: 4}, 1093 // The z-z0 span doesn't overlap any L4 SSTs, just touches the end of the last. 1094 {span: roachpb.Span{Key: roachpb.Key("z"), EndKey: roachpb.Key("z0")}, expMaxLevel: 4}, 1095 // The y-z0 span overlaps the last L4 SST. 
1096 {span: roachpb.Span{Key: roachpb.Key("y"), EndKey: roachpb.Key("z0")}, expMaxLevel: 4}, 1097 } 1098 1099 for _, test := range testCases { 1100 t.Run(fmt.Sprintf("%s-%s", test.span.Key, test.span.EndKey), func(t *testing.T) { 1101 maxLevel := ssti.MaxLevelSpanOverlapsContiguousSSTables(test.span) 1102 if test.expMaxLevel != maxLevel { 1103 t.Errorf("expected max level %d; got %d", test.expMaxLevel, maxLevel) 1104 } 1105 }) 1106 } 1107 }

// TestRocksDBOptions opens an on-disk RocksDB instance with a custom
// RocksDBOptions string, closes it, and then checks that every OPTIONS-* file
// found in the store directory contains each of the requested settings.
func TestRocksDBOptions(t *testing.T) {
	defer leaktest.AfterTest(t)()

	dir, err := ioutil.TempDir("", "testing")
	if err != nil {
		t.Fatal(err)
	}
	defer func() {
		if err := os.RemoveAll(dir); err != nil {
			t.Fatal(err)
		}
	}()
	rocksdb, err := NewRocksDB(
		RocksDBConfig{
			StorageConfig: base.StorageConfig{
				Settings: cluster.MakeTestingClusterSettings(),
				Dir:      dir,
			},
			RocksDBOptions: "use_fsync=true;" +
				"min_write_buffer_number_to_merge=2;" +
				"block_based_table_factory={block_size=4k}",
		},
		RocksDBCache{},
	)
	if err != nil {
		t.Fatal(err)
	}
	rocksdb.Close()

	paths, err := filepath.Glob(filepath.Join(dir, "OPTIONS-*"))
	if err != nil {
		t.Fatal(err)
	}
	// Without this check the loop below would not run at all and the test
	// would pass without having verified anything.
	if len(paths) == 0 {
		t.Fatalf("no OPTIONS-* files found in %s", dir)
	}

	// The settings as they are expected to appear in the OPTIONS file. Note
	// that the "4k" block size passed above is looked up as "4096".
	options := []string{
		"use_fsync=true",
		"min_write_buffer_number_to_merge=2",
		"block_size=4096",
	}
	for _, p := range paths {
		data, err := ioutil.ReadFile(p)
		if err != nil {
			t.Fatal(err)
		}

		for _, o := range options {
			// Match the option surrounded by leading whitespace and a trailing
			// newline so that a longer option name or value does not satisfy
			// the check by accident.
			fullOption := fmt.Sprintf(" %s\n", o)
			if !bytes.Contains(data, []byte(fullOption)) {
				t.Errorf("unable to find %s in %s", o, p)
			}
		}
	}
}

// Verify that range tombstones do not result in sstables that cover an
// excessively large portion of the key space.
1164 func TestRocksDBDeleteRangeCompaction(t *testing.T) { 1165 defer leaktest.AfterTest(t)() 1166 1167 db := setupMVCCInMemRocksDB(t, "delrange") 1168 defer db.Close() 1169 1170 // Disable automatic compactions which interfere with test expectations 1171 // below. 1172 if err := db.(*RocksDB).disableAutoCompaction(); err != nil { 1173 t.Fatal(err) 1174 } 1175 1176 makeKey := func(prefix string, i int) roachpb.Key { 1177 return roachpb.Key(fmt.Sprintf("%s%09d", prefix, i)) 1178 } 1179 1180 rnd, _ := randutil.NewPseudoRand() 1181 1182 // Create sstables in L6 that are half the L6 target size. Any smaller and 1183 // RocksDB might choose to compact them. 1184 const targetSize = 64 << 20 1185 const numEntries = 10000 1186 const keySize = 10 1187 const valueSize = (targetSize / numEntries) - keySize 1188 1189 for _, p := range "abc" { 1190 sst, err := MakeRocksDBSstFileWriter() 1191 if err != nil { 1192 t.Fatal(sst) 1193 } 1194 defer sst.Close() 1195 1196 for i := 0; i < numEntries; i++ { 1197 if err := sst.Put(MVCCKey{Key: makeKey(string(p), i)}, randutil.RandBytes(rnd, valueSize)); err != nil { 1198 t.Fatal(err) 1199 } 1200 } 1201 1202 sstContents, err := sst.Finish() 1203 if err != nil { 1204 t.Fatal(err) 1205 } 1206 1207 filename := fmt.Sprintf("ingest") 1208 if err := db.WriteFile(filename, sstContents); err != nil { 1209 t.Fatal(err) 1210 } 1211 1212 if err := db.IngestExternalFiles(context.Background(), []string{filename}); err != nil { 1213 t.Fatal(err) 1214 } 1215 if testing.Verbose() { 1216 fmt.Printf("ingested %s\n", string(p)) 1217 } 1218 } 1219 1220 getSSTables := func() string { 1221 ssts := db.GetSSTables() 1222 sort.Slice(ssts, func(i, j int) bool { 1223 a, b := ssts[i], ssts[j] 1224 if a.Level < b.Level { 1225 return true 1226 } 1227 if a.Level > b.Level { 1228 return false 1229 } 1230 return a.Start.Less(b.Start) 1231 }) 1232 var buf bytes.Buffer 1233 fmt.Fprintf(&buf, "\n") 1234 for i := range ssts { 1235 fmt.Fprintf(&buf, "%d: %s - %s\n", 1236 
ssts[i].Level, ssts[i].Start.Key, ssts[i].End.Key) 1237 } 1238 return buf.String() 1239 } 1240 1241 verifySSTables := func(expected string) { 1242 actual := getSSTables() 1243 if expected != actual { 1244 t.Fatalf("expected%sgot%s", expected, actual) 1245 } 1246 if testing.Verbose() { 1247 fmt.Printf("%s", actual) 1248 } 1249 } 1250 1251 // After setup there should be 3 sstables. 1252 verifySSTables(` 1253 6: "a000000000" - "a000009999" 1254 6: "b000000000" - "b000009999" 1255 6: "c000000000" - "c000009999" 1256 `) 1257 1258 // Generate a batch which writes to the very first key, and then deletes the 1259 // range of keys covered by the last sstable. 1260 batch := db.NewBatch() 1261 if err := batch.Put(MakeMVCCMetadataKey(makeKey("a", 0)), []byte("hello")); err != nil { 1262 t.Fatal(err) 1263 } 1264 if err := batch.ClearRange(MakeMVCCMetadataKey(makeKey("c", 0)), 1265 MakeMVCCMetadataKey(makeKey("c", numEntries))); err != nil { 1266 t.Fatal(err) 1267 } 1268 if err := batch.Commit(true); err != nil { 1269 t.Fatal(err) 1270 } 1271 batch.Close() 1272 if err := db.Flush(); err != nil { 1273 t.Fatal(err) 1274 } 1275 1276 // After flushing, there is a single additional L0 table that covers the 1277 // entire key range. 1278 verifySSTables(` 1279 0: "a000000000" - "c000010000" 1280 6: "a000000000" - "a000009999" 1281 6: "b000000000" - "b000009999" 1282 6: "c000000000" - "c000009999" 1283 `) 1284 1285 // Compacting the key range covering the last sstable should result in that 1286 // sstable being deleted. Prior to the hack in dbClearRange, all of the 1287 // sstables would be compacted resulting in 2 L6 sstables with different 1288 // boundaries than the ones below. 
1289 _ = db.CompactRange(makeKey("c", 0), makeKey("c", numEntries), false) 1290 verifySSTables(` 1291 5: "a000000000" - "a000000000" 1292 6: "a000000000" - "a000009999" 1293 6: "b000000000" - "b000009999" 1294 `) 1295 } 1296 1297 func BenchmarkRocksDBDeleteRangeIterate(b *testing.B) { 1298 for _, entries := range []int{10, 1000, 100000} { 1299 b.Run(fmt.Sprintf("entries=%d", entries), func(b *testing.B) { 1300 for _, deleted := range []int{entries, entries - 1} { 1301 b.Run(fmt.Sprintf("deleted=%d", deleted), func(b *testing.B) { 1302 db := setupMVCCInMemRocksDB(b, "unused") 1303 defer db.Close() 1304 1305 makeKey := func(i int) roachpb.Key { 1306 return roachpb.Key(fmt.Sprintf("%09d", i)) 1307 } 1308 1309 // Create an SST with N entries and ingest it. This is a fast way to get a 1310 // lot of entries into RocksDB. 1311 { 1312 sst, err := MakeRocksDBSstFileWriter() 1313 if err != nil { 1314 b.Fatal(sst) 1315 } 1316 defer sst.Close() 1317 1318 for i := 0; i < entries; i++ { 1319 if err := sst.Put(MVCCKey{Key: makeKey(i)}, nil); err != nil { 1320 b.Fatal(err) 1321 } 1322 } 1323 1324 sstContents, err := sst.Finish() 1325 if err != nil { 1326 b.Fatal(err) 1327 } 1328 1329 filename := fmt.Sprintf("ingest") 1330 if err := db.WriteFile(filename, sstContents); err != nil { 1331 b.Fatal(err) 1332 } 1333 1334 err = db.IngestExternalFiles(context.Background(), []string{filename}) 1335 if err != nil { 1336 b.Fatal(err) 1337 } 1338 } 1339 1340 // Create a range tombstone that deletes most (or all) of those entries. 
1341 from := makeKey(0) 1342 to := makeKey(deleted) 1343 if err := db.ClearRange(MakeMVCCMetadataKey(from), MakeMVCCMetadataKey(to)); err != nil { 1344 b.Fatal(err) 1345 } 1346 1347 b.ResetTimer() 1348 for i := 0; i < b.N; i++ { 1349 iter := db.NewIterator(IterOptions{UpperBound: roachpb.KeyMax}) 1350 iter.SeekGE(MakeMVCCMetadataKey(from)) 1351 ok, err := iter.Valid() 1352 if err != nil { 1353 b.Fatal(err) 1354 } 1355 if deleted < entries { 1356 if !ok { 1357 b.Fatal("key not found") 1358 } 1359 } else if ok { 1360 b.Fatal("unexpected key found") 1361 } 1362 iter.Close() 1363 } 1364 }) 1365 } 1366 }) 1367 } 1368 } 1369 1370 func TestMakeBatchGroup(t *testing.T) { 1371 defer leaktest.AfterTest(t)() 1372 1373 // Assume every newly instantiated batch has size 12 (header only). 1374 testCases := []struct { 1375 maxSize int 1376 groupSize []int 1377 leader []bool 1378 groups []int 1379 }{ 1380 {1, []int{12, 12, 12}, []bool{true, true, true}, []int{1, 1, 1}}, 1381 {23, []int{12, 12, 12}, []bool{true, true, true}, []int{1, 1, 1}}, 1382 {24, []int{12, 24, 12}, []bool{true, false, true}, []int{2, 1}}, 1383 {35, []int{12, 24, 12}, []bool{true, false, true}, []int{2, 1}}, 1384 {36, []int{12, 24, 36}, []bool{true, false, false}, []int{3}}, 1385 { 1386 48, 1387 []int{12, 24, 36, 48, 12}, 1388 []bool{true, false, false, false, true}, 1389 []int{4, 1}, 1390 }, 1391 } 1392 for _, c := range testCases { 1393 t.Run("", func(t *testing.T) { 1394 var pending []*rocksDBBatch 1395 var groupSize int 1396 for i := range c.groupSize { 1397 b := &rocksDBBatch{} 1398 var leader bool 1399 pending, groupSize, leader = makeBatchGroup(pending, b, groupSize, c.maxSize) 1400 if c.groupSize[i] != groupSize { 1401 t.Fatalf("expected group size %d, but found %d", c.groupSize[i], groupSize) 1402 } 1403 if c.leader[i] != leader { 1404 t.Fatalf("expected leader %t, but found %t", c.leader[i], leader) 1405 } 1406 } 1407 var groups []int 1408 for len(pending) > 0 { 1409 var group []*rocksDBBatch 1410 
group, pending = nextBatchGroup(pending) 1411 groups = append(groups, len(group)) 1412 } 1413 if !reflect.DeepEqual(c.groups, groups) { 1414 t.Fatalf("expected %d, but found %d", c.groups, groups) 1415 } 1416 }) 1417 } 1418 } 1419 1420 // Verify that RocksDBSstFileWriter works with time bounded iterators. 1421 func TestSstFileWriterTimeBound(t *testing.T) { 1422 defer leaktest.AfterTest(t)() 1423 1424 ctx := context.Background() 1425 db := setupMVCCInMemRocksDB(t, "sstwriter-timebound") 1426 defer db.Close() 1427 1428 for walltime := int64(1); walltime < 5; walltime++ { 1429 sst, err := MakeRocksDBSstFileWriter() 1430 if err != nil { 1431 t.Fatal(sst) 1432 } 1433 defer sst.Close() 1434 if err := sst.Put( 1435 MVCCKey{Key: []byte("key"), Timestamp: hlc.Timestamp{WallTime: walltime}}, 1436 []byte("value"), 1437 ); err != nil { 1438 t.Fatal(err) 1439 } 1440 sstContents, err := sst.Finish() 1441 if err != nil { 1442 t.Fatal(err) 1443 } 1444 if err := db.WriteFile(`ingest`, sstContents); err != nil { 1445 t.Fatal(err) 1446 } 1447 if err := db.IngestExternalFiles(ctx, []string{`ingest`}); err != nil { 1448 t.Fatal(err) 1449 } 1450 } 1451 1452 it := db.NewIterator(IterOptions{ 1453 UpperBound: keys.MaxKey, 1454 MinTimestampHint: hlc.Timestamp{WallTime: 2}, 1455 MaxTimestampHint: hlc.Timestamp{WallTime: 3}, 1456 WithStats: true, 1457 }) 1458 defer it.Close() 1459 for it.SeekGE(MVCCKey{Key: keys.MinKey}); ; it.Next() { 1460 ok, err := it.Valid() 1461 if err != nil { 1462 t.Fatal(err) 1463 } 1464 if !ok { 1465 break 1466 } 1467 } 1468 if s := it.Stats(); s.TimeBoundNumSSTs != 2 { 1469 t.Errorf(`expected 2 sstables got %d`, s.TimeBoundNumSSTs) 1470 } 1471 } 1472 1473 // TestRocksDBWALFileEmptyBatch verifies that committing an empty batch does 1474 // not write an entry to RocksDB's write-ahead log. 
1475 func TestRocksDBWALFileEmptyBatch(t *testing.T) { 1476 defer leaktest.AfterTest(t)() 1477 1478 dir, cleanup := testutils.TempDir(t) 1479 defer cleanup() 1480 1481 // NB: The in-mem RocksDB instance doesn't support syncing the WAL which is 1482 // necessary for this test. 1483 e, err := NewRocksDB( 1484 RocksDBConfig{ 1485 StorageConfig: base.StorageConfig{ 1486 Settings: cluster.MakeTestingClusterSettings(), 1487 Dir: dir, 1488 }, 1489 }, 1490 RocksDBCache{}, 1491 ) 1492 if err != nil { 1493 t.Fatal(err) 1494 } 1495 defer e.Close() 1496 1497 // Commit a batch with one key. 1498 b := e.NewBatch() 1499 defer b.Close() 1500 if err := b.Put(mvccKey("foo"), []byte{'b', 'a', 'r'}); err != nil { 1501 t.Fatal(err) 1502 } 1503 if err := b.Commit(true /* sync */); err != nil { 1504 t.Fatal(err) 1505 } 1506 1507 // Verify that RocksDB has created a non-empty WAL. 1508 walsBefore, err := e.GetSortedWALFiles() 1509 if err != nil { 1510 t.Fatal(err) 1511 } 1512 if len(walsBefore) != 1 { 1513 t.Fatalf("expected exactly one WAL file, but got %d", len(walsBefore)) 1514 } 1515 if walsBefore[0].Size == 0 { 1516 t.Fatalf("expected non-empty WAL file") 1517 } 1518 1519 // Commit an empty batch. 1520 b = e.NewBatch() 1521 defer b.Close() 1522 if err := b.Commit(true /* sync */); err != nil { 1523 t.Fatal(err) 1524 } 1525 1526 // Verify that the WAL has not changed in size. 1527 walsAfter, err := e.GetSortedWALFiles() 1528 if err != nil { 1529 t.Fatal(err) 1530 } 1531 if !reflect.DeepEqual(walsBefore, walsAfter) { 1532 t.Fatalf("expected wal files %#v after committing empty batch, but got %#v", 1533 walsBefore, walsAfter) 1534 } 1535 1536 // Regression test a bug that would accidentally make Commit a no-op (via an 1537 // errant fast-path) when a batch contained only LogData. 
1538 testutils.RunTrueAndFalse(t, "distinct", func(t *testing.T, distinct bool) { 1539 walsBefore, err := e.GetSortedWALFiles() 1540 if err != nil { 1541 t.Fatal(err) 1542 } 1543 if len(walsBefore) != 1 { 1544 t.Fatalf("expected one WAL file, got %d", len(walsBefore)) 1545 } 1546 1547 batch := e.NewBatch() 1548 defer batch.Close() 1549 1550 var rw ReadWriter = batch 1551 if distinct { 1552 // NB: we can't actually close this distinct batch because it auto- 1553 // closes when the batch commits. 1554 rw = batch.Distinct() 1555 } 1556 1557 if err := rw.LogData([]byte("foo")); err != nil { 1558 t.Fatal(err) 1559 } 1560 if batch.Empty() { 1561 t.Error("batch is not empty") 1562 } 1563 1564 if err := batch.Commit(true /* sync */); err != nil { 1565 t.Fatal(err) 1566 } 1567 1568 // Verify that the WAL has grown. 1569 walsAfter, err := e.GetSortedWALFiles() 1570 if err != nil { 1571 t.Fatal(err) 1572 } 1573 1574 if len(walsAfter) != 1 { 1575 t.Fatalf("expected one WAL file, got %+v", walsAfter) 1576 } 1577 1578 if after, before := walsAfter[0].Size, walsBefore[0].Size; after <= before { 1579 t.Fatalf("wal size was expected to increase, got %d -> %d", before, after) 1580 } 1581 }) 1582 } 1583 1584 // Regression test for https://github.com/facebook/rocksdb/issues/6666. 1585 func TestRocksDBGlobalSeqnumIssue(t *testing.T) { 1586 defer leaktest.AfterTest(t)() 1587 1588 tempDir, cleanup := testutils.TempDir(t) 1589 defer cleanup() 1590 db := setupMVCCRocksDB(t, tempDir) 1591 defer db.Close() 1592 1593 keyBase := []byte("ab") 1594 valBase := []byte("foobar") 1595 valBase2 := []byte("barfoo") 1596 key0 := MVCCKey{Key: []byte("aa")} 1597 1598 // When encoded, this MVCC key is 0x616200, trailer 0x0000000000000001 (seqnum 1599 // 0, key type 1 or SET), which gets encoded as little endian. 
1600 // Including the trailer, this key is encoded internally as 1601 // 0x6162000100000000000000 1602 key1 := MVCCKey{Key: keyBase} 1603 // When encoded, this MVCC key is 0x616200010000000000000009, trailer 1604 // 0x0000000000000001 (same as before). 1605 // Including the trailer, the internal key is encoded as 1606 // 0x6162000100000000000000090100000000000000. 1607 // Note that it has a prefix matching the earlier key's full internal key. 1608 key2 := MVCCKey{Key: keyBase, Timestamp: hlc.Timestamp{WallTime: 0x0100000000000000}} 1609 1610 // Bump up the global sequence number to a non-zero number. Also lay down 1611 // keys around key1 and key2. 1612 if err := db.Put(key0, valBase); err != nil { 1613 t.Fatal(err) 1614 } 1615 for i := 0; i < 50; i++ { 1616 key := make([]byte, len(keyBase)+1) 1617 copy(key, keyBase) 1618 // Make keys of the format ac0, ac1, ... 1619 key[1] = 'c' 1620 key[2] = byte(i) 1621 err := db.Put(MVCCKey{Key: key}, valBase) 1622 if err != nil { 1623 t.Fatal(err) 1624 } 1625 } 1626 // A flush + compact is necessary to push down the writes above into L6. 1627 if err := db.Flush(); err != nil { 1628 t.Fatal(err) 1629 } 1630 if err := db.Compact(); err != nil { 1631 t.Fatal(err) 1632 } 1633 // An open snapshot ensures RocksDB assigns a nonzero global sequence number 1634 // to the SSTable we're about to ingest. 
1635 snapshot := db.NewSnapshot() 1636 1637 sstFilePath := filepath.Join(db.GetAuxiliaryDir(), "test1.sst") 1638 _ = os.MkdirAll(db.GetAuxiliaryDir(), 0755) 1639 sstFile, err := os.Create(sstFilePath) 1640 if err != nil { 1641 t.Fatal(err) 1642 } 1643 writer := MakeIngestionSSTWriter(sstFile) 1644 if err := writer.Put(key1, valBase2); err != nil { 1645 t.Fatal(err) 1646 } 1647 if err := writer.Put(key2, valBase2); err != nil { 1648 t.Fatal(err) 1649 } 1650 if err := writer.Finish(); err != nil { 1651 t.Fatal(err) 1652 } 1653 writer.Close() 1654 sstFile.Close() 1655 1656 // When this file is ingested, it'll be added to L0, since it overlaps in key 1657 // bounds (but not actual keys) with the SSTable flushed earlier. 1658 if err := db.IngestExternalFiles(context.Background(), []string{sstFilePath}); err != nil { 1659 t.Fatal(err) 1660 } 1661 snapshot.Close() 1662 val, err := db.Get(key1) 1663 if err != nil { 1664 t.Fatal(err) 1665 } 1666 val2, err := db.Get(key2) 1667 if err != nil { 1668 t.Fatal(err) 1669 } 1670 if !bytes.Equal(val, valBase2) || !bytes.Equal(val2, valBase2) { 1671 t.Fatalf("expected values to match: %v != %v != 'barfoo'", val, val2) 1672 } 1673 }