github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/ingest_test.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "context" 10 "fmt" 11 "io" 12 "math" 13 "os" 14 "path/filepath" 15 "slices" 16 "sort" 17 "strconv" 18 "strings" 19 "sync" 20 "sync/atomic" 21 "testing" 22 "time" 23 24 "github.com/cockroachdb/datadriven" 25 "github.com/cockroachdb/errors" 26 "github.com/cockroachdb/errors/oserror" 27 "github.com/cockroachdb/pebble/internal/base" 28 "github.com/cockroachdb/pebble/internal/keyspan" 29 "github.com/cockroachdb/pebble/internal/manifest" 30 "github.com/cockroachdb/pebble/internal/rangekey" 31 "github.com/cockroachdb/pebble/internal/testkeys" 32 "github.com/cockroachdb/pebble/objstorage" 33 "github.com/cockroachdb/pebble/objstorage/objstorageprovider" 34 "github.com/cockroachdb/pebble/objstorage/remote" 35 "github.com/cockroachdb/pebble/record" 36 "github.com/cockroachdb/pebble/sstable" 37 "github.com/cockroachdb/pebble/vfs" 38 "github.com/cockroachdb/pebble/vfs/errorfs" 39 "github.com/kr/pretty" 40 "github.com/stretchr/testify/require" 41 "golang.org/x/exp/rand" 42 ) 43 44 func TestSSTableKeyCompare(t *testing.T) { 45 var buf bytes.Buffer 46 datadriven.RunTest(t, "testdata/sstable_key_compare", func(t *testing.T, td *datadriven.TestData) string { 47 switch td.Cmd { 48 case "cmp": 49 buf.Reset() 50 for _, line := range strings.Split(td.Input, "\n") { 51 fields := strings.Fields(line) 52 a := base.ParseInternalKey(fields[0]) 53 b := base.ParseInternalKey(fields[1]) 54 got := sstableKeyCompare(testkeys.Comparer.Compare, a, b) 55 fmt.Fprintf(&buf, "%38s", fmt.Sprint(a.Pretty(base.DefaultFormatter))) 56 switch got { 57 case -1: 58 fmt.Fprint(&buf, " < ") 59 case +1: 60 fmt.Fprint(&buf, " > ") 61 case 0: 62 fmt.Fprint(&buf, " = ") 63 } 64 fmt.Fprintf(&buf, "%s\n", fmt.Sprint(b.Pretty(base.DefaultFormatter))) 65 } 66 return buf.String() 67 default: 68 return fmt.Sprintf("unrecognized command %q", td.Cmd) 69 } 70 }) 71 } 72 73 func TestIngestLoad(t *testing.T) { 74 mem := vfs.NewMem() 75 76 datadriven.RunTest(t, "testdata/ingest_load", func(t *testing.T, td *datadriven.TestData) string { 77 switch td.Cmd { 78 case "load": 79 writerOpts := sstable.WriterOptions{} 80 var dbVersion FormatMajorVersion 81 for _, cmdArgs := range td.CmdArgs { 82 v, err := strconv.Atoi(cmdArgs.Vals[0]) 83 if err != nil { 84 return err.Error() 85 } 86 switch k := cmdArgs.Key; k { 87 case "writer-version": 88 fmv := FormatMajorVersion(v) 89 writerOpts.TableFormat = fmv.MaxTableFormat() 90 case "db-version": 91 dbVersion = FormatMajorVersion(v) 92 default: 93 return fmt.Sprintf("unknown cmd %s\n", k) 94 } 95 } 96 f, err := mem.Create("ext") 97 if err != nil { 98 return err.Error() 99 } 100 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writerOpts) 101 for _, data := range strings.Split(td.Input, "\n") { 102 if strings.HasPrefix(data, "rangekey: ") { 103 data = strings.TrimPrefix(data, "rangekey: ") 104 s := keyspan.ParseSpan(data) 105 err := rangekey.Encode(&s, w.AddRangeKey) 106 if err != nil { 107 return err.Error() 108 } 109 continue 110 } 111 112 j := strings.Index(data, ":") 113 if j < 0 { 114 return fmt.Sprintf("malformed input: %s\n", data) 115 } 116 key := base.ParseInternalKey(data[:j]) 117 value := []byte(data[j+1:]) 118 if err := w.Add(key, value); err != nil { 119 return err.Error() 120 } 121 } 122 if err := w.Close(); err != nil { 123 return err.Error() 124 } 125 126 opts := (&Options{ 127 Comparer: DefaultComparer, 128 FS: mem, 129 }).WithFSDefaults() 130 lr, err := ingestLoad(opts, dbVersion, []string{"ext"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0) 131 if err != nil { 132 return err.Error() 133 } 134 var buf bytes.Buffer 135 for _, m := range lr.localMeta { 136 fmt.Fprintf(&buf, "%d: %s-%s\n", m.FileNum, m.Smallest, m.Largest) 137 fmt.Fprintf(&buf, " points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey) 138 fmt.Fprintf(&buf, " ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey) 139 } 140 return buf.String() 141 142 default: 143 return fmt.Sprintf("unknown command: %s", td.Cmd) 144 } 145 }) 146 } 147 148 func TestIngestLoadRand(t *testing.T) { 149 mem := vfs.NewMem() 150 rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) 151 cmp := DefaultComparer.Compare 152 version := internalFormatNewest 153 154 randBytes := func(size int) []byte { 155 data := make([]byte, size) 156 for i := range data { 157 data[i] = byte(rng.Int() & 0xff) 158 } 159 return data 160 } 161 162 paths := make([]string, 1+rng.Intn(10)) 163 pending := make([]base.DiskFileNum, len(paths)) 164 expected := make([]*fileMetadata, len(paths)) 165 for i := range paths { 166 paths[i] = fmt.Sprint(i) 167 pending[i] = base.FileNum(rng.Uint64()).DiskFileNum() 168 expected[i] = &fileMetadata{ 169 FileNum: pending[i].FileNum(), 170 } 171 expected[i].StatsMarkValid() 172 173 func() { 174 f, err := mem.Create(paths[i]) 175 require.NoError(t, err) 176 177 keys := make([]InternalKey, 1+rng.Intn(100)) 178 for i := range keys { 179 keys[i] = base.MakeInternalKey( 180 randBytes(1+rng.Intn(10)), 181 0, 182 InternalKeyKindSet) 183 } 184 slices.SortFunc(keys, func(a, b base.InternalKey) int { 185 return base.InternalCompare(cmp, a, b) 186 }) 187 188 expected[i].ExtendPointKeyBounds(cmp, keys[0], keys[len(keys)-1]) 189 190 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ 191 TableFormat: version.MaxTableFormat(), 192 }) 193 var count uint64 194 for i := range keys { 195 if i > 0 && base.InternalCompare(cmp, keys[i-1], keys[i]) == 0 { 196 // Duplicate key, ignore. 197 continue 198 } 199 w.Add(keys[i], nil) 200 count++ 201 } 202 expected[i].Stats.NumEntries = count 203 require.NoError(t, w.Close()) 204 205 meta, err := w.Metadata() 206 require.NoError(t, err) 207 208 expected[i].Size = meta.Size 209 expected[i].InitPhysicalBacking() 210 }() 211 } 212 213 opts := (&Options{ 214 Comparer: DefaultComparer, 215 FS: mem, 216 }).WithFSDefaults() 217 lr, err := ingestLoad(opts, version, paths, nil, nil, 0, pending, nil, 0) 218 require.NoError(t, err) 219 220 for _, m := range lr.localMeta { 221 m.CreationTime = 0 222 } 223 t.Log(strings.Join(pretty.Diff(expected, lr.localMeta), "\n")) 224 require.Equal(t, expected, lr.localMeta) 225 } 226 227 func TestIngestLoadInvalid(t *testing.T) { 228 mem := vfs.NewMem() 229 f, err := mem.Create("invalid") 230 require.NoError(t, err) 231 require.NoError(t, f.Close()) 232 233 opts := (&Options{ 234 Comparer: DefaultComparer, 235 FS: mem, 236 }).WithFSDefaults() 237 if _, err := ingestLoad(opts, internalFormatNewest, []string{"invalid"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0); err == nil { 238 t.Fatalf("expected error, but found success") 239 } 240 } 241 242 func TestIngestSortAndVerify(t *testing.T) { 243 comparers := map[string]Compare{ 244 "default": DefaultComparer.Compare, 245 "reverse": func(a, b []byte) int { 246 return DefaultComparer.Compare(b, a) 247 }, 248 } 249 250 t.Run("", func(t *testing.T) { 251 datadriven.RunTest(t, "testdata/ingest_sort_and_verify", func(t *testing.T, d *datadriven.TestData) string { 252 switch d.Cmd { 253 case "ingest": 254 var buf bytes.Buffer 255 var meta []*fileMetadata 256 var paths []string 257 var cmpName string 258 d.ScanArgs(t, "cmp", &cmpName) 259 cmp := comparers[cmpName] 260 if cmp == nil { 261 return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, cmpName) 262 } 263 for i, data := range strings.Split(d.Input, "\n") { 264 parts := strings.Split(data, "-") 265 if len(parts) != 2 { 266 return fmt.Sprintf("malformed test case: %s", d.Input) 267 } 268 smallest := base.ParseInternalKey(parts[0]) 269 largest := base.ParseInternalKey(parts[1]) 270 if cmp(smallest.UserKey, largest.UserKey) > 0 { 271 return fmt.Sprintf("range %v-%v is not valid", smallest, largest) 272 } 273 m := (&fileMetadata{}).ExtendPointKeyBounds(cmp, smallest, largest) 274 m.InitPhysicalBacking() 275 meta = append(meta, m) 276 paths = append(paths, strconv.Itoa(i)) 277 } 278 lr := ingestLoadResult{localPaths: paths, localMeta: meta} 279 err := ingestSortAndVerify(cmp, lr, KeyRange{}) 280 if err != nil { 281 return fmt.Sprintf("%v\n", err) 282 } 283 for i := range meta { 284 fmt.Fprintf(&buf, "%s: %v-%v\n", paths[i], meta[i].Smallest, meta[i].Largest) 285 } 286 return buf.String() 287 288 default: 289 return fmt.Sprintf("unknown command: %s", d.Cmd) 290 } 291 }) 292 }) 293 } 294 295 func TestIngestLink(t *testing.T) { 296 // Test linking of tables into the DB directory. Test cleanup when one of the 297 // tables cannot be linked. 298 299 const dir = "db" 300 const count = 10 301 for i := 0; i <= count; i++ { 302 t.Run("", func(t *testing.T) { 303 opts := &Options{FS: vfs.NewMem()} 304 opts.EnsureDefaults().WithFSDefaults() 305 require.NoError(t, opts.FS.MkdirAll(dir, 0755)) 306 objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(opts.FS, dir)) 307 require.NoError(t, err) 308 defer objProvider.Close() 309 310 paths := make([]string, 10) 311 meta := make([]*fileMetadata, len(paths)) 312 contents := make([][]byte, len(paths)) 313 for j := range paths { 314 paths[j] = fmt.Sprintf("external%d", j) 315 meta[j] = &fileMetadata{} 316 meta[j].FileNum = FileNum(j) 317 meta[j].InitPhysicalBacking() 318 f, err := opts.FS.Create(paths[j]) 319 require.NoError(t, err) 320 321 contents[j] = []byte(fmt.Sprintf("data%d", j)) 322 // memFile.Write will modify the supplied buffer when invariants are 323 // enabled, so provide a throw-away copy. 324 _, err = f.Write(append([]byte(nil), contents[j]...)) 325 require.NoError(t, err) 326 require.NoError(t, f.Close()) 327 } 328 329 if i < count { 330 opts.FS.Remove(paths[i]) 331 } 332 333 lr := ingestLoadResult{localMeta: meta, localPaths: paths} 334 err = ingestLink(0 /* jobID */, opts, objProvider, lr, nil /* shared */) 335 if i < count { 336 if err == nil { 337 t.Fatalf("expected error, but found success") 338 } 339 } else { 340 require.NoError(t, err) 341 } 342 343 files, err := opts.FS.List(dir) 344 require.NoError(t, err) 345 346 sort.Strings(files) 347 348 if i < count { 349 if len(files) > 0 { 350 t.Fatalf("expected all of the files to be cleaned up, but found:\n%s", 351 strings.Join(files, "\n")) 352 } 353 } else { 354 if len(files) != count { 355 t.Fatalf("expected %d files, but found:\n%s", count, strings.Join(files, "\n")) 356 } 357 for j := range files { 358 ftype, fileNum, ok := base.ParseFilename(opts.FS, files[j]) 359 if !ok { 360 t.Fatalf("unable to parse filename: %s", files[j]) 361 } 362 if fileTypeTable != ftype { 363 t.Fatalf("expected table, but found %d", ftype) 364 } 365 if j != int(fileNum.FileNum()) { 366 t.Fatalf("expected table %d, but found %d", j, fileNum) 367 } 368 f, err := opts.FS.Open(opts.FS.PathJoin(dir, files[j])) 369 require.NoError(t, err) 370 371 data, err := io.ReadAll(f) 372 require.NoError(t, err) 373 require.NoError(t, f.Close()) 374 if !bytes.Equal(contents[j], data) { 375 t.Fatalf("expected %s, but found %s", contents[j], data) 376 } 377 } 378 } 379 }) 380 } 381 } 382 383 func TestIngestLinkFallback(t *testing.T) { 384 // Verify that ingestLink succeeds if linking fails by falling back to 385 // copying. 386 mem := vfs.NewMem() 387 src, err := mem.Create("source") 388 require.NoError(t, err) 389 390 opts := &Options{FS: errorfs.Wrap(mem, errorfs.ErrInjected.If(errorfs.OnIndex(1)))} 391 opts.EnsureDefaults().WithFSDefaults() 392 objSettings := objstorageprovider.DefaultSettings(opts.FS, "") 393 // Prevent the provider from listing the dir (where we may get an injected error). 394 objSettings.FSDirInitialListing = []string{} 395 objProvider, err := objstorageprovider.Open(objSettings) 396 require.NoError(t, err) 397 defer objProvider.Close() 398 399 meta := []*fileMetadata{{FileNum: 1}} 400 meta[0].InitPhysicalBacking() 401 lr := ingestLoadResult{localMeta: meta, localPaths: []string{"source"}} 402 err = ingestLink(0, opts, objProvider, lr, nil /* shared */) 403 require.NoError(t, err) 404 405 dest, err := mem.Open("000001.sst") 406 require.NoError(t, err) 407 408 // We should be able to write bytes to src, and not have them show up in 409 // dest. 410 _, _ = src.Write([]byte("test")) 411 data, err := io.ReadAll(dest) 412 require.NoError(t, err) 413 if len(data) != 0 { 414 t.Fatalf("expected copy, but files appear to be hard linked: [%s] unexpectedly found", data) 415 } 416 } 417 418 func TestOverlappingIngestedSSTs(t *testing.T) { 419 dir := "" 420 var ( 421 mem vfs.FS 422 d *DB 423 opts *Options 424 closed = false 425 blockFlush = false 426 ) 427 defer func() { 428 if !closed { 429 require.NoError(t, d.Close()) 430 } 431 }() 432 433 reset := func(strictMem bool) { 434 if d != nil && !closed { 435 require.NoError(t, d.Close()) 436 } 437 blockFlush = false 438 439 if strictMem { 440 mem = vfs.NewStrictMem() 441 } else { 442 mem = vfs.NewMem() 443 } 444 445 require.NoError(t, mem.MkdirAll("ext", 0755)) 446 opts = (&Options{ 447 FS: mem, 448 MemTableStopWritesThreshold: 4, 449 L0CompactionThreshold: 100, 450 L0StopWritesThreshold: 100, 451 DebugCheck: DebugCheckLevels, 452 FormatMajorVersion: internalFormatNewest, 453 }).WithFSDefaults() 454 // Disable automatic compactions because otherwise we'll race with 455 // delete-only compactions triggered by ingesting range tombstones. 456 opts.DisableAutomaticCompactions = true 457 458 var err error 459 d, err = Open(dir, opts) 460 require.NoError(t, err) 461 d.TestOnlyWaitForCleaning() 462 } 463 waitForFlush := func() { 464 if d == nil { 465 return 466 } 467 d.mu.Lock() 468 for d.mu.compact.flushing { 469 d.mu.compact.cond.Wait() 470 } 471 d.mu.Unlock() 472 } 473 reset(false) 474 475 datadriven.RunTest(t, "testdata/flushable_ingest", func(t *testing.T, td *datadriven.TestData) string { 476 switch td.Cmd { 477 case "reset": 478 reset(td.HasArg("strictMem")) 479 return "" 480 481 case "ignoreSyncs": 482 var ignoreSyncs bool 483 if len(td.CmdArgs) == 1 && td.CmdArgs[0].String() == "true" { 484 ignoreSyncs = true 485 } 486 mem.(*vfs.MemFS).SetIgnoreSyncs(ignoreSyncs) 487 return "" 488 489 case "resetToSynced": 490 mem.(*vfs.MemFS).ResetToSyncedState() 491 files, err := mem.List(dir) 492 sort.Strings(files) 493 require.NoError(t, err) 494 return strings.Join(files, "\n") 495 496 case "batch": 497 b := d.NewIndexedBatch() 498 if err := runBatchDefineCmd(td, b); err != nil { 499 return err.Error() 500 } 501 if err := b.Commit(nil); err != nil { 502 return err.Error() 503 } 504 return "" 505 506 case "build": 507 if err := runBuildCmd(td, d, mem); err != nil { 508 return err.Error() 509 } 510 return "" 511 512 case "ingest": 513 if err := runIngestCmd(td, d, mem); err != nil { 514 return err.Error() 515 } 516 if !blockFlush { 517 waitForFlush() 518 } 519 return "" 520 521 case "iter": 522 iter, _ := d.NewIter(nil) 523 return runIterCmd(td, iter, true) 524 525 case "lsm": 526 return runLSMCmd(td, d) 527 528 case "close": 529 if closed { 530 return "already closed" 531 } 532 require.NoError(t, d.Close()) 533 closed = true 534 return "" 535 536 case "ls": 537 files, err := mem.List(dir) 538 sort.Strings(files) 539 require.NoError(t, err) 540 return strings.Join(files, "\n") 541 542 case "open": 543 opts.ReadOnly = td.HasArg("readOnly") 544 var err error 545 d, err = Open(dir, opts) 546 closed = false 547 require.NoError(t, err) 548 waitForFlush() 549 d.TestOnlyWaitForCleaning() 550 return "" 551 552 case "blockFlush": 553 blockFlush = true 554 d.mu.Lock() 555 d.mu.compact.flushing = true 556 d.mu.Unlock() 557 return "" 558 559 case "allowFlush": 560 blockFlush = false 561 d.mu.Lock() 562 d.mu.compact.flushing = false 563 d.mu.Unlock() 564 return "" 565 566 case "flush": 567 d.maybeScheduleFlush() 568 waitForFlush() 569 d.TestOnlyWaitForCleaning() 570 return "" 571 572 case "get": 573 return runGetCmd(t, td, d) 574 575 default: 576 return fmt.Sprintf("unknown command: %s", td.Cmd) 577 } 578 }) 579 } 580 581 func TestExcise(t *testing.T) { 582 var mem vfs.FS 583 var d *DB 584 var flushed bool 585 defer func() { 586 require.NoError(t, d.Close()) 587 }() 588 589 var opts *Options 590 reset := func() { 591 if d != nil { 592 require.NoError(t, d.Close()) 593 } 594 595 mem = vfs.NewMem() 596 require.NoError(t, mem.MkdirAll("ext", 0755)) 597 opts = &Options{ 598 FS: mem, 599 L0CompactionThreshold: 100, 600 L0StopWritesThreshold: 100, 601 DebugCheck: DebugCheckLevels, 602 EventListener: &EventListener{FlushEnd: func(info FlushInfo) { 603 flushed = true 604 }}, 605 FormatMajorVersion: FormatVirtualSSTables, 606 Comparer: testkeys.Comparer, 607 } 608 // Disable automatic compactions because otherwise we'll race with 609 // delete-only compactions triggered by ingesting range tombstones. 610 opts.DisableAutomaticCompactions = true 611 // Set this to true to add some testing for the virtual sstable validation 612 // code paths. 613 opts.Experimental.ValidateOnIngest = true 614 615 var err error 616 d, err = Open("", opts) 617 require.NoError(t, err) 618 } 619 reset() 620 621 datadriven.RunTest(t, "testdata/excise", func(t *testing.T, td *datadriven.TestData) string { 622 switch td.Cmd { 623 case "reset": 624 reset() 625 return "" 626 case "reopen": 627 require.NoError(t, d.Close()) 628 var err error 629 d, err = Open("", opts) 630 require.NoError(t, err) 631 632 return "" 633 case "batch": 634 b := d.NewIndexedBatch() 635 if err := runBatchDefineCmd(td, b); err != nil { 636 return err.Error() 637 } 638 if err := b.Commit(nil); err != nil { 639 return err.Error() 640 } 641 return "" 642 case "build": 643 if err := runBuildCmd(td, d, mem); err != nil { 644 return err.Error() 645 } 646 return "" 647 648 case "flush": 649 if err := d.Flush(); err != nil { 650 return err.Error() 651 } 652 return "" 653 654 case "ingest": 655 flushed = false 656 if err := runIngestCmd(td, d, mem); err != nil { 657 return err.Error() 658 } 659 // Wait for a possible flush. 660 d.mu.Lock() 661 for d.mu.compact.flushing { 662 d.mu.compact.cond.Wait() 663 } 664 d.mu.Unlock() 665 if flushed { 666 return "memtable flushed" 667 } 668 return "" 669 670 case "ingest-and-excise": 671 flushed = false 672 if err := runIngestAndExciseCmd(td, d, mem); err != nil { 673 return err.Error() 674 } 675 // Wait for a possible flush. 676 d.mu.Lock() 677 for d.mu.compact.flushing { 678 d.mu.compact.cond.Wait() 679 } 680 d.mu.Unlock() 681 if flushed { 682 return "memtable flushed" 683 } 684 return "" 685 686 case "get": 687 return runGetCmd(t, td, d) 688 689 case "iter": 690 iter, _ := d.NewIter(&IterOptions{ 691 KeyTypes: IterKeyTypePointsAndRanges, 692 }) 693 return runIterCmd(td, iter, true) 694 695 case "lsm": 696 return runLSMCmd(td, d) 697 698 case "metrics": 699 // The asynchronous loading of table stats can change metrics, so 700 // wait for all the tables' stats to be loaded. 701 d.mu.Lock() 702 d.waitTableStats() 703 d.mu.Unlock() 704 705 return d.Metrics().StringForTests() 706 707 case "wait-pending-table-stats": 708 return runTableStatsCmd(td, d) 709 710 case "excise": 711 ve := &versionEdit{ 712 DeletedFiles: map[deletedFileEntry]*fileMetadata{}, 713 } 714 var exciseSpan KeyRange 715 if len(td.CmdArgs) != 2 { 716 panic("insufficient args for compact command") 717 } 718 exciseSpan.Start = []byte(td.CmdArgs[0].Key) 719 exciseSpan.End = []byte(td.CmdArgs[1].Key) 720 721 d.mu.Lock() 722 d.mu.versions.logLock() 723 d.mu.Unlock() 724 current := d.mu.versions.currentVersion() 725 for level := range current.Levels { 726 iter := current.Levels[level].Iter() 727 for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() { 728 _, err := d.excise(exciseSpan, m, ve, level) 729 if err != nil { 730 d.mu.Lock() 731 d.mu.versions.logUnlock() 732 d.mu.Unlock() 733 return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error()) 734 } 735 } 736 } 737 d.mu.Lock() 738 d.mu.versions.logUnlock() 739 d.mu.Unlock() 740 return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.DebugString(base.DefaultFormatter)) 741 742 case "confirm-backing": 743 // Confirms that the files have the same FileBacking. 744 fileNums := make(map[base.FileNum]struct{}) 745 for i := range td.CmdArgs { 746 fNum, err := strconv.Atoi(td.CmdArgs[i].Key) 747 if err != nil { 748 panic("invalid file number") 749 } 750 fileNums[base.FileNum(fNum)] = struct{}{} 751 } 752 d.mu.Lock() 753 currVersion := d.mu.versions.currentVersion() 754 var ptr *manifest.FileBacking 755 for _, level := range currVersion.Levels { 756 lIter := level.Iter() 757 for f := lIter.First(); f != nil; f = lIter.Next() { 758 if _, ok := fileNums[f.FileNum]; ok { 759 if ptr == nil { 760 ptr = f.FileBacking 761 continue 762 } 763 if f.FileBacking != ptr { 764 d.mu.Unlock() 765 return "file backings are not the same" 766 } 767 } 768 } 769 } 770 d.mu.Unlock() 771 return "file backings are the same" 772 case "compact": 773 if len(td.CmdArgs) != 2 { 774 panic("insufficient args for compact command") 775 } 776 l := td.CmdArgs[0].Key 777 r := td.CmdArgs[1].Key 778 err := d.Compact([]byte(l), []byte(r), false) 779 if err != nil { 780 return err.Error() 781 } 782 return "" 783 default: 784 return fmt.Sprintf("unknown command: %s", td.Cmd) 785 } 786 }) 787 } 788 789 func testIngestSharedImpl( 790 t *testing.T, createOnShared remote.CreateOnSharedStrategy, fileName string, 791 ) { 792 var d, d1, d2 *DB 793 var efos map[string]*EventuallyFileOnlySnapshot 794 defer func() { 795 for _, e := range efos { 796 require.NoError(t, e.Close()) 797 } 798 if d1 != nil { 799 require.NoError(t, d1.Close()) 800 } 801 if d2 != nil { 802 require.NoError(t, d2.Close()) 803 } 804 }() 805 creatorIDCounter := uint64(1) 806 replicateCounter := 1 807 var opts1, opts2 *Options 808 809 reset := func() { 810 for _, e := range efos { 811 require.NoError(t, e.Close()) 812 } 813 if d1 != nil { 814 require.NoError(t, d1.Close()) 815 } 816 if d2 != nil { 817 require.NoError(t, d2.Close()) 818 } 819 efos = make(map[string]*EventuallyFileOnlySnapshot) 820 821 sstorage := remote.NewInMem() 822 mem1 := vfs.NewMem() 823 mem2 := vfs.NewMem() 824 require.NoError(t, mem1.MkdirAll("ext", 0755)) 825 require.NoError(t, mem2.MkdirAll("ext", 0755)) 826 opts1 = &Options{ 827 Comparer: testkeys.Comparer, 828 FS: mem1, 829 LBaseMaxBytes: 1, 830 L0CompactionThreshold: 100, 831 L0StopWritesThreshold: 100, 832 DebugCheck: DebugCheckLevels, 833 FormatMajorVersion: FormatVirtualSSTables, 834 } 835 // lel. 836 lel := MakeLoggingEventListener(DefaultLogger) 837 opts1.EventListener = &lel 838 opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ 839 "": sstorage, 840 }) 841 opts1.Experimental.CreateOnShared = createOnShared 842 opts1.Experimental.CreateOnSharedLocator = "" 843 // Disable automatic compactions because otherwise we'll race with 844 // delete-only compactions triggered by ingesting range tombstones. 845 opts1.DisableAutomaticCompactions = true 846 847 opts2 = &Options{} 848 *opts2 = *opts1 849 opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ 850 "": sstorage, 851 }) 852 opts2.Experimental.CreateOnShared = createOnShared 853 opts2.Experimental.CreateOnSharedLocator = "" 854 opts2.FS = mem2 855 856 var err error 857 d1, err = Open("", opts1) 858 require.NoError(t, err) 859 require.NoError(t, d1.SetCreatorID(creatorIDCounter)) 860 creatorIDCounter++ 861 d2, err = Open("", opts2) 862 require.NoError(t, err) 863 require.NoError(t, d2.SetCreatorID(creatorIDCounter)) 864 creatorIDCounter++ 865 d = d1 866 } 867 reset() 868 869 datadriven.RunTest(t, fmt.Sprintf("testdata/%s", fileName), func(t *testing.T, td *datadriven.TestData) string { 870 switch td.Cmd { 871 case "restart": 872 for _, e := range efos { 873 require.NoError(t, e.Close()) 874 } 875 if d1 != nil { 876 require.NoError(t, d1.Close()) 877 } 878 if d2 != nil { 879 require.NoError(t, d2.Close()) 880 } 881 882 var err error 883 d1, err = Open("", opts1) 884 if err != nil { 885 return err.Error() 886 } 887 d2, err = Open("", opts2) 888 if err != nil { 889 return err.Error() 890 } 891 d = d1 892 return "ok, note that the active db has been set to 1 (use 'switch' to change)" 893 case "reset": 894 reset() 895 return "" 896 case "switch": 897 if len(td.CmdArgs) != 1 { 898 return "usage: switch <1 or 2>" 899 } 900 switch td.CmdArgs[0].Key { 901 case "1": 902 d = d1 903 case "2": 904 d = d2 905 default: 906 return "usage: switch <1 or 2>" 907 } 908 return "ok" 909 case "batch": 910 b := d.NewIndexedBatch() 911 if err := runBatchDefineCmd(td, b); err != nil { 912 return err.Error() 913 } 914 if err := b.Commit(nil); err != nil { 915 return err.Error() 916 } 917 return "" 918 case "build": 919 if err := runBuildCmd(td, d, d.opts.FS); err != nil { 920 return err.Error() 921 } 922 return "" 923 924 case "flush": 925 if err := d.Flush(); err != nil { 926 return err.Error() 927 } 928 return "" 929 930 case "ingest": 931 if err := runIngestCmd(td, d, d.opts.FS); err != nil { 932 return err.Error() 933 } 934 // Wait for a possible flush. 935 d.mu.Lock() 936 for d.mu.compact.flushing { 937 d.mu.compact.cond.Wait() 938 } 939 d.mu.Unlock() 940 return "" 941 942 case "ingest-and-excise": 943 if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { 944 return err.Error() 945 } 946 // Wait for a possible flush. 947 d.mu.Lock() 948 for d.mu.compact.flushing { 949 d.mu.compact.cond.Wait() 950 } 951 d.mu.Unlock() 952 return "" 953 954 case "replicate": 955 if len(td.CmdArgs) != 4 { 956 return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>" 957 } 958 var from, to *DB 959 switch td.CmdArgs[0].Key { 960 case "1": 961 from = d1 962 case "2": 963 from = d2 964 default: 965 return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>" 966 } 967 switch td.CmdArgs[1].Key { 968 case "1": 969 to = d1 970 case "2": 971 to = d2 972 default: 973 return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>" 974 } 975 startKey := []byte(td.CmdArgs[2].Key) 976 endKey := []byte(td.CmdArgs[3].Key) 977 978 writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat()) 979 sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter) 980 f, err := to.opts.FS.Create(sstPath) 981 require.NoError(t, err) 982 replicateCounter++ 983 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts) 984 985 var sharedSSTs []SharedSSTMeta 986 err = from.ScanInternal(context.TODO(), sstable.CategoryAndQoS{}, startKey, endKey, 987 func(key *InternalKey, value LazyValue, _ IteratorLevel) error { 988 val, _, err := value.Value(nil) 989 require.NoError(t, err) 990 require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val)) 991 return nil 992 }, 993 func(start, end []byte, seqNum uint64) error { 994 require.NoError(t, w.DeleteRange(start, end)) 995 return nil 996 }, 997 func(start, end []byte, keys []keyspan.Key) error { 998 s := keyspan.Span{ 999 Start: start, 1000 End: end, 1001 Keys: keys, 1002 KeysOrder: 0, 1003 } 1004 require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error { 1005 return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v) 1006 })) 1007 return nil 1008 }, 1009 func(sst *SharedSSTMeta) error { 1010 sharedSSTs = append(sharedSSTs, *sst) 1011 return nil 1012 }, 1013 ) 1014 require.NoError(t, err) 1015 require.NoError(t, w.Close()) 1016 1017 _, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey}) 1018 require.NoError(t, err) 1019 return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs)) 1020 1021 case "get": 1022 return runGetCmd(t, td, d) 1023 1024 case "iter": 1025 o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges} 1026 var reader Reader 1027 reader = d 1028 for _, arg := range td.CmdArgs { 1029 switch arg.Key { 1030 case "mask-suffix": 1031 o.RangeKeyMasking.Suffix = []byte(arg.Vals[0]) 1032 case "mask-filter": 1033 o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask { 1034 return sstable.NewTestKeysMaskingFilter() 1035 } 1036 case "snapshot": 1037 reader = efos[arg.Vals[0]] 1038 } 1039 } 1040 iter, err := reader.NewIter(o) 1041 if err != nil { 1042 return err.Error() 1043 } 1044 return runIterCmd(td, iter, true) 1045 1046 case "lsm": 1047 return runLSMCmd(td, d) 1048 1049 case "metrics": 1050 // The asynchronous loading of table stats can change metrics, so 1051 // wait for all the tables' stats to be loaded. 1052 d.mu.Lock() 1053 d.waitTableStats() 1054 d.mu.Unlock() 1055 1056 return d.Metrics().StringForTests() 1057 1058 case "wait-pending-table-stats": 1059 return runTableStatsCmd(td, d) 1060 1061 case "excise": 1062 ve := &versionEdit{ 1063 DeletedFiles: map[deletedFileEntry]*fileMetadata{}, 1064 } 1065 var exciseSpan KeyRange 1066 if len(td.CmdArgs) != 2 { 1067 panic("insufficient args for excise command") 1068 } 1069 exciseSpan.Start = []byte(td.CmdArgs[0].Key) 1070 exciseSpan.End = []byte(td.CmdArgs[1].Key) 1071 1072 d.mu.Lock() 1073 d.mu.versions.logLock() 1074 d.mu.Unlock() 1075 current := d.mu.versions.currentVersion() 1076 for level := range current.Levels { 1077 iter := current.Levels[level].Iter() 1078 for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() { 1079 _, err := d.excise(exciseSpan, m, ve, level) 1080 if err != nil { 1081 d.mu.Lock() 1082 d.mu.versions.logUnlock() 1083 d.mu.Unlock() 1084 return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error()) 1085 } 1086 } 1087 } 1088 d.mu.Lock() 1089 d.mu.versions.logUnlock() 1090 d.mu.Unlock() 1091 return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String()) 1092 1093 case "file-only-snapshot": 1094 if len(td.CmdArgs) != 1 { 1095 panic("insufficient args for file-only-snapshot command") 1096 } 1097 name := td.CmdArgs[0].Key 1098 var keyRanges []KeyRange 1099 for _, line := range strings.Split(td.Input, "\n") { 1100 fields := strings.Fields(line) 1101 if len(fields) != 2 { 1102 return "expected two fields for file-only snapshot KeyRanges" 1103 } 1104 kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])} 1105 keyRanges = append(keyRanges, kr) 1106 } 1107 1108 s := d.NewEventuallyFileOnlySnapshot(keyRanges) 1109 efos[name] = s 1110 return "ok" 1111 1112 case "wait-for-file-only-snapshot": 1113 if len(td.CmdArgs) != 1 { 1114 panic("insufficient args for file-only-snapshot command") 1115 } 1116 name := td.CmdArgs[0].Key 1117 err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond) 1118 if err != nil { 1119 return err.Error() 1120 } 1121 return "ok" 1122 1123 case "compact": 1124 err := runCompactCmd(td, d) 1125 if err != nil { 1126 return err.Error() 1127 } 1128 return "ok" 1129 default: 1130 return fmt.Sprintf("unknown command: %s", td.Cmd) 1131 } 1132 }) 1133 } 1134 1135 func TestIngestShared(t *testing.T) { 1136 for _, strategy := range []remote.CreateOnSharedStrategy{remote.CreateOnSharedAll, remote.CreateOnSharedLower} { 1137 strategyStr := "all" 1138 if strategy == remote.CreateOnSharedLower { 1139 strategyStr = "lower" 1140 } 1141 t.Run(fmt.Sprintf("createOnShared=%s", strategyStr), func(t *testing.T) { 1142 fileName := "ingest_shared" 1143 if strategy == remote.CreateOnSharedLower { 1144 fileName = "ingest_shared_lower" 1145 } 1146 testIngestSharedImpl(t, strategy, fileName) 1147 }) 1148 } 1149 } 1150 1151 func TestSimpleIngestShared(t *testing.T) { 1152 mem := vfs.NewMem() 1153 var d *DB 1154 var provider2 objstorage.Provider 1155 opts2 := Options{FS: vfs.NewMem(), FormatMajorVersion: FormatVirtualSSTables} 1156 opts2.EnsureDefaults() 1157 1158 // Create an objProvider where we will fake-create some sstables that can 1159 // then be shared back to the db instance. 1160 providerSettings := objstorageprovider.Settings{ 1161 Logger: opts2.Logger, 1162 FS: opts2.FS, 1163 FSDirName: "", 1164 FSDirInitialListing: nil, 1165 FSCleaner: opts2.Cleaner, 1166 NoSyncOnClose: opts2.NoSyncOnClose, 1167 BytesPerSync: opts2.BytesPerSync, 1168 } 1169 providerSettings.Remote.StorageFactory = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ 1170 "": remote.NewInMem(), 1171 }) 1172 providerSettings.Remote.CreateOnShared = remote.CreateOnSharedAll 1173 providerSettings.Remote.CreateOnSharedLocator = "" 1174 1175 provider2, err := objstorageprovider.Open(providerSettings) 1176 require.NoError(t, err) 1177 creatorIDCounter := uint64(1) 1178 provider2.SetCreatorID(objstorage.CreatorID(creatorIDCounter)) 1179 creatorIDCounter++ 1180 1181 defer func() { 1182 require.NoError(t, d.Close()) 1183 }() 1184 1185 reset := func() { 1186 if d != nil { 1187 require.NoError(t, d.Close()) 1188 } 1189 1190 mem = vfs.NewMem() 1191 require.NoError(t, mem.MkdirAll("ext", 0755)) 1192 opts := &Options{ 1193 FormatMajorVersion: FormatVirtualSSTables, 1194 FS: mem, 1195 L0CompactionThreshold: 100, 1196 L0StopWritesThreshold: 100, 1197 } 1198 opts.Experimental.RemoteStorage = providerSettings.Remote.StorageFactory 1199 opts.Experimental.CreateOnShared = providerSettings.Remote.CreateOnShared 1200 opts.Experimental.CreateOnSharedLocator = providerSettings.Remote.CreateOnSharedLocator 1201 1202 var err error 1203 d, err = Open("", opts) 1204 require.NoError(t, err) 1205 require.NoError(t, d.SetCreatorID(creatorIDCounter)) 1206 creatorIDCounter++ 1207 } 1208 reset() 1209 1210 metaMap := map[base.DiskFileNum]objstorage.ObjectMetadata{} 1211 1212 require.NoError(t, d.Set([]byte("d"), []byte("unexpected"), nil)) 1213 require.NoError(t, d.Set([]byte("e"), []byte("unexpected"), nil)) 1214 require.NoError(t, d.Set([]byte("a"), []byte("unexpected"), nil)) 1215 require.NoError(t, d.Set([]byte("f"), []byte("unexpected"), nil)) 1216 d.Flush() 1217 1218 { 1219 // Create a shared file. 1220 fn := base.FileNum(2) 1221 f, meta, err := provider2.Create(context.TODO(), fileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{PreferSharedStorage: true}) 1222 require.NoError(t, err) 1223 w := sstable.NewWriter(f, d.opts.MakeWriterOptions(0, d.opts.FormatMajorVersion.MaxTableFormat())) 1224 w.Set([]byte("d"), []byte("shared")) 1225 w.Set([]byte("e"), []byte("shared")) 1226 w.Close() 1227 metaMap[fn.DiskFileNum()] = meta 1228 } 1229 1230 m := metaMap[base.FileNum(2).DiskFileNum()] 1231 handle, err := provider2.RemoteObjectBacking(&m) 1232 require.NoError(t, err) 1233 size, err := provider2.Size(m) 1234 require.NoError(t, err) 1235 1236 sharedSSTMeta := SharedSSTMeta{ 1237 Backing: handle, 1238 Smallest: base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet), 1239 Largest: base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet), 1240 SmallestPointKey: base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet), 1241 LargestPointKey: base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet), 1242 Level: 6, 1243 Size: uint64(size + 5), 1244 } 1245 _, err = d.IngestAndExcise([]string{}, []SharedSSTMeta{sharedSSTMeta}, KeyRange{Start: []byte("d"), End: []byte("ee")}) 1246 require.NoError(t, err) 1247 1248 // TODO(bilal): Once reading of shared sstables is in, verify that the values 1249 // of d and e have been updated. 1250 } 1251 1252 type blockedCompaction struct { 1253 startBlock, unblock chan struct{} 1254 } 1255 1256 func TestConcurrentExcise(t *testing.T) { 1257 var d, d1, d2 *DB 1258 var efos map[string]*EventuallyFileOnlySnapshot 1259 backgroundErrs := make(chan error, 5) 1260 var compactions map[string]*blockedCompaction 1261 defer func() { 1262 for _, e := range efos { 1263 require.NoError(t, e.Close()) 1264 } 1265 if d1 != nil { 1266 require.NoError(t, d1.Close()) 1267 } 1268 if d2 != nil { 1269 require.NoError(t, d2.Close()) 1270 } 1271 }() 1272 creatorIDCounter := uint64(1) 1273 replicateCounter := 1 1274 1275 var wg sync.WaitGroup 1276 defer wg.Wait() 1277 var blockNextCompaction bool 1278 var blockedJobID int 1279 var blockedCompactionName string 1280 var blockedCompactionsMu sync.Mutex // protects the above three variables. 1281 1282 reset := func() { 1283 wg.Wait() 1284 for _, e := range efos { 1285 require.NoError(t, e.Close()) 1286 } 1287 if d1 != nil { 1288 require.NoError(t, d1.Close()) 1289 } 1290 if d2 != nil { 1291 require.NoError(t, d2.Close()) 1292 } 1293 efos = make(map[string]*EventuallyFileOnlySnapshot) 1294 compactions = make(map[string]*blockedCompaction) 1295 backgroundErrs = make(chan error, 5) 1296 1297 var el EventListener 1298 el.EnsureDefaults(testLogger{t: t}) 1299 el.FlushBegin = func(info FlushInfo) { 1300 // Don't block flushes 1301 } 1302 el.BackgroundError = func(err error) { 1303 backgroundErrs <- err 1304 } 1305 el.CompactionBegin = func(info CompactionInfo) { 1306 if info.Reason == "move" { 1307 return 1308 } 1309 blockedCompactionsMu.Lock() 1310 defer blockedCompactionsMu.Unlock() 1311 if blockNextCompaction { 1312 blockNextCompaction = false 1313 blockedJobID = info.JobID 1314 } 1315 } 1316 el.TableCreated = func(info TableCreateInfo) { 1317 blockedCompactionsMu.Lock() 1318 if info.JobID != blockedJobID { 1319 blockedCompactionsMu.Unlock() 1320 return 1321 } 1322 blockedJobID = 0 1323 c := compactions[blockedCompactionName] 1324 blockedCompactionName = "" 1325 blockedCompactionsMu.Unlock() 1326 c.startBlock <- struct{}{} 1327 <-c.unblock 1328 } 1329 1330 sstorage := remote.NewInMem() 1331 mem1 := vfs.NewMem() 1332 mem2 := vfs.NewMem() 1333 require.NoError(t, mem1.MkdirAll("ext", 0755)) 1334 require.NoError(t, mem2.MkdirAll("ext", 0755)) 1335 opts1 := &Options{ 1336 Comparer: testkeys.Comparer, 1337 LBaseMaxBytes: 1, 1338 FS: mem1, 1339 L0CompactionThreshold: 100, 1340 L0StopWritesThreshold: 100, 1341 DebugCheck: DebugCheckLevels, 1342 FormatMajorVersion: FormatVirtualSSTables, 1343 } 1344 // lel. 1345 lel := MakeLoggingEventListener(DefaultLogger) 1346 tel := TeeEventListener(lel, el) 1347 opts1.EventListener = &tel 1348 opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ 1349 "": sstorage, 1350 }) 1351 opts1.Experimental.CreateOnShared = remote.CreateOnSharedAll 1352 opts1.Experimental.CreateOnSharedLocator = "" 1353 // Disable automatic compactions because otherwise we'll race with 1354 // delete-only compactions triggered by ingesting range tombstones. 1355 opts1.DisableAutomaticCompactions = true 1356 1357 opts2 := &Options{} 1358 *opts2 = *opts1 1359 opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ 1360 "": sstorage, 1361 }) 1362 opts2.Experimental.CreateOnShared = remote.CreateOnSharedAll 1363 opts2.Experimental.CreateOnSharedLocator = "" 1364 opts2.FS = mem2 1365 1366 var err error 1367 d1, err = Open("", opts1) 1368 require.NoError(t, err) 1369 require.NoError(t, d1.SetCreatorID(creatorIDCounter)) 1370 creatorIDCounter++ 1371 d2, err = Open("", opts2) 1372 require.NoError(t, err) 1373 require.NoError(t, d2.SetCreatorID(creatorIDCounter)) 1374 creatorIDCounter++ 1375 d = d1 1376 } 1377 reset() 1378 1379 datadriven.RunTest(t, "testdata/concurrent_excise", func(t *testing.T, td *datadriven.TestData) string { 1380 switch td.Cmd { 1381 case "reset": 1382 reset() 1383 return "" 1384 case "switch": 1385 if len(td.CmdArgs) != 1 { 1386 return "usage: switch <1 or 2>" 1387 } 1388 switch td.CmdArgs[0].Key { 1389 case "1": 1390 d = d1 1391 case "2": 1392 d = d2 1393 default: 1394 return "usage: switch <1 or 2>" 1395 } 1396 return "ok" 1397 case "batch": 1398 b := d.NewIndexedBatch() 1399 if err := runBatchDefineCmd(td, b); err != nil { 1400 return err.Error() 1401 } 1402 if err := b.Commit(nil); err != nil { 1403 return err.Error() 1404 } 1405 return "" 1406 case "build": 1407 if err := runBuildCmd(td, d, d.opts.FS); err != nil { 1408 return err.Error() 1409 } 1410 return "" 1411 1412 case "flush": 1413 if err := d.Flush(); err != nil { 1414 return err.Error() 1415 } 1416 return "" 1417 1418 case "ingest": 1419 if err := runIngestCmd(td, d, d.opts.FS); err != nil { 1420 return err.Error() 1421 } 1422 // Wait for a possible flush. 1423 d.mu.Lock() 1424 for d.mu.compact.flushing { 1425 d.mu.compact.cond.Wait() 1426 } 1427 d.mu.Unlock() 1428 return "" 1429 1430 case "ingest-and-excise": 1431 if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { 1432 return err.Error() 1433 } 1434 // Wait for a possible flush. 1435 d.mu.Lock() 1436 for d.mu.compact.flushing { 1437 d.mu.compact.cond.Wait() 1438 } 1439 d.mu.Unlock() 1440 return "" 1441 1442 case "replicate": 1443 if len(td.CmdArgs) != 4 { 1444 return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>" 1445 } 1446 var from, to *DB 1447 switch td.CmdArgs[0].Key { 1448 case "1": 1449 from = d1 1450 case "2": 1451 from = d2 1452 default: 1453 return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>" 1454 } 1455 switch td.CmdArgs[1].Key { 1456 case "1": 1457 to = d1 1458 case "2": 1459 to = d2 1460 default: 1461 return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>" 1462 } 1463 startKey := []byte(td.CmdArgs[2].Key) 1464 endKey := []byte(td.CmdArgs[3].Key) 1465 1466 writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat()) 1467 sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter) 1468 f, err := to.opts.FS.Create(sstPath) 1469 require.NoError(t, err) 1470 replicateCounter++ 1471 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts) 1472 1473 var sharedSSTs []SharedSSTMeta 1474 err = from.ScanInternal(context.TODO(), sstable.CategoryAndQoS{}, startKey, endKey, 1475 func(key *InternalKey, value LazyValue, _ IteratorLevel) error { 1476 val, _, err := value.Value(nil) 1477 require.NoError(t, err) 1478 require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val)) 1479 return nil 1480 }, 1481 func(start, end []byte, seqNum uint64) error { 1482 require.NoError(t, w.DeleteRange(start, end)) 1483 return nil 1484 }, 1485 func(start, end []byte, keys []keyspan.Key) error { 1486 s := keyspan.Span{ 1487 Start: start, 1488 End: end, 1489 Keys: keys, 1490 KeysOrder: 0, 1491 } 1492 require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error { 1493 return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v) 1494 })) 1495 return nil 1496 }, 1497 func(sst *SharedSSTMeta) error { 1498 sharedSSTs = append(sharedSSTs, *sst) 1499 return nil 1500 }, 1501 ) 1502 require.NoError(t, err) 1503 require.NoError(t, w.Close()) 1504 1505 _, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey}) 1506 require.NoError(t, err) 1507 return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs)) 1508 1509 case "get": 1510 return runGetCmd(t, td, d) 1511 1512 case "iter": 1513 o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges} 1514 var reader Reader 1515 reader = d 1516 for _, arg := range td.CmdArgs { 1517 switch arg.Key { 1518 case "mask-suffix": 1519 o.RangeKeyMasking.Suffix = []byte(arg.Vals[0]) 1520 case "mask-filter": 1521 o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask { 1522 return sstable.NewTestKeysMaskingFilter() 1523 } 1524 case "snapshot": 1525 reader = efos[arg.Vals[0]] 1526 } 1527 } 1528 iter, err := reader.NewIter(o) 1529 if err != nil { 1530 return err.Error() 1531 } 1532 return runIterCmd(td, iter, true) 1533 1534 case "lsm": 1535 return runLSMCmd(td, d) 1536 1537 case "metrics": 1538 // The asynchronous loading of table stats can change metrics, so 1539 // wait for all the tables' stats to be loaded. 1540 d.mu.Lock() 1541 d.waitTableStats() 1542 d.mu.Unlock() 1543 1544 return d.Metrics().StringForTests() 1545 1546 case "wait-pending-table-stats": 1547 return runTableStatsCmd(td, d) 1548 1549 case "excise": 1550 ve := &versionEdit{ 1551 DeletedFiles: map[deletedFileEntry]*fileMetadata{}, 1552 } 1553 var exciseSpan KeyRange 1554 if len(td.CmdArgs) != 2 { 1555 panic("insufficient args for excise command") 1556 } 1557 exciseSpan.Start = []byte(td.CmdArgs[0].Key) 1558 exciseSpan.End = []byte(td.CmdArgs[1].Key) 1559 1560 d.mu.Lock() 1561 d.mu.versions.logLock() 1562 d.mu.Unlock() 1563 current := d.mu.versions.currentVersion() 1564 for level := range current.Levels { 1565 iter := current.Levels[level].Iter() 1566 for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() { 1567 _, err := d.excise(exciseSpan, m, ve, level) 1568 if err != nil { 1569 d.mu.Lock() 1570 d.mu.versions.logUnlock() 1571 d.mu.Unlock() 1572 return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error()) 1573 } 1574 } 1575 } 1576 d.mu.Lock() 1577 d.mu.versions.logUnlock() 1578 d.mu.Unlock() 1579 return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String()) 1580 1581 case "file-only-snapshot": 1582 if len(td.CmdArgs) != 1 { 1583 panic("insufficient args for file-only-snapshot command") 1584 } 1585 name := td.CmdArgs[0].Key 1586 var keyRanges []KeyRange 1587 for _, line := range strings.Split(td.Input, "\n") { 1588 fields := strings.Fields(line) 1589 if len(fields) != 2 { 1590 return "expected two fields for file-only snapshot KeyRanges" 1591 } 1592 kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])} 1593 keyRanges = append(keyRanges, kr) 1594 } 1595 1596 s := d.NewEventuallyFileOnlySnapshot(keyRanges) 1597 efos[name] = s 1598 return "ok" 1599 1600 case "wait-for-file-only-snapshot": 1601 if len(td.CmdArgs) != 1 { 1602 panic("insufficient args for file-only-snapshot command") 1603 } 1604 name := td.CmdArgs[0].Key 1605 err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond) 1606 if err != nil { 1607 return err.Error() 1608 } 1609 return "ok" 1610 1611 case "unblock": 1612 name := td.CmdArgs[0].Key 1613 blockedCompactionsMu.Lock() 1614 c := compactions[name] 1615 delete(compactions, name) 1616 blockedCompactionsMu.Unlock() 1617 c.unblock <- struct{}{} 1618 return "ok" 1619 1620 case "compact": 1621 async := false 1622 var otherArgs []datadriven.CmdArg 1623 var bc *blockedCompaction 1624 for i := range td.CmdArgs { 1625 switch td.CmdArgs[i].Key { 1626 case "block": 1627 name := td.CmdArgs[i].Vals[0] 1628 bc = &blockedCompaction{startBlock: make(chan struct{}), unblock: make(chan struct{})} 1629 blockedCompactionsMu.Lock() 1630 compactions[name] = bc 1631 blockNextCompaction = true 1632 blockedCompactionName = name 1633 blockedCompactionsMu.Unlock() 1634 async = true 1635 default: 1636 otherArgs = append(otherArgs, td.CmdArgs[i]) 1637 } 1638 } 1639 var tdClone datadriven.TestData 1640 tdClone = *td 1641 tdClone.CmdArgs = otherArgs 1642 if !async { 1643 err := runCompactCmd(td, d) 1644 if err != nil { 1645 return err.Error() 1646 } 1647 } else { 1648 wg.Add(1) 1649 go func() { 1650 defer wg.Done() 1651 _ = runCompactCmd(&tdClone, d) 1652 }() 1653 <-bc.startBlock 1654 return "spun off in separate goroutine" 1655 } 1656 return "ok" 1657 case "wait-for-background-error": 1658 err := <-backgroundErrs 1659 return err.Error() 1660 default: 1661 return fmt.Sprintf("unknown command: %s", td.Cmd) 1662 } 1663 }) 1664 } 1665 1666 func TestIngestExternal(t *testing.T) { 1667 var mem vfs.FS 1668 var d *DB 1669 var flushed bool 1670 defer func() { 1671 require.NoError(t, d.Close()) 1672 }() 1673 1674 var remoteStorage remote.Storage 1675 1676 reset := func() { 1677 if d != nil { 1678 require.NoError(t, d.Close()) 1679 } 1680 1681 mem = vfs.NewMem() 1682 require.NoError(t, mem.MkdirAll("ext", 0755)) 1683 remoteStorage = remote.NewInMem() 1684 opts := &Options{ 1685 FS: mem, 1686 L0CompactionThreshold: 100, 1687 L0StopWritesThreshold: 100, 1688 DebugCheck: DebugCheckLevels, 1689 EventListener: &EventListener{FlushEnd: func(info FlushInfo) { 1690 flushed = true 1691 }}, 1692 FormatMajorVersion: FormatVirtualSSTables, 1693 } 1694 opts.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ 1695 "external-locator": remoteStorage, 1696 }) 1697 opts.Experimental.CreateOnShared = remote.CreateOnSharedNone 1698 // Disable automatic compactions because otherwise we'll race with 1699 // delete-only compactions triggered by ingesting range tombstones. 1700 opts.DisableAutomaticCompactions = true 1701 lel := MakeLoggingEventListener(DefaultLogger) 1702 opts.EventListener = &lel 1703 1704 var err error 1705 d, err = Open("", opts) 1706 require.NoError(t, err) 1707 require.NoError(t, d.SetCreatorID(1)) 1708 } 1709 reset() 1710 1711 datadriven.RunTest(t, "testdata/ingest_external", func(t *testing.T, td *datadriven.TestData) string { 1712 switch td.Cmd { 1713 case "reset": 1714 reset() 1715 return "" 1716 case "batch": 1717 b := d.NewIndexedBatch() 1718 if err := runBatchDefineCmd(td, b); err != nil { 1719 return err.Error() 1720 } 1721 if err := b.Commit(nil); err != nil { 1722 return err.Error() 1723 } 1724 return "" 1725 case "build-remote": 1726 if err := runBuildRemoteCmd(td, d, remoteStorage); err != nil { 1727 return err.Error() 1728 } 1729 return "" 1730 1731 case "flush": 1732 if err := d.Flush(); err != nil { 1733 return err.Error() 1734 } 1735 return "" 1736 1737 case "ingest-external": 1738 flushed = false 1739 if err := runIngestExternalCmd(td, d, "external-locator"); err != nil { 1740 return err.Error() 1741 } 1742 // Wait for a possible flush. 1743 d.mu.Lock() 1744 for d.mu.compact.flushing { 1745 d.mu.compact.cond.Wait() 1746 } 1747 d.mu.Unlock() 1748 if flushed { 1749 return "memtable flushed" 1750 } 1751 return "" 1752 1753 case "download": 1754 if len(td.CmdArgs) != 2 { 1755 panic("insufficient args for download command") 1756 } 1757 l := []byte(td.CmdArgs[0].Key) 1758 r := []byte(td.CmdArgs[1].Key) 1759 spans := []DownloadSpan{{StartKey: l, EndKey: r}} 1760 ctx, cancel := context.WithTimeout(context.TODO(), 1*time.Minute) 1761 defer cancel() 1762 err := d.Download(ctx, spans) 1763 if err != nil { 1764 return err.Error() 1765 } 1766 return "ok" 1767 1768 case "get": 1769 return runGetCmd(t, td, d) 1770 1771 case "iter": 1772 iter, _ := d.NewIter(&IterOptions{ 1773 KeyTypes: IterKeyTypePointsAndRanges, 1774 }) 1775 return runIterCmd(td, iter, true) 1776 1777 case "lsm": 1778 return runLSMCmd(td, d) 1779 1780 case "metrics": 1781 // The asynchronous loading of table stats can change metrics, so 1782 // wait for all the tables' stats to be loaded. 1783 d.mu.Lock() 1784 d.waitTableStats() 1785 d.mu.Unlock() 1786 1787 return d.Metrics().StringForTests() 1788 1789 case "wait-pending-table-stats": 1790 return runTableStatsCmd(td, d) 1791 1792 case "compact": 1793 if len(td.CmdArgs) != 2 { 1794 panic("insufficient args for compact command") 1795 } 1796 l := td.CmdArgs[0].Key 1797 r := td.CmdArgs[1].Key 1798 err := d.Compact([]byte(l), []byte(r), false) 1799 if err != nil { 1800 return err.Error() 1801 } 1802 return "" 1803 default: 1804 return fmt.Sprintf("unknown command: %s", td.Cmd) 1805 } 1806 }) 1807 } 1808 1809 func TestIngestMemtableOverlaps(t *testing.T) { 1810 comparers := []Comparer{ 1811 {Name: "default", Compare: DefaultComparer.Compare, FormatKey: DefaultComparer.FormatKey}, 1812 { 1813 Name: "reverse", 1814 Compare: func(a, b []byte) int { return DefaultComparer.Compare(b, a) }, 1815 FormatKey: DefaultComparer.FormatKey, 1816 }, 1817 } 1818 m := make(map[string]*Comparer) 1819 for i := range comparers { 1820 c := &comparers[i] 1821 m[c.Name] = c 1822 } 1823 1824 for _, comparer := range comparers { 1825 t.Run(comparer.Name, func(t *testing.T) { 1826 var mem *memTable 1827 1828 parseMeta := func(s string) *fileMetadata { 1829 parts := strings.Split(s, "-") 1830 meta := &fileMetadata{} 1831 if len(parts) != 2 { 1832 t.Fatalf("malformed table spec: %s", s) 1833 } 1834 var smallest, largest base.InternalKey 1835 if strings.Contains(parts[0], ".") { 1836 if !strings.Contains(parts[1], ".") { 1837 t.Fatalf("malformed table spec: %s", s) 1838 } 1839 smallest = base.ParseInternalKey(parts[0]) 1840 largest = base.ParseInternalKey(parts[1]) 1841 } else { 1842 smallest = InternalKey{UserKey: []byte(parts[0])} 1843 largest = InternalKey{UserKey: []byte(parts[1])} 1844 } 1845 // If we're using a reverse comparer, flip the file bounds. 1846 if mem.cmp(smallest.UserKey, largest.UserKey) > 0 { 1847 smallest, largest = largest, smallest 1848 } 1849 meta.ExtendPointKeyBounds(comparer.Compare, smallest, largest) 1850 meta.InitPhysicalBacking() 1851 return meta 1852 } 1853 1854 datadriven.RunTest(t, "testdata/ingest_memtable_overlaps", func(t *testing.T, d *datadriven.TestData) string { 1855 switch d.Cmd { 1856 case "define": 1857 b := newBatch(nil) 1858 if err := runBatchDefineCmd(d, b); err != nil { 1859 return err.Error() 1860 } 1861 1862 opts := &Options{ 1863 Comparer: &comparer, 1864 } 1865 opts.EnsureDefaults().WithFSDefaults() 1866 if len(d.CmdArgs) > 1 { 1867 return fmt.Sprintf("%s expects at most 1 argument", d.Cmd) 1868 } 1869 if len(d.CmdArgs) == 1 { 1870 opts.Comparer = m[d.CmdArgs[0].String()] 1871 if opts.Comparer == nil { 1872 return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, d.CmdArgs[0].String()) 1873 } 1874 } 1875 1876 mem = newMemTable(memTableOptions{Options: opts}) 1877 if err := mem.apply(b, 0); err != nil { 1878 return err.Error() 1879 } 1880 return "" 1881 1882 case "overlaps": 1883 var buf bytes.Buffer 1884 for _, data := range strings.Split(d.Input, "\n") { 1885 var keyRanges []internalKeyRange 1886 for _, part := range strings.Fields(data) { 1887 meta := parseMeta(part) 1888 keyRanges = append(keyRanges, internalKeyRange{smallest: meta.Smallest, largest: meta.Largest}) 1889 } 1890 fmt.Fprintf(&buf, "%t\n", ingestMemtableOverlaps(mem.cmp, mem, keyRanges)) 1891 } 1892 return buf.String() 1893 1894 default: 1895 return fmt.Sprintf("unknown command: %s", d.Cmd) 1896 } 1897 }) 1898 }) 1899 } 1900 } 1901 1902 func TestKeyRangeBasic(t *testing.T) { 1903 cmp := base.DefaultComparer.Compare 1904 k1 := KeyRange{Start: []byte("b"), End: []byte("c")} 1905 1906 // Tests for Contains() 1907 require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet))) 1908 require.False(t, k1.Contains(cmp, base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet))) 1909 require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("bb"), 1, InternalKeyKindSet))) 1910 require.True(t, k1.Contains(cmp, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("c")))) 1911 1912 m1 := &fileMetadata{ 1913 Smallest: base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet), 1914 Largest: base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet), 1915 } 1916 require.True(t, k1.Overlaps(cmp, m1)) 1917 m2 := &fileMetadata{ 1918 Smallest: base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet), 1919 Largest: base.MakeInternalKey([]byte("d"), 1, InternalKeyKindSet), 1920 } 1921 require.False(t, k1.Overlaps(cmp, m2)) 1922 m3 := &fileMetadata{ 1923 Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet), 1924 Largest: base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("b")), 1925 } 1926 require.False(t, k1.Overlaps(cmp, m3)) 1927 m4 := &fileMetadata{ 1928 Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet), 1929 Largest: base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet), 1930 } 1931 require.True(t, k1.Overlaps(cmp, m4)) 1932 } 1933 1934 func BenchmarkIngestOverlappingMemtable(b *testing.B) { 1935 assertNoError := func(err error) { 1936 b.Helper() 1937 if err != nil { 1938 b.Fatal(err) 1939 } 1940 } 1941 1942 for count := 1; count < 6; count++ { 1943 b.Run(fmt.Sprintf("memtables=%d", count), func(b *testing.B) { 1944 for i := 0; i < b.N; i++ { 1945 b.StopTimer() 1946 mem := vfs.NewMem() 1947 d, err := Open("", &Options{ 1948 FS: mem, 1949 }) 1950 assertNoError(err) 1951 1952 // Create memtables. 1953 for { 1954 assertNoError(d.Set([]byte("a"), nil, nil)) 1955 d.mu.Lock() 1956 done := len(d.mu.mem.queue) == count 1957 d.mu.Unlock() 1958 if done { 1959 break 1960 } 1961 } 1962 1963 // Create the overlapping sstable that will force a flush when ingested. 1964 f, err := mem.Create("ext") 1965 assertNoError(err) 1966 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 1967 assertNoError(w.Set([]byte("a"), nil)) 1968 assertNoError(w.Close()) 1969 1970 b.StartTimer() 1971 assertNoError(d.Ingest([]string{"ext"})) 1972 } 1973 }) 1974 } 1975 } 1976 1977 func TestIngestTargetLevel(t *testing.T) { 1978 var d *DB 1979 defer func() { 1980 if d != nil { 1981 // Ignore errors because this test defines fake in-progress transactions 1982 // that prohibit clean shutdown. 1983 _ = d.Close() 1984 } 1985 }() 1986 1987 parseMeta := func(s string) *fileMetadata { 1988 var rkey bool 1989 if len(s) >= 4 && s[0:4] == "rkey" { 1990 rkey = true 1991 s = s[5:] 1992 } 1993 parts := strings.Split(s, "-") 1994 if len(parts) != 2 { 1995 t.Fatalf("malformed table spec: %s", s) 1996 } 1997 var m *fileMetadata 1998 if rkey { 1999 m = (&fileMetadata{}).ExtendRangeKeyBounds( 2000 d.cmp, 2001 InternalKey{UserKey: []byte(parts[0])}, 2002 InternalKey{UserKey: []byte(parts[1])}, 2003 ) 2004 } else { 2005 m = (&fileMetadata{}).ExtendPointKeyBounds( 2006 d.cmp, 2007 InternalKey{UserKey: []byte(parts[0])}, 2008 InternalKey{UserKey: []byte(parts[1])}, 2009 ) 2010 } 2011 m.InitPhysicalBacking() 2012 return m 2013 } 2014 2015 datadriven.RunTest(t, "testdata/ingest_target_level", func(t *testing.T, td *datadriven.TestData) string { 2016 switch td.Cmd { 2017 case "define": 2018 if d != nil { 2019 // Ignore errors because this test defines fake in-progress 2020 // transactions that prohibit clean shutdown. 2021 _ = d.Close() 2022 } 2023 2024 var err error 2025 opts := Options{ 2026 FormatMajorVersion: internalFormatNewest, 2027 } 2028 opts.WithFSDefaults() 2029 if d, err = runDBDefineCmd(td, &opts); err != nil { 2030 return err.Error() 2031 } 2032 2033 readState := d.loadReadState() 2034 c := &checkConfig{ 2035 logger: d.opts.Logger, 2036 comparer: d.opts.Comparer, 2037 readState: readState, 2038 newIters: d.newIters, 2039 // TODO: runDBDefineCmd doesn't properly update the visible 2040 // sequence number. So we have to explicitly configure level checker with a very large 2041 // sequence number, otherwise the DB appears empty. 2042 seqNum: InternalKeySeqNumMax, 2043 } 2044 if err := checkLevelsInternal(c); err != nil { 2045 return err.Error() 2046 } 2047 readState.unref() 2048 2049 d.mu.Lock() 2050 s := d.mu.versions.currentVersion().String() 2051 d.mu.Unlock() 2052 return s 2053 2054 case "target": 2055 var buf bytes.Buffer 2056 suggestSplit := false 2057 for _, cmd := range td.CmdArgs { 2058 switch cmd.Key { 2059 case "suggest-split": 2060 suggestSplit = true 2061 } 2062 } 2063 for _, target := range strings.Split(td.Input, "\n") { 2064 meta := parseMeta(target) 2065 level, overlapFile, err := ingestTargetLevel( 2066 d.newIters, d.tableNewRangeKeyIter, IterOptions{logger: d.opts.Logger}, 2067 d.opts.Comparer, d.mu.versions.currentVersion(), 1, d.mu.compact.inProgress, meta, 2068 suggestSplit) 2069 if err != nil { 2070 return err.Error() 2071 } 2072 if overlapFile != nil { 2073 fmt.Fprintf(&buf, "%d (split file: %s)\n", level, overlapFile.FileNum) 2074 } else { 2075 fmt.Fprintf(&buf, "%d\n", level) 2076 } 2077 } 2078 return buf.String() 2079 2080 default: 2081 return fmt.Sprintf("unknown command: %s", td.Cmd) 2082 } 2083 }) 2084 } 2085 2086 func TestIngest(t *testing.T) { 2087 var mem vfs.FS 2088 var d *DB 2089 var flushed bool 2090 defer func() { 2091 require.NoError(t, d.Close()) 2092 }() 2093 2094 reset := func(split bool) { 2095 if d != nil { 2096 require.NoError(t, d.Close()) 2097 } 2098 2099 mem = vfs.NewMem() 2100 require.NoError(t, mem.MkdirAll("ext", 0755)) 2101 opts := &Options{ 2102 FS: mem, 2103 L0CompactionThreshold: 100, 2104 L0StopWritesThreshold: 100, 2105 DebugCheck: DebugCheckLevels, 2106 EventListener: &EventListener{FlushEnd: func(info FlushInfo) { 2107 flushed = true 2108 }}, 2109 FormatMajorVersion: internalFormatNewest, 2110 } 2111 opts.Experimental.IngestSplit = func() bool { 2112 return split 2113 } 2114 // Disable automatic compactions because otherwise we'll race with 2115 // delete-only compactions triggered by ingesting range tombstones. 2116 opts.DisableAutomaticCompactions = true 2117 2118 var err error 2119 d, err = Open("", opts) 2120 require.NoError(t, err) 2121 } 2122 reset(false /* split */) 2123 2124 datadriven.RunTest(t, "testdata/ingest", func(t *testing.T, td *datadriven.TestData) string { 2125 switch td.Cmd { 2126 case "reset": 2127 split := false 2128 for _, cmd := range td.CmdArgs { 2129 switch cmd.Key { 2130 case "enable-split": 2131 split = true 2132 default: 2133 return fmt.Sprintf("unexpected key: %s", cmd.Key) 2134 } 2135 } 2136 reset(split) 2137 return "" 2138 case "batch": 2139 b := d.NewIndexedBatch() 2140 if err := runBatchDefineCmd(td, b); err != nil { 2141 return err.Error() 2142 } 2143 if err := b.Commit(nil); err != nil { 2144 return err.Error() 2145 } 2146 return "" 2147 2148 case "build": 2149 if err := runBuildCmd(td, d, mem); err != nil { 2150 return err.Error() 2151 } 2152 return "" 2153 2154 case "ingest": 2155 flushed = false 2156 if err := runIngestCmd(td, d, mem); err != nil { 2157 return err.Error() 2158 } 2159 // Wait for a possible flush. 2160 d.mu.Lock() 2161 for d.mu.compact.flushing { 2162 d.mu.compact.cond.Wait() 2163 } 2164 d.mu.Unlock() 2165 if flushed { 2166 return "memtable flushed" 2167 } 2168 return "" 2169 2170 case "get": 2171 return runGetCmd(t, td, d) 2172 2173 case "iter": 2174 iter, _ := d.NewIter(&IterOptions{ 2175 KeyTypes: IterKeyTypePointsAndRanges, 2176 }) 2177 return runIterCmd(td, iter, true) 2178 2179 case "lsm": 2180 return runLSMCmd(td, d) 2181 2182 case "metrics": 2183 // The asynchronous loading of table stats can change metrics, so 2184 // wait for all the tables' stats to be loaded. 2185 d.mu.Lock() 2186 d.waitTableStats() 2187 d.mu.Unlock() 2188 2189 return d.Metrics().StringForTests() 2190 2191 case "wait-pending-table-stats": 2192 return runTableStatsCmd(td, d) 2193 2194 case "compact": 2195 if len(td.CmdArgs) != 2 { 2196 panic("insufficient args for compact command") 2197 } 2198 l := td.CmdArgs[0].Key 2199 r := td.CmdArgs[1].Key 2200 err := d.Compact([]byte(l), []byte(r), false) 2201 if err != nil { 2202 return err.Error() 2203 } 2204 return "" 2205 default: 2206 return fmt.Sprintf("unknown command: %s", td.Cmd) 2207 } 2208 }) 2209 } 2210 2211 func TestIngestError(t *testing.T) { 2212 for i := int32(0); ; i++ { 2213 mem := vfs.NewMem() 2214 2215 f0, err := mem.Create("ext0") 2216 require.NoError(t, err) 2217 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f0), sstable.WriterOptions{}) 2218 require.NoError(t, w.Set([]byte("d"), nil)) 2219 require.NoError(t, w.Close()) 2220 f1, err := mem.Create("ext1") 2221 require.NoError(t, err) 2222 w = sstable.NewWriter(objstorageprovider.NewFileWritable(f1), sstable.WriterOptions{}) 2223 require.NoError(t, w.Set([]byte("d"), nil)) 2224 require.NoError(t, w.Close()) 2225 2226 ii := errorfs.OnIndex(-1) 2227 d, err := Open("", &Options{ 2228 FS: errorfs.Wrap(mem, errorfs.ErrInjected.If(ii)), 2229 Logger: panicLogger{}, 2230 L0CompactionThreshold: 8, 2231 }) 2232 require.NoError(t, err) 2233 // Force the creation of an L0 sstable that overlaps with the tables 2234 // we'll attempt to ingest. This ensures that we exercise filesystem 2235 // codepaths when determining the ingest target level. 2236 require.NoError(t, d.Set([]byte("a"), nil, nil)) 2237 require.NoError(t, d.Set([]byte("d"), nil, nil)) 2238 require.NoError(t, d.Flush()) 2239 2240 t.Run(fmt.Sprintf("index-%d", i), func(t *testing.T) { 2241 defer func() { 2242 if r := recover(); r != nil { 2243 if e, ok := r.(error); ok && errors.Is(e, errorfs.ErrInjected) { 2244 return 2245 } 2246 // d.opts.Logger.Fatalf won't propagate ErrInjected 2247 // itself, but should contain the error message. 2248 if strings.HasSuffix(fmt.Sprint(r), errorfs.ErrInjected.Error()) { 2249 return 2250 } 2251 t.Fatal(r) 2252 } 2253 }() 2254 2255 ii.Store(i) 2256 err1 := d.Ingest([]string{"ext0"}) 2257 err2 := d.Ingest([]string{"ext1"}) 2258 err := firstError(err1, err2) 2259 if err != nil && !errors.Is(err, errorfs.ErrInjected) { 2260 t.Fatal(err) 2261 } 2262 }) 2263 2264 // d.Close may error if we failed to flush the manifest. 2265 _ = d.Close() 2266 2267 // If the injector's index is non-negative, the i-th filesystem 2268 // operation was never executed. 2269 if ii.Load() >= 0 { 2270 break 2271 } 2272 } 2273 } 2274 2275 func TestIngestIdempotence(t *testing.T) { 2276 // Use an on-disk filesystem, because Ingest with a MemFS will copy, not 2277 // link the ingested file. 2278 dir, err := os.MkdirTemp("", "ingest-idempotence") 2279 require.NoError(t, err) 2280 defer os.RemoveAll(dir) 2281 fs := vfs.Default 2282 2283 path := fs.PathJoin(dir, "ext") 2284 f, err := fs.Create(fs.PathJoin(dir, "ext")) 2285 require.NoError(t, err) 2286 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2287 require.NoError(t, w.Set([]byte("d"), nil)) 2288 require.NoError(t, w.Close()) 2289 2290 d, err := Open(dir, &Options{ 2291 FS: fs, 2292 }) 2293 require.NoError(t, err) 2294 const count = 4 2295 for i := 0; i < count; i++ { 2296 ingestPath := fs.PathJoin(dir, fmt.Sprintf("ext%d", i)) 2297 require.NoError(t, fs.Link(path, ingestPath)) 2298 require.NoError(t, d.Ingest([]string{ingestPath})) 2299 } 2300 require.NoError(t, d.Close()) 2301 } 2302 2303 func TestIngestCompact(t *testing.T) { 2304 mem := vfs.NewMem() 2305 lel := MakeLoggingEventListener(&base.InMemLogger{}) 2306 d, err := Open("", &Options{ 2307 EventListener: &lel, 2308 FS: mem, 2309 L0CompactionThreshold: 1, 2310 L0StopWritesThreshold: 1, 2311 }) 2312 require.NoError(t, err) 2313 2314 src := func(i int) string { 2315 return fmt.Sprintf("ext%d", i) 2316 } 2317 f, err := mem.Create(src(0)) 2318 require.NoError(t, err) 2319 2320 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2321 key := []byte("a") 2322 require.NoError(t, w.Add(base.MakeInternalKey(key, 0, InternalKeyKindSet), nil)) 2323 require.NoError(t, w.Close()) 2324 2325 // Make N copies of the sstable. 2326 const count = 20 2327 for i := 1; i < count; i++ { 2328 require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i))) 2329 } 2330 2331 // Ingest the same sstable multiple times. Compaction should take place as 2332 // ingestion happens, preventing an indefinite write stall from occurring. 2333 for i := 0; i < count; i++ { 2334 if i == 10 { 2335 // Half-way through the ingestions, set a key in the memtable to force 2336 // overlap with the memtable which will require the memtable to be 2337 // flushed. 2338 require.NoError(t, d.Set(key, nil, nil)) 2339 } 2340 require.NoError(t, d.Ingest([]string{src(i)})) 2341 } 2342 2343 require.NoError(t, d.Close()) 2344 } 2345 2346 func TestConcurrentIngest(t *testing.T) { 2347 mem := vfs.NewMem() 2348 d, err := Open("", &Options{ 2349 FS: mem, 2350 }) 2351 require.NoError(t, err) 2352 2353 // Create an sstable with 2 keys. This is necessary to trigger the overlap 2354 // bug because an sstable with a single key will not have overlap in internal 2355 // key space and the sequence number assignment had already guaranteed 2356 // correct ordering. 2357 src := func(i int) string { 2358 return fmt.Sprintf("ext%d", i) 2359 } 2360 f, err := mem.Create(src(0)) 2361 require.NoError(t, err) 2362 2363 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2364 require.NoError(t, w.Set([]byte("a"), nil)) 2365 require.NoError(t, w.Set([]byte("b"), nil)) 2366 require.NoError(t, w.Close()) 2367 2368 // Make N copies of the sstable. 2369 errCh := make(chan error, 5) 2370 for i := 1; i < cap(errCh); i++ { 2371 require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i))) 2372 } 2373 2374 // Perform N ingestions concurrently. 2375 for i := 0; i < cap(errCh); i++ { 2376 go func(i int) { 2377 err := d.Ingest([]string{src(i)}) 2378 if err == nil { 2379 if _, err = d.opts.FS.Stat(src(i)); oserror.IsNotExist(err) { 2380 err = nil 2381 } 2382 } 2383 errCh <- err 2384 }(i) 2385 } 2386 for i := 0; i < cap(errCh); i++ { 2387 require.NoError(t, <-errCh) 2388 } 2389 2390 require.NoError(t, d.Close()) 2391 } 2392 2393 func TestConcurrentIngestCompact(t *testing.T) { 2394 for i := 0; i < 2; i++ { 2395 t.Run("", func(t *testing.T) { 2396 mem := vfs.NewMem() 2397 compactionReady := make(chan struct{}) 2398 compactionBegin := make(chan struct{}) 2399 d, err := Open("", &Options{ 2400 FS: mem, 2401 EventListener: &EventListener{ 2402 TableCreated: func(info TableCreateInfo) { 2403 if info.Reason == "compacting" { 2404 close(compactionReady) 2405 <-compactionBegin 2406 } 2407 }, 2408 }, 2409 }) 2410 require.NoError(t, err) 2411 2412 ingest := func(keys ...string) { 2413 t.Helper() 2414 f, err := mem.Create("ext") 2415 require.NoError(t, err) 2416 2417 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2418 for _, k := range keys { 2419 require.NoError(t, w.Set([]byte(k), nil)) 2420 } 2421 require.NoError(t, w.Close()) 2422 require.NoError(t, d.Ingest([]string{"ext"})) 2423 } 2424 2425 compact := func(start, end string) { 2426 t.Helper() 2427 require.NoError(t, d.Compact([]byte(start), []byte(end), false)) 2428 } 2429 2430 lsm := func() string { 2431 d.mu.Lock() 2432 s := d.mu.versions.currentVersion().String() 2433 d.mu.Unlock() 2434 return s 2435 } 2436 2437 expectLSM := func(expected string) { 2438 t.Helper() 2439 expected = strings.TrimSpace(expected) 2440 actual := strings.TrimSpace(lsm()) 2441 if expected != actual { 2442 t.Fatalf("expected\n%s\nbut found\n%s", expected, actual) 2443 } 2444 } 2445 2446 ingest("a") 2447 ingest("a") 2448 ingest("c") 2449 ingest("c") 2450 2451 expectLSM(` 2452 0.0: 2453 000005:[a#11,SET-a#11,SET] 2454 000007:[c#13,SET-c#13,SET] 2455 6: 2456 000004:[a#10,SET-a#10,SET] 2457 000006:[c#12,SET-c#12,SET] 2458 `) 2459 2460 // At this point ingestion of an sstable containing only key "b" will be 2461 // targeted at L6. Yet a concurrent compaction of sstables 5 and 7 will 2462 // create a new sstable in L6 spanning ["a"-"c"]. So the ingestion must 2463 // actually target L5. 2464 2465 switch i { 2466 case 0: 2467 // Compact, then ingest. 2468 go func() { 2469 <-compactionReady 2470 2471 ingest("b") 2472 2473 close(compactionBegin) 2474 }() 2475 2476 compact("a", "z") 2477 2478 expectLSM(` 2479 0.0: 2480 000009:[b#14,SET-b#14,SET] 2481 6: 2482 000008:[a#0,SET-c#0,SET] 2483 `) 2484 2485 case 1: 2486 // Ingest, then compact 2487 var wg sync.WaitGroup 2488 wg.Add(1) 2489 go func() { 2490 defer wg.Done() 2491 close(compactionBegin) 2492 compact("a", "z") 2493 }() 2494 2495 ingest("b") 2496 wg.Wait() 2497 2498 // Because we're performing the ingestion and compaction concurrently, 2499 // we can't guarantee any particular LSM structure at this point. The 2500 // test will fail with an assertion error due to overlapping sstables 2501 // if there is insufficient synchronization between ingestion and 2502 // compaction. 2503 } 2504 2505 require.NoError(t, d.Close()) 2506 }) 2507 } 2508 } 2509 2510 func TestIngestFlushQueuedMemTable(t *testing.T) { 2511 // Verify that ingestion forces a flush of a queued memtable. 2512 2513 // Test with a format major version prior to FormatFlushableIngest and one 2514 // after. Both should result in the same statistic calculations. 2515 for _, fmv := range []FormatMajorVersion{FormatFlushableIngest - 1, internalFormatNewest} { 2516 func(fmv FormatMajorVersion) { 2517 mem := vfs.NewMem() 2518 d, err := Open("", &Options{ 2519 FS: mem, 2520 FormatMajorVersion: fmv, 2521 }) 2522 require.NoError(t, err) 2523 2524 // Add the key "a" to the memtable, then fill up the memtable with the key 2525 // "b". The ingested sstable will only overlap with the queued memtable. 2526 require.NoError(t, d.Set([]byte("a"), nil, nil)) 2527 for { 2528 require.NoError(t, d.Set([]byte("b"), nil, nil)) 2529 d.mu.Lock() 2530 done := len(d.mu.mem.queue) == 2 2531 d.mu.Unlock() 2532 if done { 2533 break 2534 } 2535 } 2536 2537 ingest := func(keys ...string) { 2538 t.Helper() 2539 f, err := mem.Create("ext") 2540 require.NoError(t, err) 2541 2542 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ 2543 TableFormat: fmv.MinTableFormat(), 2544 }) 2545 for _, k := range keys { 2546 require.NoError(t, w.Set([]byte(k), nil)) 2547 } 2548 require.NoError(t, w.Close()) 2549 stats, err := d.IngestWithStats([]string{"ext"}) 2550 require.NoError(t, err) 2551 require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes) 2552 require.Equal(t, stats.MemtableOverlappingFiles, 1) 2553 require.Less(t, uint64(0), stats.Bytes) 2554 } 2555 2556 ingest("a") 2557 2558 require.NoError(t, d.Close()) 2559 }(fmv) 2560 } 2561 } 2562 2563 func TestIngestStats(t *testing.T) { 2564 mem := vfs.NewMem() 2565 d, err := Open("", &Options{ 2566 FS: mem, 2567 }) 2568 require.NoError(t, err) 2569 2570 ingest := func(expectedLevel int, keys ...string) { 2571 t.Helper() 2572 f, err := mem.Create("ext") 2573 require.NoError(t, err) 2574 2575 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2576 for _, k := range keys { 2577 require.NoError(t, w.Set([]byte(k), nil)) 2578 } 2579 require.NoError(t, w.Close()) 2580 stats, err := d.IngestWithStats([]string{"ext"}) 2581 require.NoError(t, err) 2582 if expectedLevel == 0 { 2583 require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes) 2584 } else { 2585 require.EqualValues(t, 0, stats.ApproxIngestedIntoL0Bytes) 2586 } 2587 require.Less(t, uint64(0), stats.Bytes) 2588 } 2589 ingest(6, "a") 2590 ingest(0, "a") 2591 ingest(6, "b", "g") 2592 ingest(0, "c") 2593 require.NoError(t, d.Close()) 2594 } 2595 2596 func TestIngestFlushQueuedLargeBatch(t *testing.T) { 2597 // Verify that ingestion forces a flush of a queued large batch. 2598 2599 mem := vfs.NewMem() 2600 d, err := Open("", &Options{ 2601 FS: mem, 2602 }) 2603 require.NoError(t, err) 2604 2605 // The default large batch threshold is slightly less than 1/2 of the 2606 // memtable size which makes triggering a problem with flushing queued large 2607 // batches irritating. Manually adjust the threshold to 1/8 of the memtable 2608 // size in order to more easily create a situation where a large batch is 2609 // queued but not automatically flushed. 2610 d.mu.Lock() 2611 d.largeBatchThreshold = d.opts.MemTableSize / 8 2612 d.mu.Unlock() 2613 2614 // Set a record with a large value. This will be transformed into a large 2615 // batch and placed in the flushable queue. 2616 require.NoError(t, d.Set([]byte("a"), bytes.Repeat([]byte("v"), int(d.largeBatchThreshold)), nil)) 2617 2618 ingest := func(keys ...string) { 2619 t.Helper() 2620 f, err := mem.Create("ext") 2621 require.NoError(t, err) 2622 2623 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2624 for _, k := range keys { 2625 require.NoError(t, w.Set([]byte(k), nil)) 2626 } 2627 require.NoError(t, w.Close()) 2628 require.NoError(t, d.Ingest([]string{"ext"})) 2629 } 2630 2631 ingest("a") 2632 2633 require.NoError(t, d.Close()) 2634 } 2635 2636 func TestIngestMemtablePendingOverlap(t *testing.T) { 2637 mem := vfs.NewMem() 2638 d, err := Open("", &Options{ 2639 FS: mem, 2640 }) 2641 require.NoError(t, err) 2642 2643 d.mu.Lock() 2644 // Use a custom commit pipeline apply function to give us control over 2645 // timing of events. 2646 assignedBatch := make(chan struct{}) 2647 applyBatch := make(chan struct{}) 2648 originalApply := d.commit.env.apply 2649 d.commit.env.apply = func(b *Batch, mem *memTable) error { 2650 assignedBatch <- struct{}{} 2651 applyBatch <- struct{}{} 2652 return originalApply(b, mem) 2653 } 2654 d.mu.Unlock() 2655 2656 ingest := func(keys ...string) { 2657 t.Helper() 2658 f, err := mem.Create("ext") 2659 require.NoError(t, err) 2660 2661 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2662 for _, k := range keys { 2663 require.NoError(t, w.Set([]byte(k), nil)) 2664 } 2665 require.NoError(t, w.Close()) 2666 require.NoError(t, d.Ingest([]string{"ext"})) 2667 } 2668 2669 var wg sync.WaitGroup 2670 wg.Add(2) 2671 2672 // First, Set('c') begins. This call will: 2673 // 2674 // * enqueue the batch to the pending queue. 2675 // * allocate a sequence number `x`. 2676 // * write the batch to the WAL. 2677 // 2678 // and then block until we read from the `applyBatch` channel down below. 2679 go func() { 2680 err := d.Set([]byte("c"), nil, nil) 2681 if err != nil { 2682 t.Error(err) 2683 } 2684 wg.Done() 2685 }() 2686 2687 // When the above Set('c') is ready to apply, it sends on the 2688 // `assignedBatch` channel. Once that happens, we start Ingest('a', 'c'). 2689 // The Ingest('a', 'c') allocates sequence number `x + 1`. 2690 go func() { 2691 // Wait until the Set has grabbed a sequence number before ingesting. 2692 <-assignedBatch 2693 ingest("a", "c") 2694 wg.Done() 2695 }() 2696 2697 // The Set('c')#1 and Ingest('a', 'c')#2 are both pending. To maintain 2698 // sequence number invariants, the Set needs to be applied and flushed 2699 // before the Ingest determines its target level. 2700 // 2701 // Sleep a bit to ensure that the Ingest has time to call into 2702 // AllocateSeqNum. Once it allocates its sequence number, it should see 2703 // that there are unpublished sequence numbers below it and spin until the 2704 // Set's sequence number is published. After sleeping, read from 2705 // `applyBatch` to actually allow the Set to apply and publish its 2706 // sequence number. 2707 time.Sleep(100 * time.Millisecond) 2708 <-applyBatch 2709 2710 // Wait for both calls to complete. 2711 wg.Wait() 2712 require.NoError(t, d.Flush()) 2713 require.NoError(t, d.CheckLevels(nil)) 2714 require.NoError(t, d.Close()) 2715 } 2716 2717 type testLogger struct { 2718 t testing.TB 2719 } 2720 2721 func (l testLogger) Infof(format string, args ...interface{}) { 2722 l.t.Logf(format, args...) 2723 } 2724 2725 func (l testLogger) Errorf(format string, args ...interface{}) { 2726 l.t.Logf(format, args...) 2727 } 2728 2729 func (l testLogger) Fatalf(format string, args ...interface{}) { 2730 l.t.Fatalf(format, args...) 2731 } 2732 2733 // TestIngestMemtableOverlapRace is a regression test for the race described in 2734 // #2196. If an ingest that checks for overlap with the mutable memtable and 2735 // finds no overlap, it must not allow overlapping keys with later sequence 2736 // numbers to be applied to the memtable and the memtable to be flushed before 2737 // the ingest completes. 2738 // 2739 // This test operates by committing the same key concurrently: 2740 // - 1 goroutine repeatedly ingests the same sstable writing the key `foo` 2741 // - n goroutines repeatedly apply batches writing the key `foo` and trigger 2742 // flushes. 2743 // 2744 // After a while, the database is closed and the manifest is verified. Version 2745 // edits should contain new files with monotonically increasing sequence 2746 // numbers, since every flush and every ingest conflicts with one another. 2747 func TestIngestMemtableOverlapRace(t *testing.T) { 2748 mem := vfs.NewMem() 2749 el := MakeLoggingEventListener(testLogger{t: t}) 2750 d, err := Open("", &Options{ 2751 FS: mem, 2752 // Disable automatic compactions to keep the manifest clean; only 2753 // flushes and ingests. 2754 DisableAutomaticCompactions: true, 2755 // Disable the WAL to speed up batch commits. 2756 DisableWAL: true, 2757 EventListener: &el, 2758 // We're endlessly appending to L0 without clearing it, so set a maximal 2759 // stop writes threshold. 2760 L0StopWritesThreshold: math.MaxInt, 2761 // Accumulating more than 1 immutable memtable doesn't help us exercise 2762 // the bug, since the committed keys need to be flushed promptly. 2763 MemTableStopWritesThreshold: 2, 2764 }) 2765 require.NoError(t, err) 2766 2767 // Prepare a sstable `ext` deleting foo. 2768 f, err := mem.Create("ext") 2769 require.NoError(t, err) 2770 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2771 require.NoError(t, w.Delete([]byte("foo"))) 2772 require.NoError(t, w.Close()) 2773 2774 var done atomic.Bool 2775 const numSetters = 2 2776 var wg sync.WaitGroup 2777 wg.Add(numSetters + 1) 2778 2779 untilDone := func(fn func()) { 2780 defer wg.Done() 2781 for !done.Load() { 2782 fn() 2783 } 2784 } 2785 2786 // Ingest in the background. 2787 totalIngests := 0 2788 go untilDone(func() { 2789 filename := fmt.Sprintf("ext%d", totalIngests) 2790 require.NoError(t, mem.Link("ext", filename)) 2791 require.NoError(t, d.Ingest([]string{filename})) 2792 totalIngests++ 2793 }) 2794 2795 // Apply batches and trigger flushes in the background. 2796 wo := &WriteOptions{Sync: false} 2797 var localCommits [numSetters]int 2798 for i := 0; i < numSetters; i++ { 2799 i := i 2800 v := []byte(fmt.Sprintf("v%d", i+1)) 2801 go untilDone(func() { 2802 // Commit a batch setting foo=vN. 2803 b := d.NewBatch() 2804 require.NoError(t, b.Set([]byte("foo"), v, nil)) 2805 require.NoError(t, b.Commit(wo)) 2806 localCommits[i]++ 2807 d.AsyncFlush() 2808 }) 2809 } 2810 time.Sleep(100 * time.Millisecond) 2811 done.Store(true) 2812 wg.Wait() 2813 2814 var totalCommits int 2815 for i := 0; i < numSetters; i++ { 2816 totalCommits += localCommits[i] 2817 } 2818 m := d.Metrics() 2819 tot := m.Total() 2820 t.Logf("Committed %d batches.", totalCommits) 2821 t.Logf("Flushed %d times.", m.Flush.Count) 2822 t.Logf("Ingested %d sstables.", tot.TablesIngested) 2823 require.NoError(t, d.CheckLevels(nil)) 2824 require.NoError(t, d.Close()) 2825 2826 // Replay the manifest. Every flush and ingest is a separate version edit. 2827 // Since they all write the same key and compactions are disabled, sequence 2828 // numbers of new files should be monotonically increasing. 2829 // 2830 // This check is necessary because most of these sstables are ingested into 2831 // L0. The L0 sublevels construction will order them by LargestSeqNum, even 2832 // if they're added to L0 out-of-order. The CheckLevels call at the end of 2833 // the test may find that the sublevels are all appropriately ordered, but 2834 // the manifest may reveal they were added to the LSM out-of-order. 2835 dbDesc, err := Peek("", mem) 2836 require.NoError(t, err) 2837 require.True(t, dbDesc.Exists) 2838 f, err = mem.Open(dbDesc.ManifestFilename) 2839 require.NoError(t, err) 2840 defer f.Close() 2841 rr := record.NewReader(f, 0 /* logNum */) 2842 var largest *fileMetadata 2843 for { 2844 r, err := rr.Next() 2845 if err == io.EOF || err == record.ErrInvalidChunk { 2846 break 2847 } 2848 require.NoError(t, err) 2849 var ve manifest.VersionEdit 2850 require.NoError(t, ve.Decode(r)) 2851 t.Log(ve.String()) 2852 for _, f := range ve.NewFiles { 2853 if largest != nil { 2854 require.Equal(t, 0, f.Level) 2855 if largest.LargestSeqNum > f.Meta.LargestSeqNum { 2856 t.Fatalf("previous largest file %s has sequence number > next file %s", largest, f.Meta) 2857 } 2858 } 2859 largest = f.Meta 2860 } 2861 } 2862 } 2863 2864 type ingestCrashFS struct { 2865 vfs.FS 2866 } 2867 2868 func (fs ingestCrashFS) Link(oldname, newname string) error { 2869 if err := fs.FS.Link(oldname, newname); err != nil { 2870 return err 2871 } 2872 panic(errorfs.ErrInjected) 2873 } 2874 2875 type noRemoveFS struct { 2876 vfs.FS 2877 } 2878 2879 func (fs noRemoveFS) Remove(string) error { 2880 return errorfs.ErrInjected 2881 } 2882 2883 func TestIngestFileNumReuseCrash(t *testing.T) { 2884 const count = 10 2885 // Use an on-disk filesystem, because Ingest with a MemFS will copy, not 2886 // link the ingested file. 2887 dir, err := os.MkdirTemp("", "ingest-filenum-reuse") 2888 require.NoError(t, err) 2889 defer os.RemoveAll(dir) 2890 fs := vfs.Default 2891 2892 readFile := func(s string) []byte { 2893 f, err := fs.Open(fs.PathJoin(dir, s)) 2894 require.NoError(t, err) 2895 b, err := io.ReadAll(f) 2896 require.NoError(t, err) 2897 require.NoError(t, f.Close()) 2898 return b 2899 } 2900 2901 // Create sstables to ingest. 2902 var files []string 2903 var fileBytes [][]byte 2904 for i := 0; i < count; i++ { 2905 name := fmt.Sprintf("ext%d", i) 2906 f, err := fs.Create(fs.PathJoin(dir, name)) 2907 require.NoError(t, err) 2908 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 2909 require.NoError(t, w.Set([]byte(fmt.Sprintf("foo%d", i)), nil)) 2910 require.NoError(t, w.Close()) 2911 files = append(files, name) 2912 fileBytes = append(fileBytes, readFile(name)) 2913 } 2914 2915 // Open a database with a filesystem that will successfully link the 2916 // ingested files but then panic. This is an approximation of what a crash 2917 // after linking but before updating the manifest would look like. 2918 d, err := Open(dir, &Options{ 2919 FS: ingestCrashFS{FS: fs}, 2920 }) 2921 // A flush here ensures the file num bumps from creating OPTIONS files, 2922 // etc get recorded in the manifest. We want the nextFileNum after the 2923 // restart to be the same as one of our ingested sstables. 2924 require.NoError(t, err) 2925 require.NoError(t, d.Set([]byte("boop"), nil, nil)) 2926 require.NoError(t, d.Flush()) 2927 for _, f := range files { 2928 func() { 2929 defer func() { err = recover().(error) }() 2930 err = d.Ingest([]string{fs.PathJoin(dir, f)}) 2931 }() 2932 if err == nil || !errors.Is(err, errorfs.ErrInjected) { 2933 t.Fatalf("expected injected error, got %v", err) 2934 } 2935 } 2936 // Leave something in the WAL so that Open will flush while replaying the 2937 // WAL. 2938 require.NoError(t, d.Set([]byte("wal"), nil, nil)) 2939 require.NoError(t, d.Close()) 2940 2941 // There are now two links to each external file: the original extX link 2942 // and a numbered sstable link. The sstable files are still not a part of 2943 // the manifest and so they may be overwritten. Open will detect the 2944 // obsolete number sstables and try to remove them. The FS here is wrapped 2945 // to induce errors on Remove calls. Even if we're unsuccessful in 2946 // removing the obsolete files, the external files should not be 2947 // overwritten. 2948 d, err = Open(dir, &Options{FS: noRemoveFS{FS: fs}}) 2949 require.NoError(t, err) 2950 require.NoError(t, d.Set([]byte("bar"), nil, nil)) 2951 require.NoError(t, d.Flush()) 2952 require.NoError(t, d.Close()) 2953 2954 // None of the external files should change despite modifying the linked 2955 // versions. 2956 for i, f := range files { 2957 afterBytes := readFile(f) 2958 require.Equal(t, fileBytes[i], afterBytes) 2959 } 2960 } 2961 2962 func TestIngest_UpdateSequenceNumber(t *testing.T) { 2963 mem := vfs.NewMem() 2964 cmp := base.DefaultComparer.Compare 2965 parse := func(input string) (*sstable.Writer, error) { 2966 f, err := mem.Create("ext") 2967 if err != nil { 2968 return nil, err 2969 } 2970 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ 2971 TableFormat: sstable.TableFormatMax, 2972 }) 2973 for _, data := range strings.Split(input, "\n") { 2974 if strings.HasPrefix(data, "rangekey: ") { 2975 data = strings.TrimPrefix(data, "rangekey: ") 2976 s := keyspan.ParseSpan(data) 2977 err := rangekey.Encode(&s, w.AddRangeKey) 2978 if err != nil { 2979 return nil, err 2980 } 2981 continue 2982 } 2983 j := strings.Index(data, ":") 2984 if j < 0 { 2985 return nil, errors.Newf("malformed input: %s\n", data) 2986 } 2987 key := base.ParseInternalKey(data[:j]) 2988 value := []byte(data[j+1:]) 2989 if err := w.Add(key, value); err != nil { 2990 return nil, err 2991 } 2992 } 2993 return w, nil 2994 } 2995 2996 var ( 2997 seqnum uint64 2998 err error 2999 metas []*fileMetadata 3000 ) 3001 datadriven.RunTest(t, "testdata/ingest_update_seqnums", func(t *testing.T, td *datadriven.TestData) string { 3002 switch td.Cmd { 3003 case "starting-seqnum": 3004 seqnum, err = strconv.ParseUint(td.Input, 10, 64) 3005 if err != nil { 3006 return err.Error() 3007 } 3008 return "" 3009 3010 case "reset": 3011 metas = metas[:0] 3012 return "" 3013 3014 case "load": 3015 w, err := parse(td.Input) 3016 if err != nil { 3017 return err.Error() 3018 } 3019 if err = w.Close(); err != nil { 3020 return err.Error() 3021 } 3022 defer w.Close() 3023 3024 // Format the bounds of the table. 3025 wm, err := w.Metadata() 3026 if err != nil { 3027 return err.Error() 3028 } 3029 3030 // Upper bounds for range dels and range keys are expected to be sentinel 3031 // keys. 3032 maybeUpdateUpperBound := func(key base.InternalKey) base.InternalKey { 3033 switch k := key.Kind(); { 3034 case k == base.InternalKeyKindRangeDelete: 3035 key.Trailer = base.InternalKeyRangeDeleteSentinel 3036 case rangekey.IsRangeKey(k): 3037 return base.MakeExclusiveSentinelKey(k, key.UserKey) 3038 } 3039 return key 3040 } 3041 3042 // Construct the file metadata from the writer metadata. 3043 m := &fileMetadata{ 3044 SmallestSeqNum: 0, // Simulate an ingestion. 3045 LargestSeqNum: 0, 3046 } 3047 if wm.HasPointKeys { 3048 m.ExtendPointKeyBounds(cmp, wm.SmallestPoint, wm.LargestPoint) 3049 } 3050 if wm.HasRangeDelKeys { 3051 m.ExtendPointKeyBounds( 3052 cmp, 3053 wm.SmallestRangeDel, 3054 maybeUpdateUpperBound(wm.LargestRangeDel), 3055 ) 3056 } 3057 if wm.HasRangeKeys { 3058 m.ExtendRangeKeyBounds( 3059 cmp, 3060 wm.SmallestRangeKey, 3061 maybeUpdateUpperBound(wm.LargestRangeKey), 3062 ) 3063 } 3064 m.InitPhysicalBacking() 3065 if err := m.Validate(cmp, base.DefaultFormatter); err != nil { 3066 return err.Error() 3067 } 3068 3069 // Collect this file. 3070 metas = append(metas, m) 3071 3072 // Return an index number for the file. 3073 return fmt.Sprintf("file %d\n", len(metas)-1) 3074 3075 case "update-files": 3076 // Update the bounds across all files. 3077 if err = ingestUpdateSeqNum(cmp, base.DefaultFormatter, seqnum, ingestLoadResult{localMeta: metas}); err != nil { 3078 return err.Error() 3079 } 3080 3081 var buf bytes.Buffer 3082 for i, m := range metas { 3083 fmt.Fprintf(&buf, "file %d:\n", i) 3084 fmt.Fprintf(&buf, " combined: %s-%s\n", m.Smallest, m.Largest) 3085 fmt.Fprintf(&buf, " points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey) 3086 fmt.Fprintf(&buf, " ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey) 3087 } 3088 3089 return buf.String() 3090 3091 default: 3092 return fmt.Sprintf("unknown command %s\n", td.Cmd) 3093 } 3094 }) 3095 } 3096 3097 func TestIngestCleanup(t *testing.T) { 3098 fns := []base.FileNum{0, 1, 2} 3099 3100 testCases := []struct { 3101 closeFiles []base.FileNum 3102 cleanupFiles []base.FileNum 3103 wantErr string 3104 }{ 3105 // Close and remove all files. 3106 { 3107 closeFiles: fns, 3108 cleanupFiles: fns, 3109 }, 3110 // Remove a non-existent file. 3111 { 3112 closeFiles: fns, 3113 cleanupFiles: []base.FileNum{3}, 3114 wantErr: "unknown to the objstorage provider", 3115 }, 3116 // Remove a file that has not been closed. 3117 { 3118 closeFiles: []base.FileNum{0, 2}, 3119 cleanupFiles: fns, 3120 wantErr: oserror.ErrInvalid.Error(), 3121 }, 3122 // Remove all files, one of which is still open, plus a file that does not exist. 3123 { 3124 closeFiles: []base.FileNum{0, 2}, 3125 cleanupFiles: []base.FileNum{0, 1, 2, 3}, 3126 wantErr: oserror.ErrInvalid.Error(), // The first error encountered is due to the open file. 3127 }, 3128 } 3129 3130 for _, tc := range testCases { 3131 t.Run("", func(t *testing.T) { 3132 mem := vfs.NewMem() 3133 mem.UseWindowsSemantics(true) 3134 objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(mem, "")) 3135 require.NoError(t, err) 3136 defer objProvider.Close() 3137 3138 // Create the files in the VFS. 3139 metaMap := make(map[base.FileNum]objstorage.Writable) 3140 for _, fn := range fns { 3141 w, _, err := objProvider.Create(context.Background(), base.FileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{}) 3142 require.NoError(t, err) 3143 3144 metaMap[fn] = w 3145 } 3146 3147 // Close a select number of files. 3148 for _, m := range tc.closeFiles { 3149 w, ok := metaMap[m] 3150 if !ok { 3151 continue 3152 } 3153 require.NoError(t, w.Finish()) 3154 } 3155 3156 // Cleanup the set of files in the FS. 3157 var toRemove []*fileMetadata 3158 for _, fn := range tc.cleanupFiles { 3159 m := &fileMetadata{FileNum: fn} 3160 m.InitPhysicalBacking() 3161 toRemove = append(toRemove, m) 3162 } 3163 3164 err = ingestCleanup(objProvider, toRemove) 3165 if tc.wantErr != "" { 3166 require.Error(t, err, "got no error, expected %s", tc.wantErr) 3167 require.Contains(t, err.Error(), tc.wantErr) 3168 } else { 3169 require.NoError(t, err) 3170 } 3171 }) 3172 } 3173 } 3174 3175 // fatalCapturingLogger captures a fatal error instead of panicking. 3176 type fatalCapturingLogger struct { 3177 t testing.TB 3178 err error 3179 } 3180 3181 // Infof implements the Logger interface. 3182 func (l *fatalCapturingLogger) Infof(fmt string, args ...interface{}) { 3183 l.t.Logf(fmt, args...) 3184 } 3185 3186 // Errorf implements the Logger interface. 3187 func (l *fatalCapturingLogger) Errorf(fmt string, args ...interface{}) { 3188 l.t.Logf(fmt, args...) 3189 } 3190 3191 // Fatalf implements the Logger interface. 3192 func (l *fatalCapturingLogger) Fatalf(_ string, args ...interface{}) { 3193 l.err = args[0].(error) 3194 } 3195 3196 func TestIngestValidation(t *testing.T) { 3197 type keyVal struct { 3198 key, val []byte 3199 } 3200 // The corruptionLocation enum defines where to corrupt an sstable if 3201 // anywhere. corruptionLocation{Start,End} describe the start and end 3202 // data blocks. corruptionLocationInternal describes a random data block 3203 // that's neither the start or end blocks. The Ingest operation does not 3204 // read the entire sstable, only the start and end blocks, so corruption 3205 // introduced using corruptionLocationInternal will not be discovered until 3206 // the asynchronous validation job runs. 3207 type corruptionLocation int 3208 const ( 3209 corruptionLocationNone corruptionLocation = iota 3210 corruptionLocationStart 3211 corruptionLocationEnd 3212 corruptionLocationInternal 3213 ) 3214 // The errReportLocation type defines an enum to allow tests to enforce 3215 // expectations about how an error surfaced during ingestion or validation 3216 // is reported. Asynchronous validation that uncovers corruption should call 3217 // Fatalf on the Logger. Asychronous validation that encounters 3218 // non-corruption errors should surface it through the 3219 // EventListener.BackgroundError func. 3220 type errReportLocation int 3221 const ( 3222 errReportLocationNone errReportLocation = iota 3223 errReportLocationIngest 3224 errReportLocationFatal 3225 errReportLocationBackgroundError 3226 ) 3227 const ( 3228 nKeys = 1_000 3229 keySize = 16 3230 valSize = 100 3231 blockSize = 100 3232 3233 ingestTableName = "ext" 3234 ) 3235 3236 seed := uint64(time.Now().UnixNano()) 3237 rng := rand.New(rand.NewSource(seed)) 3238 t.Logf("rng seed = %d", seed) 3239 3240 // errfsCounter is used by test cases that make use of an errorfs.Injector 3241 // to inject errors into the ingest validation code path. 3242 var errfsCounter atomic.Int32 3243 testCases := []struct { 3244 description string 3245 cLoc corruptionLocation 3246 wantErrType errReportLocation 3247 wantErr error 3248 errorfsInjector errorfs.Injector 3249 }{ 3250 { 3251 description: "no corruption", 3252 cLoc: corruptionLocationNone, 3253 wantErrType: errReportLocationNone, 3254 }, 3255 { 3256 description: "start block", 3257 cLoc: corruptionLocationStart, 3258 wantErr: ErrCorruption, 3259 wantErrType: errReportLocationIngest, 3260 }, 3261 { 3262 description: "end block", 3263 cLoc: corruptionLocationEnd, 3264 wantErr: ErrCorruption, 3265 wantErrType: errReportLocationIngest, 3266 }, 3267 { 3268 description: "non-end block", 3269 cLoc: corruptionLocationInternal, 3270 wantErr: ErrCorruption, 3271 wantErrType: errReportLocationFatal, 3272 }, 3273 { 3274 description: "non-corruption error", 3275 cLoc: corruptionLocationNone, 3276 wantErr: errorfs.ErrInjected, 3277 wantErrType: errReportLocationBackgroundError, 3278 errorfsInjector: errorfs.InjectorFunc(func(op errorfs.Op) error { 3279 // Inject an error on the first read-at operation on an sstable 3280 // (excluding the read on the sstable before ingestion has 3281 // linked it in). 3282 if op.Path != "ext" && op.Kind != errorfs.OpFileReadAt || filepath.Ext(op.Path) != ".sst" { 3283 return nil 3284 } 3285 if errfsCounter.Add(1) == 1 { 3286 return errorfs.ErrInjected 3287 } 3288 return nil 3289 }), 3290 }, 3291 } 3292 3293 for _, tc := range testCases { 3294 t.Run(tc.description, func(t *testing.T) { 3295 errfsCounter.Store(0) 3296 var wg sync.WaitGroup 3297 wg.Add(1) 3298 3299 fs := vfs.NewMem() 3300 var testFS vfs.FS = fs 3301 if tc.errorfsInjector != nil { 3302 testFS = errorfs.Wrap(fs, tc.errorfsInjector) 3303 } 3304 3305 // backgroundErr is populated by EventListener.BackgroundError. 3306 var backgroundErr error 3307 logger := &fatalCapturingLogger{t: t} 3308 opts := &Options{ 3309 FS: testFS, 3310 Logger: logger, 3311 EventListener: &EventListener{ 3312 TableValidated: func(i TableValidatedInfo) { 3313 wg.Done() 3314 }, 3315 BackgroundError: func(err error) { 3316 backgroundErr = err 3317 }, 3318 }, 3319 } 3320 // Disable table stats so that injected errors can't be accidentally 3321 // injected into the table stats collector read, and so the table 3322 // stats collector won't prime the table+block cache such that the 3323 // error injection won't trigger at all during ingest validation. 3324 opts.private.disableTableStats = true 3325 opts.Experimental.ValidateOnIngest = true 3326 d, err := Open("", opts) 3327 require.NoError(t, err) 3328 defer func() { require.NoError(t, d.Close()) }() 3329 3330 corrupt := func(f vfs.File) { 3331 readable, err := sstable.NewSimpleReadable(f) 3332 require.NoError(t, err) 3333 // Compute the layout of the sstable in order to find the 3334 // appropriate block locations to corrupt. 3335 r, err := sstable.NewReader(readable, sstable.ReaderOptions{}) 3336 require.NoError(t, err) 3337 l, err := r.Layout() 3338 require.NoError(t, err) 3339 3340 // Select an appropriate data block to corrupt. 3341 var blockIdx int 3342 switch tc.cLoc { 3343 case corruptionLocationStart: 3344 blockIdx = 0 3345 case corruptionLocationEnd: 3346 blockIdx = len(l.Data) - 1 3347 case corruptionLocationInternal: 3348 blockIdx = 1 + rng.Intn(len(l.Data)-2) 3349 default: 3350 t.Fatalf("unknown corruptionLocation: %T", tc.cLoc) 3351 } 3352 bh := l.Data[blockIdx] 3353 3354 // Corrupting a key will cause the ingestion to fail due to a 3355 // malformed key, rather than a block checksum mismatch. 3356 // Instead, we corrupt the last byte in the selected block, 3357 // before the trailer, which corresponds to a value. 3358 offset := bh.Offset + bh.Length - 1 3359 _, err = f.WriteAt([]byte("\xff"), int64(offset)) 3360 require.NoError(t, err) 3361 require.NoError(t, r.Close()) 3362 } 3363 3364 type errT struct { 3365 errLoc errReportLocation 3366 err error 3367 } 3368 runIngest := func(keyVals []keyVal) (et errT) { 3369 f, err := fs.Create(ingestTableName) 3370 require.NoError(t, err) 3371 defer func() { _ = fs.Remove(ingestTableName) }() 3372 3373 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ 3374 BlockSize: blockSize, // Create many smaller blocks. 3375 Compression: NoCompression, // For simpler debugging. 3376 }) 3377 for _, kv := range keyVals { 3378 require.NoError(t, w.Set(kv.key, kv.val)) 3379 } 3380 require.NoError(t, w.Close()) 3381 3382 // Possibly corrupt the file. 3383 if tc.cLoc != corruptionLocationNone { 3384 f, err = fs.OpenReadWrite(ingestTableName) 3385 require.NoError(t, err) 3386 corrupt(f) 3387 } 3388 3389 // Ingest the external table. 3390 err = d.Ingest([]string{ingestTableName}) 3391 if err != nil { 3392 et.errLoc = errReportLocationIngest 3393 et.err = err 3394 return 3395 } 3396 3397 // Wait for the validation on the sstable to complete. 3398 wg.Wait() 3399 3400 // Return any error encountered during validation. 3401 if logger.err != nil { 3402 et.errLoc = errReportLocationFatal 3403 et.err = logger.err 3404 } else if backgroundErr != nil { 3405 et.errLoc = errReportLocationBackgroundError 3406 et.err = backgroundErr 3407 } 3408 return 3409 } 3410 3411 // Construct a set of keys to ingest. 3412 var keyVals []keyVal 3413 for i := 0; i < nKeys; i++ { 3414 key := make([]byte, keySize) 3415 _, err = rng.Read(key) 3416 require.NoError(t, err) 3417 3418 val := make([]byte, valSize) 3419 _, err = rng.Read(val) 3420 require.NoError(t, err) 3421 3422 keyVals = append(keyVals, keyVal{key, val}) 3423 } 3424 3425 // Keys must be sorted. 3426 slices.SortFunc(keyVals, func(a, b keyVal) int { return d.cmp(a.key, b.key) }) 3427 3428 // Run the ingestion. 3429 et := runIngest(keyVals) 3430 3431 // Assert we saw the errors we expect. 3432 switch tc.wantErrType { 3433 case errReportLocationNone: 3434 require.Equal(t, errReportLocationNone, et.errLoc) 3435 require.NoError(t, et.err) 3436 case errReportLocationIngest: 3437 require.Equal(t, errReportLocationIngest, et.errLoc) 3438 require.Error(t, et.err) 3439 require.True(t, errors.Is(et.err, tc.wantErr)) 3440 case errReportLocationFatal: 3441 require.Equal(t, errReportLocationFatal, et.errLoc) 3442 require.Error(t, et.err) 3443 require.True(t, errors.Is(et.err, tc.wantErr)) 3444 case errReportLocationBackgroundError: 3445 require.Equal(t, errReportLocationBackgroundError, et.errLoc) 3446 require.Error(t, et.err) 3447 require.True(t, errors.Is(et.err, tc.wantErr)) 3448 default: 3449 t.Fatalf("unknown wantErrType %T", tc.wantErrType) 3450 } 3451 }) 3452 } 3453 } 3454 3455 // BenchmarkManySSTables measures the cost of various operations with various 3456 // counts of SSTables within the database. 3457 func BenchmarkManySSTables(b *testing.B) { 3458 counts := []int{10, 1_000, 10_000, 100_000, 1_000_000} 3459 ops := []string{"ingest", "calculateInuseKeyRanges"} 3460 for _, op := range ops { 3461 b.Run(op, func(b *testing.B) { 3462 for _, count := range counts { 3463 b.Run(fmt.Sprintf("sstables=%d", count), func(b *testing.B) { 3464 mem := vfs.NewMem() 3465 d, err := Open("", &Options{ 3466 FS: mem, 3467 }) 3468 require.NoError(b, err) 3469 3470 var paths []string 3471 for i := 0; i < count; i++ { 3472 n := fmt.Sprintf("%07d", i) 3473 f, err := mem.Create(n) 3474 require.NoError(b, err) 3475 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 3476 require.NoError(b, w.Set([]byte(n), nil)) 3477 require.NoError(b, w.Close()) 3478 paths = append(paths, n) 3479 } 3480 require.NoError(b, d.Ingest(paths)) 3481 3482 { 3483 const broadIngest = "broad.sst" 3484 f, err := mem.Create(broadIngest) 3485 require.NoError(b, err) 3486 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 3487 require.NoError(b, w.Set([]byte("0"), nil)) 3488 require.NoError(b, w.Set([]byte("Z"), nil)) 3489 require.NoError(b, w.Close()) 3490 require.NoError(b, d.Ingest([]string{broadIngest})) 3491 } 3492 3493 switch op { 3494 case "ingest": 3495 runBenchmarkManySSTablesIngest(b, d, mem, count) 3496 case "calculateInuseKeyRanges": 3497 runBenchmarkManySSTablesInUseKeyRanges(b, d, count) 3498 } 3499 require.NoError(b, d.Close()) 3500 }) 3501 } 3502 }) 3503 } 3504 } 3505 3506 func runBenchmarkManySSTablesIngest(b *testing.B, d *DB, fs vfs.FS, count int) { 3507 b.ResetTimer() 3508 for i := 0; i < b.N; i++ { 3509 n := fmt.Sprintf("%07d", count+i) 3510 f, err := fs.Create(n) 3511 require.NoError(b, err) 3512 w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) 3513 require.NoError(b, w.Set([]byte(n), nil)) 3514 require.NoError(b, w.Close()) 3515 require.NoError(b, d.Ingest([]string{n})) 3516 } 3517 } 3518 3519 func runBenchmarkManySSTablesInUseKeyRanges(b *testing.B, d *DB, count int) { 3520 // This benchmark is pretty contrived, but it's not easy to write a 3521 // microbenchmark for this in a more natural way. L6 has many files, and 3522 // L5 has 1 file spanning the entire breadth of L5. 3523 d.mu.Lock() 3524 defer d.mu.Unlock() 3525 v := d.mu.versions.currentVersion() 3526 b.ResetTimer() 3527 3528 smallest := []byte("0") 3529 largest := []byte("z") 3530 for i := 0; i < b.N; i++ { 3531 _ = calculateInuseKeyRanges(v, d.cmp, 0, numLevels-1, smallest, largest) 3532 } 3533 }