github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/ingest_test.go

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  	"os"
    14  	"path/filepath"
    15  	"slices"
    16  	"sort"
    17  	"strconv"
    18  	"strings"
    19  	"sync"
    20  	"sync/atomic"
    21  	"testing"
    22  	"time"
    23  
    24  	"github.com/cockroachdb/datadriven"
    25  	"github.com/cockroachdb/errors"
    26  	"github.com/cockroachdb/errors/oserror"
    27  	"github.com/cockroachdb/pebble/internal/base"
    28  	"github.com/cockroachdb/pebble/internal/keyspan"
    29  	"github.com/cockroachdb/pebble/internal/manifest"
    30  	"github.com/cockroachdb/pebble/internal/rangekey"
    31  	"github.com/cockroachdb/pebble/internal/testkeys"
    32  	"github.com/cockroachdb/pebble/objstorage"
    33  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
    34  	"github.com/cockroachdb/pebble/objstorage/remote"
    35  	"github.com/cockroachdb/pebble/record"
    36  	"github.com/cockroachdb/pebble/sstable"
    37  	"github.com/cockroachdb/pebble/vfs"
    38  	"github.com/cockroachdb/pebble/vfs/errorfs"
    39  	"github.com/kr/pretty"
    40  	"github.com/stretchr/testify/require"
    41  	"golang.org/x/exp/rand"
    42  )
    43  
    44  func TestSSTableKeyCompare(t *testing.T) {
    45  	var buf bytes.Buffer
    46  	datadriven.RunTest(t, "testdata/sstable_key_compare", func(t *testing.T, td *datadriven.TestData) string {
    47  		switch td.Cmd {
    48  		case "cmp":
    49  			buf.Reset()
    50  			for _, line := range strings.Split(td.Input, "\n") {
    51  				fields := strings.Fields(line)
    52  				a := base.ParseInternalKey(fields[0])
    53  				b := base.ParseInternalKey(fields[1])
    54  				got := sstableKeyCompare(testkeys.Comparer.Compare, a, b)
    55  				fmt.Fprintf(&buf, "%38s", fmt.Sprint(a.Pretty(base.DefaultFormatter)))
    56  				switch got {
    57  				case -1:
    58  					fmt.Fprint(&buf, " < ")
    59  				case +1:
    60  					fmt.Fprint(&buf, " > ")
    61  				case 0:
    62  					fmt.Fprint(&buf, " = ")
    63  				}
    64  				fmt.Fprintf(&buf, "%s\n", fmt.Sprint(b.Pretty(base.DefaultFormatter)))
    65  			}
    66  			return buf.String()
    67  		default:
    68  			return fmt.Sprintf("unrecognized command %q", td.Cmd)
    69  		}
    70  	})
    71  }
    72  
    73  func TestIngestLoad(t *testing.T) {
    74  	mem := vfs.NewMem()
    75  
    76  	datadriven.RunTest(t, "testdata/ingest_load", func(t *testing.T, td *datadriven.TestData) string {
    77  		switch td.Cmd {
    78  		case "load":
    79  			writerOpts := sstable.WriterOptions{}
    80  			var dbVersion FormatMajorVersion
    81  			for _, cmdArgs := range td.CmdArgs {
    82  				v, err := strconv.Atoi(cmdArgs.Vals[0])
    83  				if err != nil {
    84  					return err.Error()
    85  				}
    86  				switch k := cmdArgs.Key; k {
    87  				case "writer-version":
    88  					fmv := FormatMajorVersion(v)
    89  					writerOpts.TableFormat = fmv.MaxTableFormat()
    90  				case "db-version":
    91  					dbVersion = FormatMajorVersion(v)
    92  				default:
    93  					return fmt.Sprintf("unknown cmd %s\n", k)
    94  				}
    95  			}
    96  			f, err := mem.Create("ext")
    97  			if err != nil {
    98  				return err.Error()
    99  			}
   100  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writerOpts)
   101  			for _, data := range strings.Split(td.Input, "\n") {
   102  				if strings.HasPrefix(data, "rangekey: ") {
   103  					data = strings.TrimPrefix(data, "rangekey: ")
   104  					s := keyspan.ParseSpan(data)
   105  					err := rangekey.Encode(&s, w.AddRangeKey)
   106  					if err != nil {
   107  						return err.Error()
   108  					}
   109  					continue
   110  				}
   111  
   112  				j := strings.Index(data, ":")
   113  				if j < 0 {
   114  					return fmt.Sprintf("malformed input: %s\n", data)
   115  				}
   116  				key := base.ParseInternalKey(data[:j])
   117  				value := []byte(data[j+1:])
   118  				if err := w.Add(key, value); err != nil {
   119  					return err.Error()
   120  				}
   121  			}
   122  			if err := w.Close(); err != nil {
   123  				return err.Error()
   124  			}
   125  
   126  			opts := (&Options{
   127  				Comparer: DefaultComparer,
   128  				FS:       mem,
   129  			}).WithFSDefaults()
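         			// ingestLoad opens each listed file as an sstable and derives the
         			// fileMetadata (file number, point/range key bounds) that an
         			// ingestion would use; it does not link the file or modify the DB.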
   130  			lr, err := ingestLoad(opts, dbVersion, []string{"ext"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0)
   131  			if err != nil {
   132  				return err.Error()
   133  			}
   134  			var buf bytes.Buffer
   135  			for _, m := range lr.localMeta {
   136  				fmt.Fprintf(&buf, "%d: %s-%s\n", m.FileNum, m.Smallest, m.Largest)
   137  				fmt.Fprintf(&buf, "  points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey)
   138  				fmt.Fprintf(&buf, "  ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey)
   139  			}
   140  			return buf.String()
   141  
   142  		default:
   143  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   144  		}
   145  	})
   146  }
   147  
   148  func TestIngestLoadRand(t *testing.T) {
   149  	mem := vfs.NewMem()
   150  	rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
   151  	cmp := DefaultComparer.Compare
   152  	version := internalFormatNewest
   153  
   154  	randBytes := func(size int) []byte {
   155  		data := make([]byte, size)
   156  		for i := range data {
   157  			data[i] = byte(rng.Int() & 0xff)
   158  		}
   159  		return data
   160  	}
   161  
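         	// Build a random set of sstables and, alongside each one, the
         	// fileMetadata (bounds, entry count, size) we expect ingestLoad to
         	// derive for it, so the two can be compared at the end of the test.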
   162  	paths := make([]string, 1+rng.Intn(10))
   163  	pending := make([]base.DiskFileNum, len(paths))
   164  	expected := make([]*fileMetadata, len(paths))
   165  	for i := range paths {
   166  		paths[i] = fmt.Sprint(i)
   167  		pending[i] = base.FileNum(rng.Uint64()).DiskFileNum()
   168  		expected[i] = &fileMetadata{
   169  			FileNum: pending[i].FileNum(),
   170  		}
   171  		expected[i].StatsMarkValid()
   172  
   173  		func() {
   174  			f, err := mem.Create(paths[i])
   175  			require.NoError(t, err)
   176  
   177  			keys := make([]InternalKey, 1+rng.Intn(100))
   178  			for i := range keys {
   179  				keys[i] = base.MakeInternalKey(
   180  					randBytes(1+rng.Intn(10)),
   181  					0,
   182  					InternalKeyKindSet)
   183  			}
   184  			slices.SortFunc(keys, func(a, b base.InternalKey) int {
   185  				return base.InternalCompare(cmp, a, b)
   186  			})
   187  
   188  			expected[i].ExtendPointKeyBounds(cmp, keys[0], keys[len(keys)-1])
   189  
   190  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
   191  				TableFormat: version.MaxTableFormat(),
   192  			})
   193  			var count uint64
   194  			for i := range keys {
   195  				if i > 0 && base.InternalCompare(cmp, keys[i-1], keys[i]) == 0 {
   196  					// Duplicate key, ignore.
   197  					continue
   198  				}
    199  				require.NoError(t, w.Add(keys[i], nil))
   200  				count++
   201  			}
   202  			expected[i].Stats.NumEntries = count
   203  			require.NoError(t, w.Close())
   204  
   205  			meta, err := w.Metadata()
   206  			require.NoError(t, err)
   207  
   208  			expected[i].Size = meta.Size
   209  			expected[i].InitPhysicalBacking()
   210  		}()
   211  	}
   212  
   213  	opts := (&Options{
   214  		Comparer: DefaultComparer,
   215  		FS:       mem,
   216  	}).WithFSDefaults()
   217  	lr, err := ingestLoad(opts, version, paths, nil, nil, 0, pending, nil, 0)
   218  	require.NoError(t, err)
   219  
   220  	for _, m := range lr.localMeta {
   221  		m.CreationTime = 0
   222  	}
   223  	t.Log(strings.Join(pretty.Diff(expected, lr.localMeta), "\n"))
   224  	require.Equal(t, expected, lr.localMeta)
   225  }
   226  
   227  func TestIngestLoadInvalid(t *testing.T) {
   228  	mem := vfs.NewMem()
   229  	f, err := mem.Create("invalid")
   230  	require.NoError(t, err)
   231  	require.NoError(t, f.Close())
   232  
   233  	opts := (&Options{
   234  		Comparer: DefaultComparer,
   235  		FS:       mem,
   236  	}).WithFSDefaults()
   237  	if _, err := ingestLoad(opts, internalFormatNewest, []string{"invalid"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0); err == nil {
   238  		t.Fatalf("expected error, but found success")
   239  	}
   240  }
   241  
   242  func TestIngestSortAndVerify(t *testing.T) {
   243  	comparers := map[string]Compare{
   244  		"default": DefaultComparer.Compare,
   245  		"reverse": func(a, b []byte) int {
   246  			return DefaultComparer.Compare(b, a)
   247  		},
   248  	}
   249  
   250  	t.Run("", func(t *testing.T) {
   251  		datadriven.RunTest(t, "testdata/ingest_sort_and_verify", func(t *testing.T, d *datadriven.TestData) string {
   252  			switch d.Cmd {
   253  			case "ingest":
   254  				var buf bytes.Buffer
   255  				var meta []*fileMetadata
   256  				var paths []string
   257  				var cmpName string
   258  				d.ScanArgs(t, "cmp", &cmpName)
   259  				cmp := comparers[cmpName]
   260  				if cmp == nil {
   261  					return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, cmpName)
   262  				}
   263  				for i, data := range strings.Split(d.Input, "\n") {
   264  					parts := strings.Split(data, "-")
   265  					if len(parts) != 2 {
   266  						return fmt.Sprintf("malformed test case: %s", d.Input)
   267  					}
   268  					smallest := base.ParseInternalKey(parts[0])
   269  					largest := base.ParseInternalKey(parts[1])
   270  					if cmp(smallest.UserKey, largest.UserKey) > 0 {
   271  						return fmt.Sprintf("range %v-%v is not valid", smallest, largest)
   272  					}
   273  					m := (&fileMetadata{}).ExtendPointKeyBounds(cmp, smallest, largest)
   274  					m.InitPhysicalBacking()
   275  					meta = append(meta, m)
   276  					paths = append(paths, strconv.Itoa(i))
   277  				}
   278  				lr := ingestLoadResult{localPaths: paths, localMeta: meta}
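         				// ingestSortAndVerify sorts the files by their smallest key and
         				// returns an error if any of the ingested key ranges overlap.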
   279  				err := ingestSortAndVerify(cmp, lr, KeyRange{})
   280  				if err != nil {
   281  					return fmt.Sprintf("%v\n", err)
   282  				}
   283  				for i := range meta {
   284  					fmt.Fprintf(&buf, "%s: %v-%v\n", paths[i], meta[i].Smallest, meta[i].Largest)
   285  				}
   286  				return buf.String()
   287  
   288  			default:
   289  				return fmt.Sprintf("unknown command: %s", d.Cmd)
   290  			}
   291  		})
   292  	})
   293  }
   294  
   295  func TestIngestLink(t *testing.T) {
   296  	// Test linking of tables into the DB directory. Test cleanup when one of the
   297  	// tables cannot be linked.
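         	// Iteration i removes the i-th source file before linking so that
         	// ingestLink fails partway through and must clean up after itself; the
         	// final iteration (i == count) removes nothing and must succeed.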
   298  
   299  	const dir = "db"
   300  	const count = 10
   301  	for i := 0; i <= count; i++ {
   302  		t.Run("", func(t *testing.T) {
   303  			opts := &Options{FS: vfs.NewMem()}
   304  			opts.EnsureDefaults().WithFSDefaults()
   305  			require.NoError(t, opts.FS.MkdirAll(dir, 0755))
   306  			objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(opts.FS, dir))
   307  			require.NoError(t, err)
   308  			defer objProvider.Close()
   309  
   310  			paths := make([]string, 10)
   311  			meta := make([]*fileMetadata, len(paths))
   312  			contents := make([][]byte, len(paths))
   313  			for j := range paths {
   314  				paths[j] = fmt.Sprintf("external%d", j)
   315  				meta[j] = &fileMetadata{}
   316  				meta[j].FileNum = FileNum(j)
   317  				meta[j].InitPhysicalBacking()
   318  				f, err := opts.FS.Create(paths[j])
   319  				require.NoError(t, err)
   320  
   321  				contents[j] = []byte(fmt.Sprintf("data%d", j))
   322  				// memFile.Write will modify the supplied buffer when invariants are
   323  				// enabled, so provide a throw-away copy.
   324  				_, err = f.Write(append([]byte(nil), contents[j]...))
   325  				require.NoError(t, err)
   326  				require.NoError(t, f.Close())
   327  			}
   328  
   329  			if i < count {
   330  				opts.FS.Remove(paths[i])
   331  			}
   332  
   333  			lr := ingestLoadResult{localMeta: meta, localPaths: paths}
   334  			err = ingestLink(0 /* jobID */, opts, objProvider, lr, nil /* shared */)
   335  			if i < count {
   336  				if err == nil {
   337  					t.Fatalf("expected error, but found success")
   338  				}
   339  			} else {
   340  				require.NoError(t, err)
   341  			}
   342  
   343  			files, err := opts.FS.List(dir)
   344  			require.NoError(t, err)
   345  
   346  			sort.Strings(files)
   347  
   348  			if i < count {
   349  				if len(files) > 0 {
   350  					t.Fatalf("expected all of the files to be cleaned up, but found:\n%s",
   351  						strings.Join(files, "\n"))
   352  				}
   353  			} else {
   354  				if len(files) != count {
   355  					t.Fatalf("expected %d files, but found:\n%s", count, strings.Join(files, "\n"))
   356  				}
   357  				for j := range files {
   358  					ftype, fileNum, ok := base.ParseFilename(opts.FS, files[j])
   359  					if !ok {
   360  						t.Fatalf("unable to parse filename: %s", files[j])
   361  					}
   362  					if fileTypeTable != ftype {
   363  						t.Fatalf("expected table, but found %d", ftype)
   364  					}
   365  					if j != int(fileNum.FileNum()) {
   366  						t.Fatalf("expected table %d, but found %d", j, fileNum)
   367  					}
   368  					f, err := opts.FS.Open(opts.FS.PathJoin(dir, files[j]))
   369  					require.NoError(t, err)
   370  
   371  					data, err := io.ReadAll(f)
   372  					require.NoError(t, err)
   373  					require.NoError(t, f.Close())
   374  					if !bytes.Equal(contents[j], data) {
   375  						t.Fatalf("expected %s, but found %s", contents[j], data)
   376  					}
   377  				}
   378  			}
   379  		})
   380  	}
   381  }
   382  
   383  func TestIngestLinkFallback(t *testing.T) {
   384  	// Verify that ingestLink succeeds if linking fails by falling back to
   385  	// copying.
   386  	mem := vfs.NewMem()
   387  	src, err := mem.Create("source")
   388  	require.NoError(t, err)
   389  
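         	// Inject an error into an early filesystem operation (the op at index 1)
         	// so that the attempt to hard link the table fails and ingestLink falls
         	// back to copying the file into place.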
   390  	opts := &Options{FS: errorfs.Wrap(mem, errorfs.ErrInjected.If(errorfs.OnIndex(1)))}
   391  	opts.EnsureDefaults().WithFSDefaults()
   392  	objSettings := objstorageprovider.DefaultSettings(opts.FS, "")
   393  	// Prevent the provider from listing the dir (where we may get an injected error).
   394  	objSettings.FSDirInitialListing = []string{}
   395  	objProvider, err := objstorageprovider.Open(objSettings)
   396  	require.NoError(t, err)
   397  	defer objProvider.Close()
   398  
   399  	meta := []*fileMetadata{{FileNum: 1}}
   400  	meta[0].InitPhysicalBacking()
   401  	lr := ingestLoadResult{localMeta: meta, localPaths: []string{"source"}}
   402  	err = ingestLink(0, opts, objProvider, lr, nil /* shared */)
   403  	require.NoError(t, err)
   404  
   405  	dest, err := mem.Open("000001.sst")
   406  	require.NoError(t, err)
   407  
   408  	// We should be able to write bytes to src, and not have them show up in
   409  	// dest.
   410  	_, _ = src.Write([]byte("test"))
   411  	data, err := io.ReadAll(dest)
   412  	require.NoError(t, err)
   413  	if len(data) != 0 {
   414  		t.Fatalf("expected copy, but files appear to be hard linked: [%s] unexpectedly found", data)
   415  	}
   416  }
   417  
   418  func TestOverlappingIngestedSSTs(t *testing.T) {
   419  	dir := ""
   420  	var (
   421  		mem        vfs.FS
   422  		d          *DB
   423  		opts       *Options
   424  		closed     = false
   425  		blockFlush = false
   426  	)
   427  	defer func() {
   428  		if !closed {
   429  			require.NoError(t, d.Close())
   430  		}
   431  	}()
   432  
   433  	reset := func(strictMem bool) {
   434  		if d != nil && !closed {
   435  			require.NoError(t, d.Close())
   436  		}
   437  		blockFlush = false
   438  
   439  		if strictMem {
   440  			mem = vfs.NewStrictMem()
   441  		} else {
   442  			mem = vfs.NewMem()
   443  		}
   444  
   445  		require.NoError(t, mem.MkdirAll("ext", 0755))
   446  		opts = (&Options{
   447  			FS:                          mem,
   448  			MemTableStopWritesThreshold: 4,
   449  			L0CompactionThreshold:       100,
   450  			L0StopWritesThreshold:       100,
   451  			DebugCheck:                  DebugCheckLevels,
   452  			FormatMajorVersion:          internalFormatNewest,
   453  		}).WithFSDefaults()
   454  		// Disable automatic compactions because otherwise we'll race with
   455  		// delete-only compactions triggered by ingesting range tombstones.
   456  		opts.DisableAutomaticCompactions = true
   457  
   458  		var err error
   459  		d, err = Open(dir, opts)
   460  		require.NoError(t, err)
   461  		d.TestOnlyWaitForCleaning()
   462  	}
   463  	waitForFlush := func() {
   464  		if d == nil {
   465  			return
   466  		}
   467  		d.mu.Lock()
   468  		for d.mu.compact.flushing {
   469  			d.mu.compact.cond.Wait()
   470  		}
   471  		d.mu.Unlock()
   472  	}
   473  	reset(false)
   474  
   475  	datadriven.RunTest(t, "testdata/flushable_ingest", func(t *testing.T, td *datadriven.TestData) string {
   476  		switch td.Cmd {
   477  		case "reset":
   478  			reset(td.HasArg("strictMem"))
   479  			return ""
   480  
   481  		case "ignoreSyncs":
   482  			var ignoreSyncs bool
   483  			if len(td.CmdArgs) == 1 && td.CmdArgs[0].String() == "true" {
   484  				ignoreSyncs = true
   485  			}
   486  			mem.(*vfs.MemFS).SetIgnoreSyncs(ignoreSyncs)
   487  			return ""
   488  
   489  		case "resetToSynced":
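         			// Discard any unsynced filesystem state, simulating a crash and
         			// restart from the last synced contents.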
   490  			mem.(*vfs.MemFS).ResetToSyncedState()
   491  			files, err := mem.List(dir)
   492  			sort.Strings(files)
   493  			require.NoError(t, err)
   494  			return strings.Join(files, "\n")
   495  
   496  		case "batch":
   497  			b := d.NewIndexedBatch()
   498  			if err := runBatchDefineCmd(td, b); err != nil {
   499  				return err.Error()
   500  			}
   501  			if err := b.Commit(nil); err != nil {
   502  				return err.Error()
   503  			}
   504  			return ""
   505  
   506  		case "build":
   507  			if err := runBuildCmd(td, d, mem); err != nil {
   508  				return err.Error()
   509  			}
   510  			return ""
   511  
   512  		case "ingest":
   513  			if err := runIngestCmd(td, d, mem); err != nil {
   514  				return err.Error()
   515  			}
   516  			if !blockFlush {
   517  				waitForFlush()
   518  			}
   519  			return ""
   520  
   521  		case "iter":
   522  			iter, _ := d.NewIter(nil)
   523  			return runIterCmd(td, iter, true)
   524  
   525  		case "lsm":
   526  			return runLSMCmd(td, d)
   527  
   528  		case "close":
   529  			if closed {
   530  				return "already closed"
   531  			}
   532  			require.NoError(t, d.Close())
   533  			closed = true
   534  			return ""
   535  
   536  		case "ls":
   537  			files, err := mem.List(dir)
   538  			sort.Strings(files)
   539  			require.NoError(t, err)
   540  			return strings.Join(files, "\n")
   541  
   542  		case "open":
   543  			opts.ReadOnly = td.HasArg("readOnly")
   544  			var err error
   545  			d, err = Open(dir, opts)
   546  			closed = false
   547  			require.NoError(t, err)
   548  			waitForFlush()
   549  			d.TestOnlyWaitForCleaning()
   550  			return ""
   551  
   552  		case "blockFlush":
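         			// Pretend a flush is already in progress so that subsequent
         			// overlapping ingests take the flushable-ingest path instead of
         			// waiting for a real flush to finish.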
   553  			blockFlush = true
   554  			d.mu.Lock()
   555  			d.mu.compact.flushing = true
   556  			d.mu.Unlock()
   557  			return ""
   558  
   559  		case "allowFlush":
   560  			blockFlush = false
   561  			d.mu.Lock()
   562  			d.mu.compact.flushing = false
   563  			d.mu.Unlock()
   564  			return ""
   565  
   566  		case "flush":
   567  			d.maybeScheduleFlush()
   568  			waitForFlush()
   569  			d.TestOnlyWaitForCleaning()
   570  			return ""
   571  
   572  		case "get":
   573  			return runGetCmd(t, td, d)
   574  
   575  		default:
   576  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   577  		}
   578  	})
   579  }
   580  
   581  func TestExcise(t *testing.T) {
   582  	var mem vfs.FS
   583  	var d *DB
   584  	var flushed bool
   585  	defer func() {
   586  		require.NoError(t, d.Close())
   587  	}()
   588  
   589  	var opts *Options
   590  	reset := func() {
   591  		if d != nil {
   592  			require.NoError(t, d.Close())
   593  		}
   594  
   595  		mem = vfs.NewMem()
   596  		require.NoError(t, mem.MkdirAll("ext", 0755))
   597  		opts = &Options{
   598  			FS:                    mem,
   599  			L0CompactionThreshold: 100,
   600  			L0StopWritesThreshold: 100,
   601  			DebugCheck:            DebugCheckLevels,
   602  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
   603  				flushed = true
   604  			}},
   605  			FormatMajorVersion: FormatVirtualSSTables,
   606  			Comparer:           testkeys.Comparer,
   607  		}
   608  		// Disable automatic compactions because otherwise we'll race with
   609  		// delete-only compactions triggered by ingesting range tombstones.
   610  		opts.DisableAutomaticCompactions = true
   611  		// Set this to true to add some testing for the virtual sstable validation
   612  		// code paths.
   613  		opts.Experimental.ValidateOnIngest = true
   614  
   615  		var err error
   616  		d, err = Open("", opts)
   617  		require.NoError(t, err)
   618  	}
   619  	reset()
   620  
   621  	datadriven.RunTest(t, "testdata/excise", func(t *testing.T, td *datadriven.TestData) string {
   622  		switch td.Cmd {
   623  		case "reset":
   624  			reset()
   625  			return ""
   626  		case "reopen":
   627  			require.NoError(t, d.Close())
   628  			var err error
   629  			d, err = Open("", opts)
   630  			require.NoError(t, err)
   631  
   632  			return ""
   633  		case "batch":
   634  			b := d.NewIndexedBatch()
   635  			if err := runBatchDefineCmd(td, b); err != nil {
   636  				return err.Error()
   637  			}
   638  			if err := b.Commit(nil); err != nil {
   639  				return err.Error()
   640  			}
   641  			return ""
   642  		case "build":
   643  			if err := runBuildCmd(td, d, mem); err != nil {
   644  				return err.Error()
   645  			}
   646  			return ""
   647  
   648  		case "flush":
   649  			if err := d.Flush(); err != nil {
   650  				return err.Error()
   651  			}
   652  			return ""
   653  
   654  		case "ingest":
   655  			flushed = false
   656  			if err := runIngestCmd(td, d, mem); err != nil {
   657  				return err.Error()
   658  			}
   659  			// Wait for a possible flush.
   660  			d.mu.Lock()
   661  			for d.mu.compact.flushing {
   662  				d.mu.compact.cond.Wait()
   663  			}
   664  			d.mu.Unlock()
   665  			if flushed {
   666  				return "memtable flushed"
   667  			}
   668  			return ""
   669  
   670  		case "ingest-and-excise":
   671  			flushed = false
   672  			if err := runIngestAndExciseCmd(td, d, mem); err != nil {
   673  				return err.Error()
   674  			}
   675  			// Wait for a possible flush.
   676  			d.mu.Lock()
   677  			for d.mu.compact.flushing {
   678  				d.mu.compact.cond.Wait()
   679  			}
   680  			d.mu.Unlock()
   681  			if flushed {
   682  				return "memtable flushed"
   683  			}
   684  			return ""
   685  
   686  		case "get":
   687  			return runGetCmd(t, td, d)
   688  
   689  		case "iter":
   690  			iter, _ := d.NewIter(&IterOptions{
   691  				KeyTypes: IterKeyTypePointsAndRanges,
   692  			})
   693  			return runIterCmd(td, iter, true)
   694  
   695  		case "lsm":
   696  			return runLSMCmd(td, d)
   697  
   698  		case "metrics":
   699  			// The asynchronous loading of table stats can change metrics, so
   700  			// wait for all the tables' stats to be loaded.
   701  			d.mu.Lock()
   702  			d.waitTableStats()
   703  			d.mu.Unlock()
   704  
   705  			return d.Metrics().StringForTests()
   706  
   707  		case "wait-pending-table-stats":
   708  			return runTableStatsCmd(td, d)
   709  
   710  		case "excise":
   711  			ve := &versionEdit{
   712  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
   713  			}
   714  			var exciseSpan KeyRange
   715  			if len(td.CmdArgs) != 2 {
    716  				panic("insufficient args for excise command")
   717  			}
   718  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
   719  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
   720  
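         			// Populate a version edit describing the excision without applying
         			// it: d.excise only records the files it would delete or virtualize,
         			// which is why the output below says "would excise".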
   721  			d.mu.Lock()
   722  			d.mu.versions.logLock()
   723  			d.mu.Unlock()
   724  			current := d.mu.versions.currentVersion()
   725  			for level := range current.Levels {
   726  				iter := current.Levels[level].Iter()
   727  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
   728  					_, err := d.excise(exciseSpan, m, ve, level)
   729  					if err != nil {
   730  						d.mu.Lock()
   731  						d.mu.versions.logUnlock()
   732  						d.mu.Unlock()
   733  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
   734  					}
   735  				}
   736  			}
   737  			d.mu.Lock()
   738  			d.mu.versions.logUnlock()
   739  			d.mu.Unlock()
   740  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.DebugString(base.DefaultFormatter))
   741  
   742  		case "confirm-backing":
   743  			// Confirms that the files have the same FileBacking.
   744  			fileNums := make(map[base.FileNum]struct{})
   745  			for i := range td.CmdArgs {
   746  				fNum, err := strconv.Atoi(td.CmdArgs[i].Key)
   747  				if err != nil {
   748  					panic("invalid file number")
   749  				}
   750  				fileNums[base.FileNum(fNum)] = struct{}{}
   751  			}
   752  			d.mu.Lock()
   753  			currVersion := d.mu.versions.currentVersion()
   754  			var ptr *manifest.FileBacking
   755  			for _, level := range currVersion.Levels {
   756  				lIter := level.Iter()
   757  				for f := lIter.First(); f != nil; f = lIter.Next() {
   758  					if _, ok := fileNums[f.FileNum]; ok {
   759  						if ptr == nil {
   760  							ptr = f.FileBacking
   761  							continue
   762  						}
   763  						if f.FileBacking != ptr {
   764  							d.mu.Unlock()
   765  							return "file backings are not the same"
   766  						}
   767  					}
   768  				}
   769  			}
   770  			d.mu.Unlock()
   771  			return "file backings are the same"
   772  		case "compact":
   773  			if len(td.CmdArgs) != 2 {
   774  				panic("insufficient args for compact command")
   775  			}
   776  			l := td.CmdArgs[0].Key
   777  			r := td.CmdArgs[1].Key
   778  			err := d.Compact([]byte(l), []byte(r), false)
   779  			if err != nil {
   780  				return err.Error()
   781  			}
   782  			return ""
   783  		default:
   784  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   785  		}
   786  	})
   787  }
   788  
   789  func testIngestSharedImpl(
   790  	t *testing.T, createOnShared remote.CreateOnSharedStrategy, fileName string,
   791  ) {
   792  	var d, d1, d2 *DB
   793  	var efos map[string]*EventuallyFileOnlySnapshot
   794  	defer func() {
   795  		for _, e := range efos {
   796  			require.NoError(t, e.Close())
   797  		}
   798  		if d1 != nil {
   799  			require.NoError(t, d1.Close())
   800  		}
   801  		if d2 != nil {
   802  			require.NoError(t, d2.Close())
   803  		}
   804  	}()
   805  	creatorIDCounter := uint64(1)
   806  	replicateCounter := 1
   807  	var opts1, opts2 *Options
   808  
   809  	reset := func() {
   810  		for _, e := range efos {
   811  			require.NoError(t, e.Close())
   812  		}
   813  		if d1 != nil {
   814  			require.NoError(t, d1.Close())
   815  		}
   816  		if d2 != nil {
   817  			require.NoError(t, d2.Close())
   818  		}
   819  		efos = make(map[string]*EventuallyFileOnlySnapshot)
   820  
   821  		sstorage := remote.NewInMem()
   822  		mem1 := vfs.NewMem()
   823  		mem2 := vfs.NewMem()
   824  		require.NoError(t, mem1.MkdirAll("ext", 0755))
   825  		require.NoError(t, mem2.MkdirAll("ext", 0755))
   826  		opts1 = &Options{
   827  			Comparer:              testkeys.Comparer,
   828  			FS:                    mem1,
   829  			LBaseMaxBytes:         1,
   830  			L0CompactionThreshold: 100,
   831  			L0StopWritesThreshold: 100,
   832  			DebugCheck:            DebugCheckLevels,
   833  			FormatMajorVersion:    FormatVirtualSSTables,
   834  		}
    835  		// Attach a logging event listener so test failures are easier to debug.
   836  		lel := MakeLoggingEventListener(DefaultLogger)
   837  		opts1.EventListener = &lel
   838  		opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
   839  			"": sstorage,
   840  		})
   841  		opts1.Experimental.CreateOnShared = createOnShared
   842  		opts1.Experimental.CreateOnSharedLocator = ""
   843  		// Disable automatic compactions because otherwise we'll race with
   844  		// delete-only compactions triggered by ingesting range tombstones.
   845  		opts1.DisableAutomaticCompactions = true
   846  
   847  		opts2 = &Options{}
   848  		*opts2 = *opts1
   849  		opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
   850  			"": sstorage,
   851  		})
   852  		opts2.Experimental.CreateOnShared = createOnShared
   853  		opts2.Experimental.CreateOnSharedLocator = ""
   854  		opts2.FS = mem2
   855  
   856  		var err error
   857  		d1, err = Open("", opts1)
   858  		require.NoError(t, err)
   859  		require.NoError(t, d1.SetCreatorID(creatorIDCounter))
   860  		creatorIDCounter++
   861  		d2, err = Open("", opts2)
   862  		require.NoError(t, err)
   863  		require.NoError(t, d2.SetCreatorID(creatorIDCounter))
   864  		creatorIDCounter++
   865  		d = d1
   866  	}
   867  	reset()
   868  
   869  	datadriven.RunTest(t, fmt.Sprintf("testdata/%s", fileName), func(t *testing.T, td *datadriven.TestData) string {
   870  		switch td.Cmd {
   871  		case "restart":
   872  			for _, e := range efos {
   873  				require.NoError(t, e.Close())
   874  			}
   875  			if d1 != nil {
   876  				require.NoError(t, d1.Close())
   877  			}
   878  			if d2 != nil {
   879  				require.NoError(t, d2.Close())
   880  			}
   881  
   882  			var err error
   883  			d1, err = Open("", opts1)
   884  			if err != nil {
   885  				return err.Error()
   886  			}
   887  			d2, err = Open("", opts2)
   888  			if err != nil {
   889  				return err.Error()
   890  			}
   891  			d = d1
   892  			return "ok, note that the active db has been set to 1 (use 'switch' to change)"
   893  		case "reset":
   894  			reset()
   895  			return ""
   896  		case "switch":
   897  			if len(td.CmdArgs) != 1 {
   898  				return "usage: switch <1 or 2>"
   899  			}
   900  			switch td.CmdArgs[0].Key {
   901  			case "1":
   902  				d = d1
   903  			case "2":
   904  				d = d2
   905  			default:
   906  				return "usage: switch <1 or 2>"
   907  			}
   908  			return "ok"
   909  		case "batch":
   910  			b := d.NewIndexedBatch()
   911  			if err := runBatchDefineCmd(td, b); err != nil {
   912  				return err.Error()
   913  			}
   914  			if err := b.Commit(nil); err != nil {
   915  				return err.Error()
   916  			}
   917  			return ""
   918  		case "build":
   919  			if err := runBuildCmd(td, d, d.opts.FS); err != nil {
   920  				return err.Error()
   921  			}
   922  			return ""
   923  
   924  		case "flush":
   925  			if err := d.Flush(); err != nil {
   926  				return err.Error()
   927  			}
   928  			return ""
   929  
   930  		case "ingest":
   931  			if err := runIngestCmd(td, d, d.opts.FS); err != nil {
   932  				return err.Error()
   933  			}
   934  			// Wait for a possible flush.
   935  			d.mu.Lock()
   936  			for d.mu.compact.flushing {
   937  				d.mu.compact.cond.Wait()
   938  			}
   939  			d.mu.Unlock()
   940  			return ""
   941  
   942  		case "ingest-and-excise":
   943  			if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil {
   944  				return err.Error()
   945  			}
   946  			// Wait for a possible flush.
   947  			d.mu.Lock()
   948  			for d.mu.compact.flushing {
   949  				d.mu.compact.cond.Wait()
   950  			}
   951  			d.mu.Unlock()
   952  			return ""
   953  
   954  		case "replicate":
   955  			if len(td.CmdArgs) != 4 {
   956  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   957  			}
   958  			var from, to *DB
   959  			switch td.CmdArgs[0].Key {
   960  			case "1":
   961  				from = d1
   962  			case "2":
   963  				from = d2
   964  			default:
   965  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   966  			}
   967  			switch td.CmdArgs[1].Key {
   968  			case "1":
   969  				to = d1
   970  			case "2":
   971  				to = d2
   972  			default:
   973  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   974  			}
   975  			startKey := []byte(td.CmdArgs[2].Key)
   976  			endKey := []byte(td.CmdArgs[3].Key)
   977  
   978  			writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat())
   979  			sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter)
   980  			f, err := to.opts.FS.Create(sstPath)
   981  			require.NoError(t, err)
   982  			replicateCounter++
   983  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts)
   984  
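         			// ScanInternal visits point keys, range deletions, range keys, and
         			// shared sstables within [startKey, endKey). Local keys are written
         			// into the new sstable (with zeroed sequence numbers), while shared
         			// files are collected so they can be ingested by reference.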
   985  			var sharedSSTs []SharedSSTMeta
   986  			err = from.ScanInternal(context.TODO(), sstable.CategoryAndQoS{}, startKey, endKey,
   987  				func(key *InternalKey, value LazyValue, _ IteratorLevel) error {
   988  					val, _, err := value.Value(nil)
   989  					require.NoError(t, err)
   990  					require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val))
   991  					return nil
   992  				},
   993  				func(start, end []byte, seqNum uint64) error {
   994  					require.NoError(t, w.DeleteRange(start, end))
   995  					return nil
   996  				},
   997  				func(start, end []byte, keys []keyspan.Key) error {
   998  					s := keyspan.Span{
   999  						Start:     start,
  1000  						End:       end,
  1001  						Keys:      keys,
  1002  						KeysOrder: 0,
  1003  					}
  1004  					require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error {
  1005  						return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v)
  1006  					}))
  1007  					return nil
  1008  				},
  1009  				func(sst *SharedSSTMeta) error {
  1010  					sharedSSTs = append(sharedSSTs, *sst)
  1011  					return nil
  1012  				},
  1013  			)
  1014  			require.NoError(t, err)
  1015  			require.NoError(t, w.Close())
  1016  
  1017  			_, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey})
  1018  			require.NoError(t, err)
  1019  			return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs))
  1020  
  1021  		case "get":
  1022  			return runGetCmd(t, td, d)
  1023  
  1024  		case "iter":
  1025  			o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges}
  1026  			var reader Reader
  1027  			reader = d
  1028  			for _, arg := range td.CmdArgs {
  1029  				switch arg.Key {
  1030  				case "mask-suffix":
  1031  					o.RangeKeyMasking.Suffix = []byte(arg.Vals[0])
  1032  				case "mask-filter":
  1033  					o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask {
  1034  						return sstable.NewTestKeysMaskingFilter()
  1035  					}
  1036  				case "snapshot":
  1037  					reader = efos[arg.Vals[0]]
  1038  				}
  1039  			}
  1040  			iter, err := reader.NewIter(o)
  1041  			if err != nil {
  1042  				return err.Error()
  1043  			}
  1044  			return runIterCmd(td, iter, true)
  1045  
  1046  		case "lsm":
  1047  			return runLSMCmd(td, d)
  1048  
  1049  		case "metrics":
  1050  			// The asynchronous loading of table stats can change metrics, so
  1051  			// wait for all the tables' stats to be loaded.
  1052  			d.mu.Lock()
  1053  			d.waitTableStats()
  1054  			d.mu.Unlock()
  1055  
  1056  			return d.Metrics().StringForTests()
  1057  
  1058  		case "wait-pending-table-stats":
  1059  			return runTableStatsCmd(td, d)
  1060  
  1061  		case "excise":
  1062  			ve := &versionEdit{
  1063  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  1064  			}
  1065  			var exciseSpan KeyRange
  1066  			if len(td.CmdArgs) != 2 {
  1067  				panic("insufficient args for excise command")
  1068  			}
  1069  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
  1070  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
  1071  
  1072  			d.mu.Lock()
  1073  			d.mu.versions.logLock()
  1074  			d.mu.Unlock()
  1075  			current := d.mu.versions.currentVersion()
  1076  			for level := range current.Levels {
  1077  				iter := current.Levels[level].Iter()
  1078  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
  1079  					_, err := d.excise(exciseSpan, m, ve, level)
  1080  					if err != nil {
  1081  						d.mu.Lock()
  1082  						d.mu.versions.logUnlock()
  1083  						d.mu.Unlock()
  1084  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
  1085  					}
  1086  				}
  1087  			}
  1088  			d.mu.Lock()
  1089  			d.mu.versions.logUnlock()
  1090  			d.mu.Unlock()
  1091  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String())
  1092  
  1093  		case "file-only-snapshot":
  1094  			if len(td.CmdArgs) != 1 {
  1095  				panic("insufficient args for file-only-snapshot command")
  1096  			}
  1097  			name := td.CmdArgs[0].Key
  1098  			var keyRanges []KeyRange
  1099  			for _, line := range strings.Split(td.Input, "\n") {
  1100  				fields := strings.Fields(line)
  1101  				if len(fields) != 2 {
  1102  					return "expected two fields for file-only snapshot KeyRanges"
  1103  				}
  1104  				kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])}
  1105  				keyRanges = append(keyRanges, kr)
  1106  			}
  1107  
  1108  			s := d.NewEventuallyFileOnlySnapshot(keyRanges)
  1109  			efos[name] = s
  1110  			return "ok"
  1111  
  1112  		case "wait-for-file-only-snapshot":
  1113  			if len(td.CmdArgs) != 1 {
   1114  				panic("insufficient args for wait-for-file-only-snapshot command")
  1115  			}
  1116  			name := td.CmdArgs[0].Key
  1117  			err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond)
  1118  			if err != nil {
  1119  				return err.Error()
  1120  			}
  1121  			return "ok"
  1122  
  1123  		case "compact":
  1124  			err := runCompactCmd(td, d)
  1125  			if err != nil {
  1126  				return err.Error()
  1127  			}
  1128  			return "ok"
  1129  		default:
  1130  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1131  		}
  1132  	})
  1133  }
  1134  
  1135  func TestIngestShared(t *testing.T) {
  1136  	for _, strategy := range []remote.CreateOnSharedStrategy{remote.CreateOnSharedAll, remote.CreateOnSharedLower} {
  1137  		strategyStr := "all"
  1138  		if strategy == remote.CreateOnSharedLower {
  1139  			strategyStr = "lower"
  1140  		}
  1141  		t.Run(fmt.Sprintf("createOnShared=%s", strategyStr), func(t *testing.T) {
  1142  			fileName := "ingest_shared"
  1143  			if strategy == remote.CreateOnSharedLower {
  1144  				fileName = "ingest_shared_lower"
  1145  			}
  1146  			testIngestSharedImpl(t, strategy, fileName)
  1147  		})
  1148  	}
  1149  }
  1150  
  1151  func TestSimpleIngestShared(t *testing.T) {
  1152  	mem := vfs.NewMem()
  1153  	var d *DB
  1154  	var provider2 objstorage.Provider
  1155  	opts2 := Options{FS: vfs.NewMem(), FormatMajorVersion: FormatVirtualSSTables}
  1156  	opts2.EnsureDefaults()
  1157  
  1158  	// Create an objProvider where we will fake-create some sstables that can
  1159  	// then be shared back to the db instance.
  1160  	providerSettings := objstorageprovider.Settings{
  1161  		Logger:              opts2.Logger,
  1162  		FS:                  opts2.FS,
  1163  		FSDirName:           "",
  1164  		FSDirInitialListing: nil,
  1165  		FSCleaner:           opts2.Cleaner,
  1166  		NoSyncOnClose:       opts2.NoSyncOnClose,
  1167  		BytesPerSync:        opts2.BytesPerSync,
  1168  	}
  1169  	providerSettings.Remote.StorageFactory = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1170  		"": remote.NewInMem(),
  1171  	})
  1172  	providerSettings.Remote.CreateOnShared = remote.CreateOnSharedAll
  1173  	providerSettings.Remote.CreateOnSharedLocator = ""
  1174  
  1175  	provider2, err := objstorageprovider.Open(providerSettings)
  1176  	require.NoError(t, err)
  1177  	creatorIDCounter := uint64(1)
   1178  	require.NoError(t, provider2.SetCreatorID(objstorage.CreatorID(creatorIDCounter)))
  1179  	creatorIDCounter++
  1180  
  1181  	defer func() {
  1182  		require.NoError(t, d.Close())
  1183  	}()
  1184  
  1185  	reset := func() {
  1186  		if d != nil {
  1187  			require.NoError(t, d.Close())
  1188  		}
  1189  
  1190  		mem = vfs.NewMem()
  1191  		require.NoError(t, mem.MkdirAll("ext", 0755))
  1192  		opts := &Options{
  1193  			FormatMajorVersion:    FormatVirtualSSTables,
  1194  			FS:                    mem,
  1195  			L0CompactionThreshold: 100,
  1196  			L0StopWritesThreshold: 100,
  1197  		}
  1198  		opts.Experimental.RemoteStorage = providerSettings.Remote.StorageFactory
  1199  		opts.Experimental.CreateOnShared = providerSettings.Remote.CreateOnShared
  1200  		opts.Experimental.CreateOnSharedLocator = providerSettings.Remote.CreateOnSharedLocator
  1201  
  1202  		var err error
  1203  		d, err = Open("", opts)
  1204  		require.NoError(t, err)
  1205  		require.NoError(t, d.SetCreatorID(creatorIDCounter))
  1206  		creatorIDCounter++
  1207  	}
  1208  	reset()
  1209  
  1210  	metaMap := map[base.DiskFileNum]objstorage.ObjectMetadata{}
  1211  
  1212  	require.NoError(t, d.Set([]byte("d"), []byte("unexpected"), nil))
  1213  	require.NoError(t, d.Set([]byte("e"), []byte("unexpected"), nil))
  1214  	require.NoError(t, d.Set([]byte("a"), []byte("unexpected"), nil))
  1215  	require.NoError(t, d.Set([]byte("f"), []byte("unexpected"), nil))
   1216  	require.NoError(t, d.Flush())
  1217  
  1218  	{
  1219  		// Create a shared file.
  1220  		fn := base.FileNum(2)
  1221  		f, meta, err := provider2.Create(context.TODO(), fileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{PreferSharedStorage: true})
  1222  		require.NoError(t, err)
  1223  		w := sstable.NewWriter(f, d.opts.MakeWriterOptions(0, d.opts.FormatMajorVersion.MaxTableFormat()))
   1224  		require.NoError(t, w.Set([]byte("d"), []byte("shared")))
   1225  		require.NoError(t, w.Set([]byte("e"), []byte("shared")))
   1226  		require.NoError(t, w.Close())
  1227  		metaMap[fn.DiskFileNum()] = meta
  1228  	}
  1229  
  1230  	m := metaMap[base.FileNum(2).DiskFileNum()]
  1231  	handle, err := provider2.RemoteObjectBacking(&m)
  1232  	require.NoError(t, err)
  1233  	size, err := provider2.Size(m)
  1234  	require.NoError(t, err)
  1235  
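         	// Describe the remote object by its backing handle and key bounds so it
         	// can be ingested by reference. The size passed below (size+5) is not
         	// exact, suggesting only an estimate is required here.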
  1236  	sharedSSTMeta := SharedSSTMeta{
  1237  		Backing:          handle,
  1238  		Smallest:         base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet),
  1239  		Largest:          base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet),
  1240  		SmallestPointKey: base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet),
  1241  		LargestPointKey:  base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet),
  1242  		Level:            6,
  1243  		Size:             uint64(size + 5),
  1244  	}
  1245  	_, err = d.IngestAndExcise([]string{}, []SharedSSTMeta{sharedSSTMeta}, KeyRange{Start: []byte("d"), End: []byte("ee")})
  1246  	require.NoError(t, err)
  1247  
  1248  	// TODO(bilal): Once reading of shared sstables is in, verify that the values
  1249  	// of d and e have been updated.
  1250  }
  1251  
  1252  type blockedCompaction struct {
  1253  	startBlock, unblock chan struct{}
  1254  }
  1255  
  1256  func TestConcurrentExcise(t *testing.T) {
  1257  	var d, d1, d2 *DB
  1258  	var efos map[string]*EventuallyFileOnlySnapshot
  1259  	backgroundErrs := make(chan error, 5)
  1260  	var compactions map[string]*blockedCompaction
  1261  	defer func() {
  1262  		for _, e := range efos {
  1263  			require.NoError(t, e.Close())
  1264  		}
  1265  		if d1 != nil {
  1266  			require.NoError(t, d1.Close())
  1267  		}
  1268  		if d2 != nil {
  1269  			require.NoError(t, d2.Close())
  1270  		}
  1271  	}()
  1272  	creatorIDCounter := uint64(1)
  1273  	replicateCounter := 1
  1274  
  1275  	var wg sync.WaitGroup
  1276  	defer wg.Wait()
  1277  	var blockNextCompaction bool
  1278  	var blockedJobID int
  1279  	var blockedCompactionName string
  1280  	var blockedCompactionsMu sync.Mutex // protects the above three variables.
  1281  
  1282  	reset := func() {
  1283  		wg.Wait()
  1284  		for _, e := range efos {
  1285  			require.NoError(t, e.Close())
  1286  		}
  1287  		if d1 != nil {
  1288  			require.NoError(t, d1.Close())
  1289  		}
  1290  		if d2 != nil {
  1291  			require.NoError(t, d2.Close())
  1292  		}
  1293  		efos = make(map[string]*EventuallyFileOnlySnapshot)
  1294  		compactions = make(map[string]*blockedCompaction)
  1295  		backgroundErrs = make(chan error, 5)
  1296  
  1297  		var el EventListener
  1298  		el.EnsureDefaults(testLogger{t: t})
  1299  		el.FlushBegin = func(info FlushInfo) {
  1300  			// Don't block flushes
  1301  		}
  1302  		el.BackgroundError = func(err error) {
  1303  			backgroundErrs <- err
  1304  		}
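        		// Block a chosen compaction: CompactionBegin records the job ID of the
        		// next compaction to block, and TableCreated for that job then signals
        		// startBlock and stalls until the test's "unblock" command fires.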
  1305  		el.CompactionBegin = func(info CompactionInfo) {
  1306  			if info.Reason == "move" {
  1307  				return
  1308  			}
  1309  			blockedCompactionsMu.Lock()
  1310  			defer blockedCompactionsMu.Unlock()
  1311  			if blockNextCompaction {
  1312  				blockNextCompaction = false
  1313  				blockedJobID = info.JobID
  1314  			}
  1315  		}
  1316  		el.TableCreated = func(info TableCreateInfo) {
  1317  			blockedCompactionsMu.Lock()
  1318  			if info.JobID != blockedJobID {
  1319  				blockedCompactionsMu.Unlock()
  1320  				return
  1321  			}
  1322  			blockedJobID = 0
  1323  			c := compactions[blockedCompactionName]
  1324  			blockedCompactionName = ""
  1325  			blockedCompactionsMu.Unlock()
  1326  			c.startBlock <- struct{}{}
  1327  			<-c.unblock
  1328  		}
  1329  
  1330  		sstorage := remote.NewInMem()
  1331  		mem1 := vfs.NewMem()
  1332  		mem2 := vfs.NewMem()
  1333  		require.NoError(t, mem1.MkdirAll("ext", 0755))
  1334  		require.NoError(t, mem2.MkdirAll("ext", 0755))
  1335  		opts1 := &Options{
  1336  			Comparer:              testkeys.Comparer,
  1337  			LBaseMaxBytes:         1,
  1338  			FS:                    mem1,
  1339  			L0CompactionThreshold: 100,
  1340  			L0StopWritesThreshold: 100,
  1341  			DebugCheck:            DebugCheckLevels,
  1342  			FormatMajorVersion:    FormatVirtualSSTables,
  1343  		}
   1344  		// Combine a logging event listener with the test's compaction-blocking hooks.
  1345  		lel := MakeLoggingEventListener(DefaultLogger)
  1346  		tel := TeeEventListener(lel, el)
  1347  		opts1.EventListener = &tel
  1348  		opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1349  			"": sstorage,
  1350  		})
  1351  		opts1.Experimental.CreateOnShared = remote.CreateOnSharedAll
  1352  		opts1.Experimental.CreateOnSharedLocator = ""
  1353  		// Disable automatic compactions because otherwise we'll race with
  1354  		// delete-only compactions triggered by ingesting range tombstones.
  1355  		opts1.DisableAutomaticCompactions = true
  1356  
  1357  		opts2 := &Options{}
  1358  		*opts2 = *opts1
  1359  		opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1360  			"": sstorage,
  1361  		})
  1362  		opts2.Experimental.CreateOnShared = remote.CreateOnSharedAll
  1363  		opts2.Experimental.CreateOnSharedLocator = ""
  1364  		opts2.FS = mem2
  1365  
  1366  		var err error
  1367  		d1, err = Open("", opts1)
  1368  		require.NoError(t, err)
  1369  		require.NoError(t, d1.SetCreatorID(creatorIDCounter))
  1370  		creatorIDCounter++
  1371  		d2, err = Open("", opts2)
  1372  		require.NoError(t, err)
  1373  		require.NoError(t, d2.SetCreatorID(creatorIDCounter))
  1374  		creatorIDCounter++
  1375  		d = d1
  1376  	}
  1377  	reset()
  1378  
  1379  	datadriven.RunTest(t, "testdata/concurrent_excise", func(t *testing.T, td *datadriven.TestData) string {
  1380  		switch td.Cmd {
  1381  		case "reset":
  1382  			reset()
  1383  			return ""
  1384  		case "switch":
  1385  			if len(td.CmdArgs) != 1 {
  1386  				return "usage: switch <1 or 2>"
  1387  			}
  1388  			switch td.CmdArgs[0].Key {
  1389  			case "1":
  1390  				d = d1
  1391  			case "2":
  1392  				d = d2
  1393  			default:
  1394  				return "usage: switch <1 or 2>"
  1395  			}
  1396  			return "ok"
  1397  		case "batch":
  1398  			b := d.NewIndexedBatch()
  1399  			if err := runBatchDefineCmd(td, b); err != nil {
  1400  				return err.Error()
  1401  			}
  1402  			if err := b.Commit(nil); err != nil {
  1403  				return err.Error()
  1404  			}
  1405  			return ""
  1406  		case "build":
  1407  			if err := runBuildCmd(td, d, d.opts.FS); err != nil {
  1408  				return err.Error()
  1409  			}
  1410  			return ""
  1411  
  1412  		case "flush":
  1413  			if err := d.Flush(); err != nil {
  1414  				return err.Error()
  1415  			}
  1416  			return ""
  1417  
  1418  		case "ingest":
  1419  			if err := runIngestCmd(td, d, d.opts.FS); err != nil {
  1420  				return err.Error()
  1421  			}
  1422  			// Wait for a possible flush.
  1423  			d.mu.Lock()
  1424  			for d.mu.compact.flushing {
  1425  				d.mu.compact.cond.Wait()
  1426  			}
  1427  			d.mu.Unlock()
  1428  			return ""
  1429  
  1430  		case "ingest-and-excise":
  1431  			if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil {
  1432  				return err.Error()
  1433  			}
  1434  			// Wait for a possible flush.
  1435  			d.mu.Lock()
  1436  			for d.mu.compact.flushing {
  1437  				d.mu.compact.cond.Wait()
  1438  			}
  1439  			d.mu.Unlock()
  1440  			return ""
  1441  
  1442  		case "replicate":
  1443  			if len(td.CmdArgs) != 4 {
  1444  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1445  			}
  1446  			var from, to *DB
  1447  			switch td.CmdArgs[0].Key {
  1448  			case "1":
  1449  				from = d1
  1450  			case "2":
  1451  				from = d2
  1452  			default:
  1453  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1454  			}
  1455  			switch td.CmdArgs[1].Key {
  1456  			case "1":
  1457  				to = d1
  1458  			case "2":
  1459  				to = d2
  1460  			default:
  1461  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1462  			}
  1463  			startKey := []byte(td.CmdArgs[2].Key)
  1464  			endKey := []byte(td.CmdArgs[3].Key)
  1465  
  1466  			writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat())
  1467  			sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter)
  1468  			f, err := to.opts.FS.Create(sstPath)
  1469  			require.NoError(t, err)
  1470  			replicateCounter++
  1471  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts)
  1472  
  1473  			var sharedSSTs []SharedSSTMeta
  1474  			err = from.ScanInternal(context.TODO(), sstable.CategoryAndQoS{}, startKey, endKey,
  1475  				func(key *InternalKey, value LazyValue, _ IteratorLevel) error {
  1476  					val, _, err := value.Value(nil)
  1477  					require.NoError(t, err)
  1478  					require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val))
  1479  					return nil
  1480  				},
  1481  				func(start, end []byte, seqNum uint64) error {
  1482  					require.NoError(t, w.DeleteRange(start, end))
  1483  					return nil
  1484  				},
  1485  				func(start, end []byte, keys []keyspan.Key) error {
  1486  					s := keyspan.Span{
  1487  						Start:     start,
  1488  						End:       end,
  1489  						Keys:      keys,
  1490  						KeysOrder: 0,
  1491  					}
  1492  					require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error {
  1493  						return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v)
  1494  					}))
  1495  					return nil
  1496  				},
  1497  				func(sst *SharedSSTMeta) error {
  1498  					sharedSSTs = append(sharedSSTs, *sst)
  1499  					return nil
  1500  				},
  1501  			)
  1502  			require.NoError(t, err)
  1503  			require.NoError(t, w.Close())
  1504  
  1505  			_, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey})
  1506  			require.NoError(t, err)
  1507  			return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs))
  1508  
  1509  		case "get":
  1510  			return runGetCmd(t, td, d)
  1511  
  1512  		case "iter":
  1513  			o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges}
  1514  			var reader Reader
  1515  			reader = d
  1516  			for _, arg := range td.CmdArgs {
  1517  				switch arg.Key {
  1518  				case "mask-suffix":
  1519  					o.RangeKeyMasking.Suffix = []byte(arg.Vals[0])
  1520  				case "mask-filter":
  1521  					o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask {
  1522  						return sstable.NewTestKeysMaskingFilter()
  1523  					}
  1524  				case "snapshot":
  1525  					reader = efos[arg.Vals[0]]
  1526  				}
  1527  			}
  1528  			iter, err := reader.NewIter(o)
  1529  			if err != nil {
  1530  				return err.Error()
  1531  			}
  1532  			return runIterCmd(td, iter, true)
  1533  
  1534  		case "lsm":
  1535  			return runLSMCmd(td, d)
  1536  
  1537  		case "metrics":
  1538  			// The asynchronous loading of table stats can change metrics, so
  1539  			// wait for all the tables' stats to be loaded.
  1540  			d.mu.Lock()
  1541  			d.waitTableStats()
  1542  			d.mu.Unlock()
  1543  
  1544  			return d.Metrics().StringForTests()
  1545  
  1546  		case "wait-pending-table-stats":
  1547  			return runTableStatsCmd(td, d)
  1548  
  1549  		case "excise":
  1550  			ve := &versionEdit{
  1551  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  1552  			}
  1553  			var exciseSpan KeyRange
  1554  			if len(td.CmdArgs) != 2 {
  1555  				panic("insufficient args for excise command")
  1556  			}
  1557  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
  1558  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
  1559  
  1560  			d.mu.Lock()
  1561  			d.mu.versions.logLock()
  1562  			d.mu.Unlock()
  1563  			current := d.mu.versions.currentVersion()
  1564  			for level := range current.Levels {
  1565  				iter := current.Levels[level].Iter()
  1566  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
  1567  					_, err := d.excise(exciseSpan, m, ve, level)
  1568  					if err != nil {
  1569  						d.mu.Lock()
  1570  						d.mu.versions.logUnlock()
  1571  						d.mu.Unlock()
  1572  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
  1573  					}
  1574  				}
  1575  			}
  1576  			d.mu.Lock()
  1577  			d.mu.versions.logUnlock()
  1578  			d.mu.Unlock()
  1579  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String())
  1580  
  1581  		case "file-only-snapshot":
  1582  			if len(td.CmdArgs) != 1 {
  1583  				panic("insufficient args for file-only-snapshot command")
  1584  			}
  1585  			name := td.CmdArgs[0].Key
  1586  			var keyRanges []KeyRange
  1587  			for _, line := range strings.Split(td.Input, "\n") {
  1588  				fields := strings.Fields(line)
  1589  				if len(fields) != 2 {
  1590  					return "expected two fields for file-only snapshot KeyRanges"
  1591  				}
  1592  				kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])}
  1593  				keyRanges = append(keyRanges, kr)
  1594  			}
  1595  
  1596  			s := d.NewEventuallyFileOnlySnapshot(keyRanges)
  1597  			efos[name] = s
  1598  			return "ok"
  1599  
  1600  		case "wait-for-file-only-snapshot":
  1601  			if len(td.CmdArgs) != 1 {
   1602  				panic("insufficient args for wait-for-file-only-snapshot command")
  1603  			}
  1604  			name := td.CmdArgs[0].Key
  1605  			err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond)
  1606  			if err != nil {
  1607  				return err.Error()
  1608  			}
  1609  			return "ok"
  1610  
  1611  		case "unblock":
  1612  			name := td.CmdArgs[0].Key
  1613  			blockedCompactionsMu.Lock()
  1614  			c := compactions[name]
  1615  			delete(compactions, name)
  1616  			blockedCompactionsMu.Unlock()
  1617  			c.unblock <- struct{}{}
  1618  			return "ok"
  1619  
  1620  		case "compact":
  1621  			async := false
  1622  			var otherArgs []datadriven.CmdArg
  1623  			var bc *blockedCompaction
  1624  			for i := range td.CmdArgs {
  1625  				switch td.CmdArgs[i].Key {
  1626  				case "block":
  1627  					name := td.CmdArgs[i].Vals[0]
  1628  					bc = &blockedCompaction{startBlock: make(chan struct{}), unblock: make(chan struct{})}
  1629  					blockedCompactionsMu.Lock()
  1630  					compactions[name] = bc
  1631  					blockNextCompaction = true
  1632  					blockedCompactionName = name
  1633  					blockedCompactionsMu.Unlock()
  1634  					async = true
  1635  				default:
  1636  					otherArgs = append(otherArgs, td.CmdArgs[i])
  1637  				}
  1638  			}
  1639  			var tdClone datadriven.TestData
  1640  			tdClone = *td
  1641  			tdClone.CmdArgs = otherArgs
  1642  			if !async {
  1643  				err := runCompactCmd(td, d)
  1644  				if err != nil {
  1645  					return err.Error()
  1646  				}
  1647  			} else {
  1648  				wg.Add(1)
  1649  				go func() {
  1650  					defer wg.Done()
  1651  					_ = runCompactCmd(&tdClone, d)
  1652  				}()
  1653  				<-bc.startBlock
  1654  				return "spun off in separate goroutine"
  1655  			}
  1656  			return "ok"
  1657  		case "wait-for-background-error":
  1658  			err := <-backgroundErrs
  1659  			return err.Error()
  1660  		default:
  1661  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1662  		}
  1663  	})
  1664  }
  1665  
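        // TestIngestExternal is a datadriven test, driven by
        // testdata/ingest_external, exercising ingestion of external sstables that
        // live in remote storage, along with downloading them via Download.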
  1666  func TestIngestExternal(t *testing.T) {
  1667  	var mem vfs.FS
  1668  	var d *DB
  1669  	var flushed bool
  1670  	defer func() {
  1671  		require.NoError(t, d.Close())
  1672  	}()
  1673  
  1674  	var remoteStorage remote.Storage
  1675  
  1676  	reset := func() {
  1677  		if d != nil {
  1678  			require.NoError(t, d.Close())
  1679  		}
  1680  
  1681  		mem = vfs.NewMem()
  1682  		require.NoError(t, mem.MkdirAll("ext", 0755))
  1683  		remoteStorage = remote.NewInMem()
  1684  		opts := &Options{
  1685  			FS:                    mem,
  1686  			L0CompactionThreshold: 100,
  1687  			L0StopWritesThreshold: 100,
  1688  			DebugCheck:            DebugCheckLevels,
  1689  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
  1690  				flushed = true
  1691  			}},
  1692  			FormatMajorVersion: FormatVirtualSSTables,
  1693  		}
  1694  		opts.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1695  			"external-locator": remoteStorage,
  1696  		})
  1697  		opts.Experimental.CreateOnShared = remote.CreateOnSharedNone
  1698  		// Disable automatic compactions because otherwise we'll race with
  1699  		// delete-only compactions triggered by ingesting range tombstones.
  1700  		opts.DisableAutomaticCompactions = true
  1701  		lel := MakeLoggingEventListener(DefaultLogger)
  1702  		opts.EventListener = &lel
  1703  
  1704  		var err error
  1705  		d, err = Open("", opts)
  1706  		require.NoError(t, err)
  1707  		require.NoError(t, d.SetCreatorID(1))
  1708  	}
  1709  	reset()
  1710  
  1711  	datadriven.RunTest(t, "testdata/ingest_external", func(t *testing.T, td *datadriven.TestData) string {
  1712  		switch td.Cmd {
  1713  		case "reset":
  1714  			reset()
  1715  			return ""
  1716  		case "batch":
  1717  			b := d.NewIndexedBatch()
  1718  			if err := runBatchDefineCmd(td, b); err != nil {
  1719  				return err.Error()
  1720  			}
  1721  			if err := b.Commit(nil); err != nil {
  1722  				return err.Error()
  1723  			}
  1724  			return ""
  1725  		case "build-remote":
  1726  			if err := runBuildRemoteCmd(td, d, remoteStorage); err != nil {
  1727  				return err.Error()
  1728  			}
  1729  			return ""
  1730  
  1731  		case "flush":
  1732  			if err := d.Flush(); err != nil {
  1733  				return err.Error()
  1734  			}
  1735  			return ""
  1736  
  1737  		case "ingest-external":
  1738  			flushed = false
  1739  			if err := runIngestExternalCmd(td, d, "external-locator"); err != nil {
  1740  				return err.Error()
  1741  			}
  1742  			// Wait for a possible flush.
  1743  			d.mu.Lock()
  1744  			for d.mu.compact.flushing {
  1745  				d.mu.compact.cond.Wait()
  1746  			}
  1747  			d.mu.Unlock()
  1748  			if flushed {
  1749  				return "memtable flushed"
  1750  			}
  1751  			return ""
  1752  
  1753  		case "download":
  1754  			if len(td.CmdArgs) != 2 {
  1755  				panic("insufficient args for download command")
  1756  			}
  1757  			l := []byte(td.CmdArgs[0].Key)
  1758  			r := []byte(td.CmdArgs[1].Key)
  1759  			spans := []DownloadSpan{{StartKey: l, EndKey: r}}
  1760  			ctx, cancel := context.WithTimeout(context.TODO(), 1*time.Minute)
  1761  			defer cancel()
  1762  			err := d.Download(ctx, spans)
  1763  			if err != nil {
  1764  				return err.Error()
  1765  			}
  1766  			return "ok"
  1767  
  1768  		case "get":
  1769  			return runGetCmd(t, td, d)
  1770  
  1771  		case "iter":
  1772  			iter, _ := d.NewIter(&IterOptions{
  1773  				KeyTypes: IterKeyTypePointsAndRanges,
  1774  			})
  1775  			return runIterCmd(td, iter, true)
  1776  
  1777  		case "lsm":
  1778  			return runLSMCmd(td, d)
  1779  
  1780  		case "metrics":
  1781  			// The asynchronous loading of table stats can change metrics, so
  1782  			// wait for all the tables' stats to be loaded.
  1783  			d.mu.Lock()
  1784  			d.waitTableStats()
  1785  			d.mu.Unlock()
  1786  
  1787  			return d.Metrics().StringForTests()
  1788  
  1789  		case "wait-pending-table-stats":
  1790  			return runTableStatsCmd(td, d)
  1791  
  1792  		case "compact":
  1793  			if len(td.CmdArgs) != 2 {
  1794  				panic("insufficient args for compact command")
  1795  			}
  1796  			l := td.CmdArgs[0].Key
  1797  			r := td.CmdArgs[1].Key
  1798  			err := d.Compact([]byte(l), []byte(r), false)
  1799  			if err != nil {
  1800  				return err.Error()
  1801  			}
  1802  			return ""
  1803  		default:
  1804  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1805  		}
  1806  	})
  1807  }
  1808  
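        // TestIngestMemtableOverlaps is a datadriven test, driven by
        // testdata/ingest_memtable_overlaps, verifying detection of overlap between
        // ingested file bounds and memtable contents under both a default and a
        // reverse comparer.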
  1809  func TestIngestMemtableOverlaps(t *testing.T) {
  1810  	comparers := []Comparer{
  1811  		{Name: "default", Compare: DefaultComparer.Compare, FormatKey: DefaultComparer.FormatKey},
  1812  		{
  1813  			Name:      "reverse",
  1814  			Compare:   func(a, b []byte) int { return DefaultComparer.Compare(b, a) },
  1815  			FormatKey: DefaultComparer.FormatKey,
  1816  		},
  1817  	}
  1818  	m := make(map[string]*Comparer)
  1819  	for i := range comparers {
  1820  		c := &comparers[i]
  1821  		m[c.Name] = c
  1822  	}
  1823  
  1824  	for _, comparer := range comparers {
  1825  		t.Run(comparer.Name, func(t *testing.T) {
  1826  			var mem *memTable
  1827  
  1828  			parseMeta := func(s string) *fileMetadata {
  1829  				parts := strings.Split(s, "-")
  1830  				meta := &fileMetadata{}
  1831  				if len(parts) != 2 {
  1832  					t.Fatalf("malformed table spec: %s", s)
  1833  				}
  1834  				var smallest, largest base.InternalKey
  1835  				if strings.Contains(parts[0], ".") {
  1836  					if !strings.Contains(parts[1], ".") {
  1837  						t.Fatalf("malformed table spec: %s", s)
  1838  					}
  1839  					smallest = base.ParseInternalKey(parts[0])
  1840  					largest = base.ParseInternalKey(parts[1])
  1841  				} else {
  1842  					smallest = InternalKey{UserKey: []byte(parts[0])}
  1843  					largest = InternalKey{UserKey: []byte(parts[1])}
  1844  				}
  1845  				// If we're using a reverse comparer, flip the file bounds.
  1846  				if mem.cmp(smallest.UserKey, largest.UserKey) > 0 {
  1847  					smallest, largest = largest, smallest
  1848  				}
  1849  				meta.ExtendPointKeyBounds(comparer.Compare, smallest, largest)
  1850  				meta.InitPhysicalBacking()
  1851  				return meta
  1852  			}
  1853  
  1854  			datadriven.RunTest(t, "testdata/ingest_memtable_overlaps", func(t *testing.T, d *datadriven.TestData) string {
  1855  				switch d.Cmd {
  1856  				case "define":
  1857  					b := newBatch(nil)
  1858  					if err := runBatchDefineCmd(d, b); err != nil {
  1859  						return err.Error()
  1860  					}
  1861  
  1862  					opts := &Options{
  1863  						Comparer: &comparer,
  1864  					}
  1865  					opts.EnsureDefaults().WithFSDefaults()
  1866  					if len(d.CmdArgs) > 1 {
  1867  						return fmt.Sprintf("%s expects at most 1 argument", d.Cmd)
  1868  					}
  1869  					if len(d.CmdArgs) == 1 {
  1870  						opts.Comparer = m[d.CmdArgs[0].String()]
  1871  						if opts.Comparer == nil {
  1872  							return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, d.CmdArgs[0].String())
  1873  						}
  1874  					}
  1875  
  1876  					mem = newMemTable(memTableOptions{Options: opts})
  1877  					if err := mem.apply(b, 0); err != nil {
  1878  						return err.Error()
  1879  					}
  1880  					return ""
  1881  
  1882  				case "overlaps":
  1883  					var buf bytes.Buffer
  1884  					for _, data := range strings.Split(d.Input, "\n") {
  1885  						var keyRanges []internalKeyRange
  1886  						for _, part := range strings.Fields(data) {
  1887  							meta := parseMeta(part)
  1888  							keyRanges = append(keyRanges, internalKeyRange{smallest: meta.Smallest, largest: meta.Largest})
  1889  						}
  1890  						fmt.Fprintf(&buf, "%t\n", ingestMemtableOverlaps(mem.cmp, mem, keyRanges))
  1891  					}
  1892  					return buf.String()
  1893  
  1894  				default:
  1895  					return fmt.Sprintf("unknown command: %s", d.Cmd)
  1896  				}
  1897  			})
  1898  		})
  1899  	}
  1900  }
  1901  
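        // TestKeyRangeBasic exercises KeyRange.Contains and KeyRange.Overlaps,
        // including behavior at exclusive sentinel keys.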
  1902  func TestKeyRangeBasic(t *testing.T) {
  1903  	cmp := base.DefaultComparer.Compare
  1904  	k1 := KeyRange{Start: []byte("b"), End: []byte("c")}
  1905  
  1906  	// Tests for Contains()
  1907  	require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet)))
  1908  	require.False(t, k1.Contains(cmp, base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet)))
  1909  	require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("bb"), 1, InternalKeyKindSet)))
  1910  	require.True(t, k1.Contains(cmp, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("c"))))
  1911  
  1912  	m1 := &fileMetadata{
  1913  		Smallest: base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet),
  1914  		Largest:  base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet),
  1915  	}
  1916  	require.True(t, k1.Overlaps(cmp, m1))
  1917  	m2 := &fileMetadata{
  1918  		Smallest: base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet),
  1919  		Largest:  base.MakeInternalKey([]byte("d"), 1, InternalKeyKindSet),
  1920  	}
  1921  	require.False(t, k1.Overlaps(cmp, m2))
  1922  	m3 := &fileMetadata{
  1923  		Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet),
  1924  		Largest:  base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("b")),
  1925  	}
  1926  	require.False(t, k1.Overlaps(cmp, m3))
  1927  	m4 := &fileMetadata{
  1928  		Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet),
  1929  		Largest:  base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet),
  1930  	}
  1931  	require.True(t, k1.Overlaps(cmp, m4))
  1932  }
  1933  
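        // BenchmarkIngestOverlappingMemtable measures the cost of ingesting an
        // sstable that overlaps a varying number of queued memtables, forcing a
        // flush before the ingestion can be applied.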
  1934  func BenchmarkIngestOverlappingMemtable(b *testing.B) {
  1935  	assertNoError := func(err error) {
  1936  		b.Helper()
  1937  		if err != nil {
  1938  			b.Fatal(err)
  1939  		}
  1940  	}
  1941  
  1942  	for count := 1; count < 6; count++ {
  1943  		b.Run(fmt.Sprintf("memtables=%d", count), func(b *testing.B) {
  1944  			for i := 0; i < b.N; i++ {
  1945  				b.StopTimer()
  1946  				mem := vfs.NewMem()
  1947  				d, err := Open("", &Options{
  1948  					FS: mem,
  1949  				})
  1950  				assertNoError(err)
  1951  
  1952  				// Create memtables.
  1953  				for {
  1954  					assertNoError(d.Set([]byte("a"), nil, nil))
  1955  					d.mu.Lock()
  1956  					done := len(d.mu.mem.queue) == count
  1957  					d.mu.Unlock()
  1958  					if done {
  1959  						break
  1960  					}
  1961  				}
  1962  
  1963  				// Create the overlapping sstable that will force a flush when ingested.
  1964  				f, err := mem.Create("ext")
  1965  				assertNoError(err)
  1966  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  1967  				assertNoError(w.Set([]byte("a"), nil))
  1968  				assertNoError(w.Close())
  1969  
  1970  				b.StartTimer()
  1971  				assertNoError(d.Ingest([]string{"ext"}))
  1972  			}
  1973  		})
  1974  	}
  1975  }
  1976  
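        // TestIngestTargetLevel is a datadriven test, driven by
        // testdata/ingest_target_level, verifying the level chosen for an ingested
        // file given the current LSM shape and any in-progress compactions.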
  1977  func TestIngestTargetLevel(t *testing.T) {
  1978  	var d *DB
  1979  	defer func() {
  1980  		if d != nil {
  1981  			// Ignore errors because this test defines fake in-progress compactions
  1982  			// that prohibit clean shutdown.
  1983  			_ = d.Close()
  1984  		}
  1985  	}()
  1986  
  1987  	parseMeta := func(s string) *fileMetadata {
  1988  		var rkey bool
  1989  		if len(s) >= 4 && s[0:4] == "rkey" {
  1990  			rkey = true
  1991  			s = s[5:]
  1992  		}
  1993  		parts := strings.Split(s, "-")
  1994  		if len(parts) != 2 {
  1995  			t.Fatalf("malformed table spec: %s", s)
  1996  		}
  1997  		var m *fileMetadata
  1998  		if rkey {
  1999  			m = (&fileMetadata{}).ExtendRangeKeyBounds(
  2000  				d.cmp,
  2001  				InternalKey{UserKey: []byte(parts[0])},
  2002  				InternalKey{UserKey: []byte(parts[1])},
  2003  			)
  2004  		} else {
  2005  			m = (&fileMetadata{}).ExtendPointKeyBounds(
  2006  				d.cmp,
  2007  				InternalKey{UserKey: []byte(parts[0])},
  2008  				InternalKey{UserKey: []byte(parts[1])},
  2009  			)
  2010  		}
  2011  		m.InitPhysicalBacking()
  2012  		return m
  2013  	}
  2014  
  2015  	datadriven.RunTest(t, "testdata/ingest_target_level", func(t *testing.T, td *datadriven.TestData) string {
  2016  		switch td.Cmd {
  2017  		case "define":
  2018  			if d != nil {
  2019  				// Ignore errors because this test defines fake in-progress
  2020  				// compactions that prohibit clean shutdown.
  2021  				_ = d.Close()
  2022  			}
  2023  
  2024  			var err error
  2025  			opts := Options{
  2026  				FormatMajorVersion: internalFormatNewest,
  2027  			}
  2028  			opts.WithFSDefaults()
  2029  			if d, err = runDBDefineCmd(td, &opts); err != nil {
  2030  				return err.Error()
  2031  			}
  2032  
  2033  			readState := d.loadReadState()
  2034  			c := &checkConfig{
  2035  				logger:    d.opts.Logger,
  2036  				comparer:  d.opts.Comparer,
  2037  				readState: readState,
  2038  				newIters:  d.newIters,
  2039  				// TODO: runDBDefineCmd doesn't properly update the visible sequence
  2040  				// number, so we explicitly configure the level checker with a very
  2041  				// large sequence number; otherwise the DB appears empty.
  2042  				seqNum: InternalKeySeqNumMax,
  2043  			}
  2044  			if err := checkLevelsInternal(c); err != nil {
  2045  				return err.Error()
  2046  			}
  2047  			readState.unref()
  2048  
  2049  			d.mu.Lock()
  2050  			s := d.mu.versions.currentVersion().String()
  2051  			d.mu.Unlock()
  2052  			return s
  2053  
  2054  		case "target":
  2055  			var buf bytes.Buffer
  2056  			suggestSplit := false
  2057  			for _, cmd := range td.CmdArgs {
  2058  				switch cmd.Key {
  2059  				case "suggest-split":
  2060  					suggestSplit = true
  2061  				}
  2062  			}
  2063  			for _, target := range strings.Split(td.Input, "\n") {
  2064  				meta := parseMeta(target)
  2065  				level, overlapFile, err := ingestTargetLevel(
  2066  					d.newIters, d.tableNewRangeKeyIter, IterOptions{logger: d.opts.Logger},
  2067  					d.opts.Comparer, d.mu.versions.currentVersion(), 1, d.mu.compact.inProgress, meta,
  2068  					suggestSplit)
  2069  				if err != nil {
  2070  					return err.Error()
  2071  				}
  2072  				if overlapFile != nil {
  2073  					fmt.Fprintf(&buf, "%d (split file: %s)\n", level, overlapFile.FileNum)
  2074  				} else {
  2075  					fmt.Fprintf(&buf, "%d\n", level)
  2076  				}
  2077  			}
  2078  			return buf.String()
  2079  
  2080  		default:
  2081  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  2082  		}
  2083  	})
  2084  }
  2085  
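        // TestIngest is the primary datadriven ingestion test, driven by
        // testdata/ingest. The reset command's enable-split argument re-opens the
        // DB with Experimental.IngestSplit enabled.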
  2086  func TestIngest(t *testing.T) {
  2087  	var mem vfs.FS
  2088  	var d *DB
  2089  	var flushed bool
  2090  	defer func() {
  2091  		require.NoError(t, d.Close())
  2092  	}()
  2093  
  2094  	reset := func(split bool) {
  2095  		if d != nil {
  2096  			require.NoError(t, d.Close())
  2097  		}
  2098  
  2099  		mem = vfs.NewMem()
  2100  		require.NoError(t, mem.MkdirAll("ext", 0755))
  2101  		opts := &Options{
  2102  			FS:                    mem,
  2103  			L0CompactionThreshold: 100,
  2104  			L0StopWritesThreshold: 100,
  2105  			DebugCheck:            DebugCheckLevels,
  2106  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
  2107  				flushed = true
  2108  			}},
  2109  			FormatMajorVersion: internalFormatNewest,
  2110  		}
  2111  		opts.Experimental.IngestSplit = func() bool {
  2112  			return split
  2113  		}
  2114  		// Disable automatic compactions because otherwise we'll race with
  2115  		// delete-only compactions triggered by ingesting range tombstones.
  2116  		opts.DisableAutomaticCompactions = true
  2117  
  2118  		var err error
  2119  		d, err = Open("", opts)
  2120  		require.NoError(t, err)
  2121  	}
  2122  	reset(false /* split */)
  2123  
  2124  	datadriven.RunTest(t, "testdata/ingest", func(t *testing.T, td *datadriven.TestData) string {
  2125  		switch td.Cmd {
  2126  		case "reset":
  2127  			split := false
  2128  			for _, cmd := range td.CmdArgs {
  2129  				switch cmd.Key {
  2130  				case "enable-split":
  2131  					split = true
  2132  				default:
  2133  					return fmt.Sprintf("unexpected key: %s", cmd.Key)
  2134  				}
  2135  			}
  2136  			reset(split)
  2137  			return ""
  2138  		case "batch":
  2139  			b := d.NewIndexedBatch()
  2140  			if err := runBatchDefineCmd(td, b); err != nil {
  2141  				return err.Error()
  2142  			}
  2143  			if err := b.Commit(nil); err != nil {
  2144  				return err.Error()
  2145  			}
  2146  			return ""
  2147  
  2148  		case "build":
  2149  			if err := runBuildCmd(td, d, mem); err != nil {
  2150  				return err.Error()
  2151  			}
  2152  			return ""
  2153  
  2154  		case "ingest":
  2155  			flushed = false
  2156  			if err := runIngestCmd(td, d, mem); err != nil {
  2157  				return err.Error()
  2158  			}
  2159  			// Wait for a possible flush.
  2160  			d.mu.Lock()
  2161  			for d.mu.compact.flushing {
  2162  				d.mu.compact.cond.Wait()
  2163  			}
  2164  			d.mu.Unlock()
  2165  			if flushed {
  2166  				return "memtable flushed"
  2167  			}
  2168  			return ""
  2169  
  2170  		case "get":
  2171  			return runGetCmd(t, td, d)
  2172  
  2173  		case "iter":
  2174  			iter, _ := d.NewIter(&IterOptions{
  2175  				KeyTypes: IterKeyTypePointsAndRanges,
  2176  			})
  2177  			return runIterCmd(td, iter, true)
  2178  
  2179  		case "lsm":
  2180  			return runLSMCmd(td, d)
  2181  
  2182  		case "metrics":
  2183  			// The asynchronous loading of table stats can change metrics, so
  2184  			// wait for all the tables' stats to be loaded.
  2185  			d.mu.Lock()
  2186  			d.waitTableStats()
  2187  			d.mu.Unlock()
  2188  
  2189  			return d.Metrics().StringForTests()
  2190  
  2191  		case "wait-pending-table-stats":
  2192  			return runTableStatsCmd(td, d)
  2193  
  2194  		case "compact":
  2195  			if len(td.CmdArgs) != 2 {
  2196  				panic("insufficient args for compact command")
  2197  			}
  2198  			l := td.CmdArgs[0].Key
  2199  			r := td.CmdArgs[1].Key
  2200  			err := d.Compact([]byte(l), []byte(r), false)
  2201  			if err != nil {
  2202  				return err.Error()
  2203  			}
  2204  			return ""
  2205  		default:
  2206  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  2207  		}
  2208  	})
  2209  }
  2210  
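        // TestIngestError injects an error into the i-th filesystem operation of an
        // ingestion, for increasing i, and verifies that the injected error is
        // surfaced (directly or via a recovered panic) rather than swallowed.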
  2211  func TestIngestError(t *testing.T) {
  2212  	for i := int32(0); ; i++ {
  2213  		mem := vfs.NewMem()
  2214  
  2215  		f0, err := mem.Create("ext0")
  2216  		require.NoError(t, err)
  2217  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f0), sstable.WriterOptions{})
  2218  		require.NoError(t, w.Set([]byte("d"), nil))
  2219  		require.NoError(t, w.Close())
  2220  		f1, err := mem.Create("ext1")
  2221  		require.NoError(t, err)
  2222  		w = sstable.NewWriter(objstorageprovider.NewFileWritable(f1), sstable.WriterOptions{})
  2223  		require.NoError(t, w.Set([]byte("d"), nil))
  2224  		require.NoError(t, w.Close())
  2225  
  2226  		ii := errorfs.OnIndex(-1)
  2227  		d, err := Open("", &Options{
  2228  			FS:                    errorfs.Wrap(mem, errorfs.ErrInjected.If(ii)),
  2229  			Logger:                panicLogger{},
  2230  			L0CompactionThreshold: 8,
  2231  		})
  2232  		require.NoError(t, err)
  2233  		// Force the creation of an L0 sstable that overlaps with the tables
  2234  		// we'll attempt to ingest. This ensures that we exercise filesystem
  2235  		// codepaths when determining the ingest target level.
  2236  		require.NoError(t, d.Set([]byte("a"), nil, nil))
  2237  		require.NoError(t, d.Set([]byte("d"), nil, nil))
  2238  		require.NoError(t, d.Flush())
  2239  
  2240  		t.Run(fmt.Sprintf("index-%d", i), func(t *testing.T) {
  2241  			defer func() {
  2242  				if r := recover(); r != nil {
  2243  					if e, ok := r.(error); ok && errors.Is(e, errorfs.ErrInjected) {
  2244  						return
  2245  					}
  2246  					// d.opts.Logger.Fatalf won't propagate ErrInjected
  2247  					// itself, but the panic value should contain the error message.
  2248  					if strings.HasSuffix(fmt.Sprint(r), errorfs.ErrInjected.Error()) {
  2249  						return
  2250  					}
  2251  					t.Fatal(r)
  2252  				}
  2253  			}()
  2254  
  2255  			ii.Store(i)
  2256  			err1 := d.Ingest([]string{"ext0"})
  2257  			err2 := d.Ingest([]string{"ext1"})
  2258  			err := firstError(err1, err2)
  2259  			if err != nil && !errors.Is(err, errorfs.ErrInjected) {
  2260  				t.Fatal(err)
  2261  			}
  2262  		})
  2263  
  2264  		// d.Close may error if we failed to flush the manifest.
  2265  		_ = d.Close()
  2266  
  2267  		// If the injector's index is non-negative, the i-th filesystem
  2268  		// operation was never executed.
  2269  		if ii.Load() >= 0 {
  2270  			break
  2271  		}
  2272  	}
  2273  }
  2274  
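        // TestIngestIdempotence verifies that ingesting multiple links to the same
        // external sstable succeeds every time.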
  2275  func TestIngestIdempotence(t *testing.T) {
  2276  	// Use an on-disk filesystem, because Ingest with a MemFS will copy the
  2277  	// ingested file rather than link it.
  2278  	dir, err := os.MkdirTemp("", "ingest-idempotence")
  2279  	require.NoError(t, err)
  2280  	defer os.RemoveAll(dir)
  2281  	fs := vfs.Default
  2282  
  2283  	path := fs.PathJoin(dir, "ext")
  2284  	f, err := fs.Create(fs.PathJoin(dir, "ext"))
  2285  	require.NoError(t, err)
  2286  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2287  	require.NoError(t, w.Set([]byte("d"), nil))
  2288  	require.NoError(t, w.Close())
  2289  
  2290  	d, err := Open(dir, &Options{
  2291  		FS: fs,
  2292  	})
  2293  	require.NoError(t, err)
  2294  	const count = 4
  2295  	for i := 0; i < count; i++ {
  2296  		ingestPath := fs.PathJoin(dir, fmt.Sprintf("ext%d", i))
  2297  		require.NoError(t, fs.Link(path, ingestPath))
  2298  		require.NoError(t, d.Ingest([]string{ingestPath}))
  2299  	}
  2300  	require.NoError(t, d.Close())
  2301  }
  2302  
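        // TestIngestCompact repeatedly ingests copies of the same sstable with very
        // low L0 thresholds, verifying that compactions keep pace with ingestion and
        // no indefinite write stall occurs.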
  2303  func TestIngestCompact(t *testing.T) {
  2304  	mem := vfs.NewMem()
  2305  	lel := MakeLoggingEventListener(&base.InMemLogger{})
  2306  	d, err := Open("", &Options{
  2307  		EventListener:         &lel,
  2308  		FS:                    mem,
  2309  		L0CompactionThreshold: 1,
  2310  		L0StopWritesThreshold: 1,
  2311  	})
  2312  	require.NoError(t, err)
  2313  
  2314  	src := func(i int) string {
  2315  		return fmt.Sprintf("ext%d", i)
  2316  	}
  2317  	f, err := mem.Create(src(0))
  2318  	require.NoError(t, err)
  2319  
  2320  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2321  	key := []byte("a")
  2322  	require.NoError(t, w.Add(base.MakeInternalKey(key, 0, InternalKeyKindSet), nil))
  2323  	require.NoError(t, w.Close())
  2324  
  2325  	// Make N copies of the sstable.
  2326  	const count = 20
  2327  	for i := 1; i < count; i++ {
  2328  		require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i)))
  2329  	}
  2330  
  2331  	// Ingest the same sstable multiple times. Compaction should take place as
  2332  	// ingestion happens, preventing an indefinite write stall from occurring.
  2333  	for i := 0; i < count; i++ {
  2334  		if i == 10 {
  2335  			// Half-way through the ingestions, set a key in the memtable to force
  2336  			// overlap with the memtable which will require the memtable to be
  2337  			// flushed.
  2338  			require.NoError(t, d.Set(key, nil, nil))
  2339  		}
  2340  		require.NoError(t, d.Ingest([]string{src(i)}))
  2341  	}
  2342  
  2343  	require.NoError(t, d.Close())
  2344  }
  2345  
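        // TestConcurrentIngest performs several ingestions concurrently and verifies
        // that they all complete without error.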
  2346  func TestConcurrentIngest(t *testing.T) {
  2347  	mem := vfs.NewMem()
  2348  	d, err := Open("", &Options{
  2349  		FS: mem,
  2350  	})
  2351  	require.NoError(t, err)
  2352  
  2353  	// Create an sstable with 2 keys. This is necessary to trigger the overlap
  2354  	// bug because an sstable with a single key will not have overlap in internal
  2355  	// key space, and the sequence number assignment alone already guarantees
  2356  	// correct ordering.
  2357  	src := func(i int) string {
  2358  		return fmt.Sprintf("ext%d", i)
  2359  	}
  2360  	f, err := mem.Create(src(0))
  2361  	require.NoError(t, err)
  2362  
  2363  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2364  	require.NoError(t, w.Set([]byte("a"), nil))
  2365  	require.NoError(t, w.Set([]byte("b"), nil))
  2366  	require.NoError(t, w.Close())
  2367  
  2368  	// Make N copies of the sstable.
  2369  	errCh := make(chan error, 5)
  2370  	for i := 1; i < cap(errCh); i++ {
  2371  		require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i)))
  2372  	}
  2373  
  2374  	// Perform N ingestions concurrently.
  2375  	for i := 0; i < cap(errCh); i++ {
  2376  		go func(i int) {
  2377  			err := d.Ingest([]string{src(i)})
  2378  			if err == nil {
  2379  				if _, err = d.opts.FS.Stat(src(i)); oserror.IsNotExist(err) {
  2380  					err = nil
  2381  				}
  2382  			}
  2383  			errCh <- err
  2384  		}(i)
  2385  	}
  2386  	for i := 0; i < cap(errCh); i++ {
  2387  		require.NoError(t, <-errCh)
  2388  	}
  2389  
  2390  	require.NoError(t, d.Close())
  2391  }
  2392  
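        // TestConcurrentIngestCompact exercises an ingestion racing with a
        // compaction that changes the shape of the LSM, which can invalidate the
        // ingestion's initially computed target level.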
  2393  func TestConcurrentIngestCompact(t *testing.T) {
  2394  	for i := 0; i < 2; i++ {
  2395  		t.Run("", func(t *testing.T) {
  2396  			mem := vfs.NewMem()
  2397  			compactionReady := make(chan struct{})
  2398  			compactionBegin := make(chan struct{})
  2399  			d, err := Open("", &Options{
  2400  				FS: mem,
  2401  				EventListener: &EventListener{
  2402  					TableCreated: func(info TableCreateInfo) {
  2403  						if info.Reason == "compacting" {
  2404  							close(compactionReady)
  2405  							<-compactionBegin
  2406  						}
  2407  					},
  2408  				},
  2409  			})
  2410  			require.NoError(t, err)
  2411  
  2412  			ingest := func(keys ...string) {
  2413  				t.Helper()
  2414  				f, err := mem.Create("ext")
  2415  				require.NoError(t, err)
  2416  
  2417  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2418  				for _, k := range keys {
  2419  					require.NoError(t, w.Set([]byte(k), nil))
  2420  				}
  2421  				require.NoError(t, w.Close())
  2422  				require.NoError(t, d.Ingest([]string{"ext"}))
  2423  			}
  2424  
  2425  			compact := func(start, end string) {
  2426  				t.Helper()
  2427  				require.NoError(t, d.Compact([]byte(start), []byte(end), false))
  2428  			}
  2429  
  2430  			lsm := func() string {
  2431  				d.mu.Lock()
  2432  				s := d.mu.versions.currentVersion().String()
  2433  				d.mu.Unlock()
  2434  				return s
  2435  			}
  2436  
  2437  			expectLSM := func(expected string) {
  2438  				t.Helper()
  2439  				expected = strings.TrimSpace(expected)
  2440  				actual := strings.TrimSpace(lsm())
  2441  				if expected != actual {
  2442  					t.Fatalf("expected\n%s\nbut found\n%s", expected, actual)
  2443  				}
  2444  			}
  2445  
  2446  			ingest("a")
  2447  			ingest("a")
  2448  			ingest("c")
  2449  			ingest("c")
  2450  
  2451  			expectLSM(`
  2452  0.0:
  2453    000005:[a#11,SET-a#11,SET]
  2454    000007:[c#13,SET-c#13,SET]
  2455  6:
  2456    000004:[a#10,SET-a#10,SET]
  2457    000006:[c#12,SET-c#12,SET]
  2458  `)
  2459  
  2460  			// At this point ingestion of an sstable containing only key "b" will be
  2461  			// targeted at L6. Yet a concurrent compaction of sstables 5 and 7 will
  2462  			// create a new sstable in L6 spanning ["a"-"c"]. So the ingestion must
  2463  			// actually target L5.
  2464  
  2465  			switch i {
  2466  			case 0:
  2467  				// Compact, then ingest.
  2468  				go func() {
  2469  					<-compactionReady
  2470  
  2471  					ingest("b")
  2472  
  2473  					close(compactionBegin)
  2474  				}()
  2475  
  2476  				compact("a", "z")
  2477  
  2478  				expectLSM(`
  2479  0.0:
  2480    000009:[b#14,SET-b#14,SET]
  2481  6:
  2482    000008:[a#0,SET-c#0,SET]
  2483  `)
  2484  
  2485  			case 1:
  2486  				// Ingest, then compact
  2487  				var wg sync.WaitGroup
  2488  				wg.Add(1)
  2489  				go func() {
  2490  					defer wg.Done()
  2491  					close(compactionBegin)
  2492  					compact("a", "z")
  2493  				}()
  2494  
  2495  				ingest("b")
  2496  				wg.Wait()
  2497  
  2498  				// Because we're performing the ingestion and compaction concurrently,
  2499  				// we can't guarantee any particular LSM structure at this point. The
  2500  				// test will fail with an assertion error due to overlapping sstables
  2501  				// if there is insufficient synchronization between ingestion and
  2502  				// compaction.
  2503  			}
  2504  
  2505  			require.NoError(t, d.Close())
  2506  		})
  2507  	}
  2508  }
  2509  
  2510  func TestIngestFlushQueuedMemTable(t *testing.T) {
  2511  	// Verify that ingestion forces a flush of a queued memtable.
  2512  
  2513  	// Test with a format major version prior to FormatFlushableIngest and one
  2514  	// after. Both should result in the same statistic calculations.
  2515  	for _, fmv := range []FormatMajorVersion{FormatFlushableIngest - 1, internalFormatNewest} {
  2516  		func(fmv FormatMajorVersion) {
  2517  			mem := vfs.NewMem()
  2518  			d, err := Open("", &Options{
  2519  				FS:                 mem,
  2520  				FormatMajorVersion: fmv,
  2521  			})
  2522  			require.NoError(t, err)
  2523  
  2524  			// Add the key "a" to the memtable, then fill up the memtable with the key
  2525  			// "b". The ingested sstable will only overlap with the queued memtable.
  2526  			require.NoError(t, d.Set([]byte("a"), nil, nil))
  2527  			for {
  2528  				require.NoError(t, d.Set([]byte("b"), nil, nil))
  2529  				d.mu.Lock()
  2530  				done := len(d.mu.mem.queue) == 2
  2531  				d.mu.Unlock()
  2532  				if done {
  2533  					break
  2534  				}
  2535  			}
  2536  
  2537  			ingest := func(keys ...string) {
  2538  				t.Helper()
  2539  				f, err := mem.Create("ext")
  2540  				require.NoError(t, err)
  2541  
  2542  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  2543  					TableFormat: fmv.MinTableFormat(),
  2544  				})
  2545  				for _, k := range keys {
  2546  					require.NoError(t, w.Set([]byte(k), nil))
  2547  				}
  2548  				require.NoError(t, w.Close())
  2549  				stats, err := d.IngestWithStats([]string{"ext"})
  2550  				require.NoError(t, err)
  2551  				require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes)
  2552  				require.Equal(t, stats.MemtableOverlappingFiles, 1)
  2553  				require.Less(t, uint64(0), stats.Bytes)
  2554  			}
  2555  
  2556  			ingest("a")
  2557  
  2558  			require.NoError(t, d.Close())
  2559  		}(fmv)
  2560  	}
  2561  }
  2562  
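        // TestIngestStats verifies the stats returned by IngestWithStats, in
        // particular that ApproxIngestedIntoL0Bytes is populated only when the
        // ingested table is placed in L0.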
  2563  func TestIngestStats(t *testing.T) {
  2564  	mem := vfs.NewMem()
  2565  	d, err := Open("", &Options{
  2566  		FS: mem,
  2567  	})
  2568  	require.NoError(t, err)
  2569  
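        	// ingest writes the given keys to an sstable, ingests it, and asserts that
        	// the returned stats reflect whether the table was placed in L0.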
  2570  	ingest := func(expectedLevel int, keys ...string) {
  2571  		t.Helper()
  2572  		f, err := mem.Create("ext")
  2573  		require.NoError(t, err)
  2574  
  2575  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2576  		for _, k := range keys {
  2577  			require.NoError(t, w.Set([]byte(k), nil))
  2578  		}
  2579  		require.NoError(t, w.Close())
  2580  		stats, err := d.IngestWithStats([]string{"ext"})
  2581  		require.NoError(t, err)
  2582  		if expectedLevel == 0 {
  2583  			require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes)
  2584  		} else {
  2585  			require.EqualValues(t, 0, stats.ApproxIngestedIntoL0Bytes)
  2586  		}
  2587  		require.Less(t, uint64(0), stats.Bytes)
  2588  	}
  2589  	ingest(6, "a")
  2590  	ingest(0, "a")
  2591  	ingest(6, "b", "g")
  2592  	ingest(0, "c")
  2593  	require.NoError(t, d.Close())
  2594  }
  2595  
  2596  func TestIngestFlushQueuedLargeBatch(t *testing.T) {
  2597  	// Verify that ingestion forces a flush of a queued large batch.
  2598  
  2599  	mem := vfs.NewMem()
  2600  	d, err := Open("", &Options{
  2601  		FS: mem,
  2602  	})
  2603  	require.NoError(t, err)
  2604  
  2605  	// The default large batch threshold is slightly less than 1/2 of the
  2606  	// memtable size, which makes it awkward to trigger a problem with flushing
  2607  	// queued large batches. Manually adjust the threshold to 1/8 of the memtable
  2608  	// size in order to more easily create a situation where a large batch is
  2609  	// queued but not automatically flushed.
  2610  	d.mu.Lock()
  2611  	d.largeBatchThreshold = d.opts.MemTableSize / 8
  2612  	d.mu.Unlock()
  2613  
  2614  	// Set a record with a large value. This will be transformed into a large
  2615  	// batch and placed in the flushable queue.
  2616  	require.NoError(t, d.Set([]byte("a"), bytes.Repeat([]byte("v"), int(d.largeBatchThreshold)), nil))
  2617  
  2618  	ingest := func(keys ...string) {
  2619  		t.Helper()
  2620  		f, err := mem.Create("ext")
  2621  		require.NoError(t, err)
  2622  
  2623  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2624  		for _, k := range keys {
  2625  			require.NoError(t, w.Set([]byte(k), nil))
  2626  		}
  2627  		require.NoError(t, w.Close())
  2628  		require.NoError(t, d.Ingest([]string{"ext"}))
  2629  	}
  2630  
  2631  	ingest("a")
  2632  
  2633  	require.NoError(t, d.Close())
  2634  }
  2635  
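        // TestIngestMemtablePendingOverlap verifies sequence number invariants when
        // an ingestion overlaps a batch that has been assigned a sequence number but
        // has not yet been applied to the memtable.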
  2636  func TestIngestMemtablePendingOverlap(t *testing.T) {
  2637  	mem := vfs.NewMem()
  2638  	d, err := Open("", &Options{
  2639  		FS: mem,
  2640  	})
  2641  	require.NoError(t, err)
  2642  
  2643  	d.mu.Lock()
  2644  	// Use a custom commit pipeline apply function to give us control over
  2645  	// timing of events.
  2646  	assignedBatch := make(chan struct{})
  2647  	applyBatch := make(chan struct{})
  2648  	originalApply := d.commit.env.apply
  2649  	d.commit.env.apply = func(b *Batch, mem *memTable) error {
  2650  		assignedBatch <- struct{}{}
  2651  		applyBatch <- struct{}{}
  2652  		return originalApply(b, mem)
  2653  	}
  2654  	d.mu.Unlock()
  2655  
  2656  	ingest := func(keys ...string) {
  2657  		t.Helper()
  2658  		f, err := mem.Create("ext")
  2659  		require.NoError(t, err)
  2660  
  2661  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2662  		for _, k := range keys {
  2663  			require.NoError(t, w.Set([]byte(k), nil))
  2664  		}
  2665  		require.NoError(t, w.Close())
  2666  		require.NoError(t, d.Ingest([]string{"ext"}))
  2667  	}
  2668  
  2669  	var wg sync.WaitGroup
  2670  	wg.Add(2)
  2671  
  2672  	// First, Set('c') begins. This call will:
  2673  	//
  2674  	// * enqueue the batch to the pending queue.
  2675  	// * allocate a sequence number `x`.
  2676  	// * write the batch to the WAL.
  2677  	//
  2678  	// and then block until we read from the `applyBatch` channel down below.
  2679  	go func() {
  2680  		err := d.Set([]byte("c"), nil, nil)
  2681  		if err != nil {
  2682  			t.Error(err)
  2683  		}
  2684  		wg.Done()
  2685  	}()
  2686  
  2687  	// When the above Set('c') is ready to apply, it sends on the
  2688  	// `assignedBatch` channel. Once that happens, we start Ingest('a', 'c').
  2689  	// The Ingest('a', 'c') allocates sequence number `x + 1`.
  2690  	go func() {
  2691  		// Wait until the Set has grabbed a sequence number before ingesting.
  2692  		<-assignedBatch
  2693  		ingest("a", "c")
  2694  		wg.Done()
  2695  	}()
  2696  
  2697  	// The Set('c')#1 and Ingest('a', 'c')#2 are both pending. To maintain
  2698  	// sequence number invariants, the Set needs to be applied and flushed
  2699  	// before the Ingest determines its target level.
  2700  	//
  2701  	// Sleep a bit to ensure that the Ingest has time to call into
  2702  	// AllocateSeqNum. Once it allocates its sequence number, it should see
  2703  	// that there are unpublished sequence numbers below it and spin until the
  2704  	// Set's sequence number is published. After sleeping, read from
  2705  	// `applyBatch` to actually allow the Set to apply and publish its
  2706  	// sequence number.
  2707  	time.Sleep(100 * time.Millisecond)
  2708  	<-applyBatch
  2709  
  2710  	// Wait for both calls to complete.
  2711  	wg.Wait()
  2712  	require.NoError(t, d.Flush())
  2713  	require.NoError(t, d.CheckLevels(nil))
  2714  	require.NoError(t, d.Close())
  2715  }
  2716  
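        // testLogger is a Logger that routes Infof and Errorf to testing.TB.Logf and
        // Fatalf to testing.TB.Fatalf.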
  2717  type testLogger struct {
  2718  	t testing.TB
  2719  }
  2720  
  2721  func (l testLogger) Infof(format string, args ...interface{}) {
  2722  	l.t.Logf(format, args...)
  2723  }
  2724  
  2725  func (l testLogger) Errorf(format string, args ...interface{}) {
  2726  	l.t.Logf(format, args...)
  2727  }
  2728  
  2729  func (l testLogger) Fatalf(format string, args ...interface{}) {
  2730  	l.t.Fatalf(format, args...)
  2731  }
  2732  
  2733  // TestIngestMemtableOverlapRace is a regression test for the race described in
  2734  // #2196. If an ingest checks for overlap with the mutable memtable and
  2735  // finds no overlap, it must not allow overlapping keys with later sequence
  2736  // numbers to be applied to the memtable and the memtable to be flushed before
  2737  // the ingest completes.
  2738  //
  2739  // This test operates by committing the same key concurrently:
  2740  //   - 1 goroutine repeatedly ingests the same sstable writing the key `foo`
  2741  //   - n goroutines repeatedly apply batches writing the key `foo` and trigger
  2742  //     flushes.
  2743  //
  2744  // After a while, the database is closed and the manifest is verified. Version
  2745  // edits should contain new files with monotonically increasing sequence
  2746  // numbers, since every flush and every ingest conflicts with one another.
  2747  func TestIngestMemtableOverlapRace(t *testing.T) {
  2748  	mem := vfs.NewMem()
  2749  	el := MakeLoggingEventListener(testLogger{t: t})
  2750  	d, err := Open("", &Options{
  2751  		FS: mem,
  2752  		// Disable automatic compactions to keep the manifest clean; only
  2753  		// flushes and ingests.
  2754  		DisableAutomaticCompactions: true,
  2755  		// Disable the WAL to speed up batch commits.
  2756  		DisableWAL:    true,
  2757  		EventListener: &el,
  2758  		// We're endlessly appending to L0 without clearing it, so set a maximal
  2759  		// stop writes threshold.
  2760  		L0StopWritesThreshold: math.MaxInt,
  2761  		// Accumulating more than 1 immutable memtable doesn't help us exercise
  2762  		// the bug, since the committed keys need to be flushed promptly.
  2763  		MemTableStopWritesThreshold: 2,
  2764  	})
  2765  	require.NoError(t, err)
  2766  
  2767  	// Prepare an sstable `ext` deleting foo.
  2768  	f, err := mem.Create("ext")
  2769  	require.NoError(t, err)
  2770  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2771  	require.NoError(t, w.Delete([]byte("foo")))
  2772  	require.NoError(t, w.Close())
  2773  
  2774  	var done atomic.Bool
  2775  	const numSetters = 2
  2776  	var wg sync.WaitGroup
  2777  	wg.Add(numSetters + 1)
  2778  
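        	// untilDone runs fn in a loop until the done flag is set, then marks the
        	// WaitGroup as done.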
  2779  	untilDone := func(fn func()) {
  2780  		defer wg.Done()
  2781  		for !done.Load() {
  2782  			fn()
  2783  		}
  2784  	}
  2785  
  2786  	// Ingest in the background.
  2787  	totalIngests := 0
  2788  	go untilDone(func() {
  2789  		filename := fmt.Sprintf("ext%d", totalIngests)
  2790  		require.NoError(t, mem.Link("ext", filename))
  2791  		require.NoError(t, d.Ingest([]string{filename}))
  2792  		totalIngests++
  2793  	})
  2794  
  2795  	// Apply batches and trigger flushes in the background.
  2796  	wo := &WriteOptions{Sync: false}
  2797  	var localCommits [numSetters]int
  2798  	for i := 0; i < numSetters; i++ {
  2799  		i := i
  2800  		v := []byte(fmt.Sprintf("v%d", i+1))
  2801  		go untilDone(func() {
  2802  			// Commit a batch setting foo=vN.
  2803  			b := d.NewBatch()
  2804  			require.NoError(t, b.Set([]byte("foo"), v, nil))
  2805  			require.NoError(t, b.Commit(wo))
  2806  			localCommits[i]++
  2807  			d.AsyncFlush()
  2808  		})
  2809  	}
  2810  	time.Sleep(100 * time.Millisecond)
  2811  	done.Store(true)
  2812  	wg.Wait()
  2813  
  2814  	var totalCommits int
  2815  	for i := 0; i < numSetters; i++ {
  2816  		totalCommits += localCommits[i]
  2817  	}
  2818  	m := d.Metrics()
  2819  	tot := m.Total()
  2820  	t.Logf("Committed %d batches.", totalCommits)
  2821  	t.Logf("Flushed %d times.", m.Flush.Count)
  2822  	t.Logf("Ingested %d sstables.", tot.TablesIngested)
  2823  	require.NoError(t, d.CheckLevels(nil))
  2824  	require.NoError(t, d.Close())
  2825  
  2826  	// Replay the manifest. Every flush and ingest is a separate version edit.
  2827  	// Since they all write the same key and compactions are disabled, sequence
  2828  	// numbers of new files should be monotonically increasing.
  2829  	//
  2830  	// This check is necessary because most of these sstables are ingested into
  2831  	// L0. The L0 sublevels construction will order them by LargestSeqNum, even
  2832  	// if they're added to L0 out-of-order. The CheckLevels call at the end of
  2833  	// the test may find that the sublevels are all appropriately ordered, but
  2834  	// the manifest may reveal they were added to the LSM out-of-order.
  2835  	dbDesc, err := Peek("", mem)
  2836  	require.NoError(t, err)
  2837  	require.True(t, dbDesc.Exists)
  2838  	f, err = mem.Open(dbDesc.ManifestFilename)
  2839  	require.NoError(t, err)
  2840  	defer f.Close()
  2841  	rr := record.NewReader(f, 0 /* logNum */)
  2842  	var largest *fileMetadata
  2843  	for {
  2844  		r, err := rr.Next()
  2845  		if err == io.EOF || err == record.ErrInvalidChunk {
  2846  			break
  2847  		}
  2848  		require.NoError(t, err)
  2849  		var ve manifest.VersionEdit
  2850  		require.NoError(t, ve.Decode(r))
  2851  		t.Log(ve.String())
  2852  		for _, f := range ve.NewFiles {
  2853  			if largest != nil {
  2854  				require.Equal(t, 0, f.Level)
  2855  				if largest.LargestSeqNum > f.Meta.LargestSeqNum {
  2856  					t.Fatalf("previous largest file %s has sequence number > next file %s", largest, f.Meta)
  2857  				}
  2858  			}
  2859  			largest = f.Meta
  2860  		}
  2861  	}
  2862  }
  2863  
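        // ingestCrashFS wraps a vfs.FS and panics with errorfs.ErrInjected after a
        // successful Link, simulating a crash immediately after linking an ingested
        // file.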
  2864  type ingestCrashFS struct {
  2865  	vfs.FS
  2866  }
  2867  
  2868  func (fs ingestCrashFS) Link(oldname, newname string) error {
  2869  	if err := fs.FS.Link(oldname, newname); err != nil {
  2870  		return err
  2871  	}
  2872  	panic(errorfs.ErrInjected)
  2873  }
  2874  
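        // noRemoveFS wraps a vfs.FS and fails all Remove calls with
        // errorfs.ErrInjected.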
  2875  type noRemoveFS struct {
  2876  	vfs.FS
  2877  }
  2878  
  2879  func (fs noRemoveFS) Remove(string) error {
  2880  	return errorfs.ErrInjected
  2881  }
  2882  
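        // TestIngestFileNumReuseCrash simulates a crash after ingested files have
        // been linked but before the manifest was updated, and verifies that the
        // original external files are never overwritten when the DB is reopened.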
  2883  func TestIngestFileNumReuseCrash(t *testing.T) {
  2884  	const count = 10
  2885  	// Use an on-disk filesystem, because Ingest with a MemFS will copy the
  2886  	// ingested file rather than link it.
  2887  	dir, err := os.MkdirTemp("", "ingest-filenum-reuse")
  2888  	require.NoError(t, err)
  2889  	defer os.RemoveAll(dir)
  2890  	fs := vfs.Default
  2891  
  2892  	readFile := func(s string) []byte {
  2893  		f, err := fs.Open(fs.PathJoin(dir, s))
  2894  		require.NoError(t, err)
  2895  		b, err := io.ReadAll(f)
  2896  		require.NoError(t, err)
  2897  		require.NoError(t, f.Close())
  2898  		return b
  2899  	}
  2900  
  2901  	// Create sstables to ingest.
  2902  	var files []string
  2903  	var fileBytes [][]byte
  2904  	for i := 0; i < count; i++ {
  2905  		name := fmt.Sprintf("ext%d", i)
  2906  		f, err := fs.Create(fs.PathJoin(dir, name))
  2907  		require.NoError(t, err)
  2908  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2909  		require.NoError(t, w.Set([]byte(fmt.Sprintf("foo%d", i)), nil))
  2910  		require.NoError(t, w.Close())
  2911  		files = append(files, name)
  2912  		fileBytes = append(fileBytes, readFile(name))
  2913  	}
  2914  
  2915  	// Open a database with a filesystem that will successfully link the
  2916  	// ingested files but then panic. This is an approximation of what a crash
  2917  	// after linking but before updating the manifest would look like.
  2918  	d, err := Open(dir, &Options{
  2919  		FS: ingestCrashFS{FS: fs},
  2920  	})
  2921  	// A flush here ensures that the file number bumps from creating OPTIONS
  2922  	// files, etc., are recorded in the manifest. We want the nextFileNum after
  2923  	// the restart to match the file number of one of our ingested sstables.
  2924  	require.NoError(t, err)
  2925  	require.NoError(t, d.Set([]byte("boop"), nil, nil))
  2926  	require.NoError(t, d.Flush())
  2927  	for _, f := range files {
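        		// Each Ingest links the external file and then panics via ingestCrashFS;
        		// recover the injected error and verify it below.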
  2928  		func() {
  2929  			defer func() { err = recover().(error) }()
  2930  			err = d.Ingest([]string{fs.PathJoin(dir, f)})
  2931  		}()
  2932  		if err == nil || !errors.Is(err, errorfs.ErrInjected) {
  2933  			t.Fatalf("expected injected error, got %v", err)
  2934  		}
  2935  	}
  2936  	// Leave something in the WAL so that Open will flush while replaying the
  2937  	// WAL.
  2938  	require.NoError(t, d.Set([]byte("wal"), nil, nil))
  2939  	require.NoError(t, d.Close())
  2940  
  2941  	// There are now two links to each external file: the original extX link
  2942  	// and a numbered sstable link. The sstable files are still not a part of
  2943  	// the manifest and so they may be overwritten. Open will detect the
  2944  	// obsolete numbered sstables and try to remove them. The FS here is wrapped
  2945  	// to induce errors on Remove calls. Even if we're unsuccessful in
  2946  	// removing the obsolete files, the external files should not be
  2947  	// overwritten.
  2948  	d, err = Open(dir, &Options{FS: noRemoveFS{FS: fs}})
  2949  	require.NoError(t, err)
  2950  	require.NoError(t, d.Set([]byte("bar"), nil, nil))
  2951  	require.NoError(t, d.Flush())
  2952  	require.NoError(t, d.Close())
  2953  
  2954  	// None of the external files should change despite modifying the linked
  2955  	// versions.
  2956  	for i, f := range files {
  2957  		afterBytes := readFile(f)
  2958  		require.Equal(t, fileBytes[i], afterBytes)
  2959  	}
  2960  }
  2961  
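        // TestIngest_UpdateSequenceNumber is a datadriven test, driven by
        // testdata/ingest_update_seqnums, for ingestUpdateSeqNum, verifying how file
        // bounds are rewritten when sequence numbers are assigned to ingested tables.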
  2962  func TestIngest_UpdateSequenceNumber(t *testing.T) {
  2963  	mem := vfs.NewMem()
  2964  	cmp := base.DefaultComparer.Compare
  2965  	parse := func(input string) (*sstable.Writer, error) {
  2966  		f, err := mem.Create("ext")
  2967  		if err != nil {
  2968  			return nil, err
  2969  		}
  2970  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  2971  			TableFormat: sstable.TableFormatMax,
  2972  		})
  2973  		for _, data := range strings.Split(input, "\n") {
  2974  			if strings.HasPrefix(data, "rangekey: ") {
  2975  				data = strings.TrimPrefix(data, "rangekey: ")
  2976  				s := keyspan.ParseSpan(data)
  2977  				err := rangekey.Encode(&s, w.AddRangeKey)
  2978  				if err != nil {
  2979  					return nil, err
  2980  				}
  2981  				continue
  2982  			}
  2983  			j := strings.Index(data, ":")
  2984  			if j < 0 {
  2985  				return nil, errors.Newf("malformed input: %s\n", data)
  2986  			}
  2987  			key := base.ParseInternalKey(data[:j])
  2988  			value := []byte(data[j+1:])
  2989  			if err := w.Add(key, value); err != nil {
  2990  				return nil, err
  2991  			}
  2992  		}
  2993  		return w, nil
  2994  	}
  2995  
  2996  	var (
  2997  		seqnum uint64
  2998  		err    error
  2999  		metas  []*fileMetadata
  3000  	)
  3001  	datadriven.RunTest(t, "testdata/ingest_update_seqnums", func(t *testing.T, td *datadriven.TestData) string {
  3002  		switch td.Cmd {
  3003  		case "starting-seqnum":
  3004  			seqnum, err = strconv.ParseUint(td.Input, 10, 64)
  3005  			if err != nil {
  3006  				return err.Error()
  3007  			}
  3008  			return ""
  3009  
  3010  		case "reset":
  3011  			metas = metas[:0]
  3012  			return ""
  3013  
  3014  		case "load":
  3015  			w, err := parse(td.Input)
  3016  			if err != nil {
  3017  				return err.Error()
  3018  			}
  3019  			if err = w.Close(); err != nil {
  3020  				return err.Error()
  3021  			}
  3022  			defer w.Close()
  3023  
  3024  			// Format the bounds of the table.
  3025  			wm, err := w.Metadata()
  3026  			if err != nil {
  3027  				return err.Error()
  3028  			}
  3029  
  3030  			// Upper bounds for range dels and range keys are expected to be sentinel
  3031  			// keys.
  3032  			maybeUpdateUpperBound := func(key base.InternalKey) base.InternalKey {
  3033  				switch k := key.Kind(); {
  3034  				case k == base.InternalKeyKindRangeDelete:
  3035  					key.Trailer = base.InternalKeyRangeDeleteSentinel
  3036  				case rangekey.IsRangeKey(k):
  3037  					return base.MakeExclusiveSentinelKey(k, key.UserKey)
  3038  				}
  3039  				return key
  3040  			}
  3041  
  3042  			// Construct the file metadata from the writer metadata.
  3043  			m := &fileMetadata{
  3044  				SmallestSeqNum: 0, // Simulate an ingestion.
  3045  				LargestSeqNum:  0,
  3046  			}
  3047  			if wm.HasPointKeys {
  3048  				m.ExtendPointKeyBounds(cmp, wm.SmallestPoint, wm.LargestPoint)
  3049  			}
  3050  			if wm.HasRangeDelKeys {
  3051  				m.ExtendPointKeyBounds(
  3052  					cmp,
  3053  					wm.SmallestRangeDel,
  3054  					maybeUpdateUpperBound(wm.LargestRangeDel),
  3055  				)
  3056  			}
  3057  			if wm.HasRangeKeys {
  3058  				m.ExtendRangeKeyBounds(
  3059  					cmp,
  3060  					wm.SmallestRangeKey,
  3061  					maybeUpdateUpperBound(wm.LargestRangeKey),
  3062  				)
  3063  			}
  3064  			m.InitPhysicalBacking()
  3065  			if err := m.Validate(cmp, base.DefaultFormatter); err != nil {
  3066  				return err.Error()
  3067  			}
  3068  
  3069  			// Collect this file.
  3070  			metas = append(metas, m)
  3071  
  3072  			// Return an index number for the file.
  3073  			return fmt.Sprintf("file %d\n", len(metas)-1)
  3074  
  3075  		case "update-files":
  3076  			// Update the bounds across all files.
  3077  			if err = ingestUpdateSeqNum(cmp, base.DefaultFormatter, seqnum, ingestLoadResult{localMeta: metas}); err != nil {
  3078  				return err.Error()
  3079  			}
  3080  
  3081  			var buf bytes.Buffer
  3082  			for i, m := range metas {
  3083  				fmt.Fprintf(&buf, "file %d:\n", i)
  3084  				fmt.Fprintf(&buf, "  combined: %s-%s\n", m.Smallest, m.Largest)
  3085  				fmt.Fprintf(&buf, "    points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey)
  3086  				fmt.Fprintf(&buf, "    ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey)
  3087  			}
  3088  
  3089  			return buf.String()
  3090  
  3091  		default:
  3092  			return fmt.Sprintf("unknown command %s\n", td.Cmd)
  3093  		}
  3094  	})
  3095  }
  3096  
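        // TestIngestCleanup verifies that ingestCleanup removes the given files and
        // returns an appropriate error when a file is unknown to the objstorage
        // provider or has not yet been closed.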
  3097  func TestIngestCleanup(t *testing.T) {
  3098  	fns := []base.FileNum{0, 1, 2}
  3099  
  3100  	testCases := []struct {
  3101  		closeFiles   []base.FileNum
  3102  		cleanupFiles []base.FileNum
  3103  		wantErr      string
  3104  	}{
  3105  		// Close and remove all files.
  3106  		{
  3107  			closeFiles:   fns,
  3108  			cleanupFiles: fns,
  3109  		},
  3110  		// Remove a non-existent file.
  3111  		{
  3112  			closeFiles:   fns,
  3113  			cleanupFiles: []base.FileNum{3},
  3114  			wantErr:      "unknown to the objstorage provider",
  3115  		},
  3116  		// Remove a file that has not been closed.
  3117  		{
  3118  			closeFiles:   []base.FileNum{0, 2},
  3119  			cleanupFiles: fns,
  3120  			wantErr:      oserror.ErrInvalid.Error(),
  3121  		},
  3122  		// Remove all files, one of which is still open, plus a file that does not exist.
  3123  		{
  3124  			closeFiles:   []base.FileNum{0, 2},
  3125  			cleanupFiles: []base.FileNum{0, 1, 2, 3},
  3126  			wantErr:      oserror.ErrInvalid.Error(), // The first error encountered is due to the open file.
  3127  		},
  3128  	}
  3129  
  3130  	for _, tc := range testCases {
  3131  		t.Run("", func(t *testing.T) {
  3132  			mem := vfs.NewMem()
  3133  			mem.UseWindowsSemantics(true)
  3134  			objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(mem, ""))
  3135  			require.NoError(t, err)
  3136  			defer objProvider.Close()
  3137  
  3138  			// Create the files in the VFS.
  3139  			metaMap := make(map[base.FileNum]objstorage.Writable)
  3140  			for _, fn := range fns {
  3141  				w, _, err := objProvider.Create(context.Background(), base.FileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{})
  3142  				require.NoError(t, err)
  3143  
  3144  				metaMap[fn] = w
  3145  			}
  3146  
  3147  			// Close a select number of files.
  3148  			for _, m := range tc.closeFiles {
  3149  				w, ok := metaMap[m]
  3150  				if !ok {
  3151  					continue
  3152  				}
  3153  				require.NoError(t, w.Finish())
  3154  			}
  3155  
  3156  			// Clean up the set of files in the FS.
  3157  			var toRemove []*fileMetadata
  3158  			for _, fn := range tc.cleanupFiles {
  3159  				m := &fileMetadata{FileNum: fn}
  3160  				m.InitPhysicalBacking()
  3161  				toRemove = append(toRemove, m)
  3162  			}
  3163  
  3164  			err = ingestCleanup(objProvider, toRemove)
  3165  			if tc.wantErr != "" {
  3166  				require.Error(t, err, "got no error, expected %s", tc.wantErr)
  3167  				require.Contains(t, err.Error(), tc.wantErr)
  3168  			} else {
  3169  				require.NoError(t, err)
  3170  			}
  3171  		})
  3172  	}
  3173  }
  3174  
  3175  // fatalCapturingLogger captures a fatal error instead of panicking.
  3176  type fatalCapturingLogger struct {
  3177  	t   testing.TB
  3178  	err error
  3179  }
  3180  
  3181  // Infof implements the Logger interface.
  3182  func (l *fatalCapturingLogger) Infof(fmt string, args ...interface{}) {
  3183  	l.t.Logf(fmt, args...)
  3184  }
  3185  
  3186  // Errorf implements the Logger interface.
  3187  func (l *fatalCapturingLogger) Errorf(fmt string, args ...interface{}) {
  3188  	l.t.Logf(fmt, args...)
  3189  }
  3190  
  3191  // Fatalf implements the Logger interface.
  3192  func (l *fatalCapturingLogger) Fatalf(_ string, args ...interface{}) {
  3193  	l.err = args[0].(error)
  3194  }
  3195  
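        // TestIngestValidation introduces corruption or injected errors at various
        // points in an sstable and verifies where the resulting error is reported:
        // synchronously by Ingest, via Logger.Fatalf from asynchronous validation, or
        // via EventListener.BackgroundError.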
  3196  func TestIngestValidation(t *testing.T) {
  3197  	type keyVal struct {
  3198  		key, val []byte
  3199  	}
  3200  	// The corruptionLocation enum defines where, if anywhere, to corrupt an
  3201  	// sstable. corruptionLocation{Start,End} describe the start and end
  3202  	// data blocks. corruptionLocationInternal describes a random data block
  3203  	// that's neither the start or end blocks. The Ingest operation does not
  3204  	// that's neither the start nor the end block. The Ingest operation does not
  3205  	// introduced using corruptionLocationInternal will not be discovered until
  3206  	// the asynchronous validation job runs.
  3207  	type corruptionLocation int
  3208  	const (
  3209  		corruptionLocationNone corruptionLocation = iota
  3210  		corruptionLocationStart
  3211  		corruptionLocationEnd
  3212  		corruptionLocationInternal
  3213  	)
  3214  	// The errReportLocation type defines an enum to allow tests to enforce
  3215  	// expectations about how an error surfaced during ingestion or validation
  3216  	// is reported. Asynchronous validation that uncovers corruption should call
  3217  	// Fatalf on the Logger. Asynchronous validation that encounters
  3218  	// non-corruption errors should surface them through the
  3219  	// EventListener.BackgroundError func.
  3220  	type errReportLocation int
  3221  	const (
  3222  		errReportLocationNone errReportLocation = iota
  3223  		errReportLocationIngest
  3224  		errReportLocationFatal
  3225  		errReportLocationBackgroundError
  3226  	)
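
        	// To summarize the expectations encoded in the test cases below: corruption
        	// in the start or end data block is detected synchronously and surfaces as
        	// an error returned from Ingest itself; corruption in an interior block is
        	// only found by the asynchronous validation job and is reported via
        	// Logger.Fatalf; and a non-corruption error injected during validation is
        	// reported via EventListener.BackgroundError.
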
  3227  	const (
  3228  		nKeys     = 1_000
  3229  		keySize   = 16
  3230  		valSize   = 100
  3231  		blockSize = 100
  3232  
  3233  		ingestTableName = "ext"
  3234  	)
  3235  
  3236  	seed := uint64(time.Now().UnixNano())
  3237  	rng := rand.New(rand.NewSource(seed))
  3238  	t.Logf("rng seed = %d", seed)
  3239  
  3240  	// errfsCounter is used by test cases that make use of an errorfs.Injector
  3241  	// to inject errors into the ingest validation code path.
  3242  	var errfsCounter atomic.Int32
  3243  	testCases := []struct {
  3244  		description     string
  3245  		cLoc            corruptionLocation
  3246  		wantErrType     errReportLocation
  3247  		wantErr         error
  3248  		errorfsInjector errorfs.Injector
  3249  	}{
  3250  		{
  3251  			description: "no corruption",
  3252  			cLoc:        corruptionLocationNone,
  3253  			wantErrType: errReportLocationNone,
  3254  		},
  3255  		{
  3256  			description: "start block",
  3257  			cLoc:        corruptionLocationStart,
  3258  			wantErr:     ErrCorruption,
  3259  			wantErrType: errReportLocationIngest,
  3260  		},
  3261  		{
  3262  			description: "end block",
  3263  			cLoc:        corruptionLocationEnd,
  3264  			wantErr:     ErrCorruption,
  3265  			wantErrType: errReportLocationIngest,
  3266  		},
  3267  		{
  3268  			description: "non-end block",
  3269  			cLoc:        corruptionLocationInternal,
  3270  			wantErr:     ErrCorruption,
  3271  			wantErrType: errReportLocationFatal,
  3272  		},
  3273  		{
  3274  			description: "non-corruption error",
  3275  			cLoc:        corruptionLocationNone,
  3276  			wantErr:     errorfs.ErrInjected,
  3277  			wantErrType: errReportLocationBackgroundError,
  3278  			errorfsInjector: errorfs.InjectorFunc(func(op errorfs.Op) error {
  3279  				// Inject an error on the first read-at operation on an sstable
  3280  				// (excluding the read on the sstable before ingestion has
  3281  				// linked it in).
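        				// Concretely, only OpFileReadAt operations on files with
        				// a ".sst" extension are candidates; the staging file
        				// "ext" has no extension, so the pre-ingestion read of it
        				// is skipped by the extension check below.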
  3282  				if op.Path != "ext" && op.Kind != errorfs.OpFileReadAt || filepath.Ext(op.Path) != ".sst" {
  3283  					return nil
  3284  				}
  3285  				if errfsCounter.Add(1) == 1 {
  3286  					return errorfs.ErrInjected
  3287  				}
  3288  				return nil
  3289  			}),
  3290  		},
  3291  	}
  3292  
  3293  	for _, tc := range testCases {
  3294  		t.Run(tc.description, func(t *testing.T) {
  3295  			errfsCounter.Store(0)
  3296  			var wg sync.WaitGroup
  3297  			wg.Add(1)
  3298  
  3299  			fs := vfs.NewMem()
  3300  			var testFS vfs.FS = fs
  3301  			if tc.errorfsInjector != nil {
  3302  				testFS = errorfs.Wrap(fs, tc.errorfsInjector)
  3303  			}
  3304  
  3305  			// backgroundErr is populated by EventListener.BackgroundError.
  3306  			var backgroundErr error
  3307  			logger := &fatalCapturingLogger{t: t}
  3308  			opts := &Options{
  3309  				FS:     testFS,
  3310  				Logger: logger,
  3311  				EventListener: &EventListener{
  3312  					TableValidated: func(i TableValidatedInfo) {
  3313  						wg.Done()
  3314  					},
  3315  					BackgroundError: func(err error) {
  3316  						backgroundErr = err
  3317  					},
  3318  				},
  3319  			}
  3320  			// Disable table stats so that errors aren't accidentally injected
  3321  			// into the table stats collector's reads, and so that the collector
  3322  			// doesn't prime the table+block cache, which would prevent the error
  3323  			// injection from triggering during ingest validation.
  3324  			opts.private.disableTableStats = true
  3325  			opts.Experimental.ValidateOnIngest = true
  3326  			d, err := Open("", opts)
  3327  			require.NoError(t, err)
  3328  			defer func() { require.NoError(t, d.Close()) }()
  3329  
  3330  			corrupt := func(f vfs.File) {
  3331  				readable, err := sstable.NewSimpleReadable(f)
  3332  				require.NoError(t, err)
  3333  				// Compute the layout of the sstable in order to find the
  3334  				// appropriate block locations to corrupt.
  3335  				r, err := sstable.NewReader(readable, sstable.ReaderOptions{})
  3336  				require.NoError(t, err)
  3337  				l, err := r.Layout()
  3338  				require.NoError(t, err)
  3339  
  3340  				// Select an appropriate data block to corrupt.
  3341  				var blockIdx int
  3342  				switch tc.cLoc {
  3343  				case corruptionLocationStart:
  3344  					blockIdx = 0
  3345  				case corruptionLocationEnd:
  3346  					blockIdx = len(l.Data) - 1
  3347  				case corruptionLocationInternal:
  3348  					blockIdx = 1 + rng.Intn(len(l.Data)-2)
  3349  				default:
  3350  					t.Fatalf("unknown corruptionLocation: %d", tc.cLoc)
  3351  				}
  3352  				bh := l.Data[blockIdx]
  3353  
  3354  				// Corrupting a key will cause the ingestion to fail due to a
  3355  				// malformed key, rather than a block checksum mismatch.
  3356  				// Instead, we corrupt the last byte in the selected block,
  3357  				// before the trailer, which corresponds to a value.
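        				// (bh.Length does not include the block trailer, so this
        				// offset addresses the final byte of the block payload;
        				// corrupting any payload byte fails the block checksum
        				// verification when the block is later read.)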
  3358  				offset := bh.Offset + bh.Length - 1
  3359  				_, err = f.WriteAt([]byte("\xff"), int64(offset))
  3360  				require.NoError(t, err)
  3361  				require.NoError(t, r.Close())
  3362  			}
  3363  
  3364  			type errT struct {
  3365  				errLoc errReportLocation
  3366  				err    error
  3367  			}
  3368  			runIngest := func(keyVals []keyVal) (et errT) {
  3369  				f, err := fs.Create(ingestTableName)
  3370  				require.NoError(t, err)
  3371  				defer func() { _ = fs.Remove(ingestTableName) }()
  3372  
  3373  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  3374  					BlockSize:   blockSize,     // Create many smaller blocks.
  3375  					Compression: NoCompression, // For simpler debugging.
  3376  				})
  3377  				for _, kv := range keyVals {
  3378  					require.NoError(t, w.Set(kv.key, kv.val))
  3379  				}
  3380  				require.NoError(t, w.Close())
  3381  
  3382  				// Possibly corrupt the file.
  3383  				if tc.cLoc != corruptionLocationNone {
  3384  					f, err = fs.OpenReadWrite(ingestTableName)
  3385  					require.NoError(t, err)
  3386  					corrupt(f)
  3387  				}
  3388  
  3389  				// Ingest the external table.
  3390  				err = d.Ingest([]string{ingestTableName})
  3391  				if err != nil {
  3392  					et.errLoc = errReportLocationIngest
  3393  					et.err = err
  3394  					return
  3395  				}
  3396  
  3397  				// Wait for validation of the sstable to complete.
  3398  				wg.Wait()
  3399  
  3400  				// Return any error encountered during validation.
  3401  				if logger.err != nil {
  3402  					et.errLoc = errReportLocationFatal
  3403  					et.err = logger.err
  3404  				} else if backgroundErr != nil {
  3405  					et.errLoc = errReportLocationBackgroundError
  3406  					et.err = backgroundErr
  3407  				}
  3408  				return
  3409  			}
  3410  
  3411  			// Construct a set of keys to ingest.
  3412  			var keyVals []keyVal
  3413  			for i := 0; i < nKeys; i++ {
  3414  				key := make([]byte, keySize)
  3415  				_, err = rng.Read(key)
  3416  				require.NoError(t, err)
  3417  
  3418  				val := make([]byte, valSize)
  3419  				_, err = rng.Read(val)
  3420  				require.NoError(t, err)
  3421  
  3422  				keyVals = append(keyVals, keyVal{key, val})
  3423  			}
  3424  
  3425  			// Keys must be sorted.
  3426  			slices.SortFunc(keyVals, func(a, b keyVal) int { return d.cmp(a.key, b.key) })
  3427  
  3428  			// Run the ingestion.
  3429  			et := runIngest(keyVals)
  3430  
  3431  			// Assert we saw the errors we expect.
  3432  			switch tc.wantErrType {
  3433  			case errReportLocationNone:
  3434  				require.Equal(t, errReportLocationNone, et.errLoc)
  3435  				require.NoError(t, et.err)
  3436  			case errReportLocationIngest:
  3437  				require.Equal(t, errReportLocationIngest, et.errLoc)
  3438  				require.Error(t, et.err)
  3439  				require.True(t, errors.Is(et.err, tc.wantErr))
  3440  			case errReportLocationFatal:
  3441  				require.Equal(t, errReportLocationFatal, et.errLoc)
  3442  				require.Error(t, et.err)
  3443  				require.True(t, errors.Is(et.err, tc.wantErr))
  3444  			case errReportLocationBackgroundError:
  3445  				require.Equal(t, errReportLocationBackgroundError, et.errLoc)
  3446  				require.Error(t, et.err)
  3447  				require.True(t, errors.Is(et.err, tc.wantErr))
  3448  			default:
  3449  				t.Fatalf("unknown wantErrType: %d", tc.wantErrType)
  3450  			}
  3451  		})
  3452  	}
  3453  }
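
        // Illustrative note (not part of the original test): applications opt into the
        // asynchronous post-ingest validation exercised above via
        // Options.Experimental.ValidateOnIngest, e.g.
        //
        //	opts := &pebble.Options{}
        //	opts.Experimental.ValidateOnIngest = true
        //	db, err := pebble.Open(dir, opts)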
  3454  
  3455  // BenchmarkManySSTables measures the cost of various operations as the count
  3456  // of SSTables within the database grows.
  3457  func BenchmarkManySSTables(b *testing.B) {
  3458  	counts := []int{10, 1_000, 10_000, 100_000, 1_000_000}
  3459  	ops := []string{"ingest", "calculateInuseKeyRanges"}
  3460  	for _, op := range ops {
  3461  		b.Run(op, func(b *testing.B) {
  3462  			for _, count := range counts {
  3463  				b.Run(fmt.Sprintf("sstables=%d", count), func(b *testing.B) {
  3464  					mem := vfs.NewMem()
  3465  					d, err := Open("", &Options{
  3466  						FS: mem,
  3467  					})
  3468  					require.NoError(b, err)
  3469  
  3470  					var paths []string
  3471  					for i := 0; i < count; i++ {
  3472  						n := fmt.Sprintf("%07d", i)
  3473  						f, err := mem.Create(n)
  3474  						require.NoError(b, err)
  3475  						w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3476  						require.NoError(b, w.Set([]byte(n), nil))
  3477  						require.NoError(b, w.Close())
  3478  						paths = append(paths, n)
  3479  					}
  3480  					require.NoError(b, d.Ingest(paths))
  3481  
  3482  					{
  3483  						const broadIngest = "broad.sst"
  3484  						f, err := mem.Create(broadIngest)
  3485  						require.NoError(b, err)
  3486  						w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3487  						require.NoError(b, w.Set([]byte("0"), nil))
  3488  						require.NoError(b, w.Set([]byte("Z"), nil))
  3489  						require.NoError(b, w.Close())
  3490  						require.NoError(b, d.Ingest([]string{broadIngest}))
  3491  					}
  3492  
  3493  					switch op {
  3494  					case "ingest":
  3495  						runBenchmarkManySSTablesIngest(b, d, mem, count)
  3496  					case "calculateInuseKeyRanges":
  3497  						runBenchmarkManySSTablesInUseKeyRanges(b, d, count)
  3498  					}
  3499  					require.NoError(b, d.Close())
  3500  				})
  3501  			}
  3502  		})
  3503  	}
  3504  }
  3505  
  3506  func runBenchmarkManySSTablesIngest(b *testing.B, d *DB, fs vfs.FS, count int) {
  3507  	b.ResetTimer()
  3508  	for i := 0; i < b.N; i++ {
  3509  		n := fmt.Sprintf("%07d", count+i)
  3510  		f, err := fs.Create(n)
  3511  		require.NoError(b, err)
  3512  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3513  		require.NoError(b, w.Set([]byte(n), nil))
  3514  		require.NoError(b, w.Close())
  3515  		require.NoError(b, d.Ingest([]string{n}))
  3516  	}
  3517  }
  3518  
  3519  func runBenchmarkManySSTablesInUseKeyRanges(b *testing.B, d *DB, count int) {
  3520  	// This benchmark is pretty contrived, but it's not easy to write a
  3521  	// microbenchmark for this in a more natural way. L6 has many files, and
  3522  	// L5 has a single file spanning the entire breadth of the level.
  3523  	d.mu.Lock()
  3524  	defer d.mu.Unlock()
  3525  	v := d.mu.versions.currentVersion()
  3526  	b.ResetTimer()
  3527  
  3528  	smallest := []byte("0")
  3529  	largest := []byte("z")
  3530  	for i := 0; i < b.N; i++ {
  3531  		_ = calculateInuseKeyRanges(v, d.cmp, 0, numLevels-1, smallest, largest)
  3532  	}
  3533  }
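
        // An illustrative invocation (not part of the original file) for running the
        // benchmarks above without also running the package's tests; the benchmark
        // filter and package path below are just examples:
        //
        //	go test -run '^$' -bench 'BenchmarkManySSTables/ingest' github.com/cockroachdb/pebble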