github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/ingest_test.go

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  	"os"
    14  	"path/filepath"
    15  	"sort"
    16  	"strconv"
    17  	"strings"
    18  	"sync"
    19  	"sync/atomic"
    20  	"testing"
    21  	"time"
    22  
    23  	"github.com/cockroachdb/datadriven"
    24  	"github.com/cockroachdb/errors"
    25  	"github.com/cockroachdb/errors/oserror"
    26  	"github.com/cockroachdb/pebble/internal/base"
    27  	"github.com/cockroachdb/pebble/internal/keyspan"
    28  	"github.com/cockroachdb/pebble/internal/manifest"
    29  	"github.com/cockroachdb/pebble/internal/rangekey"
    30  	"github.com/cockroachdb/pebble/internal/testkeys"
    31  	"github.com/cockroachdb/pebble/objstorage"
    32  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
    33  	"github.com/cockroachdb/pebble/objstorage/remote"
    34  	"github.com/cockroachdb/pebble/record"
    35  	"github.com/cockroachdb/pebble/sstable"
    36  	"github.com/cockroachdb/pebble/vfs"
    37  	"github.com/cockroachdb/pebble/vfs/errorfs"
    38  	"github.com/kr/pretty"
    39  	"github.com/stretchr/testify/require"
    40  	"golang.org/x/exp/rand"
    41  )
    42  
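        // TestSSTableKeyCompare runs the cases in testdata/sstable_key_compare
        // through sstableKeyCompare and prints whether each pair of internal keys
        // compares as <, =, or >.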
    43  func TestSSTableKeyCompare(t *testing.T) {
    44  	var buf bytes.Buffer
    45  	datadriven.RunTest(t, "testdata/sstable_key_compare", func(t *testing.T, td *datadriven.TestData) string {
    46  		switch td.Cmd {
    47  		case "cmp":
    48  			buf.Reset()
    49  			for _, line := range strings.Split(td.Input, "\n") {
    50  				fields := strings.Fields(line)
    51  				a := base.ParseInternalKey(fields[0])
    52  				b := base.ParseInternalKey(fields[1])
    53  				got := sstableKeyCompare(testkeys.Comparer.Compare, a, b)
    54  				fmt.Fprintf(&buf, "%38s", fmt.Sprint(a.Pretty(base.DefaultFormatter)))
    55  				switch got {
    56  				case -1:
    57  					fmt.Fprint(&buf, " < ")
    58  				case +1:
    59  					fmt.Fprint(&buf, " > ")
    60  				case 0:
    61  					fmt.Fprint(&buf, " = ")
    62  				}
    63  				fmt.Fprintf(&buf, "%s\n", fmt.Sprint(b.Pretty(base.DefaultFormatter)))
    64  			}
    65  			return buf.String()
    66  		default:
    67  			return fmt.Sprintf("unrecognized command %q", td.Cmd)
    68  		}
    69  	})
    70  }
    71  
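        // TestIngestLoad builds an sstable ("ext") from the datadriven input and
        // runs ingestLoad over it, printing the resulting file metadata: overall,
        // point-key, and range-key bounds.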
    72  func TestIngestLoad(t *testing.T) {
    73  	mem := vfs.NewMem()
    74  
    75  	datadriven.RunTest(t, "testdata/ingest_load", func(t *testing.T, td *datadriven.TestData) string {
    76  		switch td.Cmd {
    77  		case "load":
    78  			writerOpts := sstable.WriterOptions{}
    79  			var dbVersion FormatMajorVersion
    80  			for _, cmdArgs := range td.CmdArgs {
    81  				v, err := strconv.Atoi(cmdArgs.Vals[0])
    82  				if err != nil {
    83  					return err.Error()
    84  				}
    85  				switch k := cmdArgs.Key; k {
    86  				case "writer-version":
    87  					fmv := FormatMajorVersion(v)
    88  					writerOpts.TableFormat = fmv.MaxTableFormat()
    89  				case "db-version":
    90  					dbVersion = FormatMajorVersion(v)
    91  				default:
    92  					return fmt.Sprintf("unknown cmd %s\n", k)
    93  				}
    94  			}
    95  			f, err := mem.Create("ext")
    96  			if err != nil {
    97  				return err.Error()
    98  			}
    99  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writerOpts)
   100  			for _, data := range strings.Split(td.Input, "\n") {
   101  				if strings.HasPrefix(data, "rangekey: ") {
   102  					data = strings.TrimPrefix(data, "rangekey: ")
   103  					s := keyspan.ParseSpan(data)
   104  					err := rangekey.Encode(&s, w.AddRangeKey)
   105  					if err != nil {
   106  						return err.Error()
   107  					}
   108  					continue
   109  				}
   110  
   111  				j := strings.Index(data, ":")
   112  				if j < 0 {
   113  					return fmt.Sprintf("malformed input: %s\n", data)
   114  				}
   115  				key := base.ParseInternalKey(data[:j])
   116  				value := []byte(data[j+1:])
   117  				if err := w.Add(key, value); err != nil {
   118  					return err.Error()
   119  				}
   120  			}
   121  			if err := w.Close(); err != nil {
   122  				return err.Error()
   123  			}
   124  
   125  			opts := (&Options{
   126  				Comparer: DefaultComparer,
   127  				FS:       mem,
   128  			}).WithFSDefaults()
   129  			lr, err := ingestLoad(opts, dbVersion, []string{"ext"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0)
   130  			if err != nil {
   131  				return err.Error()
   132  			}
   133  			var buf bytes.Buffer
   134  			for _, m := range lr.localMeta {
   135  				fmt.Fprintf(&buf, "%d: %s-%s\n", m.FileNum, m.Smallest, m.Largest)
   136  				fmt.Fprintf(&buf, "  points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey)
   137  				fmt.Fprintf(&buf, "  ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey)
   138  			}
   139  			return buf.String()
   140  
   141  		default:
   142  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   143  		}
   144  	})
   145  }
   146  
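        // TestIngestLoadRand writes a random collection of sstables and checks that
        // ingestLoad reports file metadata (bounds, entry counts, sizes) matching
        // what the writers produced.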
   147  func TestIngestLoadRand(t *testing.T) {
   148  	mem := vfs.NewMem()
   149  	rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
   150  	cmp := DefaultComparer.Compare
   151  	version := internalFormatNewest
   152  
   153  	randBytes := func(size int) []byte {
   154  		data := make([]byte, size)
   155  		for i := range data {
   156  			data[i] = byte(rng.Int() & 0xff)
   157  		}
   158  		return data
   159  	}
   160  
   161  	paths := make([]string, 1+rng.Intn(10))
   162  	pending := make([]base.DiskFileNum, len(paths))
   163  	expected := make([]*fileMetadata, len(paths))
   164  	for i := range paths {
   165  		paths[i] = fmt.Sprint(i)
   166  		pending[i] = base.FileNum(rng.Uint64()).DiskFileNum()
   167  		expected[i] = &fileMetadata{
   168  			FileNum: pending[i].FileNum(),
   169  		}
   170  		expected[i].StatsMarkValid()
   171  
   172  		func() {
   173  			f, err := mem.Create(paths[i])
   174  			require.NoError(t, err)
   175  
   176  			keys := make([]InternalKey, 1+rng.Intn(100))
   177  			for i := range keys {
   178  				keys[i] = base.MakeInternalKey(
   179  					randBytes(1+rng.Intn(10)),
   180  					0,
   181  					InternalKeyKindSet)
   182  			}
   183  			sort.Slice(keys, func(i, j int) bool {
   184  				return base.InternalCompare(cmp, keys[i], keys[j]) < 0
   185  			})
   186  
   187  			expected[i].ExtendPointKeyBounds(cmp, keys[0], keys[len(keys)-1])
   188  
   189  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
   190  				TableFormat: version.MaxTableFormat(),
   191  			})
   192  			var count uint64
   193  			for i := range keys {
   194  				if i > 0 && base.InternalCompare(cmp, keys[i-1], keys[i]) == 0 {
   195  					// Duplicate key, ignore.
   196  					continue
   197  				}
   198  				w.Add(keys[i], nil)
   199  				count++
   200  			}
   201  			expected[i].Stats.NumEntries = count
   202  			require.NoError(t, w.Close())
   203  
   204  			meta, err := w.Metadata()
   205  			require.NoError(t, err)
   206  
   207  			expected[i].Size = meta.Size
   208  			expected[i].InitPhysicalBacking()
   209  		}()
   210  	}
   211  
   212  	opts := (&Options{
   213  		Comparer: DefaultComparer,
   214  		FS:       mem,
   215  	}).WithFSDefaults()
   216  	lr, err := ingestLoad(opts, version, paths, nil, nil, 0, pending, nil, 0)
   217  	require.NoError(t, err)
   218  
   219  	for _, m := range lr.localMeta {
   220  		m.CreationTime = 0
   221  	}
   222  	t.Log(strings.Join(pretty.Diff(expected, lr.localMeta), "\n"))
   223  	require.Equal(t, expected, lr.localMeta)
   224  }
   225  
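        // TestIngestLoadInvalid verifies that ingestLoad returns an error for a file
        // that is not a valid sstable (here, an empty file).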
   226  func TestIngestLoadInvalid(t *testing.T) {
   227  	mem := vfs.NewMem()
   228  	f, err := mem.Create("invalid")
   229  	require.NoError(t, err)
   230  	require.NoError(t, f.Close())
   231  
   232  	opts := (&Options{
   233  		Comparer: DefaultComparer,
   234  		FS:       mem,
   235  	}).WithFSDefaults()
   236  	if _, err := ingestLoad(opts, internalFormatNewest, []string{"invalid"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0); err == nil {
   237  		t.Fatalf("expected error, but found success")
   238  	}
   239  }
   240  
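        // TestIngestSortAndVerify exercises ingestSortAndVerify over
        // testdata/ingest_sort_and_verify with both the default and a reverse
        // comparer, printing either the sorted file ranges or the error it returns.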
   241  func TestIngestSortAndVerify(t *testing.T) {
   242  	comparers := map[string]Compare{
   243  		"default": DefaultComparer.Compare,
   244  		"reverse": func(a, b []byte) int {
   245  			return DefaultComparer.Compare(b, a)
   246  		},
   247  	}
   248  
   249  	t.Run("", func(t *testing.T) {
   250  		datadriven.RunTest(t, "testdata/ingest_sort_and_verify", func(t *testing.T, d *datadriven.TestData) string {
   251  			switch d.Cmd {
   252  			case "ingest":
   253  				var buf bytes.Buffer
   254  				var meta []*fileMetadata
   255  				var paths []string
   256  				var cmpName string
   257  				d.ScanArgs(t, "cmp", &cmpName)
   258  				cmp := comparers[cmpName]
   259  				if cmp == nil {
   260  					return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, cmpName)
   261  				}
   262  				for i, data := range strings.Split(d.Input, "\n") {
   263  					parts := strings.Split(data, "-")
   264  					if len(parts) != 2 {
   265  						return fmt.Sprintf("malformed test case: %s", d.Input)
   266  					}
   267  					smallest := base.ParseInternalKey(parts[0])
   268  					largest := base.ParseInternalKey(parts[1])
   269  					if cmp(smallest.UserKey, largest.UserKey) > 0 {
   270  						return fmt.Sprintf("range %v-%v is not valid", smallest, largest)
   271  					}
   272  					m := (&fileMetadata{}).ExtendPointKeyBounds(cmp, smallest, largest)
   273  					m.InitPhysicalBacking()
   274  					meta = append(meta, m)
   275  					paths = append(paths, strconv.Itoa(i))
   276  				}
   277  				lr := ingestLoadResult{localPaths: paths, localMeta: meta}
   278  				err := ingestSortAndVerify(cmp, lr, KeyRange{})
   279  				if err != nil {
   280  					return fmt.Sprintf("%v\n", err)
   281  				}
   282  				for i := range meta {
   283  					fmt.Fprintf(&buf, "%s: %v-%v\n", paths[i], meta[i].Smallest, meta[i].Largest)
   284  				}
   285  				return buf.String()
   286  
   287  			default:
   288  				return fmt.Sprintf("unknown command: %s", d.Cmd)
   289  			}
   290  		})
   291  	})
   292  }
   293  
   294  func TestIngestLink(t *testing.T) {
   295  	// Test linking of tables into the DB directory. Test cleanup when one of the
   296  	// tables cannot be linked.
   297  
   298  	const dir = "db"
   299  	const count = 10
   300  	for i := 0; i <= count; i++ {
   301  		t.Run("", func(t *testing.T) {
   302  			opts := &Options{FS: vfs.NewMem()}
   303  			opts.EnsureDefaults().WithFSDefaults()
   304  			require.NoError(t, opts.FS.MkdirAll(dir, 0755))
   305  			objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(opts.FS, dir))
   306  			require.NoError(t, err)
   307  			defer objProvider.Close()
   308  
   309  			paths := make([]string, 10)
   310  			meta := make([]*fileMetadata, len(paths))
   311  			contents := make([][]byte, len(paths))
   312  			for j := range paths {
   313  				paths[j] = fmt.Sprintf("external%d", j)
   314  				meta[j] = &fileMetadata{}
   315  				meta[j].FileNum = FileNum(j)
   316  				meta[j].InitPhysicalBacking()
   317  				f, err := opts.FS.Create(paths[j])
   318  				require.NoError(t, err)
   319  
   320  				contents[j] = []byte(fmt.Sprintf("data%d", j))
   321  				// memFile.Write will modify the supplied buffer when invariants are
   322  				// enabled, so provide a throw-away copy.
   323  				_, err = f.Write(append([]byte(nil), contents[j]...))
   324  				require.NoError(t, err)
   325  				require.NoError(t, f.Close())
   326  			}
   327  
   328  			if i < count {
   329  				opts.FS.Remove(paths[i])
   330  			}
   331  
   332  			lr := ingestLoadResult{localMeta: meta, localPaths: paths}
   333  			err = ingestLink(0 /* jobID */, opts, objProvider, lr, nil /* shared */)
   334  			if i < count {
   335  				if err == nil {
   336  					t.Fatalf("expected error, but found success")
   337  				}
   338  			} else {
   339  				require.NoError(t, err)
   340  			}
   341  
   342  			files, err := opts.FS.List(dir)
   343  			require.NoError(t, err)
   344  
   345  			sort.Strings(files)
   346  
   347  			if i < count {
   348  				if len(files) > 0 {
   349  					t.Fatalf("expected all of the files to be cleaned up, but found:\n%s",
   350  						strings.Join(files, "\n"))
   351  				}
   352  			} else {
   353  				if len(files) != count {
   354  					t.Fatalf("expected %d files, but found:\n%s", count, strings.Join(files, "\n"))
   355  				}
   356  				for j := range files {
   357  					ftype, fileNum, ok := base.ParseFilename(opts.FS, files[j])
   358  					if !ok {
   359  						t.Fatalf("unable to parse filename: %s", files[j])
   360  					}
   361  					if fileTypeTable != ftype {
   362  						t.Fatalf("expected table, but found %d", ftype)
   363  					}
   364  					if j != int(fileNum.FileNum()) {
   365  						t.Fatalf("expected table %d, but found %d", j, fileNum)
   366  					}
   367  					f, err := opts.FS.Open(opts.FS.PathJoin(dir, files[j]))
   368  					require.NoError(t, err)
   369  
   370  					data, err := io.ReadAll(f)
   371  					require.NoError(t, err)
   372  					require.NoError(t, f.Close())
   373  					if !bytes.Equal(contents[j], data) {
   374  						t.Fatalf("expected %s, but found %s", contents[j], data)
   375  					}
   376  				}
   377  			}
   378  		})
   379  	}
   380  }
   381  
   382  func TestIngestLinkFallback(t *testing.T) {
   383  	// Verify that ingestLink succeeds if linking fails by falling back to
   384  	// copying.
   385  	mem := vfs.NewMem()
   386  	src, err := mem.Create("source")
   387  	require.NoError(t, err)
   388  
   389  	opts := &Options{FS: errorfs.Wrap(mem, errorfs.OnIndex(1))}
   390  	opts.EnsureDefaults().WithFSDefaults()
   391  	objSettings := objstorageprovider.DefaultSettings(opts.FS, "")
   392  	// Prevent the provider from listing the dir (where we may get an injected error).
   393  	objSettings.FSDirInitialListing = []string{}
   394  	objProvider, err := objstorageprovider.Open(objSettings)
   395  	require.NoError(t, err)
   396  	defer objProvider.Close()
   397  
   398  	meta := []*fileMetadata{{FileNum: 1}}
   399  	meta[0].InitPhysicalBacking()
   400  	lr := ingestLoadResult{localMeta: meta, localPaths: []string{"source"}}
   401  	err = ingestLink(0, opts, objProvider, lr, nil /* shared */)
   402  	require.NoError(t, err)
   403  
   404  	dest, err := mem.Open("000001.sst")
   405  	require.NoError(t, err)
   406  
   407  	// We should be able to write bytes to src, and not have them show up in
   408  	// dest.
   409  	_, _ = src.Write([]byte("test"))
   410  	data, err := io.ReadAll(dest)
   411  	require.NoError(t, err)
   412  	if len(data) != 0 {
   413  		t.Fatalf("expected copy, but files appear to be hard linked: [%s] unexpectedly found", data)
   414  	}
   415  }
   416  
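        // TestOverlappingIngestedSSTs runs testdata/flushable_ingest, covering
        // ingests that overlap in-memory state and are handled as flushable ingests.
        // The blockFlush/allowFlush commands toggle d.mu.compact.flushing directly to
        // control when flushes proceed, while strictMem/ignoreSyncs/resetToSynced
        // simulate recovery from unsynced state.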
   417  func TestOverlappingIngestedSSTs(t *testing.T) {
   418  	dir := ""
   419  	var (
   420  		mem        vfs.FS
   421  		d          *DB
   422  		opts       *Options
   423  		closed     = false
   424  		blockFlush = false
   425  	)
   426  	defer func() {
   427  		if !closed {
   428  			require.NoError(t, d.Close())
   429  		}
   430  	}()
   431  
   432  	reset := func(strictMem bool) {
   433  		if d != nil && !closed {
   434  			require.NoError(t, d.Close())
   435  		}
   436  		blockFlush = false
   437  
   438  		if strictMem {
   439  			mem = vfs.NewStrictMem()
   440  		} else {
   441  			mem = vfs.NewMem()
   442  		}
   443  
   444  		require.NoError(t, mem.MkdirAll("ext", 0755))
   445  		opts = (&Options{
   446  			FS:                          mem,
   447  			MemTableStopWritesThreshold: 4,
   448  			L0CompactionThreshold:       100,
   449  			L0StopWritesThreshold:       100,
   450  			DebugCheck:                  DebugCheckLevels,
   451  			FormatMajorVersion:          internalFormatNewest,
   452  		}).WithFSDefaults()
   453  		// Disable automatic compactions because otherwise we'll race with
   454  		// delete-only compactions triggered by ingesting range tombstones.
   455  		opts.DisableAutomaticCompactions = true
   456  
   457  		var err error
   458  		d, err = Open(dir, opts)
   459  		require.NoError(t, err)
   460  		d.TestOnlyWaitForCleaning()
   461  	}
   462  	waitForFlush := func() {
   463  		if d == nil {
   464  			return
   465  		}
   466  		d.mu.Lock()
   467  		for d.mu.compact.flushing {
   468  			d.mu.compact.cond.Wait()
   469  		}
   470  		d.mu.Unlock()
   471  	}
   472  	reset(false)
   473  
   474  	datadriven.RunTest(t, "testdata/flushable_ingest", func(t *testing.T, td *datadriven.TestData) string {
   475  		switch td.Cmd {
   476  		case "reset":
   477  			reset(td.HasArg("strictMem"))
   478  			return ""
   479  
   480  		case "ignoreSyncs":
   481  			var ignoreSyncs bool
   482  			if len(td.CmdArgs) == 1 && td.CmdArgs[0].String() == "true" {
   483  				ignoreSyncs = true
   484  			}
   485  			mem.(*vfs.MemFS).SetIgnoreSyncs(ignoreSyncs)
   486  			return ""
   487  
   488  		case "resetToSynced":
   489  			mem.(*vfs.MemFS).ResetToSyncedState()
   490  			files, err := mem.List(dir)
   491  			sort.Strings(files)
   492  			require.NoError(t, err)
   493  			return strings.Join(files, "\n")
   494  
   495  		case "batch":
   496  			b := d.NewIndexedBatch()
   497  			if err := runBatchDefineCmd(td, b); err != nil {
   498  				return err.Error()
   499  			}
   500  			if err := b.Commit(nil); err != nil {
   501  				return err.Error()
   502  			}
   503  			return ""
   504  
   505  		case "build":
   506  			if err := runBuildCmd(td, d, mem); err != nil {
   507  				return err.Error()
   508  			}
   509  			return ""
   510  
   511  		case "ingest":
   512  			if err := runIngestCmd(td, d, mem); err != nil {
   513  				return err.Error()
   514  			}
   515  			if !blockFlush {
   516  				waitForFlush()
   517  			}
   518  			return ""
   519  
   520  		case "iter":
   521  			iter, _ := d.NewIter(nil)
   522  			return runIterCmd(td, iter, true)
   523  
   524  		case "lsm":
   525  			return runLSMCmd(td, d)
   526  
   527  		case "close":
   528  			if closed {
   529  				return "already closed"
   530  			}
   531  			require.NoError(t, d.Close())
   532  			closed = true
   533  			return ""
   534  
   535  		case "ls":
   536  			files, err := mem.List(dir)
   537  			sort.Strings(files)
   538  			require.NoError(t, err)
   539  			return strings.Join(files, "\n")
   540  
   541  		case "open":
   542  			opts.ReadOnly = td.HasArg("readOnly")
   543  			var err error
   544  			d, err = Open(dir, opts)
   545  			closed = false
   546  			require.NoError(t, err)
   547  			waitForFlush()
   548  			d.TestOnlyWaitForCleaning()
   549  			return ""
   550  
   551  		case "blockFlush":
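        			// Pretend a flush is already in progress by setting
        			// d.mu.compact.flushing directly; waitForFlush will block until
        			// allowFlush clears the flag.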
   552  			blockFlush = true
   553  			d.mu.Lock()
   554  			d.mu.compact.flushing = true
   555  			d.mu.Unlock()
   556  			return ""
   557  
   558  		case "allowFlush":
   559  			blockFlush = false
   560  			d.mu.Lock()
   561  			d.mu.compact.flushing = false
   562  			d.mu.Unlock()
   563  			return ""
   564  
   565  		case "flush":
   566  			d.maybeScheduleFlush()
   567  			waitForFlush()
   568  			d.TestOnlyWaitForCleaning()
   569  			return ""
   570  
   571  		case "get":
   572  			return runGetCmd(t, td, d)
   573  
   574  		default:
   575  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   576  		}
   577  	})
   578  }
   579  
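        // TestExcise runs the datadriven cases in testdata/excise, covering
        // ingest-and-excise as well as a dry-run excise command that only reports
        // which files would be affected.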
   580  func TestExcise(t *testing.T) {
   581  	var mem vfs.FS
   582  	var d *DB
   583  	var flushed bool
   584  	defer func() {
   585  		require.NoError(t, d.Close())
   586  	}()
   587  
   588  	var opts *Options
   589  	reset := func() {
   590  		if d != nil {
   591  			require.NoError(t, d.Close())
   592  		}
   593  
   594  		mem = vfs.NewMem()
   595  		require.NoError(t, mem.MkdirAll("ext", 0755))
   596  		opts = &Options{
   597  			FS:                    mem,
   598  			L0CompactionThreshold: 100,
   599  			L0StopWritesThreshold: 100,
   600  			DebugCheck:            DebugCheckLevels,
   601  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
   602  				flushed = true
   603  			}},
   604  			FormatMajorVersion: FormatVirtualSSTables,
   605  			Comparer:           testkeys.Comparer,
   606  		}
   607  		// Disable automatic compactions because otherwise we'll race with
   608  		// delete-only compactions triggered by ingesting range tombstones.
   609  		opts.DisableAutomaticCompactions = true
   610  		// Set this to true to add some testing for the virtual sstable validation
   611  		// code paths.
   612  		opts.Experimental.ValidateOnIngest = true
   613  
   614  		var err error
   615  		d, err = Open("", opts)
   616  		require.NoError(t, err)
   617  	}
   618  	reset()
   619  
   620  	datadriven.RunTest(t, "testdata/excise", func(t *testing.T, td *datadriven.TestData) string {
   621  		switch td.Cmd {
   622  		case "reset":
   623  			reset()
   624  			return ""
   625  		case "reopen":
   626  			require.NoError(t, d.Close())
   627  			var err error
   628  			d, err = Open("", opts)
   629  			require.NoError(t, err)
   630  
   631  			return ""
   632  		case "batch":
   633  			b := d.NewIndexedBatch()
   634  			if err := runBatchDefineCmd(td, b); err != nil {
   635  				return err.Error()
   636  			}
   637  			if err := b.Commit(nil); err != nil {
   638  				return err.Error()
   639  			}
   640  			return ""
   641  		case "build":
   642  			if err := runBuildCmd(td, d, mem); err != nil {
   643  				return err.Error()
   644  			}
   645  			return ""
   646  
   647  		case "flush":
   648  			if err := d.Flush(); err != nil {
   649  				return err.Error()
   650  			}
   651  			return ""
   652  
   653  		case "ingest":
   654  			flushed = false
   655  			if err := runIngestCmd(td, d, mem); err != nil {
   656  				return err.Error()
   657  			}
   658  			// Wait for a possible flush.
   659  			d.mu.Lock()
   660  			for d.mu.compact.flushing {
   661  				d.mu.compact.cond.Wait()
   662  			}
   663  			d.mu.Unlock()
   664  			if flushed {
   665  				return "memtable flushed"
   666  			}
   667  			return ""
   668  
   669  		case "ingest-and-excise":
   670  			flushed = false
   671  			if err := runIngestAndExciseCmd(td, d, mem); err != nil {
   672  				return err.Error()
   673  			}
   674  			// Wait for a possible flush.
   675  			d.mu.Lock()
   676  			for d.mu.compact.flushing {
   677  				d.mu.compact.cond.Wait()
   678  			}
   679  			d.mu.Unlock()
   680  			if flushed {
   681  				return "memtable flushed"
   682  			}
   683  			return ""
   684  
   685  		case "get":
   686  			return runGetCmd(t, td, d)
   687  
   688  		case "iter":
   689  			iter, _ := d.NewIter(&IterOptions{
   690  				KeyTypes: IterKeyTypePointsAndRanges,
   691  			})
   692  			return runIterCmd(td, iter, true)
   693  
   694  		case "lsm":
   695  			return runLSMCmd(td, d)
   696  
   697  		case "metrics":
   698  			// The asynchronous loading of table stats can change metrics, so
   699  			// wait for all the tables' stats to be loaded.
   700  			d.mu.Lock()
   701  			d.waitTableStats()
   702  			d.mu.Unlock()
   703  
   704  			return d.Metrics().StringForTests()
   705  
   706  		case "wait-pending-table-stats":
   707  			return runTableStatsCmd(td, d)
   708  
   709  		case "excise":
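        			// Dry run only: accumulate a version edit via d.excise without
        			// applying it. d.excise expects the manifest log lock to be held,
        			// hence the logLock/logUnlock bracketing below.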
   710  			ve := &versionEdit{
   711  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
   712  			}
   713  			var exciseSpan KeyRange
   714  			if len(td.CmdArgs) != 2 {
   715  				panic("insufficient args for excise command")
   716  			}
   717  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
   718  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
   719  
   720  			d.mu.Lock()
   721  			d.mu.versions.logLock()
   722  			d.mu.Unlock()
   723  			current := d.mu.versions.currentVersion()
   724  			for level := range current.Levels {
   725  				iter := current.Levels[level].Iter()
   726  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
   727  					_, err := d.excise(exciseSpan, m, ve, level)
   728  					if err != nil {
   729  						d.mu.Lock()
   730  						d.mu.versions.logUnlock()
   731  						d.mu.Unlock()
   732  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
   733  					}
   734  				}
   735  			}
   736  			d.mu.Lock()
   737  			d.mu.versions.logUnlock()
   738  			d.mu.Unlock()
   739  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.DebugString(base.DefaultFormatter))
   740  
   741  		case "confirm-backing":
   742  			// Confirms that the files have the same FileBacking.
   743  			fileNums := make(map[base.FileNum]struct{})
   744  			for i := range td.CmdArgs {
   745  				fNum, err := strconv.Atoi(td.CmdArgs[i].Key)
   746  				if err != nil {
   747  					panic("invalid file number")
   748  				}
   749  				fileNums[base.FileNum(fNum)] = struct{}{}
   750  			}
   751  			d.mu.Lock()
   752  			currVersion := d.mu.versions.currentVersion()
   753  			var ptr *manifest.FileBacking
   754  			for _, level := range currVersion.Levels {
   755  				lIter := level.Iter()
   756  				for f := lIter.First(); f != nil; f = lIter.Next() {
   757  					if _, ok := fileNums[f.FileNum]; ok {
   758  						if ptr == nil {
   759  							ptr = f.FileBacking
   760  							continue
   761  						}
   762  						if f.FileBacking != ptr {
   763  							d.mu.Unlock()
   764  							return "file backings are not the same"
   765  						}
   766  					}
   767  				}
   768  			}
   769  			d.mu.Unlock()
   770  			return "file backings are the same"
   771  		case "compact":
   772  			if len(td.CmdArgs) != 2 {
   773  				panic("insufficient args for compact command")
   774  			}
   775  			l := td.CmdArgs[0].Key
   776  			r := td.CmdArgs[1].Key
   777  			err := d.Compact([]byte(l), []byte(r), false)
   778  			if err != nil {
   779  				return err.Error()
   780  			}
   781  			return ""
   782  		default:
   783  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   784  		}
   785  	})
   786  }
   787  
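        // testIngestSharedImpl is the driver for TestIngestShared. It opens two DB
        // instances backed by the same in-memory remote storage and runs the
        // datadriven file fileName against them; the replicate command copies a key
        // range from one instance into the other via ScanInternal + IngestAndExcise.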
   788  func testIngestSharedImpl(
   789  	t *testing.T, createOnShared remote.CreateOnSharedStrategy, fileName string,
   790  ) {
   791  	var d, d1, d2 *DB
   792  	var efos map[string]*EventuallyFileOnlySnapshot
   793  	defer func() {
   794  		for _, e := range efos {
   795  			require.NoError(t, e.Close())
   796  		}
   797  		if d1 != nil {
   798  			require.NoError(t, d1.Close())
   799  		}
   800  		if d2 != nil {
   801  			require.NoError(t, d2.Close())
   802  		}
   803  	}()
   804  	creatorIDCounter := uint64(1)
   805  	replicateCounter := 1
   806  
   807  	reset := func() {
   808  		for _, e := range efos {
   809  			require.NoError(t, e.Close())
   810  		}
   811  		if d1 != nil {
   812  			require.NoError(t, d1.Close())
   813  		}
   814  		if d2 != nil {
   815  			require.NoError(t, d2.Close())
   816  		}
   817  		efos = make(map[string]*EventuallyFileOnlySnapshot)
   818  
   819  		sstorage := remote.NewInMem()
   820  		mem1 := vfs.NewMem()
   821  		mem2 := vfs.NewMem()
   822  		require.NoError(t, mem1.MkdirAll("ext", 0755))
   823  		require.NoError(t, mem2.MkdirAll("ext", 0755))
   824  		opts1 := &Options{
   825  			Comparer:              testkeys.Comparer,
   826  			FS:                    mem1,
   827  			LBaseMaxBytes:         1,
   828  			L0CompactionThreshold: 100,
   829  			L0StopWritesThreshold: 100,
   830  			DebugCheck:            DebugCheckLevels,
   831  			FormatMajorVersion:    FormatVirtualSSTables,
   832  		}
   833  		// Log DB events to aid debugging.
   834  		lel := MakeLoggingEventListener(DefaultLogger)
   835  		opts1.EventListener = &lel
   836  		opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
   837  			"": sstorage,
   838  		})
   839  		opts1.Experimental.CreateOnShared = createOnShared
   840  		opts1.Experimental.CreateOnSharedLocator = ""
   841  		// Disable automatic compactions because otherwise we'll race with
   842  		// delete-only compactions triggered by ingesting range tombstones.
   843  		opts1.DisableAutomaticCompactions = true
   844  
   845  		opts2 := &Options{}
   846  		*opts2 = *opts1
   847  		opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
   848  			"": sstorage,
   849  		})
   850  		opts2.Experimental.CreateOnShared = createOnShared
   851  		opts2.Experimental.CreateOnSharedLocator = ""
   852  		opts2.FS = mem2
   853  
   854  		var err error
   855  		d1, err = Open("", opts1)
   856  		require.NoError(t, err)
   857  		require.NoError(t, d1.SetCreatorID(creatorIDCounter))
   858  		creatorIDCounter++
   859  		d2, err = Open("", opts2)
   860  		require.NoError(t, err)
   861  		require.NoError(t, d2.SetCreatorID(creatorIDCounter))
   862  		creatorIDCounter++
   863  		d = d1
   864  	}
   865  	reset()
   866  
   867  	datadriven.RunTest(t, fmt.Sprintf("testdata/%s", fileName), func(t *testing.T, td *datadriven.TestData) string {
   868  		switch td.Cmd {
   869  		case "reset":
   870  			reset()
   871  			return ""
   872  		case "switch":
   873  			if len(td.CmdArgs) != 1 {
   874  				return "usage: switch <1 or 2>"
   875  			}
   876  			switch td.CmdArgs[0].Key {
   877  			case "1":
   878  				d = d1
   879  			case "2":
   880  				d = d2
   881  			default:
   882  				return "usage: switch <1 or 2>"
   883  			}
   884  			return "ok"
   885  		case "batch":
   886  			b := d.NewIndexedBatch()
   887  			if err := runBatchDefineCmd(td, b); err != nil {
   888  				return err.Error()
   889  			}
   890  			if err := b.Commit(nil); err != nil {
   891  				return err.Error()
   892  			}
   893  			return ""
   894  		case "build":
   895  			if err := runBuildCmd(td, d, d.opts.FS); err != nil {
   896  				return err.Error()
   897  			}
   898  			return ""
   899  
   900  		case "flush":
   901  			if err := d.Flush(); err != nil {
   902  				return err.Error()
   903  			}
   904  			return ""
   905  
   906  		case "ingest":
   907  			if err := runIngestCmd(td, d, d.opts.FS); err != nil {
   908  				return err.Error()
   909  			}
   910  			// Wait for a possible flush.
   911  			d.mu.Lock()
   912  			for d.mu.compact.flushing {
   913  				d.mu.compact.cond.Wait()
   914  			}
   915  			d.mu.Unlock()
   916  			return ""
   917  
   918  		case "ingest-and-excise":
   919  			if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil {
   920  				return err.Error()
   921  			}
   922  			// Wait for a possible flush.
   923  			d.mu.Lock()
   924  			for d.mu.compact.flushing {
   925  				d.mu.compact.cond.Wait()
   926  			}
   927  			d.mu.Unlock()
   928  			return ""
   929  
   930  		case "replicate":
   931  			if len(td.CmdArgs) != 4 {
   932  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   933  			}
   934  			var from, to *DB
   935  			switch td.CmdArgs[0].Key {
   936  			case "1":
   937  				from = d1
   938  			case "2":
   939  				from = d2
   940  			default:
   941  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   942  			}
   943  			switch td.CmdArgs[1].Key {
   944  			case "1":
   945  				to = d1
   946  			case "2":
   947  				to = d2
   948  			default:
   949  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   950  			}
   951  			startKey := []byte(td.CmdArgs[2].Key)
   952  			endKey := []byte(td.CmdArgs[3].Key)
   953  
   954  			writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat())
   955  			sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter)
   956  			f, err := to.opts.FS.Create(sstPath)
   957  			require.NoError(t, err)
   958  			replicateCounter++
   959  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts)
   960  
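        			// Scan the source DB's internal state over [startKey, endKey):
        			// point keys, range dels, and range keys that live locally are
        			// rewritten at sequence number zero into the local sst being
        			// built, while sstables already on shared storage are collected
        			// as SharedSSTMeta and handed to IngestAndExcise below.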
   961  			var sharedSSTs []SharedSSTMeta
   962  			err = from.ScanInternal(context.TODO(), startKey, endKey,
   963  				func(key *InternalKey, value LazyValue, _ IteratorLevel) error {
   964  					val, _, err := value.Value(nil)
   965  					require.NoError(t, err)
   966  					require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val))
   967  					return nil
   968  				},
   969  				func(start, end []byte, seqNum uint64) error {
   970  					require.NoError(t, w.DeleteRange(start, end))
   971  					return nil
   972  				},
   973  				func(start, end []byte, keys []keyspan.Key) error {
   974  					s := keyspan.Span{
   975  						Start:     start,
   976  						End:       end,
   977  						Keys:      keys,
   978  						KeysOrder: 0,
   979  					}
   980  					require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error {
   981  						return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v)
   982  					}))
   983  					return nil
   984  				},
   985  				func(sst *SharedSSTMeta) error {
   986  					sharedSSTs = append(sharedSSTs, *sst)
   987  					return nil
   988  				},
   989  			)
   990  			require.NoError(t, err)
   991  			require.NoError(t, w.Close())
   992  
   993  			_, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey})
   994  			require.NoError(t, err)
   995  			return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs))
   996  
   997  		case "get":
   998  			return runGetCmd(t, td, d)
   999  
  1000  		case "iter":
  1001  			o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges}
  1002  			var reader Reader
  1003  			reader = d
  1004  			for _, arg := range td.CmdArgs {
  1005  				switch arg.Key {
  1006  				case "mask-suffix":
  1007  					o.RangeKeyMasking.Suffix = []byte(arg.Vals[0])
  1008  				case "mask-filter":
  1009  					o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask {
  1010  						return sstable.NewTestKeysMaskingFilter()
  1011  					}
  1012  				case "snapshot":
  1013  					reader = efos[arg.Vals[0]]
  1014  				}
  1015  			}
  1016  			iter, err := reader.NewIter(o)
  1017  			if err != nil {
  1018  				return err.Error()
  1019  			}
  1020  			return runIterCmd(td, iter, true)
  1021  
  1022  		case "lsm":
  1023  			return runLSMCmd(td, d)
  1024  
  1025  		case "metrics":
  1026  			// The asynchronous loading of table stats can change metrics, so
  1027  			// wait for all the tables' stats to be loaded.
  1028  			d.mu.Lock()
  1029  			d.waitTableStats()
  1030  			d.mu.Unlock()
  1031  
  1032  			return d.Metrics().StringForTests()
  1033  
  1034  		case "wait-pending-table-stats":
  1035  			return runTableStatsCmd(td, d)
  1036  
  1037  		case "excise":
  1038  			ve := &versionEdit{
  1039  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  1040  			}
  1041  			var exciseSpan KeyRange
  1042  			if len(td.CmdArgs) != 2 {
  1043  				panic("insufficient args for excise command")
  1044  			}
  1045  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
  1046  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
  1047  
  1048  			d.mu.Lock()
  1049  			d.mu.versions.logLock()
  1050  			d.mu.Unlock()
  1051  			current := d.mu.versions.currentVersion()
  1052  			for level := range current.Levels {
  1053  				iter := current.Levels[level].Iter()
  1054  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
  1055  					_, err := d.excise(exciseSpan, m, ve, level)
  1056  					if err != nil {
  1057  						d.mu.Lock()
  1058  						d.mu.versions.logUnlock()
  1059  						d.mu.Unlock()
  1060  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
  1061  					}
  1062  				}
  1063  			}
  1064  			d.mu.Lock()
  1065  			d.mu.versions.logUnlock()
  1066  			d.mu.Unlock()
  1067  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String())
  1068  
  1069  		case "file-only-snapshot":
  1070  			if len(td.CmdArgs) != 1 {
  1071  				panic("insufficient args for file-only-snapshot command")
  1072  			}
  1073  			name := td.CmdArgs[0].Key
  1074  			var keyRanges []KeyRange
  1075  			for _, line := range strings.Split(td.Input, "\n") {
  1076  				fields := strings.Fields(line)
  1077  				if len(fields) != 2 {
  1078  					return "expected two fields for file-only snapshot KeyRanges"
  1079  				}
  1080  				kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])}
  1081  				keyRanges = append(keyRanges, kr)
  1082  			}
  1083  
  1084  			s := d.NewEventuallyFileOnlySnapshot(keyRanges)
  1085  			efos[name] = s
  1086  			return "ok"
  1087  
  1088  		case "wait-for-file-only-snapshot":
  1089  			if len(td.CmdArgs) != 1 {
  1090  				panic("insufficient args for wait-for-file-only-snapshot command")
  1091  			}
  1092  			name := td.CmdArgs[0].Key
  1093  			err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond)
  1094  			if err != nil {
  1095  				return err.Error()
  1096  			}
  1097  			return "ok"
  1098  
  1099  		case "compact":
  1100  			err := runCompactCmd(td, d)
  1101  			if err != nil {
  1102  				return err.Error()
  1103  			}
  1104  			return "ok"
  1105  		default:
  1106  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1107  		}
  1108  	})
  1109  }
  1110  
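        // TestIngestShared runs testIngestSharedImpl under both the CreateOnSharedAll
        // and CreateOnSharedLower strategies, each against its own testdata file.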
  1111  func TestIngestShared(t *testing.T) {
  1112  	for _, strategy := range []remote.CreateOnSharedStrategy{remote.CreateOnSharedAll, remote.CreateOnSharedLower} {
  1113  		strategyStr := "all"
  1114  		if strategy == remote.CreateOnSharedLower {
  1115  			strategyStr = "lower"
  1116  		}
  1117  		t.Run(fmt.Sprintf("createOnShared=%s", strategyStr), func(t *testing.T) {
  1118  			fileName := "ingest_shared"
  1119  			if strategy == remote.CreateOnSharedLower {
  1120  				fileName = "ingest_shared_lower"
  1121  			}
  1122  			testIngestSharedImpl(t, strategy, fileName)
  1123  		})
  1124  	}
  1125  }
  1126  
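        // TestSimpleIngestShared fake-creates an sstable through a separate
        // objstorage provider backed by in-memory remote storage, then ingests it
        // into a DB instance as a shared sstable via a hand-built SharedSSTMeta.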
  1127  func TestSimpleIngestShared(t *testing.T) {
  1128  	mem := vfs.NewMem()
  1129  	var d *DB
  1130  	var provider2 objstorage.Provider
  1131  	opts2 := Options{FS: vfs.NewMem(), FormatMajorVersion: FormatVirtualSSTables}
  1132  	opts2.EnsureDefaults()
  1133  
  1134  	// Create an objProvider where we will fake-create some sstables that can
  1135  	// then be shared back to the db instance.
  1136  	providerSettings := objstorageprovider.Settings{
  1137  		Logger:              opts2.Logger,
  1138  		FS:                  opts2.FS,
  1139  		FSDirName:           "",
  1140  		FSDirInitialListing: nil,
  1141  		FSCleaner:           opts2.Cleaner,
  1142  		NoSyncOnClose:       opts2.NoSyncOnClose,
  1143  		BytesPerSync:        opts2.BytesPerSync,
  1144  	}
  1145  	providerSettings.Remote.StorageFactory = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1146  		"": remote.NewInMem(),
  1147  	})
  1148  	providerSettings.Remote.CreateOnShared = remote.CreateOnSharedAll
  1149  	providerSettings.Remote.CreateOnSharedLocator = ""
  1150  
  1151  	provider2, err := objstorageprovider.Open(providerSettings)
  1152  	require.NoError(t, err)
  1153  	creatorIDCounter := uint64(1)
  1154  	provider2.SetCreatorID(objstorage.CreatorID(creatorIDCounter))
  1155  	creatorIDCounter++
  1156  
  1157  	defer func() {
  1158  		require.NoError(t, d.Close())
  1159  	}()
  1160  
  1161  	reset := func() {
  1162  		if d != nil {
  1163  			require.NoError(t, d.Close())
  1164  		}
  1165  
  1166  		mem = vfs.NewMem()
  1167  		require.NoError(t, mem.MkdirAll("ext", 0755))
  1168  		opts := &Options{
  1169  			FormatMajorVersion:    FormatVirtualSSTables,
  1170  			FS:                    mem,
  1171  			L0CompactionThreshold: 100,
  1172  			L0StopWritesThreshold: 100,
  1173  		}
  1174  		opts.Experimental.RemoteStorage = providerSettings.Remote.StorageFactory
  1175  		opts.Experimental.CreateOnShared = providerSettings.Remote.CreateOnShared
  1176  		opts.Experimental.CreateOnSharedLocator = providerSettings.Remote.CreateOnSharedLocator
  1177  
  1178  		var err error
  1179  		d, err = Open("", opts)
  1180  		require.NoError(t, err)
  1181  		require.NoError(t, d.SetCreatorID(creatorIDCounter))
  1182  		creatorIDCounter++
  1183  	}
  1184  	reset()
  1185  
  1186  	metaMap := map[base.DiskFileNum]objstorage.ObjectMetadata{}
  1187  
  1188  	require.NoError(t, d.Set([]byte("d"), []byte("unexpected"), nil))
  1189  	require.NoError(t, d.Set([]byte("e"), []byte("unexpected"), nil))
  1190  	require.NoError(t, d.Set([]byte("a"), []byte("unexpected"), nil))
  1191  	require.NoError(t, d.Set([]byte("f"), []byte("unexpected"), nil))
  1192  	d.Flush()
  1193  
  1194  	{
  1195  		// Create a shared file.
  1196  		fn := base.FileNum(2)
  1197  		f, meta, err := provider2.Create(context.TODO(), fileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{PreferSharedStorage: true})
  1198  		require.NoError(t, err)
  1199  		w := sstable.NewWriter(f, d.opts.MakeWriterOptions(0, d.opts.FormatMajorVersion.MaxTableFormat()))
  1200  		w.Set([]byte("d"), []byte("shared"))
  1201  		w.Set([]byte("e"), []byte("shared"))
  1202  		w.Close()
  1203  		metaMap[fn.DiskFileNum()] = meta
  1204  	}
  1205  
  1206  	m := metaMap[base.FileNum(2).DiskFileNum()]
  1207  	handle, err := provider2.RemoteObjectBacking(&m)
  1208  	require.NoError(t, err)
  1209  	size, err := provider2.Size(m)
  1210  	require.NoError(t, err)
  1211  
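        	// Build the SharedSSTMeta by hand from the remote object backing obtained
        	// above. Note that Size is size+5 rather than the exact object size.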
  1212  	sharedSSTMeta := SharedSSTMeta{
  1213  		Backing:          handle,
  1214  		Smallest:         base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet),
  1215  		Largest:          base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet),
  1216  		SmallestPointKey: base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet),
  1217  		LargestPointKey:  base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet),
  1218  		Level:            6,
  1219  		Size:             uint64(size + 5),
  1220  	}
  1221  	_, err = d.IngestAndExcise([]string{}, []SharedSSTMeta{sharedSSTMeta}, KeyRange{Start: []byte("d"), End: []byte("ee")})
  1222  	require.NoError(t, err)
  1223  
  1224  	// TODO(bilal): Once reading of shared sstables is in, verify that the values
  1225  	// of d and e have been updated.
  1226  }
  1227  
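        // blockedCompaction coordinates with a compaction paused by the TableCreated
        // hook in TestConcurrentExcise: the hook sends on startBlock to announce that
        // the compaction is paused, then holds it until the test sends on unblock.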
  1228  type blockedCompaction struct {
  1229  	startBlock, unblock chan struct{}
  1230  }
  1231  
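        // TestConcurrentExcise runs testdata/concurrent_excise across two DB
        // instances sharing remote storage. Event listener hooks let the test pause
        // a compaction right after it creates its first output table (compact
        // block=<name>) and release it later (unblock <name>), so excises and
        // ingests can be interleaved with an in-flight compaction deterministically.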
  1232  func TestConcurrentExcise(t *testing.T) {
  1233  	var d, d1, d2 *DB
  1234  	var efos map[string]*EventuallyFileOnlySnapshot
  1235  	backgroundErrs := make(chan error, 5)
  1236  	var compactions map[string]*blockedCompaction
  1237  	defer func() {
  1238  		for _, e := range efos {
  1239  			require.NoError(t, e.Close())
  1240  		}
  1241  		if d1 != nil {
  1242  			require.NoError(t, d1.Close())
  1243  		}
  1244  		if d2 != nil {
  1245  			require.NoError(t, d2.Close())
  1246  		}
  1247  	}()
  1248  	creatorIDCounter := uint64(1)
  1249  	replicateCounter := 1
  1250  
  1251  	var wg sync.WaitGroup
  1252  	defer wg.Wait()
  1253  	var blockNextCompaction bool
  1254  	var blockedJobID int
  1255  	var blockedCompactionName string
  1256  	var blockedCompactionsMu sync.Mutex // protects the above three variables.
  1257  
  1258  	reset := func() {
  1259  		wg.Wait()
  1260  		for _, e := range efos {
  1261  			require.NoError(t, e.Close())
  1262  		}
  1263  		if d1 != nil {
  1264  			require.NoError(t, d1.Close())
  1265  		}
  1266  		if d2 != nil {
  1267  			require.NoError(t, d2.Close())
  1268  		}
  1269  		efos = make(map[string]*EventuallyFileOnlySnapshot)
  1270  		compactions = make(map[string]*blockedCompaction)
  1271  		backgroundErrs = make(chan error, 5)
  1272  
  1273  		var el EventListener
  1274  		el.EnsureDefaults(testLogger{t: t})
  1275  		el.FlushBegin = func(info FlushInfo) {
  1276  			// Don't block flushes
  1277  		}
  1278  		el.BackgroundError = func(err error) {
  1279  			backgroundErrs <- err
  1280  		}
  1281  		el.CompactionBegin = func(info CompactionInfo) {
  1282  			if info.Reason == "move" {
  1283  				return
  1284  			}
  1285  			blockedCompactionsMu.Lock()
  1286  			defer blockedCompactionsMu.Unlock()
  1287  			if blockNextCompaction {
  1288  				blockNextCompaction = false
  1289  				blockedJobID = info.JobID
  1290  			}
  1291  		}
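        		// If this table belongs to the job marked for blocking, pause the
        		// compaction here: signal the test via startBlock and wait until it
        		// sends on unblock.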
  1292  		el.TableCreated = func(info TableCreateInfo) {
  1293  			blockedCompactionsMu.Lock()
  1294  			if info.JobID != blockedJobID {
  1295  				blockedCompactionsMu.Unlock()
  1296  				return
  1297  			}
  1298  			blockedJobID = 0
  1299  			c := compactions[blockedCompactionName]
  1300  			blockedCompactionName = ""
  1301  			blockedCompactionsMu.Unlock()
  1302  			c.startBlock <- struct{}{}
  1303  			<-c.unblock
  1304  		}
  1305  
  1306  		sstorage := remote.NewInMem()
  1307  		mem1 := vfs.NewMem()
  1308  		mem2 := vfs.NewMem()
  1309  		require.NoError(t, mem1.MkdirAll("ext", 0755))
  1310  		require.NoError(t, mem2.MkdirAll("ext", 0755))
  1311  		opts1 := &Options{
  1312  			Comparer:              testkeys.Comparer,
  1313  			LBaseMaxBytes:         1,
  1314  			FS:                    mem1,
  1315  			L0CompactionThreshold: 100,
  1316  			L0StopWritesThreshold: 100,
  1317  			DebugCheck:            DebugCheckLevels,
  1318  			FormatMajorVersion:    FormatVirtualSSTables,
  1319  		}
  1320  		// Log DB events, teeing the logging listener with the test's hooks.
  1321  		lel := MakeLoggingEventListener(DefaultLogger)
  1322  		tel := TeeEventListener(lel, el)
  1323  		opts1.EventListener = &tel
  1324  		opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1325  			"": sstorage,
  1326  		})
  1327  		opts1.Experimental.CreateOnShared = remote.CreateOnSharedAll
  1328  		opts1.Experimental.CreateOnSharedLocator = ""
  1329  		// Disable automatic compactions because otherwise we'll race with
  1330  		// delete-only compactions triggered by ingesting range tombstones.
  1331  		opts1.DisableAutomaticCompactions = true
  1332  
  1333  		opts2 := &Options{}
  1334  		*opts2 = *opts1
  1335  		opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1336  			"": sstorage,
  1337  		})
  1338  		opts2.Experimental.CreateOnShared = remote.CreateOnSharedAll
  1339  		opts2.Experimental.CreateOnSharedLocator = ""
  1340  		opts2.FS = mem2
  1341  
  1342  		var err error
  1343  		d1, err = Open("", opts1)
  1344  		require.NoError(t, err)
  1345  		require.NoError(t, d1.SetCreatorID(creatorIDCounter))
  1346  		creatorIDCounter++
  1347  		d2, err = Open("", opts2)
  1348  		require.NoError(t, err)
  1349  		require.NoError(t, d2.SetCreatorID(creatorIDCounter))
  1350  		creatorIDCounter++
  1351  		d = d1
  1352  	}
  1353  	reset()
  1354  
  1355  	datadriven.RunTest(t, "testdata/concurrent_excise", func(t *testing.T, td *datadriven.TestData) string {
  1356  		switch td.Cmd {
  1357  		case "reset":
  1358  			reset()
  1359  			return ""
  1360  		case "switch":
  1361  			if len(td.CmdArgs) != 1 {
  1362  				return "usage: switch <1 or 2>"
  1363  			}
  1364  			switch td.CmdArgs[0].Key {
  1365  			case "1":
  1366  				d = d1
  1367  			case "2":
  1368  				d = d2
  1369  			default:
  1370  				return "usage: switch <1 or 2>"
  1371  			}
  1372  			return "ok"
  1373  		case "batch":
  1374  			b := d.NewIndexedBatch()
  1375  			if err := runBatchDefineCmd(td, b); err != nil {
  1376  				return err.Error()
  1377  			}
  1378  			if err := b.Commit(nil); err != nil {
  1379  				return err.Error()
  1380  			}
  1381  			return ""
  1382  		case "build":
  1383  			if err := runBuildCmd(td, d, d.opts.FS); err != nil {
  1384  				return err.Error()
  1385  			}
  1386  			return ""
  1387  
  1388  		case "flush":
  1389  			if err := d.Flush(); err != nil {
  1390  				return err.Error()
  1391  			}
  1392  			return ""
  1393  
  1394  		case "ingest":
  1395  			if err := runIngestCmd(td, d, d.opts.FS); err != nil {
  1396  				return err.Error()
  1397  			}
  1398  			// Wait for a possible flush.
  1399  			d.mu.Lock()
  1400  			for d.mu.compact.flushing {
  1401  				d.mu.compact.cond.Wait()
  1402  			}
  1403  			d.mu.Unlock()
  1404  			return ""
  1405  
  1406  		case "ingest-and-excise":
  1407  			if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil {
  1408  				return err.Error()
  1409  			}
  1410  			// Wait for a possible flush.
  1411  			d.mu.Lock()
  1412  			for d.mu.compact.flushing {
  1413  				d.mu.compact.cond.Wait()
  1414  			}
  1415  			d.mu.Unlock()
  1416  			return ""
  1417  
  1418  		case "replicate":
  1419  			if len(td.CmdArgs) != 4 {
  1420  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1421  			}
  1422  			var from, to *DB
  1423  			switch td.CmdArgs[0].Key {
  1424  			case "1":
  1425  				from = d1
  1426  			case "2":
  1427  				from = d2
  1428  			default:
  1429  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1430  			}
  1431  			switch td.CmdArgs[1].Key {
  1432  			case "1":
  1433  				to = d1
  1434  			case "2":
  1435  				to = d2
  1436  			default:
  1437  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1438  			}
  1439  			startKey := []byte(td.CmdArgs[2].Key)
  1440  			endKey := []byte(td.CmdArgs[3].Key)
  1441  
  1442  			writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat())
  1443  			sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter)
  1444  			f, err := to.opts.FS.Create(sstPath)
  1445  			require.NoError(t, err)
  1446  			replicateCounter++
  1447  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts)
  1448  
  1449  			var sharedSSTs []SharedSSTMeta
  1450  			err = from.ScanInternal(context.TODO(), startKey, endKey,
  1451  				func(key *InternalKey, value LazyValue, _ IteratorLevel) error {
  1452  					val, _, err := value.Value(nil)
  1453  					require.NoError(t, err)
  1454  					require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val))
  1455  					return nil
  1456  				},
  1457  				func(start, end []byte, seqNum uint64) error {
  1458  					require.NoError(t, w.DeleteRange(start, end))
  1459  					return nil
  1460  				},
  1461  				func(start, end []byte, keys []keyspan.Key) error {
  1462  					s := keyspan.Span{
  1463  						Start:     start,
  1464  						End:       end,
  1465  						Keys:      keys,
  1466  						KeysOrder: 0,
  1467  					}
  1468  					require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error {
  1469  						return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v)
  1470  					}))
  1471  					return nil
  1472  				},
  1473  				func(sst *SharedSSTMeta) error {
  1474  					sharedSSTs = append(sharedSSTs, *sst)
  1475  					return nil
  1476  				},
  1477  			)
  1478  			require.NoError(t, err)
  1479  			require.NoError(t, w.Close())
  1480  
  1481  			_, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey})
  1482  			require.NoError(t, err)
  1483  			return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs))
  1484  
  1485  		case "get":
  1486  			return runGetCmd(t, td, d)
  1487  
  1488  		case "iter":
  1489  			o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges}
  1490  			var reader Reader
  1491  			reader = d
  1492  			for _, arg := range td.CmdArgs {
  1493  				switch arg.Key {
  1494  				case "mask-suffix":
  1495  					o.RangeKeyMasking.Suffix = []byte(arg.Vals[0])
  1496  				case "mask-filter":
  1497  					o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask {
  1498  						return sstable.NewTestKeysMaskingFilter()
  1499  					}
  1500  				case "snapshot":
  1501  					reader = efos[arg.Vals[0]]
  1502  				}
  1503  			}
  1504  			iter, err := reader.NewIter(o)
  1505  			if err != nil {
  1506  				return err.Error()
  1507  			}
  1508  			return runIterCmd(td, iter, true)
  1509  
  1510  		case "lsm":
  1511  			return runLSMCmd(td, d)
  1512  
  1513  		case "metrics":
  1514  			// The asynchronous loading of table stats can change metrics, so
  1515  			// wait for all the tables' stats to be loaded.
  1516  			d.mu.Lock()
  1517  			d.waitTableStats()
  1518  			d.mu.Unlock()
  1519  
  1520  			return d.Metrics().StringForTests()
  1521  
  1522  		case "wait-pending-table-stats":
  1523  			return runTableStatsCmd(td, d)
  1524  
  1525  		case "excise":
  1526  			ve := &versionEdit{
  1527  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  1528  			}
  1529  			var exciseSpan KeyRange
  1530  			if len(td.CmdArgs) != 2 {
  1531  				panic("insufficient args for excise command")
  1532  			}
  1533  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
  1534  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
  1535  
  1536  			d.mu.Lock()
  1537  			d.mu.versions.logLock()
  1538  			d.mu.Unlock()
  1539  			current := d.mu.versions.currentVersion()
  1540  			for level := range current.Levels {
  1541  				iter := current.Levels[level].Iter()
  1542  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
  1543  					_, err := d.excise(exciseSpan, m, ve, level)
  1544  					if err != nil {
  1545  						d.mu.Lock()
  1546  						d.mu.versions.logUnlock()
  1547  						d.mu.Unlock()
  1548  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
  1549  					}
  1550  				}
  1551  			}
  1552  			d.mu.Lock()
  1553  			d.mu.versions.logUnlock()
  1554  			d.mu.Unlock()
  1555  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String())
  1556  
  1557  		case "file-only-snapshot":
  1558  			if len(td.CmdArgs) != 1 {
  1559  				panic("insufficient args for file-only-snapshot command")
  1560  			}
  1561  			name := td.CmdArgs[0].Key
  1562  			var keyRanges []KeyRange
  1563  			for _, line := range strings.Split(td.Input, "\n") {
  1564  				fields := strings.Fields(line)
  1565  				if len(fields) != 2 {
  1566  					return "expected two fields for file-only snapshot KeyRanges"
  1567  				}
  1568  				kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])}
  1569  				keyRanges = append(keyRanges, kr)
  1570  			}
  1571  
  1572  			s := d.NewEventuallyFileOnlySnapshot(keyRanges)
  1573  			efos[name] = s
  1574  			return "ok"
  1575  
  1576  		case "wait-for-file-only-snapshot":
  1577  			if len(td.CmdArgs) != 1 {
  1578  				panic("insufficient args for wait-for-file-only-snapshot command")
  1579  			}
  1580  			name := td.CmdArgs[0].Key
  1581  			err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond)
  1582  			if err != nil {
  1583  				return err.Error()
  1584  			}
  1585  			return "ok"
  1586  
  1587  		case "unblock":
  1588  			name := td.CmdArgs[0].Key
  1589  			blockedCompactionsMu.Lock()
  1590  			c := compactions[name]
  1591  			delete(compactions, name)
  1592  			blockedCompactionsMu.Unlock()
  1593  			c.unblock <- struct{}{}
  1594  			return "ok"
  1595  
  1596  		case "compact":
  1597  			async := false
  1598  			var otherArgs []datadriven.CmdArg
  1599  			var bc *blockedCompaction
  1600  			for i := range td.CmdArgs {
  1601  				switch td.CmdArgs[i].Key {
  1602  				case "block":
  1603  					name := td.CmdArgs[i].Vals[0]
  1604  					bc = &blockedCompaction{startBlock: make(chan struct{}), unblock: make(chan struct{})}
  1605  					blockedCompactionsMu.Lock()
  1606  					compactions[name] = bc
  1607  					blockNextCompaction = true
  1608  					blockedCompactionName = name
  1609  					blockedCompactionsMu.Unlock()
  1610  					async = true
  1611  				default:
  1612  					otherArgs = append(otherArgs, td.CmdArgs[i])
  1613  				}
  1614  			}
  1615  			var tdClone datadriven.TestData
  1616  			tdClone = *td
  1617  			tdClone.CmdArgs = otherArgs
  1618  			if !async {
  1619  				err := runCompactCmd(td, d)
  1620  				if err != nil {
  1621  					return err.Error()
  1622  				}
  1623  			} else {
  1624  				wg.Add(1)
  1625  				go func() {
  1626  					defer wg.Done()
  1627  					_ = runCompactCmd(&tdClone, d)
  1628  				}()
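        				// Wait until the compaction has started (and blocked) before
        				// returning, so that subsequent datadriven commands run against a
        				// deterministic state.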
  1629  				<-bc.startBlock
  1630  				return "spun off in separate goroutine"
  1631  			}
  1632  			return "ok"
  1633  		case "wait-for-background-error":
  1634  			err := <-backgroundErrs
  1635  			return err.Error()
  1636  		default:
  1637  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1638  		}
  1639  	})
  1640  }
  1641  
  1642  func TestIngestExternal(t *testing.T) {
  1643  	var mem vfs.FS
  1644  	var d *DB
  1645  	var flushed bool
  1646  	defer func() {
  1647  		require.NoError(t, d.Close())
  1648  	}()
  1649  
  1650  	var remoteStorage remote.Storage
  1651  
  1652  	reset := func() {
  1653  		if d != nil {
  1654  			require.NoError(t, d.Close())
  1655  		}
  1656  
  1657  		mem = vfs.NewMem()
  1658  		require.NoError(t, mem.MkdirAll("ext", 0755))
  1659  		remoteStorage = remote.NewInMem()
  1660  		opts := &Options{
  1661  			FS:                    mem,
  1662  			L0CompactionThreshold: 100,
  1663  			L0StopWritesThreshold: 100,
  1664  			DebugCheck:            DebugCheckLevels,
  1665  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
  1666  				flushed = true
  1667  			}},
  1668  			FormatMajorVersion: FormatVirtualSSTables,
  1669  		}
  1670  		opts.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1671  			"external-locator": remoteStorage,
  1672  		})
  1673  		opts.Experimental.CreateOnShared = remote.CreateOnSharedNone
  1674  		// Disable automatic compactions because otherwise we'll race with
  1675  		// delete-only compactions triggered by ingesting range tombstones.
  1676  		opts.DisableAutomaticCompactions = true
  1677  
  1678  		var err error
  1679  		d, err = Open("", opts)
  1680  		require.NoError(t, err)
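        		// Set a creator ID up front so the remote/shared object machinery used
        		// by the external ingestions below has one available.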
  1681  		require.NoError(t, d.SetCreatorID(1))
  1682  	}
  1683  	reset()
  1684  
  1685  	datadriven.RunTest(t, "testdata/ingest_external", func(t *testing.T, td *datadriven.TestData) string {
  1686  		switch td.Cmd {
  1687  		case "reset":
  1688  			reset()
  1689  			return ""
  1690  		case "batch":
  1691  			b := d.NewIndexedBatch()
  1692  			if err := runBatchDefineCmd(td, b); err != nil {
  1693  				return err.Error()
  1694  			}
  1695  			if err := b.Commit(nil); err != nil {
  1696  				return err.Error()
  1697  			}
  1698  			return ""
  1699  		case "build-remote":
  1700  			if err := runBuildRemoteCmd(td, d, remoteStorage); err != nil {
  1701  				return err.Error()
  1702  			}
  1703  			return ""
  1704  
  1705  		case "flush":
  1706  			if err := d.Flush(); err != nil {
  1707  				return err.Error()
  1708  			}
  1709  			return ""
  1710  
  1711  		case "ingest-external":
  1712  			flushed = false
  1713  			if err := runIngestExternalCmd(td, d, "external-locator"); err != nil {
  1714  				return err.Error()
  1715  			}
  1716  			// Wait for a possible flush.
  1717  			d.mu.Lock()
  1718  			for d.mu.compact.flushing {
  1719  				d.mu.compact.cond.Wait()
  1720  			}
  1721  			d.mu.Unlock()
  1722  			if flushed {
  1723  				return "memtable flushed"
  1724  			}
  1725  			return ""
  1726  
  1727  		case "get":
  1728  			return runGetCmd(t, td, d)
  1729  
  1730  		case "iter":
  1731  			iter, _ := d.NewIter(&IterOptions{
  1732  				KeyTypes: IterKeyTypePointsAndRanges,
  1733  			})
  1734  			return runIterCmd(td, iter, true)
  1735  
  1736  		case "lsm":
  1737  			return runLSMCmd(td, d)
  1738  
  1739  		case "metrics":
  1740  			// The asynchronous loading of table stats can change metrics, so
  1741  			// wait for all the tables' stats to be loaded.
  1742  			d.mu.Lock()
  1743  			d.waitTableStats()
  1744  			d.mu.Unlock()
  1745  
  1746  			return d.Metrics().StringForTests()
  1747  
  1748  		case "wait-pending-table-stats":
  1749  			return runTableStatsCmd(td, d)
  1750  
  1751  		case "compact":
  1752  			if len(td.CmdArgs) != 2 {
  1753  				panic("insufficient args for compact command")
  1754  			}
  1755  			l := td.CmdArgs[0].Key
  1756  			r := td.CmdArgs[1].Key
  1757  			err := d.Compact([]byte(l), []byte(r), false)
  1758  			if err != nil {
  1759  				return err.Error()
  1760  			}
  1761  			return ""
  1762  		default:
  1763  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1764  		}
  1765  	})
  1766  }
  1767  
  1768  func TestIngestMemtableOverlaps(t *testing.T) {
  1769  	comparers := []Comparer{
  1770  		{Name: "default", Compare: DefaultComparer.Compare, FormatKey: DefaultComparer.FormatKey},
  1771  		{
  1772  			Name:      "reverse",
  1773  			Compare:   func(a, b []byte) int { return DefaultComparer.Compare(b, a) },
  1774  			FormatKey: DefaultComparer.FormatKey,
  1775  		},
  1776  	}
  1777  	m := make(map[string]*Comparer)
  1778  	for i := range comparers {
  1779  		c := &comparers[i]
  1780  		m[c.Name] = c
  1781  	}
  1782  
  1783  	for _, comparer := range comparers {
  1784  		t.Run(comparer.Name, func(t *testing.T) {
  1785  			var mem *memTable
  1786  
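        			// parseMeta parses a table spec of the form "<smallest>-<largest>".
        			// Bounds containing a '.' are parsed as full internal keys (user
        			// key, sequence number and kind); otherwise they are treated as
        			// bare user keys.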
  1787  			parseMeta := func(s string) *fileMetadata {
  1788  				parts := strings.Split(s, "-")
  1789  				meta := &fileMetadata{}
  1790  				if len(parts) != 2 {
  1791  					t.Fatalf("malformed table spec: %s", s)
  1792  				}
  1793  				var smallest, largest base.InternalKey
  1794  				if strings.Contains(parts[0], ".") {
  1795  					if !strings.Contains(parts[1], ".") {
  1796  						t.Fatalf("malformed table spec: %s", s)
  1797  					}
  1798  					smallest = base.ParseInternalKey(parts[0])
  1799  					largest = base.ParseInternalKey(parts[1])
  1800  				} else {
  1801  					smallest = InternalKey{UserKey: []byte(parts[0])}
  1802  					largest = InternalKey{UserKey: []byte(parts[1])}
  1803  				}
  1804  				// If we're using a reverse comparer, flip the file bounds.
  1805  				if mem.cmp(smallest.UserKey, largest.UserKey) > 0 {
  1806  					smallest, largest = largest, smallest
  1807  				}
  1808  				meta.ExtendPointKeyBounds(comparer.Compare, smallest, largest)
  1809  				meta.InitPhysicalBacking()
  1810  				return meta
  1811  			}
  1812  
  1813  			datadriven.RunTest(t, "testdata/ingest_memtable_overlaps", func(t *testing.T, d *datadriven.TestData) string {
  1814  				switch d.Cmd {
  1815  				case "define":
  1816  					b := newBatch(nil)
  1817  					if err := runBatchDefineCmd(d, b); err != nil {
  1818  						return err.Error()
  1819  					}
  1820  
  1821  					opts := &Options{
  1822  						Comparer: &comparer,
  1823  					}
  1824  					opts.EnsureDefaults().WithFSDefaults()
  1825  					if len(d.CmdArgs) > 1 {
  1826  						return fmt.Sprintf("%s expects at most 1 argument", d.Cmd)
  1827  					}
  1828  					if len(d.CmdArgs) == 1 {
  1829  						opts.Comparer = m[d.CmdArgs[0].String()]
  1830  						if opts.Comparer == nil {
  1831  							return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, d.CmdArgs[0].String())
  1832  						}
  1833  					}
  1834  
  1835  					mem = newMemTable(memTableOptions{Options: opts})
  1836  					if err := mem.apply(b, 0); err != nil {
  1837  						return err.Error()
  1838  					}
  1839  					return ""
  1840  
  1841  				case "overlaps":
  1842  					var buf bytes.Buffer
  1843  					for _, data := range strings.Split(d.Input, "\n") {
  1844  						var keyRanges []internalKeyRange
  1845  						for _, part := range strings.Fields(data) {
  1846  							meta := parseMeta(part)
  1847  							keyRanges = append(keyRanges, internalKeyRange{smallest: meta.Smallest, largest: meta.Largest})
  1848  						}
  1849  						fmt.Fprintf(&buf, "%t\n", ingestMemtableOverlaps(mem.cmp, mem, keyRanges))
  1850  					}
  1851  					return buf.String()
  1852  
  1853  				default:
  1854  					return fmt.Sprintf("unknown command: %s", d.Cmd)
  1855  				}
  1856  			})
  1857  		})
  1858  	}
  1859  }
  1860  
  1861  func TestKeyRangeBasic(t *testing.T) {
  1862  	cmp := base.DefaultComparer.Compare
  1863  	k1 := KeyRange{Start: []byte("b"), End: []byte("c")}
  1864  
  1865  	// Tests for Contains()
  1866  	require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet)))
  1867  	require.False(t, k1.Contains(cmp, base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet)))
  1868  	require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("bb"), 1, InternalKeyKindSet)))
  1869  	require.True(t, k1.Contains(cmp, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("c"))))
  1870  
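        	// Tests for Overlaps(). k1's end is exclusive, so m2 (which starts at
        	// "c") does not overlap, and m3's largest key is an exclusive sentinel
        	// at "b", so it does not overlap either; m1 and m4 both contain keys in
        	// [b, c) and therefore overlap.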
  1871  	m1 := &fileMetadata{
  1872  		Smallest: base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet),
  1873  		Largest:  base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet),
  1874  	}
  1875  	require.True(t, k1.Overlaps(cmp, m1))
  1876  	m2 := &fileMetadata{
  1877  		Smallest: base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet),
  1878  		Largest:  base.MakeInternalKey([]byte("d"), 1, InternalKeyKindSet),
  1879  	}
  1880  	require.False(t, k1.Overlaps(cmp, m2))
  1881  	m3 := &fileMetadata{
  1882  		Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet),
  1883  		Largest:  base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("b")),
  1884  	}
  1885  	require.False(t, k1.Overlaps(cmp, m3))
  1886  	m4 := &fileMetadata{
  1887  		Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet),
  1888  		Largest:  base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet),
  1889  	}
  1890  	require.True(t, k1.Overlaps(cmp, m4))
  1891  }
  1892  
  1893  func BenchmarkIngestOverlappingMemtable(b *testing.B) {
  1894  	assertNoError := func(err error) {
  1895  		b.Helper()
  1896  		if err != nil {
  1897  			b.Fatal(err)
  1898  		}
  1899  	}
  1900  
  1901  	for count := 1; count < 6; count++ {
  1902  		b.Run(fmt.Sprintf("memtables=%d", count), func(b *testing.B) {
  1903  			for i := 0; i < b.N; i++ {
  1904  				b.StopTimer()
  1905  				mem := vfs.NewMem()
  1906  				d, err := Open("", &Options{
  1907  					FS: mem,
  1908  				})
  1909  				assertNoError(err)
  1910  
  1911  				// Create memtables.
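        				// Each Set appends a new entry even for a duplicate key, so the
        				// mutable memtable eventually fills up and is rotated into the
        				// flushable queue until `count` memtables are queued.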
  1912  				for {
  1913  					assertNoError(d.Set([]byte("a"), nil, nil))
  1914  					d.mu.Lock()
  1915  					done := len(d.mu.mem.queue) == count
  1916  					d.mu.Unlock()
  1917  					if done {
  1918  						break
  1919  					}
  1920  				}
  1921  
  1922  				// Create the overlapping sstable that will force a flush when ingested.
  1923  				f, err := mem.Create("ext")
  1924  				assertNoError(err)
  1925  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  1926  				assertNoError(w.Set([]byte("a"), nil))
  1927  				assertNoError(w.Close())
  1928  
  1929  				b.StartTimer()
  1930  				assertNoError(d.Ingest([]string{"ext"}))
  1931  			}
  1932  		})
  1933  	}
  1934  }
  1935  
  1936  func TestIngestTargetLevel(t *testing.T) {
  1937  	var d *DB
  1938  	defer func() {
  1939  		if d != nil {
  1940  		// Ignore errors because this test defines fake in-progress compactions
  1941  		// that prohibit clean shutdown.
  1942  			_ = d.Close()
  1943  		}
  1944  	}()
  1945  
  1946  	parseMeta := func(s string) *fileMetadata {
  1947  		var rkey bool
  1948  		if len(s) >= 5 && s[0:4] == "rkey" {
  1949  			rkey = true
  1950  			s = s[5:]
  1951  		}
  1952  		parts := strings.Split(s, "-")
  1953  		if len(parts) != 2 {
  1954  			t.Fatalf("malformed table spec: %s", s)
  1955  		}
  1956  		var m *fileMetadata
  1957  		if rkey {
  1958  			m = (&fileMetadata{}).ExtendRangeKeyBounds(
  1959  				d.cmp,
  1960  				InternalKey{UserKey: []byte(parts[0])},
  1961  				InternalKey{UserKey: []byte(parts[1])},
  1962  			)
  1963  		} else {
  1964  			m = (&fileMetadata{}).ExtendPointKeyBounds(
  1965  				d.cmp,
  1966  				InternalKey{UserKey: []byte(parts[0])},
  1967  				InternalKey{UserKey: []byte(parts[1])},
  1968  			)
  1969  		}
  1970  		m.InitPhysicalBacking()
  1971  		return m
  1972  	}
  1973  
  1974  	datadriven.RunTest(t, "testdata/ingest_target_level", func(t *testing.T, td *datadriven.TestData) string {
  1975  		switch td.Cmd {
  1976  		case "define":
  1977  			if d != nil {
  1978  				// Ignore errors because this test defines fake in-progress
  1979  				// compactions that prohibit clean shutdown.
  1980  				_ = d.Close()
  1981  			}
  1982  
  1983  			var err error
  1984  			opts := Options{
  1985  				FormatMajorVersion: internalFormatNewest,
  1986  			}
  1987  			opts.WithFSDefaults()
  1988  			if d, err = runDBDefineCmd(td, &opts); err != nil {
  1989  				return err.Error()
  1990  			}
  1991  
  1992  			readState := d.loadReadState()
  1993  			c := &checkConfig{
  1994  				logger:    d.opts.Logger,
  1995  				comparer:  d.opts.Comparer,
  1996  				readState: readState,
  1997  				newIters:  d.newIters,
  1998  				// TODO: runDBDefineCmd doesn't properly update the visible sequence
  1999  				// number, so we have to explicitly configure the level checker with a
  2000  				// very large sequence number; otherwise the DB appears empty.
  2001  				seqNum: InternalKeySeqNumMax,
  2002  			}
  2003  			if err := checkLevelsInternal(c); err != nil {
  2004  				return err.Error()
  2005  			}
  2006  			readState.unref()
  2007  
  2008  			d.mu.Lock()
  2009  			s := d.mu.versions.currentVersion().String()
  2010  			d.mu.Unlock()
  2011  			return s
  2012  
  2013  		case "target":
  2014  			var buf bytes.Buffer
  2015  			suggestSplit := false
  2016  			for _, cmd := range td.CmdArgs {
  2017  				switch cmd.Key {
  2018  				case "suggest-split":
  2019  					suggestSplit = true
  2020  				}
  2021  			}
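        			// For each table spec, ask ingestTargetLevel where the file would
        			// be placed given the current version and in-progress compactions.
        			// With suggest-split enabled it may also return an existing file
        			// that could be split to make room for the ingested file.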
  2022  			for _, target := range strings.Split(td.Input, "\n") {
  2023  				meta := parseMeta(target)
  2024  				level, overlapFile, err := ingestTargetLevel(
  2025  					d.newIters, d.tableNewRangeKeyIter, IterOptions{logger: d.opts.Logger},
  2026  					d.opts.Comparer, d.mu.versions.currentVersion(), 1, d.mu.compact.inProgress, meta,
  2027  					suggestSplit)
  2028  				if err != nil {
  2029  					return err.Error()
  2030  				}
  2031  				if overlapFile != nil {
  2032  					fmt.Fprintf(&buf, "%d (split file: %s)\n", level, overlapFile.FileNum)
  2033  				} else {
  2034  					fmt.Fprintf(&buf, "%d\n", level)
  2035  				}
  2036  			}
  2037  			return buf.String()
  2038  
  2039  		default:
  2040  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  2041  		}
  2042  	})
  2043  }
  2044  
  2045  func TestIngest(t *testing.T) {
  2046  	var mem vfs.FS
  2047  	var d *DB
  2048  	var flushed bool
  2049  	defer func() {
  2050  		require.NoError(t, d.Close())
  2051  	}()
  2052  
  2053  	reset := func(split bool) {
  2054  		if d != nil {
  2055  			require.NoError(t, d.Close())
  2056  		}
  2057  
  2058  		mem = vfs.NewMem()
  2059  		require.NoError(t, mem.MkdirAll("ext", 0755))
  2060  		opts := &Options{
  2061  			FS:                    mem,
  2062  			L0CompactionThreshold: 100,
  2063  			L0StopWritesThreshold: 100,
  2064  			DebugCheck:            DebugCheckLevels,
  2065  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
  2066  				flushed = true
  2067  			}},
  2068  			FormatMajorVersion: internalFormatNewest,
  2069  		}
  2070  		opts.Experimental.IngestSplit = func() bool {
  2071  			return split
  2072  		}
  2073  		// Disable automatic compactions because otherwise we'll race with
  2074  		// delete-only compactions triggered by ingesting range tombstones.
  2075  		opts.DisableAutomaticCompactions = true
  2076  
  2077  		var err error
  2078  		d, err = Open("", opts)
  2079  		require.NoError(t, err)
  2080  	}
  2081  	reset(false /* split */)
  2082  
  2083  	datadriven.RunTest(t, "testdata/ingest", func(t *testing.T, td *datadriven.TestData) string {
  2084  		switch td.Cmd {
  2085  		case "reset":
  2086  			split := false
  2087  			for _, cmd := range td.CmdArgs {
  2088  				switch cmd.Key {
  2089  				case "enable-split":
  2090  					split = true
  2091  				default:
  2092  					return fmt.Sprintf("unexpected key: %s", cmd.Key)
  2093  				}
  2094  			}
  2095  			reset(split)
  2096  			return ""
  2097  		case "batch":
  2098  			b := d.NewIndexedBatch()
  2099  			if err := runBatchDefineCmd(td, b); err != nil {
  2100  				return err.Error()
  2101  			}
  2102  			if err := b.Commit(nil); err != nil {
  2103  				return err.Error()
  2104  			}
  2105  			return ""
  2106  
  2107  		case "build":
  2108  			if err := runBuildCmd(td, d, mem); err != nil {
  2109  				return err.Error()
  2110  			}
  2111  			return ""
  2112  
  2113  		case "ingest":
  2114  			flushed = false
  2115  			if err := runIngestCmd(td, d, mem); err != nil {
  2116  				return err.Error()
  2117  			}
  2118  			// Wait for a possible flush.
  2119  			d.mu.Lock()
  2120  			for d.mu.compact.flushing {
  2121  				d.mu.compact.cond.Wait()
  2122  			}
  2123  			d.mu.Unlock()
  2124  			if flushed {
  2125  				return "memtable flushed"
  2126  			}
  2127  			return ""
  2128  
  2129  		case "get":
  2130  			return runGetCmd(t, td, d)
  2131  
  2132  		case "iter":
  2133  			iter, _ := d.NewIter(&IterOptions{
  2134  				KeyTypes: IterKeyTypePointsAndRanges,
  2135  			})
  2136  			return runIterCmd(td, iter, true)
  2137  
  2138  		case "lsm":
  2139  			return runLSMCmd(td, d)
  2140  
  2141  		case "metrics":
  2142  			// The asynchronous loading of table stats can change metrics, so
  2143  			// wait for all the tables' stats to be loaded.
  2144  			d.mu.Lock()
  2145  			d.waitTableStats()
  2146  			d.mu.Unlock()
  2147  
  2148  			return d.Metrics().StringForTests()
  2149  
  2150  		case "wait-pending-table-stats":
  2151  			return runTableStatsCmd(td, d)
  2152  
  2153  		case "compact":
  2154  			if len(td.CmdArgs) != 2 {
  2155  				panic("insufficient args for compact command")
  2156  			}
  2157  			l := td.CmdArgs[0].Key
  2158  			r := td.CmdArgs[1].Key
  2159  			err := d.Compact([]byte(l), []byte(r), false)
  2160  			if err != nil {
  2161  				return err.Error()
  2162  			}
  2163  			return ""
  2164  		default:
  2165  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  2166  		}
  2167  	})
  2168  }
  2169  
  2170  func TestIngestError(t *testing.T) {
  2171  	for i := int32(0); ; i++ {
  2172  		mem := vfs.NewMem()
  2173  
  2174  		f0, err := mem.Create("ext0")
  2175  		require.NoError(t, err)
  2176  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f0), sstable.WriterOptions{})
  2177  		require.NoError(t, w.Set([]byte("d"), nil))
  2178  		require.NoError(t, w.Close())
  2179  		f1, err := mem.Create("ext1")
  2180  		require.NoError(t, err)
  2181  		w = sstable.NewWriter(objstorageprovider.NewFileWritable(f1), sstable.WriterOptions{})
  2182  		require.NoError(t, w.Set([]byte("d"), nil))
  2183  		require.NoError(t, w.Close())
  2184  
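        		// The injector starts out disabled (index -1) and is re-armed with
        		// SetIndex(i) below, so iteration i fails the i-th filesystem
        		// operation performed during the ingestions.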
  2185  		inj := errorfs.OnIndex(-1)
  2186  		d, err := Open("", &Options{
  2187  			FS:                    errorfs.Wrap(mem, inj),
  2188  			Logger:                panicLogger{},
  2189  			L0CompactionThreshold: 8,
  2190  		})
  2191  		require.NoError(t, err)
  2192  		// Force the creation of an L0 sstable that overlaps with the tables
  2193  		// we'll attempt to ingest. This ensures that we exercise filesystem
  2194  		// codepaths when determining the ingest target level.
  2195  		require.NoError(t, d.Set([]byte("a"), nil, nil))
  2196  		require.NoError(t, d.Set([]byte("d"), nil, nil))
  2197  		require.NoError(t, d.Flush())
  2198  
  2199  		t.Run(fmt.Sprintf("index-%d", i), func(t *testing.T) {
  2200  			defer func() {
  2201  				if r := recover(); r != nil {
  2202  					if e, ok := r.(error); ok && errors.Is(e, errorfs.ErrInjected) {
  2203  						return
  2204  					}
  2205  					// d.opts.Logger.Fatalf won't propagate ErrInjected
  2206  					// itself, but the panic message should contain its text.
  2207  					if strings.HasSuffix(fmt.Sprint(r), errorfs.ErrInjected.Error()) {
  2208  						return
  2209  					}
  2210  					t.Fatal(r)
  2211  				}
  2212  			}()
  2213  
  2214  			inj.SetIndex(i)
  2215  			err1 := d.Ingest([]string{"ext0"})
  2216  			err2 := d.Ingest([]string{"ext1"})
  2217  			err := firstError(err1, err2)
  2218  			if err != nil && !errors.Is(err, errorfs.ErrInjected) {
  2219  				t.Fatal(err)
  2220  			}
  2221  		})
  2222  
  2223  		// d.Close may error if we failed to flush the manifest.
  2224  		_ = d.Close()
  2225  
  2226  		// If the injector's index is non-negative, the i-th filesystem operation
  2227  		// was never executed: every failure point has been exercised, so stop.
  2228  		if inj.Index() >= 0 {
  2229  			break
  2230  		}
  2231  	}
  2232  }
  2233  
  2234  func TestIngestIdempotence(t *testing.T) {
  2235  	// Use an on-disk filesystem, because Ingest with a MemFS will copy, not
  2236  	// link the ingested file.
  2237  	dir, err := os.MkdirTemp("", "ingest-idempotence")
  2238  	require.NoError(t, err)
  2239  	defer os.RemoveAll(dir)
  2240  	fs := vfs.Default
  2241  
  2242  	path := fs.PathJoin(dir, "ext")
  2243  	f, err := fs.Create(fs.PathJoin(dir, "ext"))
  2244  	require.NoError(t, err)
  2245  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2246  	require.NoError(t, w.Set([]byte("d"), nil))
  2247  	require.NoError(t, w.Close())
  2248  
  2249  	d, err := Open(dir, &Options{
  2250  		FS: fs,
  2251  	})
  2252  	require.NoError(t, err)
  2253  	const count = 4
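        	// Ingest a fresh hard link to the same physical sstable several times.
        	// Each ingestion must succeed even though the file contents are
        	// identical to previously ingested tables.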
  2254  	for i := 0; i < count; i++ {
  2255  		ingestPath := fs.PathJoin(dir, fmt.Sprintf("ext%d", i))
  2256  		require.NoError(t, fs.Link(path, ingestPath))
  2257  		require.NoError(t, d.Ingest([]string{ingestPath}))
  2258  	}
  2259  	require.NoError(t, d.Close())
  2260  }
  2261  
  2262  func TestIngestCompact(t *testing.T) {
  2263  	mem := vfs.NewMem()
  2264  	lel := MakeLoggingEventListener(&base.InMemLogger{})
  2265  	d, err := Open("", &Options{
  2266  		EventListener:         &lel,
  2267  		FS:                    mem,
  2268  		L0CompactionThreshold: 1,
  2269  		L0StopWritesThreshold: 1,
  2270  	})
  2271  	require.NoError(t, err)
  2272  
  2273  	src := func(i int) string {
  2274  		return fmt.Sprintf("ext%d", i)
  2275  	}
  2276  	f, err := mem.Create(src(0))
  2277  	require.NoError(t, err)
  2278  
  2279  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2280  	key := []byte("a")
  2281  	require.NoError(t, w.Add(base.MakeInternalKey(key, 0, InternalKeyKindSet), nil))
  2282  	require.NoError(t, w.Close())
  2283  
  2284  	// Make N copies of the sstable.
  2285  	const count = 20
  2286  	for i := 1; i < count; i++ {
  2287  		require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i)))
  2288  	}
  2289  
  2290  	// Ingest the same sstable multiple times. Compaction should take place as
  2291  	// ingestion happens, preventing an indefinite write stall from occurring.
  2292  	for i := 0; i < count; i++ {
  2293  		if i == 10 {
  2294  			// Half-way through the ingestions, set a key in the memtable to force
  2295  			// overlap with the memtable which will require the memtable to be
  2296  			// flushed.
  2297  			require.NoError(t, d.Set(key, nil, nil))
  2298  		}
  2299  		require.NoError(t, d.Ingest([]string{src(i)}))
  2300  	}
  2301  
  2302  	require.NoError(t, d.Close())
  2303  }
  2304  
  2305  func TestConcurrentIngest(t *testing.T) {
  2306  	mem := vfs.NewMem()
  2307  	d, err := Open("", &Options{
  2308  		FS: mem,
  2309  	})
  2310  	require.NoError(t, err)
  2311  
  2312  	// Create an sstable with 2 keys. This is necessary to trigger the overlap
  2313  	// bug because an sstable with a single key will not have overlap in internal
  2314  	// key space and the sequence number assignment already guarantees correct
  2315  	// ordering.
  2316  	src := func(i int) string {
  2317  		return fmt.Sprintf("ext%d", i)
  2318  	}
  2319  	f, err := mem.Create(src(0))
  2320  	require.NoError(t, err)
  2321  
  2322  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2323  	require.NoError(t, w.Set([]byte("a"), nil))
  2324  	require.NoError(t, w.Set([]byte("b"), nil))
  2325  	require.NoError(t, w.Close())
  2326  
  2327  	// Make N copies of the sstable.
  2328  	errCh := make(chan error, 5)
  2329  	for i := 1; i < cap(errCh); i++ {
  2330  		require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i)))
  2331  	}
  2332  
  2333  	// Perform N ingestions concurrently.
  2334  	for i := 0; i < cap(errCh); i++ {
  2335  		go func(i int) {
  2336  			err := d.Ingest([]string{src(i)})
  2337  			if err == nil {
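        				// A successful ingestion consumes (removes) its source file, so
        				// a not-exist error from Stat here is expected rather than a
        				// failure.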
  2338  				if _, err = d.opts.FS.Stat(src(i)); oserror.IsNotExist(err) {
  2339  					err = nil
  2340  				}
  2341  			}
  2342  			errCh <- err
  2343  		}(i)
  2344  	}
  2345  	for i := 0; i < cap(errCh); i++ {
  2346  		require.NoError(t, <-errCh)
  2347  	}
  2348  
  2349  	require.NoError(t, d.Close())
  2350  }
  2351  
  2352  func TestConcurrentIngestCompact(t *testing.T) {
  2353  	for i := 0; i < 2; i++ {
  2354  		t.Run("", func(t *testing.T) {
  2355  			mem := vfs.NewMem()
  2356  			compactionReady := make(chan struct{})
  2357  			compactionBegin := make(chan struct{})
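        			// The TableCreated listener below makes the compaction pause after
        			// creating its output table: it signals compactionReady and then
        			// waits for compactionBegin, letting the test interleave an
        			// ingestion at that exact point.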
  2358  			d, err := Open("", &Options{
  2359  				FS: mem,
  2360  				EventListener: &EventListener{
  2361  					TableCreated: func(info TableCreateInfo) {
  2362  						if info.Reason == "compacting" {
  2363  							close(compactionReady)
  2364  							<-compactionBegin
  2365  						}
  2366  					},
  2367  				},
  2368  			})
  2369  			require.NoError(t, err)
  2370  
  2371  			ingest := func(keys ...string) {
  2372  				t.Helper()
  2373  				f, err := mem.Create("ext")
  2374  				require.NoError(t, err)
  2375  
  2376  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2377  				for _, k := range keys {
  2378  					require.NoError(t, w.Set([]byte(k), nil))
  2379  				}
  2380  				require.NoError(t, w.Close())
  2381  				require.NoError(t, d.Ingest([]string{"ext"}))
  2382  			}
  2383  
  2384  			compact := func(start, end string) {
  2385  				t.Helper()
  2386  				require.NoError(t, d.Compact([]byte(start), []byte(end), false))
  2387  			}
  2388  
  2389  			lsm := func() string {
  2390  				d.mu.Lock()
  2391  				s := d.mu.versions.currentVersion().String()
  2392  				d.mu.Unlock()
  2393  				return s
  2394  			}
  2395  
  2396  			expectLSM := func(expected string) {
  2397  				t.Helper()
  2398  				expected = strings.TrimSpace(expected)
  2399  				actual := strings.TrimSpace(lsm())
  2400  				if expected != actual {
  2401  					t.Fatalf("expected\n%s\nbut found\n%s", expected, actual)
  2402  				}
  2403  			}
  2404  
  2405  			ingest("a")
  2406  			ingest("a")
  2407  			ingest("c")
  2408  			ingest("c")
  2409  
  2410  			expectLSM(`
  2411  0.0:
  2412    000005:[a#11,SET-a#11,SET]
  2413    000007:[c#13,SET-c#13,SET]
  2414  6:
  2415    000004:[a#10,SET-a#10,SET]
  2416    000006:[c#12,SET-c#12,SET]
  2417  `)
  2418  
  2419  			// At this point ingestion of an sstable containing only key "b" will be
  2420  			// targeted at L6. Yet a concurrent compaction of sstables 5 and 7 will
  2421  			// create a new sstable in L6 spanning ["a"-"c"]. So the ingestion must
  2422  			// actually target L5.
  2423  
  2424  			switch i {
  2425  			case 0:
  2426  				// Compact, then ingest.
  2427  				go func() {
  2428  					<-compactionReady
  2429  
  2430  					ingest("b")
  2431  
  2432  					close(compactionBegin)
  2433  				}()
  2434  
  2435  				compact("a", "z")
  2436  
  2437  				expectLSM(`
  2438  0.0:
  2439    000009:[b#14,SET-b#14,SET]
  2440  6:
  2441    000008:[a#0,SET-c#0,SET]
  2442  `)
  2443  
  2444  			case 1:
  2445  				// Ingest, then compact.
  2446  				var wg sync.WaitGroup
  2447  				wg.Add(1)
  2448  				go func() {
  2449  					defer wg.Done()
  2450  					close(compactionBegin)
  2451  					compact("a", "z")
  2452  				}()
  2453  
  2454  				ingest("b")
  2455  				wg.Wait()
  2456  
  2457  				// Because we're performing the ingestion and compaction concurrently,
  2458  				// we can't guarantee any particular LSM structure at this point. The
  2459  				// test will fail with an assertion error due to overlapping sstables
  2460  				// if there is insufficient synchronization between ingestion and
  2461  				// compaction.
  2462  			}
  2463  
  2464  			require.NoError(t, d.Close())
  2465  		})
  2466  	}
  2467  }
  2468  
  2469  func TestIngestFlushQueuedMemTable(t *testing.T) {
  2470  	// Verify that ingestion forces a flush of a queued memtable.
  2471  
  2472  	// Test with a format major version prior to FormatFlushableIngest and one
  2473  	// after. Both should result in the same statistic calculations.
  2474  	for _, fmv := range []FormatMajorVersion{FormatFlushableIngest - 1, internalFormatNewest} {
  2475  		func(fmv FormatMajorVersion) {
  2476  			mem := vfs.NewMem()
  2477  			d, err := Open("", &Options{
  2478  				FS:                 mem,
  2479  				FormatMajorVersion: fmv,
  2480  			})
  2481  			require.NoError(t, err)
  2482  
  2483  			// Add the key "a" to the memtable, then fill up the memtable with the key
  2484  			// "b". The ingested sstable will only overlap with the queued memtable.
  2485  			require.NoError(t, d.Set([]byte("a"), nil, nil))
  2486  			for {
  2487  				require.NoError(t, d.Set([]byte("b"), nil, nil))
  2488  				d.mu.Lock()
  2489  				done := len(d.mu.mem.queue) == 2
  2490  				d.mu.Unlock()
  2491  				if done {
  2492  					break
  2493  				}
  2494  			}
  2495  
  2496  			ingest := func(keys ...string) {
  2497  				t.Helper()
  2498  				f, err := mem.Create("ext")
  2499  				require.NoError(t, err)
  2500  
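        				// Write the table using the minimum table format supported by
        				// the DB's format major version so that both versions under test
        				// ingest a compatible table.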
  2501  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  2502  					TableFormat: fmv.MinTableFormat(),
  2503  				})
  2504  				for _, k := range keys {
  2505  					require.NoError(t, w.Set([]byte(k), nil))
  2506  				}
  2507  				require.NoError(t, w.Close())
  2508  				stats, err := d.IngestWithStats([]string{"ext"})
  2509  				require.NoError(t, err)
  2510  				require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes)
  2511  				require.Equal(t, stats.MemtableOverlappingFiles, 1)
  2512  				require.Less(t, uint64(0), stats.Bytes)
  2513  			}
  2514  
  2515  			ingest("a")
  2516  
  2517  			require.NoError(t, d.Close())
  2518  		}(fmv)
  2519  	}
  2520  }
  2521  
  2522  func TestIngestStats(t *testing.T) {
  2523  	mem := vfs.NewMem()
  2524  	d, err := Open("", &Options{
  2525  		FS: mem,
  2526  	})
  2527  	require.NoError(t, err)
  2528  
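        	// ingest builds a single sstable from the given keys and ingests it.
        	// expectedLevel == 0 means the table is expected to land in L0, in which
        	// case all of its bytes should be counted in ApproxIngestedIntoL0Bytes;
        	// otherwise that counter should remain zero.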
  2529  	ingest := func(expectedLevel int, keys ...string) {
  2530  		t.Helper()
  2531  		f, err := mem.Create("ext")
  2532  		require.NoError(t, err)
  2533  
  2534  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2535  		for _, k := range keys {
  2536  			require.NoError(t, w.Set([]byte(k), nil))
  2537  		}
  2538  		require.NoError(t, w.Close())
  2539  		stats, err := d.IngestWithStats([]string{"ext"})
  2540  		require.NoError(t, err)
  2541  		if expectedLevel == 0 {
  2542  			require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes)
  2543  		} else {
  2544  			require.EqualValues(t, 0, stats.ApproxIngestedIntoL0Bytes)
  2545  		}
  2546  		require.Less(t, uint64(0), stats.Bytes)
  2547  	}
  2548  	ingest(6, "a")
  2549  	ingest(0, "a")
  2550  	ingest(6, "b", "g")
  2551  	ingest(0, "c")
  2552  	require.NoError(t, d.Close())
  2553  }
  2554  
  2555  func TestIngestFlushQueuedLargeBatch(t *testing.T) {
  2556  	// Verify that ingestion forces a flush of a queued large batch.
  2557  
  2558  	mem := vfs.NewMem()
  2559  	d, err := Open("", &Options{
  2560  		FS: mem,
  2561  	})
  2562  	require.NoError(t, err)
  2563  
  2564  	// The default large batch threshold is slightly less than 1/2 of the
  2565  	// memtable size, which makes it awkward to queue a large batch without also
  2566  	// triggering a flush. Manually lower the threshold to 1/8 of the memtable
  2567  	// size in order to more easily create a situation where a large batch is
  2568  	// queued but not automatically flushed.
  2569  	d.mu.Lock()
  2570  	d.largeBatchThreshold = d.opts.MemTableSize / 8
  2571  	d.mu.Unlock()
  2572  
  2573  	// Set a record with a large value. This will be transformed into a large
  2574  	// batch and placed in the flushable queue.
  2575  	require.NoError(t, d.Set([]byte("a"), bytes.Repeat([]byte("v"), int(d.largeBatchThreshold)), nil))
  2576  
  2577  	ingest := func(keys ...string) {
  2578  		t.Helper()
  2579  		f, err := mem.Create("ext")
  2580  		require.NoError(t, err)
  2581  
  2582  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2583  		for _, k := range keys {
  2584  			require.NoError(t, w.Set([]byte(k), nil))
  2585  		}
  2586  		require.NoError(t, w.Close())
  2587  		require.NoError(t, d.Ingest([]string{"ext"}))
  2588  	}
  2589  
  2590  	ingest("a")
  2591  
  2592  	require.NoError(t, d.Close())
  2593  }
  2594  
  2595  func TestIngestMemtablePendingOverlap(t *testing.T) {
  2596  	mem := vfs.NewMem()
  2597  	d, err := Open("", &Options{
  2598  		FS: mem,
  2599  	})
  2600  	require.NoError(t, err)
  2601  
  2602  	d.mu.Lock()
  2603  	// Use a custom commit pipeline apply function to give us control over
  2604  	// timing of events.
  2605  	assignedBatch := make(chan struct{})
  2606  	applyBatch := make(chan struct{})
  2607  	originalApply := d.commit.env.apply
  2608  	d.commit.env.apply = func(b *Batch, mem *memTable) error {
  2609  		assignedBatch <- struct{}{}
  2610  		applyBatch <- struct{}{}
  2611  		return originalApply(b, mem)
  2612  	}
  2613  	d.mu.Unlock()
  2614  
  2615  	ingest := func(keys ...string) {
  2616  		t.Helper()
  2617  		f, err := mem.Create("ext")
  2618  		require.NoError(t, err)
  2619  
  2620  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2621  		for _, k := range keys {
  2622  			require.NoError(t, w.Set([]byte(k), nil))
  2623  		}
  2624  		require.NoError(t, w.Close())
  2625  		require.NoError(t, d.Ingest([]string{"ext"}))
  2626  	}
  2627  
  2628  	var wg sync.WaitGroup
  2629  	wg.Add(2)
  2630  
  2631  	// First, Set('c') begins. This call will:
  2632  	//
  2633  	// * enqueue the batch to the pending queue.
  2634  	// * allocate a sequence number `x`.
  2635  	// * write the batch to the WAL.
  2636  	//
  2637  	// and then block until we read from the `applyBatch` channel down below.
  2638  	go func() {
  2639  		err := d.Set([]byte("c"), nil, nil)
  2640  		if err != nil {
  2641  			t.Error(err)
  2642  		}
  2643  		wg.Done()
  2644  	}()
  2645  
  2646  	// When the above Set('c') is ready to apply, it sends on the
  2647  	// `assignedBatch` channel. Once that happens, we start Ingest('a', 'c').
  2648  	// The Ingest('a', 'c') allocates sequence number `x + 1`.
  2649  	go func() {
  2650  		// Wait until the Set has grabbed a sequence number before ingesting.
  2651  		<-assignedBatch
  2652  		ingest("a", "c")
  2653  		wg.Done()
  2654  	}()
  2655  
  2656  	// The Set('c')#1 and Ingest('a', 'c')#2 are both pending. To maintain
  2657  	// sequence number invariants, the Set needs to be applied and flushed
  2658  	// before the Ingest determines its target level.
  2659  	//
  2660  	// Sleep a bit to ensure that the Ingest has time to call into
  2661  	// AllocateSeqNum. Once it allocates its sequence number, it should see
  2662  	// that there are unpublished sequence numbers below it and spin until the
  2663  	// Set's sequence number is published. After sleeping, read from
  2664  	// `applyBatch` to actually allow the Set to apply and publish its
  2665  	// sequence number.
  2666  	time.Sleep(100 * time.Millisecond)
  2667  	<-applyBatch
  2668  
  2669  	// Wait for both calls to complete.
  2670  	wg.Wait()
  2671  	require.NoError(t, d.Flush())
  2672  	require.NoError(t, d.CheckLevels(nil))
  2673  	require.NoError(t, d.Close())
  2674  }
  2675  
  2676  type testLogger struct {
  2677  	t testing.TB
  2678  }
  2679  
  2680  func (l testLogger) Infof(format string, args ...interface{}) {
  2681  	l.t.Logf(format, args...)
  2682  }
  2683  
  2684  func (l testLogger) Fatalf(format string, args ...interface{}) {
  2685  	l.t.Fatalf(format, args...)
  2686  }
  2687  
  2688  // TestIngestMemtableOverlapRace is a regression test for the race described in
  2689  // #2196. If an ingest checks for overlap with the mutable memtable and
  2690  // finds no overlap, it must not allow overlapping keys with later sequence
  2691  // numbers to be applied to the memtable and the memtable to be flushed before
  2692  // the ingest completes.
  2693  //
  2694  // This test operates by committing the same key concurrently:
  2695  //   - 1 goroutine repeatedly ingests the same sstable writing the key `foo`
  2696  //   - n goroutines repeatedly apply batches writing the key `foo` and trigger
  2697  //     flushes.
  2698  //
  2699  // After a while, the database is closed and the manifest is verified. Version
  2700  // edits should contain new files with monotonically increasing sequence
  2701  // numbers, since every flush and every ingest conflicts with one another.
  2702  func TestIngestMemtableOverlapRace(t *testing.T) {
  2703  	mem := vfs.NewMem()
  2704  	el := MakeLoggingEventListener(testLogger{t: t})
  2705  	d, err := Open("", &Options{
  2706  		FS: mem,
  2707  		// Disable automatic compactions to keep the manifest clean; only
  2708  		// flushes and ingests.
  2709  		DisableAutomaticCompactions: true,
  2710  		// Disable the WAL to speed up batch commits.
  2711  		DisableWAL:    true,
  2712  		EventListener: &el,
  2713  		// We're endlessly appending to L0 without clearing it, so set a maximal
  2714  		// stop writes threshold.
  2715  		L0StopWritesThreshold: math.MaxInt,
  2716  		// Accumulating more than 1 immutable memtable doesn't help us exercise
  2717  		// the bug, since the committed keys need to be flushed promptly.
  2718  		MemTableStopWritesThreshold: 2,
  2719  	})
  2720  	require.NoError(t, err)
  2721  
  2722  	// Prepare a sstable `ext` deleting foo.
  2723  	f, err := mem.Create("ext")
  2724  	require.NoError(t, err)
  2725  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2726  	require.NoError(t, w.Delete([]byte("foo")))
  2727  	require.NoError(t, w.Close())
  2728  
  2729  	var done atomic.Bool
  2730  	const numSetters = 2
  2731  	var wg sync.WaitGroup
  2732  	wg.Add(numSetters + 1)
  2733  
  2734  	untilDone := func(fn func()) {
  2735  		defer wg.Done()
  2736  		for !done.Load() {
  2737  			fn()
  2738  		}
  2739  	}
  2740  
  2741  	// Ingest in the background.
  2742  	totalIngests := 0
  2743  	go untilDone(func() {
  2744  		filename := fmt.Sprintf("ext%d", totalIngests)
  2745  		require.NoError(t, mem.Link("ext", filename))
  2746  		require.NoError(t, d.Ingest([]string{filename}))
  2747  		totalIngests++
  2748  	})
  2749  
  2750  	// Apply batches and trigger flushes in the background.
  2751  	wo := &WriteOptions{Sync: false}
  2752  	var localCommits [numSetters]int
  2753  	for i := 0; i < numSetters; i++ {
  2754  		i := i
  2755  		v := []byte(fmt.Sprintf("v%d", i+1))
  2756  		go untilDone(func() {
  2757  			// Commit a batch setting foo=vN.
  2758  			b := d.NewBatch()
  2759  			require.NoError(t, b.Set([]byte("foo"), v, nil))
  2760  			require.NoError(t, b.Commit(wo))
  2761  			localCommits[i]++
  2762  			d.AsyncFlush()
  2763  		})
  2764  	}
  2765  	time.Sleep(100 * time.Millisecond)
  2766  	done.Store(true)
  2767  	wg.Wait()
  2768  
  2769  	var totalCommits int
  2770  	for i := 0; i < numSetters; i++ {
  2771  		totalCommits += localCommits[i]
  2772  	}
  2773  	m := d.Metrics()
  2774  	tot := m.Total()
  2775  	t.Logf("Committed %d batches.", totalCommits)
  2776  	t.Logf("Flushed %d times.", m.Flush.Count)
  2777  	t.Logf("Ingested %d sstables.", tot.TablesIngested)
  2778  	require.NoError(t, d.CheckLevels(nil))
  2779  	require.NoError(t, d.Close())
  2780  
  2781  	// Replay the manifest. Every flush and ingest is a separate version edit.
  2782  	// Since they all write the same key and compactions are disabled, sequence
  2783  	// numbers of new files should be monotonically increasing.
  2784  	//
  2785  	// This check is necessary because most of these sstables are ingested into
  2786  	// L0. The L0 sublevels construction will order them by LargestSeqNum, even
  2787  	// if they're added to L0 out-of-order. The CheckLevels call at the end of
  2788  	// the test may find that the sublevels are all appropriately ordered, but
  2789  	// the manifest may reveal they were added to the LSM out-of-order.
  2790  	dbDesc, err := Peek("", mem)
  2791  	require.NoError(t, err)
  2792  	require.True(t, dbDesc.Exists)
  2793  	f, err = mem.Open(dbDesc.ManifestFilename)
  2794  	require.NoError(t, err)
  2795  	defer f.Close()
  2796  	rr := record.NewReader(f, 0 /* logNum */)
  2797  	var largest *fileMetadata
  2798  	for {
  2799  		r, err := rr.Next()
  2800  		if err == io.EOF || err == record.ErrInvalidChunk {
  2801  			break
  2802  		}
  2803  		require.NoError(t, err)
  2804  		var ve manifest.VersionEdit
  2805  		require.NoError(t, ve.Decode(r))
  2806  		t.Log(ve.String())
  2807  		for _, f := range ve.NewFiles {
  2808  			if largest != nil {
  2809  				require.Equal(t, 0, f.Level)
  2810  				if largest.LargestSeqNum > f.Meta.LargestSeqNum {
  2811  					t.Fatalf("previous largest file %s has sequence number > next file %s", largest, f.Meta)
  2812  				}
  2813  			}
  2814  			largest = f.Meta
  2815  		}
  2816  	}
  2817  }
  2818  
  2819  type ingestCrashFS struct {
  2820  	vfs.FS
  2821  }
  2822  
  2823  func (fs ingestCrashFS) Link(oldname, newname string) error {
  2824  	if err := fs.FS.Link(oldname, newname); err != nil {
  2825  		return err
  2826  	}
  2827  	panic(errorfs.ErrInjected)
  2828  }
  2829  
  2830  type noRemoveFS struct {
  2831  	vfs.FS
  2832  }
  2833  
  2834  func (fs noRemoveFS) Remove(string) error {
  2835  	return errorfs.ErrInjected
  2836  }
  2837  
  2838  func TestIngestFileNumReuseCrash(t *testing.T) {
  2839  	const count = 10
  2840  	// Use an on-disk filesystem, because Ingest with a MemFS will copy, not
  2841  	// link the ingested file.
  2842  	dir, err := os.MkdirTemp("", "ingest-filenum-reuse")
  2843  	require.NoError(t, err)
  2844  	defer os.RemoveAll(dir)
  2845  	fs := vfs.Default
  2846  
  2847  	readFile := func(s string) []byte {
  2848  		f, err := fs.Open(fs.PathJoin(dir, s))
  2849  		require.NoError(t, err)
  2850  		b, err := io.ReadAll(f)
  2851  		require.NoError(t, err)
  2852  		require.NoError(t, f.Close())
  2853  		return b
  2854  	}
  2855  
  2856  	// Create sstables to ingest.
  2857  	var files []string
  2858  	var fileBytes [][]byte
  2859  	for i := 0; i < count; i++ {
  2860  		name := fmt.Sprintf("ext%d", i)
  2861  		f, err := fs.Create(fs.PathJoin(dir, name))
  2862  		require.NoError(t, err)
  2863  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2864  		require.NoError(t, w.Set([]byte(fmt.Sprintf("foo%d", i)), nil))
  2865  		require.NoError(t, w.Close())
  2866  		files = append(files, name)
  2867  		fileBytes = append(fileBytes, readFile(name))
  2868  	}
  2869  
  2870  	// Open a database with a filesystem that will successfully link the
  2871  	// ingested files but then panic. This is an approximation of what a crash
  2872  	// after linking but before updating the manifest would look like.
  2873  	d, err := Open(dir, &Options{
  2874  		FS: ingestCrashFS{FS: fs},
  2875  	})
  2876  	// A flush here ensures that the file number bumps from creating OPTIONS
  2877  	// files, etc. are recorded in the manifest. We want the nextFileNum after
  2878  	// the restart to collide with the number of one of our ingested sstables.
  2879  	require.NoError(t, err)
  2880  	require.NoError(t, d.Set([]byte("boop"), nil, nil))
  2881  	require.NoError(t, d.Flush())
  2882  	for _, f := range files {
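        		// Each ingestion is expected to panic inside ingestCrashFS.Link after
        		// the link has been created; recover the injected error so the loop
        		// can assert that it is the expected one.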
  2883  		func() {
  2884  			defer func() { err = recover().(error) }()
  2885  			err = d.Ingest([]string{fs.PathJoin(dir, f)})
  2886  		}()
  2887  		if err == nil || !errors.Is(err, errorfs.ErrInjected) {
  2888  			t.Fatalf("expected injected error, got %v", err)
  2889  		}
  2890  	}
  2891  	// Leave something in the WAL so that Open will flush while replaying the
  2892  	// WAL.
  2893  	require.NoError(t, d.Set([]byte("wal"), nil, nil))
  2894  	require.NoError(t, d.Close())
  2895  
  2896  	// There are now two links to each external file: the original extX link
  2897  	// and a numbered sstable link. The sstable files are still not a part of
  2898  	// the manifest and so they may be overwritten. Open will detect the
  2899  	// obsolete numbered sstables and try to remove them. The FS here is wrapped
  2900  	// to induce errors on Remove calls. Even if we're unsuccessful in
  2901  	// removing the obsolete files, the external files should not be
  2902  	// overwritten.
  2903  	d, err = Open(dir, &Options{FS: noRemoveFS{FS: fs}})
  2904  	require.NoError(t, err)
  2905  	require.NoError(t, d.Set([]byte("bar"), nil, nil))
  2906  	require.NoError(t, d.Flush())
  2907  	require.NoError(t, d.Close())
  2908  
  2909  	// None of the external files should change despite modifying the linked
  2910  	// versions.
  2911  	for i, f := range files {
  2912  		afterBytes := readFile(f)
  2913  		require.Equal(t, fileBytes[i], afterBytes)
  2914  	}
  2915  }
  2916  
  2917  func TestIngest_UpdateSequenceNumber(t *testing.T) {
  2918  	mem := vfs.NewMem()
  2919  	cmp := base.DefaultComparer.Compare
  2920  	parse := func(input string) (*sstable.Writer, error) {
  2921  		f, err := mem.Create("ext")
  2922  		if err != nil {
  2923  			return nil, err
  2924  		}
  2925  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  2926  			TableFormat: sstable.TableFormatMax,
  2927  		})
  2928  		for _, data := range strings.Split(input, "\n") {
  2929  			if strings.HasPrefix(data, "rangekey: ") {
  2930  				data = strings.TrimPrefix(data, "rangekey: ")
  2931  				s := keyspan.ParseSpan(data)
  2932  				err := rangekey.Encode(&s, w.AddRangeKey)
  2933  				if err != nil {
  2934  					return nil, err
  2935  				}
  2936  				continue
  2937  			}
  2938  			j := strings.Index(data, ":")
  2939  			if j < 0 {
  2940  				return nil, errors.Newf("malformed input: %s\n", data)
  2941  			}
  2942  			key := base.ParseInternalKey(data[:j])
  2943  			value := []byte(data[j+1:])
  2944  			if err := w.Add(key, value); err != nil {
  2945  				return nil, err
  2946  			}
  2947  		}
  2948  		return w, nil
  2949  	}
  2950  
  2951  	var (
  2952  		seqnum uint64
  2953  		err    error
  2954  		metas  []*fileMetadata
  2955  	)
  2956  	datadriven.RunTest(t, "testdata/ingest_update_seqnums", func(t *testing.T, td *datadriven.TestData) string {
  2957  		switch td.Cmd {
  2958  		case "starting-seqnum":
  2959  			seqnum, err = strconv.ParseUint(td.Input, 10, 64)
  2960  			if err != nil {
  2961  				return err.Error()
  2962  			}
  2963  			return ""
  2964  
  2965  		case "reset":
  2966  			metas = metas[:0]
  2967  			return ""
  2968  
  2969  		case "load":
  2970  			w, err := parse(td.Input)
  2971  			if err != nil {
  2972  				return err.Error()
  2973  			}
  2974  			if err = w.Close(); err != nil {
  2975  				return err.Error()
  2976  			}
  2978  
  2979  			// Format the bounds of the table.
  2980  			wm, err := w.Metadata()
  2981  			if err != nil {
  2982  				return err.Error()
  2983  			}
  2984  
  2985  			// Upper bounds for range dels and range keys are expected to be sentinel
  2986  			// keys.
  2987  			maybeUpdateUpperBound := func(key base.InternalKey) base.InternalKey {
  2988  				switch k := key.Kind(); {
  2989  				case k == base.InternalKeyKindRangeDelete:
  2990  					key.Trailer = base.InternalKeyRangeDeleteSentinel
  2991  				case rangekey.IsRangeKey(k):
  2992  					return base.MakeExclusiveSentinelKey(k, key.UserKey)
  2993  				}
  2994  				return key
  2995  			}
  2996  
  2997  			// Construct the file metadata from the writer metadata.
  2998  			m := &fileMetadata{
  2999  				SmallestSeqNum: 0, // Simulate an ingestion.
  3000  				LargestSeqNum:  0,
  3001  			}
  3002  			if wm.HasPointKeys {
  3003  				m.ExtendPointKeyBounds(cmp, wm.SmallestPoint, wm.LargestPoint)
  3004  			}
  3005  			if wm.HasRangeDelKeys {
  3006  				m.ExtendPointKeyBounds(
  3007  					cmp,
  3008  					wm.SmallestRangeDel,
  3009  					maybeUpdateUpperBound(wm.LargestRangeDel),
  3010  				)
  3011  			}
  3012  			if wm.HasRangeKeys {
  3013  				m.ExtendRangeKeyBounds(
  3014  					cmp,
  3015  					wm.SmallestRangeKey,
  3016  					maybeUpdateUpperBound(wm.LargestRangeKey),
  3017  				)
  3018  			}
  3019  			m.InitPhysicalBacking()
  3020  			if err := m.Validate(cmp, base.DefaultFormatter); err != nil {
  3021  				return err.Error()
  3022  			}
  3023  
  3024  			// Collect this file.
  3025  			metas = append(metas, m)
  3026  
  3027  			// Return an index number for the file.
  3028  			return fmt.Sprintf("file %d\n", len(metas)-1)
  3029  
  3030  		case "update-files":
  3031  			// Update the sequence numbers (and hence the bounds) across all files.
  3032  			if err = ingestUpdateSeqNum(cmp, base.DefaultFormatter, seqnum, ingestLoadResult{localMeta: metas}); err != nil {
  3033  				return err.Error()
  3034  			}
  3035  
  3036  			var buf bytes.Buffer
  3037  			for i, m := range metas {
  3038  				fmt.Fprintf(&buf, "file %d:\n", i)
  3039  				fmt.Fprintf(&buf, "  combined: %s-%s\n", m.Smallest, m.Largest)
  3040  				fmt.Fprintf(&buf, "    points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey)
  3041  				fmt.Fprintf(&buf, "    ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey)
  3042  			}
  3043  
  3044  			return buf.String()
  3045  
  3046  		default:
  3047  			return fmt.Sprintf("unknown command %s\n", td.Cmd)
  3048  		}
  3049  	})
  3050  }
  3051  
  3052  func TestIngestCleanup(t *testing.T) {
  3053  	fns := []base.FileNum{0, 1, 2}
  3054  
  3055  	testCases := []struct {
  3056  		closeFiles   []base.FileNum
  3057  		cleanupFiles []base.FileNum
  3058  		wantErr      string
  3059  	}{
  3060  		// Close and remove all files.
  3061  		{
  3062  			closeFiles:   fns,
  3063  			cleanupFiles: fns,
  3064  		},
  3065  		// Remove a non-existent file.
  3066  		{
  3067  			closeFiles:   fns,
  3068  			cleanupFiles: []base.FileNum{3},
  3069  			wantErr:      "unknown to the objstorage provider",
  3070  		},
  3071  		// Remove a file that has not been closed.
  3072  		{
  3073  			closeFiles:   []base.FileNum{0, 2},
  3074  			cleanupFiles: fns,
  3075  			wantErr:      oserror.ErrInvalid.Error(),
  3076  		},
  3077  		// Remove all files, one of which is still open, plus a file that does not exist.
  3078  		{
  3079  			closeFiles:   []base.FileNum{0, 2},
  3080  			cleanupFiles: []base.FileNum{0, 1, 2, 3},
  3081  			wantErr:      oserror.ErrInvalid.Error(), // The first error encountered is due to the open file.
  3082  		},
  3083  	}
  3084  
  3085  	for _, tc := range testCases {
  3086  		t.Run("", func(t *testing.T) {
  3087  			mem := vfs.NewMem()
  3088  			mem.UseWindowsSemantics(true)
  3089  			objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(mem, ""))
  3090  			require.NoError(t, err)
  3091  			defer objProvider.Close()
  3092  
  3093  			// Create the files in the VFS.
  3094  			metaMap := make(map[base.FileNum]objstorage.Writable)
  3095  			for _, fn := range fns {
  3096  				w, _, err := objProvider.Create(context.Background(), base.FileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{})
  3097  				require.NoError(t, err)
  3098  
  3099  				metaMap[fn] = w
  3100  			}
  3101  
  3102  			// Close a select number of files.
  3103  			for _, m := range tc.closeFiles {
  3104  				w, ok := metaMap[m]
  3105  				if !ok {
  3106  					continue
  3107  				}
  3108  				require.NoError(t, w.Finish())
  3109  			}
  3110  
  3111  			// Cleanup the set of files in the FS.
  3112  			var toRemove []*fileMetadata
  3113  			for _, fn := range tc.cleanupFiles {
  3114  				m := &fileMetadata{FileNum: fn}
  3115  				m.InitPhysicalBacking()
  3116  				toRemove = append(toRemove, m)
  3117  			}
  3118  
  3119  			err = ingestCleanup(objProvider, toRemove)
  3120  			if tc.wantErr != "" {
  3121  				require.Error(t, err, "got no error, expected %s", tc.wantErr)
  3122  				require.Contains(t, err.Error(), tc.wantErr)
  3123  			} else {
  3124  				require.NoError(t, err)
  3125  			}
  3126  		})
  3127  	}
  3128  }
  3129  
  3130  // fatalCapturingLogger captures a fatal error instead of panicking.
  3131  type fatalCapturingLogger struct {
  3132  	t   testing.TB
  3133  	err error
  3134  }
  3135  
  3136  // Infof implements the Logger interface.
  3137  func (l *fatalCapturingLogger) Infof(fmt string, args ...interface{}) {
  3138  	l.t.Logf(fmt, args...)
  3139  }
  3140  
  3141  // Fatalf implements the Logger interface.
  3142  func (l *fatalCapturingLogger) Fatalf(_ string, args ...interface{}) {
  3143  	l.err = args[0].(error)
  3144  }
  3145  
  3146  func TestIngestValidation(t *testing.T) {
  3147  	type keyVal struct {
  3148  		key, val []byte
  3149  	}
  3150  	type corruptionLocation int
  3151  	const (
  3152  		corruptionLocationNone corruptionLocation = iota
  3153  		corruptionLocationStart
  3154  		corruptionLocationEnd
  3155  		corruptionLocationInternal
  3156  	)
  3157  	type errLocation int
  3158  	const (
  3159  		errLocationNone errLocation = iota
  3160  		errLocationIngest
  3161  		errLocationValidation
  3162  	)
  3163  	const (
  3164  		nKeys     = 1_000
  3165  		keySize   = 16
  3166  		valSize   = 100
  3167  		blockSize = 100
  3168  
  3169  		ingestTableName = "ext"
  3170  	)
  3171  	ingestPath := filepath.Join(t.TempDir(), ingestTableName)
  3172  
  3173  	seed := uint64(time.Now().UnixNano())
  3174  	rng := rand.New(rand.NewSource(seed))
  3175  	t.Logf("rng seed = %d", seed)
  3176  
  3177  	testCases := []struct {
  3178  		description string
  3179  		cLoc        corruptionLocation
  3180  		wantErrType errLocation
  3181  	}{
  3182  		{
  3183  			description: "no corruption",
  3184  			cLoc:        corruptionLocationNone,
  3185  			wantErrType: errLocationNone,
  3186  		},
  3187  		{
  3188  			description: "start block",
  3189  			cLoc:        corruptionLocationStart,
  3190  			wantErrType: errLocationIngest,
  3191  		},
  3192  		{
  3193  			description: "end block",
  3194  			cLoc:        corruptionLocationEnd,
  3195  			wantErrType: errLocationIngest,
  3196  		},
  3197  		{
  3198  			description: "non-end block",
  3199  			cLoc:        corruptionLocationInternal,
  3200  			wantErrType: errLocationValidation,
  3201  		},
  3202  	}
  3203  
  3204  	for _, tc := range testCases {
  3205  		t.Run(tc.description, func(t *testing.T) {
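        			// The WaitGroup is released by the TableValidated event listener
        			// configured below, which lets the test wait for the asynchronous
        			// post-ingest validation to complete.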
  3206  			var wg sync.WaitGroup
  3207  			wg.Add(1)
  3208  
  3209  			fs := vfs.NewMem()
  3210  			logger := &fatalCapturingLogger{t: t}
  3211  			opts := &Options{
  3212  				FS:     fs,
  3213  				Logger: logger,
  3214  				EventListener: &EventListener{
  3215  					TableValidated: func(i TableValidatedInfo) {
  3216  						wg.Done()
  3217  					},
  3218  				},
  3219  			}
  3220  			opts.Experimental.ValidateOnIngest = true
  3221  			d, err := Open("", opts)
  3222  			require.NoError(t, err)
  3223  			defer func() { require.NoError(t, d.Close()) }()
  3224  
  3225  			corrupt := func(f vfs.File) {
  3226  				readable, err := sstable.NewSimpleReadable(f)
  3227  				require.NoError(t, err)
  3228  				// Compute the layout of the sstable in order to find the
  3229  				// appropriate block locations to corrupt.
  3230  				r, err := sstable.NewReader(readable, sstable.ReaderOptions{})
  3231  				require.NoError(t, err)
  3232  				l, err := r.Layout()
  3233  				require.NoError(t, err)
  3234  				require.NoError(t, r.Close())
  3235  
  3236  				// Select an appropriate data block to corrupt.
  3237  				var blockIdx int
  3238  				switch tc.cLoc {
  3239  				case corruptionLocationStart:
  3240  					blockIdx = 0
  3241  				case corruptionLocationEnd:
  3242  					blockIdx = len(l.Data) - 1
  3243  				case corruptionLocationInternal:
  3244  					blockIdx = 1 + rng.Intn(len(l.Data)-2)
  3245  				default:
  3246  					t.Fatalf("unknown corruptionLocation: %v", tc.cLoc)
  3247  				}
  3248  				bh := l.Data[blockIdx]
  3249  
  3250  				osF, err := os.OpenFile(ingestPath, os.O_RDWR, 0600)
  3251  				require.NoError(t, err)
  3252  				defer func() { require.NoError(t, osF.Close()) }()
  3253  
  3254  				// Corrupting a key will cause the ingestion to fail due to a
  3255  				// malformed key, rather than a block checksum mismatch.
  3256  				// Instead, we corrupt the last byte in the selected block,
  3257  				// before the trailer, which corresponds to a value.
  3258  				offset := bh.Offset + bh.Length - 1
  3259  				_, err = osF.WriteAt([]byte("\xff"), int64(offset))
  3260  				require.NoError(t, err)
  3261  			}
  3262  
  3263  			type errT struct {
  3264  				errLoc errLocation
  3265  				err    error
  3266  			}
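        			// runIngest writes keyVals to a fresh sstable, optionally
        			// corrupts it, ingests it into the DB, and reports where (if
        			// anywhere) an error was observed.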
  3267  			runIngest := func(keyVals []keyVal) (et errT) {
  3268  				// The vfs.File does not allow for random reads and writes.
  3269  				// Create a disk-backed file outside of the DB FS that we can
  3270  				// open as a regular os.File, if required.
  3271  				tmpFS := vfs.Default
  3272  				f, err := tmpFS.Create(ingestPath)
  3273  				require.NoError(t, err)
  3274  				defer func() { _ = tmpFS.Remove(ingestPath) }()
  3275  
  3276  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  3277  					BlockSize:   blockSize,     // Create many smaller blocks.
  3278  					Compression: NoCompression, // For simpler debugging.
  3279  				})
  3280  				for _, kv := range keyVals {
  3281  					require.NoError(t, w.Set(kv.key, kv.val))
  3282  				}
  3283  				require.NoError(t, w.Close())
  3284  
  3285  				// Possibly corrupt the file.
  3286  				if tc.cLoc != corruptionLocationNone {
  3287  					f, err = tmpFS.Open(ingestPath)
  3288  					require.NoError(t, err)
  3289  					corrupt(f)
  3290  				}
  3291  
  3292  				// Copy the file into the DB's FS.
  3293  				_, err = vfs.Clone(tmpFS, fs, ingestPath, ingestTableName)
  3294  				require.NoError(t, err)
  3295  
  3296  				// Ingest the external table.
  3297  				err = d.Ingest([]string{ingestTableName})
  3298  				if err != nil {
  3299  					et.errLoc = errLocationIngest
  3300  					et.err = err
  3301  					return
  3302  				}
  3303  
  3304  				// Wait for validation of the sstable to complete.
  3305  				wg.Wait()
  3306  
  3307  				// Return any error encountered during validation.
  3308  				if logger.err != nil {
  3309  					et.errLoc = errLocationValidation
  3310  					et.err = logger.err
  3311  				}
  3312  
  3313  				return
  3314  			}
  3315  
  3316  			// Construct a set of keys to ingest.
  3317  			var keyVals []keyVal
  3318  			for i := 0; i < nKeys; i++ {
  3319  				key := make([]byte, keySize)
  3320  				_, err = rng.Read(key)
  3321  				require.NoError(t, err)
  3322  
  3323  				val := make([]byte, valSize)
  3324  				_, err = rng.Read(val)
  3325  				require.NoError(t, err)
  3326  
  3327  				keyVals = append(keyVals, keyVal{key, val})
  3328  			}
  3329  
  3330  			// Keys must be sorted; the sstable writer requires strictly increasing keys.
  3331  			sort.Slice(keyVals, func(i, j int) bool {
  3332  				return d.cmp(keyVals[i].key, keyVals[j].key) < 0
  3333  			})
  3334  
  3335  			// Run the ingestion.
  3336  			et := runIngest(keyVals)
  3337  
  3338  			// Assert we saw the errors we expect.
  3339  			switch tc.wantErrType {
  3340  			case errLocationNone:
  3341  				require.Equal(t, errLocationNone, et.errLoc)
  3342  				require.NoError(t, et.err)
  3343  			case errLocationIngest:
  3344  				require.Equal(t, errLocationIngest, et.errLoc)
  3345  				require.Error(t, et.err)
  3346  				require.True(t, errors.Is(et.err, base.ErrCorruption))
  3347  			case errLocationValidation:
  3348  				require.Equal(t, errLocationValidation, et.errLoc)
  3349  				require.Error(t, et.err)
  3350  				require.True(t, errors.Is(et.err, base.ErrCorruption))
  3351  			default:
  3352  				t.Fatalf("unknown wantErrType: %v", tc.wantErrType)
  3353  			}
  3354  		})
  3355  	}
  3356  }
  3357  
  3358  // BenchmarkManySSTables measures the cost of various operations with various
  3359  // counts of SSTables within the database.
  3360  func BenchmarkManySSTables(b *testing.B) {
  3361  	counts := []int{10, 1_000, 10_000, 100_000, 1_000_000}
  3362  	ops := []string{"ingest", "calculateInuseKeyRanges"}
  3363  	for _, op := range ops {
  3364  		b.Run(op, func(b *testing.B) {
  3365  			for _, count := range counts {
  3366  				b.Run(fmt.Sprintf("sstables=%d", count), func(b *testing.B) {
  3367  					mem := vfs.NewMem()
  3368  					d, err := Open("", &Options{
  3369  						FS: mem,
  3370  					})
  3371  					require.NoError(b, err)
  3372  
  3373  					var paths []string
  3374  					for i := 0; i < count; i++ {
  3375  						n := fmt.Sprintf("%07d", i)
  3376  						f, err := mem.Create(n)
  3377  						require.NoError(b, err)
  3378  						w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3379  						require.NoError(b, w.Set([]byte(n), nil))
  3380  						require.NoError(b, w.Close())
  3381  						paths = append(paths, n)
  3382  					}
  3383  					require.NoError(b, d.Ingest(paths))
  3384  
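        					// Ingest a single table spanning the key range of every
        					// table ingested above. It overlaps all of them, so it is
        					// placed in a level above the many small tables (L5 over
        					// L6).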
  3385  					{
  3386  						const broadIngest = "broad.sst"
  3387  						f, err := mem.Create(broadIngest)
  3388  						require.NoError(b, err)
  3389  						w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3390  						require.NoError(b, w.Set([]byte("0"), nil))
  3391  						require.NoError(b, w.Set([]byte("Z"), nil))
  3392  						require.NoError(b, w.Close())
  3393  						require.NoError(b, d.Ingest([]string{broadIngest}))
  3394  					}
  3395  
  3396  					switch op {
  3397  					case "ingest":
  3398  						runBenchmarkManySSTablesIngest(b, d, mem, count)
  3399  					case "calculateInuseKeyRanges":
  3400  						runBenchmarkManySSTablesInUseKeyRanges(b, d, count)
  3401  					}
  3402  					require.NoError(b, d.Close())
  3403  				})
  3404  			}
  3405  		})
  3406  	}
  3407  }
  3408  
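        // runBenchmarkManySSTablesIngest measures the cost of ingesting one
        // additional single-key sstable into a database that already contains
        // count tables.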
  3409  func runBenchmarkManySSTablesIngest(b *testing.B, d *DB, fs vfs.FS, count int) {
  3410  	b.ResetTimer()
  3411  	for i := 0; i < b.N; i++ {
  3412  		n := fmt.Sprintf("%07d", count+i)
  3413  		f, err := fs.Create(n)
  3414  		require.NoError(b, err)
  3415  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3416  		require.NoError(b, w.Set([]byte(n), nil))
  3417  		require.NoError(b, w.Close())
  3418  		require.NoError(b, d.Ingest([]string{n}))
  3419  	}
  3420  }
  3421  
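        // runBenchmarkManySSTablesInUseKeyRanges measures calculateInuseKeyRanges
        // across all levels over the full range of ingested keys.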
  3422  func runBenchmarkManySSTablesInUseKeyRanges(b *testing.B, d *DB, count int) {
  3423  	// This benchmark is pretty contrived, but it's not easy to write a
  3424  	// microbenchmark for this in a more natural way. L6 has many files, and
  3425  	// L5 has a single file spanning the entire breadth of the keyspace.
  3426  	d.mu.Lock()
  3427  	defer d.mu.Unlock()
  3428  	v := d.mu.versions.currentVersion()
  3429  	b.ResetTimer()
  3430  
  3431  	smallest := []byte("0")
  3432  	largest := []byte("z")
  3433  	for i := 0; i < b.N; i++ {
  3434  		_ = calculateInuseKeyRanges(v, d.cmp, 0, numLevels-1, smallest, largest)
  3435  	}
  3436  }