github.com/cockroachdb/pebble@v1.1.2/ingest_test.go

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  	"os"
    14  	"path/filepath"
    15  	"runtime"
    16  	"sort"
    17  	"strconv"
    18  	"strings"
    19  	"sync"
    20  	"sync/atomic"
    21  	"testing"
    22  	"time"
    23  
    24  	"github.com/cockroachdb/datadriven"
    25  	"github.com/cockroachdb/errors"
    26  	"github.com/cockroachdb/errors/oserror"
    27  	"github.com/cockroachdb/pebble/internal/base"
    28  	"github.com/cockroachdb/pebble/internal/keyspan"
    29  	"github.com/cockroachdb/pebble/internal/manifest"
    30  	"github.com/cockroachdb/pebble/internal/rangekey"
    31  	"github.com/cockroachdb/pebble/internal/testkeys"
    32  	"github.com/cockroachdb/pebble/objstorage"
    33  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
    34  	"github.com/cockroachdb/pebble/objstorage/remote"
    35  	"github.com/cockroachdb/pebble/record"
    36  	"github.com/cockroachdb/pebble/sstable"
    37  	"github.com/cockroachdb/pebble/vfs"
    38  	"github.com/cockroachdb/pebble/vfs/errorfs"
    39  	"github.com/kr/pretty"
    40  	"github.com/stretchr/testify/require"
    41  	"golang.org/x/exp/rand"
    42  )
    43  
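         // TestSSTableKeyCompare runs datadriven cases from
         // testdata/sstable_key_compare. Each "cmp" input line holds two internal
         // keys in base.ParseInternalKey syntax (illustratively, something like
         // "a.SET.1 b.SET.2"); the test prints each pair joined by <, =, or >
         // according to sstableKeyCompare.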
    44  func TestSSTableKeyCompare(t *testing.T) {
    45  	var buf bytes.Buffer
    46  	datadriven.RunTest(t, "testdata/sstable_key_compare", func(t *testing.T, td *datadriven.TestData) string {
    47  		switch td.Cmd {
    48  		case "cmp":
    49  			buf.Reset()
    50  			for _, line := range strings.Split(td.Input, "\n") {
    51  				fields := strings.Fields(line)
    52  				a := base.ParseInternalKey(fields[0])
    53  				b := base.ParseInternalKey(fields[1])
    54  				got := sstableKeyCompare(testkeys.Comparer.Compare, a, b)
    55  				fmt.Fprintf(&buf, "%38s", fmt.Sprint(a.Pretty(base.DefaultFormatter)))
    56  				switch got {
    57  				case -1:
    58  					fmt.Fprint(&buf, " < ")
    59  				case +1:
    60  					fmt.Fprint(&buf, " > ")
    61  				case 0:
    62  					fmt.Fprint(&buf, " = ")
    63  				}
    64  				fmt.Fprintf(&buf, "%s\n", fmt.Sprint(b.Pretty(base.DefaultFormatter)))
    65  			}
    66  			return buf.String()
    67  		default:
    68  			return fmt.Sprintf("unrecognized command %q", td.Cmd)
    69  		}
    70  	})
    71  }
    72  
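         // TestIngestLoad builds a single sstable named "ext" from the datadriven
         // input (point keys as "<internal key>:<value>" lines, range keys prefixed
         // with "rangekey: "), runs ingestLoad on it, and prints the resulting
         // table, point-key, and range-key bounds. The optional "writer-version"
         // and "db-version" arguments pick the table format used by the writer and
         // the FormatMajorVersion passed to ingestLoad.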
    73  func TestIngestLoad(t *testing.T) {
    74  	mem := vfs.NewMem()
    75  
    76  	datadriven.RunTest(t, "testdata/ingest_load", func(t *testing.T, td *datadriven.TestData) string {
    77  		switch td.Cmd {
    78  		case "load":
    79  			writerOpts := sstable.WriterOptions{}
    80  			var dbVersion FormatMajorVersion
    81  			for _, cmdArgs := range td.CmdArgs {
    82  				v, err := strconv.Atoi(cmdArgs.Vals[0])
    83  				if err != nil {
    84  					return err.Error()
    85  				}
    86  				switch k := cmdArgs.Key; k {
    87  				case "writer-version":
    88  					fmv := FormatMajorVersion(v)
    89  					writerOpts.TableFormat = fmv.MaxTableFormat()
    90  				case "db-version":
    91  					dbVersion = FormatMajorVersion(v)
    92  				default:
    93  					return fmt.Sprintf("unknown cmd %s\n", k)
    94  				}
    95  			}
    96  			f, err := mem.Create("ext")
    97  			if err != nil {
    98  				return err.Error()
    99  			}
   100  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writerOpts)
   101  			for _, data := range strings.Split(td.Input, "\n") {
   102  				if strings.HasPrefix(data, "rangekey: ") {
   103  					data = strings.TrimPrefix(data, "rangekey: ")
   104  					s := keyspan.ParseSpan(data)
   105  					err := rangekey.Encode(&s, w.AddRangeKey)
   106  					if err != nil {
   107  						return err.Error()
   108  					}
   109  					continue
   110  				}
   111  
   112  				j := strings.Index(data, ":")
   113  				if j < 0 {
   114  					return fmt.Sprintf("malformed input: %s\n", data)
   115  				}
   116  				key := base.ParseInternalKey(data[:j])
   117  				value := []byte(data[j+1:])
   118  				if err := w.Add(key, value); err != nil {
   119  					return err.Error()
   120  				}
   121  			}
   122  			if err := w.Close(); err != nil {
   123  				return err.Error()
   124  			}
   125  
   126  			opts := (&Options{
   127  				Comparer: DefaultComparer,
   128  				FS:       mem,
   129  			}).WithFSDefaults()
   130  			lr, err := ingestLoad(opts, dbVersion, []string{"ext"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0)
   131  			if err != nil {
   132  				return err.Error()
   133  			}
   134  			var buf bytes.Buffer
   135  			for _, m := range lr.localMeta {
   136  				fmt.Fprintf(&buf, "%d: %s-%s\n", m.FileNum, m.Smallest, m.Largest)
   137  				fmt.Fprintf(&buf, "  points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey)
   138  				fmt.Fprintf(&buf, "  ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey)
   139  			}
   140  			return buf.String()
   141  
   142  		default:
   143  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   144  		}
   145  	})
   146  }
   147  
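         // TestIngestLoadRand builds a random number of sstables with random point
         // keys, records the expected file metadata (bounds, entry counts, sizes)
         // while writing them, and then asserts that ingestLoad reconstructs the
         // same metadata from the files alone.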
   148  func TestIngestLoadRand(t *testing.T) {
   149  	mem := vfs.NewMem()
   150  	rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
   151  	cmp := DefaultComparer.Compare
   152  	version := internalFormatNewest
   153  
   154  	randBytes := func(size int) []byte {
   155  		data := make([]byte, size)
   156  		for i := range data {
   157  			data[i] = byte(rng.Int() & 0xff)
   158  		}
   159  		return data
   160  	}
   161  
   162  	paths := make([]string, 1+rng.Intn(10))
   163  	pending := make([]base.DiskFileNum, len(paths))
   164  	expected := make([]*fileMetadata, len(paths))
   165  	for i := range paths {
   166  		paths[i] = fmt.Sprint(i)
   167  		pending[i] = base.FileNum(rng.Uint64()).DiskFileNum()
   168  		expected[i] = &fileMetadata{
   169  			FileNum: pending[i].FileNum(),
   170  		}
   171  		expected[i].StatsMarkValid()
   172  
   173  		func() {
   174  			f, err := mem.Create(paths[i])
   175  			require.NoError(t, err)
   176  
   177  			keys := make([]InternalKey, 1+rng.Intn(100))
   178  			for i := range keys {
   179  				keys[i] = base.MakeInternalKey(
   180  					randBytes(1+rng.Intn(10)),
   181  					0,
   182  					InternalKeyKindSet)
   183  			}
   184  			sort.Slice(keys, func(i, j int) bool {
   185  				return base.InternalCompare(cmp, keys[i], keys[j]) < 0
   186  			})
   187  
   188  			expected[i].ExtendPointKeyBounds(cmp, keys[0], keys[len(keys)-1])
   189  
   190  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
   191  				TableFormat: version.MaxTableFormat(),
   192  			})
   193  			var count uint64
   194  			for i := range keys {
   195  				if i > 0 && base.InternalCompare(cmp, keys[i-1], keys[i]) == 0 {
   196  					// Duplicate key, ignore.
   197  					continue
   198  				}
   199  				w.Add(keys[i], nil)
   200  				count++
   201  			}
   202  			expected[i].Stats.NumEntries = count
   203  			require.NoError(t, w.Close())
   204  
   205  			meta, err := w.Metadata()
   206  			require.NoError(t, err)
   207  
   208  			expected[i].Size = meta.Size
   209  			expected[i].InitPhysicalBacking()
   210  		}()
   211  	}
   212  
   213  	opts := (&Options{
   214  		Comparer: DefaultComparer,
   215  		FS:       mem,
   216  	}).WithFSDefaults()
   217  	lr, err := ingestLoad(opts, version, paths, nil, nil, 0, pending, nil, 0)
   218  	require.NoError(t, err)
   219  
   220  	for _, m := range lr.localMeta {
   221  		m.CreationTime = 0
   222  	}
   223  	t.Log(strings.Join(pretty.Diff(expected, lr.localMeta), "\n"))
   224  	require.Equal(t, expected, lr.localMeta)
   225  }
   226  
   227  func TestIngestLoadInvalid(t *testing.T) {
   228  	mem := vfs.NewMem()
   229  	f, err := mem.Create("invalid")
   230  	require.NoError(t, err)
   231  	require.NoError(t, f.Close())
   232  
   233  	opts := (&Options{
   234  		Comparer: DefaultComparer,
   235  		FS:       mem,
   236  	}).WithFSDefaults()
   237  	if _, err := ingestLoad(opts, internalFormatNewest, []string{"invalid"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0); err == nil {
   238  		t.Fatalf("expected error, but found success")
   239  	}
   240  }
   241  
   242  func TestIngestSortAndVerify(t *testing.T) {
   243  	comparers := map[string]Compare{
   244  		"default": DefaultComparer.Compare,
   245  		"reverse": func(a, b []byte) int {
   246  			return DefaultComparer.Compare(b, a)
   247  		},
   248  	}
   249  
   250  	t.Run("", func(t *testing.T) {
   251  		datadriven.RunTest(t, "testdata/ingest_sort_and_verify", func(t *testing.T, d *datadriven.TestData) string {
   252  			switch d.Cmd {
   253  			case "ingest":
   254  				var buf bytes.Buffer
   255  				var meta []*fileMetadata
   256  				var paths []string
   257  				var cmpName string
   258  				d.ScanArgs(t, "cmp", &cmpName)
   259  				cmp := comparers[cmpName]
   260  				if cmp == nil {
   261  					return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, cmpName)
   262  				}
   263  				for i, data := range strings.Split(d.Input, "\n") {
   264  					parts := strings.Split(data, "-")
   265  					if len(parts) != 2 {
   266  						return fmt.Sprintf("malformed test case: %s", d.Input)
   267  					}
   268  					smallest := base.ParseInternalKey(parts[0])
   269  					largest := base.ParseInternalKey(parts[1])
   270  					if cmp(smallest.UserKey, largest.UserKey) > 0 {
   271  						return fmt.Sprintf("range %v-%v is not valid", smallest, largest)
   272  					}
   273  					m := (&fileMetadata{}).ExtendPointKeyBounds(cmp, smallest, largest)
   274  					m.InitPhysicalBacking()
   275  					meta = append(meta, m)
   276  					paths = append(paths, strconv.Itoa(i))
   277  				}
   278  				lr := ingestLoadResult{localPaths: paths, localMeta: meta}
   279  				err := ingestSortAndVerify(cmp, lr, KeyRange{})
   280  				if err != nil {
   281  					return fmt.Sprintf("%v\n", err)
   282  				}
   283  				for i := range meta {
   284  					fmt.Fprintf(&buf, "%s: %v-%v\n", paths[i], meta[i].Smallest, meta[i].Largest)
   285  				}
   286  				return buf.String()
   287  
   288  			default:
   289  				return fmt.Sprintf("unknown command: %s", d.Cmd)
   290  			}
   291  		})
   292  	})
   293  }
   294  
   295  func TestIngestLink(t *testing.T) {
   296  	// Test linking of tables into the DB directory. Test cleanup when one of the
   297  	// tables cannot be linked.
   298  
   299  	const dir = "db"
   300  	const count = 10
   301  	for i := 0; i <= count; i++ {
   302  		t.Run("", func(t *testing.T) {
   303  			opts := &Options{FS: vfs.NewMem()}
   304  			opts.EnsureDefaults().WithFSDefaults()
   305  			require.NoError(t, opts.FS.MkdirAll(dir, 0755))
   306  			objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(opts.FS, dir))
   307  			require.NoError(t, err)
   308  			defer objProvider.Close()
   309  
   310  			paths := make([]string, 10)
   311  			meta := make([]*fileMetadata, len(paths))
   312  			contents := make([][]byte, len(paths))
   313  			for j := range paths {
   314  				paths[j] = fmt.Sprintf("external%d", j)
   315  				meta[j] = &fileMetadata{}
   316  				meta[j].FileNum = FileNum(j)
   317  				meta[j].InitPhysicalBacking()
   318  				f, err := opts.FS.Create(paths[j])
   319  				require.NoError(t, err)
   320  
   321  				contents[j] = []byte(fmt.Sprintf("data%d", j))
   322  				// memFile.Write will modify the supplied buffer when invariants are
   323  				// enabled, so provide a throw-away copy.
   324  				_, err = f.Write(append([]byte(nil), contents[j]...))
   325  				require.NoError(t, err)
   326  				require.NoError(t, f.Close())
   327  			}
   328  
   329  			if i < count {
   330  				opts.FS.Remove(paths[i])
   331  			}
   332  
   333  			lr := ingestLoadResult{localMeta: meta, localPaths: paths}
   334  			err = ingestLink(0 /* jobID */, opts, objProvider, lr, nil /* shared */)
   335  			if i < count {
   336  				if err == nil {
   337  					t.Fatalf("expected error, but found success")
   338  				}
   339  			} else {
   340  				require.NoError(t, err)
   341  			}
   342  
   343  			files, err := opts.FS.List(dir)
   344  			require.NoError(t, err)
   345  
   346  			sort.Strings(files)
   347  
   348  			if i < count {
   349  				if len(files) > 0 {
   350  					t.Fatalf("expected all of the files to be cleaned up, but found:\n%s",
   351  						strings.Join(files, "\n"))
   352  				}
   353  			} else {
   354  				if len(files) != count {
   355  					t.Fatalf("expected %d files, but found:\n%s", count, strings.Join(files, "\n"))
   356  				}
   357  				for j := range files {
   358  					ftype, fileNum, ok := base.ParseFilename(opts.FS, files[j])
   359  					if !ok {
   360  						t.Fatalf("unable to parse filename: %s", files[j])
   361  					}
   362  					if fileTypeTable != ftype {
   363  						t.Fatalf("expected table, but found %d", ftype)
   364  					}
   365  					if j != int(fileNum.FileNum()) {
   366  						t.Fatalf("expected table %d, but found %d", j, fileNum)
   367  					}
   368  					f, err := opts.FS.Open(opts.FS.PathJoin(dir, files[j]))
   369  					require.NoError(t, err)
   370  
   371  					data, err := io.ReadAll(f)
   372  					require.NoError(t, err)
   373  					require.NoError(t, f.Close())
   374  					if !bytes.Equal(contents[j], data) {
   375  						t.Fatalf("expected %s, but found %s", contents[j], data)
   376  					}
   377  				}
   378  			}
   379  		})
   380  	}
   381  }
   382  
   383  func TestIngestLinkFallback(t *testing.T) {
   384  	// Verify that ingestLink succeeds if linking fails by falling back to
   385  	// copying.
   386  	mem := vfs.NewMem()
   387  	src, err := mem.Create("source")
   388  	require.NoError(t, err)
   389  
   390  	opts := &Options{FS: errorfs.Wrap(mem, errorfs.OnIndex(1))}
   391  	opts.EnsureDefaults().WithFSDefaults()
   392  	objSettings := objstorageprovider.DefaultSettings(opts.FS, "")
   393  	// Prevent the provider from listing the dir (where we may get an injected error).
   394  	objSettings.FSDirInitialListing = []string{}
   395  	objProvider, err := objstorageprovider.Open(objSettings)
   396  	require.NoError(t, err)
   397  	defer objProvider.Close()
   398  
   399  	meta := []*fileMetadata{{FileNum: 1}}
   400  	meta[0].InitPhysicalBacking()
   401  	lr := ingestLoadResult{localMeta: meta, localPaths: []string{"source"}}
   402  	err = ingestLink(0, opts, objProvider, lr, nil /* shared */)
   403  	require.NoError(t, err)
   404  
   405  	dest, err := mem.Open("000001.sst")
   406  	require.NoError(t, err)
   407  
   408  	// We should be able to write bytes to src, and not have them show up in
   409  	// dest.
   410  	_, _ = src.Write([]byte("test"))
   411  	data, err := io.ReadAll(dest)
   412  	require.NoError(t, err)
   413  	if len(data) != 0 {
   414  		t.Fatalf("expected copy, but files appear to be hard linked: [%s] unexpectedly found", data)
   415  	}
   416  }
   417  
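         // TestOverlappingIngestedSSTs is a datadriven test
         // (testdata/flushable_ingest) for ingests that overlap the memtable and
         // are therefore queued as flushable ingests. It can simulate crashes by
         // running on a strict MemFS, ignoring syncs, and resetting to the synced
         // state before reopening the DB.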
   418  func TestOverlappingIngestedSSTs(t *testing.T) {
   419  	dir := ""
   420  	var (
   421  		mem        vfs.FS
   422  		d          *DB
   423  		opts       *Options
   424  		closed     = false
   425  		blockFlush = false
   426  	)
   427  	defer func() {
   428  		if !closed {
   429  			require.NoError(t, d.Close())
   430  		}
   431  	}()
   432  
   433  	reset := func(strictMem bool) {
   434  		if d != nil && !closed {
   435  			require.NoError(t, d.Close())
   436  		}
   437  		blockFlush = false
   438  
   439  		if strictMem {
   440  			mem = vfs.NewStrictMem()
   441  		} else {
   442  			mem = vfs.NewMem()
   443  		}
   444  
   445  		require.NoError(t, mem.MkdirAll("ext", 0755))
   446  		opts = (&Options{
   447  			FS:                          mem,
   448  			MemTableStopWritesThreshold: 4,
   449  			L0CompactionThreshold:       100,
   450  			L0StopWritesThreshold:       100,
   451  			DebugCheck:                  DebugCheckLevels,
   452  			FormatMajorVersion:          internalFormatNewest,
   453  		}).WithFSDefaults()
   454  		// Disable automatic compactions because otherwise we'll race with
   455  		// delete-only compactions triggered by ingesting range tombstones.
   456  		opts.DisableAutomaticCompactions = true
   457  
   458  		var err error
   459  		d, err = Open(dir, opts)
   460  		require.NoError(t, err)
   461  		d.TestOnlyWaitForCleaning()
   462  	}
   463  	waitForFlush := func() {
   464  		if d == nil {
   465  			return
   466  		}
   467  		d.mu.Lock()
   468  		for d.mu.compact.flushing {
   469  			d.mu.compact.cond.Wait()
   470  		}
   471  		d.mu.Unlock()
   472  	}
   473  	reset(false)
   474  
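         	// The blockFlush/allowFlush commands below toggle d.mu.compact.flushing
         	// directly so the DB behaves as if a flush were already in progress;
         	// this lets the test exercise ingests that get queued as flushable
         	// ingests instead of completing immediately.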
   475  	datadriven.RunTest(t, "testdata/flushable_ingest", func(t *testing.T, td *datadriven.TestData) string {
   476  		switch td.Cmd {
   477  		case "reset":
   478  			reset(td.HasArg("strictMem"))
   479  			return ""
   480  
   481  		case "ignoreSyncs":
   482  			var ignoreSyncs bool
   483  			if len(td.CmdArgs) == 1 && td.CmdArgs[0].String() == "true" {
   484  				ignoreSyncs = true
   485  			}
   486  			mem.(*vfs.MemFS).SetIgnoreSyncs(ignoreSyncs)
   487  			return ""
   488  
   489  		case "resetToSynced":
   490  			mem.(*vfs.MemFS).ResetToSyncedState()
   491  			files, err := mem.List(dir)
   492  			sort.Strings(files)
   493  			require.NoError(t, err)
   494  			return strings.Join(files, "\n")
   495  
   496  		case "batch":
   497  			b := d.NewIndexedBatch()
   498  			if err := runBatchDefineCmd(td, b); err != nil {
   499  				return err.Error()
   500  			}
   501  			if err := b.Commit(nil); err != nil {
   502  				return err.Error()
   503  			}
   504  			return ""
   505  
   506  		case "build":
   507  			if err := runBuildCmd(td, d, mem); err != nil {
   508  				return err.Error()
   509  			}
   510  			return ""
   511  
   512  		case "ingest":
   513  			if err := runIngestCmd(td, d, mem); err != nil {
   514  				return err.Error()
   515  			}
   516  			if !blockFlush {
   517  				waitForFlush()
   518  			}
   519  			return ""
   520  
   521  		case "iter":
   522  			iter, _ := d.NewIter(nil)
   523  			return runIterCmd(td, iter, true)
   524  
   525  		case "lsm":
   526  			return runLSMCmd(td, d)
   527  
   528  		case "close":
   529  			if closed {
   530  				return "already closed"
   531  			}
   532  			require.NoError(t, d.Close())
   533  			closed = true
   534  			return ""
   535  
   536  		case "ls":
   537  			files, err := mem.List(dir)
   538  			sort.Strings(files)
   539  			require.NoError(t, err)
   540  			return strings.Join(files, "\n")
   541  
   542  		case "open":
   543  			opts.ReadOnly = td.HasArg("readOnly")
   544  			var err error
   545  			d, err = Open(dir, opts)
   546  			closed = false
   547  			require.NoError(t, err)
   548  			waitForFlush()
   549  			d.TestOnlyWaitForCleaning()
   550  			return ""
   551  
   552  		case "blockFlush":
   553  			blockFlush = true
   554  			d.mu.Lock()
   555  			d.mu.compact.flushing = true
   556  			d.mu.Unlock()
   557  			return ""
   558  
   559  		case "allowFlush":
   560  			blockFlush = false
   561  			d.mu.Lock()
   562  			d.mu.compact.flushing = false
   563  			d.mu.Unlock()
   564  			return ""
   565  
   566  		case "flush":
   567  			d.maybeScheduleFlush()
   568  			waitForFlush()
   569  			d.TestOnlyWaitForCleaning()
   570  			return ""
   571  
   572  		case "get":
   573  			return runGetCmd(t, td, d)
   574  
   575  		default:
   576  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   577  		}
   578  	})
   579  }
   580  
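         // TestExcise is a datadriven test (testdata/excise) for excising key
         // ranges out of existing sstables. Besides the usual batch/build/ingest
         // commands it supports "excise", which dry-runs d.excise over every
         // overlapping file and prints the version edit it would produce, and
         // "ingest-and-excise", which applies an excise as part of an ingestion.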
   581  func TestExcise(t *testing.T) {
   582  	var mem vfs.FS
   583  	var d *DB
   584  	var flushed bool
   585  	defer func() {
   586  		require.NoError(t, d.Close())
   587  	}()
   588  
   589  	var opts *Options
   590  	reset := func() {
   591  		if d != nil {
   592  			require.NoError(t, d.Close())
   593  		}
   594  
   595  		mem = vfs.NewMem()
   596  		require.NoError(t, mem.MkdirAll("ext", 0755))
   597  		opts = &Options{
   598  			FS:                    mem,
   599  			L0CompactionThreshold: 100,
   600  			L0StopWritesThreshold: 100,
   601  			DebugCheck:            DebugCheckLevels,
   602  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
   603  				flushed = true
   604  			}},
   605  			FormatMajorVersion: FormatVirtualSSTables,
   606  			Comparer:           testkeys.Comparer,
   607  		}
   608  		// Disable automatic compactions because otherwise we'll race with
   609  		// delete-only compactions triggered by ingesting range tombstones.
   610  		opts.DisableAutomaticCompactions = true
   611  		// Set this to true to add some testing for the virtual sstable validation
   612  		// code paths.
   613  		opts.Experimental.ValidateOnIngest = true
   614  
   615  		var err error
   616  		d, err = Open("", opts)
   617  		require.NoError(t, err)
   618  	}
   619  	reset()
   620  
   621  	datadriven.RunTest(t, "testdata/excise", func(t *testing.T, td *datadriven.TestData) string {
   622  		switch td.Cmd {
   623  		case "reset":
   624  			reset()
   625  			return ""
   626  		case "reopen":
   627  			require.NoError(t, d.Close())
   628  			var err error
   629  			d, err = Open("", opts)
   630  			require.NoError(t, err)
   631  
   632  			return ""
   633  		case "batch":
   634  			b := d.NewIndexedBatch()
   635  			if err := runBatchDefineCmd(td, b); err != nil {
   636  				return err.Error()
   637  			}
   638  			if err := b.Commit(nil); err != nil {
   639  				return err.Error()
   640  			}
   641  			return ""
   642  		case "build":
   643  			if err := runBuildCmd(td, d, mem); err != nil {
   644  				return err.Error()
   645  			}
   646  			return ""
   647  
   648  		case "flush":
   649  			if err := d.Flush(); err != nil {
   650  				return err.Error()
   651  			}
   652  			return ""
   653  
   654  		case "ingest":
   655  			flushed = false
   656  			if err := runIngestCmd(td, d, mem); err != nil {
   657  				return err.Error()
   658  			}
   659  			// Wait for a possible flush.
   660  			d.mu.Lock()
   661  			for d.mu.compact.flushing {
   662  				d.mu.compact.cond.Wait()
   663  			}
   664  			d.mu.Unlock()
   665  			if flushed {
   666  				return "memtable flushed"
   667  			}
   668  			return ""
   669  
   670  		case "ingest-and-excise":
   671  			flushed = false
   672  			if err := runIngestAndExciseCmd(td, d, mem); err != nil {
   673  				return err.Error()
   674  			}
   675  			// Wait for a possible flush.
   676  			d.mu.Lock()
   677  			for d.mu.compact.flushing {
   678  				d.mu.compact.cond.Wait()
   679  			}
   680  			d.mu.Unlock()
   681  			if flushed {
   682  				return "memtable flushed"
   683  			}
   684  			return ""
   685  
   686  		case "get":
   687  			return runGetCmd(t, td, d)
   688  
   689  		case "iter":
   690  			iter, _ := d.NewIter(&IterOptions{
   691  				KeyTypes: IterKeyTypePointsAndRanges,
   692  			})
   693  			return runIterCmd(td, iter, true)
   694  
   695  		case "lsm":
   696  			return runLSMCmd(td, d)
   697  
   698  		case "metrics":
   699  			// The asynchronous loading of table stats can change metrics, so
   700  			// wait for all the tables' stats to be loaded.
   701  			d.mu.Lock()
   702  			d.waitTableStats()
   703  			d.mu.Unlock()
   704  
   705  			return d.Metrics().StringForTests()
   706  
   707  		case "wait-pending-table-stats":
   708  			return runTableStatsCmd(td, d)
   709  
   710  		case "excise":
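         			// This is a dry run: d.excise records in ve the files it would
         			// delete (and any virtual files it would create), but the version
         			// edit is never logged or applied. The command only reports what
         			// would happen; use ingest-and-excise to actually excise.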
   711  			ve := &versionEdit{
   712  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
   713  			}
   714  			var exciseSpan KeyRange
   715  			if len(td.CmdArgs) != 2 {
    716  				panic("insufficient args for excise command")
   717  			}
   718  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
   719  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
   720  
   721  			d.mu.Lock()
   722  			d.mu.versions.logLock()
   723  			d.mu.Unlock()
   724  			current := d.mu.versions.currentVersion()
   725  			for level := range current.Levels {
   726  				iter := current.Levels[level].Iter()
   727  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
   728  					_, err := d.excise(exciseSpan, m, ve, level)
   729  					if err != nil {
   730  						d.mu.Lock()
   731  						d.mu.versions.logUnlock()
   732  						d.mu.Unlock()
   733  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
   734  					}
   735  				}
   736  			}
   737  			d.mu.Lock()
   738  			d.mu.versions.logUnlock()
   739  			d.mu.Unlock()
   740  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.DebugString(base.DefaultFormatter))
   741  
   742  		case "confirm-backing":
   743  			// Confirms that the files have the same FileBacking.
   744  			fileNums := make(map[base.FileNum]struct{})
   745  			for i := range td.CmdArgs {
   746  				fNum, err := strconv.Atoi(td.CmdArgs[i].Key)
   747  				if err != nil {
   748  					panic("invalid file number")
   749  				}
   750  				fileNums[base.FileNum(fNum)] = struct{}{}
   751  			}
   752  			d.mu.Lock()
   753  			currVersion := d.mu.versions.currentVersion()
   754  			var ptr *manifest.FileBacking
   755  			for _, level := range currVersion.Levels {
   756  				lIter := level.Iter()
   757  				for f := lIter.First(); f != nil; f = lIter.Next() {
   758  					if _, ok := fileNums[f.FileNum]; ok {
   759  						if ptr == nil {
   760  							ptr = f.FileBacking
   761  							continue
   762  						}
   763  						if f.FileBacking != ptr {
   764  							d.mu.Unlock()
   765  							return "file backings are not the same"
   766  						}
   767  					}
   768  				}
   769  			}
   770  			d.mu.Unlock()
   771  			return "file backings are the same"
   772  		case "compact":
   773  			if len(td.CmdArgs) != 2 {
   774  				panic("insufficient args for compact command")
   775  			}
   776  			l := td.CmdArgs[0].Key
   777  			r := td.CmdArgs[1].Key
   778  			err := d.Compact([]byte(l), []byte(r), false)
   779  			if err != nil {
   780  				return err.Error()
   781  			}
   782  			return ""
   783  		default:
   784  			return fmt.Sprintf("unknown command: %s", td.Cmd)
   785  		}
   786  	})
   787  }
   788  
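         // testIngestSharedImpl is the shared implementation behind
         // TestIngestShared. It runs a datadriven test against two DBs (d1 and d2)
         // that write to the same in-memory remote.Storage, so sstables created by
         // one instance can be replicated into the other via ScanInternal plus
         // IngestAndExcise.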
   789  func testIngestSharedImpl(
   790  	t *testing.T, createOnShared remote.CreateOnSharedStrategy, fileName string,
   791  ) {
   792  	var d, d1, d2 *DB
   793  	var efos map[string]*EventuallyFileOnlySnapshot
   794  	defer func() {
   795  		for _, e := range efos {
   796  			require.NoError(t, e.Close())
   797  		}
   798  		if d1 != nil {
   799  			require.NoError(t, d1.Close())
   800  		}
   801  		if d2 != nil {
   802  			require.NoError(t, d2.Close())
   803  		}
   804  	}()
   805  	creatorIDCounter := uint64(1)
   806  	replicateCounter := 1
   807  
   808  	reset := func() {
   809  		for _, e := range efos {
   810  			require.NoError(t, e.Close())
   811  		}
   812  		if d1 != nil {
   813  			require.NoError(t, d1.Close())
   814  		}
   815  		if d2 != nil {
   816  			require.NoError(t, d2.Close())
   817  		}
   818  		efos = make(map[string]*EventuallyFileOnlySnapshot)
   819  
   820  		sstorage := remote.NewInMem()
   821  		mem1 := vfs.NewMem()
   822  		mem2 := vfs.NewMem()
   823  		require.NoError(t, mem1.MkdirAll("ext", 0755))
   824  		require.NoError(t, mem2.MkdirAll("ext", 0755))
   825  		opts1 := &Options{
   826  			Comparer:              testkeys.Comparer,
   827  			FS:                    mem1,
   828  			LBaseMaxBytes:         1,
   829  			L0CompactionThreshold: 100,
   830  			L0StopWritesThreshold: 100,
   831  			DebugCheck:            DebugCheckLevels,
   832  			FormatMajorVersion:    FormatVirtualSSTables,
   833  		}
    834  		// Attach a logging event listener to make test failures easier to debug.
   835  		lel := MakeLoggingEventListener(DefaultLogger)
   836  		opts1.EventListener = &lel
   837  		opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
   838  			"": sstorage,
   839  		})
   840  		opts1.Experimental.CreateOnShared = createOnShared
   841  		opts1.Experimental.CreateOnSharedLocator = ""
   842  		// Disable automatic compactions because otherwise we'll race with
   843  		// delete-only compactions triggered by ingesting range tombstones.
   844  		opts1.DisableAutomaticCompactions = true
   845  
   846  		opts2 := &Options{}
   847  		*opts2 = *opts1
   848  		opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
   849  			"": sstorage,
   850  		})
   851  		opts2.Experimental.CreateOnShared = createOnShared
   852  		opts2.Experimental.CreateOnSharedLocator = ""
   853  		opts2.FS = mem2
   854  
   855  		var err error
   856  		d1, err = Open("", opts1)
   857  		require.NoError(t, err)
   858  		require.NoError(t, d1.SetCreatorID(creatorIDCounter))
   859  		creatorIDCounter++
   860  		d2, err = Open("", opts2)
   861  		require.NoError(t, err)
   862  		require.NoError(t, d2.SetCreatorID(creatorIDCounter))
   863  		creatorIDCounter++
   864  		d = d1
   865  	}
   866  	reset()
   867  
   868  	datadriven.RunTest(t, fmt.Sprintf("testdata/%s", fileName), func(t *testing.T, td *datadriven.TestData) string {
   869  		switch td.Cmd {
   870  		case "reset":
   871  			reset()
   872  			return ""
   873  		case "switch":
   874  			if len(td.CmdArgs) != 1 {
   875  				return "usage: switch <1 or 2>"
   876  			}
   877  			switch td.CmdArgs[0].Key {
   878  			case "1":
   879  				d = d1
   880  			case "2":
   881  				d = d2
   882  			default:
   883  				return "usage: switch <1 or 2>"
   884  			}
   885  			return "ok"
   886  		case "batch":
   887  			b := d.NewIndexedBatch()
   888  			if err := runBatchDefineCmd(td, b); err != nil {
   889  				return err.Error()
   890  			}
   891  			if err := b.Commit(nil); err != nil {
   892  				return err.Error()
   893  			}
   894  			return ""
   895  		case "build":
   896  			if err := runBuildCmd(td, d, d.opts.FS); err != nil {
   897  				return err.Error()
   898  			}
   899  			return ""
   900  
   901  		case "flush":
   902  			if err := d.Flush(); err != nil {
   903  				return err.Error()
   904  			}
   905  			return ""
   906  
   907  		case "ingest":
   908  			if err := runIngestCmd(td, d, d.opts.FS); err != nil {
   909  				return err.Error()
   910  			}
   911  			// Wait for a possible flush.
   912  			d.mu.Lock()
   913  			for d.mu.compact.flushing {
   914  				d.mu.compact.cond.Wait()
   915  			}
   916  			d.mu.Unlock()
   917  			return ""
   918  
   919  		case "ingest-and-excise":
   920  			if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil {
   921  				return err.Error()
   922  			}
   923  			// Wait for a possible flush.
   924  			d.mu.Lock()
   925  			for d.mu.compact.flushing {
   926  				d.mu.compact.cond.Wait()
   927  			}
   928  			d.mu.Unlock()
   929  			return ""
   930  
   931  		case "replicate":
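         			// For example, `replicate 1 2 a f` scans everything in [a, f) out
         			// of d1 (local keys into a new sstable, shared sstables as
         			// SharedSSTMeta references) and ingest-and-excises it into d2.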
   932  			if len(td.CmdArgs) != 4 {
   933  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   934  			}
   935  			var from, to *DB
   936  			switch td.CmdArgs[0].Key {
   937  			case "1":
   938  				from = d1
   939  			case "2":
   940  				from = d2
   941  			default:
   942  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   943  			}
   944  			switch td.CmdArgs[1].Key {
   945  			case "1":
   946  				to = d1
   947  			case "2":
   948  				to = d2
   949  			default:
   950  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
   951  			}
   952  			startKey := []byte(td.CmdArgs[2].Key)
   953  			endKey := []byte(td.CmdArgs[3].Key)
   954  
   955  			writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat())
   956  			sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter)
   957  			f, err := to.opts.FS.Create(sstPath)
   958  			require.NoError(t, err)
   959  			replicateCounter++
   960  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts)
   961  
   962  			var sharedSSTs []SharedSSTMeta
   963  			err = from.ScanInternal(context.TODO(), startKey, endKey,
   964  				func(key *InternalKey, value LazyValue, _ IteratorLevel) error {
   965  					val, _, err := value.Value(nil)
   966  					require.NoError(t, err)
   967  					require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val))
   968  					return nil
   969  				},
   970  				func(start, end []byte, seqNum uint64) error {
   971  					require.NoError(t, w.DeleteRange(start, end))
   972  					return nil
   973  				},
   974  				func(start, end []byte, keys []keyspan.Key) error {
   975  					s := keyspan.Span{
   976  						Start:     start,
   977  						End:       end,
   978  						Keys:      keys,
   979  						KeysOrder: 0,
   980  					}
   981  					require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error {
   982  						return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v)
   983  					}))
   984  					return nil
   985  				},
   986  				func(sst *SharedSSTMeta) error {
   987  					sharedSSTs = append(sharedSSTs, *sst)
   988  					return nil
   989  				},
   990  			)
   991  			require.NoError(t, err)
   992  			require.NoError(t, w.Close())
   993  
   994  			_, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey})
   995  			require.NoError(t, err)
   996  			return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs))
   997  
   998  		case "get":
   999  			return runGetCmd(t, td, d)
  1000  
  1001  		case "iter":
  1002  			o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges}
  1003  			var reader Reader
  1004  			reader = d
  1005  			for _, arg := range td.CmdArgs {
  1006  				switch arg.Key {
  1007  				case "mask-suffix":
  1008  					o.RangeKeyMasking.Suffix = []byte(arg.Vals[0])
  1009  				case "mask-filter":
  1010  					o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask {
  1011  						return sstable.NewTestKeysMaskingFilter()
  1012  					}
  1013  				case "snapshot":
  1014  					reader = efos[arg.Vals[0]]
  1015  				}
  1016  			}
  1017  			iter, err := reader.NewIter(o)
  1018  			if err != nil {
  1019  				return err.Error()
  1020  			}
  1021  			return runIterCmd(td, iter, true)
  1022  
  1023  		case "lsm":
  1024  			return runLSMCmd(td, d)
  1025  
  1026  		case "metrics":
  1027  			// The asynchronous loading of table stats can change metrics, so
  1028  			// wait for all the tables' stats to be loaded.
  1029  			d.mu.Lock()
  1030  			d.waitTableStats()
  1031  			d.mu.Unlock()
  1032  
  1033  			return d.Metrics().StringForTests()
  1034  
  1035  		case "wait-pending-table-stats":
  1036  			return runTableStatsCmd(td, d)
  1037  
  1038  		case "excise":
  1039  			ve := &versionEdit{
  1040  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  1041  			}
  1042  			var exciseSpan KeyRange
  1043  			if len(td.CmdArgs) != 2 {
  1044  				panic("insufficient args for excise command")
  1045  			}
  1046  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
  1047  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
  1048  
  1049  			d.mu.Lock()
  1050  			d.mu.versions.logLock()
  1051  			d.mu.Unlock()
  1052  			current := d.mu.versions.currentVersion()
  1053  			for level := range current.Levels {
  1054  				iter := current.Levels[level].Iter()
  1055  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
  1056  					_, err := d.excise(exciseSpan, m, ve, level)
  1057  					if err != nil {
  1058  						d.mu.Lock()
  1059  						d.mu.versions.logUnlock()
  1060  						d.mu.Unlock()
  1061  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
  1062  					}
  1063  				}
  1064  			}
  1065  			d.mu.Lock()
  1066  			d.mu.versions.logUnlock()
  1067  			d.mu.Unlock()
  1068  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String())
  1069  
  1070  		case "file-only-snapshot":
  1071  			if len(td.CmdArgs) != 1 {
  1072  				panic("insufficient args for file-only-snapshot command")
  1073  			}
  1074  			name := td.CmdArgs[0].Key
  1075  			var keyRanges []KeyRange
  1076  			for _, line := range strings.Split(td.Input, "\n") {
  1077  				fields := strings.Fields(line)
  1078  				if len(fields) != 2 {
  1079  					return "expected two fields for file-only snapshot KeyRanges"
  1080  				}
  1081  				kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])}
  1082  				keyRanges = append(keyRanges, kr)
  1083  			}
  1084  
  1085  			s := d.NewEventuallyFileOnlySnapshot(keyRanges)
  1086  			efos[name] = s
  1087  			return "ok"
  1088  
  1089  		case "wait-for-file-only-snapshot":
  1090  			if len(td.CmdArgs) != 1 {
   1091  				panic("insufficient args for wait-for-file-only-snapshot command")
  1092  			}
  1093  			name := td.CmdArgs[0].Key
  1094  			err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond)
  1095  			if err != nil {
  1096  				return err.Error()
  1097  			}
  1098  			return "ok"
  1099  
  1100  		case "compact":
  1101  			err := runCompactCmd(td, d)
  1102  			if err != nil {
  1103  				return err.Error()
  1104  			}
  1105  			return "ok"
  1106  		default:
  1107  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1108  		}
  1109  	})
  1110  }
  1111  
  1112  func TestIngestShared(t *testing.T) {
  1113  	for _, strategy := range []remote.CreateOnSharedStrategy{remote.CreateOnSharedAll, remote.CreateOnSharedLower} {
  1114  		strategyStr := "all"
  1115  		if strategy == remote.CreateOnSharedLower {
  1116  			strategyStr = "lower"
  1117  		}
  1118  		t.Run(fmt.Sprintf("createOnShared=%s", strategyStr), func(t *testing.T) {
  1119  			fileName := "ingest_shared"
  1120  			if strategy == remote.CreateOnSharedLower {
  1121  				fileName = "ingest_shared_lower"
  1122  			}
  1123  			testIngestSharedImpl(t, strategy, fileName)
  1124  		})
  1125  	}
  1126  }
  1127  
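         // TestSimpleIngestShared hand-crafts a remote sstable through a second
         // objstorage provider, then hands its backing to IngestAndExcise as a
         // SharedSSTMeta to verify that a shared file created elsewhere can be
         // ingested into this DB instance.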
  1128  func TestSimpleIngestShared(t *testing.T) {
  1129  	mem := vfs.NewMem()
  1130  	var d *DB
  1131  	var provider2 objstorage.Provider
  1132  	opts2 := Options{FS: vfs.NewMem(), FormatMajorVersion: FormatVirtualSSTables}
  1133  	opts2.EnsureDefaults()
  1134  
  1135  	// Create an objProvider where we will fake-create some sstables that can
  1136  	// then be shared back to the db instance.
  1137  	providerSettings := objstorageprovider.Settings{
  1138  		Logger:              opts2.Logger,
  1139  		FS:                  opts2.FS,
  1140  		FSDirName:           "",
  1141  		FSDirInitialListing: nil,
  1142  		FSCleaner:           opts2.Cleaner,
  1143  		NoSyncOnClose:       opts2.NoSyncOnClose,
  1144  		BytesPerSync:        opts2.BytesPerSync,
  1145  	}
  1146  	providerSettings.Remote.StorageFactory = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1147  		"": remote.NewInMem(),
  1148  	})
  1149  	providerSettings.Remote.CreateOnShared = remote.CreateOnSharedAll
  1150  	providerSettings.Remote.CreateOnSharedLocator = ""
  1151  
  1152  	provider2, err := objstorageprovider.Open(providerSettings)
  1153  	require.NoError(t, err)
  1154  	creatorIDCounter := uint64(1)
  1155  	provider2.SetCreatorID(objstorage.CreatorID(creatorIDCounter))
  1156  	creatorIDCounter++
  1157  
  1158  	defer func() {
  1159  		require.NoError(t, d.Close())
  1160  	}()
  1161  
  1162  	reset := func() {
  1163  		if d != nil {
  1164  			require.NoError(t, d.Close())
  1165  		}
  1166  
  1167  		mem = vfs.NewMem()
  1168  		require.NoError(t, mem.MkdirAll("ext", 0755))
  1169  		opts := &Options{
  1170  			FormatMajorVersion:    FormatVirtualSSTables,
  1171  			FS:                    mem,
  1172  			L0CompactionThreshold: 100,
  1173  			L0StopWritesThreshold: 100,
  1174  		}
  1175  		opts.Experimental.RemoteStorage = providerSettings.Remote.StorageFactory
  1176  		opts.Experimental.CreateOnShared = providerSettings.Remote.CreateOnShared
  1177  		opts.Experimental.CreateOnSharedLocator = providerSettings.Remote.CreateOnSharedLocator
  1178  
  1179  		var err error
  1180  		d, err = Open("", opts)
  1181  		require.NoError(t, err)
  1182  		require.NoError(t, d.SetCreatorID(creatorIDCounter))
  1183  		creatorIDCounter++
  1184  	}
  1185  	reset()
  1186  
  1187  	metaMap := map[base.DiskFileNum]objstorage.ObjectMetadata{}
  1188  
  1189  	require.NoError(t, d.Set([]byte("d"), []byte("unexpected"), nil))
  1190  	require.NoError(t, d.Set([]byte("e"), []byte("unexpected"), nil))
  1191  	require.NoError(t, d.Set([]byte("a"), []byte("unexpected"), nil))
  1192  	require.NoError(t, d.Set([]byte("f"), []byte("unexpected"), nil))
  1193  	d.Flush()
  1194  
  1195  	{
  1196  		// Create a shared file.
  1197  		fn := base.FileNum(2)
  1198  		f, meta, err := provider2.Create(context.TODO(), fileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{PreferSharedStorage: true})
  1199  		require.NoError(t, err)
  1200  		w := sstable.NewWriter(f, d.opts.MakeWriterOptions(0, d.opts.FormatMajorVersion.MaxTableFormat()))
  1201  		w.Set([]byte("d"), []byte("shared"))
  1202  		w.Set([]byte("e"), []byte("shared"))
  1203  		w.Close()
  1204  		metaMap[fn.DiskFileNum()] = meta
  1205  	}
  1206  
  1207  	m := metaMap[base.FileNum(2).DiskFileNum()]
  1208  	handle, err := provider2.RemoteObjectBacking(&m)
  1209  	require.NoError(t, err)
  1210  	size, err := provider2.Size(m)
  1211  	require.NoError(t, err)
  1212  
  1213  	sharedSSTMeta := SharedSSTMeta{
  1214  		Backing:          handle,
  1215  		Smallest:         base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet),
  1216  		Largest:          base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet),
  1217  		SmallestPointKey: base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet),
  1218  		LargestPointKey:  base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet),
  1219  		Level:            6,
  1220  		Size:             uint64(size + 5),
  1221  	}
  1222  	_, err = d.IngestAndExcise([]string{}, []SharedSSTMeta{sharedSSTMeta}, KeyRange{Start: []byte("d"), End: []byte("ee")})
  1223  	require.NoError(t, err)
  1224  
  1225  	// TODO(bilal): Once reading of shared sstables is in, verify that the values
  1226  	// of d and e have been updated.
  1227  }
  1228  
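         // blockedCompaction coordinates with TestConcurrentExcise: the compaction
         // signals startBlock once it has created its first output table and then
         // parks until something is sent on unblock.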
  1229  type blockedCompaction struct {
  1230  	startBlock, unblock chan struct{}
  1231  }
  1232  
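         // TestConcurrentExcise (testdata/concurrent_excise) checks how excises
         // interact with in-flight compactions. Compactions started with a
         // block=<name> argument are paused at their first TableCreated event via
         // blockedCompaction, allowing an ingest-and-excise to race with them
         // before they are released with the unblock command.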
  1233  func TestConcurrentExcise(t *testing.T) {
  1234  	var d, d1, d2 *DB
  1235  	var efos map[string]*EventuallyFileOnlySnapshot
  1236  	backgroundErrs := make(chan error, 5)
  1237  	var compactions map[string]*blockedCompaction
  1238  	defer func() {
  1239  		for _, e := range efos {
  1240  			require.NoError(t, e.Close())
  1241  		}
  1242  		if d1 != nil {
  1243  			require.NoError(t, d1.Close())
  1244  		}
  1245  		if d2 != nil {
  1246  			require.NoError(t, d2.Close())
  1247  		}
  1248  	}()
  1249  	creatorIDCounter := uint64(1)
  1250  	replicateCounter := 1
  1251  
  1252  	var wg sync.WaitGroup
  1253  	defer wg.Wait()
  1254  	var blockNextCompaction bool
  1255  	var blockedJobID int
  1256  	var blockedCompactionName string
  1257  	var blockedCompactionsMu sync.Mutex // protects the above three variables.
  1258  
  1259  	reset := func() {
  1260  		wg.Wait()
  1261  		for _, e := range efos {
  1262  			require.NoError(t, e.Close())
  1263  		}
  1264  		if d1 != nil {
  1265  			require.NoError(t, d1.Close())
  1266  		}
  1267  		if d2 != nil {
  1268  			require.NoError(t, d2.Close())
  1269  		}
  1270  		efos = make(map[string]*EventuallyFileOnlySnapshot)
  1271  		compactions = make(map[string]*blockedCompaction)
  1272  		backgroundErrs = make(chan error, 5)
  1273  
  1274  		var el EventListener
  1275  		el.EnsureDefaults(testLogger{t: t})
  1276  		el.FlushBegin = func(info FlushInfo) {
   1277  			// Don't block flushes.
  1278  		}
  1279  		el.BackgroundError = func(err error) {
  1280  			backgroundErrs <- err
  1281  		}
  1282  		el.CompactionBegin = func(info CompactionInfo) {
  1283  			if info.Reason == "move" {
  1284  				return
  1285  			}
  1286  			blockedCompactionsMu.Lock()
  1287  			defer blockedCompactionsMu.Unlock()
  1288  			if blockNextCompaction {
  1289  				blockNextCompaction = false
  1290  				blockedJobID = info.JobID
  1291  			}
  1292  		}
  1293  		el.TableCreated = func(info TableCreateInfo) {
  1294  			blockedCompactionsMu.Lock()
  1295  			if info.JobID != blockedJobID {
  1296  				blockedCompactionsMu.Unlock()
  1297  				return
  1298  			}
  1299  			blockedJobID = 0
  1300  			c := compactions[blockedCompactionName]
  1301  			blockedCompactionName = ""
  1302  			blockedCompactionsMu.Unlock()
  1303  			c.startBlock <- struct{}{}
  1304  			<-c.unblock
  1305  		}
  1306  
  1307  		sstorage := remote.NewInMem()
  1308  		mem1 := vfs.NewMem()
  1309  		mem2 := vfs.NewMem()
  1310  		require.NoError(t, mem1.MkdirAll("ext", 0755))
  1311  		require.NoError(t, mem2.MkdirAll("ext", 0755))
  1312  		opts1 := &Options{
  1313  			Comparer:              testkeys.Comparer,
  1314  			LBaseMaxBytes:         1,
  1315  			FS:                    mem1,
  1316  			L0CompactionThreshold: 100,
  1317  			L0StopWritesThreshold: 100,
  1318  			DebugCheck:            DebugCheckLevels,
  1319  			FormatMajorVersion:    FormatVirtualSSTables,
  1320  		}
   1321  		// Tee a logging event listener together with the compaction-blocking listener above.
  1322  		lel := MakeLoggingEventListener(DefaultLogger)
  1323  		tel := TeeEventListener(lel, el)
  1324  		opts1.EventListener = &tel
  1325  		opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1326  			"": sstorage,
  1327  		})
  1328  		opts1.Experimental.CreateOnShared = remote.CreateOnSharedAll
  1329  		opts1.Experimental.CreateOnSharedLocator = ""
  1330  		// Disable automatic compactions because otherwise we'll race with
  1331  		// delete-only compactions triggered by ingesting range tombstones.
  1332  		opts1.DisableAutomaticCompactions = true
  1333  
  1334  		opts2 := &Options{}
  1335  		*opts2 = *opts1
  1336  		opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1337  			"": sstorage,
  1338  		})
  1339  		opts2.Experimental.CreateOnShared = remote.CreateOnSharedAll
  1340  		opts2.Experimental.CreateOnSharedLocator = ""
  1341  		opts2.FS = mem2
  1342  
  1343  		var err error
  1344  		d1, err = Open("", opts1)
  1345  		require.NoError(t, err)
  1346  		require.NoError(t, d1.SetCreatorID(creatorIDCounter))
  1347  		creatorIDCounter++
  1348  		d2, err = Open("", opts2)
  1349  		require.NoError(t, err)
  1350  		require.NoError(t, d2.SetCreatorID(creatorIDCounter))
  1351  		creatorIDCounter++
  1352  		d = d1
  1353  	}
  1354  	reset()
  1355  
  1356  	datadriven.RunTest(t, "testdata/concurrent_excise", func(t *testing.T, td *datadriven.TestData) string {
  1357  		switch td.Cmd {
  1358  		case "reset":
  1359  			reset()
  1360  			return ""
  1361  		case "switch":
  1362  			if len(td.CmdArgs) != 1 {
  1363  				return "usage: switch <1 or 2>"
  1364  			}
  1365  			switch td.CmdArgs[0].Key {
  1366  			case "1":
  1367  				d = d1
  1368  			case "2":
  1369  				d = d2
  1370  			default:
  1371  				return "usage: switch <1 or 2>"
  1372  			}
  1373  			return "ok"
  1374  		case "batch":
  1375  			b := d.NewIndexedBatch()
  1376  			if err := runBatchDefineCmd(td, b); err != nil {
  1377  				return err.Error()
  1378  			}
  1379  			if err := b.Commit(nil); err != nil {
  1380  				return err.Error()
  1381  			}
  1382  			return ""
  1383  		case "build":
  1384  			if err := runBuildCmd(td, d, d.opts.FS); err != nil {
  1385  				return err.Error()
  1386  			}
  1387  			return ""
  1388  
  1389  		case "flush":
  1390  			if err := d.Flush(); err != nil {
  1391  				return err.Error()
  1392  			}
  1393  			return ""
  1394  
  1395  		case "ingest":
  1396  			if err := runIngestCmd(td, d, d.opts.FS); err != nil {
  1397  				return err.Error()
  1398  			}
  1399  			// Wait for a possible flush.
  1400  			d.mu.Lock()
  1401  			for d.mu.compact.flushing {
  1402  				d.mu.compact.cond.Wait()
  1403  			}
  1404  			d.mu.Unlock()
  1405  			return ""
  1406  
  1407  		case "ingest-and-excise":
  1408  			if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil {
  1409  				return err.Error()
  1410  			}
  1411  			// Wait for a possible flush.
  1412  			d.mu.Lock()
  1413  			for d.mu.compact.flushing {
  1414  				d.mu.compact.cond.Wait()
  1415  			}
  1416  			d.mu.Unlock()
  1417  			return ""
  1418  
  1419  		case "replicate":
  1420  			if len(td.CmdArgs) != 4 {
  1421  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1422  			}
  1423  			var from, to *DB
  1424  			switch td.CmdArgs[0].Key {
  1425  			case "1":
  1426  				from = d1
  1427  			case "2":
  1428  				from = d2
  1429  			default:
  1430  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1431  			}
  1432  			switch td.CmdArgs[1].Key {
  1433  			case "1":
  1434  				to = d1
  1435  			case "2":
  1436  				to = d2
  1437  			default:
  1438  				return "usage: replicate <from-db-num> <to-db-num> <start-key> <end-key>"
  1439  			}
  1440  			startKey := []byte(td.CmdArgs[2].Key)
  1441  			endKey := []byte(td.CmdArgs[3].Key)
  1442  
  1443  			writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat())
  1444  			sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter)
  1445  			f, err := to.opts.FS.Create(sstPath)
  1446  			require.NoError(t, err)
  1447  			replicateCounter++
  1448  			w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts)
  1449  
  1450  			var sharedSSTs []SharedSSTMeta
  1451  			err = from.ScanInternal(context.TODO(), startKey, endKey,
  1452  				func(key *InternalKey, value LazyValue, _ IteratorLevel) error {
  1453  					val, _, err := value.Value(nil)
  1454  					require.NoError(t, err)
  1455  					require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val))
  1456  					return nil
  1457  				},
  1458  				func(start, end []byte, seqNum uint64) error {
  1459  					require.NoError(t, w.DeleteRange(start, end))
  1460  					return nil
  1461  				},
  1462  				func(start, end []byte, keys []keyspan.Key) error {
  1463  					s := keyspan.Span{
  1464  						Start:     start,
  1465  						End:       end,
  1466  						Keys:      keys,
  1467  						KeysOrder: 0,
  1468  					}
  1469  					require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error {
  1470  						return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v)
  1471  					}))
  1472  					return nil
  1473  				},
  1474  				func(sst *SharedSSTMeta) error {
  1475  					sharedSSTs = append(sharedSSTs, *sst)
  1476  					return nil
  1477  				},
  1478  			)
  1479  			require.NoError(t, err)
  1480  			require.NoError(t, w.Close())
  1481  
  1482  			_, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey})
  1483  			require.NoError(t, err)
  1484  			return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs))
  1485  
  1486  		case "get":
  1487  			return runGetCmd(t, td, d)
  1488  
  1489  		case "iter":
  1490  			o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges}
  1491  			var reader Reader
  1492  			reader = d
  1493  			for _, arg := range td.CmdArgs {
  1494  				switch arg.Key {
  1495  				case "mask-suffix":
  1496  					o.RangeKeyMasking.Suffix = []byte(arg.Vals[0])
  1497  				case "mask-filter":
  1498  					o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask {
  1499  						return sstable.NewTestKeysMaskingFilter()
  1500  					}
  1501  				case "snapshot":
  1502  					reader = efos[arg.Vals[0]]
  1503  				}
  1504  			}
  1505  			iter, err := reader.NewIter(o)
  1506  			if err != nil {
  1507  				return err.Error()
  1508  			}
  1509  			return runIterCmd(td, iter, true)
  1510  
  1511  		case "lsm":
  1512  			return runLSMCmd(td, d)
  1513  
  1514  		case "metrics":
  1515  			// The asynchronous loading of table stats can change metrics, so
  1516  			// wait for all the tables' stats to be loaded.
  1517  			d.mu.Lock()
  1518  			d.waitTableStats()
  1519  			d.mu.Unlock()
  1520  
  1521  			return d.Metrics().StringForTests()
  1522  
  1523  		case "wait-pending-table-stats":
  1524  			return runTableStatsCmd(td, d)
  1525  
  1526  		case "excise":
  1527  			ve := &versionEdit{
  1528  				DeletedFiles: map[deletedFileEntry]*fileMetadata{},
  1529  			}
  1530  			var exciseSpan KeyRange
  1531  			if len(td.CmdArgs) != 2 {
  1532  				panic("insufficient args for excise command")
  1533  			}
  1534  			exciseSpan.Start = []byte(td.CmdArgs[0].Key)
  1535  			exciseSpan.End = []byte(td.CmdArgs[1].Key)
  1536  
  1537  			d.mu.Lock()
  1538  			d.mu.versions.logLock()
  1539  			d.mu.Unlock()
  1540  			current := d.mu.versions.currentVersion()
  1541  			for level := range current.Levels {
  1542  				iter := current.Levels[level].Iter()
  1543  				for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() {
  1544  					_, err := d.excise(exciseSpan, m, ve, level)
  1545  					if err != nil {
  1546  						d.mu.Lock()
  1547  						d.mu.versions.logUnlock()
  1548  						d.mu.Unlock()
  1549  						return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error())
  1550  					}
  1551  				}
  1552  			}
  1553  			d.mu.Lock()
  1554  			d.mu.versions.logUnlock()
  1555  			d.mu.Unlock()
  1556  			return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String())
  1557  
  1558  		case "file-only-snapshot":
  1559  			if len(td.CmdArgs) != 1 {
  1560  				panic("insufficient args for file-only-snapshot command")
  1561  			}
  1562  			name := td.CmdArgs[0].Key
  1563  			var keyRanges []KeyRange
  1564  			for _, line := range strings.Split(td.Input, "\n") {
  1565  				fields := strings.Fields(line)
  1566  				if len(fields) != 2 {
  1567  					return "expected two fields for file-only snapshot KeyRanges"
  1568  				}
  1569  				kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])}
  1570  				keyRanges = append(keyRanges, kr)
  1571  			}
  1572  
  1573  			s := d.NewEventuallyFileOnlySnapshot(keyRanges)
  1574  			efos[name] = s
  1575  			return "ok"
  1576  
  1577  		case "wait-for-file-only-snapshot":
  1578  			if len(td.CmdArgs) != 1 {
   1579  				panic("insufficient args for wait-for-file-only-snapshot command")
  1580  			}
  1581  			name := td.CmdArgs[0].Key
  1582  			err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond)
  1583  			if err != nil {
  1584  				return err.Error()
  1585  			}
  1586  			return "ok"
  1587  
  1588  		case "unblock":
  1589  			name := td.CmdArgs[0].Key
  1590  			blockedCompactionsMu.Lock()
  1591  			c := compactions[name]
  1592  			delete(compactions, name)
  1593  			blockedCompactionsMu.Unlock()
  1594  			c.unblock <- struct{}{}
  1595  			return "ok"
  1596  
  1597  		case "compact":
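         			// With a block=<name> argument the compaction runs in a separate
         			// goroutine and parks (via blockedCompaction) after creating its
         			// first output table; it stays parked until `unblock <name>`.
         			// Without the argument the compaction runs synchronously.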
  1598  			async := false
  1599  			var otherArgs []datadriven.CmdArg
  1600  			var bc *blockedCompaction
  1601  			for i := range td.CmdArgs {
  1602  				switch td.CmdArgs[i].Key {
  1603  				case "block":
  1604  					name := td.CmdArgs[i].Vals[0]
  1605  					bc = &blockedCompaction{startBlock: make(chan struct{}), unblock: make(chan struct{})}
  1606  					blockedCompactionsMu.Lock()
  1607  					compactions[name] = bc
  1608  					blockNextCompaction = true
  1609  					blockedCompactionName = name
  1610  					blockedCompactionsMu.Unlock()
  1611  					async = true
  1612  				default:
  1613  					otherArgs = append(otherArgs, td.CmdArgs[i])
  1614  				}
  1615  			}
  1616  			var tdClone datadriven.TestData
  1617  			tdClone = *td
  1618  			tdClone.CmdArgs = otherArgs
  1619  			if !async {
  1620  				err := runCompactCmd(td, d)
  1621  				if err != nil {
  1622  					return err.Error()
  1623  				}
  1624  			} else {
  1625  				wg.Add(1)
  1626  				go func() {
  1627  					defer wg.Done()
  1628  					_ = runCompactCmd(&tdClone, d)
  1629  				}()
  1630  				<-bc.startBlock
  1631  				return "spun off in separate goroutine"
  1632  			}
  1633  			return "ok"
  1634  		case "wait-for-background-error":
  1635  			err := <-backgroundErrs
  1636  			return err.Error()
  1637  		default:
  1638  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1639  		}
  1640  	})
  1641  }
  1642  
  1643  func TestIngestExternal(t *testing.T) {
  1644  	var mem vfs.FS
  1645  	var d *DB
  1646  	var flushed bool
  1647  	defer func() {
  1648  		require.NoError(t, d.Close())
  1649  	}()
  1650  
  1651  	var remoteStorage remote.Storage
  1652  
  1653  	reset := func() {
  1654  		if d != nil {
  1655  			require.NoError(t, d.Close())
  1656  		}
  1657  
  1658  		mem = vfs.NewMem()
  1659  		require.NoError(t, mem.MkdirAll("ext", 0755))
  1660  		remoteStorage = remote.NewInMem()
  1661  		opts := &Options{
  1662  			FS:                    mem,
  1663  			L0CompactionThreshold: 100,
  1664  			L0StopWritesThreshold: 100,
  1665  			DebugCheck:            DebugCheckLevels,
  1666  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
  1667  				flushed = true
  1668  			}},
  1669  			FormatMajorVersion: FormatVirtualSSTables,
  1670  		}
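        		// Back the "external-locator" locator with an in-memory remote.Storage;
        		// build-remote writes sstables there and ingest-external ingests them by
        		// locator.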
  1671  		opts.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
  1672  			"external-locator": remoteStorage,
  1673  		})
  1674  		opts.Experimental.CreateOnShared = remote.CreateOnSharedNone
  1675  		// Disable automatic compactions because otherwise we'll race with
  1676  		// delete-only compactions triggered by ingesting range tombstones.
  1677  		opts.DisableAutomaticCompactions = true
  1678  
  1679  		var err error
  1680  		d, err = Open("", opts)
  1681  		require.NoError(t, err)
  1682  		require.NoError(t, d.SetCreatorID(1))
  1683  	}
  1684  	reset()
  1685  
  1686  	datadriven.RunTest(t, "testdata/ingest_external", func(t *testing.T, td *datadriven.TestData) string {
  1687  		switch td.Cmd {
  1688  		case "reset":
  1689  			reset()
  1690  			return ""
  1691  		case "batch":
  1692  			b := d.NewIndexedBatch()
  1693  			if err := runBatchDefineCmd(td, b); err != nil {
  1694  				return err.Error()
  1695  			}
  1696  			if err := b.Commit(nil); err != nil {
  1697  				return err.Error()
  1698  			}
  1699  			return ""
  1700  		case "build-remote":
  1701  			if err := runBuildRemoteCmd(td, d, remoteStorage); err != nil {
  1702  				return err.Error()
  1703  			}
  1704  			return ""
  1705  
  1706  		case "flush":
  1707  			if err := d.Flush(); err != nil {
  1708  				return err.Error()
  1709  			}
  1710  			return ""
  1711  
  1712  		case "ingest-external":
  1713  			flushed = false
  1714  			if err := runIngestExternalCmd(td, d, "external-locator"); err != nil {
  1715  				return err.Error()
  1716  			}
  1717  			// Wait for a possible flush.
  1718  			d.mu.Lock()
  1719  			for d.mu.compact.flushing {
  1720  				d.mu.compact.cond.Wait()
  1721  			}
  1722  			d.mu.Unlock()
  1723  			if flushed {
  1724  				return "memtable flushed"
  1725  			}
  1726  			return ""
  1727  
  1728  		case "get":
  1729  			return runGetCmd(t, td, d)
  1730  
  1731  		case "iter":
  1732  			iter, _ := d.NewIter(&IterOptions{
  1733  				KeyTypes: IterKeyTypePointsAndRanges,
  1734  			})
  1735  			return runIterCmd(td, iter, true)
  1736  
  1737  		case "lsm":
  1738  			return runLSMCmd(td, d)
  1739  
  1740  		case "metrics":
  1741  			// The asynchronous loading of table stats can change metrics, so
  1742  			// wait for all the tables' stats to be loaded.
  1743  			d.mu.Lock()
  1744  			d.waitTableStats()
  1745  			d.mu.Unlock()
  1746  
  1747  			return d.Metrics().StringForTests()
  1748  
  1749  		case "wait-pending-table-stats":
  1750  			return runTableStatsCmd(td, d)
  1751  
  1752  		case "compact":
  1753  			if len(td.CmdArgs) != 2 {
  1754  				panic("insufficient args for compact command")
  1755  			}
  1756  			l := td.CmdArgs[0].Key
  1757  			r := td.CmdArgs[1].Key
  1758  			err := d.Compact([]byte(l), []byte(r), false)
  1759  			if err != nil {
  1760  				return err.Error()
  1761  			}
  1762  			return ""
  1763  		default:
  1764  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  1765  		}
  1766  	})
  1767  }
  1768  
  1769  func TestIngestMemtableOverlaps(t *testing.T) {
  1770  	comparers := []Comparer{
  1771  		{Name: "default", Compare: DefaultComparer.Compare, FormatKey: DefaultComparer.FormatKey},
  1772  		{
  1773  			Name:      "reverse",
  1774  			Compare:   func(a, b []byte) int { return DefaultComparer.Compare(b, a) },
  1775  			FormatKey: DefaultComparer.FormatKey,
  1776  		},
  1777  	}
  1778  	m := make(map[string]*Comparer)
  1779  	for i := range comparers {
  1780  		c := &comparers[i]
  1781  		m[c.Name] = c
  1782  	}
  1783  
  1784  	for _, comparer := range comparers {
  1785  		t.Run(comparer.Name, func(t *testing.T) {
  1786  			var mem *memTable
  1787  
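        			// parseMeta parses a table spec of the form "smallest-largest". Bounds
        			// containing a '.' are parsed as internal keys; otherwise they are
        			// treated as plain user keys.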
  1788  			parseMeta := func(s string) *fileMetadata {
  1789  				parts := strings.Split(s, "-")
  1790  				meta := &fileMetadata{}
  1791  				if len(parts) != 2 {
  1792  					t.Fatalf("malformed table spec: %s", s)
  1793  				}
  1794  				var smallest, largest base.InternalKey
  1795  				if strings.Contains(parts[0], ".") {
  1796  					if !strings.Contains(parts[1], ".") {
  1797  						t.Fatalf("malformed table spec: %s", s)
  1798  					}
  1799  					smallest = base.ParseInternalKey(parts[0])
  1800  					largest = base.ParseInternalKey(parts[1])
  1801  				} else {
  1802  					smallest = InternalKey{UserKey: []byte(parts[0])}
  1803  					largest = InternalKey{UserKey: []byte(parts[1])}
  1804  				}
  1805  				// If we're using a reverse comparer, flip the file bounds.
  1806  				if mem.cmp(smallest.UserKey, largest.UserKey) > 0 {
  1807  					smallest, largest = largest, smallest
  1808  				}
  1809  				meta.ExtendPointKeyBounds(comparer.Compare, smallest, largest)
  1810  				meta.InitPhysicalBacking()
  1811  				return meta
  1812  			}
  1813  
  1814  			datadriven.RunTest(t, "testdata/ingest_memtable_overlaps", func(t *testing.T, d *datadriven.TestData) string {
  1815  				switch d.Cmd {
  1816  				case "define":
  1817  					b := newBatch(nil)
  1818  					if err := runBatchDefineCmd(d, b); err != nil {
  1819  						return err.Error()
  1820  					}
  1821  
  1822  					opts := &Options{
  1823  						Comparer: &comparer,
  1824  					}
  1825  					opts.EnsureDefaults().WithFSDefaults()
  1826  					if len(d.CmdArgs) > 1 {
  1827  						return fmt.Sprintf("%s expects at most 1 argument", d.Cmd)
  1828  					}
  1829  					if len(d.CmdArgs) == 1 {
  1830  						opts.Comparer = m[d.CmdArgs[0].String()]
  1831  						if opts.Comparer == nil {
  1832  							return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, d.CmdArgs[0].String())
  1833  						}
  1834  					}
  1835  
  1836  					mem = newMemTable(memTableOptions{Options: opts})
  1837  					if err := mem.apply(b, 0); err != nil {
  1838  						return err.Error()
  1839  					}
  1840  					return ""
  1841  
  1842  				case "overlaps":
  1843  					var buf bytes.Buffer
  1844  					for _, data := range strings.Split(d.Input, "\n") {
  1845  						var keyRanges []internalKeyRange
  1846  						for _, part := range strings.Fields(data) {
  1847  							meta := parseMeta(part)
  1848  							keyRanges = append(keyRanges, internalKeyRange{smallest: meta.Smallest, largest: meta.Largest})
  1849  						}
  1850  						fmt.Fprintf(&buf, "%t\n", ingestMemtableOverlaps(mem.cmp, mem, keyRanges))
  1851  					}
  1852  					return buf.String()
  1853  
  1854  				default:
  1855  					return fmt.Sprintf("unknown command: %s", d.Cmd)
  1856  				}
  1857  			})
  1858  		})
  1859  	}
  1860  }
  1861  
  1862  func TestKeyRangeBasic(t *testing.T) {
  1863  	cmp := base.DefaultComparer.Compare
  1864  	k1 := KeyRange{Start: []byte("b"), End: []byte("c")}
  1865  
  1866  	// Tests for Contains()
  1867  	require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet)))
  1868  	require.False(t, k1.Contains(cmp, base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet)))
  1869  	require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("bb"), 1, InternalKeyKindSet)))
  1870  	require.True(t, k1.Contains(cmp, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("c"))))
  1871  
  1872  	m1 := &fileMetadata{
  1873  		Smallest: base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet),
  1874  		Largest:  base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet),
  1875  	}
  1876  	require.True(t, k1.Overlaps(cmp, m1))
  1877  	m2 := &fileMetadata{
  1878  		Smallest: base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet),
  1879  		Largest:  base.MakeInternalKey([]byte("d"), 1, InternalKeyKindSet),
  1880  	}
  1881  	require.False(t, k1.Overlaps(cmp, m2))
  1882  	m3 := &fileMetadata{
  1883  		Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet),
  1884  		Largest:  base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("b")),
  1885  	}
  1886  	require.False(t, k1.Overlaps(cmp, m3))
  1887  	m4 := &fileMetadata{
  1888  		Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet),
  1889  		Largest:  base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet),
  1890  	}
  1891  	require.True(t, k1.Overlaps(cmp, m4))
  1892  }
  1893  
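        // BenchmarkIngestOverlappingMemtable measures Ingest of an sstable that
        // overlaps the memtable queue, which forces the queued memtables to be
        // flushed before the ingestion can complete.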
  1894  func BenchmarkIngestOverlappingMemtable(b *testing.B) {
  1895  	assertNoError := func(err error) {
  1896  		b.Helper()
  1897  		if err != nil {
  1898  			b.Fatal(err)
  1899  		}
  1900  	}
  1901  
  1902  	for count := 1; count < 6; count++ {
  1903  		b.Run(fmt.Sprintf("memtables=%d", count), func(b *testing.B) {
  1904  			for i := 0; i < b.N; i++ {
  1905  				b.StopTimer()
  1906  				mem := vfs.NewMem()
  1907  				d, err := Open("", &Options{
  1908  					FS: mem,
  1909  				})
  1910  				assertNoError(err)
  1911  
  1912  				// Create memtables.
  1913  				for {
  1914  					assertNoError(d.Set([]byte("a"), nil, nil))
  1915  					d.mu.Lock()
  1916  					done := len(d.mu.mem.queue) == count
  1917  					d.mu.Unlock()
  1918  					if done {
  1919  						break
  1920  					}
  1921  				}
  1922  
  1923  				// Create the overlapping sstable that will force a flush when ingested.
  1924  				f, err := mem.Create("ext")
  1925  				assertNoError(err)
  1926  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  1927  				assertNoError(w.Set([]byte("a"), nil))
  1928  				assertNoError(w.Close())
  1929  
  1930  				b.StartTimer()
  1931  				assertNoError(d.Ingest([]string{"ext"}))
  1932  			}
  1933  		})
  1934  	}
  1935  }
  1936  
  1937  func TestIngestTargetLevel(t *testing.T) {
  1938  	var d *DB
  1939  	defer func() {
  1940  		if d != nil {
  1941  			// Ignore errors because this test defines fake in-progress
  1942  			// compactions that prohibit clean shutdown.
  1943  			_ = d.Close()
  1944  		}
  1945  	}()
  1946  
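        	// parseMeta parses a file spec of the form "start-end", optionally prefixed
        	// with "rkey" to indicate range-key bounds rather than point-key bounds.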
  1947  	parseMeta := func(s string) *fileMetadata {
  1948  		var rkey bool
  1949  		if len(s) >= 4 && s[0:4] == "rkey" {
  1950  			rkey = true
  1951  			s = s[5:]
  1952  		}
  1953  		parts := strings.Split(s, "-")
  1954  		if len(parts) != 2 {
  1955  			t.Fatalf("malformed table spec: %s", s)
  1956  		}
  1957  		var m *fileMetadata
  1958  		if rkey {
  1959  			m = (&fileMetadata{}).ExtendRangeKeyBounds(
  1960  				d.cmp,
  1961  				InternalKey{UserKey: []byte(parts[0])},
  1962  				InternalKey{UserKey: []byte(parts[1])},
  1963  			)
  1964  		} else {
  1965  			m = (&fileMetadata{}).ExtendPointKeyBounds(
  1966  				d.cmp,
  1967  				InternalKey{UserKey: []byte(parts[0])},
  1968  				InternalKey{UserKey: []byte(parts[1])},
  1969  			)
  1970  		}
  1971  		m.InitPhysicalBacking()
  1972  		return m
  1973  	}
  1974  
  1975  	datadriven.RunTest(t, "testdata/ingest_target_level", func(t *testing.T, td *datadriven.TestData) string {
  1976  		switch td.Cmd {
  1977  		case "define":
  1978  			if d != nil {
  1979  				// Ignore errors because this test defines fake in-progress
  1980  				// compactions that prohibit clean shutdown.
  1981  				_ = d.Close()
  1982  			}
  1983  
  1984  			var err error
  1985  			opts := Options{
  1986  				FormatMajorVersion: internalFormatNewest,
  1987  			}
  1988  			opts.WithFSDefaults()
  1989  			if d, err = runDBDefineCmd(td, &opts); err != nil {
  1990  				return err.Error()
  1991  			}
  1992  
  1993  			readState := d.loadReadState()
  1994  			c := &checkConfig{
  1995  				logger:    d.opts.Logger,
  1996  				comparer:  d.opts.Comparer,
  1997  				readState: readState,
  1998  				newIters:  d.newIters,
  1999  				// TODO: runDBDefineCmd doesn't properly update the visible sequence
  2000  				// number, so we explicitly configure the level checker with a very
  2001  				// large sequence number; otherwise the DB appears empty.
  2002  				seqNum: InternalKeySeqNumMax,
  2003  			}
  2004  			if err := checkLevelsInternal(c); err != nil {
  2005  				return err.Error()
  2006  			}
  2007  			readState.unref()
  2008  
  2009  			d.mu.Lock()
  2010  			s := d.mu.versions.currentVersion().String()
  2011  			d.mu.Unlock()
  2012  			return s
  2013  
  2014  		case "target":
  2015  			var buf bytes.Buffer
  2016  			suggestSplit := false
  2017  			for _, cmd := range td.CmdArgs {
  2018  				switch cmd.Key {
  2019  				case "suggest-split":
  2020  					suggestSplit = true
  2021  				}
  2022  			}
  2023  			for _, target := range strings.Split(td.Input, "\n") {
  2024  				meta := parseMeta(target)
  2025  				level, overlapFile, err := ingestTargetLevel(
  2026  					d.newIters, d.tableNewRangeKeyIter, IterOptions{logger: d.opts.Logger},
  2027  					d.opts.Comparer, d.mu.versions.currentVersion(), 1, d.mu.compact.inProgress, meta,
  2028  					suggestSplit)
  2029  				if err != nil {
  2030  					return err.Error()
  2031  				}
  2032  				if overlapFile != nil {
  2033  					fmt.Fprintf(&buf, "%d (split file: %s)\n", level, overlapFile.FileNum)
  2034  				} else {
  2035  					fmt.Fprintf(&buf, "%d\n", level)
  2036  				}
  2037  			}
  2038  			return buf.String()
  2039  
  2040  		default:
  2041  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  2042  		}
  2043  	})
  2044  }
  2045  
  2046  func TestIngest(t *testing.T) {
  2047  	var mem vfs.FS
  2048  	var d *DB
  2049  	var flushed bool
  2050  	if runtime.GOARCH == "386" {
  2051  		t.Skip("skipped on 32-bit due to slightly varied output")
  2052  	}
  2053  	defer func() {
  2054  		require.NoError(t, d.Close())
  2055  	}()
  2056  
  2057  	reset := func(split bool) {
  2058  		if d != nil {
  2059  			require.NoError(t, d.Close())
  2060  		}
  2061  
  2062  		mem = vfs.NewMem()
  2063  		require.NoError(t, mem.MkdirAll("ext", 0755))
  2064  		opts := &Options{
  2065  			FS:                    mem,
  2066  			L0CompactionThreshold: 100,
  2067  			L0StopWritesThreshold: 100,
  2068  			DebugCheck:            DebugCheckLevels,
  2069  			EventListener: &EventListener{FlushEnd: func(info FlushInfo) {
  2070  				flushed = true
  2071  			}},
  2072  			FormatMajorVersion: internalFormatNewest,
  2073  		}
  2074  		opts.Experimental.IngestSplit = func() bool {
  2075  			return split
  2076  		}
  2077  		// Disable automatic compactions because otherwise we'll race with
  2078  		// delete-only compactions triggered by ingesting range tombstones.
  2079  		opts.DisableAutomaticCompactions = true
  2080  
  2081  		var err error
  2082  		d, err = Open("", opts)
  2083  		require.NoError(t, err)
  2084  	}
  2085  	reset(false /* split */)
  2086  
  2087  	datadriven.RunTest(t, "testdata/ingest", func(t *testing.T, td *datadriven.TestData) string {
  2088  		switch td.Cmd {
  2089  		case "reset":
  2090  			split := false
  2091  			for _, cmd := range td.CmdArgs {
  2092  				switch cmd.Key {
  2093  				case "enable-split":
  2094  					split = true
  2095  				default:
  2096  					return fmt.Sprintf("unexpected key: %s", cmd.Key)
  2097  				}
  2098  			}
  2099  			reset(split)
  2100  			return ""
  2101  		case "batch":
  2102  			b := d.NewIndexedBatch()
  2103  			if err := runBatchDefineCmd(td, b); err != nil {
  2104  				return err.Error()
  2105  			}
  2106  			if err := b.Commit(nil); err != nil {
  2107  				return err.Error()
  2108  			}
  2109  			return ""
  2110  
  2111  		case "build":
  2112  			if err := runBuildCmd(td, d, mem); err != nil {
  2113  				return err.Error()
  2114  			}
  2115  			return ""
  2116  
  2117  		case "ingest":
  2118  			flushed = false
  2119  			if err := runIngestCmd(td, d, mem); err != nil {
  2120  				return err.Error()
  2121  			}
  2122  			// Wait for a possible flush.
  2123  			d.mu.Lock()
  2124  			for d.mu.compact.flushing {
  2125  				d.mu.compact.cond.Wait()
  2126  			}
  2127  			d.mu.Unlock()
  2128  			if flushed {
  2129  				return "memtable flushed"
  2130  			}
  2131  			return ""
  2132  
  2133  		case "get":
  2134  			return runGetCmd(t, td, d)
  2135  
  2136  		case "iter":
  2137  			iter, _ := d.NewIter(&IterOptions{
  2138  				KeyTypes: IterKeyTypePointsAndRanges,
  2139  			})
  2140  			return runIterCmd(td, iter, true)
  2141  
  2142  		case "lsm":
  2143  			return runLSMCmd(td, d)
  2144  
  2145  		case "metrics":
  2146  			// The asynchronous loading of table stats can change metrics, so
  2147  			// wait for all the tables' stats to be loaded.
  2148  			d.mu.Lock()
  2149  			d.waitTableStats()
  2150  			d.mu.Unlock()
  2151  
  2152  			return d.Metrics().StringForTests()
  2153  
  2154  		case "wait-pending-table-stats":
  2155  			return runTableStatsCmd(td, d)
  2156  
  2157  		case "compact":
  2158  			if len(td.CmdArgs) != 2 {
  2159  				panic("insufficient args for compact command")
  2160  			}
  2161  			l := td.CmdArgs[0].Key
  2162  			r := td.CmdArgs[1].Key
  2163  			err := d.Compact([]byte(l), []byte(r), false)
  2164  			if err != nil {
  2165  				return err.Error()
  2166  			}
  2167  			return ""
  2168  		default:
  2169  			return fmt.Sprintf("unknown command: %s", td.Cmd)
  2170  		}
  2171  	})
  2172  }
  2173  
  2174  func TestIngestError(t *testing.T) {
  2175  	for i := int32(0); ; i++ {
  2176  		mem := vfs.NewMem()
  2177  
  2178  		f0, err := mem.Create("ext0")
  2179  		require.NoError(t, err)
  2180  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f0), sstable.WriterOptions{})
  2181  		require.NoError(t, w.Set([]byte("d"), nil))
  2182  		require.NoError(t, w.Close())
  2183  		f1, err := mem.Create("ext1")
  2184  		require.NoError(t, err)
  2185  		w = sstable.NewWriter(objstorageprovider.NewFileWritable(f1), sstable.WriterOptions{})
  2186  		require.NoError(t, w.Set([]byte("d"), nil))
  2187  		require.NoError(t, w.Close())
  2188  
  2189  		inj := errorfs.OnIndex(-1)
  2190  		d, err := Open("", &Options{
  2191  			FS:                    errorfs.Wrap(mem, inj),
  2192  			Logger:                panicLogger{},
  2193  			L0CompactionThreshold: 8,
  2194  		})
  2195  		require.NoError(t, err)
  2196  		// Force the creation of an L0 sstable that overlaps with the tables
  2197  		// we'll attempt to ingest. This ensures that we exercise filesystem
  2198  		// codepaths when determining the ingest target level.
  2199  		require.NoError(t, d.Set([]byte("a"), nil, nil))
  2200  		require.NoError(t, d.Set([]byte("d"), nil, nil))
  2201  		require.NoError(t, d.Flush())
  2202  
  2203  		t.Run(fmt.Sprintf("index-%d", i), func(t *testing.T) {
  2204  			defer func() {
  2205  				if r := recover(); r != nil {
  2206  					if e, ok := r.(error); ok && errors.Is(e, errorfs.ErrInjected) {
  2207  						return
  2208  					}
  2209  					// d.opts.Logger.Fatalf won't propagate ErrInjected itself,
  2210  					// but the panic value should contain its error message.
  2211  					if strings.HasSuffix(fmt.Sprint(r), errorfs.ErrInjected.Error()) {
  2212  						return
  2213  					}
  2214  					t.Fatal(r)
  2215  				}
  2216  			}()
  2217  
  2218  			inj.SetIndex(i)
  2219  			err1 := d.Ingest([]string{"ext0"})
  2220  			err2 := d.Ingest([]string{"ext1"})
  2221  			err := firstError(err1, err2)
  2222  			if err != nil && !errors.Is(err, errorfs.ErrInjected) {
  2223  				t.Fatal(err)
  2224  			}
  2225  		})
  2226  
  2227  		// d.Close may error if we failed to flush the manifest.
  2228  		_ = d.Close()
  2229  
  2230  		// If the injector's index is non-negative, the i-th filesystem
  2231  		// operation was never executed.
  2232  		if inj.Index() >= 0 {
  2233  			break
  2234  		}
  2235  	}
  2236  }
  2237  
  2238  func TestIngestIdempotence(t *testing.T) {
  2239  	// Use an on-disk filesystem, because Ingest with a MemFS will copy, not
  2240  	// link the ingested file.
  2241  	dir, err := os.MkdirTemp("", "ingest-idempotence")
  2242  	require.NoError(t, err)
  2243  	defer os.RemoveAll(dir)
  2244  	fs := vfs.Default
  2245  
  2246  	path := fs.PathJoin(dir, "ext")
  2247  	f, err := fs.Create(fs.PathJoin(dir, "ext"))
  2248  	require.NoError(t, err)
  2249  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2250  	require.NoError(t, w.Set([]byte("d"), nil))
  2251  	require.NoError(t, w.Close())
  2252  
  2253  	d, err := Open(dir, &Options{
  2254  		FS: fs,
  2255  	})
  2256  	require.NoError(t, err)
  2257  	const count = 4
  2258  	for i := 0; i < count; i++ {
  2259  		ingestPath := fs.PathJoin(dir, fmt.Sprintf("ext%d", i))
  2260  		require.NoError(t, fs.Link(path, ingestPath))
  2261  		require.NoError(t, d.Ingest([]string{ingestPath}))
  2262  	}
  2263  	require.NoError(t, d.Close())
  2264  }
  2265  
  2266  func TestIngestCompact(t *testing.T) {
  2267  	mem := vfs.NewMem()
  2268  	lel := MakeLoggingEventListener(&base.InMemLogger{})
  2269  	d, err := Open("", &Options{
  2270  		EventListener:         &lel,
  2271  		FS:                    mem,
  2272  		L0CompactionThreshold: 1,
  2273  		L0StopWritesThreshold: 1,
  2274  	})
  2275  	require.NoError(t, err)
  2276  
  2277  	src := func(i int) string {
  2278  		return fmt.Sprintf("ext%d", i)
  2279  	}
  2280  	f, err := mem.Create(src(0))
  2281  	require.NoError(t, err)
  2282  
  2283  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2284  	key := []byte("a")
  2285  	require.NoError(t, w.Add(base.MakeInternalKey(key, 0, InternalKeyKindSet), nil))
  2286  	require.NoError(t, w.Close())
  2287  
  2288  	// Make N copies of the sstable.
  2289  	const count = 20
  2290  	for i := 1; i < count; i++ {
  2291  		require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i)))
  2292  	}
  2293  
  2294  	// Ingest the same sstable multiple times. Compaction should take place as
  2295  	// ingestion happens, preventing an indefinite write stall from occurring.
  2296  	for i := 0; i < count; i++ {
  2297  		if i == 10 {
  2298  			// Half-way through the ingestions, set a key in the memtable to force
  2299  			// overlap with the memtable which will require the memtable to be
  2300  			// flushed.
  2301  			require.NoError(t, d.Set(key, nil, nil))
  2302  		}
  2303  		require.NoError(t, d.Ingest([]string{src(i)}))
  2304  	}
  2305  
  2306  	require.NoError(t, d.Close())
  2307  }
  2308  
  2309  func TestConcurrentIngest(t *testing.T) {
  2310  	mem := vfs.NewMem()
  2311  	d, err := Open("", &Options{
  2312  		FS: mem,
  2313  	})
  2314  	require.NoError(t, err)
  2315  
  2316  	// Create an sstable with 2 keys. This is necessary to trigger the overlap
  2317  	// bug because an sstable with a single key will not have overlap in internal
  2318  	// key space, and sequence number assignment alone would already guarantee
  2319  	// correct ordering.
  2320  	src := func(i int) string {
  2321  		return fmt.Sprintf("ext%d", i)
  2322  	}
  2323  	f, err := mem.Create(src(0))
  2324  	require.NoError(t, err)
  2325  
  2326  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2327  	require.NoError(t, w.Set([]byte("a"), nil))
  2328  	require.NoError(t, w.Set([]byte("b"), nil))
  2329  	require.NoError(t, w.Close())
  2330  
  2331  	// Make N copies of the sstable.
  2332  	errCh := make(chan error, 5)
  2333  	for i := 1; i < cap(errCh); i++ {
  2334  		require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i)))
  2335  	}
  2336  
  2337  	// Perform N ingestions concurrently.
  2338  	for i := 0; i < cap(errCh); i++ {
  2339  		go func(i int) {
  2340  			err := d.Ingest([]string{src(i)})
  2341  			if err == nil {
  2342  				if _, err = d.opts.FS.Stat(src(i)); oserror.IsNotExist(err) {
  2343  					err = nil
  2344  				}
  2345  			}
  2346  			errCh <- err
  2347  		}(i)
  2348  	}
  2349  	for i := 0; i < cap(errCh); i++ {
  2350  		require.NoError(t, <-errCh)
  2351  	}
  2352  
  2353  	require.NoError(t, d.Close())
  2354  }
  2355  
  2356  func TestConcurrentIngestCompact(t *testing.T) {
  2357  	for i := 0; i < 2; i++ {
  2358  		t.Run("", func(t *testing.T) {
  2359  			mem := vfs.NewMem()
  2360  			compactionReady := make(chan struct{})
  2361  			compactionBegin := make(chan struct{})
  2362  			d, err := Open("", &Options{
  2363  				FS: mem,
  2364  				EventListener: &EventListener{
  2365  					TableCreated: func(info TableCreateInfo) {
  2366  						if info.Reason == "compacting" {
  2367  							close(compactionReady)
  2368  							<-compactionBegin
  2369  						}
  2370  					},
  2371  				},
  2372  			})
  2373  			require.NoError(t, err)
  2374  
  2375  			ingest := func(keys ...string) {
  2376  				t.Helper()
  2377  				f, err := mem.Create("ext")
  2378  				require.NoError(t, err)
  2379  
  2380  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2381  				for _, k := range keys {
  2382  					require.NoError(t, w.Set([]byte(k), nil))
  2383  				}
  2384  				require.NoError(t, w.Close())
  2385  				require.NoError(t, d.Ingest([]string{"ext"}))
  2386  			}
  2387  
  2388  			compact := func(start, end string) {
  2389  				t.Helper()
  2390  				require.NoError(t, d.Compact([]byte(start), []byte(end), false))
  2391  			}
  2392  
  2393  			lsm := func() string {
  2394  				d.mu.Lock()
  2395  				s := d.mu.versions.currentVersion().String()
  2396  				d.mu.Unlock()
  2397  				return s
  2398  			}
  2399  
  2400  			expectLSM := func(expected string) {
  2401  				t.Helper()
  2402  				expected = strings.TrimSpace(expected)
  2403  				actual := strings.TrimSpace(lsm())
  2404  				if expected != actual {
  2405  					t.Fatalf("expected\n%s\nbut found\n%s", expected, actual)
  2406  				}
  2407  			}
  2408  
  2409  			ingest("a")
  2410  			ingest("a")
  2411  			ingest("c")
  2412  			ingest("c")
  2413  
  2414  			expectLSM(`
  2415  0.0:
  2416    000005:[a#11,SET-a#11,SET]
  2417    000007:[c#13,SET-c#13,SET]
  2418  6:
  2419    000004:[a#10,SET-a#10,SET]
  2420    000006:[c#12,SET-c#12,SET]
  2421  `)
  2422  
  2423  			// At this point ingestion of an sstable containing only key "b" will be
  2424  			// targeted at L6. Yet a concurrent compaction of sstables 5 and 7 will
  2425  			// create a new sstable in L6 spanning ["a"-"c"]. So the ingestion must
  2426  			// actually target L5.
  2427  
  2428  			switch i {
  2429  			case 0:
  2430  				// Compact, then ingest.
  2431  				go func() {
  2432  					<-compactionReady
  2433  
  2434  					ingest("b")
  2435  
  2436  					close(compactionBegin)
  2437  				}()
  2438  
  2439  				compact("a", "z")
  2440  
  2441  				expectLSM(`
  2442  0.0:
  2443    000009:[b#14,SET-b#14,SET]
  2444  6:
  2445    000008:[a#0,SET-c#0,SET]
  2446  `)
  2447  
  2448  			case 1:
  2449  				// Ingest, then compact.
  2450  				var wg sync.WaitGroup
  2451  				wg.Add(1)
  2452  				go func() {
  2453  					defer wg.Done()
  2454  					close(compactionBegin)
  2455  					compact("a", "z")
  2456  				}()
  2457  
  2458  				ingest("b")
  2459  				wg.Wait()
  2460  
  2461  				// Because we're performing the ingestion and compaction concurrently,
  2462  				// we can't guarantee any particular LSM structure at this point. The
  2463  				// test will fail with an assertion error due to overlapping sstables
  2464  				// if there is insufficient synchronization between ingestion and
  2465  				// compaction.
  2466  			}
  2467  
  2468  			require.NoError(t, d.Close())
  2469  		})
  2470  	}
  2471  }
  2472  
  2473  func TestIngestFlushQueuedMemTable(t *testing.T) {
  2474  	// Verify that ingestion forces a flush of a queued memtable.
  2475  
  2476  	// Test with a format major version prior to FormatFlushableIngest and one
  2477  	// after. Both should result in the same statistic calculations.
  2478  	for _, fmv := range []FormatMajorVersion{FormatFlushableIngest - 1, internalFormatNewest} {
  2479  		func(fmv FormatMajorVersion) {
  2480  			mem := vfs.NewMem()
  2481  			d, err := Open("", &Options{
  2482  				FS:                 mem,
  2483  				FormatMajorVersion: fmv,
  2484  			})
  2485  			require.NoError(t, err)
  2486  
  2487  			// Add the key "a" to the memtable, then fill up the memtable with the key
  2488  			// "b". The ingested sstable will only overlap with the queued memtable.
  2489  			require.NoError(t, d.Set([]byte("a"), nil, nil))
  2490  			for {
  2491  				require.NoError(t, d.Set([]byte("b"), nil, nil))
  2492  				d.mu.Lock()
  2493  				done := len(d.mu.mem.queue) == 2
  2494  				d.mu.Unlock()
  2495  				if done {
  2496  					break
  2497  				}
  2498  			}
  2499  
  2500  			ingest := func(keys ...string) {
  2501  				t.Helper()
  2502  				f, err := mem.Create("ext")
  2503  				require.NoError(t, err)
  2504  
  2505  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  2506  					TableFormat: fmv.MinTableFormat(),
  2507  				})
  2508  				for _, k := range keys {
  2509  					require.NoError(t, w.Set([]byte(k), nil))
  2510  				}
  2511  				require.NoError(t, w.Close())
  2512  				stats, err := d.IngestWithStats([]string{"ext"})
  2513  				require.NoError(t, err)
  2514  				require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes)
  2515  				require.Equal(t, stats.MemtableOverlappingFiles, 1)
  2516  				require.Less(t, uint64(0), stats.Bytes)
  2517  			}
  2518  
  2519  			ingest("a")
  2520  
  2521  			require.NoError(t, d.Close())
  2522  		}(fmv)
  2523  	}
  2524  }
  2525  
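        // TestIngestStats verifies that IngestWithStats attributes ingested bytes to
        // ApproxIngestedIntoL0Bytes only when the sstable actually lands in L0.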
  2526  func TestIngestStats(t *testing.T) {
  2527  	mem := vfs.NewMem()
  2528  	d, err := Open("", &Options{
  2529  		FS: mem,
  2530  	})
  2531  	require.NoError(t, err)
  2532  
  2533  	ingest := func(expectedLevel int, keys ...string) {
  2534  		t.Helper()
  2535  		f, err := mem.Create("ext")
  2536  		require.NoError(t, err)
  2537  
  2538  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2539  		for _, k := range keys {
  2540  			require.NoError(t, w.Set([]byte(k), nil))
  2541  		}
  2542  		require.NoError(t, w.Close())
  2543  		stats, err := d.IngestWithStats([]string{"ext"})
  2544  		require.NoError(t, err)
  2545  		if expectedLevel == 0 {
  2546  			require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes)
  2547  		} else {
  2548  			require.EqualValues(t, 0, stats.ApproxIngestedIntoL0Bytes)
  2549  		}
  2550  		require.Less(t, uint64(0), stats.Bytes)
  2551  	}
  2552  	ingest(6, "a")
  2553  	ingest(0, "a")
  2554  	ingest(6, "b", "g")
  2555  	ingest(0, "c")
  2556  	require.NoError(t, d.Close())
  2557  }
  2558  
  2559  func TestIngestFlushQueuedLargeBatch(t *testing.T) {
  2560  	// Verify that ingestion forces a flush of a queued large batch.
  2561  
  2562  	mem := vfs.NewMem()
  2563  	d, err := Open("", &Options{
  2564  		FS: mem,
  2565  	})
  2566  	require.NoError(t, err)
  2567  
  2568  	// The default large batch threshold is slightly less than 1/2 of the
  2569  	// memtable size, which makes it awkward to trigger a problem with flushing
  2570  	// queued large batches. Manually adjust the threshold to 1/8 of the
  2571  	// memtable size in order to more easily create a situation where a large
  2572  	// batch is queued but not automatically flushed.
  2573  	d.mu.Lock()
  2574  	d.largeBatchThreshold = d.opts.MemTableSize / 8
  2575  	d.mu.Unlock()
  2576  
  2577  	// Set a record with a large value. This will be transformed into a large
  2578  	// batch and placed in the flushable queue.
  2579  	require.NoError(t, d.Set([]byte("a"), bytes.Repeat([]byte("v"), int(d.largeBatchThreshold)), nil))
  2580  
  2581  	ingest := func(keys ...string) {
  2582  		t.Helper()
  2583  		f, err := mem.Create("ext")
  2584  		require.NoError(t, err)
  2585  
  2586  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2587  		for _, k := range keys {
  2588  			require.NoError(t, w.Set([]byte(k), nil))
  2589  		}
  2590  		require.NoError(t, w.Close())
  2591  		require.NoError(t, d.Ingest([]string{"ext"}))
  2592  	}
  2593  
  2594  	ingest("a")
  2595  
  2596  	require.NoError(t, d.Close())
  2597  }
  2598  
  2599  func TestIngestMemtablePendingOverlap(t *testing.T) {
  2600  	mem := vfs.NewMem()
  2601  	d, err := Open("", &Options{
  2602  		FS: mem,
  2603  	})
  2604  	require.NoError(t, err)
  2605  
  2606  	d.mu.Lock()
  2607  	// Use a custom commit pipeline apply function to give us control over
  2608  	// the timing of events.
  2609  	assignedBatch := make(chan struct{})
  2610  	applyBatch := make(chan struct{})
  2611  	originalApply := d.commit.env.apply
  2612  	d.commit.env.apply = func(b *Batch, mem *memTable) error {
  2613  		assignedBatch <- struct{}{}
  2614  		applyBatch <- struct{}{}
  2615  		return originalApply(b, mem)
  2616  	}
  2617  	d.mu.Unlock()
  2618  
  2619  	ingest := func(keys ...string) {
  2620  		t.Helper()
  2621  		f, err := mem.Create("ext")
  2622  		require.NoError(t, err)
  2623  
  2624  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2625  		for _, k := range keys {
  2626  			require.NoError(t, w.Set([]byte(k), nil))
  2627  		}
  2628  		require.NoError(t, w.Close())
  2629  		require.NoError(t, d.Ingest([]string{"ext"}))
  2630  	}
  2631  
  2632  	var wg sync.WaitGroup
  2633  	wg.Add(2)
  2634  
  2635  	// First, Set('c') begins. This call will:
  2636  	//
  2637  	// * enqueue the batch to the pending queue.
  2638  	// * allocate a sequence number `x`.
  2639  	// * write the batch to the WAL.
  2640  	//
  2641  	// and then block until we read from the `applyBatch` channel down below.
  2642  	go func() {
  2643  		err := d.Set([]byte("c"), nil, nil)
  2644  		if err != nil {
  2645  			t.Error(err)
  2646  		}
  2647  		wg.Done()
  2648  	}()
  2649  
  2650  	// When the above Set('c') is ready to apply, it sends on the
  2651  	// `assignedBatch` channel. Once that happens, we start Ingest('a', 'c').
  2652  	// The Ingest('a', 'c') allocates sequence number `x + 1`.
  2653  	go func() {
  2654  		// Wait until the Set has grabbed a sequence number before ingesting.
  2655  		<-assignedBatch
  2656  		ingest("a", "c")
  2657  		wg.Done()
  2658  	}()
  2659  
  2660  	// The Set('c')#1 and Ingest('a', 'c')#2 are both pending. To maintain
  2661  	// sequence number invariants, the Set needs to be applied and flushed
  2662  	// before the Ingest determines its target level.
  2663  	//
  2664  	// Sleep a bit to ensure that the Ingest has time to call into
  2665  	// AllocateSeqNum. Once it allocates its sequence number, it should see
  2666  	// that there are unpublished sequence numbers below it and spin until the
  2667  	// Set's sequence number is published. After sleeping, read from
  2668  	// `applyBatch` to actually allow the Set to apply and publish its
  2669  	// sequence number.
  2670  	time.Sleep(100 * time.Millisecond)
  2671  	<-applyBatch
  2672  
  2673  	// Wait for both calls to complete.
  2674  	wg.Wait()
  2675  	require.NoError(t, d.Flush())
  2676  	require.NoError(t, d.CheckLevels(nil))
  2677  	require.NoError(t, d.Close())
  2678  }
  2679  
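        // testLogger routes Infof output to the test log and turns Fatalf into a
        // test failure.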
  2680  type testLogger struct {
  2681  	t testing.TB
  2682  }
  2683  
  2684  func (l testLogger) Infof(format string, args ...interface{}) {
  2685  	l.t.Logf(format, args...)
  2686  }
  2687  
  2688  func (l testLogger) Fatalf(format string, args ...interface{}) {
  2689  	l.t.Fatalf(format, args...)
  2690  }
  2691  
  2692  // TestIngestMemtableOverlapRace is a regression test for the race described in
  2693  // #2196. If an ingest checks for overlap with the mutable memtable and finds
  2694  // no overlap, it must not allow overlapping keys with later sequence numbers
  2695  // to be applied to the memtable and the memtable to be flushed before the
  2696  // ingest completes.
  2697  //
  2698  // This test operates by committing the same key concurrently:
  2699  //   - 1 goroutine repeatedly ingests the same sstable writing the key `foo`
  2700  //   - n goroutines repeatedly apply batches writing the key `foo` and trigger
  2701  //     flushes.
  2702  //
  2703  // After a while, the database is closed and the manifest is verified. Version
  2704  // edits should contain new files with monotonically increasing sequence
  2705  // numbers, since every flush and every ingest conflicts with one another.
  2706  func TestIngestMemtableOverlapRace(t *testing.T) {
  2707  	mem := vfs.NewMem()
  2708  	el := MakeLoggingEventListener(testLogger{t: t})
  2709  	d, err := Open("", &Options{
  2710  		FS: mem,
  2711  		// Disable automatic compactions to keep the manifest clean; only
  2712  		// flushes and ingests.
  2713  		DisableAutomaticCompactions: true,
  2714  		// Disable the WAL to speed up batch commits.
  2715  		DisableWAL:    true,
  2716  		EventListener: &el,
  2717  		// We're endlessly appending to L0 without clearing it, so set a maximal
  2718  		// stop writes threshold.
  2719  		L0StopWritesThreshold: math.MaxInt,
  2720  		// Accumulating more than 1 immutable memtable doesn't help us exercise
  2721  		// the bug, since the committed keys need to be flushed promptly.
  2722  		MemTableStopWritesThreshold: 2,
  2723  	})
  2724  	require.NoError(t, err)
  2725  
  2726  	// Prepare an sstable `ext` deleting foo.
  2727  	f, err := mem.Create("ext")
  2728  	require.NoError(t, err)
  2729  	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2730  	require.NoError(t, w.Delete([]byte("foo")))
  2731  	require.NoError(t, w.Close())
  2732  
  2733  	var done atomic.Bool
  2734  	const numSetters = 2
  2735  	var wg sync.WaitGroup
  2736  	wg.Add(numSetters + 1)
  2737  
  2738  	untilDone := func(fn func()) {
  2739  		defer wg.Done()
  2740  		for !done.Load() {
  2741  			fn()
  2742  		}
  2743  	}
  2744  
  2745  	// Ingest in the background.
  2746  	totalIngests := 0
  2747  	go untilDone(func() {
  2748  		filename := fmt.Sprintf("ext%d", totalIngests)
  2749  		require.NoError(t, mem.Link("ext", filename))
  2750  		require.NoError(t, d.Ingest([]string{filename}))
  2751  		totalIngests++
  2752  	})
  2753  
  2754  	// Apply batches and trigger flushes in the background.
  2755  	wo := &WriteOptions{Sync: false}
  2756  	var localCommits [numSetters]int
  2757  	for i := 0; i < numSetters; i++ {
  2758  		i := i
  2759  		v := []byte(fmt.Sprintf("v%d", i+1))
  2760  		go untilDone(func() {
  2761  			// Commit a batch setting foo=vN.
  2762  			b := d.NewBatch()
  2763  			require.NoError(t, b.Set([]byte("foo"), v, nil))
  2764  			require.NoError(t, b.Commit(wo))
  2765  			localCommits[i]++
  2766  			d.AsyncFlush()
  2767  		})
  2768  	}
  2769  	time.Sleep(100 * time.Millisecond)
  2770  	done.Store(true)
  2771  	wg.Wait()
  2772  
  2773  	var totalCommits int
  2774  	for i := 0; i < numSetters; i++ {
  2775  		totalCommits += localCommits[i]
  2776  	}
  2777  	m := d.Metrics()
  2778  	tot := m.Total()
  2779  	t.Logf("Committed %d batches.", totalCommits)
  2780  	t.Logf("Flushed %d times.", m.Flush.Count)
  2781  	t.Logf("Ingested %d sstables.", tot.TablesIngested)
  2782  	require.NoError(t, d.CheckLevels(nil))
  2783  	require.NoError(t, d.Close())
  2784  
  2785  	// Replay the manifest. Every flush and ingest is a separate version edit.
  2786  	// Since they all write the same key and compactions are disabled, sequence
  2787  	// numbers of new files should be monotonically increasing.
  2788  	//
  2789  	// This check is necessary because most of these sstables are ingested into
  2790  	// L0. The L0 sublevels construction will order them by LargestSeqNum, even
  2791  	// if they're added to L0 out-of-order. The CheckLevels call at the end of
  2792  	// the test may find that the sublevels are all appropriately ordered, but
  2793  	// the manifest may reveal they were added to the LSM out-of-order.
  2794  	dbDesc, err := Peek("", mem)
  2795  	require.NoError(t, err)
  2796  	require.True(t, dbDesc.Exists)
  2797  	f, err = mem.Open(dbDesc.ManifestFilename)
  2798  	require.NoError(t, err)
  2799  	defer f.Close()
  2800  	rr := record.NewReader(f, 0 /* logNum */)
  2801  	var largest *fileMetadata
  2802  	for {
  2803  		r, err := rr.Next()
  2804  		if err == io.EOF || err == record.ErrInvalidChunk {
  2805  			break
  2806  		}
  2807  		require.NoError(t, err)
  2808  		var ve manifest.VersionEdit
  2809  		require.NoError(t, ve.Decode(r))
  2810  		t.Log(ve.String())
  2811  		for _, f := range ve.NewFiles {
  2812  			if largest != nil {
  2813  				require.Equal(t, 0, f.Level)
  2814  				if largest.LargestSeqNum > f.Meta.LargestSeqNum {
  2815  					t.Fatalf("previous largest file %s has sequence number > next file %s", largest, f.Meta)
  2816  				}
  2817  			}
  2818  			largest = f.Meta
  2819  		}
  2820  	}
  2821  }
  2822  
  2823  type ingestCrashFS struct {
  2824  	vfs.FS
  2825  }
  2826  
  2827  func (fs ingestCrashFS) Link(oldname, newname string) error {
  2828  	if err := fs.FS.Link(oldname, newname); err != nil {
  2829  		return err
  2830  	}
  2831  	panic(errorfs.ErrInjected)
  2832  }
  2833  
  2834  type noRemoveFS struct {
  2835  	vfs.FS
  2836  }
  2837  
  2838  func (fs noRemoveFS) Remove(string) error {
  2839  	return errorfs.ErrInjected
  2840  }
  2841  
  2842  func TestIngestFileNumReuseCrash(t *testing.T) {
  2843  	const count = 10
  2844  	// Use an on-disk filesystem, because Ingest with a MemFS will copy, not
  2845  	// link the ingested file.
  2846  	dir, err := os.MkdirTemp("", "ingest-filenum-reuse")
  2847  	require.NoError(t, err)
  2848  	defer os.RemoveAll(dir)
  2849  	fs := vfs.Default
  2850  
  2851  	readFile := func(s string) []byte {
  2852  		f, err := fs.Open(fs.PathJoin(dir, s))
  2853  		require.NoError(t, err)
  2854  		b, err := io.ReadAll(f)
  2855  		require.NoError(t, err)
  2856  		require.NoError(t, f.Close())
  2857  		return b
  2858  	}
  2859  
  2860  	// Create sstables to ingest.
  2861  	var files []string
  2862  	var fileBytes [][]byte
  2863  	for i := 0; i < count; i++ {
  2864  		name := fmt.Sprintf("ext%d", i)
  2865  		f, err := fs.Create(fs.PathJoin(dir, name))
  2866  		require.NoError(t, err)
  2867  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  2868  		require.NoError(t, w.Set([]byte(fmt.Sprintf("foo%d", i)), nil))
  2869  		require.NoError(t, w.Close())
  2870  		files = append(files, name)
  2871  		fileBytes = append(fileBytes, readFile(name))
  2872  	}
  2873  
  2874  	// Open a database with a filesystem that will successfully link the
  2875  	// ingested files but then panic. This is an approximation of what a crash
  2876  	// after linking but before updating the manifest would look like.
  2877  	d, err := Open(dir, &Options{
  2878  		FS: ingestCrashFS{FS: fs},
  2879  	})
  2880  	// A flush here ensures that the file number bumps from creating OPTIONS
  2881  	// files, etc. get recorded in the manifest. We want the nextFileNum after
  2882  	// the restart to match the file number of one of our ingested sstables.
  2883  	require.NoError(t, err)
  2884  	require.NoError(t, d.Set([]byte("boop"), nil, nil))
  2885  	require.NoError(t, d.Flush())
  2886  	for _, f := range files {
  2887  		func() {
  2888  			defer func() { err = recover().(error) }()
  2889  			err = d.Ingest([]string{fs.PathJoin(dir, f)})
  2890  		}()
  2891  		if err == nil || !errors.Is(err, errorfs.ErrInjected) {
  2892  			t.Fatalf("expected injected error, got %v", err)
  2893  		}
  2894  	}
  2895  	// Leave something in the WAL so that Open will flush while replaying the
  2896  	// WAL.
  2897  	require.NoError(t, d.Set([]byte("wal"), nil, nil))
  2898  	require.NoError(t, d.Close())
  2899  
  2900  	// There are now two links to each external file: the original extX link
  2901  	// and a numbered sstable link. The sstable files are still not a part of
  2902  	// the manifest and so they may be overwritten. Open will detect the
  2903  	// obsolete numbered sstables and try to remove them. The FS here is wrapped
  2904  	// to induce errors on Remove calls. Even if we're unsuccessful in
  2905  	// removing the obsolete files, the external files should not be
  2906  	// overwritten.
  2907  	d, err = Open(dir, &Options{FS: noRemoveFS{FS: fs}})
  2908  	require.NoError(t, err)
  2909  	require.NoError(t, d.Set([]byte("bar"), nil, nil))
  2910  	require.NoError(t, d.Flush())
  2911  	require.NoError(t, d.Close())
  2912  
  2913  	// None of the external files should change despite modifying the linked
  2914  	// versions.
  2915  	for i, f := range files {
  2916  		afterBytes := readFile(f)
  2917  		require.Equal(t, fileBytes[i], afterBytes)
  2918  	}
  2919  }
  2920  
  2921  func TestIngest_UpdateSequenceNumber(t *testing.T) {
  2922  	mem := vfs.NewMem()
  2923  	cmp := base.DefaultComparer.Compare
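        	// parse builds an sstable at "ext" from the input, where each line is either
        	// "rangekey: <span>" or "<internal key>:<value>".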
  2924  	parse := func(input string) (*sstable.Writer, error) {
  2925  		f, err := mem.Create("ext")
  2926  		if err != nil {
  2927  			return nil, err
  2928  		}
  2929  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  2930  			TableFormat: sstable.TableFormatMax,
  2931  		})
  2932  		for _, data := range strings.Split(input, "\n") {
  2933  			if strings.HasPrefix(data, "rangekey: ") {
  2934  				data = strings.TrimPrefix(data, "rangekey: ")
  2935  				s := keyspan.ParseSpan(data)
  2936  				err := rangekey.Encode(&s, w.AddRangeKey)
  2937  				if err != nil {
  2938  					return nil, err
  2939  				}
  2940  				continue
  2941  			}
  2942  			j := strings.Index(data, ":")
  2943  			if j < 0 {
  2944  				return nil, errors.Newf("malformed input: %s\n", data)
  2945  			}
  2946  			key := base.ParseInternalKey(data[:j])
  2947  			value := []byte(data[j+1:])
  2948  			if err := w.Add(key, value); err != nil {
  2949  				return nil, err
  2950  			}
  2951  		}
  2952  		return w, nil
  2953  	}
  2954  
  2955  	var (
  2956  		seqnum uint64
  2957  		err    error
  2958  		metas  []*fileMetadata
  2959  	)
  2960  	datadriven.RunTest(t, "testdata/ingest_update_seqnums", func(t *testing.T, td *datadriven.TestData) string {
  2961  		switch td.Cmd {
  2962  		case "starting-seqnum":
  2963  			seqnum, err = strconv.ParseUint(td.Input, 10, 64)
  2964  			if err != nil {
  2965  				return err.Error()
  2966  			}
  2967  			return ""
  2968  
  2969  		case "reset":
  2970  			metas = metas[:0]
  2971  			return ""
  2972  
  2973  		case "load":
  2974  			w, err := parse(td.Input)
  2975  			if err != nil {
  2976  				return err.Error()
  2977  			}
  2978  			if err = w.Close(); err != nil {
  2979  				return err.Error()
  2980  			}
  2981  			defer w.Close()
  2982  
  2983  			// Format the bounds of the table.
  2984  			wm, err := w.Metadata()
  2985  			if err != nil {
  2986  				return err.Error()
  2987  			}
  2988  
  2989  			// Upper bounds for range dels and range keys are expected to be sentinel
  2990  			// keys.
  2991  			maybeUpdateUpperBound := func(key base.InternalKey) base.InternalKey {
  2992  				switch k := key.Kind(); {
  2993  				case k == base.InternalKeyKindRangeDelete:
  2994  					key.Trailer = base.InternalKeyRangeDeleteSentinel
  2995  				case rangekey.IsRangeKey(k):
  2996  					return base.MakeExclusiveSentinelKey(k, key.UserKey)
  2997  				}
  2998  				return key
  2999  			}
  3000  
  3001  			// Construct the file metadata from the writer metadata.
  3002  			m := &fileMetadata{
  3003  				SmallestSeqNum: 0, // Simulate an ingestion.
  3004  				LargestSeqNum:  0,
  3005  			}
  3006  			if wm.HasPointKeys {
  3007  				m.ExtendPointKeyBounds(cmp, wm.SmallestPoint, wm.LargestPoint)
  3008  			}
  3009  			if wm.HasRangeDelKeys {
  3010  				m.ExtendPointKeyBounds(
  3011  					cmp,
  3012  					wm.SmallestRangeDel,
  3013  					maybeUpdateUpperBound(wm.LargestRangeDel),
  3014  				)
  3015  			}
  3016  			if wm.HasRangeKeys {
  3017  				m.ExtendRangeKeyBounds(
  3018  					cmp,
  3019  					wm.SmallestRangeKey,
  3020  					maybeUpdateUpperBound(wm.LargestRangeKey),
  3021  				)
  3022  			}
  3023  			m.InitPhysicalBacking()
  3024  			if err := m.Validate(cmp, base.DefaultFormatter); err != nil {
  3025  				return err.Error()
  3026  			}
  3027  
  3028  			// Collect this file.
  3029  			metas = append(metas, m)
  3030  
  3031  			// Return an index number for the file.
  3032  			return fmt.Sprintf("file %d\n", len(metas)-1)
  3033  
  3034  		case "update-files":
  3035  			// Update the sequence numbers (and thus the bounds) across all files.
  3036  			if err = ingestUpdateSeqNum(cmp, base.DefaultFormatter, seqnum, ingestLoadResult{localMeta: metas}); err != nil {
  3037  				return err.Error()
  3038  			}
  3039  
  3040  			var buf bytes.Buffer
  3041  			for i, m := range metas {
  3042  				fmt.Fprintf(&buf, "file %d:\n", i)
  3043  				fmt.Fprintf(&buf, "  combined: %s-%s\n", m.Smallest, m.Largest)
  3044  				fmt.Fprintf(&buf, "    points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey)
  3045  				fmt.Fprintf(&buf, "    ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey)
  3046  			}
  3047  
  3048  			return buf.String()
  3049  
  3050  		default:
  3051  			return fmt.Sprintf("unknown command %s\n", td.Cmd)
  3052  		}
  3053  	})
  3054  }
  3055  
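        // TestIngestCleanup exercises ingestCleanup's error handling when some of the
        // files to remove are still open or were never created.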
  3056  func TestIngestCleanup(t *testing.T) {
  3057  	fns := []base.FileNum{0, 1, 2}
  3058  
  3059  	testCases := []struct {
  3060  		closeFiles   []base.FileNum
  3061  		cleanupFiles []base.FileNum
  3062  		wantErr      string
  3063  	}{
  3064  		// Close and remove all files.
  3065  		{
  3066  			closeFiles:   fns,
  3067  			cleanupFiles: fns,
  3068  		},
  3069  		// Remove a non-existent file.
  3070  		{
  3071  			closeFiles:   fns,
  3072  			cleanupFiles: []base.FileNum{3},
  3073  			wantErr:      "unknown to the objstorage provider",
  3074  		},
  3075  		// Remove a file that has not been closed.
  3076  		{
  3077  			closeFiles:   []base.FileNum{0, 2},
  3078  			cleanupFiles: fns,
  3079  			wantErr:      oserror.ErrInvalid.Error(),
  3080  		},
  3081  		// Remove all files, one of which is still open, plus a file that does not exist.
  3082  		{
  3083  			closeFiles:   []base.FileNum{0, 2},
  3084  			cleanupFiles: []base.FileNum{0, 1, 2, 3},
  3085  			wantErr:      oserror.ErrInvalid.Error(), // The first error encountered is due to the open file.
  3086  		},
  3087  	}
  3088  
  3089  	for _, tc := range testCases {
  3090  		t.Run("", func(t *testing.T) {
  3091  			mem := vfs.NewMem()
  3092  			mem.UseWindowsSemantics(true)
  3093  			objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(mem, ""))
  3094  			require.NoError(t, err)
  3095  			defer objProvider.Close()
  3096  
  3097  			// Create the files in the VFS.
  3098  			metaMap := make(map[base.FileNum]objstorage.Writable)
  3099  			for _, fn := range fns {
  3100  				w, _, err := objProvider.Create(context.Background(), base.FileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{})
  3101  				require.NoError(t, err)
  3102  
  3103  				metaMap[fn] = w
  3104  			}
  3105  
  3106  			// Close a select number of files.
  3107  			for _, m := range tc.closeFiles {
  3108  				w, ok := metaMap[m]
  3109  				if !ok {
  3110  					continue
  3111  				}
  3112  				require.NoError(t, w.Finish())
  3113  			}
  3114  
  3115  			// Clean up the set of files in the FS.
  3116  			var toRemove []*fileMetadata
  3117  			for _, fn := range tc.cleanupFiles {
  3118  				m := &fileMetadata{FileNum: fn}
  3119  				m.InitPhysicalBacking()
  3120  				toRemove = append(toRemove, m)
  3121  			}
  3122  
  3123  			err = ingestCleanup(objProvider, toRemove)
  3124  			if tc.wantErr != "" {
  3125  				require.Error(t, err, "got no error, expected %s", tc.wantErr)
  3126  				require.Contains(t, err.Error(), tc.wantErr)
  3127  			} else {
  3128  				require.NoError(t, err)
  3129  			}
  3130  		})
  3131  	}
  3132  }
  3133  
  3134  // fatalCapturingLogger captures a fatal error instead of panicking.
  3135  type fatalCapturingLogger struct {
  3136  	t   testing.TB
  3137  	err error
  3138  }
  3139  
  3140  // Infof implements the Logger interface.
  3141  func (l *fatalCapturingLogger) Infof(fmt string, args ...interface{}) {
  3142  	l.t.Logf(fmt, args...)
  3143  }
  3144  
  3145  // Fatalf implements the Logger interface.
  3146  func (l *fatalCapturingLogger) Fatalf(_ string, args ...interface{}) {
  3147  	l.err = args[0].(error)
  3148  }
  3149  
  3150  func TestIngestValidation(t *testing.T) {
  3151  	type keyVal struct {
  3152  		key, val []byte
  3153  	}
  3154  	type corruptionLocation int
  3155  	const (
  3156  		corruptionLocationNone corruptionLocation = iota
  3157  		corruptionLocationStart
  3158  		corruptionLocationEnd
  3159  		corruptionLocationInternal
  3160  	)
  3161  	type errLocation int
  3162  	const (
  3163  		errLocationNone errLocation = iota
  3164  		errLocationIngest
  3165  		errLocationValidation
  3166  	)
  3167  	const (
  3168  		nKeys     = 1_000
  3169  		keySize   = 16
  3170  		valSize   = 100
  3171  		blockSize = 100
  3172  
  3173  		ingestTableName = "ext"
  3174  	)
  3175  	ingestPath := filepath.Join(t.TempDir(), ingestTableName)
  3176  
  3177  	seed := uint64(time.Now().UnixNano())
  3178  	rng := rand.New(rand.NewSource(seed))
  3179  	t.Logf("rng seed = %d", seed)
  3180  
  3181  	testCases := []struct {
  3182  		description string
  3183  		cLoc        corruptionLocation
  3184  		wantErrType errLocation
  3185  	}{
  3186  		{
  3187  			description: "no corruption",
  3188  			cLoc:        corruptionLocationNone,
  3189  			wantErrType: errLocationNone,
  3190  		},
  3191  		{
  3192  			description: "start block",
  3193  			cLoc:        corruptionLocationStart,
  3194  			wantErrType: errLocationIngest,
  3195  		},
  3196  		{
  3197  			description: "end block",
  3198  			cLoc:        corruptionLocationEnd,
  3199  			wantErrType: errLocationIngest,
  3200  		},
  3201  		{
  3202  			description: "non-end block",
  3203  			cLoc:        corruptionLocationInternal,
  3204  			wantErrType: errLocationValidation,
  3205  		},
  3206  	}
  3207  
  3208  	for _, tc := range testCases {
  3209  		t.Run(tc.description, func(t *testing.T) {
  3210  			var wg sync.WaitGroup
  3211  			wg.Add(1)
  3212  
  3213  			fs := vfs.NewMem()
  3214  			logger := &fatalCapturingLogger{t: t}
  3215  			opts := &Options{
  3216  				FS:     fs,
  3217  				Logger: logger,
  3218  				EventListener: &EventListener{
  3219  					TableValidated: func(i TableValidatedInfo) {
  3220  						wg.Done()
  3221  					},
  3222  				},
  3223  			}
  3224  			opts.Experimental.ValidateOnIngest = true
  3225  			d, err := Open("", opts)
  3226  			require.NoError(t, err)
  3227  			defer func() { require.NoError(t, d.Close()) }()
  3228  
  3229  			corrupt := func(f vfs.File) {
  3230  				readable, err := sstable.NewSimpleReadable(f)
  3231  				require.NoError(t, err)
  3232  				// Compute the layout of the sstable in order to find the
  3233  				// appropriate block locations to corrupt.
  3234  				r, err := sstable.NewReader(readable, sstable.ReaderOptions{})
  3235  				require.NoError(t, err)
  3236  				l, err := r.Layout()
  3237  				require.NoError(t, err)
  3238  				require.NoError(t, r.Close())
  3239  
  3240  				// Select an appropriate data block to corrupt.
  3241  				var blockIdx int
  3242  				switch tc.cLoc {
  3243  				case corruptionLocationStart:
  3244  					blockIdx = 0
  3245  				case corruptionLocationEnd:
  3246  					blockIdx = len(l.Data) - 1
  3247  				case corruptionLocationInternal:
  3248  					blockIdx = 1 + rng.Intn(len(l.Data)-2)
  3249  				default:
  3250  					t.Fatalf("unknown corruptionLocation: %T", tc.cLoc)
  3251  				}
  3252  				bh := l.Data[blockIdx]
  3253  
  3254  				osF, err := os.OpenFile(ingestPath, os.O_RDWR, 0600)
  3255  				require.NoError(t, err)
  3256  				defer func() { require.NoError(t, osF.Close()) }()
  3257  
  3258  				// Corrupting a key will cause the ingestion to fail due to a
  3259  				// malformed key, rather than a block checksum mismatch.
  3260  				// Instead, we corrupt the last byte in the selected block,
  3261  				// before the trailer, which corresponds to a value.
  3262  				offset := bh.Offset + bh.Length - 1
  3263  				_, err = osF.WriteAt([]byte("\xff"), int64(offset))
  3264  				require.NoError(t, err)
  3265  			}
  3266  
  3267  			type errT struct {
  3268  				errLoc errLocation
  3269  				err    error
  3270  			}
  3271  			runIngest := func(keyVals []keyVal) (et errT) {
  3272  				// The vfs.File does not allow for random reads and writes.
  3273  				// Create a disk-backed file outside of the DB FS that we can
  3274  				// open as a regular os.File, if required.
  3275  				tmpFS := vfs.Default
  3276  				f, err := tmpFS.Create(ingestPath)
  3277  				require.NoError(t, err)
  3278  				defer func() { _ = tmpFS.Remove(ingestPath) }()
  3279  
  3280  				w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{
  3281  					BlockSize:   blockSize,     // Create many smaller blocks.
  3282  					Compression: NoCompression, // For simpler debugging.
  3283  				})
  3284  				for _, kv := range keyVals {
  3285  					require.NoError(t, w.Set(kv.key, kv.val))
  3286  				}
  3287  				require.NoError(t, w.Close())
  3288  
  3289  				// Possibly corrupt the file.
  3290  				if tc.cLoc != corruptionLocationNone {
  3291  					f, err = tmpFS.Open(ingestPath)
  3292  					require.NoError(t, err)
  3293  					corrupt(f)
  3294  				}
  3295  
  3296  				// Copy the file into the DB's FS.
  3297  				_, err = vfs.Clone(tmpFS, fs, ingestPath, ingestTableName)
  3298  				require.NoError(t, err)
  3299  
  3300  				// Ingest the external table.
  3301  				err = d.Ingest([]string{ingestTableName})
  3302  				if err != nil {
  3303  					et.errLoc = errLocationIngest
  3304  					et.err = err
  3305  					return
  3306  				}
  3307  
  3308  				// Wait for the validation on the sstable to complete.
  3309  				wg.Wait()
  3310  
  3311  				// Return any error encountered during validation.
  3312  				if logger.err != nil {
  3313  					et.errLoc = errLocationValidation
  3314  					et.err = logger.err
  3315  				}
  3316  
  3317  				return
  3318  			}
  3319  
  3320  			// Construct a set of keys to ingest.
  3321  			var keyVals []keyVal
  3322  			for i := 0; i < nKeys; i++ {
  3323  				key := make([]byte, keySize)
  3324  				_, err = rng.Read(key)
  3325  				require.NoError(t, err)
  3326  
  3327  				val := make([]byte, valSize)
  3328  				_, err = rng.Read(val)
  3329  				require.NoError(t, err)
  3330  
  3331  				keyVals = append(keyVals, keyVal{key, val})
  3332  			}
  3333  
  3334  			// Keys must be sorted.
  3335  			sort.Slice(keyVals, func(i, j int) bool {
  3336  				return d.cmp(keyVals[i].key, keyVals[j].key) < 0
  3337  			})
  3338  
  3339  			// Run the ingestion.
  3340  			et := runIngest(keyVals)
  3341  
  3342  			// Assert we saw the errors we expect.
  3343  			switch tc.wantErrType {
  3344  			case errLocationNone:
  3345  				require.Equal(t, errLocationNone, et.errLoc)
  3346  				require.NoError(t, et.err)
  3347  			case errLocationIngest:
  3348  				require.Equal(t, errLocationIngest, et.errLoc)
  3349  				require.Error(t, et.err)
  3350  				require.True(t, errors.Is(et.err, base.ErrCorruption))
  3351  			case errLocationValidation:
  3352  				require.Equal(t, errLocationValidation, et.errLoc)
  3353  				require.Error(t, et.err)
  3354  				require.True(t, errors.Is(et.err, base.ErrCorruption))
  3355  			default:
  3356  				t.Fatalf("unknown wantErrType %T", tc.wantErrType)
  3357  			}
  3358  		})
  3359  	}
  3360  }
  3361  
  3362  // BenchmarkManySSTables measures the cost of various operations with various
  3363  // counts of SSTables within the database.
  3364  func BenchmarkManySSTables(b *testing.B) {
  3365  	counts := []int{10, 1_000, 10_000, 100_000, 1_000_000}
  3366  	ops := []string{"ingest", "calculateInuseKeyRanges"}
  3367  	for _, op := range ops {
  3368  		b.Run(op, func(b *testing.B) {
  3369  			for _, count := range counts {
  3370  				b.Run(fmt.Sprintf("sstables=%d", count), func(b *testing.B) {
  3371  					mem := vfs.NewMem()
  3372  					d, err := Open("", &Options{
  3373  						FS: mem,
  3374  					})
  3375  					require.NoError(b, err)
  3376  
  3377  					var paths []string
  3378  					for i := 0; i < count; i++ {
  3379  						n := fmt.Sprintf("%07d", i)
  3380  						f, err := mem.Create(n)
  3381  						require.NoError(b, err)
  3382  						w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3383  						require.NoError(b, w.Set([]byte(n), nil))
  3384  						require.NoError(b, w.Close())
  3385  						paths = append(paths, n)
  3386  					}
  3387  					require.NoError(b, d.Ingest(paths))
  3388  
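        					// Ingest one sstable spanning the keyspace of all the tables above
        					// so that it is placed in a level above them, leaving one broad
        					// file over many narrow files.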
  3389  					{
  3390  						const broadIngest = "broad.sst"
  3391  						f, err := mem.Create(broadIngest)
  3392  						require.NoError(b, err)
  3393  						w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3394  						require.NoError(b, w.Set([]byte("0"), nil))
  3395  						require.NoError(b, w.Set([]byte("Z"), nil))
  3396  						require.NoError(b, w.Close())
  3397  						require.NoError(b, d.Ingest([]string{broadIngest}))
  3398  					}
  3399  
  3400  					switch op {
  3401  					case "ingest":
  3402  						runBenchmarkManySSTablesIngest(b, d, mem, count)
  3403  					case "calculateInuseKeyRanges":
  3404  						runBenchmarkManySSTablesInUseKeyRanges(b, d, count)
  3405  					}
  3406  					require.NoError(b, d.Close())
  3407  				})
  3408  			}
  3409  		})
  3410  	}
  3411  }
  3412  
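        // runBenchmarkManySSTablesIngest measures the cost of ingesting one
        // additional single-key sstable into a database that already contains
        // count sstables.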
  3413  func runBenchmarkManySSTablesIngest(b *testing.B, d *DB, fs vfs.FS, count int) {
  3414  	b.ResetTimer()
  3415  	for i := 0; i < b.N; i++ {
  3416  		n := fmt.Sprintf("%07d", count+i)
  3417  		f, err := fs.Create(n)
  3418  		require.NoError(b, err)
  3419  		w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
  3420  		require.NoError(b, w.Set([]byte(n), nil))
  3421  		require.NoError(b, w.Close())
  3422  		require.NoError(b, d.Ingest([]string{n}))
  3423  	}
  3424  }
  3425  
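        // runBenchmarkManySSTablesInUseKeyRanges measures calculateInuseKeyRanges
        // across the full breadth of the keyspace in the version constructed by
        // BenchmarkManySSTables.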
  3426  func runBenchmarkManySSTablesInUseKeyRanges(b *testing.B, d *DB, count int) {
  3427  	// This benchmark is pretty contrived, but it's not easy to write a
  3428  	// microbenchmark for this in a more natural way. L6 has many files, and
  3429  	// L5 has 1 file spanning the entire breadth of L5.
  3430  	d.mu.Lock()
  3431  	defer d.mu.Unlock()
  3432  	v := d.mu.versions.currentVersion()
  3433  	b.ResetTimer()
  3434  
  3435  	smallest := []byte("0")
  3436  	largest := []byte("z")
  3437  	for i := 0; i < b.N; i++ {
  3438  		_ = calculateInuseKeyRanges(v, d.cmp, 0, numLevels-1, smallest, largest)
  3439  	}
  3440  }