github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/reader_test.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"encoding/binary"
    11  	"fmt"
    12  	"io"
    13  	"math"
    14  	"os"
    15  	"path"
    16  	"path/filepath"
    17  	"strings"
    18  	"testing"
    19  	"time"
    20  
    21  	"github.com/cockroachdb/datadriven"
    22  	"github.com/cockroachdb/errors"
    23  	"github.com/cockroachdb/pebble/bloom"
    24  	"github.com/cockroachdb/pebble/internal/base"
    25  	"github.com/cockroachdb/pebble/internal/cache"
    26  	"github.com/cockroachdb/pebble/internal/humanize"
    27  	"github.com/cockroachdb/pebble/internal/manifest"
    28  	"github.com/cockroachdb/pebble/internal/testkeys"
    29  	"github.com/cockroachdb/pebble/objstorage"
    30  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
    31  	"github.com/cockroachdb/pebble/vfs"
    32  	"github.com/cockroachdb/pebble/vfs/errorfs"
    33  	"github.com/stretchr/testify/require"
    34  	"golang.org/x/exp/rand"
    35  )
    36  
    37  // get is a testing helper that simulates a read and helps verify bloom filters
    38  // until they are available through iterators.
    39  func (r *Reader) get(key []byte) (value []byte, err error) {
    40  	if r.err != nil {
    41  		return nil, r.err
    42  	}
    43  
    44  	if r.tableFilter != nil {
    45  		dataH, err := r.readFilter(context.Background(), nil /* stats */, nil)
    46  		if err != nil {
    47  			return nil, err
    48  		}
    49  		var lookupKey []byte
    50  		if r.Split != nil {
    51  			lookupKey = key[:r.Split(key)]
    52  		} else {
    53  			lookupKey = key
    54  		}
    55  		mayContain := r.tableFilter.mayContain(dataH.Get(), lookupKey)
    56  		dataH.Release()
    57  		if !mayContain {
    58  			return nil, base.ErrNotFound
    59  		}
    60  	}
    61  
    62  	i, err := r.NewIter(nil /* lower */, nil /* upper */)
    63  	if err != nil {
    64  		return nil, err
    65  	}
    66  	var v base.LazyValue
    67  	ikey, v := i.SeekGE(key, base.SeekGEFlagsNone)
    68  	value, _, err = v.Value(nil)
    69  	if err != nil {
    70  		return nil, err
    71  	}
    72  
    73  	if ikey == nil || r.Compare(key, ikey.UserKey) != 0 {
    74  		err := i.Close()
    75  		if err == nil {
    76  			err = base.ErrNotFound
    77  		}
    78  		return nil, err
    79  	}
    80  
    81  	// The value will be "freed" when the iterator is closed, so make a copy
    82  	// which will outlast the lifetime of the iterator.
    83  	newValue := make([]byte, len(value))
    84  	copy(newValue, value)
    85  	if err := i.Close(); err != nil {
    86  		return nil, err
    87  	}
    88  	return newValue, nil
    89  }
    90  
// iterAdapter adapts the new Iterator API which returns the key and value from
// positioning methods (Seek*, First, Last, Next, Prev) to the old API which
// returned a boolean corresponding to Valid. Only used by test code.
type iterAdapter struct {
	Iterator
	// key is the internal key from the most recent positioning call; nil when
	// the iterator is exhausted or invalid.
	key *InternalKey
	// val is the materialized value corresponding to key.
	val []byte
}
    99  
   100  func newIterAdapter(iter Iterator) *iterAdapter {
   101  	return &iterAdapter{
   102  		Iterator: iter,
   103  	}
   104  }
   105  
   106  func (i *iterAdapter) update(key *InternalKey, val base.LazyValue) bool {
   107  	i.key = key
   108  	if v, _, err := val.Value(nil); err != nil {
   109  		i.key = nil
   110  		i.val = nil
   111  	} else {
   112  		i.val = v
   113  	}
   114  	return i.key != nil
   115  }
   116  
   117  func (i *iterAdapter) String() string {
   118  	return "iter-adapter"
   119  }
   120  
   121  func (i *iterAdapter) SeekGE(key []byte, flags base.SeekGEFlags) bool {
   122  	return i.update(i.Iterator.SeekGE(key, flags))
   123  }
   124  
   125  func (i *iterAdapter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) bool {
   126  	return i.update(i.Iterator.SeekPrefixGE(prefix, key, flags))
   127  }
   128  
   129  func (i *iterAdapter) SeekLT(key []byte, flags base.SeekLTFlags) bool {
   130  	return i.update(i.Iterator.SeekLT(key, flags))
   131  }
   132  
   133  func (i *iterAdapter) First() bool {
   134  	return i.update(i.Iterator.First())
   135  }
   136  
   137  func (i *iterAdapter) Last() bool {
   138  	return i.update(i.Iterator.Last())
   139  }
   140  
   141  func (i *iterAdapter) Next() bool {
   142  	return i.update(i.Iterator.Next())
   143  }
   144  
   145  func (i *iterAdapter) NextPrefix(succKey []byte) bool {
   146  	return i.update(i.Iterator.NextPrefix(succKey))
   147  }
   148  
   149  func (i *iterAdapter) NextIgnoreResult() {
   150  	i.Iterator.Next()
   151  	i.update(nil, base.LazyValue{})
   152  }
   153  
   154  func (i *iterAdapter) Prev() bool {
   155  	return i.update(i.Iterator.Prev())
   156  }
   157  
   158  func (i *iterAdapter) Key() *InternalKey {
   159  	return i.key
   160  }
   161  
   162  func (i *iterAdapter) Value() []byte {
   163  	return i.val
   164  }
   165  
   166  func (i *iterAdapter) Valid() bool {
   167  	return i.key != nil
   168  }
   169  
   170  func (i *iterAdapter) SetBounds(lower, upper []byte) {
   171  	i.Iterator.SetBounds(lower, upper)
   172  	i.key = nil
   173  }
   174  
   175  func (i *iterAdapter) SetContext(ctx context.Context) {
   176  	i.Iterator.SetContext(ctx)
   177  }
   178  
// TestVirtualReader exercises VirtualReader through a datadriven test: a
// "build" command writes a physical sstable, "virtualize" wraps it with
// virtual bounds, and subsequent commands (citer, constrain, scan-range-del,
// scan-range-key, iter) operate on the virtual view.
func TestVirtualReader(t *testing.T) {
	// A faux filenum used to create fake filemetadata for testing.
	var fileNum int = 1
	nextFileNum := func() base.FileNum {
		fileNum++
		return base.FileNum(fileNum - 1)
	}

	// Set during the latest build command.
	var r *Reader
	var meta manifest.PhysicalFileMeta
	var bp BufferPool

	// Set during the latest virtualize command.
	var vMeta1 manifest.VirtualFileMeta
	var v VirtualReader

	// Release resources left over from the final build command.
	defer func() {
		if r != nil {
			require.NoError(t, r.Close())
			bp.Release()
		}
	}()

	// createPhysicalMeta constructs and validates a FileMetadata describing
	// the physical sstable produced by the writer.
	createPhysicalMeta := func(w *WriterMetadata, r *Reader) (manifest.PhysicalFileMeta, error) {
		meta := &manifest.FileMetadata{}
		meta.FileNum = nextFileNum()
		meta.CreationTime = time.Now().Unix()
		meta.Size = w.Size
		meta.SmallestSeqNum = w.SmallestSeqNum
		meta.LargestSeqNum = w.LargestSeqNum

		if w.HasPointKeys {
			meta.ExtendPointKeyBounds(r.Compare, w.SmallestPoint, w.LargestPoint)
		}
		if w.HasRangeDelKeys {
			// Range deletions contribute to the point-key bounds.
			meta.ExtendPointKeyBounds(r.Compare, w.SmallestRangeDel, w.LargestRangeDel)
		}
		if w.HasRangeKeys {
			meta.ExtendRangeKeyBounds(r.Compare, w.SmallestRangeKey, w.LargestRangeKey)
		}
		meta.InitPhysicalBacking()

		if err := meta.Validate(r.Compare, r.opts.Comparer.FormatKey); err != nil {
			return manifest.PhysicalFileMeta{}, err
		}

		return meta.PhysicalMeta(), nil
	}

	// formatWMeta renders the writer metadata's key bounds and seqnum range
	// for datadriven output comparison.
	formatWMeta := func(m *WriterMetadata) string {
		var b bytes.Buffer
		if m.HasPointKeys {
			fmt.Fprintf(&b, "point:    [%s-%s]\n", m.SmallestPoint, m.LargestPoint)
		}
		if m.HasRangeDelKeys {
			fmt.Fprintf(&b, "rangedel: [%s-%s]\n", m.SmallestRangeDel, m.LargestRangeDel)
		}
		if m.HasRangeKeys {
			fmt.Fprintf(&b, "rangekey: [%s-%s]\n", m.SmallestRangeKey, m.LargestRangeKey)
		}
		fmt.Fprintf(&b, "seqnums:  [%d-%d]\n", m.SmallestSeqNum, m.LargestSeqNum)
		return b.String()
	}

	// formatVirtualReader renders the virtual reader's bounds, file number,
	// and extrapolated properties for datadriven output comparison.
	formatVirtualReader := func(v *VirtualReader) string {
		var b bytes.Buffer
		fmt.Fprintf(&b, "bounds:  [%s-%s]\n", v.vState.lower, v.vState.upper)
		fmt.Fprintf(&b, "filenum: %s\n", v.vState.fileNum.String())
		fmt.Fprintf(
			&b, "props: %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d\n",
			"NumEntries",
			v.Properties.NumEntries,
			"RawKeySize",
			v.Properties.RawKeySize,
			"RawValueSize",
			v.Properties.RawValueSize,
			"RawPointTombstoneKeySize",
			v.Properties.RawPointTombstoneKeySize,
			"RawPointTombstoneValueSize",
			v.Properties.RawPointTombstoneValueSize,
			"NumSizedDeletions",
			v.Properties.NumSizedDeletions,
			"NumDeletions",
			v.Properties.NumDeletions,
			"NumRangeDeletions",
			v.Properties.NumRangeDeletions,
			"NumRangeKeyDels",
			v.Properties.NumRangeKeyDels,
			"NumRangeKeySets",
			v.Properties.NumRangeKeySets,
			"ValueBlocksSize",
			v.Properties.ValueBlocksSize,
		)
		return b.String()
	}

	datadriven.RunTest(t, "testdata/virtual_reader", func(t *testing.T, td *datadriven.TestData) string {
		switch td.Cmd {
		case "build":
			// Tear down any state from a previous build before rebuilding.
			if r != nil {
				bp.Release()
				_ = r.Close()
				r = nil
				meta.FileMetadata = nil
				vMeta1.FileMetadata = nil
				v = VirtualReader{}
			}
			var wMeta *WriterMetadata
			var err error
			writerOpts := &WriterOptions{
				TableFormat: TableFormatMax,
			}
			// Use a single level index by default.
			writerOpts.IndexBlockSize = 100000
			if len(td.CmdArgs) == 1 {
				if td.CmdArgs[0].String() == "twoLevel" {
					// Force a two level index.
					writerOpts.IndexBlockSize = 1
					writerOpts.BlockSize = 1
				}
			}
			wMeta, r, err = runBuildCmd(td, writerOpts, 0)
			if err != nil {
				return err.Error()
			}
			bp.Init(5)

			// Create a fake filemetada using the writer meta.
			meta, err = createPhysicalMeta(wMeta, r)
			if err != nil {
				return err.Error()
			}
			r.fileNum = meta.FileBacking.DiskFileNum
			return formatWMeta(wMeta)

		case "virtualize":
			// virtualize will split the previously built physical sstable into
			// a single sstable with virtual bounds. The command assumes that
			// the bounds for the virtual sstable are valid. For the purposes of
			// this command the bounds must be valid keys. In general, and for
			// this command, range key/range del spans must also not span across
			// virtual sstable bounds.
			if meta.FileMetadata == nil {
				return "build must be called at least once before virtualize"
			}
			if vMeta1.FileMetadata != nil {
				vMeta1.FileMetadata = nil
				v = VirtualReader{}
			}
			vMeta := &manifest.FileMetadata{
				FileBacking:    meta.FileBacking,
				SmallestSeqNum: meta.SmallestSeqNum,
				LargestSeqNum:  meta.LargestSeqNum,
				Virtual:        true,
			}
			// Parse the virtualization bounds.
			bounds := strings.Split(td.CmdArgs[0].String(), "-")
			vMeta.Smallest = base.ParseInternalKey(bounds[0])
			vMeta.Largest = base.ParseInternalKey(bounds[1])
			vMeta.FileNum = nextFileNum()
			var err error
			vMeta.Size, err = r.EstimateDiskUsage(vMeta.Smallest.UserKey, vMeta.Largest.UserKey)
			if err != nil {
				return err.Error()
			}
			vMeta.ValidateVirtual(meta.FileMetadata)

			vMeta1 = vMeta.VirtualMeta()
			v = MakeVirtualReader(r, vMeta1, false /* isForeign */)
			return formatVirtualReader(&v)

		case "citer":
			// Creates a compaction iterator from the virtual reader, and then
			// just scans the keyspace. Which is all a compaction iterator is
			// used for. This tests the First and Next calls.
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before creating compaction iters"
			}

			var rp ReaderProvider
			var bytesIterated uint64
			iter, err := v.NewCompactionIter(&bytesIterated, CategoryAndQoS{}, nil, rp, &bp)
			if err != nil {
				return err.Error()
			}

			var buf bytes.Buffer
			for key, val := iter.First(); key != nil; key, val = iter.Next() {
				fmt.Fprintf(&buf, "%s:%s\n", key.String(), val.InPlaceValue())
			}
			err = iter.Close()
			if err != nil {
				return err.Error()
			}
			return buf.String()

		case "constrain":
			// Exercises constrainBounds: clamps the given [first,last] range
			// to the virtual sstable's bounds.
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before constrain"
			}
			splits := strings.Split(td.CmdArgs[0].String(), ",")
			of, ol := []byte(splits[0]), []byte(splits[1])
			inclusive, f, l := v.vState.constrainBounds(of, ol, splits[2] == "true")
			var buf bytes.Buffer
			buf.Write(f)
			buf.WriteByte(',')
			buf.Write(l)
			buf.WriteByte(',')
			if inclusive {
				buf.WriteString("true")
			} else {
				buf.WriteString("false")
			}
			buf.WriteByte('\n')
			return buf.String()

		case "scan-range-del":
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before scan-range-del"
			}
			iter, err := v.NewRawRangeDelIter()
			if err != nil {
				return err.Error()
			}
			if iter == nil {
				return ""
			}
			defer iter.Close()

			var buf bytes.Buffer
			for s := iter.First(); s != nil; s = iter.Next() {
				fmt.Fprintf(&buf, "%s\n", s)
			}
			return buf.String()

		case "scan-range-key":
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before scan-range-key"
			}
			iter, err := v.NewRawRangeKeyIter()
			if err != nil {
				return err.Error()
			}
			if iter == nil {
				return ""
			}
			defer iter.Close()

			var buf bytes.Buffer
			for s := iter.First(); s != nil; s = iter.Next() {
				fmt.Fprintf(&buf, "%s\n", s)
			}
			return buf.String()

		case "iter":
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before iter"
			}
			// Optional first argument supplies iteration bounds as lower-upper.
			var lower, upper []byte
			if len(td.CmdArgs) > 0 {
				splits := strings.Split(td.CmdArgs[0].String(), "-")
				lower, upper = []byte(splits[0]), []byte(splits[1])
			}

			var stats base.InternalIteratorStats
			iter, err := v.NewIterWithBlockPropertyFiltersAndContextEtc(
				context.Background(), lower, upper, nil, false, false,
				&stats, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r})
			if err != nil {
				return err.Error()
			}
			return runIterCmd(td, iter, true, runIterCmdStats(&stats))

		default:
			return fmt.Sprintf("unknown command: %s", td.Cmd)
		}
	})
}
   458  
   459  func TestReader(t *testing.T) {
   460  	writerOpts := map[string]WriterOptions{
   461  		// No bloom filters.
   462  		"default": {},
   463  		"bloom10bit": {
   464  			// The standard policy.
   465  			FilterPolicy: bloom.FilterPolicy(10),
   466  			FilterType:   base.TableFilter,
   467  		},
   468  		"bloom1bit": {
   469  			// A policy with many false positives.
   470  			FilterPolicy: bloom.FilterPolicy(1),
   471  			FilterType:   base.TableFilter,
   472  		},
   473  		"bloom100bit": {
   474  			// A policy unlikely to have false positives.
   475  			FilterPolicy: bloom.FilterPolicy(100),
   476  			FilterType:   base.TableFilter,
   477  		},
   478  	}
   479  
   480  	blockSizes := map[string]int{
   481  		"1bytes":   1,
   482  		"5bytes":   5,
   483  		"10bytes":  10,
   484  		"25bytes":  25,
   485  		"Maxbytes": math.MaxInt32,
   486  	}
   487  
   488  	opts := map[string]*Comparer{
   489  		"default":      testkeys.Comparer,
   490  		"prefixFilter": fixtureComparer,
   491  	}
   492  
   493  	testDirs := map[string]string{
   494  		"default":      "testdata/reader",
   495  		"prefixFilter": "testdata/prefixreader",
   496  	}
   497  
   498  	for format := TableFormatPebblev2; format <= TableFormatMax; format++ {
   499  		for dName, blockSize := range blockSizes {
   500  			for iName, indexBlockSize := range blockSizes {
   501  				for lName, tableOpt := range writerOpts {
   502  					for oName, cmp := range opts {
   503  						tableOpt.BlockSize = blockSize
   504  						tableOpt.Comparer = cmp
   505  						tableOpt.IndexBlockSize = indexBlockSize
   506  						tableOpt.TableFormat = format
   507  
   508  						t.Run(
   509  							fmt.Sprintf("format=%d,opts=%s,writerOpts=%s,blockSize=%s,indexSize=%s",
   510  								format, oName, lName, dName, iName),
   511  							func(t *testing.T) {
   512  								runTestReader(
   513  									t, tableOpt, testDirs[oName], nil /* Reader */, true)
   514  							})
   515  					}
   516  				}
   517  			}
   518  		}
   519  	}
   520  }
   521  
   522  func TestReaderHideObsolete(t *testing.T) {
   523  	blockSizes := map[string]int{
   524  		"1bytes":   1,
   525  		"5bytes":   5,
   526  		"10bytes":  10,
   527  		"25bytes":  25,
   528  		"Maxbytes": math.MaxInt32,
   529  	}
   530  	for dName, blockSize := range blockSizes {
   531  		opts := WriterOptions{
   532  			TableFormat:    TableFormatPebblev4,
   533  			BlockSize:      blockSize,
   534  			IndexBlockSize: blockSize,
   535  			Comparer:       testkeys.Comparer,
   536  		}
   537  		t.Run(fmt.Sprintf("blockSize=%s", dName), func(t *testing.T) {
   538  			runTestReader(
   539  				t, opts, "testdata/reader_hide_obsolete",
   540  				nil /* Reader */, true)
   541  		})
   542  	}
   543  }
   544  
   545  func TestHamletReader(t *testing.T) {
   546  	for _, fixture := range TestFixtures {
   547  		f, err := os.Open(filepath.Join("testdata", fixture.Filename))
   548  		require.NoError(t, err)
   549  
   550  		r, err := newReader(f, ReaderOptions{})
   551  		require.NoError(t, err)
   552  
   553  		t.Run(
   554  			fmt.Sprintf("sst=%s", fixture.Filename),
   555  			func(t *testing.T) {
   556  				runTestReader(t, WriterOptions{}, "testdata/hamletreader", r, false)
   557  			},
   558  		)
   559  	}
   560  }
   561  
   562  func forEveryTableFormat[I any](
   563  	t *testing.T, formatTable [NumTableFormats]I, runTest func(*testing.T, TableFormat, I),
   564  ) {
   565  	t.Helper()
   566  	for tf := TableFormatUnspecified + 1; tf <= TableFormatMax; tf++ {
   567  		t.Run(tf.String(), func(t *testing.T) {
   568  			runTest(t, tf, formatTable[tf])
   569  		})
   570  	}
   571  }
   572  
   573  func TestReaderStats(t *testing.T) {
   574  	forEveryTableFormat[string](t,
   575  		[NumTableFormats]string{
   576  			TableFormatUnspecified: "",
   577  			TableFormatLevelDB:     "testdata/readerstats_LevelDB",
   578  			TableFormatRocksDBv2:   "testdata/readerstats_LevelDB",
   579  			TableFormatPebblev1:    "testdata/readerstats_LevelDB",
   580  			TableFormatPebblev2:    "testdata/readerstats_LevelDB",
   581  			TableFormatPebblev3:    "testdata/readerstats_Pebblev3",
   582  			TableFormatPebblev4:    "testdata/readerstats_Pebblev3",
   583  		}, func(t *testing.T, format TableFormat, dir string) {
   584  			if dir == "" {
   585  				t.Skip()
   586  			}
   587  			writerOpt := WriterOptions{
   588  				BlockSize:      32 << 10,
   589  				IndexBlockSize: 32 << 10,
   590  				Comparer:       testkeys.Comparer,
   591  				TableFormat:    format,
   592  			}
   593  			runTestReader(t, writerOpt, dir, nil /* Reader */, false /* printValue */)
   594  		})
   595  }
   596  
// TestReaderWithBlockPropertyFilter runs the block-property-filter reader
// tests per table format; formats that predate block properties are skipped.
func TestReaderWithBlockPropertyFilter(t *testing.T) {
	// Some of these tests examine internal iterator state, so they require
	// determinism. When the invariants tag is set, disableBoundsOpt may disable
	// the bounds optimization depending on the iterator pointer address. This
	// can add nondeterminism to the internal iterator state. Disable this
	// nondeterminism for the duration of this test.
	ensureBoundsOptDeterminism = true
	defer func() { ensureBoundsOptDeterminism = false }()

	forEveryTableFormat[string](t,
		[NumTableFormats]string{
			TableFormatUnspecified: "", // Block properties unsupported
			TableFormatLevelDB:     "", // Block properties unsupported
			TableFormatRocksDBv2:   "", // Block properties unsupported
			TableFormatPebblev1:    "", // Block properties unsupported
			TableFormatPebblev2:    "testdata/reader_bpf/Pebblev2",
			TableFormatPebblev3:    "testdata/reader_bpf/Pebblev3",
			TableFormatPebblev4:    "testdata/reader_bpf/Pebblev3",
		}, func(t *testing.T, format TableFormat, dir string) {
			if dir == "" {
				t.Skip("Block-properties unsupported")
			}
			writerOpt := WriterOptions{
				Comparer:                testkeys.Comparer,
				TableFormat:             format,
				BlockPropertyCollectors: []func() BlockPropertyCollector{NewTestKeysBlockPropertyCollector},
			}
			runTestReader(t, writerOpt, dir, nil /* Reader */, false)
		})
}
   627  
// TestInjectedErrors opens each fixture sstable through an error-injecting
// filesystem and verifies that an error injected at any read index surfaces
// cleanly (as errorfs.ErrInjected) rather than corrupting behavior. The
// injection index is advanced until a full error-free pass succeeds.
func TestInjectedErrors(t *testing.T) {
	for _, fixture := range TestFixtures {
		// run performs one full open/scan pass with an error injected at the
		// i'th read operation. Close errors are folded into the result via
		// the deferred firstError calls.
		run := func(i int) (reterr error) {
			f, err := vfs.Default.Open(filepath.Join("testdata", fixture.Filename))
			require.NoError(t, err)

			r, err := newReader(errorfs.WrapFile(f, errorfs.ErrInjected.If(errorfs.OnIndex(int32(i)))), ReaderOptions{})
			if err != nil {
				return firstError(err, f.Close())
			}
			defer func() { reterr = firstError(reterr, r.Close()) }()

			_, err = r.EstimateDiskUsage([]byte("borrower"), []byte("lender"))
			if err != nil {
				return err
			}

			iter, err := r.NewIter(nil, nil)
			if err != nil {
				return err
			}
			defer func() { reterr = firstError(reterr, iter.Close()) }()
			for k, v := iter.First(); k != nil; k, v = iter.Next() {
				val, _, err := v.Value(nil)
				if err != nil {
					return err
				}
				if val == nil {
					break
				}
			}
			if err = iter.Error(); err != nil {
				return err
			}
			return nil
		}
		// Keep advancing the injection point; injected errors are expected,
		// any other error is a test failure, and a clean pass ends the loop.
		for i := 0; ; i++ {
			err := run(i)
			if errors.Is(err, errorfs.ErrInjected) {
				t.Logf("%q, index %d: %s", fixture.Filename, i, err)
				continue
			}
			if err != nil {
				t.Errorf("%q, index %d: non-injected error: %+v", fixture.Filename, i, err)
				break
			}
			t.Logf("%q: no error at index %d", fixture.Filename, i)
			break
		}
	}
}
   679  
   680  func TestInvalidReader(t *testing.T) {
   681  	invalid, err := NewSimpleReadable(vfs.NewMemFile([]byte("invalid sst bytes")))
   682  	if err != nil {
   683  		t.Fatal(err)
   684  	}
   685  	testCases := []struct {
   686  		readable objstorage.Readable
   687  		expected string
   688  	}{
   689  		{nil, "nil file"},
   690  		{invalid, "invalid table"},
   691  	}
   692  	for _, tc := range testCases {
   693  		r, err := NewReader(tc.readable, ReaderOptions{})
   694  		if !strings.Contains(err.Error(), tc.expected) {
   695  			t.Fatalf("expected %q, but found %q", tc.expected, err.Error())
   696  		}
   697  		if r != nil {
   698  			t.Fatalf("found non-nil reader returned with non-nil error %q", err.Error())
   699  		}
   700  	}
   701  }
   702  
   703  func indexLayoutString(t *testing.T, r *Reader) string {
   704  	indexH, err := r.readIndex(context.Background(), nil, nil)
   705  	require.NoError(t, err)
   706  	defer indexH.Release()
   707  	var buf strings.Builder
   708  	twoLevelIndex := r.Properties.IndexType == twoLevelIndex
   709  	buf.WriteString("index entries:\n")
   710  	iter, err := newBlockIter(r.Compare, indexH.Get())
   711  	defer func() {
   712  		require.NoError(t, iter.Close())
   713  	}()
   714  	require.NoError(t, err)
   715  	for key, value := iter.First(); key != nil; key, value = iter.Next() {
   716  		bh, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   717  		require.NoError(t, err)
   718  		fmt.Fprintf(&buf, " %s: size %d\n", string(key.UserKey), bh.Length)
   719  		if twoLevelIndex {
   720  			b, err := r.readBlock(
   721  				context.Background(), bh.BlockHandle, nil, nil, nil, nil, nil)
   722  			require.NoError(t, err)
   723  			defer b.Release()
   724  			iter2, err := newBlockIter(r.Compare, b.Get())
   725  			defer func() {
   726  				require.NoError(t, iter2.Close())
   727  			}()
   728  			require.NoError(t, err)
   729  			for key, value := iter2.First(); key != nil; key, value = iter2.Next() {
   730  				bh, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   731  				require.NoError(t, err)
   732  				fmt.Fprintf(&buf, "   %s: size %d\n", string(key.UserKey), bh.Length)
   733  			}
   734  		}
   735  	}
   736  	return buf.String()
   737  }
   738  
// runTestReader walks the datadriven files under dir, running build/iter/get
// commands against a Reader. If r is non-nil it is used (and closed) as the
// initial reader; each "build" command replaces it. printValue controls
// whether iter output includes values.
func runTestReader(t *testing.T, o WriterOptions, dir string, r *Reader, printValue bool) {
	datadriven.Walk(t, dir, func(t *testing.T, path string) {
		// Close whatever reader is live at the end of each file.
		defer func() {
			if r != nil {
				r.Close()
				r = nil
			}
		}()

		datadriven.RunTest(t, path, func(t *testing.T, d *datadriven.TestData) string {
			switch d.Cmd {
			case "build":
				// Replace any previously built reader.
				if r != nil {
					r.Close()
					r = nil
				}
				var cacheSize int
				var printLayout bool
				d.MaybeScanArgs(t, "cache-size", &cacheSize)
				d.MaybeScanArgs(t, "print-layout", &printLayout)
				d.MaybeScanArgs(t, "block-size", &o.BlockSize)
				d.MaybeScanArgs(t, "index-block-size", &o.IndexBlockSize)

				var err error
				_, r, err = runBuildCmd(d, &o, cacheSize)
				if err != nil {
					return err.Error()
				}
				if printLayout {
					return indexLayoutString(t, r)
				}
				return ""

			case "iter":
				seqNum, err := scanGlobalSeqNum(d)
				if err != nil {
					return err.Error()
				}
				var stats base.InternalIteratorStats
				r.Properties.GlobalSeqNum = seqNum
				// Optionally install block-property filters from command args.
				var bpfs []BlockPropertyFilter
				if d.HasArg("block-property-filter") {
					var filterMin, filterMax uint64
					d.ScanArgs(t, "block-property-filter", &filterMin, &filterMax)
					bpf := NewTestKeysBlockPropertyFilter(filterMin, filterMax)
					bpfs = append(bpfs, bpf)
				}
				hideObsoletePoints := false
				if d.HasArg("hide-obsolete-points") {
					d.ScanArgs(t, "hide-obsolete-points", &hideObsoletePoints)
					if hideObsoletePoints {
						hideObsoletePoints, bpfs = r.TryAddBlockPropertyFilterForHideObsoletePoints(
							InternalKeySeqNumMax, InternalKeySeqNumMax-1, bpfs)
						require.True(t, hideObsoletePoints)
					}
				}
				var filterer *BlockPropertiesFilterer
				if len(bpfs) > 0 {
					filterer = newBlockPropertiesFilterer(bpfs, nil)
					intersects, err :=
						filterer.intersectsUserPropsAndFinishInit(r.Properties.UserProperties)
					if err != nil {
						return err.Error()
					}
					if !intersects {
						return "table does not intersect BlockPropertyFilter"
					}
				}
				iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc(
					context.Background(),
					nil, /* lower */
					nil, /* upper */
					filterer,
					hideObsoletePoints,
					true, /* use filter block */
					&stats,
					CategoryAndQoS{},
					nil,
					TrivialReaderProvider{Reader: r},
				)
				if err != nil {
					return err.Error()
				}
				return runIterCmd(d, iter, printValue, runIterCmdStats(&stats))

			case "get":
				// Look up each input line as a key via the Reader.get test
				// helper, printing the value or the error.
				var b bytes.Buffer
				for _, k := range strings.Split(d.Input, "\n") {
					v, err := r.get([]byte(k))
					if err != nil {
						fmt.Fprintf(&b, "<err: %s>\n", err)
					} else {
						fmt.Fprintln(&b, string(v))
					}
				}
				return b.String()
			default:
				return fmt.Sprintf("unknown command: %s", d.Cmd)
			}
		})
	})
}
   841  
// TestReaderCheckComparerMerger verifies that opening an sstable fails with a
// descriptive error when the comparer or merger it was written with is not
// registered with the reader, and succeeds when both are available.
func TestReaderCheckComparerMerger(t *testing.T) {
	const testTable = "test"

	// A comparer and merger with non-default names, so the table records
	// names that are only resolvable when explicitly registered.
	testComparer := &base.Comparer{
		Name:      "test.comparer",
		Compare:   base.DefaultComparer.Compare,
		Equal:     base.DefaultComparer.Equal,
		Separator: base.DefaultComparer.Separator,
		Successor: base.DefaultComparer.Successor,
	}
	testMerger := &base.Merger{
		Name:  "test.merger",
		Merge: base.DefaultMerger.Merge,
	}
	writerOpts := WriterOptions{
		Comparer:   testComparer,
		MergerName: "test.merger",
	}

	// Write a single-key table into an in-memory filesystem.
	mem := vfs.NewMem()
	f0, err := mem.Create(testTable)
	require.NoError(t, err)

	w := NewWriter(objstorageprovider.NewFileWritable(f0), writerOpts)
	require.NoError(t, w.Set([]byte("test"), nil))
	require.NoError(t, w.Close())

	// Each case registers a set of comparers/mergers and states the expected
	// open-error suffix ("" means the open must succeed).
	testCases := []struct {
		comparers []*base.Comparer
		mergers   []*base.Merger
		expected  string
	}{
		{
			[]*base.Comparer{testComparer},
			[]*base.Merger{testMerger},
			"",
		},
		{
			[]*base.Comparer{testComparer, base.DefaultComparer},
			[]*base.Merger{testMerger, base.DefaultMerger},
			"",
		},
		{
			[]*base.Comparer{},
			[]*base.Merger{testMerger},
			"unknown comparer test.comparer",
		},
		{
			[]*base.Comparer{base.DefaultComparer},
			[]*base.Merger{testMerger},
			"unknown comparer test.comparer",
		},
		{
			[]*base.Comparer{testComparer},
			[]*base.Merger{},
			"unknown merger test.merger",
		},
		{
			[]*base.Comparer{testComparer},
			[]*base.Merger{base.DefaultMerger},
			"unknown merger test.merger",
		},
	}

	for _, c := range testCases {
		t.Run("", func(t *testing.T) {
			f1, err := mem.Open(testTable)
			require.NoError(t, err)

			comparers := make(Comparers)
			for _, comparer := range c.comparers {
				comparers[comparer.Name] = comparer
			}
			mergers := make(Mergers)
			for _, merger := range c.mergers {
				mergers[merger.Name] = merger
			}

			r, err := newReader(f1, ReaderOptions{}, comparers, mergers)
			if err != nil {
				if r != nil {
					t.Fatalf("found non-nil reader returned with non-nil error %q", err.Error())
				}
				if !strings.HasSuffix(err.Error(), c.expected) {
					t.Fatalf("expected %q, but found %q", c.expected, err.Error())
				}
			} else if c.expected != "" {
				t.Fatalf("expected %q, but found success", c.expected)
			}
			if r != nil {
				_ = r.Close()
			}
		})
	}
}
   937  func checkValidPrefix(prefix, key []byte) bool {
   938  	return prefix == nil || bytes.HasPrefix(key, prefix)
   939  }
   940  
// testBytesIteratedWithCompression builds tables across a grid of data block
// sizes, index block sizes, and entry counts, runs a compaction iterator over
// each, and checks that the bytesIterated statistic increases monotonically
// and ends within allowedSizeDeviationPercent of the table's data size.
// maxNumEntries is indexed in parallel with blockSizes.
func testBytesIteratedWithCompression(
	t *testing.T,
	compression Compression,
	allowedSizeDeviationPercent uint64,
	blockSizes []int,
	maxNumEntries []uint64,
) {
	for i, blockSize := range blockSizes {
		for _, indexBlockSize := range blockSizes {
			// Cover the empty table, a single entry, and a large table.
			for _, numEntries := range []uint64{0, 1, maxNumEntries[i]} {
				r := buildTestTable(t, numEntries, blockSize, indexBlockSize, compression)
				var bytesIterated, prevIterated uint64
				var pool BufferPool
				pool.Init(5)
				citer, err := r.NewCompactionIter(
					&bytesIterated, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool)
				require.NoError(t, err)

				// bytesIterated must never decrease as the iterator advances.
				for key, _ := citer.First(); key != nil; key, _ = citer.Next() {
					if bytesIterated < prevIterated {
						t.Fatalf("bytesIterated moved backward: %d < %d", bytesIterated, prevIterated)
					}
					prevIterated = bytesIterated
				}

				expected := r.Properties.DataSize
				allowedSizeDeviation := expected * allowedSizeDeviationPercent / 100
				// There is some inaccuracy due to compression estimation.
				if bytesIterated < expected-allowedSizeDeviation || bytesIterated > expected+allowedSizeDeviation {
					t.Fatalf("bytesIterated: got %d, want %d", bytesIterated, expected)
				}

				require.NoError(t, citer.Close())
				require.NoError(t, r.Close())
				pool.Release()
			}
		}
	}
}
   980  
   981  func TestBytesIterated(t *testing.T) {
   982  	blockSizes := []int{10, 100, 1000, 4096, math.MaxInt32}
   983  	t.Run("Compressed", func(t *testing.T) {
   984  		testBytesIteratedWithCompression(t, SnappyCompression, 1, blockSizes, []uint64{1e5, 1e5, 1e5, 1e5, 1e5})
   985  	})
   986  	t.Run("Uncompressed", func(t *testing.T) {
   987  		testBytesIteratedWithCompression(t, NoCompression, 0, blockSizes, []uint64{1e5, 1e5, 1e5, 1e5, 1e5})
   988  	})
   989  	t.Run("Zstd", func(t *testing.T) {
   990  		// compression with zstd is extremely slow with small block size (esp the nocgo version).
   991  		// use less numEntries to make the test run at reasonable speed (under 10 seconds).
   992  		maxNumEntries := []uint64{1e2, 1e2, 1e3, 4e3, 1e5}
   993  		if useStandardZstdLib {
   994  			maxNumEntries = []uint64{1e3, 1e3, 1e4, 4e4, 1e5}
   995  		}
   996  		testBytesIteratedWithCompression(t, ZstdCompression, 1, blockSizes, maxNumEntries)
   997  	})
   998  }
   999  
  1000  func TestCompactionIteratorSetupForCompaction(t *testing.T) {
  1001  	tmpDir := path.Join(t.TempDir())
  1002  	provider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(vfs.Default, tmpDir))
  1003  	require.NoError(t, err)
  1004  	defer provider.Close()
  1005  	blockSizes := []int{10, 100, 1000, 4096, math.MaxInt32}
  1006  	for _, blockSize := range blockSizes {
  1007  		for _, indexBlockSize := range blockSizes {
  1008  			for _, numEntries := range []uint64{0, 1, 1e5} {
  1009  				r := buildTestTableWithProvider(t, provider, numEntries, blockSize, indexBlockSize, DefaultCompression)
  1010  				var bytesIterated uint64
  1011  				var pool BufferPool
  1012  				pool.Init(5)
  1013  				citer, err := r.NewCompactionIter(
  1014  					&bytesIterated, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool)
  1015  				require.NoError(t, err)
  1016  				switch i := citer.(type) {
  1017  				case *compactionIterator:
  1018  					require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.dataRH))
  1019  					// Each key has one version, so no value block, regardless of
  1020  					// sstable version.
  1021  					require.Nil(t, i.vbRH)
  1022  				case *twoLevelCompactionIterator:
  1023  					require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.dataRH))
  1024  					// Each key has one version, so no value block, regardless of
  1025  					// sstable version.
  1026  					require.Nil(t, i.vbRH)
  1027  				default:
  1028  					require.Failf(t, fmt.Sprintf("unknown compaction iterator type: %T", citer), "")
  1029  				}
  1030  				require.NoError(t, citer.Close())
  1031  				require.NoError(t, r.Close())
  1032  				pool.Release()
  1033  			}
  1034  		}
  1035  	}
  1036  }
  1037  
  1038  func TestReadaheadSetupForV3TablesWithMultipleVersions(t *testing.T) {
  1039  	tmpDir := path.Join(t.TempDir())
  1040  	provider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(vfs.Default, tmpDir))
  1041  	require.NoError(t, err)
  1042  	defer provider.Close()
  1043  	f0, _, err := provider.Create(context.Background(), base.FileTypeTable, base.FileNum(0).DiskFileNum(), objstorage.CreateOptions{})
  1044  	require.NoError(t, err)
  1045  
  1046  	w := NewWriter(f0, WriterOptions{
  1047  		TableFormat: TableFormatPebblev3,
  1048  		Comparer:    testkeys.Comparer,
  1049  	})
  1050  	keys := testkeys.Alpha(1)
  1051  	keyBuf := make([]byte, 1+testkeys.MaxSuffixLen)
  1052  	// Write a few keys with multiple timestamps (MVCC versions).
  1053  	for i := int64(0); i < 2; i++ {
  1054  		for j := int64(2); j >= 1; j-- {
  1055  			n := testkeys.WriteKeyAt(keyBuf[:], keys, i, j)
  1056  			key := keyBuf[:n]
  1057  			require.NoError(t, w.Set(key, key))
  1058  		}
  1059  	}
  1060  	require.NoError(t, w.Close())
  1061  	f1, err := provider.OpenForReading(context.Background(), base.FileTypeTable, base.FileNum(0).DiskFileNum(), objstorage.OpenOptions{})
  1062  	require.NoError(t, err)
  1063  	r, err := NewReader(f1, ReaderOptions{Comparer: testkeys.Comparer})
  1064  	require.NoError(t, err)
  1065  	defer r.Close()
  1066  	{
  1067  		var pool BufferPool
  1068  		pool.Init(5)
  1069  		citer, err := r.NewCompactionIter(
  1070  			nil, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool)
  1071  		require.NoError(t, err)
  1072  		defer citer.Close()
  1073  		i := citer.(*compactionIterator)
  1074  		require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.dataRH))
  1075  		require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.vbRH))
  1076  	}
  1077  	{
  1078  		iter, err := r.NewIter(nil, nil)
  1079  		require.NoError(t, err)
  1080  		defer iter.Close()
  1081  		i := iter.(*singleLevelIterator)
  1082  		require.False(t, objstorageprovider.TestingCheckMaxReadahead(i.dataRH))
  1083  		require.False(t, objstorageprovider.TestingCheckMaxReadahead(i.vbRH))
  1084  	}
  1085  }
  1086  
// TestReaderChecksumErrors corrupts each data block of a small sstable in turn
// and verifies that forward and reverse iteration both surface a "checksum
// mismatch" error, for both supported checksum types and for single- and
// two-level indexes.
func TestReaderChecksumErrors(t *testing.T) {
	for _, checksumType := range []ChecksumType{ChecksumTypeCRC32c, ChecksumTypeXXHash64} {
		t.Run(fmt.Sprintf("checksum-type=%d", checksumType), func(t *testing.T) {
			for _, twoLevelIndex := range []bool{false, true} {
				t.Run(fmt.Sprintf("two-level-index=%t", twoLevelIndex), func(t *testing.T) {
					mem := vfs.NewMem()

					{
						// Create an sstable with 3 data blocks.
						f, err := mem.Create("test")
						require.NoError(t, err)

						const blockSize = 32
						indexBlockSize := 4096
						if twoLevelIndex {
							// A tiny index block size forces an index block
							// per data block, yielding a two-level index.
							indexBlockSize = 1
						}

						w := NewWriter(objstorageprovider.NewFileWritable(f), WriterOptions{
							BlockSize:      blockSize,
							IndexBlockSize: indexBlockSize,
							Checksum:       checksumType,
						})
						require.NoError(t, w.Set(bytes.Repeat([]byte("a"), blockSize), nil))
						require.NoError(t, w.Set(bytes.Repeat([]byte("b"), blockSize), nil))
						require.NoError(t, w.Set(bytes.Repeat([]byte("c"), blockSize), nil))
						require.NoError(t, w.Close())
					}

					// Load the layout so that we know the location of the data blocks.
					var layout *Layout
					{
						f, err := mem.Open("test")
						require.NoError(t, err)

						r, err := newReader(f, ReaderOptions{})
						require.NoError(t, err)
						layout, err = r.Layout()
						require.NoError(t, err)
						require.EqualValues(t, len(layout.Data), 3)
						require.NoError(t, r.Close())
					}

					for _, bh := range layout.Data {
						// Read the sstable and corrupt the first byte in the target data
						// block.
						orig, err := mem.Open("test")
						require.NoError(t, err)
						data, err := io.ReadAll(orig)
						require.NoError(t, err)
						require.NoError(t, orig.Close())

						// Corrupt the first byte in the block.
						data[bh.Offset] ^= 0xff

						corrupted, err := mem.Create("corrupted")
						require.NoError(t, err)
						_, err = corrupted.Write(data)
						require.NoError(t, err)
						require.NoError(t, corrupted.Close())

						// Verify that we encounter a checksum mismatch error while iterating
						// over the sstable.
						corrupted, err = mem.Open("corrupted")
						require.NoError(t, err)

						r, err := newReader(corrupted, ReaderOptions{})
						require.NoError(t, err)

						// Forward iteration must fail with a checksum error...
						iter, err := r.NewIter(nil, nil)
						require.NoError(t, err)
						for k, _ := iter.First(); k != nil; k, _ = iter.Next() {
						}
						require.Regexp(t, `checksum mismatch`, iter.Error())
						require.Regexp(t, `checksum mismatch`, iter.Close())

						// ...and so must reverse iteration.
						iter, err = r.NewIter(nil, nil)
						require.NoError(t, err)
						for k, _ := iter.Last(); k != nil; k, _ = iter.Prev() {
						}
						require.Regexp(t, `checksum mismatch`, iter.Error())
						require.Regexp(t, `checksum mismatch`, iter.Close())

						require.NoError(t, r.Close())
					}
				})
			}
		})
	}
}
  1177  
// TestValidateBlockChecksums verifies Reader.ValidateBlockChecksums: it
// succeeds on pristine fixture sstables, and reports a "checksum mismatch"
// after a byte is flipped in any of the table's block types (data, index,
// top index, filter, range-del, properties, metaindex).
func TestValidateBlockChecksums(t *testing.T) {
	// Randomize which byte within a block is corrupted; log the seed so
	// failures are reproducible.
	seed := uint64(time.Now().UnixNano())
	rng := rand.New(rand.NewSource(seed))
	t.Logf("using seed = %d", seed)

	var allFiles []string
	for _, fixture := range TestFixtures {
		allFiles = append(allFiles, fixture.Filename)
	}

	type corruptionLocation int
	const (
		corruptionLocationData corruptionLocation = iota
		corruptionLocationIndex
		corruptionLocationTopIndex
		corruptionLocationFilter
		corruptionLocationRangeDel
		corruptionLocationProperties
		corruptionLocationMetaIndex
	)

	// Cases that name no files run against every fixture; cases targeting
	// blocks only present in some fixtures (top index, filter) list the
	// fixtures explicitly.
	testCases := []struct {
		name                string
		files               []string
		corruptionLocations []corruptionLocation
	}{
		{
			name:                "no corruption",
			corruptionLocations: []corruptionLocation{},
		},
		{
			name: "data block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationData,
			},
		},
		{
			name: "index block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationIndex,
			},
		},
		{
			name: "top index block corruption",
			files: []string{
				"h.no-compression.two_level_index.sst",
			},
			corruptionLocations: []corruptionLocation{
				corruptionLocationTopIndex,
			},
		},
		{
			name: "filter block corruption",
			files: []string{
				"h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst",
				"h.table-bloom.no-compression.sst",
				"h.table-bloom.sst",
			},
			corruptionLocations: []corruptionLocation{
				corruptionLocationFilter,
			},
		},
		{
			name: "range deletion block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationRangeDel,
			},
		},
		{
			name: "properties block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationProperties,
			},
		},
		{
			name: "metaindex block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationMetaIndex,
			},
		},
		{
			name: "multiple blocks corrupted",
			corruptionLocations: []corruptionLocation{
				corruptionLocationData,
				corruptionLocationIndex,
				corruptionLocationRangeDel,
				corruptionLocationProperties,
				corruptionLocationMetaIndex,
			},
		},
	}

	testFn := func(t *testing.T, file string, corruptionLocations []corruptionLocation) {
		// Create a copy of the SSTable that we can freely corrupt.
		f, err := os.Open(filepath.Join("testdata", file))
		require.NoError(t, err)

		pathCopy := path.Join(t.TempDir(), path.Base(file))
		fCopy, err := os.OpenFile(pathCopy, os.O_CREATE|os.O_RDWR, 0600)
		require.NoError(t, err)
		defer fCopy.Close()

		_, err = io.Copy(fCopy, f)
		require.NoError(t, err)
		err = fCopy.Sync()
		require.NoError(t, err)
		require.NoError(t, f.Close())

		filter := bloom.FilterPolicy(10)
		r, err := newReader(fCopy, ReaderOptions{
			Filters: map[string]FilterPolicy{
				filter.Name(): filter,
			},
		})
		require.NoError(t, err)
		defer func() { require.NoError(t, r.Close()) }()

		// Prior to corruption, validation is successful.
		require.NoError(t, r.ValidateBlockChecksums())

		// If we are not testing for corruption, we can stop here.
		if len(corruptionLocations) == 0 {
			return
		}

		// Perform bit flips in various corruption locations.
		layout, err := r.Layout()
		require.NoError(t, err)
		for _, location := range corruptionLocations {
			var bh BlockHandle
			switch location {
			case corruptionLocationData:
				bh = layout.Data[rng.Intn(len(layout.Data))].BlockHandle
			case corruptionLocationIndex:
				bh = layout.Index[rng.Intn(len(layout.Index))]
			case corruptionLocationTopIndex:
				bh = layout.TopIndex
			case corruptionLocationFilter:
				bh = layout.Filter
			case corruptionLocationRangeDel:
				bh = layout.RangeDel
			case corruptionLocationProperties:
				bh = layout.Properties
			case corruptionLocationMetaIndex:
				bh = layout.MetaIndex
			default:
				t.Fatalf("unknown location")
			}

			// Corrupt a random byte within the selected block.
			pos := int64(bh.Offset) + rng.Int63n(int64(bh.Length))
			t.Logf("altering file=%s @ offset = %d", file, pos)

			b := make([]byte, 1)
			n, err := fCopy.ReadAt(b, pos)
			require.NoError(t, err)
			require.Equal(t, 1, n)
			t.Logf("data (before) = %08b", b)

			// Invert all bits of the chosen byte so the corruption is
			// guaranteed to change the block's contents.
			b[0] ^= 0xff
			t.Logf("data (after) = %08b", b)

			_, err = fCopy.WriteAt(b, pos)
			require.NoError(t, err)
		}

		// Write back to the file.
		err = fCopy.Sync()
		require.NoError(t, err)

		// Confirm that checksum validation fails.
		err = r.ValidateBlockChecksums()
		require.Error(t, err)
		require.Regexp(t, `checksum mismatch`, err.Error())
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// By default, test across all files, unless overridden.
			files := tc.files
			if files == nil {
				files = allFiles
			}
			for _, file := range files {
				t.Run(file, func(t *testing.T) {
					testFn(t, file, tc.corruptionLocations)
				})
			}
		})
	}
}
  1369  
  1370  func TestReader_TableFormat(t *testing.T) {
  1371  	test := func(t *testing.T, want TableFormat) {
  1372  		fs := vfs.NewMem()
  1373  		f, err := fs.Create("test")
  1374  		require.NoError(t, err)
  1375  
  1376  		opts := WriterOptions{TableFormat: want}
  1377  		w := NewWriter(objstorageprovider.NewFileWritable(f), opts)
  1378  		err = w.Close()
  1379  		require.NoError(t, err)
  1380  
  1381  		f, err = fs.Open("test")
  1382  		require.NoError(t, err)
  1383  		r, err := newReader(f, ReaderOptions{})
  1384  		require.NoError(t, err)
  1385  		defer r.Close()
  1386  
  1387  		got, err := r.TableFormat()
  1388  		require.NoError(t, err)
  1389  		require.Equal(t, want, got)
  1390  	}
  1391  
  1392  	for tf := TableFormatLevelDB; tf <= TableFormatMax; tf++ {
  1393  		t.Run(tf.String(), func(t *testing.T) {
  1394  			test(t, tf)
  1395  		})
  1396  	}
  1397  }
  1398  
  1399  func buildTestTable(
  1400  	t *testing.T, numEntries uint64, blockSize, indexBlockSize int, compression Compression,
  1401  ) *Reader {
  1402  	provider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(vfs.NewMem(), "" /* dirName */))
  1403  	require.NoError(t, err)
  1404  	defer provider.Close()
  1405  	return buildTestTableWithProvider(t, provider, numEntries, blockSize, indexBlockSize, compression)
  1406  }
  1407  
// buildTestTableWithProvider writes an sstable with numEntries sequential
// keys (of slightly varying lengths) to the given provider and returns a
// Reader for it backed by a fresh 128MiB block cache. The caller is
// responsible for closing the returned Reader.
func buildTestTableWithProvider(
	t *testing.T,
	provider objstorage.Provider,
	numEntries uint64,
	blockSize, indexBlockSize int,
	compression Compression,
) *Reader {
	f0, _, err := provider.Create(context.Background(), base.FileTypeTable, base.FileNum(0).DiskFileNum(), objstorage.CreateOptions{})
	require.NoError(t, err)

	w := NewWriter(f0, WriterOptions{
		BlockSize:      blockSize,
		IndexBlockSize: indexBlockSize,
		Compression:    compression,
		FilterPolicy:   nil,
	})

	// Keys are big-endian counters (so they sort in insertion order) padded
	// to 8-10 bytes; values cycle through lengths 0-99.
	var ikey InternalKey
	for i := uint64(0); i < numEntries; i++ {
		key := make([]byte, 8+i%3)
		value := make([]byte, i%100)
		binary.BigEndian.PutUint64(key, i)
		ikey.UserKey = key
		w.Add(ikey, value)
	}

	require.NoError(t, w.Close())

	// Re-open the same file for reading.
	f1, err := provider.OpenForReading(context.Background(), base.FileTypeTable, base.FileNum(0).DiskFileNum(), objstorage.OpenOptions{})
	require.NoError(t, err)

	c := cache.New(128 << 20)
	defer c.Unref()
	r, err := NewReader(f1, ReaderOptions{
		Cache: c,
	})
	require.NoError(t, err)
	return r
}
  1448  
// buildBenchmarkTable writes a 1e6-key in-memory sstable with the given
// writer options and returns a Reader for it along with the written keys.
// Keys are 8-byte big-endian counters starting at offset. If
// confirmTwoLevelIndex is set, the build fails unless the resulting table
// has a partitioned (two-level) index.
func buildBenchmarkTable(
	b *testing.B, options WriterOptions, confirmTwoLevelIndex bool, offset int,
) (*Reader, [][]byte) {
	mem := vfs.NewMem()
	f0, err := mem.Create("bench")
	if err != nil {
		b.Fatal(err)
	}

	w := NewWriter(objstorageprovider.NewFileWritable(f0), options)

	var keys [][]byte
	var ikey InternalKey
	for i := uint64(0); i < 1e6; i++ {
		key := make([]byte, 8)
		binary.BigEndian.PutUint64(key, i+uint64(offset))
		keys = append(keys, key)
		ikey.UserKey = key
		w.Add(ikey, nil)
	}

	if err := w.Close(); err != nil {
		b.Fatal(err)
	}

	// Re-open that Filename for reading.
	f1, err := mem.Open("bench")
	if err != nil {
		b.Fatal(err)
	}
	c := cache.New(128 << 20)
	defer c.Unref()
	r, err := newReader(f1, ReaderOptions{
		Cache: c,
	})
	if err != nil {
		b.Fatal(err)
	}
	// IndexPartitions > 0 indicates a two-level index was built.
	if confirmTwoLevelIndex && r.Properties.IndexPartitions == 0 {
		b.Fatalf("should have constructed two level index")
	}
	return r, keys
}
  1492  
// basicBenchmarks enumerates the writer configurations shared by the iterator
// benchmarks below: identical 32KiB blocks and restart interval, differing
// only in compression codec.
var basicBenchmarks = []struct {
	name    string
	options WriterOptions
}{
	{
		name: "restart=16,compression=Snappy",
		options: WriterOptions{
			BlockSize:            32 << 10,
			BlockRestartInterval: 16,
			FilterPolicy:         nil,
			Compression:          SnappyCompression,
			TableFormat:          TableFormatPebblev2,
		},
	},
	{
		name: "restart=16,compression=ZSTD",
		options: WriterOptions{
			BlockSize:            32 << 10,
			BlockRestartInterval: 16,
			FilterPolicy:         nil,
			Compression:          ZstdCompression,
			TableFormat:          TableFormatPebblev2,
		},
	},
}
  1518  
  1519  func BenchmarkTableIterSeekGE(b *testing.B) {
  1520  	for _, bm := range basicBenchmarks {
  1521  		b.Run(bm.name,
  1522  			func(b *testing.B) {
  1523  				r, keys := buildBenchmarkTable(b, bm.options, false, 0)
  1524  				it, err := r.NewIter(nil /* lower */, nil /* upper */)
  1525  				require.NoError(b, err)
  1526  				rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
  1527  
  1528  				b.ResetTimer()
  1529  				for i := 0; i < b.N; i++ {
  1530  					it.SeekGE(keys[rng.Intn(len(keys))], base.SeekGEFlagsNone)
  1531  				}
  1532  
  1533  				b.StopTimer()
  1534  				it.Close()
  1535  				r.Close()
  1536  			})
  1537  	}
  1538  }
  1539  
  1540  func BenchmarkTableIterSeekLT(b *testing.B) {
  1541  	for _, bm := range basicBenchmarks {
  1542  		b.Run(bm.name,
  1543  			func(b *testing.B) {
  1544  				r, keys := buildBenchmarkTable(b, bm.options, false, 0)
  1545  				it, err := r.NewIter(nil /* lower */, nil /* upper */)
  1546  				require.NoError(b, err)
  1547  				rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
  1548  
  1549  				b.ResetTimer()
  1550  				for i := 0; i < b.N; i++ {
  1551  					it.SeekLT(keys[rng.Intn(len(keys))], base.SeekLTFlagsNone)
  1552  				}
  1553  
  1554  				b.StopTimer()
  1555  				it.Close()
  1556  				r.Close()
  1557  			})
  1558  	}
  1559  }
  1560  
  1561  func BenchmarkTableIterNext(b *testing.B) {
  1562  	for _, bm := range basicBenchmarks {
  1563  		b.Run(bm.name,
  1564  			func(b *testing.B) {
  1565  				r, _ := buildBenchmarkTable(b, bm.options, false, 0)
  1566  				it, err := r.NewIter(nil /* lower */, nil /* upper */)
  1567  				require.NoError(b, err)
  1568  
  1569  				b.ResetTimer()
  1570  				var sum int64
  1571  				var key *InternalKey
  1572  				for i := 0; i < b.N; i++ {
  1573  					if key == nil {
  1574  						key, _ = it.First()
  1575  					}
  1576  					sum += int64(binary.BigEndian.Uint64(key.UserKey))
  1577  					key, _ = it.Next()
  1578  				}
  1579  				if testing.Verbose() {
  1580  					fmt.Fprint(io.Discard, sum)
  1581  				}
  1582  
  1583  				b.StopTimer()
  1584  				it.Close()
  1585  				r.Close()
  1586  			})
  1587  	}
  1588  }
  1589  
  1590  func BenchmarkTableIterPrev(b *testing.B) {
  1591  	for _, bm := range basicBenchmarks {
  1592  		b.Run(bm.name,
  1593  			func(b *testing.B) {
  1594  				r, _ := buildBenchmarkTable(b, bm.options, false, 0)
  1595  				it, err := r.NewIter(nil /* lower */, nil /* upper */)
  1596  				require.NoError(b, err)
  1597  
  1598  				b.ResetTimer()
  1599  				var sum int64
  1600  				var key *InternalKey
  1601  				for i := 0; i < b.N; i++ {
  1602  					if key == nil {
  1603  						key, _ = it.Last()
  1604  					}
  1605  					sum += int64(binary.BigEndian.Uint64(key.UserKey))
  1606  					key, _ = it.Prev()
  1607  				}
  1608  				if testing.Verbose() {
  1609  					fmt.Fprint(io.Discard, sum)
  1610  				}
  1611  
  1612  				b.StopTimer()
  1613  				it.Close()
  1614  				r.Close()
  1615  			})
  1616  	}
  1617  }
  1618  
  1619  func BenchmarkLayout(b *testing.B) {
  1620  	r, _ := buildBenchmarkTable(b, WriterOptions{}, false, 0)
  1621  	b.ResetTimer()
  1622  	for i := 0; i < b.N; i++ {
  1623  		r.Layout()
  1624  	}
  1625  	b.StopTimer()
  1626  	r.Close()
  1627  }
  1628  
// BenchmarkSeqSeekGEExhausted measures sequences of monotonically increasing
// seeks (with and without TrySeekUsingNext) that all land outside the
// iterator's data: either past the file's keys or past the upper bound. Every
// seek is expected to find nothing.
func BenchmarkSeqSeekGEExhausted(b *testing.B) {
	// Snappy with no bloom filter.
	options := basicBenchmarks[0].options

	for _, twoLevelIndex := range []bool{false, true} {
		switch twoLevelIndex {
		case false:
			options.IndexBlockSize = 0
		case true:
			options.IndexBlockSize = 512
		}
		// The table's keys start at offsetCount, leaving room for seek keys
		// strictly below (preKeys) and strictly above (postKeys) the table.
		const offsetCount = 5000
		reader, keys := buildBenchmarkTable(b, options, twoLevelIndex, offsetCount)
		var preKeys [][]byte
		for i := 0; i < offsetCount; i++ {
			key := make([]byte, 8)
			binary.BigEndian.PutUint64(key, uint64(i))
			preKeys = append(preKeys, key)
		}
		var postKeys [][]byte
		for i := 0; i < offsetCount; i++ {
			key := make([]byte, 8)
			binary.BigEndian.PutUint64(key, uint64(i+offsetCount+len(keys)))
			postKeys = append(postKeys, key)
		}
		for _, exhaustedBounds := range []bool{false, true} {
			for _, prefixSeek := range []bool{false, true} {
				exhausted := "file"
				if exhaustedBounds {
					exhausted = "bounds"
				}
				seekKind := "ge"
				if prefixSeek {
					seekKind = "prefix-ge"
				}
				b.Run(fmt.Sprintf(
					"two-level=%t/exhausted=%s/seek=%s", twoLevelIndex, exhausted, seekKind),
					func(b *testing.B) {
						// "bounds" exhaustion seeks below an upper bound set
						// at the first table key; "file" exhaustion seeks past
						// the last table key with no bound.
						var upper []byte
						var seekKeys [][]byte
						if exhaustedBounds {
							seekKeys = preKeys
							upper = keys[0]
						} else {
							seekKeys = postKeys
						}
						it, err := reader.NewIter(nil /* lower */, upper)
						require.NoError(b, err)
						b.ResetTimer()
						pos := 0
						var seekGEFlags SeekGEFlags
						for i := 0; i < b.N; i++ {
							seekKey := seekKeys[0]
							var k *InternalKey
							if prefixSeek {
								k, _ = it.SeekPrefixGE(seekKey, seekKey, seekGEFlags)
							} else {
								k, _ = it.SeekGE(seekKey, seekGEFlags)
							}
							if k != nil {
								b.Fatal("found a key")
							}
							if it.Error() != nil {
								b.Fatalf("%s", it.Error().Error())
							}
							// TrySeekUsingNext is only valid for a monotone
							// run of seeks; reset it when wrapping around.
							pos++
							if pos == len(seekKeys) {
								pos = 0
								seekGEFlags = seekGEFlags.DisableTrySeekUsingNext()
							} else {
								seekGEFlags = seekGEFlags.EnableTrySeekUsingNext()
							}
						}
						b.StopTimer()
						it.Close()
					})
			}
		}
		reader.Close()
	}
}
  1710  
// BenchmarkIteratorScanManyVersions measures forward scans over a table of
// 10,000 key prefixes with 100 MVCC versions each, comparing v2 (values
// inline) and v3 (values in value blocks) formats, at cache sizes that do and
// do not hold the whole table, with and without materializing values.
func BenchmarkIteratorScanManyVersions(b *testing.B) {
	options := WriterOptions{
		BlockSize:            32 << 10,
		BlockRestartInterval: 16,
		FilterPolicy:         nil,
		Compression:          SnappyCompression,
		Comparer:             testkeys.Comparer,
	}
	// 10,000 key prefixes, each with 100 versions.
	const keyCount = 10000
	const sharedPrefixLen = 32
	const unsharedPrefixLen = 8
	const versionCount = 100

	// Take the very large keyspace consisting of alphabetic characters of
	// lengths up to unsharedPrefixLen and reduce it down to keyCount keys by
	// picking every 1 key every keyCount keys.
	keys := testkeys.Alpha(unsharedPrefixLen)
	keys = keys.EveryN(keys.Count() / keyCount)
	if keys.Count() < keyCount {
		b.Fatalf("expected %d keys, found %d", keyCount, keys.Count())
	}
	keyBuf := make([]byte, sharedPrefixLen+unsharedPrefixLen+testkeys.MaxSuffixLen)
	for i := 0; i < sharedPrefixLen; i++ {
		keyBuf[i] = 'A' + byte(i)
	}
	// v2 sstable is 115,178,070 bytes. v3 sstable is 107,181,105 bytes with
	// 99,049,269 bytes in value blocks.
	setupBench := func(b *testing.B, tableFormat TableFormat, cacheSize int64) *Reader {
		mem := vfs.NewMem()
		f0, err := mem.Create("bench")
		require.NoError(b, err)
		options.TableFormat = tableFormat
		w := NewWriter(objstorageprovider.NewFileWritable(f0), options)
		val := make([]byte, 100)
		rng := rand.New(rand.NewSource(100))
		// Versions are written newest-first (descending suffix), matching
		// the required key ordering.
		for i := int64(0); i < keys.Count(); i++ {
			for v := 0; v < versionCount; v++ {
				n := testkeys.WriteKeyAt(keyBuf[sharedPrefixLen:], keys, i, int64(versionCount-v+1))
				key := keyBuf[:n+sharedPrefixLen]
				rng.Read(val)
				require.NoError(b, w.Set(key, val))
			}
		}
		require.NoError(b, w.Close())
		c := cache.New(cacheSize)
		defer c.Unref()
		// Re-open the Filename for reading.
		f0, err = mem.Open("bench")
		require.NoError(b, err)
		r, err := newReader(f0, ReaderOptions{
			Cache:    c,
			Comparer: testkeys.Comparer,
		})
		require.NoError(b, err)
		return r
	}
	for _, format := range []TableFormat{TableFormatPebblev2, TableFormatPebblev3} {
		b.Run(fmt.Sprintf("format=%s", format.String()), func(b *testing.B) {
			// 150MiB results in a high cache hit rate for both formats. 20MiB
			// results in a high cache hit rate for the data blocks in
			// TableFormatPebblev3.
			for _, cacheSize := range []int64{20 << 20, 150 << 20} {
				b.Run(fmt.Sprintf("cache-size=%s", humanize.Bytes.Int64(cacheSize)),
					func(b *testing.B) {
						r := setupBench(b, format, cacheSize)
						defer func() {
							require.NoError(b, r.Close())
						}()
						for _, readValue := range []bool{false, true} {
							b.Run(fmt.Sprintf("read-value=%t", readValue), func(b *testing.B) {
								iter, err := r.NewIter(nil, nil)
								require.NoError(b, err)
								var k *InternalKey
								var v base.LazyValue
								var valBuf [100]byte
								b.ResetTimer()
								for i := 0; i < b.N; i++ {
									if k == nil {
										k, _ = iter.First()
										if k == nil {
											b.Fatalf("k is nil")
										}
									}
									k, v = iter.Next()
									if k != nil && readValue {
										// Values fit in valBuf, so fetching
										// one must never hand back a
										// caller-owned buffer.
										_, callerOwned, err := v.Value(valBuf[:])
										if err != nil {
											b.Fatal(err)
										} else if callerOwned {
											b.Fatalf("unexpected callerOwned: %t", callerOwned)
										}
									}
								}
							})
						}
					})
			}
		})
	}
}
  1812  
// BenchmarkIteratorScanNextPrefix compares two ways of advancing an iterator
// from one key prefix to the next: SeekGE with the TrySeekUsingNext
// optimization versus the specialized NextPrefix method. It varies the number
// of MVCC versions per prefix, since the number of versions NextPrefix can
// skip past is what drives its advantage (see the sample results below).
func BenchmarkIteratorScanNextPrefix(b *testing.B) {
	options := WriterOptions{
		BlockSize:            32 << 10,
		BlockRestartInterval: 16,
		FilterPolicy:         nil,
		Compression:          SnappyCompression,
		TableFormat:          TableFormatPebblev3,
		Comparer:             testkeys.Comparer,
	}
	const keyCount = 10000
	const sharedPrefixLen = 32
	const unsharedPrefixLen = 8
	val := make([]byte, 100)
	rand.New(rand.NewSource(100)).Read(val)

	// Take the very large keyspace consisting of alphabetic characters of
	// lengths up to unsharedPrefixLen and reduce it down to keyCount keys by
	// picking 1 key out of every keys.Count()/keyCount keys.
	keys := testkeys.Alpha(unsharedPrefixLen)
	keys = keys.EveryN(keys.Count() / keyCount)
	if keys.Count() < keyCount {
		b.Fatalf("expected %d keys, found %d", keyCount, keys.Count())
	}
	// All keys share a sharedPrefixLen-byte common prefix ("ABC..."), so key
	// comparisons must look past those bytes before reaching the
	// distinguishing portion of the key.
	keyBuf := make([]byte, sharedPrefixLen+unsharedPrefixLen+testkeys.MaxSuffixLen)
	for i := 0; i < sharedPrefixLen; i++ {
		keyBuf[i] = 'A' + byte(i)
	}
	// setupBench writes an sstable containing versCount versions of each of
	// the keyCount prefixes and returns a Reader over it, together with the
	// immediate successor key of every distinct prefix. succKeys[i] is the
	// seek/next-prefix target used to step past all versions of prefix i.
	setupBench := func(b *testing.B, versCount int) (r *Reader, succKeys [][]byte) {
		mem := vfs.NewMem()
		f0, err := mem.Create("bench")
		require.NoError(b, err)
		w := NewWriter(objstorageprovider.NewFileWritable(f0), options)
		for i := int64(0); i < keys.Count(); i++ {
			for v := 0; v < versCount; v++ {
				// Suffixes are written in decreasing order (versCount+1 down
				// to 2) as v increases.
				n := testkeys.WriteKeyAt(keyBuf[sharedPrefixLen:], keys, i, int64(versCount-v+1))
				key := keyBuf[:n+sharedPrefixLen]
				require.NoError(b, w.Set(key, val))
				if v == 0 {
					// Record the successor once per prefix. A nil dst is
					// passed to ImmediateSuccessor so the returned key is
					// freshly allocated and does not alias keyBuf (which is
					// overwritten on the next iteration).
					prefixLen := testkeys.Comparer.Split(key)
					prefixKey := key[:prefixLen]
					succKey := testkeys.Comparer.ImmediateSuccessor(nil, prefixKey)
					succKeys = append(succKeys, succKey)
				}
			}
		}
		require.NoError(b, w.Close())
		// NB: This 200MiB cache is sufficient for even the largest file: 10,000
		// keys * 100 versions = 1M keys, where each key-value pair is ~140 bytes
		// = 140MB. So we are not measuring the caching benefit of
		// TableFormatPebblev3 storing older values in value blocks.
		c := cache.New(200 << 20)
		defer c.Unref()
		// Re-open the Filename for reading.
		f0, err = mem.Open("bench")
		require.NoError(b, err)
		r, err = newReader(f0, ReaderOptions{
			Cache:    c,
			Comparer: testkeys.Comparer,
		})
		require.NoError(b, err)
		return r, succKeys
	}
	// Analysis of some sample results with TableFormatPebblev2:
	// versions=1/method=seek-ge-10         	22107622	        53.57 ns/op
	// versions=1/method=next-prefix-10     	36292837	        33.07 ns/op
	// versions=2/method=seek-ge-10         	14429138	        82.92 ns/op
	// versions=2/method=next-prefix-10     	19676055	        60.78 ns/op
	// versions=10/method=seek-ge-10        	 1453726	       825.2 ns/op
	// versions=10/method=next-prefix-10    	 2450498	       489.6 ns/op
	// versions=100/method=seek-ge-10       	  965143	      1257 ns/op
	// versions=100/method=next-prefix-10   	 1000000	      1054 ns/op
	//
	// With 1 version, both SeekGE and NextPrefix will be able to complete after
	// doing a single call to blockIter.Next. However, SeekGE has to do two key
	// comparisons unlike the one key comparison in NextPrefix. This is because
	// SeekGE also compares *before* calling Next since it is possible that the
	// preceding SeekGE is already at the right place.
	//
	// With 2 versions, both will do two calls to blockIter.Next. The difference
	// in the cost is the same as in the 1 version case.
	//
	// With 10 versions, it is still likely that the desired key is in the same
	// data block. NextPrefix will seek only the blockIter. And in the rare case
	// that the key is in the next data block, it will step the index block (not
	// seek). In comparison, SeekGE will seek the index block too.
	//
	// With 100 versions we more often cross from one data block to the next, so
	// the difference in cost declines.
	//
	// Some sample results with TableFormatPebblev3:
	//
	// versions=1/method=seek-ge-10         	18702609	        53.90 ns/op
	// versions=1/method=next-prefix-10     	77440167	        15.41 ns/op
	// versions=2/method=seek-ge-10         	13554286	        87.91 ns/op
	// versions=2/method=next-prefix-10     	62148526	        19.25 ns/op
	// versions=10/method=seek-ge-10        	 1316676	       910.5 ns/op
	// versions=10/method=next-prefix-10    	18829448	        62.61 ns/op
	// versions=100/method=seek-ge-10       	 1166139	      1025 ns/op
	// versions=100/method=next-prefix-10   	 4443386	       265.3 ns/op
	//
	// NextPrefix is much cheaper than in TableFormatPebblev2 with larger number
	// of versions. It is also cheaper with 1 and 2 versions since
	// setHasSamePrefix=false eliminates a key comparison.
	for _, versionCount := range []int{1, 2, 10, 100} {
		b.Run(fmt.Sprintf("versions=%d", versionCount), func(b *testing.B) {
			r, succKeys := setupBench(b, versionCount)
			defer func() {
				require.NoError(b, r.Close())
			}()
			for _, method := range []string{"seek-ge", "next-prefix"} {
				b.Run(fmt.Sprintf("method=%s", method), func(b *testing.B) {
					for _, readValue := range []bool{false, true} {
						b.Run(fmt.Sprintf("read-value=%t", readValue), func(b *testing.B) {
							iter, err := r.NewIter(nil, nil)
							require.NoError(b, err)
							// nextFunc advances the iterator past every version
							// of the prefix at position index, using the method
							// under test.
							var nextFunc func(index int) (*InternalKey, base.LazyValue)
							switch method {
							case "seek-ge":
								nextFunc = func(index int) (*InternalKey, base.LazyValue) {
									var flags base.SeekGEFlags
									return iter.SeekGE(succKeys[index], flags.EnableTrySeekUsingNext())
								}
							case "next-prefix":
								nextFunc = func(index int) (*InternalKey, base.LazyValue) {
									return iter.NextPrefix(succKeys[index])
								}
							default:
								b.Fatalf("unknown method %s", method)
							}
							// j counts the prefixes visited in the current scan.
							// It starts at n so that the first iteration (k ==
							// nil) passes the restart sanity check and positions
							// the iterator with First. Each time the iterator is
							// exhausted the scan restarts from the beginning.
							n := keys.Count()
							j := n
							var k *InternalKey
							var v base.LazyValue
							var valBuf [100]byte
							b.ResetTimer()
							for i := 0; i < b.N; i++ {
								if k == nil {
									if j != n {
										b.Fatalf("unexpected %d != %d", j, n)
									}
									k, _ = iter.First()
									j = 0
								} else {
									k, v = nextFunc(int(j - 1))
									if k != nil && readValue {
										// The fetched value is expected to be
										// block-owned; valBuf is only a fallback
										// destination and must not be used.
										_, callerOwned, err := v.Value(valBuf[:])
										if err != nil {
											b.Fatal(err)
										} else if callerOwned {
											b.Fatalf("unexpected callerOwned: %t", callerOwned)
										}
									}

								}
								if k != nil {
									j++
								}
							}
						})
					}
				})
			}
		})
	}
}
  1978  
// BenchmarkIteratorScanObsolete measures the cost of a full forward scan over
// an sstable in which all but the first key was written with
// forceObsolete=true. With TableFormatPebblev4 and hideObsoletePoints, the
// obsolete-key block property filter lets the iterator skip the obsolete
// points (the scan sees exactly 1 key); otherwise the scan visits every key.
func BenchmarkIteratorScanObsolete(b *testing.B) {
	options := WriterOptions{
		BlockSize:            32 << 10,
		BlockRestartInterval: 16,
		FilterPolicy:         nil,
		Compression:          SnappyCompression,
		Comparer:             testkeys.Comparer,
	}
	const keyCount = 1 << 20
	const keyLen = 10

	// Take the very large keyspace consisting of alphabetic characters of
	// lengths up to keyLen and reduce it down to keyCount keys by picking
	// 1 key out of every keys.Count()/keyCount keys.
	keys := testkeys.Alpha(keyLen)
	keys = keys.EveryN(keys.Count() / keyCount)
	if keys.Count() < keyCount {
		b.Fatalf("expected %d keys, found %d", keyCount, keys.Count())
	}
	expectedKeyCount := keys.Count()
	keyBuf := make([]byte, keyLen)
	// setupBench writes the sstable in the requested format, marking every key
	// except the first as obsolete, and returns a Reader backed by a cache of
	// cacheSize bytes.
	setupBench := func(b *testing.B, tableFormat TableFormat, cacheSize int64) *Reader {
		mem := vfs.NewMem()
		f0, err := mem.Create("bench")
		require.NoError(b, err)
		options.TableFormat = tableFormat
		w := NewWriter(objstorageprovider.NewFileWritable(f0), options)
		val := make([]byte, 100)
		rng := rand.New(rand.NewSource(100))
		for i := int64(0); i < keys.Count(); i++ {
			n := testkeys.WriteKey(keyBuf, keys, i)
			key := keyBuf[:n]
			rng.Read(val)
			// Keep only the first key non-obsolete so that a filtered scan
			// returns exactly one point.
			forceObsolete := true
			if i == 0 {
				forceObsolete = false
			}
			require.NoError(b, w.AddWithForceObsolete(
				base.MakeInternalKey(key, 0, InternalKeyKindSet), val, forceObsolete))
		}
		require.NoError(b, w.Close())
		c := cache.New(cacheSize)
		defer c.Unref()
		// Re-open the Filename for reading.
		f0, err = mem.Open("bench")
		require.NoError(b, err)
		r, err := newReader(f0, ReaderOptions{
			Cache:    c,
			Comparer: testkeys.Comparer,
		})
		require.NoError(b, err)
		return r
	}
	for _, format := range []TableFormat{TableFormatPebblev3, TableFormatPebblev4} {
		b.Run(fmt.Sprintf("format=%s", format.String()), func(b *testing.B) {
			// 150MiB results in a high cache hit rate for both formats. The
			// 1-byte cache effectively disables caching, measuring the
			// cache-miss path.
			for _, cacheSize := range []int64{1, 150 << 20} {
				b.Run(fmt.Sprintf("cache-size=%s", humanize.Bytes.Int64(cacheSize)),
					func(b *testing.B) {
						r := setupBench(b, format, cacheSize)
						defer func() {
							require.NoError(b, r.Close())
						}()
						for _, hideObsoletePoints := range []bool{false, true} {
							b.Run(fmt.Sprintf("hide-obsolete=%t", hideObsoletePoints), func(b *testing.B) {
								// Only Pebblev4 writes the obsolete-key block
								// property, so the filter is installed (and
								// sanity-checked) only for that format.
								var filterer *BlockPropertiesFilterer
								if format == TableFormatPebblev4 && hideObsoletePoints {
									filterer = newBlockPropertiesFilterer(
										[]BlockPropertyFilter{obsoleteKeyBlockPropertyFilter{}}, nil)
									intersects, err :=
										filterer.intersectsUserPropsAndFinishInit(r.Properties.UserProperties)
									if err != nil {
										b.Fatalf("%s", err.Error())
									}
									if !intersects {
										b.Fatalf("sstable does not intersect")
									}
								}
								iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc(
									context.Background(), nil, nil, filterer, hideObsoletePoints,
									true, nil, CategoryAndQoS{}, nil,
									TrivialReaderProvider{Reader: r})
								require.NoError(b, err)
								b.ResetTimer()
								for i := 0; i < b.N; i++ {
									count := int64(0)
									k, _ := iter.First()
									for k != nil {
										count++
										k, _ = iter.Next()
									}
									// With the v4 filter active only the single
									// non-obsolete key is visible; otherwise the
									// full keyspace is scanned.
									if format == TableFormatPebblev4 && hideObsoletePoints {
										if count != 1 {
											b.Fatalf("found %d points", count)
										}
									} else {
										if count != expectedKeyCount {
											b.Fatalf("found %d points", count)
										}
									}
								}
							})
						}
					})
			}
		})
	}
}
  2087  
  2088  func newReader(r ReadableFile, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) {
  2089  	readable, err := NewSimpleReadable(r)
  2090  	if err != nil {
  2091  		return nil, err
  2092  	}
  2093  	return NewReader(readable, o, extraOpts...)
  2094  }