github.com/cockroachdb/pebble@v1.1.2/sstable/reader_test.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"encoding/binary"
    11  	"fmt"
    12  	"io"
    13  	"math"
    14  	"os"
    15  	"path"
    16  	"path/filepath"
    17  	"strings"
    18  	"testing"
    19  	"time"
    20  
    21  	"github.com/cockroachdb/datadriven"
    22  	"github.com/cockroachdb/errors"
    23  	"github.com/cockroachdb/pebble/bloom"
    24  	"github.com/cockroachdb/pebble/internal/base"
    25  	"github.com/cockroachdb/pebble/internal/cache"
    26  	"github.com/cockroachdb/pebble/internal/humanize"
    27  	"github.com/cockroachdb/pebble/internal/manifest"
    28  	"github.com/cockroachdb/pebble/internal/testkeys"
    29  	"github.com/cockroachdb/pebble/objstorage"
    30  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
    31  	"github.com/cockroachdb/pebble/vfs"
    32  	"github.com/cockroachdb/pebble/vfs/errorfs"
    33  	"github.com/stretchr/testify/require"
    34  	"golang.org/x/exp/rand"
    35  )
    36  
    37  // get is a testing helper that simulates a read and helps verify bloom filters
    38  // until they are available through iterators.
    39  func (r *Reader) get(key []byte) (value []byte, err error) {
    40  	if r.err != nil {
    41  		return nil, r.err
    42  	}
    43  
    44  	if r.tableFilter != nil {
    45  		dataH, err := r.readFilter(context.Background(), nil /* stats */)
    46  		if err != nil {
    47  			return nil, err
    48  		}
    49  		var lookupKey []byte
    50  		if r.Split != nil {
    51  			lookupKey = key[:r.Split(key)]
    52  		} else {
    53  			lookupKey = key
    54  		}
    55  		mayContain := r.tableFilter.mayContain(dataH.Get(), lookupKey)
    56  		dataH.Release()
    57  		if !mayContain {
    58  			return nil, base.ErrNotFound
    59  		}
    60  	}
    61  
    62  	i, err := r.NewIter(nil /* lower */, nil /* upper */)
    63  	if err != nil {
    64  		return nil, err
    65  	}
    66  	var v base.LazyValue
    67  	ikey, v := i.SeekGE(key, base.SeekGEFlagsNone)
    68  	value, _, err = v.Value(nil)
    69  	if err != nil {
    70  		return nil, err
    71  	}
    72  
    73  	if ikey == nil || r.Compare(key, ikey.UserKey) != 0 {
    74  		err := i.Close()
    75  		if err == nil {
    76  			err = base.ErrNotFound
    77  		}
    78  		return nil, err
    79  	}
    80  
    81  	// The value will be "freed" when the iterator is closed, so make a copy
    82  	// which will outlast the lifetime of the iterator.
    83  	newValue := make([]byte, len(value))
    84  	copy(newValue, value)
    85  	if err := i.Close(); err != nil {
    86  		return nil, err
    87  	}
    88  	return newValue, nil
    89  }
    90  
// iterAdapter adapts the new Iterator API which returns the key and value from
// positioning methods (Seek*, First, Last, Next, Prev) to the old API which
// returned a boolean corresponding to Valid. Only used by test code.
type iterAdapter struct {
	Iterator
	// key is the key at the current position; nil when the adapter is invalid
	// (exhausted, errored, or freshly re-bounded).
	key *InternalKey
	// val is the materialized value at the current position.
	val []byte
}
    99  
   100  func newIterAdapter(iter Iterator) *iterAdapter {
   101  	return &iterAdapter{
   102  		Iterator: iter,
   103  	}
   104  }
   105  
   106  func (i *iterAdapter) update(key *InternalKey, val base.LazyValue) bool {
   107  	i.key = key
   108  	if v, _, err := val.Value(nil); err != nil {
   109  		i.key = nil
   110  		i.val = nil
   111  	} else {
   112  		i.val = v
   113  	}
   114  	return i.key != nil
   115  }
   116  
// String implements fmt.Stringer, returning a fixed name for the adapter.
func (i *iterAdapter) String() string {
	return "iter-adapter"
}
   120  
   121  func (i *iterAdapter) SeekGE(key []byte, flags base.SeekGEFlags) bool {
   122  	return i.update(i.Iterator.SeekGE(key, flags))
   123  }
   124  
   125  func (i *iterAdapter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) bool {
   126  	return i.update(i.Iterator.SeekPrefixGE(prefix, key, flags))
   127  }
   128  
   129  func (i *iterAdapter) SeekLT(key []byte, flags base.SeekLTFlags) bool {
   130  	return i.update(i.Iterator.SeekLT(key, flags))
   131  }
   132  
   133  func (i *iterAdapter) First() bool {
   134  	return i.update(i.Iterator.First())
   135  }
   136  
   137  func (i *iterAdapter) Last() bool {
   138  	return i.update(i.Iterator.Last())
   139  }
   140  
   141  func (i *iterAdapter) Next() bool {
   142  	return i.update(i.Iterator.Next())
   143  }
   144  
   145  func (i *iterAdapter) NextPrefix(succKey []byte) bool {
   146  	return i.update(i.Iterator.NextPrefix(succKey))
   147  }
   148  
   149  func (i *iterAdapter) NextIgnoreResult() {
   150  	i.Iterator.Next()
   151  	i.update(nil, base.LazyValue{})
   152  }
   153  
   154  func (i *iterAdapter) Prev() bool {
   155  	return i.update(i.Iterator.Prev())
   156  }
   157  
// Key returns the key at the current position, or nil if invalid.
func (i *iterAdapter) Key() *InternalKey {
	return i.key
}
   161  
// Value returns the value at the current position, or nil if invalid.
func (i *iterAdapter) Value() []byte {
	return i.val
}
   165  
// Valid reports whether the adapter is positioned at an entry.
func (i *iterAdapter) Valid() bool {
	return i.key != nil
}
   169  
// SetBounds forwards the new bounds to the wrapped iterator and invalidates
// the adapter's cached position.
func (i *iterAdapter) SetBounds(lower, upper []byte) {
	i.Iterator.SetBounds(lower, upper)
	i.key = nil
}
   174  
// TestVirtualReader is a datadriven test for VirtualReader: "build" writes a
// physical sstable and fabricates file metadata for it, "virtualize" wraps
// that sstable with virtual bounds, and the remaining commands (citer,
// constrain, scan-range-del, scan-range-key, iter) exercise the virtual
// reader's iterators and bound constraining.
func TestVirtualReader(t *testing.T) {
	// A faux filenum used to create fake filemetadata for testing.
	var fileNum int = 1
	nextFileNum := func() base.FileNum {
		fileNum++
		return base.FileNum(fileNum - 1)
	}

	// Set during the latest build command.
	var r *Reader
	var meta manifest.PhysicalFileMeta
	var bp BufferPool

	// Set during the latest virtualize command.
	var vMeta1 manifest.VirtualFileMeta
	var v VirtualReader

	// Release the reader/buffer pool from the final build command, if any.
	defer func() {
		if r != nil {
			require.NoError(t, r.Close())
			bp.Release()
		}
	}()

	// createPhysicalMeta fabricates a manifest.PhysicalFileMeta from the
	// writer's metadata, extending key bounds for whichever key kinds the
	// table contains.
	createPhysicalMeta := func(w *WriterMetadata, r *Reader) (manifest.PhysicalFileMeta, error) {
		meta := &manifest.FileMetadata{}
		meta.FileNum = nextFileNum()
		meta.CreationTime = time.Now().Unix()
		meta.Size = w.Size
		meta.SmallestSeqNum = w.SmallestSeqNum
		meta.LargestSeqNum = w.LargestSeqNum

		if w.HasPointKeys {
			meta.ExtendPointKeyBounds(r.Compare, w.SmallestPoint, w.LargestPoint)
		}
		if w.HasRangeDelKeys {
			meta.ExtendPointKeyBounds(r.Compare, w.SmallestRangeDel, w.LargestRangeDel)
		}
		if w.HasRangeKeys {
			meta.ExtendRangeKeyBounds(r.Compare, w.SmallestRangeKey, w.LargestRangeKey)
		}
		meta.InitPhysicalBacking()

		if err := meta.Validate(r.Compare, r.opts.Comparer.FormatKey); err != nil {
			return manifest.PhysicalFileMeta{}, err
		}

		return meta.PhysicalMeta(), nil
	}

	// formatWMeta renders the writer metadata's key bounds and seqnum range
	// for datadriven output.
	formatWMeta := func(m *WriterMetadata) string {
		var b bytes.Buffer
		if m.HasPointKeys {
			fmt.Fprintf(&b, "point:    [%s-%s]\n", m.SmallestPoint, m.LargestPoint)
		}
		if m.HasRangeDelKeys {
			fmt.Fprintf(&b, "rangedel: [%s-%s]\n", m.SmallestRangeDel, m.LargestRangeDel)
		}
		if m.HasRangeKeys {
			fmt.Fprintf(&b, "rangekey: [%s-%s]\n", m.SmallestRangeKey, m.LargestRangeKey)
		}
		fmt.Fprintf(&b, "seqnums:  [%d-%d]\n", m.SmallestSeqNum, m.LargestSeqNum)
		return b.String()
	}

	// formatVirtualReader renders the virtual reader's bounds, file number,
	// and extrapolated properties for datadriven output.
	formatVirtualReader := func(v *VirtualReader) string {
		var b bytes.Buffer
		fmt.Fprintf(&b, "bounds:  [%s-%s]\n", v.vState.lower, v.vState.upper)
		fmt.Fprintf(&b, "filenum: %s\n", v.vState.fileNum.String())
		fmt.Fprintf(
			&b, "props: %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d, %s: %d\n",
			"NumEntries",
			v.Properties.NumEntries,
			"RawKeySize",
			v.Properties.RawKeySize,
			"RawValueSize",
			v.Properties.RawValueSize,
			"RawPointTombstoneKeySize",
			v.Properties.RawPointTombstoneKeySize,
			"RawPointTombstoneValueSize",
			v.Properties.RawPointTombstoneValueSize,
			"NumSizedDeletions",
			v.Properties.NumSizedDeletions,
			"NumDeletions",
			v.Properties.NumDeletions,
			"NumRangeDeletions",
			v.Properties.NumRangeDeletions,
			"NumRangeKeyDels",
			v.Properties.NumRangeKeyDels,
			"NumRangeKeySets",
			v.Properties.NumRangeKeySets,
			"ValueBlocksSize",
			v.Properties.ValueBlocksSize,
		)
		return b.String()
	}

	datadriven.RunTest(t, "testdata/virtual_reader", func(t *testing.T, td *datadriven.TestData) string {
		switch td.Cmd {
		case "build":
			// Tear down state from any previous build before building anew.
			if r != nil {
				bp.Release()
				_ = r.Close()
				r = nil
				meta.FileMetadata = nil
				vMeta1.FileMetadata = nil
				v = VirtualReader{}
			}
			var wMeta *WriterMetadata
			var err error
			writerOpts := &WriterOptions{
				TableFormat: TableFormatMax,
			}
			// Use a single level index by default.
			writerOpts.IndexBlockSize = 100000
			if len(td.CmdArgs) == 1 {
				if td.CmdArgs[0].String() == "twoLevel" {
					// Force a two level index.
					writerOpts.IndexBlockSize = 1
					writerOpts.BlockSize = 1
				}
			}
			wMeta, r, err = runBuildCmd(td, writerOpts, 0)
			if err != nil {
				return err.Error()
			}
			bp.Init(5)

			// Create a fake filemetada using the writer meta.
			meta, err = createPhysicalMeta(wMeta, r)
			if err != nil {
				return err.Error()
			}
			r.fileNum = meta.FileBacking.DiskFileNum
			return formatWMeta(wMeta)

		case "virtualize":
			// virtualize will split the previously built physical sstable into
			// a single sstable with virtual bounds. The command assumes that
			// the bounds for the virtual sstable are valid. For the purposes of
			// this command the bounds must be valid keys. In general, and for
			// this command, range key/range del spans must also not span across
			// virtual sstable bounds.
			if meta.FileMetadata == nil {
				return "build must be called at least once before virtualize"
			}
			if vMeta1.FileMetadata != nil {
				vMeta1.FileMetadata = nil
				v = VirtualReader{}
			}
			vMeta := &manifest.FileMetadata{
				FileBacking:    meta.FileBacking,
				SmallestSeqNum: meta.SmallestSeqNum,
				LargestSeqNum:  meta.LargestSeqNum,
				Virtual:        true,
			}
			// Parse the virtualization bounds.
			bounds := strings.Split(td.CmdArgs[0].String(), "-")
			vMeta.Smallest = base.ParseInternalKey(bounds[0])
			vMeta.Largest = base.ParseInternalKey(bounds[1])
			vMeta.FileNum = nextFileNum()
			var err error
			vMeta.Size, err = r.EstimateDiskUsage(vMeta.Smallest.UserKey, vMeta.Largest.UserKey)
			if err != nil {
				return err.Error()
			}
			vMeta.ValidateVirtual(meta.FileMetadata)

			vMeta1 = vMeta.VirtualMeta()
			v = MakeVirtualReader(r, vMeta1, false /* isForeign */)
			return formatVirtualReader(&v)

		case "citer":
			// Creates a compaction iterator from the virtual reader, and then
			// just scans the keyspace. Which is all a compaction iterator is
			// used for. This tests the First and Next calls.
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before creating compaction iters"
			}

			var rp ReaderProvider
			var bytesIterated uint64
			iter, err := v.NewCompactionIter(&bytesIterated, rp, &bp)
			if err != nil {
				return err.Error()
			}

			var buf bytes.Buffer
			for key, val := iter.First(); key != nil; key, val = iter.Next() {
				fmt.Fprintf(&buf, "%s:%s\n", key.String(), val.InPlaceValue())
			}
			err = iter.Close()
			if err != nil {
				return err.Error()
			}
			return buf.String()

		case "constrain":
			// Prints the result of constraining the given [first,last] bounds
			// (plus an end-inclusive flag) to the virtual sstable's bounds.
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before constrain"
			}
			splits := strings.Split(td.CmdArgs[0].String(), ",")
			of, ol := []byte(splits[0]), []byte(splits[1])
			inclusive, f, l := v.vState.constrainBounds(of, ol, splits[2] == "true")
			var buf bytes.Buffer
			buf.Write(f)
			buf.WriteByte(',')
			buf.Write(l)
			buf.WriteByte(',')
			if inclusive {
				buf.WriteString("true")
			} else {
				buf.WriteString("false")
			}
			buf.WriteByte('\n')
			return buf.String()

		case "scan-range-del":
			// Prints every range-deletion span visible through the virtual
			// reader.
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before scan-range-del"
			}
			iter, err := v.NewRawRangeDelIter()
			if err != nil {
				return err.Error()
			}
			if iter == nil {
				return ""
			}
			defer iter.Close()

			var buf bytes.Buffer
			for s := iter.First(); s != nil; s = iter.Next() {
				fmt.Fprintf(&buf, "%s\n", s)
			}
			return buf.String()

		case "scan-range-key":
			// Prints every range-key span visible through the virtual reader.
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before scan-range-key"
			}
			iter, err := v.NewRawRangeKeyIter()
			if err != nil {
				return err.Error()
			}
			if iter == nil {
				return ""
			}
			defer iter.Close()

			var buf bytes.Buffer
			for s := iter.First(); s != nil; s = iter.Next() {
				fmt.Fprintf(&buf, "%s\n", s)
			}
			return buf.String()

		case "iter":
			// Runs the standard iterator command set against a point iterator
			// constructed from the virtual reader, optionally with bounds
			// supplied as "lower-upper".
			if vMeta1.FileMetadata == nil {
				return "virtualize must be called before iter"
			}
			var lower, upper []byte
			if len(td.CmdArgs) > 0 {
				splits := strings.Split(td.CmdArgs[0].String(), "-")
				lower, upper = []byte(splits[0]), []byte(splits[1])
			}

			var stats base.InternalIteratorStats
			iter, err := v.NewIterWithBlockPropertyFiltersAndContextEtc(
				context.Background(), lower, upper, nil, false, false,
				&stats, TrivialReaderProvider{Reader: r})
			if err != nil {
				return err.Error()
			}
			return runIterCmd(td, iter, true, runIterCmdStats(&stats))

		default:
			return fmt.Sprintf("unknown command: %s", td.Cmd)
		}
	})
}
   454  
   455  func TestReader(t *testing.T) {
   456  	writerOpts := map[string]WriterOptions{
   457  		// No bloom filters.
   458  		"default": {},
   459  		"bloom10bit": {
   460  			// The standard policy.
   461  			FilterPolicy: bloom.FilterPolicy(10),
   462  			FilterType:   base.TableFilter,
   463  		},
   464  		"bloom1bit": {
   465  			// A policy with many false positives.
   466  			FilterPolicy: bloom.FilterPolicy(1),
   467  			FilterType:   base.TableFilter,
   468  		},
   469  		"bloom100bit": {
   470  			// A policy unlikely to have false positives.
   471  			FilterPolicy: bloom.FilterPolicy(100),
   472  			FilterType:   base.TableFilter,
   473  		},
   474  	}
   475  
   476  	blockSizes := map[string]int{
   477  		"1bytes":   1,
   478  		"5bytes":   5,
   479  		"10bytes":  10,
   480  		"25bytes":  25,
   481  		"Maxbytes": math.MaxInt32,
   482  	}
   483  
   484  	opts := map[string]*Comparer{
   485  		"default":      testkeys.Comparer,
   486  		"prefixFilter": fixtureComparer,
   487  	}
   488  
   489  	testDirs := map[string]string{
   490  		"default":      "testdata/reader",
   491  		"prefixFilter": "testdata/prefixreader",
   492  	}
   493  
   494  	for format := TableFormatPebblev2; format <= TableFormatMax; format++ {
   495  		for dName, blockSize := range blockSizes {
   496  			for iName, indexBlockSize := range blockSizes {
   497  				for lName, tableOpt := range writerOpts {
   498  					for oName, cmp := range opts {
   499  						tableOpt.BlockSize = blockSize
   500  						tableOpt.Comparer = cmp
   501  						tableOpt.IndexBlockSize = indexBlockSize
   502  						tableOpt.TableFormat = format
   503  
   504  						t.Run(
   505  							fmt.Sprintf("format=%d,opts=%s,writerOpts=%s,blockSize=%s,indexSize=%s",
   506  								format, oName, lName, dName, iName),
   507  							func(t *testing.T) {
   508  								runTestReader(
   509  									t, tableOpt, testDirs[oName], nil /* Reader */, true)
   510  							})
   511  					}
   512  				}
   513  			}
   514  		}
   515  	}
   516  }
   517  
   518  func TestReaderHideObsolete(t *testing.T) {
   519  	blockSizes := map[string]int{
   520  		"1bytes":   1,
   521  		"5bytes":   5,
   522  		"10bytes":  10,
   523  		"25bytes":  25,
   524  		"Maxbytes": math.MaxInt32,
   525  	}
   526  	for dName, blockSize := range blockSizes {
   527  		opts := WriterOptions{
   528  			TableFormat:    TableFormatPebblev4,
   529  			BlockSize:      blockSize,
   530  			IndexBlockSize: blockSize,
   531  			Comparer:       testkeys.Comparer,
   532  		}
   533  		t.Run(fmt.Sprintf("blockSize=%s", dName), func(t *testing.T) {
   534  			runTestReader(
   535  				t, opts, "testdata/reader_hide_obsolete",
   536  				nil /* Reader */, true)
   537  		})
   538  	}
   539  }
   540  
   541  func TestHamletReader(t *testing.T) {
   542  	prebuiltSSTs := []string{
   543  		"testdata/h.ldb",
   544  		"testdata/h.sst",
   545  		"testdata/h.no-compression.sst",
   546  		"testdata/h.no-compression.two_level_index.sst",
   547  		"testdata/h.block-bloom.no-compression.sst",
   548  		"testdata/h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst",
   549  		"testdata/h.table-bloom.no-compression.sst",
   550  	}
   551  
   552  	for _, prebuiltSST := range prebuiltSSTs {
   553  		f, err := os.Open(filepath.FromSlash(prebuiltSST))
   554  		require.NoError(t, err)
   555  
   556  		r, err := newReader(f, ReaderOptions{})
   557  		require.NoError(t, err)
   558  
   559  		t.Run(
   560  			fmt.Sprintf("sst=%s", prebuiltSST),
   561  			func(t *testing.T) {
   562  				runTestReader(t, WriterOptions{}, "testdata/hamletreader", r, false)
   563  			},
   564  		)
   565  	}
   566  }
   567  
   568  func forEveryTableFormat[I any](
   569  	t *testing.T, formatTable [NumTableFormats]I, runTest func(*testing.T, TableFormat, I),
   570  ) {
   571  	t.Helper()
   572  	for tf := TableFormatUnspecified + 1; tf <= TableFormatMax; tf++ {
   573  		t.Run(tf.String(), func(t *testing.T) {
   574  			runTest(t, tf, formatTable[tf])
   575  		})
   576  	}
   577  }
   578  
   579  func TestReaderStats(t *testing.T) {
   580  	forEveryTableFormat[string](t,
   581  		[NumTableFormats]string{
   582  			TableFormatUnspecified: "",
   583  			TableFormatLevelDB:     "testdata/readerstats_LevelDB",
   584  			TableFormatRocksDBv2:   "testdata/readerstats_LevelDB",
   585  			TableFormatPebblev1:    "testdata/readerstats_LevelDB",
   586  			TableFormatPebblev2:    "testdata/readerstats_LevelDB",
   587  			TableFormatPebblev3:    "testdata/readerstats_Pebblev3",
   588  			TableFormatPebblev4:    "testdata/readerstats_Pebblev3",
   589  		}, func(t *testing.T, format TableFormat, dir string) {
   590  			if dir == "" {
   591  				t.Skip()
   592  			}
   593  			writerOpt := WriterOptions{
   594  				BlockSize:      32 << 10,
   595  				IndexBlockSize: 32 << 10,
   596  				Comparer:       testkeys.Comparer,
   597  				TableFormat:    format,
   598  			}
   599  			runTestReader(t, writerOpt, dir, nil /* Reader */, false /* printValue */)
   600  		})
   601  }
   602  
// TestReaderWithBlockPropertyFilter runs per-format datadriven tests that
// exercise block-property filtering; formats predating block properties are
// skipped.
func TestReaderWithBlockPropertyFilter(t *testing.T) {
	// Some of these tests examine internal iterator state, so they require
	// determinism. When the invariants tag is set, disableBoundsOpt may disable
	// the bounds optimization depending on the iterator pointer address. This
	// can add nondeterminism to the internal iterator state. Disable this
	// nondeterminism for the duration of this test.
	ensureBoundsOptDeterminism = true
	defer func() { ensureBoundsOptDeterminism = false }()

	forEveryTableFormat[string](t,
		[NumTableFormats]string{
			TableFormatUnspecified: "", // Block properties unsupported
			TableFormatLevelDB:     "", // Block properties unsupported
			TableFormatRocksDBv2:   "", // Block properties unsupported
			TableFormatPebblev1:    "", // Block properties unsupported
			TableFormatPebblev2:    "testdata/reader_bpf/Pebblev2",
			TableFormatPebblev3:    "testdata/reader_bpf/Pebblev3",
			TableFormatPebblev4:    "testdata/reader_bpf/Pebblev3",
		}, func(t *testing.T, format TableFormat, dir string) {
			if dir == "" {
				t.Skip("Block-properties unsupported")
			}
			writerOpt := WriterOptions{
				Comparer:                testkeys.Comparer,
				TableFormat:             format,
				BlockPropertyCollectors: []func() BlockPropertyCollector{NewTestKeysBlockPropertyCollector},
			}
			runTestReader(t, writerOpt, dir, nil /* Reader */, false)
		})
}
   633  
// TestInjectedErrors opens each prebuilt sstable with an error-injecting
// filesystem and verifies that an error injected at any single read index
// surfaces as errorfs.ErrInjected rather than a crash or an unrelated error.
// For each table it increments the injection index until a full scan
// completes without error.
func TestInjectedErrors(t *testing.T) {
	prebuiltSSTs := []string{
		"testdata/h.ldb",
		"testdata/h.sst",
		"testdata/h.no-compression.sst",
		"testdata/h.no-compression.two_level_index.sst",
		"testdata/h.block-bloom.no-compression.sst",
		"testdata/h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst",
		"testdata/h.table-bloom.no-compression.sst",
	}

	for _, prebuiltSST := range prebuiltSSTs {
		// run performs one full open/estimate/scan cycle with an error
		// injected at read index i, returning the first error encountered.
		// Close errors from the reader and iterator are folded into the
		// result via the deferred firstError calls.
		run := func(i int) (reterr error) {
			f, err := vfs.Default.Open(filepath.FromSlash(prebuiltSST))
			require.NoError(t, err)

			r, err := newReader(errorfs.WrapFile(f, errorfs.OnIndex(int32(i))), ReaderOptions{})
			if err != nil {
				return firstError(err, f.Close())
			}
			defer func() { reterr = firstError(reterr, r.Close()) }()

			_, err = r.EstimateDiskUsage([]byte("borrower"), []byte("lender"))
			if err != nil {
				return err
			}

			iter, err := r.NewIter(nil, nil)
			if err != nil {
				return err
			}
			defer func() { reterr = firstError(reterr, iter.Close()) }()
			for k, v := iter.First(); k != nil; k, v = iter.Next() {
				val, _, err := v.Value(nil)
				if err != nil {
					return err
				}
				if val == nil {
					break
				}
			}
			if err = iter.Error(); err != nil {
				return err
			}
			return nil
		}
		// Advance the injection index until a run succeeds; any error other
		// than the injected one is a test failure.
		for i := 0; ; i++ {
			err := run(i)
			if errors.Is(err, errorfs.ErrInjected) {
				t.Logf("%q, index %d: %s", prebuiltSST, i, err)
				continue
			}
			if err != nil {
				t.Errorf("%q, index %d: non-injected error: %+v", prebuiltSST, i, err)
				break
			}
			t.Logf("%q: no error at index %d", prebuiltSST, i)
			break
		}
	}
}
   695  
   696  func TestInvalidReader(t *testing.T) {
   697  	invalid, err := NewSimpleReadable(vfs.NewMemFile([]byte("invalid sst bytes")))
   698  	if err != nil {
   699  		t.Fatal(err)
   700  	}
   701  	testCases := []struct {
   702  		readable objstorage.Readable
   703  		expected string
   704  	}{
   705  		{nil, "nil file"},
   706  		{invalid, "invalid table"},
   707  	}
   708  	for _, tc := range testCases {
   709  		r, err := NewReader(tc.readable, ReaderOptions{})
   710  		if !strings.Contains(err.Error(), tc.expected) {
   711  			t.Fatalf("expected %q, but found %q", tc.expected, err.Error())
   712  		}
   713  		if r != nil {
   714  			t.Fatalf("found non-nil reader returned with non-nil error %q", err.Error())
   715  		}
   716  	}
   717  }
   718  
   719  func indexLayoutString(t *testing.T, r *Reader) string {
   720  	indexH, err := r.readIndex(context.Background(), nil)
   721  	require.NoError(t, err)
   722  	defer indexH.Release()
   723  	var buf strings.Builder
   724  	twoLevelIndex := r.Properties.IndexType == twoLevelIndex
   725  	buf.WriteString("index entries:\n")
   726  	iter, err := newBlockIter(r.Compare, indexH.Get())
   727  	defer func() {
   728  		require.NoError(t, iter.Close())
   729  	}()
   730  	require.NoError(t, err)
   731  	for key, value := iter.First(); key != nil; key, value = iter.Next() {
   732  		bh, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   733  		require.NoError(t, err)
   734  		fmt.Fprintf(&buf, " %s: size %d\n", string(key.UserKey), bh.Length)
   735  		if twoLevelIndex {
   736  			b, err := r.readBlock(
   737  				context.Background(), bh.BlockHandle, nil, nil, nil, nil)
   738  			require.NoError(t, err)
   739  			defer b.Release()
   740  			iter2, err := newBlockIter(r.Compare, b.Get())
   741  			defer func() {
   742  				require.NoError(t, iter2.Close())
   743  			}()
   744  			require.NoError(t, err)
   745  			for key, value := iter2.First(); key != nil; key, value = iter2.Next() {
   746  				bh, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   747  				require.NoError(t, err)
   748  				fmt.Fprintf(&buf, "   %s: size %d\n", string(key.UserKey), bh.Length)
   749  			}
   750  		}
   751  	}
   752  	return buf.String()
   753  }
   754  
// runTestReader walks the datadriven files under dir and executes the shared
// reader command set against them: "build" constructs an sstable (optionally
// printing its index layout), "iter" runs iterator commands with optional
// block-property filtering and obsolete-point hiding, and "get" performs
// point lookups via Reader.get. If r is non-nil it is used as the initial
// reader (and closed when done); printValue controls whether iter output
// includes values.
func runTestReader(t *testing.T, o WriterOptions, dir string, r *Reader, printValue bool) {
	datadriven.Walk(t, dir, func(t *testing.T, path string) {
		// Close whichever reader is current when this datadriven file is done.
		defer func() {
			if r != nil {
				r.Close()
				r = nil
			}
		}()

		datadriven.RunTest(t, path, func(t *testing.T, d *datadriven.TestData) string {
			switch d.Cmd {
			case "build":
				// Replace any reader left over from a previous build.
				if r != nil {
					r.Close()
					r = nil
				}
				var cacheSize int
				var printLayout bool
				d.MaybeScanArgs(t, "cache-size", &cacheSize)
				d.MaybeScanArgs(t, "print-layout", &printLayout)
				d.MaybeScanArgs(t, "block-size", &o.BlockSize)
				d.MaybeScanArgs(t, "index-block-size", &o.IndexBlockSize)

				var err error
				_, r, err = runBuildCmd(d, &o, cacheSize)
				if err != nil {
					return err.Error()
				}
				if printLayout {
					return indexLayoutString(t, r)
				}
				return ""

			case "iter":
				seqNum, err := scanGlobalSeqNum(d)
				if err != nil {
					return err.Error()
				}
				var stats base.InternalIteratorStats
				r.Properties.GlobalSeqNum = seqNum
				var bpfs []BlockPropertyFilter
				if d.HasArg("block-property-filter") {
					var filterMin, filterMax uint64
					d.ScanArgs(t, "block-property-filter", &filterMin, &filterMax)
					bpf := NewTestKeysBlockPropertyFilter(filterMin, filterMax)
					bpfs = append(bpfs, bpf)
				}
				hideObsoletePoints := false
				if d.HasArg("hide-obsolete-points") {
					d.ScanArgs(t, "hide-obsolete-points", &hideObsoletePoints)
					if hideObsoletePoints {
						hideObsoletePoints, bpfs = r.TryAddBlockPropertyFilterForHideObsoletePoints(
							InternalKeySeqNumMax, InternalKeySeqNumMax-1, bpfs)
						require.True(t, hideObsoletePoints)
					}
				}
				var filterer *BlockPropertiesFilterer
				if len(bpfs) > 0 {
					filterer = newBlockPropertiesFilterer(bpfs, nil)
					intersects, err :=
						filterer.intersectsUserPropsAndFinishInit(r.Properties.UserProperties)
					if err != nil {
						return err.Error()
					}
					if !intersects {
						return "table does not intersect BlockPropertyFilter"
					}
				}
				iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc(
					context.Background(),
					nil, /* lower */
					nil, /* upper */
					filterer,
					hideObsoletePoints,
					true, /* use filter block */
					&stats,
					TrivialReaderProvider{Reader: r},
				)
				if err != nil {
					return err.Error()
				}
				return runIterCmd(d, iter, printValue, runIterCmdStats(&stats))

			case "get":
				// One lookup per input line; errors are rendered inline so
				// ErrNotFound cases are visible in the expected output.
				var b bytes.Buffer
				for _, k := range strings.Split(d.Input, "\n") {
					v, err := r.get([]byte(k))
					if err != nil {
						fmt.Fprintf(&b, "<err: %s>\n", err)
					} else {
						fmt.Fprintln(&b, string(v))
					}
				}
				return b.String()
			default:
				return fmt.Sprintf("unknown command: %s", d.Cmd)
			}
		})
	})
}
   855  
   856  func TestReaderCheckComparerMerger(t *testing.T) {
   857  	const testTable = "test"
   858  
   859  	testComparer := &base.Comparer{
   860  		Name:      "test.comparer",
   861  		Compare:   base.DefaultComparer.Compare,
   862  		Equal:     base.DefaultComparer.Equal,
   863  		Separator: base.DefaultComparer.Separator,
   864  		Successor: base.DefaultComparer.Successor,
   865  	}
   866  	testMerger := &base.Merger{
   867  		Name:  "test.merger",
   868  		Merge: base.DefaultMerger.Merge,
   869  	}
   870  	writerOpts := WriterOptions{
   871  		Comparer:   testComparer,
   872  		MergerName: "test.merger",
   873  	}
   874  
   875  	mem := vfs.NewMem()
   876  	f0, err := mem.Create(testTable)
   877  	require.NoError(t, err)
   878  
   879  	w := NewWriter(objstorageprovider.NewFileWritable(f0), writerOpts)
   880  	require.NoError(t, w.Set([]byte("test"), nil))
   881  	require.NoError(t, w.Close())
   882  
   883  	testCases := []struct {
   884  		comparers []*base.Comparer
   885  		mergers   []*base.Merger
   886  		expected  string
   887  	}{
   888  		{
   889  			[]*base.Comparer{testComparer},
   890  			[]*base.Merger{testMerger},
   891  			"",
   892  		},
   893  		{
   894  			[]*base.Comparer{testComparer, base.DefaultComparer},
   895  			[]*base.Merger{testMerger, base.DefaultMerger},
   896  			"",
   897  		},
   898  		{
   899  			[]*base.Comparer{},
   900  			[]*base.Merger{testMerger},
   901  			"unknown comparer test.comparer",
   902  		},
   903  		{
   904  			[]*base.Comparer{base.DefaultComparer},
   905  			[]*base.Merger{testMerger},
   906  			"unknown comparer test.comparer",
   907  		},
   908  		{
   909  			[]*base.Comparer{testComparer},
   910  			[]*base.Merger{},
   911  			"unknown merger test.merger",
   912  		},
   913  		{
   914  			[]*base.Comparer{testComparer},
   915  			[]*base.Merger{base.DefaultMerger},
   916  			"unknown merger test.merger",
   917  		},
   918  	}
   919  
   920  	for _, c := range testCases {
   921  		t.Run("", func(t *testing.T) {
   922  			f1, err := mem.Open(testTable)
   923  			require.NoError(t, err)
   924  
   925  			comparers := make(Comparers)
   926  			for _, comparer := range c.comparers {
   927  				comparers[comparer.Name] = comparer
   928  			}
   929  			mergers := make(Mergers)
   930  			for _, merger := range c.mergers {
   931  				mergers[merger.Name] = merger
   932  			}
   933  
   934  			r, err := newReader(f1, ReaderOptions{}, comparers, mergers)
   935  			if err != nil {
   936  				if r != nil {
   937  					t.Fatalf("found non-nil reader returned with non-nil error %q", err.Error())
   938  				}
   939  				if !strings.HasSuffix(err.Error(), c.expected) {
   940  					t.Fatalf("expected %q, but found %q", c.expected, err.Error())
   941  				}
   942  			} else if c.expected != "" {
   943  				t.Fatalf("expected %q, but found success", c.expected)
   944  			}
   945  			if r != nil {
   946  				_ = r.Close()
   947  			}
   948  		})
   949  	}
   950  }
   951  func checkValidPrefix(prefix, key []byte) bool {
   952  	return prefix == nil || bytes.HasPrefix(key, prefix)
   953  }
   954  
   955  func testBytesIteratedWithCompression(
   956  	t *testing.T,
   957  	compression Compression,
   958  	allowedSizeDeviationPercent uint64,
   959  	blockSizes []int,
   960  	maxNumEntries []uint64,
   961  ) {
   962  	for i, blockSize := range blockSizes {
   963  		for _, indexBlockSize := range blockSizes {
   964  			for _, numEntries := range []uint64{0, 1, maxNumEntries[i]} {
   965  				r := buildTestTable(t, numEntries, blockSize, indexBlockSize, compression)
   966  				var bytesIterated, prevIterated uint64
   967  				var pool BufferPool
   968  				pool.Init(5)
   969  				citer, err := r.NewCompactionIter(&bytesIterated, TrivialReaderProvider{Reader: r}, &pool)
   970  				require.NoError(t, err)
   971  
   972  				for key, _ := citer.First(); key != nil; key, _ = citer.Next() {
   973  					if bytesIterated < prevIterated {
   974  						t.Fatalf("bytesIterated moved backward: %d < %d", bytesIterated, prevIterated)
   975  					}
   976  					prevIterated = bytesIterated
   977  				}
   978  
   979  				expected := r.Properties.DataSize
   980  				allowedSizeDeviation := expected * allowedSizeDeviationPercent / 100
   981  				// There is some inaccuracy due to compression estimation.
   982  				if bytesIterated < expected-allowedSizeDeviation || bytesIterated > expected+allowedSizeDeviation {
   983  					t.Fatalf("bytesIterated: got %d, want %d", bytesIterated, expected)
   984  				}
   985  
   986  				require.NoError(t, citer.Close())
   987  				require.NoError(t, r.Close())
   988  				pool.Release()
   989  			}
   990  		}
   991  	}
   992  }
   993  
   994  func TestBytesIterated(t *testing.T) {
   995  	blockSizes := []int{10, 100, 1000, 4096, math.MaxInt32}
   996  	t.Run("Compressed", func(t *testing.T) {
   997  		testBytesIteratedWithCompression(t, SnappyCompression, 1, blockSizes, []uint64{1e5, 1e5, 1e5, 1e5, 1e5})
   998  	})
   999  	t.Run("Uncompressed", func(t *testing.T) {
  1000  		testBytesIteratedWithCompression(t, NoCompression, 0, blockSizes, []uint64{1e5, 1e5, 1e5, 1e5, 1e5})
  1001  	})
  1002  	t.Run("Zstd", func(t *testing.T) {
  1003  		// compression with zstd is extremely slow with small block size (esp the nocgo version).
  1004  		// use less numEntries to make the test run at reasonable speed (under 10 seconds).
  1005  		maxNumEntries := []uint64{1e2, 1e2, 1e3, 4e3, 1e5}
  1006  		if useStandardZstdLib {
  1007  			maxNumEntries = []uint64{1e3, 1e3, 1e4, 4e4, 1e5}
  1008  		}
  1009  		testBytesIteratedWithCompression(t, ZstdCompression, 1, blockSizes, maxNumEntries)
  1010  	})
  1011  }
  1012  
  1013  func TestCompactionIteratorSetupForCompaction(t *testing.T) {
  1014  	tmpDir := path.Join(t.TempDir())
  1015  	provider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(vfs.Default, tmpDir))
  1016  	require.NoError(t, err)
  1017  	defer provider.Close()
  1018  	blockSizes := []int{10, 100, 1000, 4096, math.MaxInt32}
  1019  	for _, blockSize := range blockSizes {
  1020  		for _, indexBlockSize := range blockSizes {
  1021  			for _, numEntries := range []uint64{0, 1, 1e5} {
  1022  				r := buildTestTableWithProvider(t, provider, numEntries, blockSize, indexBlockSize, DefaultCompression)
  1023  				var bytesIterated uint64
  1024  				var pool BufferPool
  1025  				pool.Init(5)
  1026  				citer, err := r.NewCompactionIter(&bytesIterated, TrivialReaderProvider{Reader: r}, &pool)
  1027  				require.NoError(t, err)
  1028  				switch i := citer.(type) {
  1029  				case *compactionIterator:
  1030  					require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.dataRH))
  1031  					// Each key has one version, so no value block, regardless of
  1032  					// sstable version.
  1033  					require.Nil(t, i.vbRH)
  1034  				case *twoLevelCompactionIterator:
  1035  					require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.dataRH))
  1036  					// Each key has one version, so no value block, regardless of
  1037  					// sstable version.
  1038  					require.Nil(t, i.vbRH)
  1039  				default:
  1040  					require.Failf(t, fmt.Sprintf("unknown compaction iterator type: %T", citer), "")
  1041  				}
  1042  				require.NoError(t, citer.Close())
  1043  				require.NoError(t, r.Close())
  1044  				pool.Release()
  1045  			}
  1046  		}
  1047  	}
  1048  }
  1049  
  1050  func TestReadaheadSetupForV3TablesWithMultipleVersions(t *testing.T) {
  1051  	tmpDir := path.Join(t.TempDir())
  1052  	provider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(vfs.Default, tmpDir))
  1053  	require.NoError(t, err)
  1054  	defer provider.Close()
  1055  	f0, _, err := provider.Create(context.Background(), base.FileTypeTable, base.FileNum(0).DiskFileNum(), objstorage.CreateOptions{})
  1056  	require.NoError(t, err)
  1057  
  1058  	w := NewWriter(f0, WriterOptions{
  1059  		TableFormat: TableFormatPebblev3,
  1060  		Comparer:    testkeys.Comparer,
  1061  	})
  1062  	keys := testkeys.Alpha(1)
  1063  	keyBuf := make([]byte, 1+testkeys.MaxSuffixLen)
  1064  	// Write a few keys with multiple timestamps (MVCC versions).
  1065  	for i := int64(0); i < 2; i++ {
  1066  		for j := int64(2); j >= 1; j-- {
  1067  			n := testkeys.WriteKeyAt(keyBuf[:], keys, i, j)
  1068  			key := keyBuf[:n]
  1069  			require.NoError(t, w.Set(key, key))
  1070  		}
  1071  	}
  1072  	require.NoError(t, w.Close())
  1073  	f1, err := provider.OpenForReading(context.Background(), base.FileTypeTable, base.FileNum(0).DiskFileNum(), objstorage.OpenOptions{})
  1074  	require.NoError(t, err)
  1075  	r, err := NewReader(f1, ReaderOptions{Comparer: testkeys.Comparer})
  1076  	require.NoError(t, err)
  1077  	defer r.Close()
  1078  	{
  1079  		var pool BufferPool
  1080  		pool.Init(5)
  1081  		citer, err := r.NewCompactionIter(nil, TrivialReaderProvider{Reader: r}, &pool)
  1082  		require.NoError(t, err)
  1083  		defer citer.Close()
  1084  		i := citer.(*compactionIterator)
  1085  		require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.dataRH))
  1086  		require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.vbRH))
  1087  	}
  1088  	{
  1089  		iter, err := r.NewIter(nil, nil)
  1090  		require.NoError(t, err)
  1091  		defer iter.Close()
  1092  		i := iter.(*singleLevelIterator)
  1093  		require.False(t, objstorageprovider.TestingCheckMaxReadahead(i.dataRH))
  1094  		require.False(t, objstorageprovider.TestingCheckMaxReadahead(i.vbRH))
  1095  	}
  1096  }
  1097  
// TestReaderChecksumErrors verifies that corrupting any data block surfaces a
// "checksum mismatch" error during iteration (both forward and backward), for
// both supported checksum types and for single- and two-level index layouts.
func TestReaderChecksumErrors(t *testing.T) {
	for _, checksumType := range []ChecksumType{ChecksumTypeCRC32c, ChecksumTypeXXHash64} {
		t.Run(fmt.Sprintf("checksum-type=%d", checksumType), func(t *testing.T) {
			for _, twoLevelIndex := range []bool{false, true} {
				t.Run(fmt.Sprintf("two-level-index=%t", twoLevelIndex), func(t *testing.T) {
					mem := vfs.NewMem()

					{
						// Create an sstable with 3 data blocks.
						f, err := mem.Create("test")
						require.NoError(t, err)

						const blockSize = 32
						indexBlockSize := 4096
						if twoLevelIndex {
							// A tiny index block size forces an index block per
							// data block, producing a two-level index.
							indexBlockSize = 1
						}

						w := NewWriter(objstorageprovider.NewFileWritable(f), WriterOptions{
							BlockSize:      blockSize,
							IndexBlockSize: indexBlockSize,
							Checksum:       checksumType,
						})
						// Each key is blockSize bytes, so each Set fills one block.
						require.NoError(t, w.Set(bytes.Repeat([]byte("a"), blockSize), nil))
						require.NoError(t, w.Set(bytes.Repeat([]byte("b"), blockSize), nil))
						require.NoError(t, w.Set(bytes.Repeat([]byte("c"), blockSize), nil))
						require.NoError(t, w.Close())
					}

					// Load the layout so that we know the location of the data blocks.
					var layout *Layout
					{
						f, err := mem.Open("test")
						require.NoError(t, err)

						r, err := newReader(f, ReaderOptions{})
						require.NoError(t, err)
						layout, err = r.Layout()
						require.NoError(t, err)
						require.EqualValues(t, len(layout.Data), 3)
						require.NoError(t, r.Close())
					}

					for _, bh := range layout.Data {
						// Read the sstable and corrupt the first byte in the target data
						// block.
						orig, err := mem.Open("test")
						require.NoError(t, err)
						data, err := io.ReadAll(orig)
						require.NoError(t, err)
						require.NoError(t, orig.Close())

						// Corrupt the first byte in the block.
						data[bh.Offset] ^= 0xff

						corrupted, err := mem.Create("corrupted")
						require.NoError(t, err)
						_, err = corrupted.Write(data)
						require.NoError(t, err)
						require.NoError(t, corrupted.Close())

						// Verify that we encounter a checksum mismatch error while iterating
						// over the sstable.
						corrupted, err = mem.Open("corrupted")
						require.NoError(t, err)

						r, err := newReader(corrupted, ReaderOptions{})
						require.NoError(t, err)

						// Forward iteration must hit the corrupted block.
						iter, err := r.NewIter(nil, nil)
						require.NoError(t, err)
						for k, _ := iter.First(); k != nil; k, _ = iter.Next() {
						}
						require.Regexp(t, `checksum mismatch`, iter.Error())
						require.Regexp(t, `checksum mismatch`, iter.Close())

						// Backward iteration must hit it as well.
						iter, err = r.NewIter(nil, nil)
						require.NoError(t, err)
						for k, _ := iter.Last(); k != nil; k, _ = iter.Prev() {
						}
						require.Regexp(t, `checksum mismatch`, iter.Error())
						require.Regexp(t, `checksum mismatch`, iter.Close())

						require.NoError(t, r.Close())
					}
				})
			}
		})
	}
}
  1188  
// TestValidateBlockChecksums verifies Reader.ValidateBlockChecksums: it
// passes on pristine testdata sstables, and fails with a "checksum mismatch"
// error after a byte is flipped in any of the block types (data, index, top
// index, filter, range-del, properties, metaindex). Corruption offsets are
// chosen randomly; the seed is logged so failures can be reproduced.
func TestValidateBlockChecksums(t *testing.T) {
	seed := uint64(time.Now().UnixNano())
	rng := rand.New(rand.NewSource(seed))
	t.Logf("using seed = %d", seed)

	allFiles := []string{
		"testdata/h.no-compression.sst",
		"testdata/h.no-compression.two_level_index.sst",
		"testdata/h.sst",
		"testdata/h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst",
		"testdata/h.table-bloom.no-compression.sst",
		"testdata/h.table-bloom.sst",
		"testdata/h.zstd-compression.sst",
	}

	// corruptionLocation selects which block of the table gets a bit flip.
	type corruptionLocation int
	const (
		corruptionLocationData corruptionLocation = iota
		corruptionLocationIndex
		corruptionLocationTopIndex
		corruptionLocationFilter
		corruptionLocationRangeDel
		corruptionLocationProperties
		corruptionLocationMetaIndex
	)

	testCases := []struct {
		name                string
		files               []string
		corruptionLocations []corruptionLocation
	}{
		{
			name:                "no corruption",
			corruptionLocations: []corruptionLocation{},
		},
		{
			name: "data block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationData,
			},
		},
		{
			name: "index block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationIndex,
			},
		},
		{
			// Only the two-level-index fixture has a top index block.
			name: "top index block corruption",
			files: []string{
				"testdata/h.no-compression.two_level_index.sst",
			},
			corruptionLocations: []corruptionLocation{
				corruptionLocationTopIndex,
			},
		},
		{
			// Only the bloom-filter fixtures have a filter block.
			name: "filter block corruption",
			files: []string{
				"testdata/h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst",
				"testdata/h.table-bloom.no-compression.sst",
				"testdata/h.table-bloom.sst",
			},
			corruptionLocations: []corruptionLocation{
				corruptionLocationFilter,
			},
		},
		{
			name: "range deletion block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationRangeDel,
			},
		},
		{
			name: "properties block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationProperties,
			},
		},
		{
			name: "metaindex block corruption",
			corruptionLocations: []corruptionLocation{
				corruptionLocationMetaIndex,
			},
		},
		{
			name: "multiple blocks corrupted",
			corruptionLocations: []corruptionLocation{
				corruptionLocationData,
				corruptionLocationIndex,
				corruptionLocationRangeDel,
				corruptionLocationProperties,
				corruptionLocationMetaIndex,
			},
		},
	}

	testFn := func(t *testing.T, file string, corruptionLocations []corruptionLocation) {
		// Create a copy of the SSTable that we can freely corrupt.
		f, err := os.Open(filepath.FromSlash(file))
		require.NoError(t, err)

		pathCopy := path.Join(t.TempDir(), path.Base(file))
		fCopy, err := os.OpenFile(pathCopy, os.O_CREATE|os.O_RDWR, 0600)
		require.NoError(t, err)
		defer fCopy.Close()

		_, err = io.Copy(fCopy, f)
		require.NoError(t, err)
		err = fCopy.Sync()
		require.NoError(t, err)
		require.NoError(t, f.Close())

		filter := bloom.FilterPolicy(10)
		r, err := newReader(fCopy, ReaderOptions{
			Filters: map[string]FilterPolicy{
				filter.Name(): filter,
			},
		})
		require.NoError(t, err)
		defer func() { require.NoError(t, r.Close()) }()

		// Prior to corruption, validation is successful.
		require.NoError(t, r.ValidateBlockChecksums())

		// If we are not testing for corruption, we can stop here.
		if len(corruptionLocations) == 0 {
			return
		}

		// Perform bit flips in various corruption locations.
		layout, err := r.Layout()
		require.NoError(t, err)
		for _, location := range corruptionLocations {
			var bh BlockHandle
			switch location {
			case corruptionLocationData:
				bh = layout.Data[rng.Intn(len(layout.Data))].BlockHandle
			case corruptionLocationIndex:
				bh = layout.Index[rng.Intn(len(layout.Index))]
			case corruptionLocationTopIndex:
				bh = layout.TopIndex
			case corruptionLocationFilter:
				bh = layout.Filter
			case corruptionLocationRangeDel:
				bh = layout.RangeDel
			case corruptionLocationProperties:
				bh = layout.Properties
			case corruptionLocationMetaIndex:
				bh = layout.MetaIndex
			default:
				t.Fatalf("unknown location")
			}

			// Corrupt a random byte within the selected block.
			pos := int64(bh.Offset) + rng.Int63n(int64(bh.Length))
			t.Logf("altering file=%s @ offset = %d", file, pos)

			b := make([]byte, 1)
			n, err := fCopy.ReadAt(b, pos)
			require.NoError(t, err)
			require.Equal(t, 1, n)
			t.Logf("data (before) = %08b", b)

			b[0] ^= 0xff
			t.Logf("data (after) = %08b", b)

			_, err = fCopy.WriteAt(b, pos)
			require.NoError(t, err)
		}

		// Write back to the file.
		err = fCopy.Sync()
		require.NoError(t, err)

		// Confirm that checksum validation fails.
		err = r.ValidateBlockChecksums()
		require.Error(t, err)
		require.Regexp(t, `checksum mismatch`, err.Error())
	}

	for _, tc := range testCases {
		// By default, test across all files, unless overridden.
		files := tc.files
		if files == nil {
			files = allFiles
		}
		for _, file := range files {
			t.Run(tc.name+" "+path.Base(file), func(t *testing.T) {
				testFn(t, file, tc.corruptionLocations)
			})
		}
	}
}
  1383  
  1384  func TestReader_TableFormat(t *testing.T) {
  1385  	test := func(t *testing.T, want TableFormat) {
  1386  		fs := vfs.NewMem()
  1387  		f, err := fs.Create("test")
  1388  		require.NoError(t, err)
  1389  
  1390  		opts := WriterOptions{TableFormat: want}
  1391  		w := NewWriter(objstorageprovider.NewFileWritable(f), opts)
  1392  		err = w.Close()
  1393  		require.NoError(t, err)
  1394  
  1395  		f, err = fs.Open("test")
  1396  		require.NoError(t, err)
  1397  		r, err := newReader(f, ReaderOptions{})
  1398  		require.NoError(t, err)
  1399  		defer r.Close()
  1400  
  1401  		got, err := r.TableFormat()
  1402  		require.NoError(t, err)
  1403  		require.Equal(t, want, got)
  1404  	}
  1405  
  1406  	for tf := TableFormatLevelDB; tf <= TableFormatMax; tf++ {
  1407  		t.Run(tf.String(), func(t *testing.T) {
  1408  			test(t, tf)
  1409  		})
  1410  	}
  1411  }
  1412  
  1413  func buildTestTable(
  1414  	t *testing.T, numEntries uint64, blockSize, indexBlockSize int, compression Compression,
  1415  ) *Reader {
  1416  	provider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(vfs.NewMem(), "" /* dirName */))
  1417  	require.NoError(t, err)
  1418  	defer provider.Close()
  1419  	return buildTestTableWithProvider(t, provider, numEntries, blockSize, indexBlockSize, compression)
  1420  }
  1421  
  1422  func buildTestTableWithProvider(
  1423  	t *testing.T,
  1424  	provider objstorage.Provider,
  1425  	numEntries uint64,
  1426  	blockSize, indexBlockSize int,
  1427  	compression Compression,
  1428  ) *Reader {
  1429  	f0, _, err := provider.Create(context.Background(), base.FileTypeTable, base.FileNum(0).DiskFileNum(), objstorage.CreateOptions{})
  1430  	require.NoError(t, err)
  1431  
  1432  	w := NewWriter(f0, WriterOptions{
  1433  		BlockSize:      blockSize,
  1434  		IndexBlockSize: indexBlockSize,
  1435  		Compression:    compression,
  1436  		FilterPolicy:   nil,
  1437  	})
  1438  
  1439  	var ikey InternalKey
  1440  	for i := uint64(0); i < numEntries; i++ {
  1441  		key := make([]byte, 8+i%3)
  1442  		value := make([]byte, i%100)
  1443  		binary.BigEndian.PutUint64(key, i)
  1444  		ikey.UserKey = key
  1445  		w.Add(ikey, value)
  1446  	}
  1447  
  1448  	require.NoError(t, w.Close())
  1449  
  1450  	// Re-open that filename for reading.
  1451  	f1, err := provider.OpenForReading(context.Background(), base.FileTypeTable, base.FileNum(0).DiskFileNum(), objstorage.OpenOptions{})
  1452  	require.NoError(t, err)
  1453  
  1454  	c := cache.New(128 << 20)
  1455  	defer c.Unref()
  1456  	r, err := NewReader(f1, ReaderOptions{
  1457  		Cache: c,
  1458  	})
  1459  	require.NoError(t, err)
  1460  	return r
  1461  }
  1462  
  1463  func buildBenchmarkTable(
  1464  	b *testing.B, options WriterOptions, confirmTwoLevelIndex bool, offset int,
  1465  ) (*Reader, [][]byte) {
  1466  	mem := vfs.NewMem()
  1467  	f0, err := mem.Create("bench")
  1468  	if err != nil {
  1469  		b.Fatal(err)
  1470  	}
  1471  
  1472  	w := NewWriter(objstorageprovider.NewFileWritable(f0), options)
  1473  
  1474  	var keys [][]byte
  1475  	var ikey InternalKey
  1476  	for i := uint64(0); i < 1e6; i++ {
  1477  		key := make([]byte, 8)
  1478  		binary.BigEndian.PutUint64(key, i+uint64(offset))
  1479  		keys = append(keys, key)
  1480  		ikey.UserKey = key
  1481  		w.Add(ikey, nil)
  1482  	}
  1483  
  1484  	if err := w.Close(); err != nil {
  1485  		b.Fatal(err)
  1486  	}
  1487  
  1488  	// Re-open that filename for reading.
  1489  	f1, err := mem.Open("bench")
  1490  	if err != nil {
  1491  		b.Fatal(err)
  1492  	}
  1493  	c := cache.New(128 << 20)
  1494  	defer c.Unref()
  1495  	r, err := newReader(f1, ReaderOptions{
  1496  		Cache: c,
  1497  	})
  1498  	if err != nil {
  1499  		b.Fatal(err)
  1500  	}
  1501  	if confirmTwoLevelIndex && r.Properties.IndexPartitions == 0 {
  1502  		b.Fatalf("should have constructed two level index")
  1503  	}
  1504  	return r, keys
  1505  }
  1506  
// basicBenchmarks enumerates the writer configurations shared by the
// table-iteration benchmarks below: a 32KiB block size with a restart
// interval of 16 and no filter, varying only the compression algorithm.
var basicBenchmarks = []struct {
	name    string
	options WriterOptions
}{
	{
		name: "restart=16,compression=Snappy",
		options: WriterOptions{
			BlockSize:            32 << 10,
			BlockRestartInterval: 16,
			FilterPolicy:         nil,
			Compression:          SnappyCompression,
			TableFormat:          TableFormatPebblev2,
		},
	},
	{
		name: "restart=16,compression=ZSTD",
		options: WriterOptions{
			BlockSize:            32 << 10,
			BlockRestartInterval: 16,
			FilterPolicy:         nil,
			Compression:          ZstdCompression,
			TableFormat:          TableFormatPebblev2,
		},
	},
}
  1532  
  1533  func BenchmarkTableIterSeekGE(b *testing.B) {
  1534  	for _, bm := range basicBenchmarks {
  1535  		b.Run(bm.name,
  1536  			func(b *testing.B) {
  1537  				r, keys := buildBenchmarkTable(b, bm.options, false, 0)
  1538  				it, err := r.NewIter(nil /* lower */, nil /* upper */)
  1539  				require.NoError(b, err)
  1540  				rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
  1541  
  1542  				b.ResetTimer()
  1543  				for i := 0; i < b.N; i++ {
  1544  					it.SeekGE(keys[rng.Intn(len(keys))], base.SeekGEFlagsNone)
  1545  				}
  1546  
  1547  				b.StopTimer()
  1548  				it.Close()
  1549  				r.Close()
  1550  			})
  1551  	}
  1552  }
  1553  
  1554  func BenchmarkTableIterSeekLT(b *testing.B) {
  1555  	for _, bm := range basicBenchmarks {
  1556  		b.Run(bm.name,
  1557  			func(b *testing.B) {
  1558  				r, keys := buildBenchmarkTable(b, bm.options, false, 0)
  1559  				it, err := r.NewIter(nil /* lower */, nil /* upper */)
  1560  				require.NoError(b, err)
  1561  				rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
  1562  
  1563  				b.ResetTimer()
  1564  				for i := 0; i < b.N; i++ {
  1565  					it.SeekLT(keys[rng.Intn(len(keys))], base.SeekLTFlagsNone)
  1566  				}
  1567  
  1568  				b.StopTimer()
  1569  				it.Close()
  1570  				r.Close()
  1571  			})
  1572  	}
  1573  }
  1574  
  1575  func BenchmarkTableIterNext(b *testing.B) {
  1576  	for _, bm := range basicBenchmarks {
  1577  		b.Run(bm.name,
  1578  			func(b *testing.B) {
  1579  				r, _ := buildBenchmarkTable(b, bm.options, false, 0)
  1580  				it, err := r.NewIter(nil /* lower */, nil /* upper */)
  1581  				require.NoError(b, err)
  1582  
  1583  				b.ResetTimer()
  1584  				var sum int64
  1585  				var key *InternalKey
  1586  				for i := 0; i < b.N; i++ {
  1587  					if key == nil {
  1588  						key, _ = it.First()
  1589  					}
  1590  					sum += int64(binary.BigEndian.Uint64(key.UserKey))
  1591  					key, _ = it.Next()
  1592  				}
  1593  				if testing.Verbose() {
  1594  					fmt.Fprint(io.Discard, sum)
  1595  				}
  1596  
  1597  				b.StopTimer()
  1598  				it.Close()
  1599  				r.Close()
  1600  			})
  1601  	}
  1602  }
  1603  
  1604  func BenchmarkTableIterPrev(b *testing.B) {
  1605  	for _, bm := range basicBenchmarks {
  1606  		b.Run(bm.name,
  1607  			func(b *testing.B) {
  1608  				r, _ := buildBenchmarkTable(b, bm.options, false, 0)
  1609  				it, err := r.NewIter(nil /* lower */, nil /* upper */)
  1610  				require.NoError(b, err)
  1611  
  1612  				b.ResetTimer()
  1613  				var sum int64
  1614  				var key *InternalKey
  1615  				for i := 0; i < b.N; i++ {
  1616  					if key == nil {
  1617  						key, _ = it.Last()
  1618  					}
  1619  					sum += int64(binary.BigEndian.Uint64(key.UserKey))
  1620  					key, _ = it.Prev()
  1621  				}
  1622  				if testing.Verbose() {
  1623  					fmt.Fprint(io.Discard, sum)
  1624  				}
  1625  
  1626  				b.StopTimer()
  1627  				it.Close()
  1628  				r.Close()
  1629  			})
  1630  	}
  1631  }
  1632  
  1633  func BenchmarkLayout(b *testing.B) {
  1634  	r, _ := buildBenchmarkTable(b, WriterOptions{}, false, 0)
  1635  	b.ResetTimer()
  1636  	for i := 0; i < b.N; i++ {
  1637  		r.Layout()
  1638  	}
  1639  	b.StopTimer()
  1640  	r.Close()
  1641  }
  1642  
  1643  func BenchmarkSeqSeekGEExhausted(b *testing.B) {
  1644  	// Snappy with no bloom filter.
  1645  	options := basicBenchmarks[0].options
  1646  
  1647  	for _, twoLevelIndex := range []bool{false, true} {
  1648  		switch twoLevelIndex {
  1649  		case false:
  1650  			options.IndexBlockSize = 0
  1651  		case true:
  1652  			options.IndexBlockSize = 512
  1653  		}
  1654  		const offsetCount = 5000
  1655  		reader, keys := buildBenchmarkTable(b, options, twoLevelIndex, offsetCount)
  1656  		var preKeys [][]byte
  1657  		for i := 0; i < offsetCount; i++ {
  1658  			key := make([]byte, 8)
  1659  			binary.BigEndian.PutUint64(key, uint64(i))
  1660  			preKeys = append(preKeys, key)
  1661  		}
  1662  		var postKeys [][]byte
  1663  		for i := 0; i < offsetCount; i++ {
  1664  			key := make([]byte, 8)
  1665  			binary.BigEndian.PutUint64(key, uint64(i+offsetCount+len(keys)))
  1666  			postKeys = append(postKeys, key)
  1667  		}
  1668  		for _, exhaustedBounds := range []bool{false, true} {
  1669  			for _, prefixSeek := range []bool{false, true} {
  1670  				exhausted := "file"
  1671  				if exhaustedBounds {
  1672  					exhausted = "bounds"
  1673  				}
  1674  				seekKind := "ge"
  1675  				if prefixSeek {
  1676  					seekKind = "prefix-ge"
  1677  				}
  1678  				b.Run(fmt.Sprintf(
  1679  					"two-level=%t/exhausted=%s/seek=%s", twoLevelIndex, exhausted, seekKind),
  1680  					func(b *testing.B) {
  1681  						var upper []byte
  1682  						var seekKeys [][]byte
  1683  						if exhaustedBounds {
  1684  							seekKeys = preKeys
  1685  							upper = keys[0]
  1686  						} else {
  1687  							seekKeys = postKeys
  1688  						}
  1689  						it, err := reader.NewIter(nil /* lower */, upper)
  1690  						require.NoError(b, err)
  1691  						b.ResetTimer()
  1692  						pos := 0
  1693  						var seekGEFlags SeekGEFlags
  1694  						for i := 0; i < b.N; i++ {
  1695  							seekKey := seekKeys[0]
  1696  							var k *InternalKey
  1697  							if prefixSeek {
  1698  								k, _ = it.SeekPrefixGE(seekKey, seekKey, seekGEFlags)
  1699  							} else {
  1700  								k, _ = it.SeekGE(seekKey, seekGEFlags)
  1701  							}
  1702  							if k != nil {
  1703  								b.Fatal("found a key")
  1704  							}
  1705  							if it.Error() != nil {
  1706  								b.Fatalf("%s", it.Error().Error())
  1707  							}
  1708  							pos++
  1709  							if pos == len(seekKeys) {
  1710  								pos = 0
  1711  								seekGEFlags = seekGEFlags.DisableTrySeekUsingNext()
  1712  							} else {
  1713  								seekGEFlags = seekGEFlags.EnableTrySeekUsingNext()
  1714  							}
  1715  						}
  1716  						b.StopTimer()
  1717  						it.Close()
  1718  					})
  1719  			}
  1720  		}
  1721  		reader.Close()
  1722  	}
  1723  }
  1724  
// BenchmarkIteratorScanManyVersions measures sequential iteration over a
// table of 10,000 key prefixes with 100 MVCC versions each, comparing
// TableFormatPebblev2 (values inline) against TableFormatPebblev3 (values in
// value blocks), at two cache sizes and with/without reading the values.
func BenchmarkIteratorScanManyVersions(b *testing.B) {
	options := WriterOptions{
		BlockSize:            32 << 10,
		BlockRestartInterval: 16,
		FilterPolicy:         nil,
		Compression:          SnappyCompression,
		Comparer:             testkeys.Comparer,
	}
	// 10,000 key prefixes, each with 100 versions.
	const keyCount = 10000
	const sharedPrefixLen = 32
	const unsharedPrefixLen = 8
	const versionCount = 100

	// Take the very large keyspace consisting of alphabetic characters of
	// lengths up to unsharedPrefixLen and reduce it down to keyCount keys by
	// picking every 1 key every keyCount keys.
	keys := testkeys.Alpha(unsharedPrefixLen)
	keys = keys.EveryN(keys.Count() / keyCount)
	if keys.Count() < keyCount {
		b.Fatalf("expected %d keys, found %d", keyCount, keys.Count())
	}
	keyBuf := make([]byte, sharedPrefixLen+unsharedPrefixLen+testkeys.MaxSuffixLen)
	// Every key shares a 32-byte prefix ("ABC...") to stress prefix
	// compression.
	for i := 0; i < sharedPrefixLen; i++ {
		keyBuf[i] = 'A' + byte(i)
	}
	// v2 sstable is 115,178,070 bytes. v3 sstable is 107,181,105 bytes with
	// 99,049,269 bytes in value blocks.
	setupBench := func(b *testing.B, tableFormat TableFormat, cacheSize int64) *Reader {
		mem := vfs.NewMem()
		f0, err := mem.Create("bench")
		require.NoError(b, err)
		options.TableFormat = tableFormat
		w := NewWriter(objstorageprovider.NewFileWritable(f0), options)
		val := make([]byte, 100)
		rng := rand.New(rand.NewSource(100))
		// Versions are written in decreasing suffix order, matching the
		// required key ordering under testkeys.Comparer.
		for i := int64(0); i < keys.Count(); i++ {
			for v := 0; v < versionCount; v++ {
				n := testkeys.WriteKeyAt(keyBuf[sharedPrefixLen:], keys, i, int64(versionCount-v+1))
				key := keyBuf[:n+sharedPrefixLen]
				rng.Read(val)
				require.NoError(b, w.Set(key, val))
			}
		}
		require.NoError(b, w.Close())
		c := cache.New(cacheSize)
		defer c.Unref()
		// Re-open the filename for reading.
		f0, err = mem.Open("bench")
		require.NoError(b, err)
		r, err := newReader(f0, ReaderOptions{
			Cache:    c,
			Comparer: testkeys.Comparer,
		})
		require.NoError(b, err)
		return r
	}
	for _, format := range []TableFormat{TableFormatPebblev2, TableFormatPebblev3} {
		b.Run(fmt.Sprintf("format=%s", format.String()), func(b *testing.B) {
			// 150MiB results in a high cache hit rate for both formats. 20MiB
			// results in a high cache hit rate for the data blocks in
			// TableFormatPebblev3.
			for _, cacheSize := range []int64{20 << 20, 150 << 20} {
				b.Run(fmt.Sprintf("cache-size=%s", humanize.Bytes.Int64(cacheSize)),
					func(b *testing.B) {
						r := setupBench(b, format, cacheSize)
						defer func() {
							require.NoError(b, r.Close())
						}()
						for _, readValue := range []bool{false, true} {
							b.Run(fmt.Sprintf("read-value=%t", readValue), func(b *testing.B) {
								iter, err := r.NewIter(nil, nil)
								require.NoError(b, err)
								var k *InternalKey
								var v base.LazyValue
								var valBuf [100]byte
								b.ResetTimer()
								for i := 0; i < b.N; i++ {
									if k == nil {
										k, _ = iter.First()
										if k == nil {
											b.Fatalf("k is nil")
										}
									}
									k, v = iter.Next()
									if k != nil && readValue {
										// Values fit in valBuf, so fetches must
										// not hand back caller-owned memory.
										_, callerOwned, err := v.Value(valBuf[:])
										if err != nil {
											b.Fatal(err)
										} else if callerOwned {
											b.Fatalf("unexpected callerOwned: %t", callerOwned)
										}
									}
								}
							})
						}
					})
			}
		})
	}
}
  1826  
// BenchmarkIteratorScanNextPrefix compares two ways of advancing an sstable
// iterator to the next key prefix: SeekGE on the prefix's immediate successor
// (with the TrySeekUsingNext optimization enabled) versus NextPrefix. It
// varies the number of versions stored per prefix and whether the value is
// also materialized. See the long analysis comment below for sample results.
func BenchmarkIteratorScanNextPrefix(b *testing.B) {
	options := WriterOptions{
		BlockSize:            32 << 10,
		BlockRestartInterval: 16,
		FilterPolicy:         nil,
		Compression:          SnappyCompression,
		TableFormat:          TableFormatPebblev3,
		Comparer:             testkeys.Comparer,
	}
	const keyCount = 10000
	const sharedPrefixLen = 32
	const unsharedPrefixLen = 8
	// All keys share the same 100-byte value; the RNG is seeded so the
	// benchmark input is deterministic.
	val := make([]byte, 100)
	rand.New(rand.NewSource(100)).Read(val)

	// Take the very large keyspace consisting of alphabetic characters of
	// lengths up to unsharedPrefixLen and reduce it down to keyCount keys by
	// picking 1 out of every keys.Count()/keyCount keys.
	keys := testkeys.Alpha(unsharedPrefixLen)
	keys = keys.EveryN(keys.Count() / keyCount)
	if keys.Count() < keyCount {
		b.Fatalf("expected %d keys, found %d", keyCount, keys.Count())
	}
	// keyBuf layout: [shared prefix | unshared key | version suffix]. The
	// first sharedPrefixLen bytes are the constant run 'A', 'B', 'C', ... so
	// every key in the table shares its leading bytes, exercising prefix
	// compression.
	keyBuf := make([]byte, sharedPrefixLen+unsharedPrefixLen+testkeys.MaxSuffixLen)
	for i := 0; i < sharedPrefixLen; i++ {
		keyBuf[i] = 'A' + byte(i)
	}
	// setupBench writes an sstable containing versCount versions of each of
	// the keyCount prefixes, then reopens it for reading. It also returns the
	// immediate successor of each prefix; these are the seek targets used by
	// both methods in the benchmark loop below.
	setupBench := func(b *testing.B, versCount int) (r *Reader, succKeys [][]byte) {
		mem := vfs.NewMem()
		f0, err := mem.Create("bench")
		require.NoError(b, err)
		w := NewWriter(objstorageprovider.NewFileWritable(f0), options)
		for i := int64(0); i < keys.Count(); i++ {
			for v := 0; v < versCount; v++ {
				// Versions are written in decreasing suffix order, matching
				// the comparer's ordering of newer-to-older versions.
				n := testkeys.WriteKeyAt(keyBuf[sharedPrefixLen:], keys, i, int64(versCount-v+1))
				key := keyBuf[:n+sharedPrefixLen]
				require.NoError(b, w.Set(key, val))
				if v == 0 {
					// Record the immediate successor of each prefix once (on
					// its first version).
					prefixLen := testkeys.Comparer.Split(key)
					prefixKey := key[:prefixLen]
					succKey := testkeys.Comparer.ImmediateSuccessor(nil, prefixKey)
					succKeys = append(succKeys, succKey)
				}
			}
		}
		require.NoError(b, w.Close())
		// NB: This 200MiB cache is sufficient for even the largest file: 10,000
		// keys * 100 versions = 1M keys, where each key-value pair is ~140 bytes
		// = 140MB. So we are not measuring the caching benefit of
		// TableFormatPebblev3 storing older values in value blocks.
		c := cache.New(200 << 20)
		// NOTE(review): the reader presumably retains its own reference to the
		// cache, making it safe to release ours when setup returns — confirm
		// against newReader/NewReader.
		defer c.Unref()
		// Re-open the filename for reading.
		f0, err = mem.Open("bench")
		require.NoError(b, err)
		r, err = newReader(f0, ReaderOptions{
			Cache:    c,
			Comparer: testkeys.Comparer,
		})
		require.NoError(b, err)
		return r, succKeys
	}
	// Analysis of some sample results with TableFormatPebblev2:
	// versions=1/method=seek-ge-10         	22107622	        53.57 ns/op
	// versions=1/method=next-prefix-10     	36292837	        33.07 ns/op
	// versions=2/method=seek-ge-10         	14429138	        82.92 ns/op
	// versions=2/method=next-prefix-10     	19676055	        60.78 ns/op
	// versions=10/method=seek-ge-10        	 1453726	       825.2 ns/op
	// versions=10/method=next-prefix-10    	 2450498	       489.6 ns/op
	// versions=100/method=seek-ge-10       	  965143	      1257 ns/op
	// versions=100/method=next-prefix-10   	 1000000	      1054 ns/op
	//
	// With 1 version, both SeekGE and NextPrefix will be able to complete after
	// doing a single call to blockIter.Next. However, SeekGE has to do two key
	// comparisons unlike the one key comparison in NextPrefix. This is because
	// SeekGE also compares *before* calling Next since it is possible that the
	// preceding SeekGE is already at the right place.
	//
	// With 2 versions, both will do two calls to blockIter.Next. The difference
	// in the cost is the same as in the 1 version case.
	//
	// With 10 versions, it is still likely that the desired key is in the same
	// data block. NextPrefix will seek only the blockIter. And in the rare case
	// that the key is in the next data block, it will step the index block (not
	// seek). In comparison, SeekGE will seek the index block too.
	//
	// With 100 versions we more often cross from one data block to the next, so
	// the difference in cost declines.
	//
	// Some sample results with TableFormatPebblev3:

	// versions=1/method=seek-ge-10         	18702609	        53.90 ns/op
	// versions=1/method=next-prefix-10     	77440167	        15.41 ns/op
	// versions=2/method=seek-ge-10         	13554286	        87.91 ns/op
	// versions=2/method=next-prefix-10     	62148526	        19.25 ns/op
	// versions=10/method=seek-ge-10        	 1316676	       910.5 ns/op
	// versions=10/method=next-prefix-10    	18829448	        62.61 ns/op
	// versions=100/method=seek-ge-10       	 1166139	      1025 ns/op
	// versions=100/method=next-prefix-10   	 4443386	       265.3 ns/op
	//
	// NextPrefix is much cheaper than in TableFormatPebblev2 with larger number
	// of versions. It is also cheaper with 1 and 2 versions since
	// setHasSamePrefix=false eliminates a key comparison.
	for _, versionCount := range []int{1, 2, 10, 100} {
		b.Run(fmt.Sprintf("versions=%d", versionCount), func(b *testing.B) {
			r, succKeys := setupBench(b, versionCount)
			defer func() {
				require.NoError(b, r.Close())
			}()
			for _, method := range []string{"seek-ge", "next-prefix"} {
				b.Run(fmt.Sprintf("method=%s", method), func(b *testing.B) {
					for _, readValue := range []bool{false, true} {
						b.Run(fmt.Sprintf("read-value=%t", readValue), func(b *testing.B) {
							iter, err := r.NewIter(nil, nil)
							require.NoError(b, err)
							// nextFunc advances the iterator past the prefix
							// at position index, using the method under test.
							var nextFunc func(index int) (*InternalKey, base.LazyValue)
							switch method {
							case "seek-ge":
								nextFunc = func(index int) (*InternalKey, base.LazyValue) {
									var flags base.SeekGEFlags
									return iter.SeekGE(succKeys[index], flags.EnableTrySeekUsingNext())
								}
							case "next-prefix":
								nextFunc = func(index int) (*InternalKey, base.LazyValue) {
									return iter.NextPrefix(succKeys[index])
								}
							default:
								b.Fatalf("unknown method %s", method)
							}
							// j counts prefixes visited since the last call to
							// First. Starting it at n (with k nil) forces the
							// first b.N iteration to position the iterator via
							// First; thereafter each iteration advances one
							// prefix, and exhaustion (k == nil) restarts from
							// the beginning.
							n := keys.Count()
							j := n
							var k *InternalKey
							var v base.LazyValue
							var valBuf [100]byte
							b.ResetTimer()
							for i := 0; i < b.N; i++ {
								if k == nil {
									if j != n {
										b.Fatalf("unexpected %d != %d", j, n)
									}
									k, _ = iter.First()
									j = 0
								} else {
									k, v = nextFunc(int(j - 1))
									if k != nil && readValue {
										// The fetched value must not be
										// caller-owned: it should be served
										// from the block cache / in-place.
										_, callerOwned, err := v.Value(valBuf[:])
										if err != nil {
											b.Fatal(err)
										} else if callerOwned {
											b.Fatalf("unexpected callerOwned: %t", callerOwned)
										}
									}

								}
								if k != nil {
									j++
								}
							}
						})
					}
				})
			}
		})
	}
}
  1992  
// BenchmarkIteratorScanObsolete measures a full forward scan over an sstable
// in which every key except the first was written with forceObsolete=true.
// With TableFormatPebblev4 and hideObsoletePoints=true, the obsolete-key block
// property filter lets the iterator skip the obsolete points, so the scan
// observes exactly one key; in all other configurations it observes every key.
func BenchmarkIteratorScanObsolete(b *testing.B) {
	options := WriterOptions{
		BlockSize:            32 << 10,
		BlockRestartInterval: 16,
		FilterPolicy:         nil,
		Compression:          SnappyCompression,
		Comparer:             testkeys.Comparer,
	}
	const keyCount = 1 << 20
	const keyLen = 10

	// Take the very large keyspace consisting of alphabetic characters of
	// lengths up to keyLen and reduce it down to keyCount keys by picking 1
	// out of every keys.Count()/keyCount keys.
	keys := testkeys.Alpha(keyLen)
	keys = keys.EveryN(keys.Count() / keyCount)
	if keys.Count() < keyCount {
		b.Fatalf("expected %d keys, found %d", keyCount, keys.Count())
	}
	expectedKeyCount := keys.Count()
	keyBuf := make([]byte, keyLen)
	// setupBench writes the benchmark sstable in the requested format and
	// reopens it for reading with a cache of the given size.
	setupBench := func(b *testing.B, tableFormat TableFormat, cacheSize int64) *Reader {
		mem := vfs.NewMem()
		f0, err := mem.Create("bench")
		require.NoError(b, err)
		options.TableFormat = tableFormat
		w := NewWriter(objstorageprovider.NewFileWritable(f0), options)
		val := make([]byte, 100)
		rng := rand.New(rand.NewSource(100))
		for i := int64(0); i < keys.Count(); i++ {
			n := testkeys.WriteKey(keyBuf, keys, i)
			key := keyBuf[:n]
			rng.Read(val)
			// Mark every key except the first as obsolete, so a
			// hide-obsolete scan should surface only one point.
			forceObsolete := true
			if i == 0 {
				forceObsolete = false
			}
			require.NoError(b, w.AddWithForceObsolete(
				base.MakeInternalKey(key, 0, InternalKeyKindSet), val, forceObsolete))
		}
		require.NoError(b, w.Close())
		c := cache.New(cacheSize)
		defer c.Unref()
		// Re-open the filename for reading.
		f0, err = mem.Open("bench")
		require.NoError(b, err)
		r, err := newReader(f0, ReaderOptions{
			Cache:    c,
			Comparer: testkeys.Comparer,
		})
		require.NoError(b, err)
		return r
	}
	for _, format := range []TableFormat{TableFormatPebblev3, TableFormatPebblev4} {
		b.Run(fmt.Sprintf("format=%s", format.String()), func(b *testing.B) {
			// 150MiB results in a high cache hit rate for both formats. A
			// 1-byte cache effectively disables caching, forcing reads to
			// miss.
			for _, cacheSize := range []int64{1, 150 << 20} {
				b.Run(fmt.Sprintf("cache-size=%s", humanize.Bytes.Int64(cacheSize)),
					func(b *testing.B) {
						r := setupBench(b, format, cacheSize)
						defer func() {
							require.NoError(b, r.Close())
						}()
						for _, hideObsoletePoints := range []bool{false, true} {
							b.Run(fmt.Sprintf("hide-obsolete=%t", hideObsoletePoints), func(b *testing.B) {
								// Only v4 tables carry the obsolete-key block
								// property, so the filterer is constructed
								// only for that combination.
								var filterer *BlockPropertiesFilterer
								if format == TableFormatPebblev4 && hideObsoletePoints {
									filterer = newBlockPropertiesFilterer(
										[]BlockPropertyFilter{obsoleteKeyBlockPropertyFilter{}}, nil)
									intersects, err :=
										filterer.intersectsUserPropsAndFinishInit(r.Properties.UserProperties)
									if err != nil {
										b.Fatalf("%s", err.Error())
									}
									if !intersects {
										b.Fatalf("sstable does not intersect")
									}
								}
								iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc(
									context.Background(), nil, nil, filterer, hideObsoletePoints,
									true, nil, TrivialReaderProvider{Reader: r})
								require.NoError(b, err)
								b.ResetTimer()
								for i := 0; i < b.N; i++ {
									// Scan the whole table and sanity-check the
									// number of points surfaced, so we know the
									// filter actually engaged (or didn't).
									count := int64(0)
									k, _ := iter.First()
									for k != nil {
										count++
										k, _ = iter.Next()
									}
									if format == TableFormatPebblev4 && hideObsoletePoints {
										if count != 1 {
											b.Fatalf("found %d points", count)
										}
									} else {
										if count != expectedKeyCount {
											b.Fatalf("found %d points", count)
										}
									}
								}
							})
						}
					})
			}
		})
	}
}
  2100  
  2101  func newReader(r ReadableFile, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) {
  2102  	readable, err := NewSimpleReadable(r)
  2103  	if err != nil {
  2104  		return nil, err
  2105  	}
  2106  	return NewReader(readable, o, extraOpts...)
  2107  }