github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/table_test.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"context"
	"encoding/binary"
	"fmt"
	"math/rand"
	"sort"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

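// buildTable serializes |chunks| into a single in-memory table file and
// returns the table bytes along with the table's content hash.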
func buildTable(chunks [][]byte) ([]byte, hash.Hash, error) {
	totalData := uint64(0)
	for _, chunk := range chunks {
		totalData += uint64(len(chunk))
	}
	capacity := maxTableSize(uint64(len(chunks)), totalData)

	buff := make([]byte, capacity)

	tw := newTableWriter(buff, nil)

	for _, chunk := range chunks {
		tw.addChunk(computeAddr(chunk), chunk)
	}

	length, blockHash, err := tw.finish()
	if err != nil {
		return nil, hash.Hash{}, err
	}

	return buff[:length], blockHash, nil
}

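// mustGetString reads the chunk addressed by |data| from |tr| and returns its
// contents as a string, asserting that no error occurred.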
func mustGetString(assert *assert.Assertions, ctx context.Context, tr tableReader, data []byte) string {
	bytes, err := tr.get(ctx, computeAddr(data), &Stats{})
	assert.NoError(err)
	return string(bytes)
}

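// TestSimple round-trips a handful of chunks through a table writer and
// reader, then verifies that absent chunks are reported as missing.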
func TestSimple(t *testing.T) {
	ctx := context.Background()
	assert := assert.New(t)

	chunks := [][]byte{
		[]byte("hello2"),
		[]byte("goodbye2"),
		[]byte("badbye2"),
	}

	tableData, _, err := buildTable(chunks)
	require.NoError(t, err)
	ti, err := parseTableIndexByCopy(ctx, tableData, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(tableData), fileBlockSize)
	require.NoError(t, err)
	defer tr.close()

	assertChunksInReader(chunks, tr, assert)

	assert.Equal(string(chunks[0]), mustGetString(assert, ctx, tr, chunks[0]))
	assert.Equal(string(chunks[1]), mustGetString(assert, ctx, tr, chunks[1]))
	assert.Equal(string(chunks[2]), mustGetString(assert, ctx, tr, chunks[2]))

	notPresent := [][]byte{
		[]byte("yo"),
		[]byte("do"),
		[]byte("so much to do"),
	}

	assertChunksNotInReader(notPresent, tr, assert)

	assert.NotEqual(string(notPresent[0]), mustGetString(assert, ctx, tr, notPresent[0]))
	assert.NotEqual(string(notPresent[1]), mustGetString(assert, ctx, tr, notPresent[1]))
	assert.NotEqual(string(notPresent[2]), mustGetString(assert, ctx, tr, notPresent[2]))
}

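// assertChunksInReader asserts that |r| reports every chunk in |chunks| as present.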
func assertChunksInReader(chunks [][]byte, r chunkReader, assert *assert.Assertions) {
	for _, c := range chunks {
		assert.True(r.has(computeAddr(c)))
	}
}

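// assertChunksNotInReader asserts that |r| reports every chunk in |chunks| as absent.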
func assertChunksNotInReader(chunks [][]byte, r chunkReader, assert *assert.Assertions) {
	for _, c := range chunks {
		assert.False(r.has(computeAddr(c)))
	}
}

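// TestHasMany exercises the batched existence check: each hasRecord carries an
// address, its uint64 prefix, and its original position, and the batch must be
// sorted by prefix before calling hasMany.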
func TestHasMany(t *testing.T) {
	ctx := context.Background()
	assert := assert.New(t)

	chunks := [][]byte{
		[]byte("hello2"),
		[]byte("goodbye2"),
		[]byte("badbye2"),
	}

	tableData, _, err := buildTable(chunks)
	require.NoError(t, err)
	ti, err := parseTableIndexByCopy(ctx, tableData, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(tableData), fileBlockSize)
	require.NoError(t, err)
	defer tr.close()

	addrs := hash.HashSlice{computeAddr(chunks[0]), computeAddr(chunks[1]), computeAddr(chunks[2])}
	hasAddrs := []hasRecord{
		{&addrs[0], binary.BigEndian.Uint64(addrs[0][:hash.PrefixLen]), 0, false},
		{&addrs[1], binary.BigEndian.Uint64(addrs[1][:hash.PrefixLen]), 1, false},
		{&addrs[2], binary.BigEndian.Uint64(addrs[2][:hash.PrefixLen]), 2, false},
	}
	sort.Sort(hasRecordByPrefix(hasAddrs))

	_, err = tr.hasMany(hasAddrs)
	require.NoError(t, err)
	for _, ha := range hasAddrs {
		assert.True(ha.has, "Nothing for prefix %d", ha.prefix)
	}
}

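// TestHasManySequentialPrefix verifies that hasMany still finds chunks when
// consecutive index entries share the same address prefix, so lookups cannot
// stop at the first prefix match.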
func TestHasManySequentialPrefix(t *testing.T) {
	ctx := context.Background()
	assert := assert.New(t)

	// Use bogus addrs so we can generate the case of sequentially non-unique prefixes in the index.
	// Note that these are already sorted.
	addrStrings := []string{
		"0rfgadopg6h3fk7d253ivbjsij4qo3nv",
		"0rfgadopg6h3fk7d253ivbjsij4qo4nv",
		"0rfgadopg6h3fk7d253ivbjsij4qo9nv",
	}

	addrs := make([]hash.Hash, len(addrStrings))
	for i, s := range addrStrings {
		addrs[i] = hash.Parse(s)
	}

	bogusData := []byte("bogus") // doesn't matter what this is. hasMany() won't check chunkRecords
	totalData := uint64(len(bogusData) * len(addrs))

	capacity := maxTableSize(uint64(len(addrs)), totalData)
	buff := make([]byte, capacity)
	tw := newTableWriter(buff, nil)

	for _, a := range addrs {
		tw.addChunk(a, bogusData)
	}

	length, _, err := tw.finish()
	require.NoError(t, err)
	buff = buff[:length]

	ti, err := parseTableIndexByCopy(ctx, buff, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(buff), fileBlockSize)
	require.NoError(t, err)
	defer tr.close()

	hasAddrs := make([]hasRecord, 2)
	// Leave out the first address.
	hasAddrs[0] = hasRecord{&addrs[1], addrs[1].Prefix(), 1, false}
	hasAddrs[1] = hasRecord{&addrs[2], addrs[2].Prefix(), 2, false}

	_, err = tr.hasMany(hasAddrs)
	require.NoError(t, err)

	for _, ha := range hasAddrs {
		assert.True(ha.has, "Nothing for prefix %x", ha.prefix)
	}
}

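// BenchmarkHasMany measures hasMany over a dense batch (a record for every
// chunk in the table) and a sparse batch (every 64th address).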
func BenchmarkHasMany(b *testing.B) {
	const cnt = 64 * 1024
	chnks := make([][]byte, cnt)
	addrs := make(hash.HashSlice, cnt)
	hrecs := make([]hasRecord, cnt)
	sparse := make([]hasRecord, cnt/1024)

	data := make([]byte, cnt*16)
	rand.Read(data)
	for i := range chnks {
		chnks[i] = data[i*16 : (i+1)*16]
	}
	for i := range addrs {
		addrs[i] = computeAddr(chnks[i])
	}
	for i := range hrecs {
		hrecs[i] = hasRecord{
			a:      &addrs[i],
			prefix: addrs[i].Prefix(),
			order:  i,
		}
	}
	for i := range sparse {
		// a record for every 64th address
		j := i * 64
		sparse[i] = hasRecord{
			a:      &addrs[j],
			prefix: addrs[j].Prefix(),
			order:  j,
		}
	}
	sort.Sort(hasRecordByPrefix(hrecs))
	sort.Sort(hasRecordByPrefix(sparse))

	ctx := context.Background()
	tableData, _, err := buildTable(chnks)
	require.NoError(b, err)
	ti, err := parseTableIndexByCopy(ctx, tableData, &UnlimitedQuotaProvider{})
	require.NoError(b, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(tableData), fileBlockSize)
	require.NoError(b, err)
	defer tr.close()

	b.ResetTimer()
	b.Run("dense has many", func(b *testing.B) {
		var ok bool
		for i := 0; i < b.N; i++ {
			ok, err = tr.hasMany(hrecs)
		}
		assert.False(b, ok)
		assert.NoError(b, err)
	})
	b.Run("sparse has many", func(b *testing.B) {
		var ok bool
		for i := 0; i < b.N; i++ {
			ok, err = tr.hasMany(sparse)
		}
		// every sparse record is present in the table, so nothing should remain
		assert.False(b, ok)
		assert.NoError(b, err)
	})
}

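// TestGetMany reads a batch of chunks back through getMany on an errgroup,
// collecting results via the found callback, and checks that every requested
// chunk is returned.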
func TestGetMany(t *testing.T) {
	ctx := context.Background()
	assert := assert.New(t)

	data := [][]byte{
		[]byte("hello2"),
		[]byte("goodbye2"),
		[]byte("badbye2"),
	}

	tableData, _, err := buildTable(data)
	require.NoError(t, err)
	ti, err := parseTableIndexByCopy(ctx, tableData, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(tableData), fileBlockSize)
	require.NoError(t, err)
	defer tr.close()

	addrs := hash.HashSlice{computeAddr(data[0]), computeAddr(data[1]), computeAddr(data[2])}
	getBatch := []getRecord{
		{&addrs[0], binary.BigEndian.Uint64(addrs[0][:hash.PrefixLen]), false},
		{&addrs[1], binary.BigEndian.Uint64(addrs[1][:hash.PrefixLen]), false},
		{&addrs[2], binary.BigEndian.Uint64(addrs[2][:hash.PrefixLen]), false},
	}
	sort.Sort(getRecordByPrefix(getBatch))

	eg, ctx := errgroup.WithContext(ctx)

	got := make([]*chunks.Chunk, 0)
	_, err = tr.getMany(ctx, eg, getBatch, func(ctx context.Context, c *chunks.Chunk) { got = append(got, c) }, &Stats{})
	require.NoError(t, err)
	require.NoError(t, eg.Wait())

	assert.Len(got, len(getBatch))
}

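// TestCalcReads checks read planning with a block size of zero: the sorted
// three-chunk batch is expected to coalesce into a single read, while a batch
// that skips the middle chunk needs two.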
func TestCalcReads(t *testing.T) {
	ctx := context.Background()
	assert := assert.New(t)

	chunks := [][]byte{
		[]byte("hello2"),
		[]byte("goodbye2"),
		[]byte("badbye2"),
	}

	tableData, _, err := buildTable(chunks)
	require.NoError(t, err)
	ti, err := parseTableIndexByCopy(ctx, tableData, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(tableData), 0)
	require.NoError(t, err)
	defer tr.close()

	addrs := hash.HashSlice{computeAddr(chunks[0]), computeAddr(chunks[1]), computeAddr(chunks[2])}
	getBatch := []getRecord{
		{&addrs[0], binary.BigEndian.Uint64(addrs[0][:hash.PrefixLen]), false},
		{&addrs[1], binary.BigEndian.Uint64(addrs[1][:hash.PrefixLen]), false},
		{&addrs[2], binary.BigEndian.Uint64(addrs[2][:hash.PrefixLen]), false},
	}

	gb2 := []getRecord{getBatch[0], getBatch[2]}
	sort.Sort(getRecordByPrefix(getBatch))

	reads, remaining, err := tr.calcReads(getBatch, 0)
	require.NoError(t, err)
	assert.False(remaining)
	assert.Equal(1, reads)

	sort.Sort(getRecordByPrefix(gb2))
	reads, remaining, err = tr.calcReads(gb2, 0)
	require.NoError(t, err)
	assert.False(remaining)
	assert.Equal(2, reads)
}

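// TestExtract streams every chunk record out of the table and verifies that
// addresses and data arrive in insertion order.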
func TestExtract(t *testing.T) {
	ctx := context.Background()
	assert := assert.New(t)

	chunks := [][]byte{
		[]byte("hello2"),
		[]byte("goodbye2"),
		[]byte("badbye2"),
	}

	tableData, _, err := buildTable(chunks)
	require.NoError(t, err)
	ti, err := parseTableIndexByCopy(ctx, tableData, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(tableData), fileBlockSize)
	require.NoError(t, err)
	defer tr.close()

	addrs := hash.HashSlice{computeAddr(chunks[0]), computeAddr(chunks[1]), computeAddr(chunks[2])}

	chunkChan := make(chan extractRecord)
	go func() {
		// use assert, not require: t.FailNow must not be called from a non-test goroutine
		err := tr.extract(ctx, chunkChan)
		assert.NoError(err)
		close(chunkChan)
	}()

	i := 0
	for rec := range chunkChan {
		assert.NotNil(rec.data, "Nothing for %v", addrs[i])
		assert.Equal(addrs[i], rec.a)
		assert.Equal(chunks[i], rec.data)
		i++
	}
}

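// Test65k round-trips 2^16 chunks through a table, then confirms that a
// disjoint set of 2^16 chunks is reported absent.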
func Test65k(t *testing.T) {
	ctx := context.Background()
	assert := assert.New(t)

	count := 1 << 16
	chunks := make([][]byte, count)

	dataFn := func(i int) []byte {
		return []byte(fmt.Sprintf("data%d", i*2))
	}

	for i := 0; i < count; i++ {
		chunks[i] = dataFn(i)
	}

	tableData, _, err := buildTable(chunks)
	require.NoError(t, err)
	ti, err := parseTableIndexByCopy(ctx, tableData, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(tableData), fileBlockSize)
	require.NoError(t, err)
	defer tr.close()

	for i := 0; i < count; i++ {
		data := dataFn(i)
		h := computeAddr(data)
		assert.True(tr.has(h))
		bytes, err := tr.get(ctx, h, &Stats{})
		require.NoError(t, err)
		assert.Equal(string(data), string(bytes))
	}

	for i := count; i < count*2; i++ {
		data := dataFn(i)
		h := computeAddr(data)
		assert.False(tr.has(h))
		bytes, err := tr.get(ctx, h, &Stats{})
		require.NoError(t, err)
		assert.NotEqual(string(data), string(bytes))
	}
}

// Ensure all addresses share the first 7 bytes. Useful for easily generating tests which have
// "prefix" collisions.
func computeAddrCommonPrefix(data []byte) hash.Hash {
	a := computeHashDefault(data)
	a[0] = 0x01
	a[1] = 0x23
	a[2] = 0x45
	a[3] = 0x67
	a[4] = 0x89
	a[5] = 0xab
	a[6] = 0xcd
	return a
}

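// doTestNGetMany builds a table with |count| generated chunks and reads them
// all back through getMany.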
func doTestNGetMany(t *testing.T, count int) {
	ctx := context.Background()
	assert := assert.New(t)

	data := make([][]byte, count)

	dataFn := func(i int) []byte {
		return []byte(fmt.Sprintf("data%d", i*2))
	}

	for i := 0; i < count; i++ {
		data[i] = dataFn(i)
	}

	tableData, _, err := buildTable(data)
	require.NoError(t, err)
	ti, err := parseTableIndexByCopy(ctx, tableData, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	tr, err := newTableReader(ti, tableReaderAtFromBytes(tableData), fileBlockSize)
	require.NoError(t, err)
	defer tr.close()

	getBatch := make([]getRecord, len(data))
	for i := 0; i < count; i++ {
		a := computeAddr(dataFn(i))
		getBatch[i] = getRecord{&a, a.Prefix(), false}
	}

	sort.Sort(getRecordByPrefix(getBatch))

	eg, ctx := errgroup.WithContext(ctx)

	got := make([]*chunks.Chunk, 0)
	_, err = tr.getMany(ctx, eg, getBatch, func(ctx context.Context, c *chunks.Chunk) { got = append(got, c) }, &Stats{})
	require.NoError(t, err)
	require.NoError(t, eg.Wait())

	assert.Len(got, len(getBatch))
}

func Test65kGetMany(t *testing.T) {
	doTestNGetMany(t, 1<<16)
}

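// Test2kGetManyCommonPrefix swaps the package-level computeAddr for a variant
// whose addresses share their first 7 bytes, stressing getMany under heavy
// prefix collisions.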
func Test2kGetManyCommonPrefix(t *testing.T) {
	computeAddr = computeAddrCommonPrefix
	defer func() {
		computeAddr = computeHashDefault
	}()

	doTestNGetMany(t, 1<<11)
}

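// TestEmpty verifies that a table with no chunks serializes to just the footer.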
func TestEmpty(t *testing.T) {
	assert := assert.New(t)

	buff := make([]byte, footerSize)
	tw := newTableWriter(buff, nil)
	length, _, err := tw.finish()
	require.NoError(t, err)
	assert.True(length == footerSize)
}