github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/table_index_test.go

// Copyright 2022 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/dolthub/dolt/go/store/hash"
)

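// TestParseTableIndex parses a canned index file from testdata and verifies
// that every entry returned by ordinal lookup matches the entry returned by
// hash lookup.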
func TestParseTableIndex(t *testing.T) {
	ctx := context.Background()
	f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx")
	require.NoError(t, err)
	defer f.Close()
	bs, err := io.ReadAll(f)
	require.NoError(t, err)
	idx, err := parseTableIndexByCopy(ctx, bs, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	defer idx.Close()
	assert.Equal(t, uint32(596), idx.chunkCount())
	seen := make(map[hash.Hash]bool)
	for i := uint32(0); i < idx.chunkCount(); i++ {
		var onheapaddr hash.Hash
		e, err := idx.indexEntry(i, &onheapaddr)
		require.NoError(t, err)
		if _, ok := seen[onheapaddr]; !ok {
			seen[onheapaddr] = true
			lookupe, ok, err := idx.lookup(&onheapaddr)
			require.NoError(t, err)
			assert.True(t, ok)
			assert.Equal(t, e.Offset(), lookupe.Offset(), "%v does not match %v for address %v", e, lookupe, onheapaddr)
			assert.Equal(t, e.Length(), lookupe.Length())
		}
	}
}

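// BenchmarkFindPrefix compares the retired prefixIdx implementation (below)
// against the current findPrefix method on the same canned index.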
func BenchmarkFindPrefix(b *testing.B) {
	ctx := context.Background()
	f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx")
	require.NoError(b, err)
	defer f.Close()
	bs, err := io.ReadAll(f)
	require.NoError(b, err)
	idx, err := parseTableIndexByCopy(ctx, bs, &UnlimitedQuotaProvider{})
	require.NoError(b, err)
	defer idx.Close()
	assert.Equal(b, uint32(596), idx.chunkCount())

	prefixes, err := idx.prefixes()
	require.NoError(b, err)

	b.Run("benchmark prefixIdx()", func(b *testing.B) {
		var ord uint32
		for i := 0; i < b.N; i++ {
			ord = prefixIdx(idx, prefixes[uint(i)&uint(511)]) // &511 masks the index into [0, 512)
		}
		assert.True(b, ord < 596)
	})
	b.Run("benchmark findPrefix", func(b *testing.B) {
		var ord uint32
		for i := 0; i < b.N; i++ {
			ord = idx.findPrefix(prefixes[uint(i)&uint(511)]) // same mask as above
		}
		assert.True(b, ord < 596)
	})
}

// prefixIdx is the previous implementation of findPrefix().
func prefixIdx(ti onHeapTableIndex, prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
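	//
	// For reference, the inlined loop below is equivalent to:
	//
	//	idx = uint32(sort.Search(int(ti.chunkCount()), func(i int) bool {
	//		return ti.prefixAt(uint32(i)) >= prefix
	//	}))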
	idx, j := 0, ti.chunkCount()
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// idx ≤ h < j
		if ti.prefixAt(h) < prefix {
			idx = h + 1 // preserves f(idx-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}
	return
}

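// TestOnHeapTableIndex_ResolveShortHash smoke-tests ResolveShortHash against
// the canned index, logging every hash that matches the prefix "0".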
func TestOnHeapTableIndex_ResolveShortHash(t *testing.T) {
	ctx := context.Background()
	f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx")
	require.NoError(t, err)
	defer f.Close()
	bs, err := io.ReadAll(f)
	require.NoError(t, err)
	idx, err := parseTableIndexByCopy(ctx, bs, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	defer idx.Close()
	res, err := idx.ResolveShortHash([]byte("0"))
	require.NoError(t, err)
	t.Log("matched: ", len(res))
	for _, h := range res {
		t.Log("\t", h)
	}
}

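// TestResolveOneHash builds a single-chunk table and verifies that every
// prefix of the chunk's address resolves to exactly one hash.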
func TestResolveOneHash(t *testing.T) {
	ctx := context.Background()
	// create chunks
	chunks := [][]byte{
		[]byte("chunk1"),
	}

	// build table index
	td, _, err := buildTable(chunks)
	require.NoError(t, err)
	tIdx, err := parseTableIndexByCopy(ctx, td, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	defer tIdx.Close()

	// get hashes out
	hashes := make([]string, len(chunks))
	for i, c := range chunks {
		hashes[i] = computeAddr(c).String()
		t.Log(hashes[i])
	}

	// resolve them
	for _, h := range hashes {
		// try prefixes of every length (0 to 31 chars)
		for i := 0; i < 32; i++ {
			res, err := tIdx.ResolveShortHash([]byte(h[:i]))
			require.NoError(t, err)
			assert.Equal(t, 1, len(res))
		}
	}
}

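// TestResolveFewHash builds a three-chunk table and verifies that every
// prefix of each chunk's address resolves to at least one hash.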
func TestResolveFewHash(t *testing.T) {
	ctx := context.Background()
	// create chunks
	chunks := [][]byte{
		[]byte("chunk1"),
		[]byte("chunk2"),
		[]byte("chunk3"),
	}

	// build table index
	td, _, err := buildTable(chunks)
	require.NoError(t, err)
	tIdx, err := parseTableIndexByCopy(ctx, td, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	defer tIdx.Close()

	// get hashes out
	hashes := make([]string, len(chunks))
	for i, c := range chunks {
		hashes[i] = computeAddr(c).String()
		t.Log(hashes[i])
	}

	// resolve them
	for _, h := range hashes {
		// try prefixes of every length (0 to 31 chars)
		for i := 0; i < 32; i++ {
			res, err := tIdx.ResolveShortHash([]byte(h[:i]))
			require.NoError(t, err)
			t.Log("asserting length: ", i)
			assert.Less(t, 0, len(res))
		}
	}
}

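// TestAmbiguousShortHash builds a table from chunks with fabricated,
// overlapping addresses and verifies how many matches each short hash
// resolves to.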
func TestAmbiguousShortHash(t *testing.T) {
	ctx := context.Background()
	// create chunks
	chunks := []fakeChunk{
		{address: addrFromPrefix("abcdef"), data: fakeData},
		{address: addrFromPrefix("abctuv"), data: fakeData},
		{address: addrFromPrefix("abcd123"), data: fakeData},
	}

	// build table index
	td, _, err := buildFakeChunkTable(chunks)
	require.NoError(t, err)
	idx, err := parseTableIndexByCopy(ctx, td, &UnlimitedQuotaProvider{})
	require.NoError(t, err)
	defer idx.Close()

	tests := []struct {
		pre string
		sz  int
	}{
		{pre: "", sz: 3},
		{pre: "a", sz: 3},
		{pre: "b", sz: 0},
		{pre: "v", sz: 0},
		{pre: "ab", sz: 3},
		{pre: "abc", sz: 3},
		{pre: "abcd", sz: 2},
		{pre: "abct", sz: 1},
		{pre: "abcde", sz: 1},
		{pre: "abcd1", sz: 1},
		{pre: "abcdef", sz: 1},
		{pre: "abctuv", sz: 1},
		{pre: "abcd123", sz: 1},
	}

	for _, test := range tests {
		name := fmt.Sprintf("Expect %d results for prefix %s", test.sz, test.pre)
		t.Run(name, func(t *testing.T) {
			res, err := idx.ResolveShortHash([]byte(test.pre))
			require.NoError(t, err)
			assert.Len(t, res, test.sz)
		})
	}
}

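// TestReadTableFooter exercises footer parsing: a reader too short to hold a
// footer, a valid footer, a newer Dolt magic number, and a corrupt magic
// number.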
func TestReadTableFooter(t *testing.T) {
	// Less than 20 bytes is not enough to read the footer
	reader := bytes.NewReader(make([]byte, 19))
	_, _, err := ReadTableFooter(reader)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "negative position")

	data := make([]byte, 20)
	binary.BigEndian.PutUint32(data[:4], 98765)   // Chunk Count.
	binary.BigEndian.PutUint64(data[4:12], 12345) // Total Size.
	copy(data[12:], magicNumber)
	reader = bytes.NewReader(data)
	chunkCount, totalSize, err := ReadTableFooter(reader)
	assert.NoError(t, err)
	assert.Equal(t, uint32(98765), chunkCount)
	assert.Equal(t, uint64(12345), totalSize)

	// Now with a future magic number
	data[12] = 0
	copy(data[13:], doltMagicNumber)
	reader = bytes.NewReader(data)
	_, _, err = ReadTableFooter(reader)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "unsupported table file format")

	// Now with corrupted info that we don't recognize.
	copy(data[12:], "DEADBEEF")
	reader = bytes.NewReader(data)
	_, _, err = ReadTableFooter(reader)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "invalid or corrupt table file")
}

// fakeChunk is a chunk with a faked address.
type fakeChunk struct {
	address hash.Hash
	data    []byte
}

var fakeData = []byte("supercalifragilisticexpialidocious")

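// addrFromPrefix creates a full-length address from a prefix by right-padding
// it with zeros.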
func addrFromPrefix(prefix string) hash.Hash {
	for len(prefix) < hash.StringLen {
		prefix += "0"
	}
	return hash.Parse(prefix)
}

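// buildFakeChunkTable writes the given chunks, with their fabricated
// addresses, into an in-memory table file and returns the file's bytes and
// its block hash.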
func buildFakeChunkTable(chunks []fakeChunk) ([]byte, hash.Hash, error) {
	totalData := uint64(0)
	for _, chunk := range chunks {
		totalData += uint64(len(chunk.data))
	}
	capacity := maxTableSize(uint64(len(chunks)), totalData)

	buff := make([]byte, capacity)
	tw := newTableWriter(buff, nil)
	for _, chunk := range chunks {
		tw.addChunk(chunk.address, chunk.data)
	}

	length, blockHash, err := tw.finish()
	if err != nil {
		return nil, hash.Hash{}, err
	}
	return buff[:length], blockHash, nil
}