github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/archive_test.go

// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
	"bytes"
	"crypto/sha512"
	"encoding/binary"
	"math"
	"math/rand"
	"sort"
	"testing"

	"github.com/dolthub/gozstd"
	"github.com/stretchr/testify/assert"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

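// The tests below walk the archive writer through its full lifecycle: write
// byte spans (raw or dictionary-compressed payloads), stage chunks that
// reference those spans, then finalize the spans and write the index,
// metadata, and footer, in that order.
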
func TestArchiveSingleChunk(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	bsId, err := aw.writeByteSpan(testBlob)
	assert.NoError(t, err)
	assert.Equal(t, uint32(1), bsId)
	assert.Equal(t, uint64(10), aw.bytesWritten) // 10 data bytes. No CRC or anything.

	oneHash := hashWithPrefix(t, 23)

	err = aw.stageChunk(oneHash, 0, 1)
	assert.NoError(t, err)

	err = aw.finalizeByteSpans()
	assert.NoError(t, err)

	err = aw.writeIndex()
	assert.NoError(t, err)
	// The uncompressed size of the index is 23 bytes. Compressing data that small isn't worthwhile, but we do
	// verify that the index occupies 35 bytes in this situation.
	assert.Equal(t, uint32(35), aw.indexLen)

	err = aw.writeMetadata([]byte(""))
	assert.NoError(t, err)

	err = aw.writeFooter()
	assert.NoError(t, err)

	assert.Equal(t, 10+35+archiveFooterSize, aw.bytesWritten) // 10 data bytes + 35 index bytes + footer

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	aIdx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	assert.Equal(t, []uint64{23}, aIdx.prefixes)
	assert.True(t, aIdx.has(oneHash))

	dict, data, err := aIdx.getRaw(oneHash)
	assert.NoError(t, err)
	assert.Nil(t, dict)
	assert.Equal(t, testBlob, data)
}

func TestArchiveSingleChunkWithDictionary(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testDict := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	testData := []byte{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
	_, _ = aw.writeByteSpan(testDict)
	_, _ = aw.writeByteSpan(testData)

	h := hashWithPrefix(t, 42)
	err := aw.stageChunk(h, 1, 2)
	assert.NoError(t, err)

	_ = aw.finalizeByteSpans()
	_ = aw.writeIndex()
	_ = aw.writeMetadata([]byte(""))
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	aIdx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)
	assert.Equal(t, []uint64{42}, aIdx.prefixes)

	assert.True(t, aIdx.has(h))

	dict, data, err := aIdx.getRaw(h)
	assert.NoError(t, err)
	assert.Equal(t, testDict, dict)
	assert.Equal(t, testData, data)
}

func TestArchiverMultipleChunksMultipleDictionaries(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	data1 := []byte{11, 11, 11, 11, 11, 11, 11, 11, 11, 11} // span 1
	dict1 := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}           // span 2
	data2 := []byte{22, 22, 22, 22, 22, 22, 22, 22, 22, 22} // span 3
	data3 := []byte{33, 33, 33, 33, 33, 33, 33, 33, 33, 33} // span 4
	data4 := []byte{44, 44, 44, 44, 44, 44, 44, 44, 44, 44} // span 5
	dict2 := []byte{2, 2, 2, 2, 2, 2, 2, 2, 2, 2}           // span 6

	h1 := hashWithPrefix(t, 42)
	id, _ := aw.writeByteSpan(data1)
	assert.Equal(t, uint32(1), id)
	_ = aw.stageChunk(h1, 0, 1)

	h2 := hashWithPrefix(t, 42)
	_, _ = aw.writeByteSpan(dict1)
	_, _ = aw.writeByteSpan(data2)
	_ = aw.stageChunk(h2, 2, 3)

	h3 := hashWithPrefix(t, 42)
	_, _ = aw.writeByteSpan(data3)
	_ = aw.stageChunk(h3, 2, 4)

	h4 := hashWithPrefix(t, 81)
	_, _ = aw.writeByteSpan(data4)
	_ = aw.stageChunk(h4, 0, 5)

	h5 := hashWithPrefix(t, 21)
	id, _ = aw.writeByteSpan(dict2)
	assert.Equal(t, uint32(6), id)
	_ = aw.stageChunk(h5, 6, 1)

	h6 := hashWithPrefix(t, 88)
	_ = aw.stageChunk(h6, 6, 1)

	h7 := hashWithPrefix(t, 42)
	_ = aw.stageChunk(h7, 2, 4)

	_ = aw.finalizeByteSpans()
	_ = aw.writeIndex()
	_ = aw.writeMetadata([]byte(""))
	_ = aw.writeFooter()

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	aIdx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)
	assert.Equal(t, []uint64{21, 42, 42, 42, 42, 81, 88}, aIdx.prefixes)

	assert.True(t, aIdx.has(h1))
	assert.True(t, aIdx.has(h2))
	assert.True(t, aIdx.has(h3))
	assert.True(t, aIdx.has(h4))
	assert.True(t, aIdx.has(h5))
	assert.True(t, aIdx.has(h6))
	assert.True(t, aIdx.has(h7))
	assert.False(t, aIdx.has(hash.Hash{}))
	assert.False(t, aIdx.has(hashWithPrefix(t, 42)))
	assert.False(t, aIdx.has(hashWithPrefix(t, 55)))

	dict, data, _ := aIdx.getRaw(h1)
	assert.Nil(t, dict)
	assert.Equal(t, data1, data)

	dict, data, _ = aIdx.getRaw(h2)
	assert.Equal(t, dict1, dict)
	assert.Equal(t, data2, data)

	dict, data, _ = aIdx.getRaw(h3)
	assert.Equal(t, dict1, dict)
	assert.Equal(t, data3, data)

	dict, data, _ = aIdx.getRaw(h4)
	assert.Nil(t, dict)
	assert.Equal(t, data4, data)

	dict, data, _ = aIdx.getRaw(h5)
	assert.Equal(t, dict2, dict)
	assert.Equal(t, data1, data)

	dict, data, _ = aIdx.getRaw(h6)
	assert.Equal(t, dict2, dict)
	assert.Equal(t, data1, data)

	dict, data, _ = aIdx.getRaw(h7)
	assert.Equal(t, dict1, dict)
	assert.Equal(t, data3, data)
}

func TestArchiveDictDecompression(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 4096))

	// This is 32K worth of data, but it's all very similar. It only fits in 4K if compressed with a dictionary.
	chks := generateSimilarChunks(42, 32)
	samples := make([][]byte, len(chks))
	for i, c := range chks {
		samples[i] = c.Data()
	}

	dict := gozstd.BuildDict(samples, 2048)
	cDict, err := gozstd.NewCDict(dict)
	assert.NoError(t, err)

	aw := newArchiveWriterWithSink(writer)

	dictId, err := aw.writeByteSpan(dict)
	assert.NoError(t, err)
	for _, chk := range chks {
		cmp := gozstd.CompressDict(nil, chk.Data(), cDict)

		chId, err := aw.writeByteSpan(cmp)
		assert.NoError(t, err)

		err = aw.stageChunk(chk.Hash(), dictId, chId)
		assert.NoError(t, err)
	}
	err = aw.finalizeByteSpans()
	assert.NoError(t, err)

	err = aw.writeIndex()
	assert.NoError(t, err)

	err = aw.writeMetadata([]byte("hello world"))
	assert.NoError(t, err)
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	aIdx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	// Now verify that we can look up the chunks by their original addresses, and that the data round-trips.
	for _, chk := range chks {
		roundTripData, err := aIdx.get(chk.Hash())
		assert.NoError(t, err)
		assert.Equal(t, chk.Data(), roundTripData)
	}
}

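// A standalone sketch of the gozstd dictionary round trip the test above
// relies on, with the archive machinery removed. This is illustrative rather
// than a test of archive code: it only demonstrates that data compressed with
// a CDict decompresses via a DDict built from the same dictionary bytes.
func TestGozstdDictRoundTripSketch(t *testing.T) {
	chks := generateSimilarChunks(99, 32)
	samples := make([][]byte, len(chks))
	for i, c := range chks {
		samples[i] = c.Data()
	}

	dict := gozstd.BuildDict(samples, 2048)
	cDict, err := gozstd.NewCDict(dict)
	assert.NoError(t, err)
	dDict, err := gozstd.NewDDict(dict)
	assert.NoError(t, err)

	for _, c := range chks {
		cmp := gozstd.CompressDict(nil, c.Data(), cDict)
		out, err := gozstd.DecompressDict(nil, cmp, dDict)
		assert.NoError(t, err)
		assert.Equal(t, c.Data(), out)
	}
}
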
func TestMetadata(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	err := aw.finalizeByteSpans()
	assert.NoError(t, err)
	err = aw.writeIndex()
	assert.NoError(t, err)
	err = aw.writeMetadata([]byte("All work and no play"))
	assert.NoError(t, err)
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	rdr, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	md, err := rdr.getMetadata()
	assert.NoError(t, err)
	assert.Equal(t, []byte("All work and no play"), md)
}

// zstd has an integrity check built into its frame format, and it is
// triggered when we attempt to decompress a corrupted chunk.
func TestArchiveChunkCorruption(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	_, _ = aw.writeByteSpan(testBlob)

	h := hashWithPrefix(t, 23)
	_ = aw.stageChunk(h, 0, 1)
	_ = aw.finalizeByteSpans()
	_ = aw.writeIndex()
	_ = aw.writeMetadata(nil)
	_ = aw.writeFooter()

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	idx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	// Corrupt the data.
	writer.buff[3] = writer.buff[3] + 1

	data, err := idx.get(h)
	assert.ErrorContains(t, err, "cannot decompress invalid src")
	assert.Nil(t, data)
}

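// A minimal standalone sketch of the failure mode exercised above, using only
// the public gozstd API: flipping a byte in the zstd frame header makes the
// frame unreadable, so Decompress must return an error rather than bad data.
func TestZstdFrameCorruptionSketch(t *testing.T) {
	payload := []byte("some chunk payload that will be compressed")
	frame := gozstd.Compress(nil, payload)

	frame[0] ^= 0xFF // corrupt the frame's magic number

	_, err := gozstd.Decompress(nil, frame)
	assert.Error(t, err)
}
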
// Validate that the SHA512 checksums in the footer check out, and that verification fails when the bytes are corrupted.
func TestArchiveCheckSumValidations(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)

	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	_, _ = aw.writeByteSpan(testBlob)

	h := hashWithPrefix(t, 23)
	_ = aw.stageChunk(h, 0, 1)
	err := aw.finalizeByteSpans()
	assert.NoError(t, err)
	err = aw.writeIndex()
	assert.NoError(t, err)
	err = aw.writeMetadata([]byte("All work and no play"))
	assert.NoError(t, err)
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	rdr, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	err = rdr.verifyDataCheckSum()
	assert.NoError(t, err)
	err = rdr.verifyIndexCheckSum()
	assert.NoError(t, err)
	err = rdr.verifyMetaCheckSum()
	assert.NoError(t, err)

	theBytes[5] = theBytes[5] + 1
	err = rdr.verifyDataCheckSum()
	assert.ErrorContains(t, err, "checksum mismatch")

	offset := rdr.footer.totalIndexSpan().offset + 2
	theBytes[offset] = theBytes[offset] + 1
	err = rdr.verifyIndexCheckSum()
	assert.ErrorContains(t, err, "checksum mismatch")

	offset = rdr.footer.metadataSpan().offset + 2
	theBytes[offset] = theBytes[offset] + 1
	err = rdr.verifyMetaCheckSum()
	assert.ErrorContains(t, err, "checksum mismatch")
}

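// A minimal sketch of the span-checksum idea exercised above, assuming only
// that each region (data, index, metadata) is hashed independently with
// SHA512, so corruption in one region is detected without re-reading the
// others. verifySpanSHA512 and its offset/length parameters are hypothetical
// illustrations, not the archive reader's actual API.
func verifySpanSHA512(buf []byte, offset, length uint64, want [sha512.Size]byte) bool {
	got := sha512.Sum512(buf[offset : offset+length])
	return got == want
}

func TestVerifySpanSHA512Sketch(t *testing.T) {
	buf := []byte("dataDATAindexINDEXmeta")
	want := sha512.Sum512(buf[4:8]) // hash just the "DATA" region
	assert.True(t, verifySpanSHA512(buf, 4, 4, want))

	buf[5] ^= 1 // corrupt one byte inside the region
	assert.False(t, verifySpanSHA512(buf, 4, 4, want))
}
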
func TestProllyBinSearchUneven(t *testing.T) {
	// We construct a prefix list which is not well distributed to ensure that the search still works, even if it's
	// not optimal.
	pf := make([]uint64, 1000)
	for i := 0; i < 900; i++ {
		pf[i] = uint64(i)
	}
	target := uint64(12345)
	pf[900] = target
	for i := 901; i < 1000; i++ {
		pf[i] = uint64(10000000 + i)
	}
	// In normal circumstances, a value of 12345 would be far to the left side of the list.
	found := prollyBinSearch(pf, target)
	assert.Equal(t, 900, found)

	// Same test, but with the target on the right side of the list.
	for i := 999; i > 100; i-- {
		pf[i] = uint64(math.MaxUint64 - uint64(i))
	}
	target = uint64(math.MaxUint64 - 12345)
	pf[100] = target
	for i := 99; i >= 0; i-- {
		pf[i] = uint64(10000000 - i)
	}
	found = prollyBinSearch(pf, target)
	assert.Equal(t, 100, found)
}

func TestProllyBinSearch(t *testing.T) {
	r := rand.New(rand.NewSource(42))
	curVal := uint64(r.Int())
	pf := make([]uint64, 10000)
	for i := 0; i < 10000; i++ {
		pf[i] = curVal
		curVal += uint64(r.Intn(10))
	}

	for i := 0; i < 10000; i++ {
		idx := prollyBinSearch(pf, pf[i])
		// There are dupes in the list, so we don't always end up with the same index.
		assert.Equal(t, pf[i], pf[idx])
	}

	idx := prollyBinSearch(pf, pf[0]-1)
	assert.Equal(t, 0, idx)
	idx = prollyBinSearch(pf, pf[9999]+1)
	assert.Equal(t, 10000, idx)

	// 23 is not a dupe, and its neighbors don't match. Stable due to the fixed seed.
	idx = prollyBinSearch(pf, pf[23]+1)
	assert.Equal(t, 24, idx)
	idx = prollyBinSearch(pf, pf[23]-1)
	assert.Equal(t, 23, idx)
}

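// A hedged cross-check of the convention the spot checks above suggest: on a
// strictly increasing prefix list, prollyBinSearch appears to return the
// lower-bound insertion index, which is exactly what sort.Search computes.
// This assumes that convention holds for arbitrary inputs, not just the
// cases asserted above.
func TestProllyBinSearchMatchesLowerBound(t *testing.T) {
	r := rand.New(rand.NewSource(7))
	pf := make([]uint64, 5000)
	curVal := uint64(1000)
	for i := range pf {
		pf[i] = curVal
		curVal += uint64(1 + r.Intn(9)) // strictly increasing, so no dupes
	}

	for i := 0; i < 5000; i++ {
		base := pf[r.Intn(len(pf))]
		for _, target := range []uint64{base - 1, base, base + 1} {
			expected := sort.Search(len(pf), func(j int) bool { return pf[j] >= target })
			assert.Equal(t, expected, prollyBinSearch(pf, target))
		}
	}
}
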
func TestDuplicateInsertion(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	_, _ = aw.writeByteSpan(testBlob)

	h := hashWithPrefix(t, 23)
	_ = aw.stageChunk(h, 0, 1)
	err := aw.stageChunk(h, 0, 1)
	assert.Equal(t, ErrDuplicateChunkWritten, err)
}

func TestInsertRanges(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	_, _ = aw.writeByteSpan(testBlob)

	h := hashWithPrefix(t, 23)
	err := aw.stageChunk(h, 0, 2)
	assert.Equal(t, ErrInvalidChunkRange, err)

	err = aw.stageChunk(h, 2, 1)
	assert.Equal(t, ErrInvalidDictionaryRange, err)
}

func TestFooterVersionAndSignature(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	err := aw.finalizeByteSpans()
	assert.NoError(t, err)
	err = aw.writeIndex()
	assert.NoError(t, err)
	err = aw.writeMetadata([]byte("All work and no play"))
	assert.NoError(t, err)
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	rdr, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	assert.Equal(t, archiveFormatVersion, rdr.footer.formatVersion)
	assert.Equal(t, archiveFileSignature, rdr.footer.fileSignature)

	// Corrupt the version.
	theBytes[fileSize-archiveFooterSize+afrVersionOffset] = 23
	readerAt = bytes.NewReader(theBytes)
	_, err = newArchiveReader(readerAt, fileSize)
	assert.ErrorContains(t, err, "invalid format version")

	// Corrupt the signature, but first restore the version.
	theBytes[fileSize-archiveFooterSize+afrVersionOffset] = archiveFormatVersion
	theBytes[fileSize-archiveFooterSize+afrSigOffset+2] = 'X'
	readerAt = bytes.NewReader(theBytes)
	_, err = newArchiveReader(readerAt, fileSize)
	assert.ErrorContains(t, err, "invalid file signature")
}

// Helper functions to create test data below.

// hashWithPrefix returns a random hash whose first eight bytes are
// overwritten with the given prefix, big-endian.
func hashWithPrefix(t *testing.T, prefix uint64) hash.Hash {
	randomBytes := make([]byte, 20)
	n, err := rand.Read(randomBytes)
	assert.Equal(t, 20, n)
	assert.NoError(t, err)

	binary.BigEndian.PutUint64(randomBytes, prefix)
	return hash.Hash(randomBytes)
}

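// A small sanity check of the helper above: the first eight bytes of the
// returned hash should decode back to the requested prefix.
func TestHashWithPrefixRoundTrip(t *testing.T) {
	h := hashWithPrefix(t, 0xDEADBEEF)
	assert.Equal(t, uint64(0xDEADBEEF), binary.BigEndian.Uint64(h[:8]))
}
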
// generateSimilarChunks creates chunks from the same seed with slightly
// different lengths, so their contents overlap heavily. That similarity is
// what makes dictionary compression effective in the tests above.
func generateSimilarChunks(seed int64, count int) []*chunks.Chunk {
	chks := make([]*chunks.Chunk, count)
	for i := 0; i < count; i++ {
		chks[i] = generateRandomChunk(seed, 1000+i)
	}

	return chks
}

func generateRandomChunk(seed int64, length int) *chunks.Chunk {
	r := rand.NewSource(seed)

	data := make([]byte, length)
	for i := range data {
		data[i] = byte(r.Int63())
	}
	c := chunks.NewChunk(data)
	return &c
}