github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/archive_test.go

// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
	"bytes"
	"encoding/binary"
	"math"
	"math/rand"
	"testing"

	"github.com/dolthub/gozstd"
	"github.com/stretchr/testify/assert"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

func TestArchiveSingleChunk(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	bsId, err := aw.writeByteSpan(testBlob)
	assert.NoError(t, err)
	assert.Equal(t, uint32(1), bsId)
	assert.Equal(t, uint64(10), aw.bytesWritten) // 10 data bytes; no CRC or other framing overhead.

	oneHash := hashWithPrefix(t, 23)

	err = aw.stageChunk(oneHash, 0, 1)
	assert.NoError(t, err)

	err = aw.finalizeByteSpans()
	assert.NoError(t, err)

	err = aw.writeIndex()
	assert.NoError(t, err)
	// The 'uncompressed' size of the index is 23 bytes. Compressing data that small is not worth it,
	// so we verify that the index written in this situation is 35 bytes.
	assert.Equal(t, uint32(35), aw.indexLen)

	err = aw.writeMetadata([]byte(""))
	assert.NoError(t, err)

	err = aw.writeFooter()
	assert.NoError(t, err)

	assert.Equal(t, 10+35+archiveFooterSize, aw.bytesWritten) // 10 data bytes + 35 index bytes + footer.

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	aIdx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	assert.Equal(t, []uint64{23}, aIdx.prefixes)
	assert.True(t, aIdx.has(oneHash))

	dict, data, err := aIdx.getRaw(oneHash)
	assert.NoError(t, err)
	assert.Nil(t, dict)
	assert.Equal(t, testBlob, data)
}

func TestArchiveSingleChunkWithDictionary(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testDict := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	testData := []byte{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
	_, _ = aw.writeByteSpan(testDict)
	_, _ = aw.writeByteSpan(testData)

	h := hashWithPrefix(t, 42)
	err := aw.stageChunk(h, 1, 2)
	assert.NoError(t, err)

	_ = aw.finalizeByteSpans()
	_ = aw.writeIndex()
	_ = aw.writeMetadata([]byte(""))
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	aIdx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)
	assert.Equal(t, []uint64{42}, aIdx.prefixes)

	assert.True(t, aIdx.has(h))

	dict, data, err := aIdx.getRaw(h)
	assert.NoError(t, err)
	assert.Equal(t, testDict, dict)
	assert.Equal(t, testData, data)
}
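
// Every test in this file walks the same write-side lifecycle: write byte
// spans, stage chunks against those spans, then finalizeByteSpans, writeIndex,
// writeMetadata, and writeFooter, before handing the serialized bytes to
// newArchiveReader. The helper below is a minimal sketch of that lifecycle for
// the single-chunk, no-dictionary case. It is illustrative only and not wired
// into the surrounding tests, which stay self-contained; it assumes nothing
// beyond the API already exercised in this file.
func buildSingleChunkArchive(t *testing.T, h hash.Hash, payload []byte) []byte {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)

	id, err := aw.writeByteSpan(payload) // span ids start at 1
	assert.NoError(t, err)

	// A dictionary span id of 0 means "no dictionary" for this chunk.
	assert.NoError(t, aw.stageChunk(h, 0, id))

	assert.NoError(t, aw.finalizeByteSpans())
	assert.NoError(t, aw.writeIndex())
	assert.NoError(t, aw.writeMetadata(nil))
	assert.NoError(t, aw.writeFooter())

	// The sink's buffer now holds the complete archive: data, index, metadata, footer.
	return writer.buff[:writer.pos]
}
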
func TestArchiverMultipleChunksMultipleDictionaries(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	data1 := []byte{11, 11, 11, 11, 11, 11, 11, 11, 11, 11} // span 1
	dict1 := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}           // span 2
	data2 := []byte{22, 22, 22, 22, 22, 22, 22, 22, 22, 22} // span 3
	data3 := []byte{33, 33, 33, 33, 33, 33, 33, 33, 33, 33} // span 4
	data4 := []byte{44, 44, 44, 44, 44, 44, 44, 44, 44, 44} // span 5
	dict2 := []byte{2, 2, 2, 2, 2, 2, 2, 2, 2, 2}           // span 6

	h1 := hashWithPrefix(t, 42)
	id, _ := aw.writeByteSpan(data1)
	assert.Equal(t, uint32(1), id)
	_ = aw.stageChunk(h1, 0, 1)

	h2 := hashWithPrefix(t, 42)
	_, _ = aw.writeByteSpan(dict1)
	_, _ = aw.writeByteSpan(data2)
	_ = aw.stageChunk(h2, 2, 3)

	h3 := hashWithPrefix(t, 42)
	_, _ = aw.writeByteSpan(data3)
	_ = aw.stageChunk(h3, 2, 4)

	h4 := hashWithPrefix(t, 81)
	_, _ = aw.writeByteSpan(data4)
	_ = aw.stageChunk(h4, 0, 5)

	h5 := hashWithPrefix(t, 21)
	id, _ = aw.writeByteSpan(dict2)
	assert.Equal(t, uint32(6), id)
	_ = aw.stageChunk(h5, 6, 1)

	// h6 and h7 reuse spans already referenced by earlier chunks.
	h6 := hashWithPrefix(t, 88)
	_ = aw.stageChunk(h6, 6, 1)

	h7 := hashWithPrefix(t, 42)
	_ = aw.stageChunk(h7, 2, 4)

	_ = aw.finalizeByteSpans()
	_ = aw.writeIndex()
	_ = aw.writeMetadata([]byte(""))
	_ = aw.writeFooter()

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	aIdx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)
	// Prefixes are stored in sorted order, regardless of insertion order.
	assert.Equal(t, []uint64{21, 42, 42, 42, 42, 81, 88}, aIdx.prefixes)

	assert.True(t, aIdx.has(h1))
	assert.True(t, aIdx.has(h2))
	assert.True(t, aIdx.has(h3))
	assert.True(t, aIdx.has(h4))
	assert.True(t, aIdx.has(h5))
	assert.True(t, aIdx.has(h6))
	assert.True(t, aIdx.has(h7))
	assert.False(t, aIdx.has(hash.Hash{}))
	assert.False(t, aIdx.has(hashWithPrefix(t, 42))) // prefix exists, but the random suffix won't match.
	assert.False(t, aIdx.has(hashWithPrefix(t, 55)))

	dict, data, _ := aIdx.getRaw(h1)
	assert.Nil(t, dict)
	assert.Equal(t, data1, data)

	dict, data, _ = aIdx.getRaw(h2)
	assert.Equal(t, dict1, dict)
	assert.Equal(t, data2, data)

	dict, data, _ = aIdx.getRaw(h3)
	assert.Equal(t, dict1, dict)
	assert.Equal(t, data3, data)

	dict, data, _ = aIdx.getRaw(h4)
	assert.Nil(t, dict)
	assert.Equal(t, data4, data)

	dict, data, _ = aIdx.getRaw(h5)
	assert.Equal(t, dict2, dict)
	assert.Equal(t, data1, data)

	dict, data, _ = aIdx.getRaw(h6)
	assert.Equal(t, dict2, dict)
	assert.Equal(t, data1, data)

	dict, data, _ = aIdx.getRaw(h7)
	assert.Equal(t, dict1, dict)
	assert.Equal(t, data3, data)
}
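
// getRaw returns a chunk's spans verbatim: the dictionary span (nil when the
// chunk was staged with dictionary id 0) and the data span. In a real archive
// the data span holds a zstd frame compressed against the dictionary span, as
// TestArchiveDictDecompression below constructs; the tests above store raw
// bytes instead, so no decompression applies there. A minimal sketch of how a
// caller could materialize a chunk from the raw pair under that assumption
// (hypothetical helper, not part of the archive API):
func decompressWithRawDict(dict, data []byte) ([]byte, error) {
	if dict == nil {
		// No dictionary: the data span is a plain zstd frame.
		return gozstd.Decompress(nil, data)
	}
	dDict, err := gozstd.NewDDict(dict)
	if err != nil {
		return nil, err
	}
	defer dDict.Release()
	return gozstd.DecompressDict(nil, data, dDict)
}
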
func TestArchiveDictDecompression(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 4096))

	// This is 32K worth of data, but it's all very similar. It only fits in 4K if compressed with a dictionary.
	chks := generateSimilarChunks(42, 32)
	samples := make([][]byte, len(chks))
	for i, c := range chks {
		samples[i] = c.Data()
	}

	dict := gozstd.BuildDict(samples, 2048)
	cDict, err := gozstd.NewCDict(dict)
	assert.NoError(t, err)

	aw := newArchiveWriterWithSink(writer)

	dictId, err := aw.writeByteSpan(dict)
	assert.NoError(t, err)
	for _, chk := range chks {
		cmp := gozstd.CompressDict(nil, chk.Data(), cDict)

		chId, err := aw.writeByteSpan(cmp)
		assert.NoError(t, err)

		err = aw.stageChunk(chk.Hash(), dictId, chId)
		assert.NoError(t, err)
	}
	err = aw.finalizeByteSpans()
	assert.NoError(t, err)

	err = aw.writeIndex()
	assert.NoError(t, err)

	err = aw.writeMetadata([]byte("hello world"))
	assert.NoError(t, err)

	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	aIdx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	// Now verify that we can look up the chunks by their original addresses, and that the data round-trips.
	for _, chk := range chks {
		roundTripData, err := aIdx.get(chk.Hash())
		assert.NoError(t, err)
		assert.Equal(t, chk.Data(), roundTripData)
	}
}

func TestMetadata(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	err := aw.finalizeByteSpans()
	assert.NoError(t, err)
	err = aw.writeIndex()
	assert.NoError(t, err)
	err = aw.writeMetadata([]byte("All work and no play"))
	assert.NoError(t, err)
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	rdr, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	md, err := rdr.getMetadata()
	assert.NoError(t, err)
	assert.Equal(t, []byte("All work and no play"), md)
}

// zstd has a CRC check built in, and it is triggered when we attempt to
// decompress a corrupted chunk.
func TestArchiveChunkCorruption(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	_, _ = aw.writeByteSpan(testBlob)

	h := hashWithPrefix(t, 23)
	_ = aw.stageChunk(h, 0, 1)
	_ = aw.finalizeByteSpans()
	_ = aw.writeIndex()
	_ = aw.writeMetadata(nil)
	_ = aw.writeFooter()

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	idx, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	// Corrupt the data.
	writer.buff[3] = writer.buff[3] + 1

	data, err := idx.get(h)
	assert.ErrorContains(t, err, "cannot decompress invalid src")
	assert.Nil(t, data)
}
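
// The error above comes from zstd rejecting a frame it cannot parse. A
// standalone illustration of the same mechanism, independent of the archive
// format (hypothetical helper, shown for clarity): corrupting a frame's
// leading magic bytes guarantees that decompression fails.
func zstdRejectsCorruptFrame() error {
	frame := gozstd.Compress(nil, []byte("some chunk payload"))
	frame[0] ^= 0xFF // destroy the zstd frame magic number
	_, err := gozstd.Decompress(nil, frame)
	return err // non-nil: gozstd cannot decompress the damaged frame
}
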
// Validate that the SHA512 checksums in the footer check out, and that
// verification fails once the underlying bytes are corrupted.
func TestArchiveCheckSumValidations(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)

	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	_, _ = aw.writeByteSpan(testBlob)

	h := hashWithPrefix(t, 23)
	_ = aw.stageChunk(h, 0, 1)
	err := aw.finalizeByteSpans()
	assert.NoError(t, err)
	err = aw.writeIndex()
	assert.NoError(t, err)
	err = aw.writeMetadata([]byte("All work and no play"))
	assert.NoError(t, err)
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	rdr, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	err = rdr.verifyDataCheckSum()
	assert.NoError(t, err)
	err = rdr.verifyIndexCheckSum()
	assert.NoError(t, err)
	err = rdr.verifyMetaCheckSum()
	assert.NoError(t, err)

	theBytes[5] = theBytes[5] + 1
	err = rdr.verifyDataCheckSum()
	assert.ErrorContains(t, err, "checksum mismatch")

	offset := rdr.footer.totalIndexSpan().offset + 2
	theBytes[offset] = theBytes[offset] + 1
	err = rdr.verifyIndexCheckSum()
	assert.ErrorContains(t, err, "checksum mismatch")

	offset = rdr.footer.metadataSpan().offset + 2
	theBytes[offset] = theBytes[offset] + 1
	err = rdr.verifyMetaCheckSum()
	assert.ErrorContains(t, err, "checksum mismatch")
}

func TestProllyBinSearchUneven(t *testing.T) {
	// We construct a prefix list which is not well distributed to ensure that the search still works,
	// even if it is not optimal.
	pf := make([]uint64, 1000)
	for i := 0; i < 900; i++ {
		pf[i] = uint64(i)
	}
	target := uint64(12345)
	pf[900] = target
	for i := 901; i < 1000; i++ {
		pf[i] = uint64(10000000 + i)
	}
	// Given an even distribution, a value of 12345 would be expected far to the left side of the list.
	found := prollyBinSearch(pf, target)
	assert.Equal(t, 900, found)

	// Same test, but for a target near the right side of the list.
	for i := 999; i > 100; i-- {
		pf[i] = uint64(math.MaxUint64 - uint64(i))
	}
	target = uint64(math.MaxUint64 - 12345)
	pf[100] = target
	for i := 99; i >= 0; i-- {
		pf[i] = uint64(10000000 - i)
	}
	found = prollyBinSearch(pf, target)
	assert.Equal(t, 100, found)
}
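
// The uneven-distribution test above and the randomized test below pin down
// prollyBinSearch's contract: for an absent target it returns the insertion
// point (the first index whose value is >= the target, which can equal
// len(pf)); for a present target it returns some index within the run of equal
// values. A plain binary-search oracle with the lower-bound contract is
// sketched here for reference (hypothetical helper; prollyBinSearch is
// expected to agree with it on absent targets while using the prefix
// distribution to pick better probe points, which is an assumption about its
// implementation, not verified here):
func lowerBoundOracle(pf []uint64, target uint64) int {
	lo, hi := 0, len(pf)
	for lo < hi {
		mid := (lo + hi) / 2 // no overflow concern at test-scale lengths
		if pf[mid] < target {
			lo = mid + 1
		} else {
			hi = mid
		}
	}
	return lo
}
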
func TestProllyBinSearch(t *testing.T) {
	r := rand.New(rand.NewSource(42))
	curVal := uint64(r.Int())
	pf := make([]uint64, 10000)
	for i := 0; i < 10000; i++ {
		pf[i] = curVal
		curVal += uint64(r.Intn(10))
	}

	for i := 0; i < 10000; i++ {
		idx := prollyBinSearch(pf, pf[i])
		// There are dupes in the list, so we don't always end up with the same index,
		// but the value found must match.
		assert.Equal(t, pf[i], pf[idx])
	}

	idx := prollyBinSearch(pf, pf[0]-1)
	assert.Equal(t, 0, idx)
	idx = prollyBinSearch(pf, pf[9999]+1)
	assert.Equal(t, 10000, idx)

	// pf[23] is not a dupe, and its neighbors don't match it; this is stable due to the fixed seed.
	idx = prollyBinSearch(pf, pf[23]+1)
	assert.Equal(t, 24, idx)
	idx = prollyBinSearch(pf, pf[23]-1)
	assert.Equal(t, 23, idx)
}

func TestDuplicateInsertion(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	_, _ = aw.writeByteSpan(testBlob)

	h := hashWithPrefix(t, 23)
	_ = aw.stageChunk(h, 0, 1)
	err := aw.stageChunk(h, 0, 1)
	assert.Equal(t, ErrDuplicateChunkWritten, err)
}

func TestInsertRanges(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	testBlob := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	_, _ = aw.writeByteSpan(testBlob)

	h := hashWithPrefix(t, 23)
	err := aw.stageChunk(h, 0, 2) // data span 2 was never written
	assert.Equal(t, ErrInvalidChunkRange, err)

	err = aw.stageChunk(h, 2, 1) // dictionary span 2 was never written
	assert.Equal(t, ErrInvalidDictionaryRange, err)
}

func TestFooterVersionAndSignature(t *testing.T) {
	writer := NewFixedBufferByteSink(make([]byte, 1024))
	aw := newArchiveWriterWithSink(writer)
	err := aw.finalizeByteSpans()
	assert.NoError(t, err)
	err = aw.writeIndex()
	assert.NoError(t, err)
	err = aw.writeMetadata([]byte("All work and no play"))
	assert.NoError(t, err)
	err = aw.writeFooter()
	assert.NoError(t, err)

	theBytes := writer.buff[:writer.pos]
	fileSize := uint64(len(theBytes))
	readerAt := bytes.NewReader(theBytes)
	rdr, err := newArchiveReader(readerAt, fileSize)
	assert.NoError(t, err)

	assert.Equal(t, archiveFormatVersion, rdr.footer.formatVersion)
	assert.Equal(t, archiveFileSignature, rdr.footer.fileSignature)

	// Corrupt the version.
	theBytes[fileSize-archiveFooterSize+afrVersionOffset] = 23
	readerAt = bytes.NewReader(theBytes)
	_, err = newArchiveReader(readerAt, fileSize)
	assert.ErrorContains(t, err, "invalid format version")

	// Corrupt the signature, but first restore the version.
	theBytes[fileSize-archiveFooterSize+afrVersionOffset] = archiveFormatVersion
	theBytes[fileSize-archiveFooterSize+afrSigOffset+2] = 'X'
	readerAt = bytes.NewReader(theBytes)
	_, err = newArchiveReader(readerAt, fileSize)
	assert.ErrorContains(t, err, "invalid file signature")
}

// Helper functions to create test data.

// hashWithPrefix returns a hash whose first eight bytes hold the given prefix,
// big-endian; the remaining bytes are random.
func hashWithPrefix(t *testing.T, prefix uint64) hash.Hash {
	randomBytes := make([]byte, 20)
	n, err := rand.Read(randomBytes)
	assert.Equal(t, 20, n)
	assert.NoError(t, err)

	binary.BigEndian.PutUint64(randomBytes, prefix)
	return hash.Hash(randomBytes)
}

func generateSimilarChunks(seed int64, count int) []*chunks.Chunk {
	chks := make([]*chunks.Chunk, count)
	for i := 0; i < count; i++ {
		chks[i] = generateRandomChunk(seed, 1000+i)
	}

	return chks
}

func generateRandomChunk(seed int64, size int) *chunks.Chunk {
	r := rand.NewSource(seed)

	data := make([]byte, size)
	for i := range data {
		data[i] = byte(r.Int63())
	}
	c := chunks.NewChunk(data)
	return &c
}
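
// hashPrefix is the read-side complement of hashWithPrefix: the archive index
// orders chunks by the big-endian uint64 carried in a hash's first eight
// bytes, which is why the aIdx.prefixes assertions above appear in sorted
// order. A trivial sketch (hypothetical helper, not used by the tests):
func hashPrefix(h hash.Hash) uint64 {
	return binary.BigEndian.Uint64(h[:8])
}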