github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/compaction_replace_integration_test.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

//go:build integrationTest
// +build integrationTest

package lsmkv

import (
	"context"
	"fmt"
	"math/rand"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/weaviate/weaviate/entities/cyclemanager"
)
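
// Note: none of the functions in this file are tests themselves. Each helper
// is parameterized over a slice of BucketOption so the same compaction
// scenarios can be driven with different bucket configurations, and the file
// only builds with the integrationTest tag (e.g. `go test -tags
// integrationTest ./adapters/repos/db/lsmkv/`). A minimal sketch of a caller
// is shown below for orientation; WithStrategy and StrategyReplace exist in
// this package, but the size bounds passed here are illustrative
// placeholders, not values taken from a real test run:
//
//	func TestCompactionReplaceStrategy(t *testing.T) {
//		opts := []BucketOption{WithStrategy(StrategyReplace)}
//		// hypothetical bounds for the single post-compaction segment
//		compactionReplaceStrategy(context.Background(), t, opts, 1, 1e9)
//	}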

func compactionReplaceStrategy(ctx context.Context, t *testing.T, opts []BucketOption,
	expectedMinSize, expectedMaxSize int64,
) {
	size := 200

	type kv struct {
		key    []byte
		value  []byte
		delete bool
	}

	var segment1 []kv
	var segment2 []kv
	var expected []kv
	var bucket *Bucket

	dirName := t.TempDir()

	t.Run("create test data", func(t *testing.T) {
		// The test data is split into 4 scenarios evenly:
		//
		// 1.) created in the first segment, never touched again
		// 2.) created in the first segment, updated in the second
		// 3.) created in the first segment, deleted in the second
		// 4.) not present in the first segment, created in the second
		for i := 0; i < size; i++ {
			key := []byte(fmt.Sprintf("key-%3d", i))
			originalValue := []byte(fmt.Sprintf("value-%3d-original", i))

			switch i % 4 {
			case 0:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:   key,
					value: originalValue,
				})

				// leave this element untouched in the second segment
				expected = append(expected, kv{
					key:   key,
					value: originalValue,
				})
			case 1:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:   key,
					value: originalValue,
				})

				// update in the second segment
				updatedValue := []byte(fmt.Sprintf("value-%3d-updated", i))
				segment2 = append(segment2, kv{
					key:   key,
					value: updatedValue,
				})

				expected = append(expected, kv{
					key:   key,
					value: updatedValue,
				})
			case 2:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:   key,
					value: originalValue,
				})

				// delete in the second segment
				segment2 = append(segment2, kv{
					key:    key,
					delete: true,
				})

				// do not add to expected at all

			case 3:
				// do not add to segment 1

				// only add to segment 2 (first entry)
				segment2 = append(segment2, kv{
					key:   key,
					value: originalValue,
				})

				expected = append(expected, kv{
					key:   key,
					value: originalValue,
				})
			}
		}
	})

	t.Run("shuffle the import order for each segment", func(t *testing.T) {
		// this is to make sure we don't accidentally rely on the import order
		rand.Shuffle(len(segment1), func(i, j int) {
			segment1[i], segment1[j] = segment1[j], segment1[i]
		})
		rand.Shuffle(len(segment2), func(i, j int) {
			segment2[i], segment2[j] = segment2[j], segment2[i]
		})
	})

	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("import segment 1", func(t *testing.T) {
		for _, pair := range segment1 {
			if !pair.delete {
				err := bucket.Put(pair.key, pair.value)
				require.Nil(t, err)
			} else {
				err := bucket.Delete(pair.key)
				require.Nil(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("import segment 2", func(t *testing.T) {
		for _, pair := range segment2 {
			if !pair.delete {
				err := bucket.Put(pair.key, pair.value)
				require.Nil(t, err)
			} else {
				err := bucket.Delete(pair.key)
				require.Nil(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("verify control before compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.Cursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			keyCopy := copyByteSlice(k)
			valueCopy := copyByteSlice(v)
			retrieved = append(retrieved, kv{
				key:   keyCopy,
				value: valueCopy,
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("verify count control before compaction", func(*testing.T) {
		assert.Equal(t, len(expected), bucket.Count())
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.Cursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			keyCopy := copyByteSlice(k)
			valueCopy := copyByteSlice(v)
			retrieved = append(retrieved, kv{
				key:   keyCopy,
				value: valueCopy,
			})
		}

		assert.Equal(t, expected, retrieved)
		assertSingleSegmentOfSize(t, bucket, expectedMinSize, expectedMaxSize)
	})

	t.Run("verify control using individual get operations",
		func(t *testing.T) {
			for _, pair := range expected {
				retrieved, err := bucket.Get(pair.key)
				require.NoError(t, err)

				assert.Equal(t, pair.value, retrieved)
			}
		})

	t.Run("verify count after compaction", func(*testing.T) {
		assert.Equal(t, len(expected), bucket.Count())
	})
}
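
// compactUntilSettled is an illustrative refactoring sketch; it is not
// called by the tests in this file, which instead repeat the same
// "compact until no longer eligible" loop inline. bucket.disk.compactOnce
// reports whether a compaction was performed, so looping until it returns
// false drains all eligible segment pairs.
func compactUntilSettled(t *testing.T, bucket *Bucket) {
	t.Helper()
	for {
		compacted, err := bucket.disk.compactOnce()
		require.Nil(t, err)
		if !compacted {
			return
		}
	}
}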

func compactionReplaceStrategy_WithSecondaryKeys(ctx context.Context, t *testing.T, opts []BucketOption) {
	size := 4

	type kv struct {
		key           []byte
		value         []byte
		secondaryKeys [][]byte
		delete        bool
	}

	var segment1 []kv
	var segment2 []kv
	var expected []kv
	var expectedNotPresent []kv
	var bucket *Bucket

	dirName := t.TempDir()

	t.Run("create test data", func(t *testing.T) {
		// The test data is split into 4 scenarios evenly:
		//
		// 1.) created in the first segment, never touched again
		// 2.) created in the first segment, updated in the second
		// 3.) created in the first segment, deleted in the second
		// 4.) not present in the first segment, created in the second
		for i := 0; i < size; i++ {
			key := []byte(fmt.Sprintf("key-%02d", i))
			secondaryKey := []byte(fmt.Sprintf("secondary-key-%02d", i))
			originalValue := []byte(fmt.Sprintf("value-%2d-original", i))

			switch i % 4 {
			case 0:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:           key,
					secondaryKeys: [][]byte{secondaryKey},
					value:         originalValue,
				})

				// leave this element untouched in the second segment
				expected = append(expected, kv{
					key:   secondaryKey,
					value: originalValue,
				})
			case 1:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:           key,
					secondaryKeys: [][]byte{secondaryKey},
					value:         originalValue,
				})

				// update in the second segment
				updatedValue := []byte(fmt.Sprintf("value-%2d-updated", i))
				segment2 = append(segment2, kv{
					key:           key,
					secondaryKeys: [][]byte{secondaryKey},
					value:         updatedValue,
				})

				expected = append(expected, kv{
					key:   secondaryKey,
					value: updatedValue,
				})
			case 2:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:           key,
					secondaryKeys: [][]byte{secondaryKey},
					value:         originalValue,
				})

				// delete in the second segment
				segment2 = append(segment2, kv{
					key:           key,
					secondaryKeys: [][]byte{secondaryKey},
					delete:        true,
				})

				expectedNotPresent = append(expectedNotPresent, kv{
					key: secondaryKey,
				})

			case 3:
				// do not add to segment 1

				// only add to segment 2 (first entry)
				segment2 = append(segment2, kv{
					key:           key,
					secondaryKeys: [][]byte{secondaryKey},
					value:         originalValue,
				})

				expected = append(expected, kv{
					key:   secondaryKey,
					value: originalValue,
				})
			}
		}
	})

	t.Run("shuffle the import order for each segment", func(t *testing.T) {
		// this is to make sure we don't accidentally rely on the import order
		rand.Shuffle(len(segment1), func(i, j int) {
			segment1[i], segment1[j] = segment1[j], segment1[i]
		})
		rand.Shuffle(len(segment2), func(i, j int) {
			segment2[i], segment2[j] = segment2[j], segment2[i]
		})
	})

	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("import segment 1", func(t *testing.T) {
		for _, pair := range segment1 {
			if !pair.delete {
				err := bucket.Put(pair.key, pair.value,
					WithSecondaryKey(0, pair.secondaryKeys[0]))
				require.Nil(t, err)
			} else {
				err := bucket.Delete(pair.key,
					WithSecondaryKey(0, pair.secondaryKeys[0]))
				require.Nil(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("import segment 2", func(t *testing.T) {
		for _, pair := range segment2 {
			if !pair.delete {
				err := bucket.Put(pair.key, pair.value,
					WithSecondaryKey(0, pair.secondaryKeys[0]))
				require.Nil(t, err)
			} else {
				err := bucket.Delete(pair.key,
					WithSecondaryKey(0, pair.secondaryKeys[0]))
				require.Nil(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("verify control before compaction", func(t *testing.T) {
		t.Run("verify the ones that should exist", func(t *testing.T) {
			for _, pair := range expected {
				res, err := bucket.GetBySecondary(0, pair.key)
				require.Nil(t, err)

				assert.Equal(t, pair.value, res)
			}
		})

		t.Run("verify the ones that should NOT exist", func(t *testing.T) {
			for _, pair := range expectedNotPresent {
				res, err := bucket.GetBySecondary(0, pair.key)
				require.Nil(t, err)
				assert.Nil(t, res)
			}
		})
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction", func(t *testing.T) {
		t.Run("verify the ones that should exist", func(t *testing.T) {
			for _, pair := range expected {
				res, err := bucket.GetBySecondary(0, pair.key)
				require.Nil(t, err)

				assert.Equal(t, pair.value, res)
			}
		})

		t.Run("verify the ones that should NOT exist", func(t *testing.T) {
			for _, pair := range expectedNotPresent {
				res, err := bucket.GetBySecondary(0, pair.key)
				require.Nil(t, err)
				assert.Nil(t, res)
			}
		})
	})
}
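
// The secondary-key contract exercised above, condensed into a sketch. It
// assumes the bucket was created with a secondary index configured (this
// package's WithSecondaryIndices(1) option is the likely wiring, but treat
// that as an assumption rather than part of the test above): a value written
// with WithSecondaryKey is retrievable via GetBySecondary at the same index
// position, and a delete issued with the secondary key hides it from
// secondary lookups as well.
//
//	_ = bucket.Put([]byte("k1"), []byte("v1"), WithSecondaryKey(0, []byte("s1")))
//	res, _ := bucket.GetBySecondary(0, []byte("s1")) // res == []byte("v1")
//	_ = bucket.Delete([]byte("k1"), WithSecondaryKey(0, []byte("s1")))
//	res, _ = bucket.GetBySecondary(0, []byte("s1")) // res == nil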

func compactionReplaceStrategy_RemoveUnnecessaryDeletes(ctx context.Context, t *testing.T, opts []BucketOption) {
	// in this test each segment reverses the action of the previous segment,
	// so that in the end a lot of information is present in the individual
	// segments which is no longer needed. We then verify that after all
	// compactions this information is gone, thus freeing up disk space
	size := 100

	type kv struct {
		key   []byte
		value []byte
	}

	key := []byte("my-key")

	var bucket *Bucket
	dirName := t.TempDir()

	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("write segments", func(t *testing.T) {
		for i := 0; i < size; i++ {
			if i != 0 {
				// we can only delete an existing value, so skip the delete on
				// the very first write
				err := bucket.Delete(key)
				require.Nil(t, err)
			}

			err := bucket.Put(key, []byte(fmt.Sprintf("set in round %d", i)))
			require.Nil(t, err)

			require.Nil(t, bucket.FlushAndSwitch())
		}
	})

	expected := []kv{
		{
			key:   key,
			value: []byte(fmt.Sprintf("set in round %d", size-1)),
		},
	}

	t.Run("verify control before compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.Cursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:   k,
				value: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.Cursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:   k,
				value: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})
}
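
// A worked example of the redundancy above, with three rounds:
//
//	segment 0: my-key = "set in round 0"
//	segment 1: delete my-key, then my-key = "set in round 1"
//	segment 2: delete my-key, then my-key = "set in round 2"
//
// For the replace strategy only the newest entry per key matters, so every
// delete and every value other than "set in round 2" is dead weight.
// Compacting pairwise therefore converges on a single segment that holds
// just my-key = "set in round 2", which is what frees the disk space the
// comment above refers to.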

func compactionReplaceStrategy_RemoveUnnecessaryUpdates(ctx context.Context, t *testing.T, opts []BucketOption) {
	// in this test each segment updates the value written by the previous
	// segment, so that in the end a lot of information is present in the
	// individual segments which is no longer needed. We then verify that
	// after all compactions this information is gone, thus freeing up disk
	// space
	size := 100

	type kv struct {
		key   []byte
		value []byte
	}

	key := []byte("my-key")

	var bucket *Bucket
	dirName := t.TempDir()

	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("write segments", func(t *testing.T) {
		for i := 0; i < size; i++ {
			err := bucket.Put(key, []byte(fmt.Sprintf("set in round %d", i)))
			require.Nil(t, err)

			require.Nil(t, bucket.FlushAndSwitch())
		}
	})

	expected := []kv{
		{
			key:   key,
			value: []byte(fmt.Sprintf("set in round %d", size-1)),
		},
	}

	t.Run("verify control before compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.Cursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:   k,
				value: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.Cursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:   k,
				value: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})
}

func compactionReplaceStrategy_FrequentPutDeleteOperations(ctx context.Context, t *testing.T, opts []BucketOption) {
	// This test verifies that compaction does not make the object disappear.
	// We write a number of segments in a loop: each round puts the object
	// and, except in the last round, deletes it again before flushing. The
	// last operation in the last segment is therefore a put, so after
	// compaction the object has to exist.
	size := 100

	key := []byte("my-key")

	var bucket *Bucket
	dirName := t.TempDir()

	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("write segments, leave the last segment with value", func(t *testing.T) {
		for i := 0; i < size; i++ {
			err := bucket.Put(key, []byte(fmt.Sprintf("set in round %d", i)))
			require.Nil(t, err)

			if i != size-1 {
				// don't delete from the last segment
				err := bucket.Delete(key)
				require.Nil(t, err)
			}

			require.Nil(t, bucket.FlushAndSwitch())
		}
	})

	t.Run("verify that the object exists before compaction", func(t *testing.T) {
		res, err := bucket.Get(key)
		assert.Nil(t, err)
		assert.NotNil(t, res)
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
		}
		require.Nil(t, err)
	})

	t.Run("verify that the object still exists after compaction", func(t *testing.T) {
		res, err := bucket.Get(key)
		assert.Nil(t, err)
		assert.NotNil(t, res)
	})
}
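
// Segment layout produced by the loop above, sketched for size = 4:
//
//	segment 0: put my-key, delete my-key
//	segment 1: put my-key, delete my-key
//	segment 2: put my-key, delete my-key
//	segment 3: put my-key (no delete in the last round)
//
// Every segment except the last nets out to a deletion, so regardless of
// which segment pairs the compaction picks, the final put must win; the
// assertions above check exactly that before and after compacting.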

func compactionReplaceStrategy_FrequentPutDeleteOperations_WithSecondaryKeys(ctx context.Context, t *testing.T, opts []BucketOption) {
	// This test verifies that compaction does not make the object disappear.
	// As above, each round puts the object and, except in the last round,
	// deletes it again before flushing, so the last operation in the last
	// segment is a put. We run this for scenarios of 4 to 9 segments:
	// without the fix in the firstWithAllKeys cursor method, which now sets
	// nextOffset properly, we got discrepancies after compaction in the
	// 4- and 8-segment scenarios.
	maxSize := 10

	for size := 4; size < maxSize; size++ {
		t.Run(fmt.Sprintf("compact %v segments", size), func(t *testing.T) {
			var bucket *Bucket

			key := []byte("key-original")
			keySecondary := []byte(fmt.Sprintf("secondary-key-%02d", size-1))

			dirName := t.TempDir()

			t.Run("init bucket", func(t *testing.T) {
				b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
					cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
				require.Nil(t, err)

				// so big it effectively never triggers as part of this test
				b.SetMemtableThreshold(1e9)

				bucket = b
			})

			t.Run("write segments, leave the last segment with value", func(t *testing.T) {
				for i := 0; i < size; i++ {
					secondaryKey := []byte(fmt.Sprintf("secondary-key-%02d", i))
					originalValue := []byte(fmt.Sprintf("value-%2d-original", i))

					err := bucket.Put(key, originalValue, WithSecondaryKey(0, secondaryKey))
					require.Nil(t, err)

					if i != size-1 {
						// don't delete from the last segment
						err := bucket.Delete(key, WithSecondaryKey(0, secondaryKey))
						require.Nil(t, err)
					}

					require.Nil(t, bucket.FlushAndSwitch())
				}
			})
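
			// Only the final round's secondary key is still live: every
			// earlier round deleted its entry (together with that round's
			// secondary key) before flushing, which is why keySecondary
			// above is built from round size-1.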
			t.Run("verify that the object exists before compaction", func(t *testing.T) {
				res, err := bucket.GetBySecondary(0, keySecondary)
				assert.Nil(t, err)
				assert.NotNil(t, res)
				res, err = bucket.Get(key)
				assert.Nil(t, err)
				assert.NotNil(t, res)
			})

			t.Run("compact until no longer eligible", func(t *testing.T) {
				var compacted bool
				var err error
				for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
				}
				require.Nil(t, err)
			})

			t.Run("verify that the object still exists after compaction", func(t *testing.T) {
				res, err := bucket.GetBySecondary(0, keySecondary)
				assert.Nil(t, err)
				assert.NotNil(t, res)
				res, err = bucket.Get(key)
				assert.Nil(t, err)
				assert.NotNil(t, res)
			})
		})
	}
}