github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/compaction_map_integration_test.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

//go:build integrationTest
// +build integrationTest

package lsmkv

import (
	"bytes"
	"context"
	"fmt"
	"math/rand"
	"sort"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/weaviate/weaviate/entities/cyclemanager"
)

func compactionMapStrategy(ctx context.Context, t *testing.T, opts []BucketOption,
	expectedMinSize, expectedMaxSize int64,
) {
	size := 100

	type kv struct {
		key    []byte
		values []MapPair
	}

	// these segments are not part of the merge, but might still play a role in
	// the overall results, for example if one of the later segments has a
	// tombstone for them
	var previous1 []kv
	var previous2 []kv

	var segment1 []kv
	var segment2 []kv
	var expected []kv
	var bucket *Bucket

	dirName := t.TempDir()

	t.Run("create test data", func(t *testing.T) {
		// The test data is split evenly across 10 scenarios:
		//
		// 0.) created in the first segment, never touched again
		// 1.) created in the first segment, appended to it in the second
		// 2.) created in the first segment, first element updated in the second
		// 3.) created in the first segment, second element updated in the second
		// 4.) created in the first segment, first element deleted in the second
		// 5.) created in the first segment, second element deleted in the second
		// 6.) not present in the first segment, created in the second
		// 7.) present in an unrelated previous segment, deleted in the first
		// 8.) present in an unrelated previous segment, deleted in the second
		// 9.) present in an unrelated previous segment, never touched again
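		//
		// Each row below gets two candidate map pairs ("value-XXX-01" and
		// "value-XXX-02"). The scenario selected via i%10 decides which of
		// those pairs end up in previous1/previous2, segment1, segment2, and
		// in the expected control slice.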
		for i := 0; i < size; i++ {
			rowKey := []byte(fmt.Sprintf("row-%03d", i))

			pair1 := MapPair{
				Key:   []byte(fmt.Sprintf("value-%03d-01", i)),
				Value: []byte(fmt.Sprintf("value-%03d-01-original", i)),
			}
			pair2 := MapPair{
				Key:   []byte(fmt.Sprintf("value-%03d-02", i)),
				Value: []byte(fmt.Sprintf("value-%03d-02-original", i)),
			}
			pairs := []MapPair{pair1, pair2}

			switch i % 10 {
			case 0:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:    rowKey,
					values: pairs[:1],
				})

				// leave this element untouched in the second segment
				expected = append(expected, kv{
					key:    rowKey,
					values: pairs[:1],
				})
			case 1:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:    rowKey,
					values: pairs[:1],
				})

				// add extra pair in the second segment
				segment2 = append(segment2, kv{
					key:    rowKey,
					values: pairs[1:2],
				})

				expected = append(expected, kv{
					key:    rowKey,
					values: pairs,
				})
			case 2:
				// add both to segment 1
				segment1 = append(segment1, kv{
					key:    rowKey,
					values: pairs,
				})

				// update first key in the second segment
				updated := pair1
				updated.Value = []byte("updated")

				segment2 = append(segment2, kv{
					key:    rowKey,
					values: []MapPair{updated},
				})

				expected = append(expected, kv{
					key:    rowKey,
					values: []MapPair{pair2, updated},
				})

			case 3:
				// add both to segment 1
				segment1 = append(segment1, kv{
					key:    rowKey,
					values: pairs,
				})

				// update second key in the second segment
				updated := pair2
				updated.Value = []byte("updated")

				segment2 = append(segment2, kv{
					key:    rowKey,
					values: []MapPair{updated},
				})

				expected = append(expected, kv{
					key:    rowKey,
					values: []MapPair{pair1, updated},
				})

			case 4:
				// add both to segment 1
				segment1 = append(segment1, kv{
					key:    rowKey,
					values: pairs,
				})

				// delete first key in the second segment
				updated := pair1
				updated.Value = nil
				updated.Tombstone = true

				segment2 = append(segment2, kv{
					key:    rowKey,
					values: []MapPair{updated},
				})

				expected = append(expected, kv{
					key:    rowKey,
					values: []MapPair{pair2},
				})

			case 5:
				// add both to segment 1
				segment1 = append(segment1, kv{
					key:    rowKey,
					values: pairs,
				})

				// delete second key in the second segment
				updated := pair2
				updated.Value = nil
				updated.Tombstone = true

				segment2 = append(segment2, kv{
					key:    rowKey,
					values: []MapPair{updated},
				})

				expected = append(expected, kv{
					key:    rowKey,
					values: []MapPair{pair1},
				})

			case 6:
				// do not add to segment 1

				// only add to segment 2 (first entry)
				segment2 = append(segment2, kv{
					key:    rowKey,
					values: pairs,
				})

				expected = append(expected, kv{
					key:    rowKey,
					values: pairs,
				})

			case 7:
				// only part of a previous segment, which is not part of the merge
				previous1 = append(previous1, kv{
					key:    rowKey,
					values: pairs[:1],
				})
				previous2 = append(previous2, kv{
					key:    rowKey,
					values: pairs[1:],
				})

				// delete in segment 1
				deleted1 := pair1
				deleted1.Value = nil
				deleted1.Tombstone = true

				deleted2 := pair2
				deleted2.Value = nil
				deleted2.Tombstone = true

				segment1 = append(segment1, kv{
					key:    rowKey,
					values: []MapPair{deleted1},
				})
				segment1 = append(segment1, kv{
					key:    rowKey,
					values: []MapPair{deleted2},
				})

				// should not have any values in expected at all, not even a key

			case 8:
				// only part of a previous segment, which is not part of the merge
				previous1 = append(previous1, kv{
					key:    rowKey,
					values: pairs[:1],
				})
				previous2 = append(previous2, kv{
					key:    rowKey,
					values: pairs[1:],
				})

				// delete in segment 2
				deleted1 := pair1
				deleted1.Value = nil
				deleted1.Tombstone = true

				deleted2 := pair2
				deleted2.Value = nil
				deleted2.Tombstone = true

				segment2 = append(segment2, kv{
					key:    rowKey,
					values: []MapPair{deleted1},
				})
				segment2 = append(segment2, kv{
					key:    rowKey,
					values: []MapPair{deleted2},
				})

				// should not have any values in expected at all, not even a key

			case 9:
				// only part of a previous segment
				previous1 = append(previous1, kv{
					key:    rowKey,
					values: pairs[:1],
				})
				previous2 = append(previous2, kv{
					key:    rowKey,
					values: pairs[1:],
				})

				expected = append(expected, kv{
					key:    rowKey,
					values: pairs,
				})
			}
		}
	})

	t.Run("shuffle the import order for each segment", func(t *testing.T) {
		// this is to make sure we don't accidentally rely on the import order
		rand.Shuffle(len(segment1), func(i, j int) {
			segment1[i], segment1[j] = segment1[j], segment1[i]
		})
		rand.Shuffle(len(segment2), func(i, j int) {
			segment2[i], segment2[j] = segment2[j], segment2[i]
		})
	})
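
	// Note: every FlushAndSwitch call below persists the active memtable as one
	// new disk segment, so previous1/previous2, segment1, and segment2 each end
	// up in their own segment before any compaction runs.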
	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("import and flush previous segments", func(t *testing.T) {
		for _, kvs := range previous1 {
			for _, pair := range kvs.values {
				err := bucket.MapSet(kvs.key, pair)
				require.Nil(t, err)
			}
		}

		require.Nil(t, bucket.FlushAndSwitch())

		for _, kvs := range previous2 {
			for _, pair := range kvs.values {
				err := bucket.MapSet(kvs.key, pair)
				require.Nil(t, err)
			}
		}

		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("import segment 1", func(t *testing.T) {
		for _, kvs := range segment1 {
			for _, pair := range kvs.values {
				err := bucket.MapSet(kvs.key, pair)
				require.Nil(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("import segment 2", func(t *testing.T) {
		for _, kvs := range segment2 {
			for _, pair := range kvs.values {
				err := bucket.MapSet(kvs.key, pair)
				require.Nil(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("within control make sure map keys are sorted", func(t *testing.T) {
		for i := range expected {
			sort.Slice(expected[i].values, func(a, b int) bool {
				return bytes.Compare(expected[i].values[a].Key, expected[i].values[b].Key) < 0
			})
		}
	})

	t.Run("verify control before compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.MapCursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:    k,
				values: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		i := 0
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
			if i == 1 {
				// segment1 and segment2 merged; neither of them is the root
				// segment, so tombstones will not be removed regardless of the
				// keepTombstones setting
				assertSecondSegmentOfSize(t, bucket, 11876, 11876)
			}
			i++
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction using a cursor", func(t *testing.T) {
		var retrieved []kv

		c := bucket.MapCursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:    k,
				values: v,
			})
		}

		assert.Equal(t, expected, retrieved)
		assertSingleSegmentOfSize(t, bucket, expectedMinSize, expectedMaxSize)
	})

	t.Run("verify control using individual get (MapList) operations",
		func(t *testing.T) {
			// Previously the only verification was done using the cursor. That
			// guaranteed that all pairs are present in the payload, but it did not
			// guarantee the integrity of the index (DiskTree) which is used to access
			// _individual_ keys. Corrupting this index is exactly what happened in
			// https://github.com/weaviate/weaviate/issues/3517
			for _, pair := range expected {
				retrieved, err := bucket.MapList(pair.key)
				require.NoError(t, err)

				assert.Equal(t, pair.values, retrieved)
			}
		})
}

func compactionMapStrategy_RemoveUnnecessary(ctx context.Context, t *testing.T, opts []BucketOption) {
	// in this test each segment reverses the action of the previous segment, so
	// that in the end a lot of information is present in the individual segments
	// which is no longer needed. We then verify that after all compactions this
	// information is gone, thus freeing up disk space
	size := 100

	type kv struct {
		key    []byte
		values []MapPair
	}

	key := []byte("my-key")

	var bucket *Bucket
	dirName := t.TempDir()

	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("write segments", func(t *testing.T) {
		for i := 0; i < size; i++ {
			if i != 0 {
				// we can only update an existing value if this isn't the first write
				pair := MapPair{
					Key:   []byte(fmt.Sprintf("value-%05d", i-1)),
					Value: []byte(fmt.Sprintf("updated in round %d", i)),
				}
				err := bucket.MapSet(key, pair)
				require.Nil(t, err)
			}

			if i > 1 {
				// we can only delete the value written two rounds back if this
				// isn't the first or second write
				pair := MapPair{
					Key:       []byte(fmt.Sprintf("value-%05d", i-2)),
					Tombstone: true,
				}
				err := bucket.MapSet(key, pair)
				require.Nil(t, err)
			}

			pair := MapPair{
				Key:   []byte(fmt.Sprintf("value-%05d", i)),
				Value: []byte("original value"),
			}
			err := bucket.MapSet(key, pair)
			require.Nil(t, err)

			require.Nil(t, bucket.FlushAndSwitch())
		}
	})

	expected := []kv{
		{
			key: key,
			values: []MapPair{
				{
					Key:   []byte(fmt.Sprintf("value-%05d", size-2)),
					Value: []byte(fmt.Sprintf("updated in round %d", size-1)),
				},
				{
					Key:   []byte(fmt.Sprintf("value-%05d", size-1)),
					Value: []byte("original value"),
				},
			},
		},
	}

	t.Run("verify control before compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.MapCursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:    k,
				values: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.MapCursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:    k,
				values: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("verify control using individual get (MapList) operations",
		func(t *testing.T) {
			// Previously the only verification was done using the cursor. That
			// guaranteed that all pairs are present in the payload, but it did not
			// guarantee the integrity of the index (DiskTree) which is used to access
			// _individual_ keys. Corrupting this index is exactly what happened in
			// https://github.com/weaviate/weaviate/issues/3517
			for _, pair := range expected {
				retrieved, err := bucket.MapList(pair.key)
				require.NoError(t, err)

				assert.Equal(t, pair.values, retrieved)
			}
		})
}

func compactionMapStrategy_FrequentPutDeleteOperations(ctx context.Context, t *testing.T, opts []BucketOption) {
	// in this test we verify that compaction works well for the map collection
	// strategy when the same map key is frequently written and deleted again
	maxSize := 10

	key := []byte("my-key")
	mapKey := []byte("value-1")

	for size := 4; size < maxSize; size++ {
		t.Run(fmt.Sprintf("compact %v segments", size), func(t *testing.T) {
			var bucket *Bucket
			dirName := t.TempDir()

			t.Run("init bucket", func(t *testing.T) {
				b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
					cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
				require.Nil(t, err)

				// so big it effectively never triggers as part of this test
				b.SetMemtableThreshold(1e9)

				bucket = b
			})

			t.Run("write segments", func(t *testing.T) {
				for i := 0; i < size; i++ {
					value := []byte(fmt.Sprintf("updated in round %d", i))
					pair := MapPair{Key: mapKey, Value: value}

					err := bucket.MapSet(key, pair)
					require.Nil(t, err)

					if size == 5 || size == 6 {
						// delete in every round, including the last one
						err = bucket.MapDeleteKey(key, mapKey)
						require.Nil(t, err)
					} else if i != size-1 {
						// don't delete in the last round
						err := bucket.MapDeleteKey(key, mapKey)
						require.Nil(t, err)
					}

					require.Nil(t, bucket.FlushAndSwitch())
				}
			})

			t.Run("check entries before compaction", func(t *testing.T) {
				res, err := bucket.MapList(key)
				assert.Nil(t, err)
				if size == 5 || size == 6 {
					assert.Empty(t, res)
				} else {
					assert.Len(t, res, 1)
					assert.Equal(t, false, res[0].Tombstone)
				}
			})

			t.Run("compact until no longer eligible", func(t *testing.T) {
				var compacted bool
				var err error
				for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
				}
				require.Nil(t, err)
			})

			t.Run("check entries after compaction", func(t *testing.T) {
				res, err := bucket.MapList(key)
				assert.Nil(t, err)
				if size == 5 || size == 6 {
					assert.Empty(t, res)
				} else {
					assert.Len(t, res, 1)
					assert.Equal(t, false, res[0].Tombstone)
				}
			})
		})
	}
}
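
// The helpers above are not run directly; exported compaction tests elsewhere
// in this package invoke them with the bucket options relevant for the map
// strategy. A minimal sketch of such a caller follows — the option set and the
// size bounds are illustrative assumptions, not the values used by the real
// entry-point tests:
//
//	func TestCompactionMapStrategy_Sketch(t *testing.T) {
//		ctx := context.Background()
//		opts := []BucketOption{
//			WithStrategy(StrategyMapCollection),
//		}
//
//		compactionMapStrategy(ctx, t, opts, 10_000, 13_000)
//		compactionMapStrategy_RemoveUnnecessary(ctx, t, opts)
//		compactionMapStrategy_FrequentPutDeleteOperations(ctx, t, opts)
//	}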