github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/compaction_roaring_set_integration_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 //go:build integrationTest 13 // +build integrationTest 14 15 package lsmkv 16 17 import ( 18 "context" 19 "encoding/binary" 20 "fmt" 21 "math/rand" 22 "testing" 23 24 "github.com/stretchr/testify/assert" 25 "github.com/stretchr/testify/require" 26 "github.com/weaviate/sroar" 27 "github.com/weaviate/weaviate/entities/cyclemanager" 28 ) 29 30 func compactionRoaringSetStrategy_Random(ctx context.Context, t *testing.T, opts []BucketOption) { 31 maxID := uint64(100) 32 maxElement := uint64(1e6) 33 iterations := uint64(100_000) 34 35 deleteRatio := 0.2 // 20% of all operations will be deletes, 80% additions 36 flushChance := 0.001 // on average one flush per 1000 iterations 37 38 r := getRandomSeed() 39 40 instr := generateRandomInstructions(r, maxID, maxElement, iterations, deleteRatio) 41 control := controlFromInstructions(instr, maxID) 42 43 b, err := NewBucket(ctx, t.TempDir(), "", nullLogger(), nil, 44 cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...) 45 require.Nil(t, err) 46 47 defer b.Shutdown(testCtx()) 48 49 // so big it effectively never triggers as part of this test 50 b.SetMemtableThreshold(1e9) 51 52 compactions := 0 53 for _, inst := range instr { 54 key := make([]byte, 8) 55 binary.LittleEndian.PutUint64(key, inst.key) 56 if inst.addition { 57 b.RoaringSetAddOne(key, inst.element) 58 } else { 59 b.RoaringSetRemoveOne(key, inst.element) 60 } 61 62 if r.Float64() < flushChance { 63 require.Nil(t, b.FlushAndSwitch()) 64 65 for compacted, err := b.disk.compactOnce(); err == nil && compacted; compacted, err = b.disk.compactOnce() { 66 require.Nil(t, err) 67 compactions++ 68 } 69 } 70 71 } 72 73 // this is a sanity check to make sure the test setup actually does what we 74 // want. With the current setup, we expect on average to have ~100 75 // compactions. It would be extremely unexpected to have fewer than 25. 76 assert.Greater(t, compactions, 25) 77 78 verifyBucketAgainstControl(t, b, control) 79 } 80 81 func verifyBucketAgainstControl(t *testing.T, b *Bucket, control []*sroar.Bitmap) { 82 // This test was built before the bucket had cursors, so we are retrieving 83 // each key individually, rather than cursing over the entire bucket. 84 // However, this is also good for isolation purposes, this test tests 85 // compactions, not cursors. 86 87 for i, controlBM := range control { 88 key := make([]byte, 8) 89 binary.LittleEndian.PutUint64(key, uint64(i)) 90 91 actual, err := b.RoaringSetGet(key) 92 require.Nil(t, err) 93 94 assert.Equal(t, controlBM.ToArray(), actual.ToArray()) 95 96 } 97 } 98 99 type roaringSetInstruction struct { 100 // is a []byte in reality, but makes the test setup easier if we pretent 101 // its an int 102 key uint64 103 element uint64 104 105 // true=addition, false=deletion 106 addition bool 107 } 108 109 func generateRandomInstructions(r *rand.Rand, maxID, maxElement, iterations uint64, 110 deleteRatio float64, 111 ) []roaringSetInstruction { 112 instr := make([]roaringSetInstruction, iterations) 113 114 for i := range instr { 115 instr[i].key = uint64(r.Intn(int(maxID))) 116 instr[i].element = uint64(r.Intn(int(maxElement))) 117 118 if r.Float64() > deleteRatio { 119 instr[i].addition = true 120 } else { 121 instr[i].addition = false 122 } 123 } 124 125 return instr 126 } 127 128 func controlFromInstructions(instr []roaringSetInstruction, maxID uint64) []*sroar.Bitmap { 129 out := make([]*sroar.Bitmap, maxID) 130 for i := range out { 131 out[i] = sroar.NewBitmap() 132 } 133 134 for _, inst := range instr { 135 if inst.addition { 136 out[inst.key].Set(inst.element) 137 } else { 138 out[inst.key].Remove(inst.element) 139 } 140 } 141 142 return out 143 } 144 145 func compactionRoaringSetStrategy(ctx context.Context, t *testing.T, opts []BucketOption, 146 expectedMinSize, expectedMaxSize int64, 147 ) { 148 size := 100 149 150 type kv struct { 151 key []byte 152 additions []uint64 153 deletions []uint64 154 } 155 // this segment is not part of the merge, but might still play a role in 156 // overall results. For example if one of the later segments has a tombstone 157 // for it 158 var previous1 []kv 159 var previous2 []kv 160 161 var segment1 []kv 162 var segment2 []kv 163 var expected []kv 164 var bucket *Bucket 165 166 dirName := t.TempDir() 167 168 t.Run("create test data", func(t *testing.T) { 169 // The test data is split into 4 scenarios evenly: 170 // 171 // 0.) created in the first segment, never touched again 172 // 1.) created in the first segment, appended to it in the second 173 // 2.) created in the first segment, first element deleted in the second 174 // 3.) created in the first segment, second element deleted in the second 175 // 4.) not present in the first segment, created in the second 176 // 5.) present in an unrelated previous segment, deleted in the first 177 // 6.) present in an unrelated previous segment, deleted in the second 178 // 7.) present in an unrelated previous segment, never touched again 179 for i := 0; i < size; i++ { 180 key := []byte(fmt.Sprintf("key-%02d", i)) 181 value1 := uint64(i) + 1 182 value2 := uint64(i) + 2 183 values := []uint64{value1, value2} 184 185 switch i % 8 { 186 case 0: 187 // add to segment 1 188 segment1 = append(segment1, kv{ 189 key: key, 190 additions: values[:1], 191 }) 192 193 // leave this element untouched in the second segment 194 expected = append(expected, kv{ 195 key: key, 196 additions: values[:1], 197 }) 198 199 case 1: 200 // add to segment 1 201 segment1 = append(segment1, kv{ 202 key: key, 203 additions: values[:1], 204 }) 205 206 // update in the second segment 207 segment2 = append(segment2, kv{ 208 key: key, 209 additions: values[1:], 210 }) 211 212 expected = append(expected, kv{ 213 key: key, 214 additions: values, 215 }) 216 217 case 2: 218 // add both to segment 1, delete the first 219 segment1 = append(segment1, kv{ 220 key: key, 221 additions: values, 222 }) 223 224 // delete first element in the second segment 225 segment2 = append(segment2, kv{ 226 key: key, 227 deletions: values[:1], 228 }) 229 230 // only the 2nd element should be left in the expected 231 expected = append(expected, kv{ 232 key: key, 233 additions: values[1:], 234 }) 235 236 case 3: 237 // add both to segment 1, delete the second 238 segment1 = append(segment1, kv{ 239 key: key, 240 additions: values, 241 }) 242 243 // delete second element in the second segment 244 segment2 = append(segment2, kv{ 245 key: key, 246 deletions: values[1:], 247 }) 248 249 // only the 1st element should be left in the expected 250 expected = append(expected, kv{ 251 key: key, 252 additions: values[:1], 253 }) 254 255 case 4: 256 // do not add to segment 1 257 258 // only add to segment 2 (first entry) 259 segment2 = append(segment2, kv{ 260 key: key, 261 additions: values, 262 }) 263 264 expected = append(expected, kv{ 265 key: key, 266 additions: values, 267 }) 268 269 case 5: 270 // only part of a previous segment, which is not part of the merge 271 previous1 = append(previous1, kv{ 272 key: key, 273 additions: values[:1], 274 }) 275 previous2 = append(previous2, kv{ 276 key: key, 277 additions: values[1:], 278 }) 279 280 // delete in segment 1 281 segment1 = append(segment1, kv{ 282 key: key, 283 deletions: values, 284 }) 285 286 // should not have any values in expected at all, not even a key 287 288 case 6: 289 // only part of a previous segment, which is not part of the merge 290 previous1 = append(previous1, kv{ 291 key: key, 292 additions: values[:1], 293 }) 294 previous2 = append(previous2, kv{ 295 key: key, 296 additions: values[1:], 297 }) 298 299 // delete in segment 2 300 segment2 = append(segment2, kv{ 301 key: key, 302 deletions: values, 303 }) 304 305 // should not have any values in expected at all, not even a key 306 307 case 7: 308 // part of a previous segment 309 previous1 = append(previous1, kv{ 310 key: key, 311 additions: values[:1], 312 }) 313 previous2 = append(previous2, kv{ 314 key: key, 315 additions: values[1:], 316 }) 317 318 expected = append(expected, kv{ 319 key: key, 320 additions: values, 321 }) 322 } 323 } 324 }) 325 326 t.Run("shuffle the import order for each segment", func(t *testing.T) { 327 // this is to make sure we don't accidentally rely on the import order 328 rand.Shuffle(len(segment1), func(i, j int) { 329 segment1[i], segment1[j] = segment1[j], segment1[i] 330 }) 331 rand.Shuffle(len(segment2), func(i, j int) { 332 segment2[i], segment2[j] = segment2[j], segment2[i] 333 }) 334 }) 335 336 t.Run("init bucket", func(t *testing.T) { 337 b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil, 338 cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...) 339 require.Nil(t, err) 340 341 // so big it effectively never triggers as part of this test 342 b.SetMemtableThreshold(1e9) 343 344 bucket = b 345 }) 346 347 t.Run("import and flush previous segments", func(t *testing.T) { 348 for _, kv := range previous1 { 349 err := bucket.RoaringSetAddList(kv.key, kv.additions) 350 require.NoError(t, err) 351 } 352 353 require.NoError(t, bucket.FlushAndSwitch()) 354 355 for _, kv := range previous2 { 356 err := bucket.RoaringSetAddList(kv.key, kv.additions) 357 require.NoError(t, err) 358 } 359 360 require.NoError(t, bucket.FlushAndSwitch()) 361 }) 362 363 t.Run("import segment 1", func(t *testing.T) { 364 for _, kv := range segment1 { 365 if len(kv.additions) > 0 { 366 err := bucket.RoaringSetAddList(kv.key, kv.additions) 367 require.NoError(t, err) 368 } 369 for i := range kv.deletions { 370 err := bucket.RoaringSetRemoveOne(kv.key, kv.deletions[i]) 371 require.NoError(t, err) 372 } 373 } 374 }) 375 376 t.Run("flush to disk", func(t *testing.T) { 377 require.NoError(t, bucket.FlushAndSwitch()) 378 }) 379 380 t.Run("import segment 2", func(t *testing.T) { 381 for _, kv := range segment2 { 382 if len(kv.additions) > 0 { 383 err := bucket.RoaringSetAddList(kv.key, kv.additions) 384 require.NoError(t, err) 385 } 386 for i := range kv.deletions { 387 err := bucket.RoaringSetRemoveOne(kv.key, kv.deletions[i]) 388 require.NoError(t, err) 389 } 390 } 391 }) 392 393 t.Run("flush to disk", func(t *testing.T) { 394 require.NoError(t, bucket.FlushAndSwitch()) 395 }) 396 397 t.Run("verify control before compaction", func(t *testing.T) { 398 var retrieved []kv 399 400 c := bucket.CursorRoaringSet() 401 defer c.Close() 402 403 for k, v := c.First(); k != nil; k, v = c.Next() { 404 retrieved = append(retrieved, kv{ 405 key: k, 406 additions: v.ToArray(), 407 }) 408 } 409 410 assert.Equal(t, expected, retrieved) 411 }) 412 413 t.Run("compact until no longer eligible", func(t *testing.T) { 414 i := 0 415 var compacted bool 416 var err error 417 for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() { 418 if i == 1 { 419 // segment1 and segment2 merged 420 // none of them is root segment, so tombstones 421 // will not be removed regardless of keepTombstones setting 422 assertSecondSegmentOfSize(t, bucket, 26768, 26768) 423 } 424 i++ 425 } 426 require.Nil(t, err) 427 }) 428 429 t.Run("verify control after compaction", func(t *testing.T) { 430 var retrieved []kv 431 432 c := bucket.CursorRoaringSet() 433 defer c.Close() 434 435 for k, v := c.First(); k != nil; k, v = c.Next() { 436 retrieved = append(retrieved, kv{ 437 key: k, 438 additions: v.ToArray(), 439 }) 440 } 441 442 assert.Equal(t, expected, retrieved) 443 assertSingleSegmentOfSize(t, bucket, expectedMinSize, expectedMaxSize) 444 }) 445 } 446 447 func compactionRoaringSetStrategy_RemoveUnnecessary(ctx context.Context, t *testing.T, opts []BucketOption) { 448 // in this test each segment reverses the action of the previous segment so 449 // that in the end a lot of information is present in the individual segments 450 // which is no longer needed. We then verify that after all compaction this 451 // information is gone, thus freeing up disk space 452 size := 100 453 454 type kv struct { 455 key []byte 456 values []uint64 457 } 458 459 key := []byte("my-key") 460 461 var bucket *Bucket 462 dirName := t.TempDir() 463 464 t.Run("init bucket", func(t *testing.T) { 465 b, err := NewBucket(ctx, dirName, "", nullLogger(), nil, 466 cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...) 467 require.Nil(t, err) 468 469 // so big it effectively never triggers as part of this test 470 b.SetMemtableThreshold(1e9) 471 472 bucket = b 473 }) 474 475 t.Run("write segments", func(t *testing.T) { 476 for i := 0; i < size; i++ { 477 if i != 0 { 478 // we can only delete an existing value if this isn't the first write 479 err := bucket.RoaringSetRemoveOne(key, uint64(i)-1) 480 require.NoError(t, err) 481 } 482 483 err := bucket.RoaringSetAddOne(key, uint64(i)) 484 require.NoError(t, err) 485 486 require.NoError(t, bucket.FlushAndSwitch()) 487 } 488 }) 489 490 t.Run("verify control before compaction", func(t *testing.T) { 491 var retrieved []kv 492 expected := []kv{ 493 { 494 key: key, 495 values: []uint64{uint64(size) - 1}, 496 }, 497 } 498 499 c := bucket.CursorRoaringSet() 500 defer c.Close() 501 502 for k, v := c.First(); k != nil; k, v = c.Next() { 503 retrieved = append(retrieved, kv{ 504 key: k, 505 values: v.ToArray(), 506 }) 507 } 508 509 assert.Equal(t, expected, retrieved) 510 }) 511 512 t.Run("compact until no longer eligible", func(t *testing.T) { 513 var compacted bool 514 var err error 515 for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() { 516 } 517 require.Nil(t, err) 518 }) 519 520 t.Run("verify control before compaction", func(t *testing.T) { 521 var retrieved []kv 522 expected := []kv{ 523 { 524 key: key, 525 values: []uint64{uint64(size) - 1}, 526 }, 527 } 528 529 c := bucket.CursorRoaringSet() 530 defer c.Close() 531 532 for k, v := c.First(); k != nil; k, v = c.Next() { 533 retrieved = append(retrieved, kv{ 534 key: k, 535 values: v.ToArray(), 536 }) 537 } 538 539 assert.Equal(t, expected, retrieved) 540 }) 541 } 542 543 func compactionRoaringSetStrategy_FrequentPutDeleteOperations(ctx context.Context, t *testing.T, opts []BucketOption) { 544 // In this test we are testing that the compaction works well for set collection 545 maxSize := 10 546 547 for size := 4; size < maxSize; size++ { 548 t.Run(fmt.Sprintf("compact %v segments", size), func(t *testing.T) { 549 var bucket *Bucket 550 551 key := []byte("key-original") 552 value1 := uint64(1) 553 value2 := uint64(2) 554 values := []uint64{value1, value2} 555 556 dirName := t.TempDir() 557 558 t.Run("init bucket", func(t *testing.T) { 559 b, err := NewBucket(ctx, dirName, "", nullLogger(), nil, 560 cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...) 561 require.Nil(t, err) 562 563 // so big it effectively never triggers as part of this test 564 b.SetMemtableThreshold(1e9) 565 566 bucket = b 567 }) 568 569 t.Run("import and flush segments", func(t *testing.T) { 570 for i := 0; i < size; i++ { 571 err := bucket.RoaringSetAddList(key, values) 572 require.Nil(t, err) 573 574 if size == 5 { 575 // delete all 576 err := bucket.RoaringSetRemoveOne(key, values[0]) 577 require.Nil(t, err) 578 err = bucket.RoaringSetRemoveOne(key, values[1]) 579 require.Nil(t, err) 580 } else if size == 6 { 581 // delete only one value 582 err := bucket.RoaringSetRemoveOne(key, values[0]) 583 require.Nil(t, err) 584 } else if i != size-1 { 585 // don't delete from the last segment 586 err := bucket.RoaringSetRemoveOne(key, values[0]) 587 require.Nil(t, err) 588 err = bucket.RoaringSetRemoveOne(key, values[1]) 589 require.Nil(t, err) 590 } 591 592 require.Nil(t, bucket.FlushAndSwitch()) 593 } 594 }) 595 596 t.Run("verify that objects exist before compaction", func(t *testing.T) { 597 res, err := bucket.RoaringSetGet(key) 598 require.NoError(t, err) 599 if size == 5 { 600 assert.Equal(t, 0, res.GetCardinality()) 601 } else if size == 6 { 602 assert.Equal(t, 1, res.GetCardinality()) 603 } else { 604 assert.Equal(t, 2, res.GetCardinality()) 605 } 606 }) 607 608 t.Run("compact until no longer eligible", func(t *testing.T) { 609 var compacted bool 610 var err error 611 for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() { 612 } 613 require.Nil(t, err) 614 }) 615 616 t.Run("verify that objects exist after compaction", func(t *testing.T) { 617 res, err := bucket.RoaringSetGet(key) 618 require.NoError(t, err) 619 if size == 5 { 620 assert.Equal(t, 0, res.GetCardinality()) 621 } else if size == 6 { 622 assert.Equal(t, 1, res.GetCardinality()) 623 } else { 624 assert.Equal(t, 2, res.GetCardinality()) 625 } 626 }) 627 }) 628 } 629 }