github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/compaction_set_integration_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 //go:build integrationTest 13 // +build integrationTest 14 15 package lsmkv 16 17 import ( 18 "context" 19 "fmt" 20 "math/rand" 21 "testing" 22 23 "github.com/stretchr/testify/assert" 24 "github.com/stretchr/testify/require" 25 "github.com/weaviate/weaviate/entities/cyclemanager" 26 ) 27 28 func compactionSetStrategy(ctx context.Context, t *testing.T, opts []BucketOption, 29 expectedMinSize, expectedMaxSize int64, 30 ) { 31 size := 100 32 33 type kv struct { 34 key []byte 35 values [][]byte 36 delete bool 37 } 38 // this segment is not part of the merge, but might still play a role in 39 // overall results. For example if one of the later segments has a tombstone 40 // for it 41 var previous1 []kv 42 var previous2 []kv 43 44 var segment1 []kv 45 var segment2 []kv 46 var expected []kv 47 var bucket *Bucket 48 49 dirName := t.TempDir() 50 51 t.Run("create test data", func(t *testing.T) { 52 // The test data is split into 4 scenarios evenly: 53 // 54 // 0.) created in the first segment, never touched again 55 // 1.) created in the first segment, appended to it in the second 56 // 2.) created in the first segment, first element deleted in the second 57 // 3.) created in the first segment, second element deleted in the second 58 // 4.) not present in the first segment, created in the second 59 // 5.) present in an unrelated previous segment, deleted in the first 60 // 6.) present in an unrelated previous segment, deleted in the second 61 // 7.) present in an unrelated previous segment, never touched again 62 for i := 0; i < size; i++ { 63 key := []byte(fmt.Sprintf("key-%02d", i)) 64 65 value1 := []byte(fmt.Sprintf("value-%02d-01", i)) 66 value2 := []byte(fmt.Sprintf("value-%02d-02", i)) 67 values := [][]byte{value1, value2} 68 69 switch i % 8 { 70 case 0: 71 // add to segment 1 72 segment1 = append(segment1, kv{ 73 key: key, 74 values: values[:1], 75 }) 76 77 // leave this element untouched in the second segment 78 expected = append(expected, kv{ 79 key: key, 80 values: values[:1], 81 }) 82 83 case 1: 84 // add to segment 1 85 segment1 = append(segment1, kv{ 86 key: key, 87 values: values[:1], 88 }) 89 90 // update in the second segment 91 segment2 = append(segment2, kv{ 92 key: key, 93 values: values[1:2], 94 }) 95 96 expected = append(expected, kv{ 97 key: key, 98 values: values, 99 }) 100 101 case 2: 102 // add both to segment 1, delete the first 103 segment1 = append(segment1, kv{ 104 key: key, 105 values: values, 106 }) 107 108 // delete first element in the second segment 109 segment2 = append(segment2, kv{ 110 key: key, 111 values: values[:1], 112 delete: true, 113 }) 114 115 // only the 2nd element should be left in the expected 116 expected = append(expected, kv{ 117 key: key, 118 values: values[1:2], 119 }) 120 121 case 3: 122 // add both to segment 1, delete the second 123 segment1 = append(segment1, kv{ 124 key: key, 125 values: values, 126 }) 127 128 // delete second element in the second segment 129 segment2 = append(segment2, kv{ 130 key: key, 131 values: values[1:], 132 delete: true, 133 }) 134 135 // only the 1st element should be left in the expected 136 expected = append(expected, kv{ 137 key: key, 138 values: values[:1], 139 }) 140 141 case 4: 142 // do not add to segment 1 143 144 // only add to segment 2 (first entry) 145 segment2 = append(segment2, kv{ 146 key: key, 147 values: values, 148 }) 149 150 expected = append(expected, kv{ 151 key: key, 152 values: values, 153 }) 154 155 case 5: 156 // only part of a previous segment, which is not part of the merge 157 previous1 = append(previous1, kv{ 158 key: key, 159 values: values[:1], 160 }) 161 previous2 = append(previous2, kv{ 162 key: key, 163 values: values[1:], 164 }) 165 166 // delete in segment 1 167 segment1 = append(segment1, kv{ 168 key: key, 169 values: values[:1], 170 delete: true, 171 }) 172 segment1 = append(segment1, kv{ 173 key: key, 174 values: values[1:], 175 delete: true, 176 }) 177 178 // should not have any values in expected at all, not even a key 179 180 case 6: 181 // only part of a previous segment, which is not part of the merge 182 previous1 = append(previous1, kv{ 183 key: key, 184 values: values[:1], 185 }) 186 previous2 = append(previous2, kv{ 187 key: key, 188 values: values[1:], 189 }) 190 191 // delete in segment 2 192 segment2 = append(segment2, kv{ 193 key: key, 194 values: values[:1], 195 delete: true, 196 }) 197 segment2 = append(segment2, kv{ 198 key: key, 199 values: values[1:], 200 delete: true, 201 }) 202 203 // should not have any values in expected at all, not even a key 204 205 case 7: 206 // part of a previous segment 207 previous1 = append(previous1, kv{ 208 key: key, 209 values: values[:1], 210 }) 211 previous2 = append(previous2, kv{ 212 key: key, 213 values: values[1:], 214 }) 215 216 expected = append(expected, kv{ 217 key: key, 218 values: values, 219 }) 220 } 221 } 222 }) 223 224 t.Run("shuffle the import order for each segment", func(t *testing.T) { 225 // this is to make sure we don't accidentally rely on the import order 226 rand.Shuffle(len(segment1), func(i, j int) { 227 segment1[i], segment1[j] = segment1[j], segment1[i] 228 }) 229 rand.Shuffle(len(segment2), func(i, j int) { 230 segment2[i], segment2[j] = segment2[j], segment2[i] 231 }) 232 }) 233 234 t.Run("init bucket", func(t *testing.T) { 235 b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil, 236 cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...) 237 require.Nil(t, err) 238 239 // so big it effectively never triggers as part of this test 240 b.SetMemtableThreshold(1e9) 241 242 bucket = b 243 }) 244 245 t.Run("import and flush previous segments", func(t *testing.T) { 246 for _, pair := range previous1 { 247 err := bucket.SetAdd(pair.key, pair.values) 248 require.Nil(t, err) 249 } 250 251 require.Nil(t, bucket.FlushAndSwitch()) 252 253 for _, pair := range previous2 { 254 err := bucket.SetAdd(pair.key, pair.values) 255 require.Nil(t, err) 256 } 257 258 require.Nil(t, bucket.FlushAndSwitch()) 259 }) 260 261 t.Run("import segment 1", func(t *testing.T) { 262 for _, pair := range segment1 { 263 if !pair.delete { 264 err := bucket.SetAdd(pair.key, pair.values) 265 require.Nil(t, err) 266 } else { 267 err := bucket.SetDeleteSingle(pair.key, pair.values[0]) 268 require.Nil(t, err) 269 } 270 } 271 }) 272 273 t.Run("flush to disk", func(t *testing.T) { 274 require.Nil(t, bucket.FlushAndSwitch()) 275 }) 276 277 t.Run("import segment 2", func(t *testing.T) { 278 for _, pair := range segment2 { 279 if !pair.delete { 280 err := bucket.SetAdd(pair.key, pair.values) 281 require.Nil(t, err) 282 } else { 283 err := bucket.SetDeleteSingle(pair.key, pair.values[0]) 284 require.Nil(t, err) 285 } 286 } 287 }) 288 289 t.Run("flush to disk", func(t *testing.T) { 290 require.Nil(t, bucket.FlushAndSwitch()) 291 }) 292 293 t.Run("verify control before compaction", func(t *testing.T) { 294 var retrieved []kv 295 296 c := bucket.SetCursor() 297 defer c.Close() 298 299 for k, v := c.First(); k != nil; k, v = c.Next() { 300 retrieved = append(retrieved, kv{ 301 key: k, 302 values: v, 303 }) 304 } 305 306 assert.Equal(t, expected, retrieved) 307 }) 308 309 t.Run("compact until no longer eligible", func(t *testing.T) { 310 i := 0 311 var compacted bool 312 var err error 313 for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() { 314 if i == 1 { 315 // segment1 and segment2 merged 316 // none of them is root segment, so tombstones 317 // will not be removed regardless of keepTombstones setting 318 assertSecondSegmentOfSize(t, bucket, 8556, 8556) 319 } 320 i++ 321 } 322 require.Nil(t, err) 323 }) 324 325 t.Run("verify control after compaction", func(t *testing.T) { 326 var retrieved []kv 327 328 c := bucket.SetCursor() 329 defer c.Close() 330 331 for k, v := c.First(); k != nil; k, v = c.Next() { 332 retrieved = append(retrieved, kv{ 333 key: k, 334 values: v, 335 }) 336 } 337 338 assert.Equal(t, expected, retrieved) 339 assertSingleSegmentOfSize(t, bucket, expectedMinSize, expectedMaxSize) 340 }) 341 } 342 343 func compactionSetStrategy_RemoveUnnecessary(ctx context.Context, t *testing.T, opts []BucketOption) { 344 // in this test each segment reverses the action of the previous segment so 345 // that in the end a lot of information is present in the individual segments 346 // which is no longer needed. We then verify that after all compaction this 347 // information is gone, thus freeing up disk space 348 size := 100 349 350 type kv struct { 351 key []byte 352 values [][]byte 353 } 354 355 key := []byte("my-key") 356 357 var bucket *Bucket 358 dirName := t.TempDir() 359 360 t.Run("init bucket", func(t *testing.T) { 361 b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil, 362 cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...) 363 require.Nil(t, err) 364 365 // so big it effectively never triggers as part of this test 366 b.SetMemtableThreshold(1e9) 367 368 bucket = b 369 }) 370 371 t.Run("write segments", func(t *testing.T) { 372 for i := 0; i < size; i++ { 373 if i != 0 { 374 // we can only delete an existing value if this isn't the first write 375 value := []byte(fmt.Sprintf("value-%05d", i-1)) 376 err := bucket.SetDeleteSingle(key, value) 377 require.Nil(t, err) 378 } 379 380 value := []byte(fmt.Sprintf("value-%05d", i)) 381 err := bucket.SetAdd(key, [][]byte{value}) 382 require.Nil(t, err) 383 384 require.Nil(t, bucket.FlushAndSwitch()) 385 } 386 }) 387 388 t.Run("verify control before compaction", func(t *testing.T) { 389 var retrieved []kv 390 expected := []kv{ 391 { 392 key: key, 393 values: [][]byte{[]byte(fmt.Sprintf("value-%05d", size-1))}, 394 }, 395 } 396 397 c := bucket.SetCursor() 398 defer c.Close() 399 400 for k, v := c.First(); k != nil; k, v = c.Next() { 401 retrieved = append(retrieved, kv{ 402 key: k, 403 values: v, 404 }) 405 } 406 407 assert.Equal(t, expected, retrieved) 408 }) 409 t.Run("compact until no longer eligible", func(t *testing.T) { 410 var compacted bool 411 var err error 412 for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() { 413 } 414 require.Nil(t, err) 415 }) 416 417 t.Run("verify control before compaction", func(t *testing.T) { 418 var retrieved []kv 419 expected := []kv{ 420 { 421 key: key, 422 values: [][]byte{[]byte(fmt.Sprintf("value-%05d", size-1))}, 423 }, 424 } 425 426 c := bucket.SetCursor() 427 defer c.Close() 428 429 for k, v := c.First(); k != nil; k, v = c.Next() { 430 retrieved = append(retrieved, kv{ 431 key: k, 432 values: v, 433 }) 434 } 435 436 assert.Equal(t, expected, retrieved) 437 }) 438 } 439 440 func compactionSetStrategy_FrequentPutDeleteOperations(ctx context.Context, t *testing.T, opts []BucketOption) { 441 // In this test we are testing that the compaction works well for set collection 442 maxSize := 10 443 444 for size := 4; size < maxSize; size++ { 445 t.Run(fmt.Sprintf("compact %v segments", size), func(t *testing.T) { 446 var bucket *Bucket 447 448 key := []byte("key-original") 449 value1 := []byte("value-01") 450 value2 := []byte("value-02") 451 values := [][]byte{value1, value2} 452 453 dirName := t.TempDir() 454 455 t.Run("init bucket", func(t *testing.T) { 456 b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil, 457 cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...) 458 require.Nil(t, err) 459 460 // so big it effectively never triggers as part of this test 461 b.SetMemtableThreshold(1e9) 462 463 bucket = b 464 }) 465 466 t.Run("import and flush segments", func(t *testing.T) { 467 for i := 0; i < size; i++ { 468 err := bucket.SetAdd(key, values) 469 require.Nil(t, err) 470 471 if size == 5 { 472 // delete all 473 err := bucket.SetDeleteSingle(key, values[0]) 474 require.Nil(t, err) 475 err = bucket.SetDeleteSingle(key, values[1]) 476 require.Nil(t, err) 477 } else if size == 6 { 478 // delete only one value 479 err := bucket.SetDeleteSingle(key, values[0]) 480 require.Nil(t, err) 481 } else if i != size-1 { 482 // don't delete from the last segment 483 err := bucket.SetDeleteSingle(key, values[0]) 484 require.Nil(t, err) 485 err = bucket.SetDeleteSingle(key, values[1]) 486 require.Nil(t, err) 487 } 488 489 require.Nil(t, bucket.FlushAndSwitch()) 490 } 491 }) 492 493 t.Run("verify that objects exist before compaction", func(t *testing.T) { 494 res, err := bucket.SetList(key) 495 assert.Nil(t, err) 496 if size == 5 { 497 assert.Len(t, res, 0) 498 } else if size == 6 { 499 assert.Len(t, res, 1) 500 } else { 501 assert.Len(t, res, 2) 502 } 503 }) 504 505 t.Run("compact until no longer eligible", func(t *testing.T) { 506 var compacted bool 507 var err error 508 for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() { 509 } 510 require.Nil(t, err) 511 }) 512 513 t.Run("verify that objects exist after compaction", func(t *testing.T) { 514 res, err := bucket.SetList(key) 515 assert.Nil(t, err) 516 if size == 5 { 517 assert.Len(t, res, 0) 518 } else if size == 6 { 519 assert.Len(t, res, 1) 520 } else { 521 assert.Len(t, res, 2) 522 } 523 }) 524 }) 525 } 526 }