github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/compaction_roaring_set_integration_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  //go:build integrationTest
    13  // +build integrationTest
    14  
    15  package lsmkv
    16  
    17  import (
    18  	"context"
    19  	"encoding/binary"
    20  	"fmt"
    21  	"math/rand"
    22  	"testing"
    23  
    24  	"github.com/stretchr/testify/assert"
    25  	"github.com/stretchr/testify/require"
    26  	"github.com/weaviate/sroar"
    27  	"github.com/weaviate/weaviate/entities/cyclemanager"
    28  )
    29  
    30  func compactionRoaringSetStrategy_Random(ctx context.Context, t *testing.T, opts []BucketOption) {
    31  	maxID := uint64(100)
    32  	maxElement := uint64(1e6)
    33  	iterations := uint64(100_000)
    34  
    35  	deleteRatio := 0.2   // 20% of all operations will be deletes, 80% additions
    36  	flushChance := 0.001 // on average one flush per 1000 iterations
    37  
    38  	r := getRandomSeed()
    39  
    40  	instr := generateRandomInstructions(r, maxID, maxElement, iterations, deleteRatio)
    41  	control := controlFromInstructions(instr, maxID)
    42  
    43  	b, err := NewBucket(ctx, t.TempDir(), "", nullLogger(), nil,
    44  		cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
    45  	require.Nil(t, err)
    46  
    47  	defer b.Shutdown(testCtx())
    48  
    49  	// so big it effectively never triggers as part of this test
    50  	b.SetMemtableThreshold(1e9)
    51  
    52  	compactions := 0
    53  	for _, inst := range instr {
    54  		key := make([]byte, 8)
    55  		binary.LittleEndian.PutUint64(key, inst.key)
    56  		if inst.addition {
    57  			b.RoaringSetAddOne(key, inst.element)
    58  		} else {
    59  			b.RoaringSetRemoveOne(key, inst.element)
    60  		}
    61  
    62  		if r.Float64() < flushChance {
    63  			require.Nil(t, b.FlushAndSwitch())
    64  
    65  			for compacted, err := b.disk.compactOnce(); err == nil && compacted; compacted, err = b.disk.compactOnce() {
    66  				require.Nil(t, err)
    67  				compactions++
    68  			}
    69  		}
    70  
    71  	}
    72  
    73  	// this is a sanity check to make sure the test setup actually does what we
    74  	// want. With the current setup, we expect on average to have ~100
    75  	// compactions. It would be extremely unexpected to have fewer than 25.
    76  	assert.Greater(t, compactions, 25)
    77  
    78  	verifyBucketAgainstControl(t, b, control)
    79  }
    80  
    81  func verifyBucketAgainstControl(t *testing.T, b *Bucket, control []*sroar.Bitmap) {
    82  	// This test was built before the bucket had cursors, so we are retrieving
    83  	// each key individually, rather than cursing over the entire bucket.
    84  	// However, this is also good for isolation purposes, this test tests
    85  	// compactions, not cursors.
    86  
    87  	for i, controlBM := range control {
    88  		key := make([]byte, 8)
    89  		binary.LittleEndian.PutUint64(key, uint64(i))
    90  
    91  		actual, err := b.RoaringSetGet(key)
    92  		require.Nil(t, err)
    93  
    94  		assert.Equal(t, controlBM.ToArray(), actual.ToArray())
    95  
    96  	}
    97  }
    98  
// roaringSetInstruction describes a single randomized mutation of a
// roaring-set bucket: adding or removing one element for one key.
type roaringSetInstruction struct {
	// is a []byte in reality, but makes the test setup easier if we pretend
	// it's an int
	key     uint64
	element uint64

	// true=addition, false=deletion
	addition bool
}
   108  
   109  func generateRandomInstructions(r *rand.Rand, maxID, maxElement, iterations uint64,
   110  	deleteRatio float64,
   111  ) []roaringSetInstruction {
   112  	instr := make([]roaringSetInstruction, iterations)
   113  
   114  	for i := range instr {
   115  		instr[i].key = uint64(r.Intn(int(maxID)))
   116  		instr[i].element = uint64(r.Intn(int(maxElement)))
   117  
   118  		if r.Float64() > deleteRatio {
   119  			instr[i].addition = true
   120  		} else {
   121  			instr[i].addition = false
   122  		}
   123  	}
   124  
   125  	return instr
   126  }
   127  
   128  func controlFromInstructions(instr []roaringSetInstruction, maxID uint64) []*sroar.Bitmap {
   129  	out := make([]*sroar.Bitmap, maxID)
   130  	for i := range out {
   131  		out[i] = sroar.NewBitmap()
   132  	}
   133  
   134  	for _, inst := range instr {
   135  		if inst.addition {
   136  			out[inst.key].Set(inst.element)
   137  		} else {
   138  			out[inst.key].Remove(inst.element)
   139  		}
   140  	}
   141  
   142  	return out
   143  }
   144  
// compactionRoaringSetStrategy builds a bucket from two unrelated "previous"
// segments plus two segments under test (segment1/segment2), covering
// additions, appends, and deletions across segment boundaries. It verifies
// the cursor results against the expected state both before and after
// compacting until no further compaction is possible, and asserts the final
// single segment's size falls within [expectedMinSize, expectedMaxSize].
func compactionRoaringSetStrategy(ctx context.Context, t *testing.T, opts []BucketOption,
	expectedMinSize, expectedMaxSize int64,
) {
	size := 100

	type kv struct {
		key       []byte
		additions []uint64
		deletions []uint64
	}
	// this segment is not part of the merge, but might still play a role in
	// overall results. For example if one of the later segments has a tombstone
	// for it
	var previous1 []kv
	var previous2 []kv

	var segment1 []kv
	var segment2 []kv
	var expected []kv
	var bucket *Bucket

	dirName := t.TempDir()

	t.Run("create test data", func(t *testing.T) {
		// The test data is split into 8 scenarios evenly:
		//
		// 0.) created in the first segment, never touched again
		// 1.) created in the first segment, appended to it in the second
		// 2.) created in the first segment, first element deleted in the second
		// 3.) created in the first segment, second element deleted in the second
		// 4.) not present in the first segment, created in the second
		// 5.) present in an unrelated previous segment, deleted in the first
		// 6.) present in an unrelated previous segment, deleted in the second
		// 7.) present in an unrelated previous segment, never touched again
		for i := 0; i < size; i++ {
			key := []byte(fmt.Sprintf("key-%02d", i))
			value1 := uint64(i) + 1
			value2 := uint64(i) + 2
			values := []uint64{value1, value2}

			switch i % 8 {
			case 0:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:       key,
					additions: values[:1],
				})

				// leave this element untouched in the second segment
				expected = append(expected, kv{
					key:       key,
					additions: values[:1],
				})

			case 1:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:       key,
					additions: values[:1],
				})

				// update in the second segment
				segment2 = append(segment2, kv{
					key:       key,
					additions: values[1:],
				})

				expected = append(expected, kv{
					key:       key,
					additions: values,
				})

			case 2:
				// add both to segment 1, delete the first
				segment1 = append(segment1, kv{
					key:       key,
					additions: values,
				})

				// delete first element in the second segment
				segment2 = append(segment2, kv{
					key:       key,
					deletions: values[:1],
				})

				// only the 2nd element should be left in the expected
				expected = append(expected, kv{
					key:       key,
					additions: values[1:],
				})

			case 3:
				// add both to segment 1, delete the second
				segment1 = append(segment1, kv{
					key:       key,
					additions: values,
				})

				// delete second element in the second segment
				segment2 = append(segment2, kv{
					key:       key,
					deletions: values[1:],
				})

				// only the 1st element should be left in the expected
				expected = append(expected, kv{
					key:       key,
					additions: values[:1],
				})

			case 4:
				// do not add to segment 1

				// only add to segment 2 (first entry)
				segment2 = append(segment2, kv{
					key:       key,
					additions: values,
				})

				expected = append(expected, kv{
					key:       key,
					additions: values,
				})

			case 5:
				// only part of a previous segment, which is not part of the merge
				previous1 = append(previous1, kv{
					key:       key,
					additions: values[:1],
				})
				previous2 = append(previous2, kv{
					key:       key,
					additions: values[1:],
				})

				// delete in segment 1
				segment1 = append(segment1, kv{
					key:       key,
					deletions: values,
				})

				// should not have any values in expected at all, not even a key

			case 6:
				// only part of a previous segment, which is not part of the merge
				previous1 = append(previous1, kv{
					key:       key,
					additions: values[:1],
				})
				previous2 = append(previous2, kv{
					key:       key,
					additions: values[1:],
				})

				// delete in segment 2
				segment2 = append(segment2, kv{
					key:       key,
					deletions: values,
				})

				// should not have any values in expected at all, not even a key

			case 7:
				// part of a previous segment
				previous1 = append(previous1, kv{
					key:       key,
					additions: values[:1],
				})
				previous2 = append(previous2, kv{
					key:       key,
					additions: values[1:],
				})

				expected = append(expected, kv{
					key:       key,
					additions: values,
				})
			}
		}
	})

	t.Run("shuffle the import order for each segment", func(t *testing.T) {
		// this is to make sure we don't accidentally rely on the import order
		rand.Shuffle(len(segment1), func(i, j int) {
			segment1[i], segment1[j] = segment1[j], segment1[i]
		})
		rand.Shuffle(len(segment2), func(i, j int) {
			segment2[i], segment2[j] = segment2[j], segment2[i]
		})
	})

	t.Run("init bucket", func(t *testing.T) {
		// NOTE(review): bucket is never shut down in this test; the random
		// test above defers b.Shutdown(...) — confirm whether cleanup is
		// needed here too
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("import and flush previous segments", func(t *testing.T) {
		for _, kv := range previous1 {
			err := bucket.RoaringSetAddList(kv.key, kv.additions)
			require.NoError(t, err)
		}

		require.NoError(t, bucket.FlushAndSwitch())

		for _, kv := range previous2 {
			err := bucket.RoaringSetAddList(kv.key, kv.additions)
			require.NoError(t, err)
		}

		require.NoError(t, bucket.FlushAndSwitch())
	})

	t.Run("import segment 1", func(t *testing.T) {
		for _, kv := range segment1 {
			if len(kv.additions) > 0 {
				err := bucket.RoaringSetAddList(kv.key, kv.additions)
				require.NoError(t, err)
			}
			for i := range kv.deletions {
				err := bucket.RoaringSetRemoveOne(kv.key, kv.deletions[i])
				require.NoError(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.NoError(t, bucket.FlushAndSwitch())
	})

	t.Run("import segment 2", func(t *testing.T) {
		for _, kv := range segment2 {
			if len(kv.additions) > 0 {
				err := bucket.RoaringSetAddList(kv.key, kv.additions)
				require.NoError(t, err)
			}
			for i := range kv.deletions {
				err := bucket.RoaringSetRemoveOne(kv.key, kv.deletions[i])
				require.NoError(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.NoError(t, bucket.FlushAndSwitch())
	})

	t.Run("verify control before compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.CursorRoaringSet()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:       k,
				additions: v.ToArray(),
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		i := 0
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
			if i == 1 {
				// segment1 and segment2 merged
				// none of them is root segment, so tombstones
				// will not be removed regardless of keepTombstones setting
				//
				// NOTE(review): 26768 is an exact on-disk byte size —
				// presumably stable for this fixed test data, but it will
				// break if the segment format changes; confirm when updating
				// the serialization
				assertSecondSegmentOfSize(t, bucket, 26768, 26768)
			}
			i++
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.CursorRoaringSet()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:       k,
				additions: v.ToArray(),
			})
		}

		assert.Equal(t, expected, retrieved)
		assertSingleSegmentOfSize(t, bucket, expectedMinSize, expectedMaxSize)
	})
}
   446  
   447  func compactionRoaringSetStrategy_RemoveUnnecessary(ctx context.Context, t *testing.T, opts []BucketOption) {
   448  	// in this test each segment reverses the action of the previous segment so
   449  	// that in the end a lot of information is present in the individual segments
   450  	// which is no longer needed. We then verify that after all compaction this
   451  	// information is gone, thus freeing up disk space
   452  	size := 100
   453  
   454  	type kv struct {
   455  		key    []byte
   456  		values []uint64
   457  	}
   458  
   459  	key := []byte("my-key")
   460  
   461  	var bucket *Bucket
   462  	dirName := t.TempDir()
   463  
   464  	t.Run("init bucket", func(t *testing.T) {
   465  		b, err := NewBucket(ctx, dirName, "", nullLogger(), nil,
   466  			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
   467  		require.Nil(t, err)
   468  
   469  		// so big it effectively never triggers as part of this test
   470  		b.SetMemtableThreshold(1e9)
   471  
   472  		bucket = b
   473  	})
   474  
   475  	t.Run("write segments", func(t *testing.T) {
   476  		for i := 0; i < size; i++ {
   477  			if i != 0 {
   478  				// we can only delete an existing value if this isn't the first write
   479  				err := bucket.RoaringSetRemoveOne(key, uint64(i)-1)
   480  				require.NoError(t, err)
   481  			}
   482  
   483  			err := bucket.RoaringSetAddOne(key, uint64(i))
   484  			require.NoError(t, err)
   485  
   486  			require.NoError(t, bucket.FlushAndSwitch())
   487  		}
   488  	})
   489  
   490  	t.Run("verify control before compaction", func(t *testing.T) {
   491  		var retrieved []kv
   492  		expected := []kv{
   493  			{
   494  				key:    key,
   495  				values: []uint64{uint64(size) - 1},
   496  			},
   497  		}
   498  
   499  		c := bucket.CursorRoaringSet()
   500  		defer c.Close()
   501  
   502  		for k, v := c.First(); k != nil; k, v = c.Next() {
   503  			retrieved = append(retrieved, kv{
   504  				key:    k,
   505  				values: v.ToArray(),
   506  			})
   507  		}
   508  
   509  		assert.Equal(t, expected, retrieved)
   510  	})
   511  
   512  	t.Run("compact until no longer eligible", func(t *testing.T) {
   513  		var compacted bool
   514  		var err error
   515  		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
   516  		}
   517  		require.Nil(t, err)
   518  	})
   519  
   520  	t.Run("verify control before compaction", func(t *testing.T) {
   521  		var retrieved []kv
   522  		expected := []kv{
   523  			{
   524  				key:    key,
   525  				values: []uint64{uint64(size) - 1},
   526  			},
   527  		}
   528  
   529  		c := bucket.CursorRoaringSet()
   530  		defer c.Close()
   531  
   532  		for k, v := c.First(); k != nil; k, v = c.Next() {
   533  			retrieved = append(retrieved, kv{
   534  				key:    k,
   535  				values: v.ToArray(),
   536  			})
   537  		}
   538  
   539  		assert.Equal(t, expected, retrieved)
   540  	})
   541  }
   542  
   543  func compactionRoaringSetStrategy_FrequentPutDeleteOperations(ctx context.Context, t *testing.T, opts []BucketOption) {
   544  	// In this test we are testing that the compaction works well for set collection
   545  	maxSize := 10
   546  
   547  	for size := 4; size < maxSize; size++ {
   548  		t.Run(fmt.Sprintf("compact %v segments", size), func(t *testing.T) {
   549  			var bucket *Bucket
   550  
   551  			key := []byte("key-original")
   552  			value1 := uint64(1)
   553  			value2 := uint64(2)
   554  			values := []uint64{value1, value2}
   555  
   556  			dirName := t.TempDir()
   557  
   558  			t.Run("init bucket", func(t *testing.T) {
   559  				b, err := NewBucket(ctx, dirName, "", nullLogger(), nil,
   560  					cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
   561  				require.Nil(t, err)
   562  
   563  				// so big it effectively never triggers as part of this test
   564  				b.SetMemtableThreshold(1e9)
   565  
   566  				bucket = b
   567  			})
   568  
   569  			t.Run("import and flush segments", func(t *testing.T) {
   570  				for i := 0; i < size; i++ {
   571  					err := bucket.RoaringSetAddList(key, values)
   572  					require.Nil(t, err)
   573  
   574  					if size == 5 {
   575  						// delete all
   576  						err := bucket.RoaringSetRemoveOne(key, values[0])
   577  						require.Nil(t, err)
   578  						err = bucket.RoaringSetRemoveOne(key, values[1])
   579  						require.Nil(t, err)
   580  					} else if size == 6 {
   581  						// delete only one value
   582  						err := bucket.RoaringSetRemoveOne(key, values[0])
   583  						require.Nil(t, err)
   584  					} else if i != size-1 {
   585  						// don't delete from the last segment
   586  						err := bucket.RoaringSetRemoveOne(key, values[0])
   587  						require.Nil(t, err)
   588  						err = bucket.RoaringSetRemoveOne(key, values[1])
   589  						require.Nil(t, err)
   590  					}
   591  
   592  					require.Nil(t, bucket.FlushAndSwitch())
   593  				}
   594  			})
   595  
   596  			t.Run("verify that objects exist before compaction", func(t *testing.T) {
   597  				res, err := bucket.RoaringSetGet(key)
   598  				require.NoError(t, err)
   599  				if size == 5 {
   600  					assert.Equal(t, 0, res.GetCardinality())
   601  				} else if size == 6 {
   602  					assert.Equal(t, 1, res.GetCardinality())
   603  				} else {
   604  					assert.Equal(t, 2, res.GetCardinality())
   605  				}
   606  			})
   607  
   608  			t.Run("compact until no longer eligible", func(t *testing.T) {
   609  				var compacted bool
   610  				var err error
   611  				for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
   612  				}
   613  				require.Nil(t, err)
   614  			})
   615  
   616  			t.Run("verify that objects exist after compaction", func(t *testing.T) {
   617  				res, err := bucket.RoaringSetGet(key)
   618  				require.NoError(t, err)
   619  				if size == 5 {
   620  					assert.Equal(t, 0, res.GetCardinality())
   621  				} else if size == 6 {
   622  					assert.Equal(t, 1, res.GetCardinality())
   623  				} else {
   624  					assert.Equal(t, 2, res.GetCardinality())
   625  				}
   626  			})
   627  		})
   628  	}
   629  }