github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/compaction_set_integration_test.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

//go:build integrationTest
// +build integrationTest
package lsmkv

import (
	"context"
	"fmt"
	"math/rand"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/weaviate/weaviate/entities/cyclemanager"
)

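// compactionSetStrategy exercises compaction for the set strategy end to
// end: it builds test data covering eight update/delete scenarios, imports
// it across two unrelated "previous" segments plus the two segments that
// get merged, compacts until nothing is eligible anymore, and verifies both
// the resulting contents and that the final segment size falls within
// [expectedMinSize, expectedMaxSize].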
func compactionSetStrategy(ctx context.Context, t *testing.T, opts []BucketOption,
	expectedMinSize, expectedMaxSize int64,
) {
	size := 100

	type kv struct {
		key    []byte
		values [][]byte
		delete bool
	}
	// these segments are not part of the merge, but might still play a role
	// in the overall results, for example if one of the later segments
	// contains a tombstone for one of their keys
	var previous1 []kv
	var previous2 []kv

	var segment1 []kv
	var segment2 []kv
	var expected []kv
	var bucket *Bucket

	dirName := t.TempDir()

	t.Run("create test data", func(t *testing.T) {
		// The test data is spread (roughly evenly) across 8 scenarios:
		//
		// 0.) created in the first segment, never touched again
		// 1.) created in the first segment, appended to it in the second
		// 2.) created in the first segment, first element deleted in the second
		// 3.) created in the first segment, second element deleted in the second
		// 4.) not present in the first segment, created in the second
		// 5.) present in an unrelated previous segment, deleted in the first
		// 6.) present in an unrelated previous segment, deleted in the second
		// 7.) present in an unrelated previous segment, never touched again
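		//
		// Each key below is assigned to exactly one of these scenarios
		// via i % 8.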
		for i := 0; i < size; i++ {
			key := []byte(fmt.Sprintf("key-%02d", i))

			value1 := []byte(fmt.Sprintf("value-%02d-01", i))
			value2 := []byte(fmt.Sprintf("value-%02d-02", i))
			values := [][]byte{value1, value2}

			switch i % 8 {
			case 0:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:    key,
					values: values[:1],
				})

				// leave this element untouched in the second segment
				expected = append(expected, kv{
					key:    key,
					values: values[:1],
				})

			case 1:
				// add to segment 1
				segment1 = append(segment1, kv{
					key:    key,
					values: values[:1],
				})

				// update in the second segment
				segment2 = append(segment2, kv{
					key:    key,
					values: values[1:2],
				})

				expected = append(expected, kv{
					key:    key,
					values: values,
				})

			case 2:
				// add both to segment 1, delete the first
				segment1 = append(segment1, kv{
					key:    key,
					values: values,
				})

				// delete first element in the second segment
				segment2 = append(segment2, kv{
					key:    key,
					values: values[:1],
					delete: true,
				})

				// only the 2nd element should be left in the expected
				expected = append(expected, kv{
					key:    key,
					values: values[1:2],
				})

			case 3:
				// add both to segment 1, delete the second
				segment1 = append(segment1, kv{
					key:    key,
					values: values,
				})

				// delete second element in the second segment
				segment2 = append(segment2, kv{
					key:    key,
					values: values[1:],
					delete: true,
				})

				// only the 1st element should be left in the expected
				expected = append(expected, kv{
					key:    key,
					values: values[:1],
				})

			case 4:
				// do not add to segment 1

				// only add to segment 2 (both values)
				segment2 = append(segment2, kv{
					key:    key,
					values: values,
				})

				expected = append(expected, kv{
					key:    key,
					values: values,
				})

			case 5:
				// only part of a previous segment, which is not part of the merge
				previous1 = append(previous1, kv{
					key:    key,
					values: values[:1],
				})
				previous2 = append(previous2, kv{
					key:    key,
					values: values[1:],
				})

				// delete in segment 1
				segment1 = append(segment1, kv{
					key:    key,
					values: values[:1],
					delete: true,
				})
				segment1 = append(segment1, kv{
					key:    key,
					values: values[1:],
					delete: true,
				})

				// should not have any values in expected at all, not even a key

			case 6:
				// only part of a previous segment, which is not part of the merge
				previous1 = append(previous1, kv{
					key:    key,
					values: values[:1],
				})
				previous2 = append(previous2, kv{
					key:    key,
					values: values[1:],
				})

				// delete in segment 2
				segment2 = append(segment2, kv{
					key:    key,
					values: values[:1],
					delete: true,
				})
				segment2 = append(segment2, kv{
					key:    key,
					values: values[1:],
					delete: true,
				})

				// should not have any values in expected at all, not even a key

			case 7:
				// part of a previous segment
				previous1 = append(previous1, kv{
					key:    key,
					values: values[:1],
				})
				previous2 = append(previous2, kv{
					key:    key,
					values: values[1:],
				})

				expected = append(expected, kv{
					key:    key,
					values: values,
				})
			}
		}
	})

	t.Run("shuffle the import order for each segment", func(t *testing.T) {
		// this is to make sure we don't accidentally rely on the import order
		rand.Shuffle(len(segment1), func(i, j int) {
			segment1[i], segment1[j] = segment1[j], segment1[i]
		})
		rand.Shuffle(len(segment2), func(i, j int) {
			segment2[i], segment2[j] = segment2[j], segment2[i]
		})
	})

	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("import and flush previous segments", func(t *testing.T) {
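		// FlushAndSwitch persists the active memtable as an immutable
		// segment on disk and swaps in a fresh memtable, so each flush
		// below produces exactly one new segment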
		for _, pair := range previous1 {
			err := bucket.SetAdd(pair.key, pair.values)
			require.Nil(t, err)
		}

		require.Nil(t, bucket.FlushAndSwitch())

		for _, pair := range previous2 {
			err := bucket.SetAdd(pair.key, pair.values)
			require.Nil(t, err)
		}

		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("import segment 1", func(t *testing.T) {
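		// SetDeleteSingle does not remove data in place; it writes a
		// tombstone for the given value, which is resolved when segments
		// are merged during compaction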
		for _, pair := range segment1 {
			if !pair.delete {
				err := bucket.SetAdd(pair.key, pair.values)
				require.Nil(t, err)
			} else {
				err := bucket.SetDeleteSingle(pair.key, pair.values[0])
				require.Nil(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("import segment 2", func(t *testing.T) {
		for _, pair := range segment2 {
			if !pair.delete {
				err := bucket.SetAdd(pair.key, pair.values)
				require.Nil(t, err)
			} else {
				err := bucket.SetDeleteSingle(pair.key, pair.values[0])
				require.Nil(t, err)
			}
		}
	})

	t.Run("flush to disk", func(t *testing.T) {
		require.Nil(t, bucket.FlushAndSwitch())
	})

	t.Run("verify control before compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.SetCursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:    k,
				values: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		i := 0
		var compacted bool
		var err error
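		// compactOnce merges a single pair of segments per call; keep
		// calling it until it reports that nothing was eligible for
		// further compaction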
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
			if i == 1 {
				// segment1 and segment2 were merged.
				// neither of them is the root segment, so tombstones
				// will not be removed regardless of the keepTombstones setting
				assertSecondSegmentOfSize(t, bucket, 8556, 8556)
			}
			i++
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction", func(t *testing.T) {
		var retrieved []kv

		c := bucket.SetCursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:    k,
				values: v,
			})
		}

		assert.Equal(t, expected, retrieved)
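		// beyond content equality, also assert that everything was merged
		// into a single segment whose size falls within the expected window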
		assertSingleSegmentOfSize(t, bucket, expectedMinSize, expectedMaxSize)
	})
}

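// compactionSetStrategy_RemoveUnnecessary verifies that compaction reclaims
// disk space: every segment undoes the write of the previous one, so almost
// all of the per-segment data is obsolete by the time compaction runs.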
func compactionSetStrategy_RemoveUnnecessary(ctx context.Context, t *testing.T, opts []BucketOption) {
	// in this test each segment reverses the action of the previous one, so
	// by the end the individual segments contain a lot of information that
	// is no longer needed. We then verify that after compaction this
	// information is gone, freeing up disk space
	size := 100

	type kv struct {
		key    []byte
		values [][]byte
	}

	key := []byte("my-key")

	var bucket *Bucket
	dirName := t.TempDir()

	t.Run("init bucket", func(t *testing.T) {
		b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
			cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
		require.Nil(t, err)

		// so big it effectively never triggers as part of this test
		b.SetMemtableThreshold(1e9)

		bucket = b
	})

	t.Run("write segments", func(t *testing.T) {
		for i := 0; i < size; i++ {
			if i != 0 {
				// we can only delete an existing value if this isn't the first write
				value := []byte(fmt.Sprintf("value-%05d", i-1))
				err := bucket.SetDeleteSingle(key, value)
				require.Nil(t, err)
			}

			value := []byte(fmt.Sprintf("value-%05d", i))
			err := bucket.SetAdd(key, [][]byte{value})
			require.Nil(t, err)

			require.Nil(t, bucket.FlushAndSwitch())
		}
	})

	t.Run("verify control before compaction", func(t *testing.T) {
		var retrieved []kv
		expected := []kv{
			{
				key:    key,
				values: [][]byte{[]byte(fmt.Sprintf("value-%05d", size-1))},
			},
		}

		c := bucket.SetCursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:    k,
				values: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})

	t.Run("compact until no longer eligible", func(t *testing.T) {
		var compacted bool
		var err error
		for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
		}
		require.Nil(t, err)
	})

	t.Run("verify control after compaction", func(t *testing.T) {
		var retrieved []kv
		expected := []kv{
			{
				key:    key,
				values: [][]byte{[]byte(fmt.Sprintf("value-%05d", size-1))},
			},
		}

		c := bucket.SetCursor()
		defer c.Close()

		for k, v := c.First(); k != nil; k, v = c.Next() {
			retrieved = append(retrieved, kv{
				key:    k,
				values: v,
			})
		}

		assert.Equal(t, expected, retrieved)
	})
}

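// compactionSetStrategy_FrequentPutDeleteOperations verifies that compaction
// produces correct results when the same set entries are repeatedly added
// and deleted across a varying number of segments.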
func compactionSetStrategy_FrequentPutDeleteOperations(ctx context.Context, t *testing.T, opts []BucketOption) {
	// this test verifies that compaction works correctly for the set
	// strategy under frequent put/delete operations
	maxSize := 10

	for size := 4; size < maxSize; size++ {
		t.Run(fmt.Sprintf("compact %v segments", size), func(t *testing.T) {
			var bucket *Bucket

			key := []byte("key-original")
			value1 := []byte("value-01")
			value2 := []byte("value-02")
			values := [][]byte{value1, value2}

			dirName := t.TempDir()

			t.Run("init bucket", func(t *testing.T) {
				b, err := NewBucket(ctx, dirName, dirName, nullLogger(), nil,
					cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), opts...)
				require.Nil(t, err)

				// so big it effectively never triggers as part of this test
				b.SetMemtableThreshold(1e9)

				bucket = b
			})

			t.Run("import and flush segments", func(t *testing.T) {
				for i := 0; i < size; i++ {
					err := bucket.SetAdd(key, values)
					require.Nil(t, err)

					if size == 5 {
						// delete all
						err := bucket.SetDeleteSingle(key, values[0])
						require.Nil(t, err)
						err = bucket.SetDeleteSingle(key, values[1])
						require.Nil(t, err)
					} else if size == 6 {
						// delete only one value
						err := bucket.SetDeleteSingle(key, values[0])
						require.Nil(t, err)
					} else if i != size-1 {
						// don't delete from the last segment
						err := bucket.SetDeleteSingle(key, values[0])
						require.Nil(t, err)
						err = bucket.SetDeleteSingle(key, values[1])
						require.Nil(t, err)
					}

					require.Nil(t, bucket.FlushAndSwitch())
				}
			})

			t.Run("verify expected values before compaction", func(t *testing.T) {
				res, err := bucket.SetList(key)
				assert.Nil(t, err)
				if size == 5 {
					assert.Len(t, res, 0)
				} else if size == 6 {
					assert.Len(t, res, 1)
				} else {
					assert.Len(t, res, 2)
				}
			})

			t.Run("compact until no longer eligible", func(t *testing.T) {
				var compacted bool
				var err error
				for compacted, err = bucket.disk.compactOnce(); err == nil && compacted; compacted, err = bucket.disk.compactOnce() {
				}
				require.Nil(t, err)
			})

			t.Run("verify expected values after compaction", func(t *testing.T) {
				res, err := bucket.SetList(key)
				assert.Nil(t, err)
				if size == 5 {
					assert.Len(t, res, 0)
				} else if size == 6 {
					assert.Len(t, res, 1)
				} else {
					assert.Len(t, res, 2)
				}
			})
		})
	}
}
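
// The helpers in this file are expected to be driven by strategy-specific
// test entry points elsewhere in the package. A minimal sketch of such a
// driver is shown below; the driver name and the size bounds passed to
// compactionSetStrategy are illustrative assumptions, not the values used
// by the real tests (WithStrategy and StrategySetCollection are the
// package's existing bucket option and strategy constant):
//
//	func TestCompactionSetStrategy(t *testing.T) {
//		ctx := context.Background()
//		opts := []BucketOption{WithStrategy(StrategySetCollection)}
//
//		// size bounds are hypothetical placeholders
//		compactionSetStrategy(ctx, t, opts, 10000, 20000)
//		compactionSetStrategy_RemoveUnnecessary(ctx, t, opts)
//		compactionSetStrategy_FrequentPutDeleteOperations(ctx, t, opts)
//	}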