github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/roaringset/compactor_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package roaringset
    13  
    14  import (
    15  	"io"
    16  	"os"
    17  	"path/filepath"
    18  	"testing"
    19  
    20  	"github.com/stretchr/testify/assert"
    21  	"github.com/stretchr/testify/require"
    22  	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv/segmentindex"
    23  )
    24  
    25  func Test_Compactor(t *testing.T) {
    26  	type test struct {
    27  		name         string
    28  		left         []byte
    29  		right        []byte
    30  		expected     []keyWithBML
    31  		expectedRoot []keyWithBML
    32  	}
    33  
    34  	tests := []test{
    35  		{
    36  			name: "independent segments without overlap",
    37  			left: createSegmentsFromKeys(t, []keyWithBML{
    38  				{
    39  					key:       []byte("aaa"),
    40  					additions: []uint64{0},
    41  					deletions: []uint64{1},
    42  				},
    43  				{
    44  					key:       []byte("ccc"),
    45  					additions: []uint64{4},
    46  					deletions: []uint64{5},
    47  				},
    48  			}),
    49  			right: createSegmentsFromKeys(t, []keyWithBML{
    50  				{
    51  					key:       []byte("bbb"),
    52  					additions: []uint64{2},
    53  					deletions: []uint64{3},
    54  				},
    55  				{
    56  					key:       []byte("ddd"),
    57  					additions: []uint64{6},
    58  					deletions: []uint64{7},
    59  				},
    60  			}),
    61  			expected: []keyWithBML{
    62  				{
    63  					key:       []byte("aaa"),
    64  					additions: []uint64{0},
    65  					deletions: []uint64{1},
    66  				},
    67  				{
    68  					key:       []byte("bbb"),
    69  					additions: []uint64{2},
    70  					deletions: []uint64{3},
    71  				},
    72  				{
    73  					key:       []byte("ccc"),
    74  					additions: []uint64{4},
    75  					deletions: []uint64{5},
    76  				},
    77  				{
    78  					key:       []byte("ddd"),
    79  					additions: []uint64{6},
    80  					deletions: []uint64{7},
    81  				},
    82  			},
    83  			expectedRoot: []keyWithBML{
    84  				{
    85  					key:       []byte("aaa"),
    86  					additions: []uint64{0},
    87  				},
    88  				{
    89  					key:       []byte("bbb"),
    90  					additions: []uint64{2},
    91  				},
    92  				{
    93  					key:       []byte("ccc"),
    94  					additions: []uint64{4},
    95  				},
    96  				{
    97  					key:       []byte("ddd"),
    98  					additions: []uint64{6},
    99  				},
   100  			},
   101  		},
   102  		{
   103  			name: "some segments overlap",
   104  			// note: there is no need to test every possible edge case for the
   105  			// overlapping segments in this place, as this logic is outsourced to
   106  			// BitmapLayer.Merge() which already has tests for edge cases
   107  			left: createSegmentsFromKeys(t, []keyWithBML{
   108  				{
   109  					key:       []byte("aaa"),
   110  					additions: []uint64{0},
   111  					deletions: []uint64{1},
   112  				},
   113  				{
   114  					key:       []byte("overlap"),
   115  					additions: []uint64{4, 5, 6},
   116  					deletions: []uint64{1, 3, 7},
   117  				},
   118  			}),
   119  			right: createSegmentsFromKeys(t, []keyWithBML{
   120  				{
   121  					key:       []byte("overlap"),
   122  					additions: []uint64{3, 8},
   123  					deletions: []uint64{5},
   124  				},
   125  				{
   126  					key:       []byte("zzz"),
   127  					additions: []uint64{6},
   128  					deletions: []uint64{7},
   129  				},
   130  			}),
   131  			expected: []keyWithBML{
   132  				{
   133  					key:       []byte("aaa"),
   134  					additions: []uint64{0},
   135  					deletions: []uint64{1},
   136  				},
   137  				{
   138  					key:       []byte("overlap"),
   139  					additions: []uint64{3, 4, 6, 8},
   140  					deletions: []uint64{1, 5, 7},
   141  				},
   142  				{
   143  					key:       []byte("zzz"),
   144  					additions: []uint64{6},
   145  					deletions: []uint64{7},
   146  				},
   147  			},
   148  			expectedRoot: []keyWithBML{
   149  				{
   150  					key:       []byte("aaa"),
   151  					additions: []uint64{0},
   152  				},
   153  				{
   154  					key:       []byte("overlap"),
   155  					additions: []uint64{3, 4, 6, 8},
   156  				},
   157  				{
   158  					key:       []byte("zzz"),
   159  					additions: []uint64{6},
   160  				},
   161  			},
   162  		},
   163  		{
   164  			name: "everything but one is deleted",
   165  			left: createSegmentsFromKeys(t, []keyWithBML{
   166  				{
   167  					key:       []byte("aaa"),
   168  					additions: []uint64{0},
   169  					deletions: []uint64{},
   170  				},
   171  				{
   172  					key:       []byte("bbb"),
   173  					additions: []uint64{4, 5, 6},
   174  					deletions: []uint64{},
   175  				},
   176  				{
   177  					key:       []byte("ddd"),
   178  					additions: []uint64{11, 12, 111},
   179  					deletions: []uint64{},
   180  				},
   181  			}),
   182  			right: createSegmentsFromKeys(t, []keyWithBML{
   183  				{
   184  					key:       []byte("aaa"),
   185  					additions: []uint64{},
   186  					deletions: []uint64{0},
   187  				},
   188  				{
   189  					key:       []byte("bbb"),
   190  					additions: []uint64{},
   191  					deletions: []uint64{4, 5, 6},
   192  				},
   193  				{
   194  					key:       []byte("ccc"),
   195  					additions: []uint64{},
   196  					deletions: []uint64{7, 8},
   197  				},
   198  				{
   199  					key:       []byte("ddd"),
   200  					additions: []uint64{222},
   201  					deletions: []uint64{11, 12, 13, 14},
   202  				},
   203  			}),
   204  			expected: []keyWithBML{
   205  				{
   206  					key:       []byte("aaa"),
   207  					additions: []uint64{},
   208  					deletions: []uint64{0},
   209  				},
   210  				{
   211  					key:       []byte("bbb"),
   212  					additions: []uint64{},
   213  					deletions: []uint64{4, 5, 6},
   214  				},
   215  				{
   216  					key:       []byte("ccc"),
   217  					additions: []uint64{},
   218  					deletions: []uint64{7, 8},
   219  				},
   220  				{
   221  					key:       []byte("ddd"),
   222  					additions: []uint64{111, 222},
   223  					deletions: []uint64{11, 12, 13, 14},
   224  				},
   225  			},
   226  			expectedRoot: []keyWithBML{
   227  				{
   228  					key:       []byte("ddd"),
   229  					additions: []uint64{111, 222},
   230  				},
   231  			},
   232  		},
   233  
   234  		// the key loop is essentially a state machine. The next tests try to cover
   235  		// all possible states:
   236  		//
   237  		// 1. only the left key is set -> take left key
   238  		// 2. both left key and right key are set, but left is smaller -> take left
   239  		//    key
   240  		// 3. only the right key is set -> take right key
   241  		// 4. both right and left keys are set, but right key is smaller -> take
   242  		//    the right key
   243  		// 5. both keys are identical -> merge them
   244  		//
   245  		// Note: There is also an implicit 6th case: both keys are not set, this is
   246  		// the exit condition which is part of every test.
   247  		{
   248  			name: "state 1 - only left key is set",
   249  			left: createSegmentsFromKeys(t, []keyWithBML{
   250  				{
   251  					key:       []byte("aaa"),
   252  					additions: []uint64{0},
   253  					deletions: []uint64{1},
   254  				},
   255  			}),
   256  			right: createSegmentsFromKeys(t, []keyWithBML{}),
   257  			expected: []keyWithBML{
   258  				{
   259  					key:       []byte("aaa"),
   260  					additions: []uint64{0},
   261  					deletions: []uint64{1},
   262  				},
   263  			},
   264  			expectedRoot: []keyWithBML{
   265  				{
   266  					key:       []byte("aaa"),
   267  					additions: []uint64{0},
   268  				},
   269  			},
   270  		},
   271  		{
   272  			name: "state 2 - left+right, left is smaller",
   273  			left: createSegmentsFromKeys(t, []keyWithBML{
   274  				{
   275  					key:       []byte("aaa"),
   276  					additions: []uint64{0},
   277  					deletions: []uint64{1},
   278  				},
   279  			}),
   280  			right: createSegmentsFromKeys(t, []keyWithBML{
   281  				{
   282  					key:       []byte("bbb"),
   283  					additions: []uint64{2},
   284  					deletions: []uint64{3},
   285  				},
   286  			}),
   287  			expected: []keyWithBML{
   288  				{
   289  					key:       []byte("aaa"),
   290  					additions: []uint64{0},
   291  					deletions: []uint64{1},
   292  				},
   293  				{
   294  					key:       []byte("bbb"),
   295  					additions: []uint64{2},
   296  					deletions: []uint64{3},
   297  				},
   298  			},
   299  			expectedRoot: []keyWithBML{
   300  				{
   301  					key:       []byte("aaa"),
   302  					additions: []uint64{0},
   303  				},
   304  				{
   305  					key:       []byte("bbb"),
   306  					additions: []uint64{2},
   307  				},
   308  			},
   309  		},
   310  		{
   311  			name: "state 3 - only the right key is set",
   312  			left: createSegmentsFromKeys(t, []keyWithBML{}),
   313  			right: createSegmentsFromKeys(t, []keyWithBML{
   314  				{
   315  					key:       []byte("bbb"),
   316  					additions: []uint64{2},
   317  					deletions: []uint64{3},
   318  				},
   319  			}),
   320  			expected: []keyWithBML{
   321  				{
   322  					key:       []byte("bbb"),
   323  					additions: []uint64{2},
   324  					deletions: []uint64{3},
   325  				},
   326  			},
   327  			expectedRoot: []keyWithBML{
   328  				{
   329  					key:       []byte("bbb"),
   330  					additions: []uint64{2},
   331  				},
   332  			},
   333  		},
   334  		{
   335  			name: "state 4 - left+right, right is smaller",
   336  			left: createSegmentsFromKeys(t, []keyWithBML{
   337  				{
   338  					key:       []byte("ccc"),
   339  					additions: []uint64{0},
   340  					deletions: []uint64{1},
   341  				},
   342  			}),
   343  			right: createSegmentsFromKeys(t, []keyWithBML{
   344  				{
   345  					key:       []byte("bbb"),
   346  					additions: []uint64{2},
   347  					deletions: []uint64{3},
   348  				},
   349  			}),
   350  			expected: []keyWithBML{
   351  				{
   352  					key:       []byte("bbb"),
   353  					additions: []uint64{2},
   354  					deletions: []uint64{3},
   355  				},
   356  				{
   357  					key:       []byte("ccc"),
   358  					additions: []uint64{0},
   359  					deletions: []uint64{1},
   360  				},
   361  			},
   362  			expectedRoot: []keyWithBML{
   363  				{
   364  					key:       []byte("bbb"),
   365  					additions: []uint64{2},
   366  				},
   367  				{
   368  					key:       []byte("ccc"),
   369  					additions: []uint64{0},
   370  				},
   371  			},
   372  		},
   373  		{
   374  			name: "state 5 - left+right are identical",
   375  			left: createSegmentsFromKeys(t, []keyWithBML{
   376  				{
   377  					key:       []byte("aaa"),
   378  					additions: []uint64{0},
   379  					deletions: []uint64{1},
   380  				},
   381  			}),
   382  			right: createSegmentsFromKeys(t, []keyWithBML{
   383  				{
   384  					key:       []byte("aaa"),
   385  					additions: []uint64{2},
   386  					deletions: []uint64{3},
   387  				},
   388  			}),
   389  			expected: []keyWithBML{
   390  				{
   391  					key:       []byte("aaa"),
   392  					additions: []uint64{0, 2},
   393  					deletions: []uint64{1, 3},
   394  				},
   395  			},
   396  			expectedRoot: []keyWithBML{
   397  				{
   398  					key:       []byte("aaa"),
   399  					additions: []uint64{0, 2},
   400  				},
   401  			},
   402  		},
   403  	}
   404  
   405  	for _, test := range tests {
   406  		t.Run("[keep]"+test.name, func(t *testing.T) {
   407  			dir := t.TempDir()
   408  
   409  			leftCursor := NewSegmentCursor(test.left, nil)
   410  			rightCursor := NewSegmentCursor(test.right, nil)
   411  
   412  			segmentFile := filepath.Join(dir, "result.db")
   413  			f, err := os.Create(segmentFile)
   414  			require.NoError(t, err)
   415  
   416  			c := NewCompactor(f, leftCursor, rightCursor, 5, dir+"/scratch", false)
   417  			require.NoError(t, c.Do())
   418  
   419  			require.NoError(t, f.Close())
   420  
   421  			f, err = os.Open(segmentFile)
   422  			require.NoError(t, err)
   423  
   424  			header, err := segmentindex.ParseHeader(f)
   425  			require.NoError(t, err)
   426  
   427  			segmentBytes, err := io.ReadAll(f)
   428  			require.NoError(t, err)
   429  
   430  			require.NoError(t, f.Close())
   431  
   432  			cu := NewSegmentCursor(segmentBytes[:header.IndexStart-segmentindex.HeaderSize], nil)
   433  
   434  			i := 0
   435  			for k, v, _ := cu.First(); k != nil; k, v, _ = cu.Next() {
   436  				assert.Equal(t, test.expected[i].key, k)
   437  				assert.Equal(t, test.expected[i].additions, v.Additions.ToArray())
   438  				assert.Equal(t, test.expected[i].deletions, v.Deletions.ToArray())
   439  				i++
   440  			}
   441  
   442  			assert.Equal(t, len(test.expected), i, "all expected keys must have been hit")
   443  		})
   444  	}
   445  
   446  	for _, test := range tests {
   447  		t.Run("[cleanup] "+test.name, func(t *testing.T) {
   448  			dir := t.TempDir()
   449  
   450  			leftCursor := NewSegmentCursor(test.left, nil)
   451  			rightCursor := NewSegmentCursor(test.right, nil)
   452  
   453  			segmentFile := filepath.Join(dir, "result.db")
   454  			f, err := os.Create(segmentFile)
   455  			require.NoError(t, err)
   456  
   457  			c := NewCompactor(f, leftCursor, rightCursor, 5, dir+"/scratch", true)
   458  			require.NoError(t, c.Do())
   459  
   460  			require.NoError(t, f.Close())
   461  
   462  			f, err = os.Open(segmentFile)
   463  			require.NoError(t, err)
   464  
   465  			header, err := segmentindex.ParseHeader(f)
   466  			require.NoError(t, err)
   467  
   468  			segmentBytes, err := io.ReadAll(f)
   469  			require.NoError(t, err)
   470  
   471  			require.NoError(t, f.Close())
   472  
   473  			cu := NewSegmentCursor(segmentBytes[:header.IndexStart-segmentindex.HeaderSize], nil)
   474  
   475  			i := 0
   476  			for k, v, _ := cu.First(); k != nil; k, v, _ = cu.Next() {
   477  				assert.Equal(t, test.expectedRoot[i].key, k)
   478  				assert.Equal(t, test.expectedRoot[i].additions, v.Additions.ToArray())
   479  				assert.Empty(t, v.Deletions.ToArray())
   480  				i++
   481  			}
   482  
   483  			assert.Equal(t, len(test.expectedRoot), i, "all expected keys must have been hit")
   484  		})
   485  	}
   486  }
   487  
   488  type keyWithBML struct {
   489  	key       []byte
   490  	additions []uint64
   491  	deletions []uint64
   492  }
   493  
   494  func createSegmentsFromKeys(t *testing.T, keys []keyWithBML) []byte {
   495  	out := []byte{}
   496  
   497  	for _, k := range keys {
   498  		add := NewBitmap(k.additions...)
   499  		del := NewBitmap(k.deletions...)
   500  		sn, err := NewSegmentNode(k.key, add, del)
   501  		require.Nil(t, err)
   502  		out = append(out, sn.ToBuffer()...)
   503  	}
   504  
   505  	return out
   506  }