github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/prop_length_tracker_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package inverted
    13  
    14  import (
    15  	"fmt"
    16  	"path"
    17  	"testing"
    18  
    19  	"github.com/sirupsen/logrus"
    20  	"github.com/stretchr/testify/assert"
    21  	"github.com/stretchr/testify/require"
    22  )
    23  
    24  func Test_PropertyLengthTracker(t *testing.T) {
    25  	dirName := t.TempDir()
    26  	trackerPath := path.Join(dirName, "my_test_shard")
    27  	l := logrus.New()
    28  
    29  	// This test suite doesn't actually test persistence, there is a separate
    30  	// one. However, we still need to supply a valid path. Since nothing is ever
    31  	// written, we can use the same one for each sub-test without them
    32  	// accidentally sharing state.
    33  
    34  	t.Run("single prop", func(t *testing.T) {
    35  		type test struct {
    36  			values       []float32
    37  			name         string
    38  			floatCompare bool
    39  		}
    40  
    41  		tests := []test{
    42  			{
    43  				values:       []float32{2, 2, 3, 100, 100, 500, 7},
    44  				name:         "mixed_values",
    45  				floatCompare: true,
    46  			},
    47  			{
    48  				values: []float32{
    49  					1000, 1200, 1000, 1300, 800, 2000, 2050,
    50  					2070, 900,
    51  				},
    52  				name:         "high_values",
    53  				floatCompare: true,
    54  			},
    55  			{
    56  				values: []float32{
    57  					60000, 50000, 65000,
    58  				},
    59  				name:         "very_high_values",
    60  				floatCompare: true,
    61  			},
    62  			{
    63  				values: []float32{
    64  					1, 2, 4, 3, 4, 2, 1, 5, 6, 7, 8, 2, 7, 2, 3, 5,
    65  					6, 3, 5, 9, 3, 4, 8,
    66  				},
    67  				name:         "very_low_values",
    68  				floatCompare: true,
    69  			},
    70  			{
    71  				values:       []float32{0, 0},
    72  				name:         "zeros",
    73  				floatCompare: false,
    74  			},
    75  		}
    76  
    77  		for _, test := range tests {
    78  			t.Run(test.name, func(t *testing.T) {
    79  				tracker, err := NewJsonPropertyLengthTracker(trackerPath+test.name, l)
    80  				require.Nil(t, err)
    81  
    82  				actualMean := float32(0)
    83  				for _, v := range test.values {
    84  					tracker.TrackProperty("my-very-first-prop", v)
    85  					actualMean += v
    86  				}
    87  				actualMean = actualMean / float32(len(test.values))
    88  
    89  				res, err := tracker.PropertyMean("my-very-first-prop")
    90  				require.Nil(t, err)
    91  
    92  				if test.floatCompare {
    93  					assert.InEpsilon(t, actualMean, res, 0.1)
    94  				} else {
    95  					assert.Equal(t, actualMean, res)
    96  				}
    97  				require.Nil(t, tracker.Close())
    98  			})
    99  		}
   100  	})
   101  
   102  	t.Run("test untrack", func(t *testing.T) {
   103  		tracker, err := NewJsonPropertyLengthTracker(trackerPath, l)
   104  		require.Nil(t, err)
   105  
   106  		tracker.TrackProperty("test-prop", 1)
   107  		tracker.TrackProperty("test-prop", 2)
   108  		tracker.TrackProperty("test-prop", 3)
   109  		tracker.Flush(false)
   110  
   111  		sum, count, mean, err := tracker.PropertyTally("test-prop")
   112  		require.Nil(t, err)
   113  		assert.Equal(t, 6, sum)
   114  		assert.Equal(t, 3, count)
   115  		assert.InEpsilon(t, 2, mean, 0.1)
   116  
   117  		tracker.UnTrackProperty("test-prop", 2)
   118  		sum, count, mean, err = tracker.PropertyTally("test-prop")
   119  		require.Nil(t, err)
   120  		assert.Equal(t, 4, sum)
   121  		assert.Equal(t, 2, count)
   122  		assert.InEpsilon(t, 2, mean, 0.1)
   123  
   124  		tracker.UnTrackProperty("test-prop", 1)
   125  		sum, count, mean, err = tracker.PropertyTally("test-prop")
   126  		require.Nil(t, err)
   127  		assert.Equal(t, 3, sum)
   128  		assert.Equal(t, 1, count)
   129  		assert.InEpsilon(t, 3, mean, 0.1)
   130  
   131  		require.Nil(t, tracker.Close())
   132  	})
   133  
   134  	t.Run("multiple properties (can all fit on one page)", func(t *testing.T) {
   135  		type prop struct {
   136  			values   []float32
   137  			propName string
   138  		}
   139  
   140  		props := []prop{
   141  			{
   142  				values:   []float32{2, 2, 3, 100, 100, 500, 7},
   143  				propName: "property-numero-uno",
   144  			}, {
   145  				values: []float32{
   146  					1000, 1200, 1000, 1300, 800, 2000, 2050,
   147  					2070, 900,
   148  				},
   149  				propName: "the-second-of-the-properties",
   150  			}, {
   151  				values: []float32{
   152  					60000, 50000, 65000,
   153  				},
   154  				propName: "property_nummer_DREI",
   155  			},
   156  		}
   157  
   158  		// This time we use a single tracker
   159  		tracker, err := NewJsonPropertyLengthTracker(trackerPath, l)
   160  		require.Nil(t, err)
   161  
   162  		for _, prop := range props {
   163  			for _, v := range prop.values {
   164  				tracker.TrackProperty(prop.propName, v)
   165  			}
   166  		}
   167  
   168  		for _, prop := range props {
   169  			actualMean := float32(0)
   170  			for _, v := range prop.values {
   171  				actualMean += v
   172  			}
   173  			actualMean = actualMean / float32(len(prop.values))
   174  
   175  			res, err := tracker.PropertyMean(prop.propName)
   176  			require.Nil(t, err)
   177  
   178  			assert.InEpsilon(t, actualMean, res, 0.1)
   179  		}
   180  
   181  		require.Nil(t, tracker.Close())
   182  	})
   183  
   184  	t.Run("with more properties that can fit on one page", func(t *testing.T) {
   185  		// This time we use a single tracker
   186  		tracker, err := NewJsonPropertyLengthTracker(trackerPath, l)
   187  		require.Nil(t, err)
   188  
   189  		create20PropsAndVerify(t, tracker)
   190  
   191  		require.Nil(t, tracker.Close())
   192  	})
   193  }
   194  
   195  func create20PropsAndVerify(t *testing.T, tracker *JsonPropertyLengthTracker) {
   196  	type prop struct {
   197  		values   []float32
   198  		propName string
   199  	}
   200  
   201  	// the most props we could ever fit on a single page is 16 if there was no
   202  	// index, which is impossible. This means the practical max is 15, so at
   203  	// least 5 props should overflow to the second page.
   204  	propCount := 20
   205  	props := make([]prop, propCount)
   206  
   207  	for i := range props {
   208  		props[i] = prop{
   209  			values:   []float32{1, 4, 3, 17},
   210  			propName: fmt.Sprintf("prop_%d", i),
   211  		}
   212  	}
   213  
   214  	for _, prop := range props {
   215  		for _, v := range prop.values {
   216  			tracker.TrackProperty(prop.propName, v)
   217  		}
   218  	}
   219  
   220  	for _, prop := range props {
   221  		actualMean := float32(0)
   222  		for _, v := range prop.values {
   223  			actualMean += v
   224  		}
   225  		actualMean = actualMean / float32(len(prop.values))
   226  
   227  		res, err := tracker.PropertyMean(prop.propName)
   228  		require.Nil(t, err)
   229  
   230  		assert.InEpsilon(t, actualMean, res, 0.1)
   231  	}
   232  
   233  	// modify a prop on page 2 and verify
   234  	tracker.TrackProperty("prop_19", 24)
   235  	actualMeanForProp20 := float32(1+4+3+17+25) / 5.0
   236  	res, err := tracker.PropertyMean("prop_19")
   237  	require.Nil(t, err)
   238  
   239  	assert.InEpsilon(t, actualMeanForProp20, res, 0.1)
   240  }
   241  
   242  func Test_PropertyLengthTracker_Persistence(t *testing.T) {
   243  	dirName := t.TempDir()
   244  
   245  	path := path.Join(dirName, "my_test_shard")
   246  
   247  	var tracker *JsonPropertyLengthTracker
   248  	l := logrus.New()
   249  
   250  	t.Run("initializing an empty tracker, no file present", func(t *testing.T) {
   251  		tr, err := NewJsonPropertyLengthTracker(path, l)
   252  		require.Nil(t, err)
   253  		tracker = tr
   254  	})
   255  
   256  	t.Run("importing multi-page data and verifying", func(t *testing.T) {
   257  		create20PropsAndVerify(t, tracker)
   258  	})
   259  
   260  	t.Run("commit the state to disk", func(t *testing.T) {
   261  		require.Nil(t, tracker.Flush(false))
   262  	})
   263  
   264  	t.Run("shut down the tracker", func(t *testing.T) {
   265  		require.Nil(t, tracker.Close())
   266  	})
   267  
   268  	var secondTracker *JsonPropertyLengthTracker
   269  	t.Run("initializing a new tracker from the same file", func(t *testing.T) {
   270  		tr, err := NewJsonPropertyLengthTracker(path, l)
   271  		require.Nil(t, err)
   272  		secondTracker = tr
   273  	})
   274  
   275  	t.Run("verify data is correct after read from disk", func(t *testing.T) {
   276  		// root page
   277  		actualMeanForProp0 := float32(1+4+3+17) / 4.0
   278  		res, err := secondTracker.PropertyMean("prop_0")
   279  		require.Nil(t, err)
   280  		assert.InEpsilon(t, actualMeanForProp0, res, 0.1)
   281  
   282  		// later page
   283  		actualMeanForProp20 := float32(1+4+3+17+25) / 5.0
   284  		res, err = secondTracker.PropertyMean("prop_19")
   285  		require.Nil(t, err)
   286  		assert.InEpsilon(t, actualMeanForProp20, res, 0.1)
   287  	})
   288  }
   289  
   290  // Testing the switch from the old property length tracker to the new one
   291  func TestFormatConversion(t *testing.T) {
   292  	dirName := t.TempDir()
   293  
   294  	path := path.Join(dirName, "my_test_shard")
   295  
   296  	var tracker *PropertyLengthTracker
   297  
   298  	t.Run("initializing an empty tracker, no file present", func(t *testing.T) {
   299  		tr, err := NewPropertyLengthTracker(path)
   300  		require.Nil(t, err)
   301  		tracker = tr
   302  	})
   303  
   304  	t.Run("importing multi-page data and verifying", func(t *testing.T) {
   305  		create20PropsAndVerify_old(t, tracker)
   306  	})
   307  
   308  	t.Run("commit the state to disk", func(t *testing.T) {
   309  		require.Nil(t, tracker.Flush())
   310  	})
   311  
   312  	t.Run("shut down the tracker", func(t *testing.T) {
   313  		require.Nil(t, tracker.Close())
   314  	})
   315  
   316  	var newTracker *JsonPropertyLengthTracker
   317  	l := logrus.New()
   318  
   319  	t.Run("initializing a new tracker from the same file", func(t *testing.T) {
   320  		tr, err := NewJsonPropertyLengthTracker(path, l)
   321  		require.Nil(t, err)
   322  		newTracker = tr
   323  	})
   324  
   325  	t.Run("verify data is correct after read from disk", func(t *testing.T) {
   326  		// root page
   327  		actualMeanForProp0 := float32(1+4+3+17) / 4.0
   328  		res, err := newTracker.PropertyMean("prop_0")
   329  		require.Nil(t, err)
   330  		assert.InEpsilon(t, actualMeanForProp0, res, 0.1)
   331  
   332  		// later page
   333  		actualMeanForProp20 := float32(1+4+3+17+25) / 5.0
   334  		res, err = newTracker.PropertyMean("prop_19")
   335  		require.Nil(t, err)
   336  		assert.InEpsilon(t, actualMeanForProp20, res, 0.1)
   337  
   338  		res, err = newTracker.PropertyMean("prop_22")
   339  		require.Nil(t, err)
   340  		assert.EqualValues(t, res, 0)
   341  		sum, count, average, _ := newTracker.PropertyTally("prop_22")
   342  		assert.EqualValues(t, 0, sum)
   343  		assert.EqualValues(t, 3, count)
   344  		assert.EqualValues(t, 0, average)
   345  	})
   346  }
   347  
   348  func create20PropsAndVerify_old(t *testing.T, tracker *PropertyLengthTracker) {
   349  	type prop struct {
   350  		values   []float32
   351  		propName string
   352  	}
   353  
   354  	// the most props we could ever fit on a single page is 16 if there was no
   355  	// index, which is impossible. This means the practical max is 15, so at
   356  	// least 5 props should overflow to the second page.
   357  	propCount := 20
   358  	props := make([]prop, propCount)
   359  
   360  	for i := range props {
   361  		props[i] = prop{
   362  			values:   []float32{1, 4, 3, 17},
   363  			propName: fmt.Sprintf("prop_%d", i),
   364  		}
   365  	}
   366  
   367  	for _, prop := range props {
   368  		for _, v := range prop.values {
   369  			tracker.TrackProperty(prop.propName, v)
   370  		}
   371  	}
   372  
   373  	tracker.TrackProperty("prop_22", 0)
   374  	tracker.TrackProperty("prop_22", 0)
   375  	tracker.TrackProperty("prop_22", 0)
   376  
   377  	for _, prop := range props {
   378  		actualMean := float32(0)
   379  		for _, v := range prop.values {
   380  			actualMean += v
   381  		}
   382  		actualMean = actualMean / float32(len(prop.values))
   383  
   384  		res, err := tracker.PropertyMean(prop.propName)
   385  		require.Nil(t, err)
   386  
   387  		assert.InEpsilon(t, actualMean, res, 0.1)
   388  	}
   389  
   390  	// modify a prop on page 2 and verify
   391  	tracker.TrackProperty("prop_19", 24)
   392  	actualMeanForProp20 := float32(1+4+3+17+25) / 5.0
   393  	res, err := tracker.PropertyMean("prop_19")
   394  	require.Nil(t, err)
   395  
   396  	assert.InEpsilon(t, actualMeanForProp20, res, 0.1)
   397  
   398  	res, err = tracker.PropertyMean("prop_22")
   399  	require.Nil(t, err)
   400  	assert.EqualValues(t, res, 0)
   401  
   402  	sum, _, average, _ := tracker.PropertyTally("prop_22")
   403  	assert.EqualValues(t, 0, sum)
   404  	// assert.EqualValues(t, 3, count)
   405  	assert.EqualValues(t, 0, average)
   406  }
   407  
   408  // Test the old property length tracker
   409  
   410  func TestOldPropertyLengthTracker(t *testing.T) {
   411  	dirName := t.TempDir()
   412  	trackerPath := path.Join(dirName, "my_test_shard")
   413  
   414  	// This test suite doesn't actually test persistence, there is a separate
   415  	// one. However, we still need to supply a valid path. Since nothing is ever
   416  	// written, we can use the same one for each sub-test without them
   417  	// accidentally sharing state.
   418  
   419  	t.Run("single prop", func(t *testing.T) {
   420  		type test struct {
   421  			values       []float32
   422  			name         string
   423  			floatCompare bool
   424  		}
   425  
   426  		tests := []test{
   427  			{
   428  				values:       []float32{2, 2, 3, 100, 100, 500, 7},
   429  				name:         "mixed_values",
   430  				floatCompare: true,
   431  			}, {
   432  				values: []float32{
   433  					1000, 1200, 1000, 1300, 800, 2000, 2050,
   434  					2070, 900,
   435  				},
   436  				name:         "high_values",
   437  				floatCompare: true,
   438  			}, {
   439  				values: []float32{
   440  					60000, 50000, 65000,
   441  				},
   442  				name:         "very_high_values",
   443  				floatCompare: true,
   444  			}, {
   445  				values: []float32{
   446  					1, 2, 4, 3, 4, 2, 1, 5, 6, 7, 8, 2, 7, 2, 3, 5,
   447  					6, 3, 5, 9, 3, 4, 8,
   448  				},
   449  				name:         "very_low_values",
   450  				floatCompare: true,
   451  			}, {
   452  				values:       []float32{0, 0},
   453  				name:         "zeros",
   454  				floatCompare: false,
   455  			},
   456  		}
   457  
   458  		for _, test := range tests {
   459  			t.Run(test.name, func(t *testing.T) {
   460  				tracker, err := NewPropertyLengthTracker(trackerPath + test.name)
   461  				require.Nil(t, err)
   462  
   463  				actualMean := float32(0)
   464  				for _, v := range test.values {
   465  					tracker.TrackProperty("my-very-first-prop", v)
   466  					actualMean += v
   467  				}
   468  				actualMean = actualMean / float32(len(test.values))
   469  
   470  				res, err := tracker.PropertyMean("my-very-first-prop")
   471  				require.Nil(t, err)
   472  
   473  				if test.floatCompare {
   474  					assert.InEpsilon(t, actualMean, res, 0.1)
   475  				} else {
   476  					assert.Equal(t, actualMean, res)
   477  				}
   478  				require.Nil(t, tracker.Close())
   479  			})
   480  		}
   481  	})
   482  
   483  	t.Run("test untrack", func(t *testing.T) {
   484  		tracker, err := NewPropertyLengthTracker(trackerPath)
   485  		require.Nil(t, err)
   486  
   487  		tracker.TrackProperty("test-prop", 1)
   488  		tracker.TrackProperty("test-prop", 2)
   489  		tracker.TrackProperty("test-prop", 3)
   490  		tracker.Flush()
   491  
   492  		sum, count, mean, err := tracker.PropertyTally("test-prop")
   493  		require.Nil(t, err)
   494  		assert.Equal(t, 6, sum)
   495  		assert.Equal(t, 3, count)
   496  		assert.InEpsilon(t, 2, mean, 0.1)
   497  
   498  		tracker.UnTrackProperty("test-prop", 2)
   499  		sum, count, mean, err = tracker.PropertyTally("test-prop")
   500  		require.Nil(t, err)
   501  		assert.Equal(t, 4, sum)
   502  		assert.Equal(t, 2, count)
   503  		assert.InEpsilon(t, 2, mean, 0.1)
   504  
   505  		tracker.UnTrackProperty("test-prop", 1)
   506  		sum, count, mean, err = tracker.PropertyTally("test-prop")
   507  		require.Nil(t, err)
   508  		assert.Equal(t, 3, sum)
   509  		assert.Equal(t, 1, count)
   510  		assert.InEpsilon(t, 3, mean, 0.1)
   511  
   512  		require.Nil(t, tracker.Close())
   513  	})
   514  
   515  	t.Run("multiple properties (can all fit on one page)", func(t *testing.T) {
   516  		type prop struct {
   517  			values   []float32
   518  			propName string
   519  		}
   520  
   521  		props := []prop{
   522  			{
   523  				values:   []float32{2, 2, 3, 100, 100, 500, 7},
   524  				propName: "property-numero-uno",
   525  			}, {
   526  				values: []float32{
   527  					1000, 1200, 1000, 1300, 800, 2000, 2050,
   528  					2070, 900,
   529  				},
   530  				propName: "the-second-of-the-properties",
   531  			}, {
   532  				values: []float32{
   533  					60000, 50000, 65000,
   534  				},
   535  				propName: "property_nummer_DREI",
   536  			},
   537  		}
   538  
   539  		// This time we use a single tracker
   540  		tracker, err := NewPropertyLengthTracker(trackerPath)
   541  		require.Nil(t, err)
   542  
   543  		for _, prop := range props {
   544  			for _, v := range prop.values {
   545  				tracker.TrackProperty(prop.propName, v)
   546  			}
   547  		}
   548  
   549  		for _, prop := range props {
   550  			actualMean := float32(0)
   551  			for _, v := range prop.values {
   552  				actualMean += v
   553  			}
   554  			actualMean = actualMean / float32(len(prop.values))
   555  
   556  			res, err := tracker.PropertyMean(prop.propName)
   557  			require.Nil(t, err)
   558  
   559  			assert.InEpsilon(t, actualMean, res, 0.1)
   560  		}
   561  
   562  		require.Nil(t, tracker.Close())
   563  	})
   564  
   565  	t.Run("with more properties that can fit on one page", func(t *testing.T) {
   566  		// This time we use a single tracker
   567  		tracker, err := NewPropertyLengthTracker(trackerPath)
   568  		require.Nil(t, err)
   569  
   570  		create20PropsAndVerify_old(t, tracker)
   571  
   572  		require.Nil(t, tracker.Close())
   573  	})
   574  }
   575  
   576  func TestOldPropertyLengthTracker_Persistence(t *testing.T) {
   577  	dirName := t.TempDir()
   578  
   579  	path := path.Join(dirName, "my_test_shard")
   580  
   581  	var tracker *PropertyLengthTracker
   582  
   583  	t.Run("initializing an empty tracker, no file present", func(t *testing.T) {
   584  		tr, err := NewPropertyLengthTracker(path)
   585  		require.Nil(t, err)
   586  		tracker = tr
   587  	})
   588  
   589  	t.Run("importing multi-page data and verifying", func(t *testing.T) {
   590  		create20PropsAndVerify_old(t, tracker)
   591  	})
   592  
   593  	t.Run("commit the state to disk", func(t *testing.T) {
   594  		require.Nil(t, tracker.Flush())
   595  	})
   596  
   597  	t.Run("shut down the tracker", func(t *testing.T) {
   598  		require.Nil(t, tracker.Close())
   599  	})
   600  
   601  	var secondTracker *PropertyLengthTracker
   602  	t.Run("initializing a new tracker from the same file", func(t *testing.T) {
   603  		tr, err := NewPropertyLengthTracker(path)
   604  		require.Nil(t, err)
   605  		secondTracker = tr
   606  	})
   607  
   608  	t.Run("verify data is correct after read from disk", func(t *testing.T) {
   609  		// root page
   610  		actualMeanForProp0 := float32(1+4+3+17) / 4.0
   611  		res, err := secondTracker.PropertyMean("prop_0")
   612  		require.Nil(t, err)
   613  		assert.InEpsilon(t, actualMeanForProp0, res, 0.1)
   614  
   615  		// later page
   616  		actualMeanForProp20 := float32(1+4+3+17+25) / 5.0
   617  		res, err = secondTracker.PropertyMean("prop_19")
   618  		require.Nil(t, err)
   619  		assert.InEpsilon(t, actualMeanForProp20, res, 0.1)
   620  	})
   621  
   622  	t.Run("shut down the second tracker", func(t *testing.T) {
   623  		require.Nil(t, secondTracker.Close())
   624  	})
   625  }
   626  
   627  func Test_PropertyLengthTracker_Overflow(t *testing.T) {
   628  	dirName := t.TempDir()
   629  	path := path.Join(dirName, "my_test_shard")
   630  
   631  	tracker, err := NewPropertyLengthTracker(path)
   632  	require.Nil(t, err)
   633  
   634  	for i := 0; i < 16*15; i++ {
   635  		err := tracker.TrackProperty(fmt.Sprintf("prop_%v", i), float32(i))
   636  		require.Nil(t, err)
   637  	}
   638  
   639  	// Check that property that would cause the internal counter to overflow is not added
   640  	err = tracker.TrackProperty("OVERFLOW", float32(123))
   641  	require.NotNil(t, err)
   642  
   643  	require.Nil(t, tracker.Close())
   644  }