github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/persistence_integration_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  //go:build integrationTest
    13  // +build integrationTest
    14  
    15  package hnsw
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"io"
    21  	"os"
    22  	"path/filepath"
    23  	"testing"
    24  	"time"
    25  
    26  	"github.com/sirupsen/logrus/hooks/test"
    27  	"github.com/stretchr/testify/assert"
    28  	"github.com/stretchr/testify/require"
    29  	"github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer"
    30  	"github.com/weaviate/weaviate/adapters/repos/db/vector/testinghelpers"
    31  	"github.com/weaviate/weaviate/entities/cyclemanager"
    32  	ent "github.com/weaviate/weaviate/entities/vectorindex/hnsw"
    33  )
    34  
    35  func TestHnswPersistence(t *testing.T) {
    36  	dirName := t.TempDir()
    37  	indexID := "integrationtest"
    38  
    39  	logger, _ := test.NewNullLogger()
    40  	cl, clErr := NewCommitLogger(dirName, indexID, logger,
    41  		cyclemanager.NewCallbackGroupNoop())
    42  	makeCL := func() (CommitLogger, error) {
    43  		return cl, clErr
    44  	}
    45  	index, err := New(Config{
    46  		RootPath:              dirName,
    47  		ID:                    indexID,
    48  		MakeCommitLoggerThunk: makeCL,
    49  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
    50  		VectorForIDThunk:      testVectorForID,
    51  	}, ent.UserConfig{
    52  		MaxConnections: 30,
    53  		EFConstruction: 60,
    54  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
    55  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
    56  	require.Nil(t, err)
    57  
    58  	for i, vec := range testVectors {
    59  		err := index.Add(uint64(i), vec)
    60  		require.Nil(t, err)
    61  	}
    62  
    63  	require.Nil(t, index.Flush())
    64  
    65  	// see index_test.go for more context
    66  	expectedResults := []uint64{
    67  		3, 5, 4, // cluster 2
    68  		7, 8, 6, // cluster 3
    69  		2, 1, 0, // cluster 1
    70  	}
    71  
    72  	t.Run("verify that the results match originally", func(t *testing.T) {
    73  		position := 3
    74  		res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil)
    75  		require.Nil(t, err)
    76  		assert.Equal(t, expectedResults, res)
    77  	})
    78  
    79  	// destroy the index
    80  	index = nil
    81  
    82  	// build a new index from the (uncondensed) commit log
    83  	secondIndex, err := New(Config{
    84  		RootPath:              dirName,
    85  		ID:                    indexID,
    86  		MakeCommitLoggerThunk: makeCL,
    87  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
    88  		VectorForIDThunk:      testVectorForID,
    89  	}, ent.UserConfig{
    90  		MaxConnections: 30,
    91  		EFConstruction: 60,
    92  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
    93  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
    94  	require.Nil(t, err)
    95  
    96  	t.Run("verify that the results match after rebuilding from disk",
    97  		func(t *testing.T) {
    98  			position := 3
    99  			res, _, err := secondIndex.knnSearchByVector(testVectors[position], 50, 36, nil)
   100  			require.Nil(t, err)
   101  			assert.Equal(t, expectedResults, res)
   102  		})
   103  }
   104  
   105  func TestHnswPersistence_CorruptWAL(t *testing.T) {
   106  	dirName := t.TempDir()
   107  	indexID := "integrationtest_corrupt"
   108  
   109  	logger, _ := test.NewNullLogger()
   110  	cl, clErr := NewCommitLogger(dirName, indexID, logger,
   111  		cyclemanager.NewCallbackGroupNoop())
   112  	makeCL := func() (CommitLogger, error) {
   113  		return cl, clErr
   114  	}
   115  	index, err := New(Config{
   116  		RootPath:              dirName,
   117  		ID:                    indexID,
   118  		MakeCommitLoggerThunk: makeCL,
   119  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
   120  		VectorForIDThunk:      testVectorForID,
   121  	}, ent.UserConfig{
   122  		MaxConnections: 30,
   123  		EFConstruction: 60,
   124  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
   125  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
   126  	require.Nil(t, err)
   127  
   128  	for i, vec := range testVectors {
   129  		err := index.Add(uint64(i), vec)
   130  		require.Nil(t, err)
   131  	}
   132  
   133  	require.Nil(t, index.Flush())
   134  
   135  	// see index_test.go for more context
   136  	expectedResults := []uint64{
   137  		3, 5, 4, // cluster 2
   138  		7, 8, 6, // cluster 3
   139  		2, 1, 0, // cluster 1
   140  	}
   141  
   142  	t.Run("verify that the results match originally", func(t *testing.T) {
   143  		position := 3
   144  		res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil)
   145  		require.Nil(t, err)
   146  		assert.Equal(t, expectedResults, res)
   147  	})
   148  
   149  	// destroy the index
   150  	index.Shutdown(context.Background())
   151  	index = nil
   152  	indexDir := filepath.Join(dirName, "integrationtest_corrupt.hnsw.commitlog.d")
   153  
   154  	t.Run("corrupt the commit log on purpose", func(t *testing.T) {
   155  		res, err := os.ReadDir(indexDir)
   156  		require.Nil(t, err)
   157  		require.Len(t, res, 1)
   158  		fName := filepath.Join(indexDir, res[0].Name())
   159  		newFName := filepath.Join(indexDir, fmt.Sprintf("%d", time.Now().Unix()))
   160  
   161  		orig, err := os.Open(fName)
   162  		require.Nil(t, err)
   163  
   164  		correctLog, err := io.ReadAll(orig)
   165  		require.Nil(t, err)
   166  		err = orig.Close()
   167  		require.Nil(t, err)
   168  
   169  		os.Remove(fName)
   170  
   171  		corruptLog := correctLog[:len(correctLog)-6]
   172  		corrupt, err := os.Create(newFName)
   173  		require.Nil(t, err)
   174  
   175  		_, err = corrupt.Write(corruptLog)
   176  		require.Nil(t, err)
   177  
   178  		err = corrupt.Close()
   179  		require.Nil(t, err)
   180  
   181  		// double check that we only have one file left (the corrupted one)
   182  		res, err = os.ReadDir(indexDir)
   183  		require.Nil(t, err)
   184  		require.Len(t, res, 1)
   185  	})
   186  
   187  	// build a new index from the (uncondensed, corrupted) commit log
   188  	secondIndex, err := New(Config{
   189  		RootPath:              dirName,
   190  		ID:                    indexID,
   191  		MakeCommitLoggerThunk: makeCL,
   192  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
   193  		VectorForIDThunk:      testVectorForID,
   194  	}, ent.UserConfig{
   195  		MaxConnections: 30,
   196  		EFConstruction: 60,
   197  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
   198  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
   199  	require.Nil(t, err)
   200  
   201  	// the minor corruption (just one missing link) will most likely not render
   202  	// the index unusable, so we should still expect to retrieve results as
   203  	// normal
   204  	t.Run("verify that the results match after rebuilding from disk",
   205  		func(t *testing.T) {
   206  			position := 3
   207  			res, _, err := secondIndex.knnSearchByVector(testVectors[position], 50, 36, nil)
   208  			require.Nil(t, err)
   209  			assert.Equal(t, expectedResults, res)
   210  		})
   211  }
   212  
   213  func TestHnswPersistence_WithDeletion_WithoutTombstoneCleanup(t *testing.T) {
   214  	dirName := t.TempDir()
   215  	indexID := "integrationtest_deletion"
   216  	logger, _ := test.NewNullLogger()
   217  	cl, clErr := NewCommitLogger(dirName, indexID, logger,
   218  		cyclemanager.NewCallbackGroupNoop())
   219  	makeCL := func() (CommitLogger, error) {
   220  		return cl, clErr
   221  	}
   222  	index, err := New(Config{
   223  		RootPath:              dirName,
   224  		ID:                    indexID,
   225  		MakeCommitLoggerThunk: makeCL,
   226  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
   227  		VectorForIDThunk:      testVectorForID,
   228  	}, ent.UserConfig{
   229  		MaxConnections: 30,
   230  		EFConstruction: 60,
   231  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
   232  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
   233  	require.Nil(t, err)
   234  
   235  	for i, vec := range testVectors {
   236  		err := index.Add(uint64(i), vec)
   237  		require.Nil(t, err)
   238  	}
   239  
   240  	t.Run("delete some elements", func(t *testing.T) {
   241  		err := index.Delete(6)
   242  		require.Nil(t, err)
   243  		err = index.Delete(8)
   244  		require.Nil(t, err)
   245  	})
   246  
   247  	// see index_test.go for more context
   248  	expectedResults := []uint64{
   249  		3, 5, 4, // cluster 2
   250  		7,       // cluster 3 with element 6 and 8 deleted
   251  		2, 1, 0, // cluster 1
   252  	}
   253  
   254  	require.Nil(t, index.Flush())
   255  
   256  	t.Run("verify that the results match originally", func(t *testing.T) {
   257  		position := 3
   258  		res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil)
   259  		require.Nil(t, err)
   260  		assert.Equal(t, expectedResults, res)
   261  	})
   262  
   263  	dumpIndex(index, "without_cleanup_original_index_before_storage")
   264  
   265  	// destroy the index
   266  	index = nil
   267  
   268  	// build a new index from the (uncondensed) commit log
   269  	secondIndex, err := New(Config{
   270  		RootPath:              dirName,
   271  		ID:                    indexID,
   272  		MakeCommitLoggerThunk: makeCL,
   273  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
   274  		VectorForIDThunk:      testVectorForID,
   275  	}, ent.UserConfig{
   276  		MaxConnections: 30,
   277  		EFConstruction: 60,
   278  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
   279  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
   280  	require.Nil(t, err)
   281  
   282  	dumpIndex(secondIndex, "without_cleanup_after_rebuild")
   283  	t.Run("verify that the results match after rebuilding from disk",
   284  		func(t *testing.T) {
   285  			position := 3
   286  			res, _, err := secondIndex.knnSearchByVector(testVectors[position], 50, 36, nil)
   287  			require.Nil(t, err)
   288  			assert.Equal(t, expectedResults, res)
   289  		})
   290  }
   291  
   292  func TestHnswPersistence_WithDeletion_WithTombstoneCleanup(t *testing.T) {
   293  	dirName := t.TempDir()
   294  	indexID := "integrationtest_tombstonecleanup"
   295  
   296  	logger, _ := test.NewNullLogger()
   297  	makeCL := func() (CommitLogger, error) {
   298  		return NewCommitLogger(dirName, indexID, logger,
   299  			cyclemanager.NewCallbackGroupNoop())
   300  	}
   301  	index, err := New(Config{
   302  		RootPath:              dirName,
   303  		ID:                    indexID,
   304  		MakeCommitLoggerThunk: makeCL,
   305  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
   306  		VectorForIDThunk:      testVectorForID,
   307  	}, ent.UserConfig{
   308  		MaxConnections: 30,
   309  		EFConstruction: 60,
   310  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
   311  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
   312  	require.Nil(t, err)
   313  
   314  	for i, vec := range testVectors {
   315  		err := index.Add(uint64(i), vec)
   316  		require.Nil(t, err)
   317  	}
   318  	dumpIndex(index, "with cleanup after import")
   319  	require.Nil(t, index.Flush())
   320  
   321  	t.Run("delete some elements and permanently delete tombstoned elements",
   322  		func(t *testing.T) {
   323  			err := index.Delete(6)
   324  			require.Nil(t, err)
   325  			err = index.Delete(8)
   326  			require.Nil(t, err)
   327  
   328  			err = index.CleanUpTombstonedNodes(neverStop)
   329  			require.Nil(t, err)
   330  		})
   331  
   332  	dumpIndex(index, "with cleanup after delete")
   333  
   334  	require.Nil(t, index.Flush())
   335  
   336  	// see index_test.go for more context
   337  	expectedResults := []uint64{
   338  		3, 5, 4, // cluster 2
   339  		7,       // cluster 3 with element 6 and 8 deleted
   340  		2, 1, 0, // cluster 1
   341  	}
   342  
   343  	t.Run("verify that the results match originally", func(t *testing.T) {
   344  		position := 3
   345  		res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil)
   346  		require.Nil(t, err)
   347  		assert.Equal(t, expectedResults, res)
   348  	})
   349  
   350  	// destroy the index
   351  	index.Shutdown(context.Background())
   352  	index = nil
   353  
   354  	// build a new index from the (uncondensed) commit log
   355  	secondIndex, err := New(Config{
   356  		RootPath:              dirName,
   357  		ID:                    indexID,
   358  		MakeCommitLoggerThunk: makeCL,
   359  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
   360  		VectorForIDThunk:      testVectorForID,
   361  	}, ent.UserConfig{
   362  		MaxConnections: 30,
   363  		EFConstruction: 60,
   364  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
   365  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
   366  	require.Nil(t, err)
   367  	dumpIndex(secondIndex, "with cleanup second index")
   368  
   369  	t.Run("verify that the results match after rebuilding from disk",
   370  		func(t *testing.T) {
   371  			position := 3
   372  			res, _, err := secondIndex.knnSearchByVector(testVectors[position], 50, 36, nil)
   373  			require.Nil(t, err)
   374  			assert.Equal(t, expectedResults, res)
   375  		})
   376  
   377  	t.Run("further deleting all elements and reimporting one", func(t *testing.T) {
   378  		toDelete := []uint64{0, 1, 2, 3, 4, 5, 7}
   379  
   380  		for _, id := range toDelete {
   381  			err := secondIndex.Delete(id)
   382  			require.Nil(t, err)
   383  		}
   384  
   385  		err = secondIndex.CleanUpTombstonedNodes(neverStop)
   386  		require.Nil(t, err)
   387  
   388  		err := secondIndex.Add(3, testVectors[3])
   389  		require.Nil(t, err)
   390  	})
   391  
   392  	require.Nil(t, secondIndex.Flush())
   393  
   394  	dumpIndex(secondIndex)
   395  
   396  	secondIndex.Shutdown(context.Background())
   397  	secondIndex = nil
   398  
   399  	// build a new index from the (uncondensed) commit log
   400  	thirdIndex, err := New(Config{
   401  		RootPath:              dirName,
   402  		ID:                    indexID,
   403  		MakeCommitLoggerThunk: makeCL,
   404  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
   405  		VectorForIDThunk:      testVectorForID,
   406  	}, ent.UserConfig{
   407  		MaxConnections: 30,
   408  		EFConstruction: 60,
   409  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
   410  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
   411  	require.Nil(t, err)
   412  
   413  	dumpIndex(thirdIndex)
   414  
   415  	t.Run("verify that the results match after rebuilding from disk",
   416  		func(t *testing.T) {
   417  			position := 3
   418  			res, _, err := thirdIndex.knnSearchByVector(testVectors[position], 50, 36, nil)
   419  			require.Nil(t, err)
   420  			assert.Equal(t, []uint64{3}, res)
   421  		})
   422  
   423  	t.Run("delete all elements so the commitlog ends with an empty graph", func(t *testing.T) {
   424  		toDelete := []uint64{3}
   425  
   426  		for _, id := range toDelete {
   427  			err := thirdIndex.Delete(id)
   428  			require.Nil(t, err)
   429  		}
   430  
   431  		err = thirdIndex.CleanUpTombstonedNodes(neverStop)
   432  		require.Nil(t, err)
   433  	})
   434  
   435  	require.Nil(t, thirdIndex.Flush())
   436  
   437  	thirdIndex.Shutdown(context.Background())
   438  	thirdIndex = nil
   439  	// build a new index from the (uncondensed) commit log
   440  	fourthIndex, err := New(Config{
   441  		RootPath:              dirName,
   442  		ID:                    indexID,
   443  		MakeCommitLoggerThunk: makeCL,
   444  		DistanceProvider:      distancer.NewCosineDistanceProvider(),
   445  		VectorForIDThunk:      testVectorForID,
   446  	}, ent.UserConfig{
   447  		MaxConnections: 30,
   448  		EFConstruction: 60,
   449  	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
   450  		cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
   451  	require.Nil(t, err)
   452  
   453  	t.Run("load from disk and try to insert again", func(t *testing.T) {
   454  		for i, vec := range testVectors {
   455  			err := fourthIndex.Add(uint64(i), vec)
   456  			require.Nil(t, err)
   457  		}
   458  	})
   459  
   460  	t.Run("verify that searching works normally", func(t *testing.T) {
   461  		expectedResults := []uint64{
   462  			3, 5, 4, // cluster 2
   463  			7, 8, 6, // cluster 3 with element 6 and 8 deleted
   464  			2, 1, 0, // cluster 1
   465  		}
   466  		position := 3
   467  		res, _, err := fourthIndex.knnSearchByVector(testVectors[position], 50, 36, nil)
   468  		require.Nil(t, err)
   469  		assert.Equal(t, expectedResults, res)
   470  	})
   471  
   472  	fourthIndex.Shutdown(context.Background())
   473  }