github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/persistence_integration_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 //go:build integrationTest 13 // +build integrationTest 14 15 package hnsw 16 17 import ( 18 "context" 19 "fmt" 20 "io" 21 "os" 22 "path/filepath" 23 "testing" 24 "time" 25 26 "github.com/sirupsen/logrus/hooks/test" 27 "github.com/stretchr/testify/assert" 28 "github.com/stretchr/testify/require" 29 "github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer" 30 "github.com/weaviate/weaviate/adapters/repos/db/vector/testinghelpers" 31 "github.com/weaviate/weaviate/entities/cyclemanager" 32 ent "github.com/weaviate/weaviate/entities/vectorindex/hnsw" 33 ) 34 35 func TestHnswPersistence(t *testing.T) { 36 dirName := t.TempDir() 37 indexID := "integrationtest" 38 39 logger, _ := test.NewNullLogger() 40 cl, clErr := NewCommitLogger(dirName, indexID, logger, 41 cyclemanager.NewCallbackGroupNoop()) 42 makeCL := func() (CommitLogger, error) { 43 return cl, clErr 44 } 45 index, err := New(Config{ 46 RootPath: dirName, 47 ID: indexID, 48 MakeCommitLoggerThunk: makeCL, 49 DistanceProvider: distancer.NewCosineDistanceProvider(), 50 VectorForIDThunk: testVectorForID, 51 }, ent.UserConfig{ 52 MaxConnections: 30, 53 EFConstruction: 60, 54 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 55 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 56 require.Nil(t, err) 57 58 for i, vec := range testVectors { 59 err := index.Add(uint64(i), vec) 60 require.Nil(t, err) 61 } 62 63 require.Nil(t, index.Flush()) 64 65 // see index_test.go for more context 66 expectedResults := []uint64{ 67 3, 5, 4, // cluster 2 68 7, 8, 6, // cluster 3 69 2, 1, 0, // cluster 1 70 } 71 72 t.Run("verify that the results match originally", func(t *testing.T) { 73 position := 3 74 res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil) 75 require.Nil(t, err) 76 assert.Equal(t, expectedResults, res) 77 }) 78 79 // destroy the index 80 index = nil 81 82 // build a new index from the (uncondensed) commit log 83 secondIndex, err := New(Config{ 84 RootPath: dirName, 85 ID: indexID, 86 MakeCommitLoggerThunk: makeCL, 87 DistanceProvider: distancer.NewCosineDistanceProvider(), 88 VectorForIDThunk: testVectorForID, 89 }, ent.UserConfig{ 90 MaxConnections: 30, 91 EFConstruction: 60, 92 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 93 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 94 require.Nil(t, err) 95 96 t.Run("verify that the results match after rebuilding from disk", 97 func(t *testing.T) { 98 position := 3 99 res, _, err := secondIndex.knnSearchByVector(testVectors[position], 50, 36, nil) 100 require.Nil(t, err) 101 assert.Equal(t, expectedResults, res) 102 }) 103 } 104 105 func TestHnswPersistence_CorruptWAL(t *testing.T) { 106 dirName := t.TempDir() 107 indexID := "integrationtest_corrupt" 108 109 logger, _ := test.NewNullLogger() 110 cl, clErr := NewCommitLogger(dirName, indexID, logger, 111 cyclemanager.NewCallbackGroupNoop()) 112 makeCL := func() (CommitLogger, error) { 113 return cl, clErr 114 } 115 index, err := New(Config{ 116 RootPath: dirName, 117 ID: indexID, 118 MakeCommitLoggerThunk: makeCL, 119 DistanceProvider: distancer.NewCosineDistanceProvider(), 120 VectorForIDThunk: testVectorForID, 121 }, ent.UserConfig{ 122 MaxConnections: 30, 123 EFConstruction: 60, 124 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 125 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 126 require.Nil(t, err) 127 128 for i, vec := range testVectors { 129 err := index.Add(uint64(i), vec) 130 require.Nil(t, err) 131 } 132 133 require.Nil(t, index.Flush()) 134 135 // see index_test.go for more context 136 expectedResults := []uint64{ 137 3, 5, 4, // cluster 2 138 7, 8, 6, // cluster 3 139 2, 1, 0, // cluster 1 140 } 141 142 t.Run("verify that the results match originally", func(t *testing.T) { 143 position := 3 144 res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil) 145 require.Nil(t, err) 146 assert.Equal(t, expectedResults, res) 147 }) 148 149 // destroy the index 150 index.Shutdown(context.Background()) 151 index = nil 152 indexDir := filepath.Join(dirName, "integrationtest_corrupt.hnsw.commitlog.d") 153 154 t.Run("corrupt the commit log on purpose", func(t *testing.T) { 155 res, err := os.ReadDir(indexDir) 156 require.Nil(t, err) 157 require.Len(t, res, 1) 158 fName := filepath.Join(indexDir, res[0].Name()) 159 newFName := filepath.Join(indexDir, fmt.Sprintf("%d", time.Now().Unix())) 160 161 orig, err := os.Open(fName) 162 require.Nil(t, err) 163 164 correctLog, err := io.ReadAll(orig) 165 require.Nil(t, err) 166 err = orig.Close() 167 require.Nil(t, err) 168 169 os.Remove(fName) 170 171 corruptLog := correctLog[:len(correctLog)-6] 172 corrupt, err := os.Create(newFName) 173 require.Nil(t, err) 174 175 _, err = corrupt.Write(corruptLog) 176 require.Nil(t, err) 177 178 err = corrupt.Close() 179 require.Nil(t, err) 180 181 // double check that we only have one file left (the corrupted one) 182 res, err = os.ReadDir(indexDir) 183 require.Nil(t, err) 184 require.Len(t, res, 1) 185 }) 186 187 // build a new index from the (uncondensed, corrupted) commit log 188 secondIndex, err := New(Config{ 189 RootPath: dirName, 190 ID: indexID, 191 MakeCommitLoggerThunk: makeCL, 192 DistanceProvider: distancer.NewCosineDistanceProvider(), 193 VectorForIDThunk: testVectorForID, 194 }, ent.UserConfig{ 195 MaxConnections: 30, 196 EFConstruction: 60, 197 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 198 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 199 require.Nil(t, err) 200 201 // the minor corruption (just one missing link) will most likely not render 202 // the index unusable, so we should still expect to retrieve results as 203 // normal 204 t.Run("verify that the results match after rebuilding from disk", 205 func(t *testing.T) { 206 position := 3 207 res, _, err := secondIndex.knnSearchByVector(testVectors[position], 50, 36, nil) 208 require.Nil(t, err) 209 assert.Equal(t, expectedResults, res) 210 }) 211 } 212 213 func TestHnswPersistence_WithDeletion_WithoutTombstoneCleanup(t *testing.T) { 214 dirName := t.TempDir() 215 indexID := "integrationtest_deletion" 216 logger, _ := test.NewNullLogger() 217 cl, clErr := NewCommitLogger(dirName, indexID, logger, 218 cyclemanager.NewCallbackGroupNoop()) 219 makeCL := func() (CommitLogger, error) { 220 return cl, clErr 221 } 222 index, err := New(Config{ 223 RootPath: dirName, 224 ID: indexID, 225 MakeCommitLoggerThunk: makeCL, 226 DistanceProvider: distancer.NewCosineDistanceProvider(), 227 VectorForIDThunk: testVectorForID, 228 }, ent.UserConfig{ 229 MaxConnections: 30, 230 EFConstruction: 60, 231 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 232 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 233 require.Nil(t, err) 234 235 for i, vec := range testVectors { 236 err := index.Add(uint64(i), vec) 237 require.Nil(t, err) 238 } 239 240 t.Run("delete some elements", func(t *testing.T) { 241 err := index.Delete(6) 242 require.Nil(t, err) 243 err = index.Delete(8) 244 require.Nil(t, err) 245 }) 246 247 // see index_test.go for more context 248 expectedResults := []uint64{ 249 3, 5, 4, // cluster 2 250 7, // cluster 3 with element 6 and 8 deleted 251 2, 1, 0, // cluster 1 252 } 253 254 require.Nil(t, index.Flush()) 255 256 t.Run("verify that the results match originally", func(t *testing.T) { 257 position := 3 258 res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil) 259 require.Nil(t, err) 260 assert.Equal(t, expectedResults, res) 261 }) 262 263 dumpIndex(index, "without_cleanup_original_index_before_storage") 264 265 // destroy the index 266 index = nil 267 268 // build a new index from the (uncondensed) commit log 269 secondIndex, err := New(Config{ 270 RootPath: dirName, 271 ID: indexID, 272 MakeCommitLoggerThunk: makeCL, 273 DistanceProvider: distancer.NewCosineDistanceProvider(), 274 VectorForIDThunk: testVectorForID, 275 }, ent.UserConfig{ 276 MaxConnections: 30, 277 EFConstruction: 60, 278 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 279 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 280 require.Nil(t, err) 281 282 dumpIndex(secondIndex, "without_cleanup_after_rebuild") 283 t.Run("verify that the results match after rebuilding from disk", 284 func(t *testing.T) { 285 position := 3 286 res, _, err := secondIndex.knnSearchByVector(testVectors[position], 50, 36, nil) 287 require.Nil(t, err) 288 assert.Equal(t, expectedResults, res) 289 }) 290 } 291 292 func TestHnswPersistence_WithDeletion_WithTombstoneCleanup(t *testing.T) { 293 dirName := t.TempDir() 294 indexID := "integrationtest_tombstonecleanup" 295 296 logger, _ := test.NewNullLogger() 297 makeCL := func() (CommitLogger, error) { 298 return NewCommitLogger(dirName, indexID, logger, 299 cyclemanager.NewCallbackGroupNoop()) 300 } 301 index, err := New(Config{ 302 RootPath: dirName, 303 ID: indexID, 304 MakeCommitLoggerThunk: makeCL, 305 DistanceProvider: distancer.NewCosineDistanceProvider(), 306 VectorForIDThunk: testVectorForID, 307 }, ent.UserConfig{ 308 MaxConnections: 30, 309 EFConstruction: 60, 310 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 311 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 312 require.Nil(t, err) 313 314 for i, vec := range testVectors { 315 err := index.Add(uint64(i), vec) 316 require.Nil(t, err) 317 } 318 dumpIndex(index, "with cleanup after import") 319 require.Nil(t, index.Flush()) 320 321 t.Run("delete some elements and permanently delete tombstoned elements", 322 func(t *testing.T) { 323 err := index.Delete(6) 324 require.Nil(t, err) 325 err = index.Delete(8) 326 require.Nil(t, err) 327 328 err = index.CleanUpTombstonedNodes(neverStop) 329 require.Nil(t, err) 330 }) 331 332 dumpIndex(index, "with cleanup after delete") 333 334 require.Nil(t, index.Flush()) 335 336 // see index_test.go for more context 337 expectedResults := []uint64{ 338 3, 5, 4, // cluster 2 339 7, // cluster 3 with element 6 and 8 deleted 340 2, 1, 0, // cluster 1 341 } 342 343 t.Run("verify that the results match originally", func(t *testing.T) { 344 position := 3 345 res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil) 346 require.Nil(t, err) 347 assert.Equal(t, expectedResults, res) 348 }) 349 350 // destroy the index 351 index.Shutdown(context.Background()) 352 index = nil 353 354 // build a new index from the (uncondensed) commit log 355 secondIndex, err := New(Config{ 356 RootPath: dirName, 357 ID: indexID, 358 MakeCommitLoggerThunk: makeCL, 359 DistanceProvider: distancer.NewCosineDistanceProvider(), 360 VectorForIDThunk: testVectorForID, 361 }, ent.UserConfig{ 362 MaxConnections: 30, 363 EFConstruction: 60, 364 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 365 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 366 require.Nil(t, err) 367 dumpIndex(secondIndex, "with cleanup second index") 368 369 t.Run("verify that the results match after rebuilding from disk", 370 func(t *testing.T) { 371 position := 3 372 res, _, err := secondIndex.knnSearchByVector(testVectors[position], 50, 36, nil) 373 require.Nil(t, err) 374 assert.Equal(t, expectedResults, res) 375 }) 376 377 t.Run("further deleting all elements and reimporting one", func(t *testing.T) { 378 toDelete := []uint64{0, 1, 2, 3, 4, 5, 7} 379 380 for _, id := range toDelete { 381 err := secondIndex.Delete(id) 382 require.Nil(t, err) 383 } 384 385 err = secondIndex.CleanUpTombstonedNodes(neverStop) 386 require.Nil(t, err) 387 388 err := secondIndex.Add(3, testVectors[3]) 389 require.Nil(t, err) 390 }) 391 392 require.Nil(t, secondIndex.Flush()) 393 394 dumpIndex(secondIndex) 395 396 secondIndex.Shutdown(context.Background()) 397 secondIndex = nil 398 399 // build a new index from the (uncondensed) commit log 400 thirdIndex, err := New(Config{ 401 RootPath: dirName, 402 ID: indexID, 403 MakeCommitLoggerThunk: makeCL, 404 DistanceProvider: distancer.NewCosineDistanceProvider(), 405 VectorForIDThunk: testVectorForID, 406 }, ent.UserConfig{ 407 MaxConnections: 30, 408 EFConstruction: 60, 409 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 410 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 411 require.Nil(t, err) 412 413 dumpIndex(thirdIndex) 414 415 t.Run("verify that the results match after rebuilding from disk", 416 func(t *testing.T) { 417 position := 3 418 res, _, err := thirdIndex.knnSearchByVector(testVectors[position], 50, 36, nil) 419 require.Nil(t, err) 420 assert.Equal(t, []uint64{3}, res) 421 }) 422 423 t.Run("delete all elements so the commitlog ends with an empty graph", func(t *testing.T) { 424 toDelete := []uint64{3} 425 426 for _, id := range toDelete { 427 err := thirdIndex.Delete(id) 428 require.Nil(t, err) 429 } 430 431 err = thirdIndex.CleanUpTombstonedNodes(neverStop) 432 require.Nil(t, err) 433 }) 434 435 require.Nil(t, thirdIndex.Flush()) 436 437 thirdIndex.Shutdown(context.Background()) 438 thirdIndex = nil 439 // build a new index from the (uncondensed) commit log 440 fourthIndex, err := New(Config{ 441 RootPath: dirName, 442 ID: indexID, 443 MakeCommitLoggerThunk: makeCL, 444 DistanceProvider: distancer.NewCosineDistanceProvider(), 445 VectorForIDThunk: testVectorForID, 446 }, ent.UserConfig{ 447 MaxConnections: 30, 448 EFConstruction: 60, 449 }, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), 450 cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t)) 451 require.Nil(t, err) 452 453 t.Run("load from disk and try to insert again", func(t *testing.T) { 454 for i, vec := range testVectors { 455 err := fourthIndex.Add(uint64(i), vec) 456 require.Nil(t, err) 457 } 458 }) 459 460 t.Run("verify that searching works normally", func(t *testing.T) { 461 expectedResults := []uint64{ 462 3, 5, 4, // cluster 2 463 7, 8, 6, // cluster 3 with element 6 and 8 deleted 464 2, 1, 0, // cluster 1 465 } 466 position := 3 467 res, _, err := fourthIndex.knnSearchByVector(testVectors[position], 50, 36, nil) 468 require.Nil(t, err) 469 assert.Equal(t, expectedResults, res) 470 }) 471 472 fourthIndex.Shutdown(context.Background()) 473 }