github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/batch_integration_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 //go:build integrationTest 13 // +build integrationTest 14 15 package db 16 17 import ( 18 "context" 19 "fmt" 20 "math/rand" 21 "sort" 22 "testing" 23 "time" 24 25 "github.com/go-openapi/strfmt" 26 "github.com/google/uuid" 27 "github.com/sirupsen/logrus" 28 "github.com/stretchr/testify/assert" 29 "github.com/stretchr/testify/require" 30 "github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer" 31 "github.com/weaviate/weaviate/entities/dto" 32 "github.com/weaviate/weaviate/entities/filters" 33 "github.com/weaviate/weaviate/entities/models" 34 "github.com/weaviate/weaviate/entities/schema" 35 "github.com/weaviate/weaviate/entities/search" 36 enthnsw "github.com/weaviate/weaviate/entities/vectorindex/hnsw" 37 "github.com/weaviate/weaviate/usecases/objects" 38 ) 39 40 func TestBatchPutObjectsWithDimensions(t *testing.T) { 41 dirName := t.TempDir() 42 43 logger := logrus.New() 44 schemaGetter := &fakeSchemaGetter{ 45 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 46 shardState: singleShardState(), 47 } 48 repo, err := New(logger, Config{ 49 MemtablesFlushDirtyAfter: 60, 50 RootPath: dirName, 51 QueryMaximumResults: 10000, 52 MaxImportGoroutinesFactor: 1, 53 TrackVectorDimensions: true, 54 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil) 55 require.Nil(t, err) 56 repo.SetSchemaGetter(schemaGetter) 57 require.Nil(t, repo.WaitForStartup(testCtx())) 58 59 defer func() { 60 require.Nil(t, repo.Shutdown(context.Background())) 61 }() 62 migrator := NewMigrator(repo, logger) 63 64 t.Run("creating the thing class", testAddBatchObjectClass(repo, migrator, schemaGetter)) 65 66 dimBefore := GetDimensionsFromRepo(repo, "ThingForBatching") 67 require.Equal(t, 0, dimBefore, "Dimensions are empty before import") 68 69 simpleInsertObjects(t, repo, "ThingForBatching", 123) 70 71 dimAfter := GetDimensionsFromRepo(repo, "ThingForBatching") 72 require.Equal(t, 369, dimAfter, "Dimensions are present after import") 73 } 74 75 func TestBatchPutObjects(t *testing.T) { 76 dirName := t.TempDir() 77 78 logger := logrus.New() 79 schemaGetter := &fakeSchemaGetter{ 80 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 81 shardState: singleShardState(), 82 } 83 repo, err := New(logger, Config{ 84 MemtablesFlushDirtyAfter: 60, 85 RootPath: dirName, 86 QueryMaximumResults: 10000, 87 MaxImportGoroutinesFactor: 1, 88 TrackVectorDimensions: true, 89 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil) 90 require.Nil(t, err) 91 repo.SetSchemaGetter(schemaGetter) 92 require.Nil(t, repo.WaitForStartup(testCtx())) 93 94 defer func() { 95 require.Nil(t, repo.Shutdown(context.Background())) 96 }() 97 migrator := NewMigrator(repo, logger) 98 99 t.Run("creating the thing class", testAddBatchObjectClass(repo, migrator, schemaGetter)) 100 101 t.Run("batch import things", testBatchImportObjects(repo)) 102 t.Run("batch import things with geo props", testBatchImportGeoObjects(repo)) 103 } 104 105 func TestBatchPutObjectsNoVectorsWithDimensions(t *testing.T) { 106 dirName := t.TempDir() 107 108 logger := logrus.New() 109 schemaGetter := &fakeSchemaGetter{ 110 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 111 shardState: singleShardState(), 112 } 113 repo, err := New(logger, Config{ 114 MemtablesFlushDirtyAfter: 60, 115 RootPath: dirName, 116 QueryMaximumResults: 10000, 117 MaxImportGoroutinesFactor: 1, 118 TrackVectorDimensions: true, 119 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil) 120 require.Nil(t, err) 121 repo.SetSchemaGetter(schemaGetter) 122 require.Nil(t, repo.WaitForStartup(testCtx())) 123 124 defer func() { 125 require.Nil(t, repo.Shutdown(context.Background())) 126 }() 127 migrator := NewMigrator(repo, logger) 128 129 t.Run("creating the thing class", testAddBatchObjectClass(repo, migrator, 130 schemaGetter)) 131 132 dimensions := GetDimensionsFromRepo(repo, "ThingForBatching") 133 require.Equal(t, 0, dimensions, "Dimensions are empty before import") 134 135 t.Run("batch import things", testBatchImportObjectsNoVector(repo)) 136 137 dimAfter := GetDimensionsFromRepo(repo, "ThingForBatching") 138 require.Equal(t, 0, dimAfter, "Dimensions are empty after import (no vectors in import)") 139 } 140 141 func TestBatchPutObjectsNoVectors(t *testing.T) { 142 dirName := t.TempDir() 143 144 logger := logrus.New() 145 schemaGetter := &fakeSchemaGetter{ 146 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 147 shardState: singleShardState(), 148 } 149 repo, err := New(logger, Config{ 150 MemtablesFlushDirtyAfter: 60, 151 RootPath: dirName, 152 QueryMaximumResults: 10000, 153 MaxImportGoroutinesFactor: 1, 154 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil) 155 require.Nil(t, err) 156 repo.SetSchemaGetter(schemaGetter) 157 require.Nil(t, repo.WaitForStartup(testCtx())) 158 159 defer func() { 160 require.Nil(t, repo.Shutdown(context.Background())) 161 }() 162 migrator := NewMigrator(repo, logger) 163 164 t.Run("creating the thing class", testAddBatchObjectClass(repo, migrator, schemaGetter)) 165 166 t.Run("batch import things", testBatchImportObjectsNoVector(repo)) 167 } 168 169 func TestBatchDeleteObjectsWithDimensions(t *testing.T) { 170 className := "ThingForBatching" 171 dirName := t.TempDir() 172 173 logger := logrus.New() 174 schemaGetter := &fakeSchemaGetter{ 175 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 176 shardState: singleShardState(), 177 } 178 repo, err := New(logger, Config{ 179 MemtablesFlushDirtyAfter: 1, 180 RootPath: dirName, 181 QueryMaximumResults: 10000, 182 MaxImportGoroutinesFactor: 1, 183 TrackVectorDimensions: true, 184 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil) 185 require.Nil(t, err) 186 repo.SetSchemaGetter(schemaGetter) 187 require.Nil(t, repo.WaitForStartup(testCtx())) 188 defer func() { 189 require.Nil(t, repo.Shutdown(context.Background())) 190 }() 191 192 migrator := NewMigrator(repo, logger) 193 194 t.Run("creating the test class", testAddBatchObjectClass(repo, migrator, schemaGetter)) 195 196 dimBefore := GetDimensionsFromRepo(repo, className) 197 require.Equal(t, 0, dimBefore, "Dimensions are empty before import") 198 199 simpleInsertObjects(t, repo, className, 103) 200 201 dimAfter := GetDimensionsFromRepo(repo, className) 202 require.Equal(t, 309, dimAfter, "Dimensions are present before delete") 203 204 delete2Objects(t, repo, className) 205 206 dimFinal := GetDimensionsFromRepo(repo, className) 207 require.Equal(t, 303, dimFinal, "2 objects have been deleted") 208 } 209 210 func delete2Objects(t *testing.T, repo *DB, className string) { 211 batchDeleteRes, err := repo.BatchDeleteObjects(context.Background(), objects.BatchDeleteParams{ 212 ClassName: "ThingForBatching", 213 Filters: &filters.LocalFilter{ 214 Root: &filters.Clause{ 215 Operator: filters.OperatorOr, 216 Operands: []filters.Clause{ 217 { 218 Operator: filters.OperatorEqual, 219 On: &filters.Path{ 220 Class: "ThingForBatching", 221 Property: schema.PropertyName("id"), 222 }, 223 Value: &filters.Value{ 224 Value: "8d5a3aa2-3c8d-4589-9ae1-3f638f506003", 225 Type: schema.DataTypeText, 226 }, 227 }, 228 { 229 Operator: filters.OperatorEqual, 230 On: &filters.Path{ 231 Class: "ThingForBatching", 232 Property: schema.PropertyName("id"), 233 }, 234 Value: &filters.Value{ 235 Value: "8d5a3aa2-3c8d-4589-9ae1-3f638f506004", 236 Type: schema.DataTypeText, 237 }, 238 }, 239 }, 240 }, 241 }, 242 DryRun: false, 243 Output: "verbose", 244 }, nil, "") 245 require.Nil(t, err) 246 require.Equal(t, 2, len(batchDeleteRes.Objects), "Objects deleted") 247 } 248 249 func TestBatchDeleteObjects(t *testing.T) { 250 dirName := t.TempDir() 251 252 logger := logrus.New() 253 schemaGetter := &fakeSchemaGetter{ 254 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 255 shardState: singleShardState(), 256 } 257 repo, err := New(logger, Config{ 258 MemtablesFlushDirtyAfter: 60, 259 RootPath: dirName, 260 QueryMaximumResults: 10000, 261 MaxImportGoroutinesFactor: 1, 262 TrackVectorDimensions: true, 263 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil) 264 require.Nil(t, err) 265 repo.SetSchemaGetter(schemaGetter) 266 require.Nil(t, repo.WaitForStartup(testCtx())) 267 defer func() { 268 require.Nil(t, repo.Shutdown(context.Background())) 269 }() 270 migrator := NewMigrator(repo, logger) 271 272 t.Run("creating the thing class", testAddBatchObjectClass(repo, migrator, schemaGetter)) 273 274 t.Run("batch import things", testBatchImportObjects(repo)) 275 276 t.Run("batch delete things", testBatchDeleteObjects(repo)) 277 } 278 279 func TestBatchDeleteObjects_JourneyWithDimensions(t *testing.T) { 280 dirName := t.TempDir() 281 282 queryMaximumResults := int64(200) 283 logger := logrus.New() 284 schemaGetter := &fakeSchemaGetter{ 285 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 286 shardState: singleShardState(), 287 } 288 repo, err := New(logger, Config{ 289 MemtablesFlushDirtyAfter: 60, 290 RootPath: dirName, 291 QueryMaximumResults: queryMaximumResults, 292 MaxImportGoroutinesFactor: 1, 293 TrackVectorDimensions: true, 294 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil) 295 require.Nil(t, err) 296 repo.SetSchemaGetter(schemaGetter) 297 require.Nil(t, repo.WaitForStartup(testCtx())) 298 defer func() { 299 require.Nil(t, repo.Shutdown(context.Background())) 300 }() 301 migrator := NewMigrator(repo, logger) 302 303 t.Run("creating the thing class", testAddBatchObjectClass(repo, migrator, schemaGetter)) 304 305 dimBefore := GetDimensionsFromRepo(repo, "ThingForBatching") 306 require.Equal(t, 0, dimBefore, "Dimensions are empty before import") 307 308 simpleInsertObjects(t, repo, "ThingForBatching", 103) 309 310 dimAfter := GetDimensionsFromRepo(repo, "ThingForBatching") 311 require.Equal(t, 309, dimAfter, "Dimensions are present before delete") 312 313 delete2Objects(t, repo, "ThingForBatching") 314 315 dimFinal := GetDimensionsFromRepo(repo, "ThingForBatching") 316 require.Equal(t, 303, dimFinal, "Dimensions have been deleted") 317 } 318 319 func TestBatchDeleteObjects_Journey(t *testing.T) { 320 dirName := t.TempDir() 321 322 queryMaximumResults := int64(20) 323 logger := logrus.New() 324 schemaGetter := &fakeSchemaGetter{ 325 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 326 shardState: singleShardState(), 327 } 328 repo, err := New(logger, Config{ 329 MemtablesFlushDirtyAfter: 60, 330 RootPath: dirName, 331 QueryMaximumResults: queryMaximumResults, 332 MaxImportGoroutinesFactor: 1, 333 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil) 334 require.Nil(t, err) 335 repo.SetSchemaGetter(schemaGetter) 336 require.Nil(t, repo.WaitForStartup(testCtx())) 337 defer func() { 338 require.Nil(t, repo.Shutdown(context.Background())) 339 }() 340 migrator := NewMigrator(repo, logger) 341 342 t.Run("creating the thing class", testAddBatchObjectClass(repo, migrator, 343 schemaGetter)) 344 t.Run("batch import things", testBatchImportObjects(repo)) 345 t.Run("batch delete journey things", testBatchDeleteObjectsJourney(repo, queryMaximumResults)) 346 } 347 348 func testAddBatchObjectClass(repo *DB, migrator *Migrator, 349 schemaGetter *fakeSchemaGetter, 350 ) func(t *testing.T) { 351 return func(t *testing.T) { 352 class := &models.Class{ 353 Class: "ThingForBatching", 354 VectorIndexConfig: enthnsw.NewDefaultUserConfig(), 355 InvertedIndexConfig: invertedConfig(), 356 Properties: []*models.Property{ 357 { 358 Name: "stringProp", 359 DataType: schema.DataTypeText.PropString(), 360 Tokenization: models.PropertyTokenizationWhitespace, 361 }, 362 { 363 Name: "location", 364 DataType: []string{string(schema.DataTypeGeoCoordinates)}, 365 }, 366 }, 367 } 368 369 require.Nil(t, migrator.AddClass(context.Background(), class, schemaGetter.shardState)) 370 371 schemaGetter.schema.Objects = &models.Schema{ 372 Classes: []*models.Class{class}, 373 } 374 } 375 } 376 377 func testBatchImportObjectsNoVector(repo *DB) func(t *testing.T) { 378 return func(t *testing.T) { 379 t.Run("with a prior validation error, but nothing to cause errors in the db", func(t *testing.T) { 380 batch := objects.BatchObjects{ 381 objects.BatchObject{ 382 OriginalIndex: 0, 383 Err: nil, 384 Object: &models.Object{ 385 Class: "ThingForBatching", 386 Properties: map[string]interface{}{ 387 "stringProp": "first element", 388 }, 389 ID: "8d5a3aa2-3c8d-4589-9ae1-3f638f506970", 390 }, 391 UUID: "8d5a3aa2-3c8d-4589-9ae1-3f638f506970", 392 }, 393 objects.BatchObject{ 394 OriginalIndex: 1, 395 Err: fmt.Errorf("already has a validation error"), 396 Object: &models.Object{ 397 Class: "ThingForBatching", 398 Properties: map[string]interface{}{ 399 "stringProp": "second element", 400 }, 401 ID: "86a380e9-cb60-4b2a-bc48-51f52acd72d6", 402 }, 403 UUID: "86a380e9-cb60-4b2a-bc48-51f52acd72d6", 404 }, 405 objects.BatchObject{ 406 OriginalIndex: 2, 407 Err: nil, 408 Object: &models.Object{ 409 Class: "ThingForBatching", 410 Properties: map[string]interface{}{ 411 "stringProp": "third element", 412 }, 413 ID: "90ade18e-2b99-4903-aa34-1d5d648c932d", 414 }, 415 UUID: "90ade18e-2b99-4903-aa34-1d5d648c932d", 416 }, 417 } 418 419 t.Run("can import", func(t *testing.T) { 420 batchRes, err := repo.BatchPutObjects(context.Background(), batch, nil) 421 require.Nil(t, err) 422 423 assert.Nil(t, batchRes[0].Err) 424 assert.Nil(t, batchRes[2].Err) 425 }) 426 427 params := dto.GetParams{ 428 ClassName: "ThingForBatching", 429 Pagination: &filters.Pagination{Limit: 10}, 430 Filters: nil, 431 } 432 _, err := repo.Search(context.Background(), params) 433 require.Nil(t, err) 434 }) 435 } 436 } 437 438 func simpleInsertObjects(t *testing.T, repo *DB, class string, count int) { 439 batch := make(objects.BatchObjects, count) 440 for i := 0; i < count; i++ { 441 batch[i] = objects.BatchObject{ 442 OriginalIndex: i, 443 Err: nil, 444 Object: &models.Object{ 445 Class: class, 446 Properties: map[string]interface{}{ 447 "stringProp": fmt.Sprintf("element %d", i), 448 }, 449 ID: strfmt.UUID(fmt.Sprintf("8d5a3aa2-3c8d-4589-9ae1-3f638f506%03d", i)), 450 Vector: []float32{1, 2, 3}, 451 }, 452 UUID: strfmt.UUID(fmt.Sprintf("8d5a3aa2-3c8d-4589-9ae1-3f638f506%03d", i)), 453 } 454 } 455 456 repo.BatchPutObjects(context.Background(), batch, nil) 457 } 458 459 func testBatchImportObjects(repo *DB) func(t *testing.T) { 460 return func(t *testing.T) { 461 t.Run("with a prior validation error, but nothing to cause errors in the db", func(t *testing.T) { 462 batch := objects.BatchObjects{ 463 objects.BatchObject{ 464 OriginalIndex: 0, 465 Err: nil, 466 Object: &models.Object{ 467 Class: "ThingForBatching", 468 Properties: map[string]interface{}{ 469 "stringProp": "first element", 470 }, 471 ID: "8d5a3aa2-3c8d-4589-9ae1-3f638f506970", 472 Vector: []float32{1, 2, 3}, 473 }, 474 UUID: "8d5a3aa2-3c8d-4589-9ae1-3f638f506970", 475 }, 476 objects.BatchObject{ 477 OriginalIndex: 1, 478 Err: fmt.Errorf("already has a validation error"), 479 Object: &models.Object{ 480 Class: "ThingForBatching", 481 Properties: map[string]interface{}{ 482 "stringProp": "second element", 483 }, 484 ID: "86a380e9-cb60-4b2a-bc48-51f52acd72d6", 485 Vector: []float32{1, 2, 3}, 486 }, 487 UUID: "86a380e9-cb60-4b2a-bc48-51f52acd72d6", 488 }, 489 objects.BatchObject{ 490 OriginalIndex: 2, 491 Err: nil, 492 Object: &models.Object{ 493 Class: "ThingForBatching", 494 Properties: map[string]interface{}{ 495 "stringProp": "third element", 496 }, 497 ID: "90ade18e-2b99-4903-aa34-1d5d648c932d", 498 Vector: []float32{1, 2, 3}, 499 }, 500 UUID: "90ade18e-2b99-4903-aa34-1d5d648c932d", 501 }, 502 } 503 504 t.Run("can import", func(t *testing.T) { 505 batchRes, err := repo.BatchPutObjects(context.Background(), batch, nil) 506 require.Nil(t, err) 507 508 assert.Nil(t, batchRes[0].Err) 509 assert.Nil(t, batchRes[2].Err) 510 }) 511 512 params := dto.GetParams{ 513 ClassName: "ThingForBatching", 514 Pagination: &filters.Pagination{Limit: 10}, 515 Filters: nil, 516 } 517 res, err := repo.Search(context.Background(), params) 518 require.Nil(t, err) 519 520 t.Run("contains first element", func(t *testing.T) { 521 item, ok := findID(res, batch[0].Object.ID) 522 require.Equal(t, true, ok, "results should contain our desired id") 523 assert.Equal(t, "first element", item.Schema.(map[string]interface{})["stringProp"]) 524 }) 525 526 t.Run("contains third element", func(t *testing.T) { 527 item, ok := findID(res, batch[2].Object.ID) 528 require.Equal(t, true, ok, "results should contain our desired id") 529 assert.Equal(t, "third element", item.Schema.(map[string]interface{})["stringProp"]) 530 }) 531 532 t.Run("can be queried through the inverted index", func(t *testing.T) { 533 filter := buildFilter("stringProp", "third", eq, schema.DataTypeText) 534 params := dto.GetParams{ 535 ClassName: "ThingForBatching", 536 Pagination: &filters.Pagination{Limit: 10}, 537 Filters: filter, 538 } 539 res, err := repo.Search(context.Background(), params) 540 require.Nil(t, err) 541 542 require.Len(t, res, 1) 543 assert.Equal(t, strfmt.UUID("90ade18e-2b99-4903-aa34-1d5d648c932d"), 544 res[0].ID) 545 }) 546 }) 547 548 t.Run("with an import which will fail", func(t *testing.T) { 549 batch := objects.BatchObjects{ 550 objects.BatchObject{ 551 OriginalIndex: 0, 552 Err: nil, 553 Object: &models.Object{ 554 Class: "ThingForBatching", 555 Properties: map[string]interface{}{ 556 "stringProp": "first element", 557 }, 558 ID: "79aebd44-7486-4fed-9334-3a74cc09a1c3", 559 }, 560 UUID: "79aebd44-7486-4fed-9334-3a74cc09a1c3", 561 }, 562 objects.BatchObject{ 563 OriginalIndex: 1, 564 Err: fmt.Errorf("already had a prior error"), 565 Object: &models.Object{ 566 Class: "ThingForBatching", 567 Properties: map[string]interface{}{ 568 "stringProp": "second element", 569 }, 570 ID: "1c2d8ce6-32da-4081-9794-a81e23e673e4", 571 }, 572 UUID: "1c2d8ce6-32da-4081-9794-a81e23e673e4", 573 }, 574 objects.BatchObject{ 575 OriginalIndex: 2, 576 Err: nil, 577 Object: &models.Object{ 578 Class: "ThingForBatching", 579 Properties: map[string]interface{}{ 580 "stringProp": "third element", 581 }, 582 ID: "", // ID can't be empty in es, this should produce an error 583 }, 584 UUID: "", 585 }, 586 } 587 588 t.Run("can import", func(t *testing.T) { 589 batchRes, err := repo.BatchPutObjects(context.Background(), batch, nil) 590 require.Nil(t, err, "there shouldn't be an overall error, only individual ones") 591 592 t.Run("element errors are marked correctly", func(t *testing.T) { 593 require.Len(t, batchRes, 3) 594 assert.NotNil(t, batchRes[1].Err) // from validation 595 assert.NotNil(t, batchRes[2].Err) // from db 596 }) 597 }) 598 599 params := dto.GetParams{ 600 ClassName: "ThingForBatching", 601 Pagination: &filters.Pagination{Limit: 10}, 602 Filters: nil, 603 } 604 res, err := repo.Search(context.Background(), params) 605 require.Nil(t, err) 606 607 t.Run("does not contain second element (validation error)", func(t *testing.T) { 608 _, ok := findID(res, batch[1].Object.ID) 609 require.Equal(t, false, ok, "results should not contain our desired id") 610 }) 611 612 t.Run("does not contain third element (es error)", func(t *testing.T) { 613 _, ok := findID(res, batch[2].Object.ID) 614 require.Equal(t, false, ok, "results should not contain our desired id") 615 }) 616 }) 617 618 t.Run("upserting the same objects over and over again", func(t *testing.T) { 619 for i := 0; i < 20; i++ { 620 batch := objects.BatchObjects{ 621 objects.BatchObject{ 622 OriginalIndex: 0, 623 Err: nil, 624 Object: &models.Object{ 625 Class: "ThingForBatching", 626 Properties: map[string]interface{}{ 627 "stringProp": "first element", 628 }, 629 ID: "8d5a3aa2-3c8d-4589-9ae1-3f638f506970", 630 Vector: []float32{1, 2, 3}, 631 }, 632 UUID: "8d5a3aa2-3c8d-4589-9ae1-3f638f506970", 633 }, 634 objects.BatchObject{ 635 OriginalIndex: 1, 636 Err: nil, 637 Object: &models.Object{ 638 Class: "ThingForBatching", 639 Properties: map[string]interface{}{ 640 "stringProp": "third element", 641 }, 642 ID: "90ade18e-2b99-4903-aa34-1d5d648c932d", 643 Vector: []float32{1, 1, -3}, 644 }, 645 UUID: "90ade18e-2b99-4903-aa34-1d5d648c932d", 646 }, 647 } 648 649 t.Run("can import", func(t *testing.T) { 650 batchRes, err := repo.BatchPutObjects(context.Background(), batch, nil) 651 require.Nil(t, err) 652 653 assert.Nil(t, batchRes[0].Err) 654 assert.Nil(t, batchRes[1].Err) 655 }) 656 657 t.Run("a vector search returns the correct number of elements", func(t *testing.T) { 658 res, err := repo.VectorSearch(context.Background(), dto.GetParams{ 659 ClassName: "ThingForBatching", 660 Pagination: &filters.Pagination{ 661 Offset: 0, 662 Limit: 10, 663 }, 664 SearchVector: []float32{1, 2, 3}, 665 }) 666 require.Nil(t, err) 667 assert.Len(t, res, 2) 668 }) 669 670 } 671 }) 672 673 t.Run("with a duplicate UUID", func(t *testing.T) { 674 // it should ignore the first one as the second one would overwrite the 675 // first one anyway 676 batch := make(objects.BatchObjects, 53) 677 678 batch[0] = objects.BatchObject{ 679 OriginalIndex: 0, 680 Err: nil, 681 Object: &models.Object{ 682 Class: "ThingForBatching", 683 Properties: map[string]interface{}{ 684 "stringProp": "first element", 685 }, 686 ID: "79aebd44-7486-4fed-9334-3a74cc09a1c3", 687 Vector: []float32{7, 8, 9}, 688 }, 689 UUID: "79aebd44-7486-4fed-9334-3a74cc09a1c3", 690 } 691 692 // add 50 more nonsensical items, so we cross the transaction threshold 693 694 for i := 1; i < 51; i++ { 695 uid, err := uuid.NewRandom() 696 require.Nil(t, err) 697 id := strfmt.UUID(uid.String()) 698 batch[i] = objects.BatchObject{ 699 OriginalIndex: i, 700 Err: nil, 701 Object: &models.Object{ 702 Class: "ThingForBatching", 703 Properties: map[string]interface{}{ 704 "stringProp": "ignore me", 705 }, 706 ID: id, 707 Vector: []float32{0.05, 0.1, 0.2}, 708 }, 709 UUID: id, 710 } 711 } 712 713 batch[51] = objects.BatchObject{ 714 OriginalIndex: 51, 715 Err: fmt.Errorf("already had a prior error"), 716 Object: &models.Object{ 717 Class: "ThingForBatching", 718 Properties: map[string]interface{}{ 719 "stringProp": "first element", 720 }, 721 ID: "1c2d8ce6-32da-4081-9794-a81e23e673e4", 722 Vector: []float32{3, 2, 1}, 723 }, 724 UUID: "1c2d8ce6-32da-4081-9794-a81e23e673e4", 725 } 726 batch[52] = objects.BatchObject{ 727 OriginalIndex: 52, 728 Err: nil, 729 Object: &models.Object{ 730 Class: "ThingForBatching", 731 Properties: map[string]interface{}{ 732 "stringProp": "first element, imported a second time", 733 }, 734 ID: "79aebd44-7486-4fed-9334-3a74cc09a1c3", // note the duplicate id with item 1 735 Vector: []float32{1, 2, 3}, 736 }, 737 UUID: "79aebd44-7486-4fed-9334-3a74cc09a1c3", // note the duplicate id with item 1 738 } 739 740 t.Run("can import", func(t *testing.T) { 741 batchRes, err := repo.BatchPutObjects(context.Background(), batch, nil) 742 require.Nil(t, err, "there shouldn't be an overall error, only individual ones") 743 744 t.Run("element errors are marked correctly", func(t *testing.T) { 745 require.Len(t, batchRes, 53) 746 assert.NotNil(t, batchRes[51].Err) // from validation 747 }) 748 }) 749 750 params := dto.GetParams{ 751 ClassName: "ThingForBatching", 752 Pagination: &filters.Pagination{Limit: 10}, 753 Filters: nil, 754 } 755 res, err := repo.Search(context.Background(), params) 756 require.Nil(t, err) 757 758 t.Run("does not contain second element (validation error)", func(t *testing.T) { 759 _, ok := findID(res, batch[51].Object.ID) 760 require.Equal(t, false, ok, "results should not contain our desired id") 761 }) 762 763 t.Run("does not contain third element (es error)", func(t *testing.T) { 764 _, ok := findID(res, batch[52].Object.ID) 765 require.Equal(t, false, ok, "results should not contain our desired id") 766 }) 767 }) 768 769 t.Run("when a context expires", func(t *testing.T) { 770 // it should ignore the first one as the second one would overwrite the 771 // first one anyway 772 size := 50 773 batch := make(objects.BatchObjects, size) 774 // add 50 more nonsensical items, so we cross the transaction threshold 775 776 for i := 0; i < size; i++ { 777 uid, err := uuid.NewRandom() 778 require.Nil(t, err) 779 id := strfmt.UUID(uid.String()) 780 batch[i] = objects.BatchObject{ 781 Err: nil, 782 Object: &models.Object{ 783 Class: "ThingForBatching", 784 Properties: map[string]interface{}{ 785 "stringProp": "ignore me", 786 }, 787 ID: id, 788 Vector: []float32{0.05, 0.1, 0.2}, 789 }, 790 UUID: id, 791 } 792 } 793 794 t.Run("can import", func(t *testing.T) { 795 ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond) 796 defer cancel() 797 798 batchRes, err := repo.BatchPutObjects(ctx, batch, nil) 799 require.Nil(t, err, "there shouldn't be an overall error, only individual ones") 800 801 t.Run("some elements have error'd due to context", func(t *testing.T) { 802 require.Len(t, batchRes, 50) 803 804 errCount := 0 805 for _, elem := range batchRes { 806 if elem.Err != nil { 807 errCount++ 808 assert.Contains(t, elem.Err.Error(), "context deadline exceeded") 809 } 810 } 811 812 assert.True(t, errCount > 0) 813 }) 814 }) 815 }) 816 } 817 } 818 819 // geo props are the first props with property specific indices, so making sure 820 // that they work with batches at scale adds value beyond the regular batch 821 // import tests 822 func testBatchImportGeoObjects(repo *DB) func(t *testing.T) { 823 r := getRandomSeed() 824 return func(t *testing.T) { 825 size := 500 826 batchSize := 50 827 828 objs := make([]*models.Object, size) 829 830 t.Run("generate random vectors", func(t *testing.T) { 831 for i := 0; i < size; i++ { 832 id, _ := uuid.NewRandom() 833 objs[i] = &models.Object{ 834 Class: "ThingForBatching", 835 ID: strfmt.UUID(id.String()), 836 Properties: map[string]interface{}{ 837 "location": randGeoCoordinates(r), 838 }, 839 Vector: []float32{0.123, 0.234, rand.Float32()}, // does not matter for this test 840 } 841 } 842 }) 843 844 t.Run("import vectors in batches", func(t *testing.T) { 845 for i := 0; i < size; i += batchSize { 846 batch := make(objects.BatchObjects, batchSize) 847 for j := 0; j < batchSize; j++ { 848 batch[j] = objects.BatchObject{ 849 OriginalIndex: j, 850 Object: objs[i+j], 851 } 852 } 853 854 res, err := repo.BatchPutObjects(context.Background(), batch, nil) 855 require.Nil(t, err) 856 assertAllItemsErrorFree(t, res) 857 } 858 }) 859 860 const km = 1000 861 distances := []float32{ 862 0.1, 863 1, 864 10, 865 100, 866 1000, 867 2000, 868 5000, 869 7500, 870 10000, 871 12500, 872 15000, 873 20000, 874 35000, 875 100000, // larger than the circumference of the earth, should contain all 876 } 877 878 t.Run("query for expected results", func(t *testing.T) { 879 queryGeo := randGeoCoordinates(r) 880 881 for _, maxDist := range distances { 882 t.Run(fmt.Sprintf("with maxDist=%f", maxDist), func(t *testing.T) { 883 var relevant int 884 var retrieved int 885 886 controlList := bruteForceMaxDist(objs, []float32{ 887 *queryGeo.Latitude, 888 *queryGeo.Longitude, 889 }, maxDist*km) 890 891 res, err := repo.Search(context.Background(), dto.GetParams{ 892 ClassName: "ThingForBatching", 893 Pagination: &filters.Pagination{Limit: 500}, 894 Filters: buildFilter("location", filters.GeoRange{ 895 GeoCoordinates: queryGeo, 896 Distance: maxDist * km, 897 }, filters.OperatorWithinGeoRange, schema.DataTypeGeoCoordinates), 898 }) 899 require.Nil(t, err) 900 901 retrieved += len(res) 902 relevant += matchesInUUIDLists(controlList, resToUUIDs(res)) 903 904 if relevant == 0 { 905 // skip, as we risk dividing by zero, if both relevant and retrieved 906 // are zero, however, we want to fail with a divide-by-zero if only 907 // retrieved is 0 and relevant was more than 0 908 return 909 } 910 recall := float32(relevant) / float32(retrieved) 911 assert.True(t, recall >= 0.99) 912 }) 913 } 914 }) 915 916 t.Run("renew vector positions to test batch geo updates", func(t *testing.T) { 917 for i, obj := range objs { 918 obj.Properties = map[string]interface{}{ 919 "location": randGeoCoordinates(r), 920 } 921 objs[i] = obj 922 } 923 }) 924 925 t.Run("import in batches again (as update - same IDs!)", func(t *testing.T) { 926 for i := 0; i < size; i += batchSize { 927 batch := make(objects.BatchObjects, batchSize) 928 for j := 0; j < batchSize; j++ { 929 batch[j] = objects.BatchObject{ 930 OriginalIndex: j, 931 Object: objs[i+j], 932 } 933 } 934 935 res, err := repo.BatchPutObjects(context.Background(), batch, nil) 936 require.Nil(t, err) 937 assertAllItemsErrorFree(t, res) 938 } 939 }) 940 941 t.Run("query again to verify updates worked", func(t *testing.T) { 942 queryGeo := randGeoCoordinates(r) 943 944 for _, maxDist := range distances { 945 t.Run(fmt.Sprintf("with maxDist=%f", maxDist), func(t *testing.T) { 946 var relevant int 947 var retrieved int 948 949 controlList := bruteForceMaxDist(objs, []float32{ 950 *queryGeo.Latitude, 951 *queryGeo.Longitude, 952 }, maxDist*km) 953 954 res, err := repo.Search(context.Background(), dto.GetParams{ 955 ClassName: "ThingForBatching", 956 Pagination: &filters.Pagination{Limit: 500}, 957 Filters: buildFilter("location", filters.GeoRange{ 958 GeoCoordinates: queryGeo, 959 Distance: maxDist * km, 960 }, filters.OperatorWithinGeoRange, schema.DataTypeGeoCoordinates), 961 }) 962 require.Nil(t, err) 963 964 retrieved += len(res) 965 relevant += matchesInUUIDLists(controlList, resToUUIDs(res)) 966 967 if relevant == 0 { 968 // skip, as we risk dividing by zero, if both relevant and retrieved 969 // are zero, however, we want to fail with a divide-by-zero if only 970 // retrieved is 0 and relevant was more than 0 971 return 972 } 973 recall := float32(relevant) / float32(retrieved) 974 fmt.Printf("recall is %f\n", recall) 975 assert.True(t, recall >= 0.99) 976 }) 977 } 978 }) 979 } 980 } 981 982 func testBatchDeleteObjects(repo *DB) func(t *testing.T) { 983 return func(t *testing.T) { 984 getParams := func(dryRun bool, output string) objects.BatchDeleteParams { 985 return objects.BatchDeleteParams{ 986 ClassName: "ThingForBatching", 987 Filters: &filters.LocalFilter{ 988 Root: &filters.Clause{ 989 Operator: filters.OperatorLike, 990 Value: &filters.Value{ 991 Value: "*", 992 Type: schema.DataTypeText, 993 }, 994 On: &filters.Path{ 995 Property: schema.PropertyName("id"), 996 }, 997 }, 998 }, 999 DryRun: dryRun, 1000 Output: output, 1001 } 1002 } 1003 performClassSearch := func() ([]search.Result, error) { 1004 return repo.Search(context.Background(), dto.GetParams{ 1005 ClassName: "ThingForBatching", 1006 Pagination: &filters.Pagination{Limit: 10000}, 1007 }) 1008 } 1009 t.Run("batch delete with dryRun set to true", func(t *testing.T) { 1010 // get the initial count of the objects 1011 res, err := performClassSearch() 1012 require.Nil(t, err) 1013 beforeDelete := len(res) 1014 require.True(t, beforeDelete > 0) 1015 // dryRun == true, only test how many objects can be deleted 1016 batchDeleteRes, err := repo.BatchDeleteObjects(context.Background(), getParams(true, "verbose"), nil, "") 1017 require.Nil(t, err) 1018 require.Equal(t, int64(beforeDelete), batchDeleteRes.Matches) 1019 require.Equal(t, beforeDelete, len(batchDeleteRes.Objects)) 1020 for _, batchRes := range batchDeleteRes.Objects { 1021 require.Nil(t, batchRes.Err) 1022 } 1023 res, err = performClassSearch() 1024 require.Nil(t, err) 1025 assert.Equal(t, beforeDelete, len(res)) 1026 }) 1027 1028 t.Run("batch delete with dryRun set to true and output to minimal", func(t *testing.T) { 1029 // get the initial count of the objects 1030 res, err := performClassSearch() 1031 require.Nil(t, err) 1032 beforeDelete := len(res) 1033 require.True(t, beforeDelete > 0) 1034 // dryRun == true, only test how many objects can be deleted 1035 batchDeleteRes, err := repo.BatchDeleteObjects(context.Background(), getParams(true, "minimal"), nil, "") 1036 require.Nil(t, err) 1037 require.Equal(t, int64(beforeDelete), batchDeleteRes.Matches) 1038 require.Equal(t, beforeDelete, len(batchDeleteRes.Objects)) 1039 for _, batchRes := range batchDeleteRes.Objects { 1040 require.Nil(t, batchRes.Err) 1041 } 1042 res, err = performClassSearch() 1043 require.Nil(t, err) 1044 assert.Equal(t, beforeDelete, len(res)) 1045 }) 1046 1047 t.Run("batch delete only 2 given objects", func(t *testing.T) { 1048 // get the initial count of the objects 1049 res, err := performClassSearch() 1050 require.Nil(t, err) 1051 beforeDelete := len(res) 1052 require.True(t, beforeDelete > 0) 1053 // dryRun == true, only test how many objects can be deleted 1054 batchDeleteRes, err := repo.BatchDeleteObjects(context.Background(), objects.BatchDeleteParams{ 1055 ClassName: "ThingForBatching", 1056 Filters: &filters.LocalFilter{ 1057 Root: &filters.Clause{ 1058 Operator: filters.OperatorOr, 1059 Operands: []filters.Clause{ 1060 { 1061 Operator: filters.OperatorEqual, 1062 On: &filters.Path{ 1063 Class: "ThingForBatching", 1064 Property: schema.PropertyName("id"), 1065 }, 1066 Value: &filters.Value{ 1067 Value: "8d5a3aa2-3c8d-4589-9ae1-3f638f506970", 1068 Type: schema.DataTypeText, 1069 }, 1070 }, 1071 { 1072 Operator: filters.OperatorEqual, 1073 On: &filters.Path{ 1074 Class: "ThingForBatching", 1075 Property: schema.PropertyName("id"), 1076 }, 1077 Value: &filters.Value{ 1078 Value: "90ade18e-2b99-4903-aa34-1d5d648c932d", 1079 Type: schema.DataTypeText, 1080 }, 1081 }, 1082 }, 1083 }, 1084 }, 1085 DryRun: false, 1086 Output: "verbose", 1087 }, nil, "") 1088 require.Nil(t, err) 1089 require.Equal(t, int64(2), batchDeleteRes.Matches) 1090 require.Equal(t, 2, len(batchDeleteRes.Objects)) 1091 for _, batchRes := range batchDeleteRes.Objects { 1092 require.Nil(t, batchRes.Err) 1093 } 1094 res, err = performClassSearch() 1095 require.Nil(t, err) 1096 assert.Equal(t, beforeDelete-2, len(res)) 1097 }) 1098 1099 t.Run("batch delete with dryRun set to false", func(t *testing.T) { 1100 // get the initial count of the objects 1101 res, err := performClassSearch() 1102 require.Nil(t, err) 1103 beforeDelete := len(res) 1104 require.True(t, beforeDelete > 0) 1105 // dryRun == true, only test how many objects can be deleted 1106 batchDeleteRes, err := repo.BatchDeleteObjects(context.Background(), getParams(false, "verbose"), nil, "") 1107 require.Nil(t, err) 1108 require.Equal(t, int64(beforeDelete), batchDeleteRes.Matches) 1109 require.Equal(t, beforeDelete, len(batchDeleteRes.Objects)) 1110 for _, batchRes := range batchDeleteRes.Objects { 1111 require.Nil(t, batchRes.Err) 1112 } 1113 res, err = performClassSearch() 1114 require.Nil(t, err) 1115 assert.Equal(t, 0, len(res)) 1116 }) 1117 } 1118 } 1119 1120 func testBatchDeleteObjectsJourney(repo *DB, queryMaximumResults int64) func(t *testing.T) { 1121 return func(t *testing.T) { 1122 getParams := func(dryRun bool, output string) objects.BatchDeleteParams { 1123 return objects.BatchDeleteParams{ 1124 ClassName: "ThingForBatching", 1125 Filters: &filters.LocalFilter{ 1126 Root: &filters.Clause{ 1127 Operator: filters.OperatorLike, 1128 Value: &filters.Value{ 1129 Value: "*", 1130 Type: schema.DataTypeText, 1131 }, 1132 On: &filters.Path{ 1133 Property: schema.PropertyName("id"), 1134 }, 1135 }, 1136 }, 1137 DryRun: dryRun, 1138 Output: output, 1139 } 1140 } 1141 performClassSearch := func() ([]search.Result, error) { 1142 return repo.Search(context.Background(), dto.GetParams{ 1143 ClassName: "ThingForBatching", 1144 Pagination: &filters.Pagination{Limit: 20}, 1145 }) 1146 } 1147 t.Run("batch delete journey", func(t *testing.T) { 1148 // delete objects to limit 1149 batchDeleteRes, err := repo.BatchDeleteObjects(context.Background(), getParams(true, "verbose"), nil, "") 1150 require.Nil(t, err) 1151 objectsMatches := batchDeleteRes.Matches 1152 1153 leftToDelete := objectsMatches 1154 deleteIterationCount := 0 1155 deletedObjectsCount := 0 1156 for { 1157 // delete objects to limit 1158 batchDeleteRes, err := repo.BatchDeleteObjects(context.Background(), getParams(false, "verbose"), nil, "") 1159 require.Nil(t, err) 1160 matches, deleted := batchDeleteRes.Matches, len(batchDeleteRes.Objects) 1161 require.Equal(t, leftToDelete, matches) 1162 require.True(t, deleted > 0) 1163 deletedObjectsCount += deleted 1164 1165 batchDeleteRes, err = repo.BatchDeleteObjects(context.Background(), getParams(true, "verbose"), nil, "") 1166 require.Nil(t, err) 1167 leftToDelete = batchDeleteRes.Matches 1168 1169 res, err := performClassSearch() 1170 require.Nil(t, err) 1171 afterDelete := len(res) 1172 require.True(t, afterDelete >= 0) 1173 if afterDelete == 0 { 1174 // where have deleted all objects 1175 break 1176 } 1177 deleteIterationCount += 1 1178 if deleteIterationCount > 100 { 1179 // something went wrong 1180 break 1181 } 1182 } 1183 require.False(t, deleteIterationCount > 100, "Batch delete journey tests didn't stop properly") 1184 require.True(t, objectsMatches/int64(queryMaximumResults) <= int64(deleteIterationCount)) 1185 require.Equal(t, objectsMatches, int64(deletedObjectsCount)) 1186 }) 1187 } 1188 } 1189 1190 func assertAllItemsErrorFree(t *testing.T, res objects.BatchObjects) { 1191 for _, elem := range res { 1192 assert.Nil(t, elem.Err) 1193 } 1194 } 1195 1196 func bruteForceMaxDist(inputs []*models.Object, query []float32, maxDist float32) []strfmt.UUID { 1197 type distanceAndIndex struct { 1198 distance float32 1199 index int 1200 } 1201 1202 distances := make([]distanceAndIndex, len(inputs)) 1203 1204 distancer := distancer.NewGeoProvider().New(query) 1205 for i, elem := range inputs { 1206 coord := elem.Properties.(map[string]interface{})["location"].(*models.GeoCoordinates) 1207 vec := []float32{*coord.Latitude, *coord.Longitude} 1208 1209 dist, _, _ := distancer.Distance(vec) 1210 distances[i] = distanceAndIndex{ 1211 index: i, 1212 distance: dist, 1213 } 1214 } 1215 1216 sort.Slice(distances, func(a, b int) bool { 1217 return distances[a].distance < distances[b].distance 1218 }) 1219 1220 out := make([]strfmt.UUID, len(distances)) 1221 i := 0 1222 for _, elem := range distances { 1223 if elem.distance > maxDist { 1224 break 1225 } 1226 out[i] = inputs[distances[i].index].ID 1227 i++ 1228 } 1229 1230 return out[:i] 1231 } 1232 1233 func randGeoCoordinates(r *rand.Rand) *models.GeoCoordinates { 1234 maxLat := float32(90.0) 1235 minLat := float32(-90.0) 1236 maxLon := float32(180) 1237 minLon := float32(-180) 1238 1239 lat := minLat + (maxLat-minLat)*r.Float32() 1240 lon := minLon + (maxLon-minLon)*r.Float32() 1241 return &models.GeoCoordinates{ 1242 Latitude: &lat, 1243 Longitude: &lon, 1244 } 1245 } 1246 1247 func resToUUIDs(in []search.Result) []strfmt.UUID { 1248 out := make([]strfmt.UUID, len(in)) 1249 for i, obj := range in { 1250 out[i] = obj.ID 1251 } 1252 1253 return out 1254 } 1255 1256 func matchesInUUIDLists(control []strfmt.UUID, results []strfmt.UUID) int { 1257 desired := map[strfmt.UUID]struct{}{} 1258 for _, relevant := range control { 1259 desired[relevant] = struct{}{} 1260 } 1261 1262 var matches int 1263 for _, candidate := range results { 1264 _, ok := desired[candidate] 1265 if ok { 1266 matches++ 1267 } 1268 } 1269 1270 return matches 1271 }