github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/bm25f_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 //go:build integrationTest 13 14 package db 15 16 import ( 17 "context" 18 "fmt" 19 "testing" 20 21 "github.com/go-openapi/strfmt" 22 "github.com/google/uuid" 23 "github.com/sirupsen/logrus" 24 "github.com/stretchr/testify/assert" 25 "github.com/stretchr/testify/require" 26 "github.com/weaviate/weaviate/adapters/repos/db/inverted" 27 "github.com/weaviate/weaviate/entities/additional" 28 "github.com/weaviate/weaviate/entities/filters" 29 "github.com/weaviate/weaviate/entities/models" 30 "github.com/weaviate/weaviate/entities/schema" 31 "github.com/weaviate/weaviate/entities/searchparams" 32 enthnsw "github.com/weaviate/weaviate/entities/vectorindex/hnsw" 33 ) 34 35 func BM25FinvertedConfig(k1, b float32, stopWordPreset string) *models.InvertedIndexConfig { 36 return &models.InvertedIndexConfig{ 37 Bm25: &models.BM25Config{ 38 K1: k1, 39 B: b, 40 }, 41 CleanupIntervalSeconds: 60, 42 Stopwords: &models.StopwordConfig{ 43 Preset: stopWordPreset, 44 }, 45 IndexNullState: true, 46 IndexPropertyLength: true, 47 } 48 } 49 50 func SetupClass(t require.TestingT, repo *DB, schemaGetter *fakeSchemaGetter, logger logrus.FieldLogger, k1, b float32, 51 ) { 52 vFalse := false 53 vTrue := true 54 55 class := &models.Class{ 56 VectorIndexConfig: enthnsw.NewDefaultUserConfig(), 57 InvertedIndexConfig: BM25FinvertedConfig(k1, b, "none"), 58 Class: "MyClass", 59 60 Properties: []*models.Property{ 61 { 62 Name: "title", 63 DataType: schema.DataTypeText.PropString(), 64 Tokenization: models.PropertyTokenizationWord, 65 IndexFilterable: &vFalse, 66 IndexSearchable: &vTrue, 67 }, 68 { 69 Name: "description", 70 DataType: schema.DataTypeText.PropString(), 71 Tokenization: models.PropertyTokenizationWord, 72 IndexFilterable: &vFalse, 73 IndexSearchable: &vTrue, 74 }, 75 { 76 Name: "review", 77 DataType: schema.DataTypeText.PropString(), 78 Tokenization: models.PropertyTokenizationWord, 79 IndexFilterable: &vFalse, 80 IndexSearchable: &vTrue, 81 }, 82 { 83 Name: "textField", 84 DataType: schema.DataTypeText.PropString(), 85 Tokenization: models.PropertyTokenizationField, 86 IndexFilterable: &vFalse, 87 IndexSearchable: &vTrue, 88 }, 89 { 90 Name: "textWhitespace", 91 DataType: schema.DataTypeText.PropString(), 92 Tokenization: models.PropertyTokenizationWhitespace, 93 IndexFilterable: &vFalse, 94 IndexSearchable: &vTrue, 95 }, 96 { 97 Name: "relatedToGolf", 98 DataType: schema.DataTypeBoolean.PropString(), 99 IndexFilterable: &vFalse, 100 IndexSearchable: &vTrue, 101 }, 102 { 103 Name: "multiTitles", 104 DataType: schema.DataTypeTextArray.PropString(), 105 Tokenization: models.PropertyTokenizationWord, 106 IndexFilterable: &vFalse, 107 IndexSearchable: &vTrue, 108 }, 109 { 110 Name: "multiTextWhitespace", 111 DataType: schema.DataTypeTextArray.PropString(), 112 Tokenization: models.PropertyTokenizationWhitespace, 113 IndexFilterable: &vFalse, 114 IndexSearchable: &vTrue, 115 }, 116 }, 117 } 118 119 schema := schema.Schema{ 120 Objects: &models.Schema{ 121 Classes: []*models.Class{class}, 122 }, 123 } 124 125 schemaGetter.schema = schema 126 127 migrator := NewMigrator(repo, logger) 128 migrator.AddClass(context.Background(), class, schemaGetter.shardState) 129 130 testData := []map[string]interface{}{} 131 testData = append(testData, map[string]interface{}{"title": "Our journey to BM25F", "description": "This is how we get to BM25F", "review": "none none none", "multiTitles": []string{"breakfast", "dinner"}}) 132 testData = append(testData, map[string]interface{}{"title": "Why I dont like journey", "description": "This is about how we get somewhere", "multiTitles": []string{"going to a restaurant for dinner", "sandwiches and desert are a great lunch"}}) 133 testData = append(testData, map[string]interface{}{"title": "My journeys in Journey", "description": "A journey story about journeying"}) 134 testData = append(testData, map[string]interface{}{"title": "An unrelated title", "description": "Actually all about journey"}) 135 testData = append(testData, map[string]interface{}{"title": "journey journey", "description": "journey journey journey"}) 136 testData = append(testData, map[string]interface{}{"title": "journey", "description": "journey journey", "multiTextWhitespace": []string{"totally irrelevant:)", "we all MuuultiYell! together"}}) 137 testData = append(testData, map[string]interface{}{"title": "JOURNEY", "description": "A LOUD JOURNEY", "multiTextWhitespace": []string{"MuuultiYell!", "is fun"}}) 138 testData = append(testData, map[string]interface{}{"title": "An unrelated title", "description": "Absolutely nothing to do with the topic", "textField": "*&^$@#$%^&*()(Offtopic!!!!"}) 139 testData = append(testData, map[string]interface{}{"title": "none", "description": "other", "textField": "YELLING IS FUN"}) 140 testData = append(testData, map[string]interface{}{"title": "something", "description": "none none", "review": "none none none none none none"}) 141 142 for i, data := range testData { 143 id := strfmt.UUID(uuid.MustParse(fmt.Sprintf("%032d", i)).String()) 144 145 obj := &models.Object{Class: "MyClass", ID: id, Properties: data, CreationTimeUnix: 1565612833955, LastUpdateTimeUnix: 10000020} 146 vector := []float32{1, 3, 5, 0.4} 147 //{title: "Our journey to BM25F", description: " This is how we get to BM25F"}} 148 err := repo.PutObject(context.Background(), obj, vector, nil, nil) 149 require.Nil(t, err) 150 } 151 } 152 153 // DuplicatedFrom SetupClass to make sure this new test does not alter the results of the existing one 154 func SetupClassForFilterScoringTest(t require.TestingT, repo *DB, schemaGetter *fakeSchemaGetter, logger logrus.FieldLogger, k1, b float32, 155 ) { 156 vFalse := false 157 vTrue := true 158 159 class := &models.Class{ 160 VectorIndexConfig: enthnsw.NewDefaultUserConfig(), 161 InvertedIndexConfig: BM25FinvertedConfig(k1, b, "none"), 162 Class: "FilterClass", 163 164 Properties: []*models.Property{ 165 { 166 Name: "description", 167 DataType: schema.DataTypeText.PropString(), 168 Tokenization: models.PropertyTokenizationWord, 169 IndexFilterable: &vFalse, 170 IndexSearchable: &vTrue, 171 }, 172 { 173 Name: "relatedToGolf", 174 DataType: schema.DataTypeBoolean.PropString(), 175 IndexFilterable: &vTrue, 176 }, 177 }, 178 } 179 180 schema := schema.Schema{ 181 Objects: &models.Schema{ 182 Classes: []*models.Class{class}, 183 }, 184 } 185 186 schemaGetter.schema = schema 187 188 migrator := NewMigrator(repo, logger) 189 migrator.AddClass(context.Background(), class, schemaGetter.shardState) 190 191 testData := []map[string]interface{}{} 192 testData = append(testData, map[string]interface{}{"description": "Brooks Koepka appeared a lot in the ms marco dataset. I was surprised to see golf content in there. I assume if the dataset was newer, we'd see a lot more Rory though.", "relatedToGolf": true}) 193 testData = append(testData, map[string]interface{}{"description": "While one would expect Koepka to be a somewhat rare name, it did appear in msmarco also outside the context of Brooks.", "relatedToGolf": false}) 194 195 for i, data := range testData { 196 id := strfmt.UUID(uuid.MustParse(fmt.Sprintf("%032d", i)).String()) 197 198 obj := &models.Object{Class: "FilterClass", ID: id, Properties: data, CreationTimeUnix: 1565612833955, LastUpdateTimeUnix: 10000020} 199 vector := []float32{1, 3, 5, 0.4} 200 err := repo.PutObject(context.Background(), obj, vector, nil, nil) 201 require.Nil(t, err) 202 } 203 } 204 205 func TestBM25FJourney(t *testing.T) { 206 dirName := t.TempDir() 207 208 logger := logrus.New() 209 schemaGetter := &fakeSchemaGetter{ 210 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 211 shardState: singleShardState(), 212 } 213 repo, err := New(logger, Config{ 214 MemtablesFlushDirtyAfter: 60, 215 RootPath: dirName, 216 QueryMaximumResults: 10000, 217 MaxImportGoroutinesFactor: 1, 218 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil) 219 require.Nil(t, err) 220 repo.SetSchemaGetter(schemaGetter) 221 require.Nil(t, repo.WaitForStartup(context.TODO())) 222 defer repo.Shutdown(context.Background()) 223 224 SetupClass(t, repo, schemaGetter, logger, 1.2, 0.75) 225 226 idx := repo.GetIndex("MyClass") 227 require.NotNil(t, idx) 228 229 // Check basic search 230 addit := additional.Properties{} 231 232 t.Run("bm25f journey", func(t *testing.T) { 233 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title", "description", "textField"}, Query: "journey"} 234 res, scores, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 235 require.Nil(t, err) 236 237 // Print results 238 t.Log("--- Start results for basic search ---") 239 for i, r := range res { 240 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 241 } 242 243 // Check results in correct order 244 require.Equal(t, uint64(4), res[0].DocID) 245 require.Equal(t, uint64(5), res[1].DocID) 246 require.Equal(t, uint64(6), res[2].DocID) 247 require.Equal(t, uint64(3), res[3].DocID) 248 require.Equal(t, uint64(0), res[4].DocID) 249 require.Equal(t, uint64(2), res[5].DocID) 250 251 // Without additionalExplanations no explainScore entry should be present 252 require.NotContains(t, res[0].Object.Additional, "explainScore") 253 }) 254 255 // Check non-alpha search on string field 256 257 // text/field are tokenized entirely, so we can search for non-alpha characters 258 t.Run("bm25f textField non-alpha", func(t *testing.T) { 259 kwrTextField := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title", "description", "textField"}, Query: "*&^$@#$%^&*()(Offtopic!!!!"} 260 addit = additional.Properties{} 261 resTextField, scores, err := idx.objectSearch(context.TODO(), 1000, nil, kwrTextField, nil, nil, addit, nil, "", 0) 262 require.Nil(t, err) 263 264 // Print results 265 t.Log("--- Start results for textField search ---") 266 for i, r := range resTextField { 267 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 268 } 269 270 // Check results in correct order 271 require.Equal(t, uint64(7), resTextField[0].DocID) 272 }) 273 274 // text/field are not lower-cased before indexing, so upper case searches must be passed through unchanged. 275 t.Run("bm25f textField caps", func(t *testing.T) { 276 kwrTextField := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"textField"}, Query: "YELLING IS FUN"} 277 addit := additional.Properties{} 278 resTextField, scores, err := idx.objectSearch(context.TODO(), 1000, nil, kwrTextField, nil, nil, addit, nil, "", 0) 279 require.Nil(t, err) 280 281 // Print results 282 t.Log("--- Start results for textField caps search ---") 283 for i, r := range resTextField { 284 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 285 } 286 287 // Check results in correct order 288 require.Equal(t, uint64(8), resTextField[0].DocID) 289 }) 290 291 // Check basic text search WITH CAPS 292 t.Run("bm25f text with caps", func(t *testing.T) { 293 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title", "description"}, Query: "JOURNEY"} 294 res, scores, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 295 // Print results 296 t.Log("--- Start results for search with caps ---") 297 for i, r := range res { 298 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 299 } 300 require.Nil(t, err) 301 302 // Check results in correct order 303 require.Equal(t, uint64(4), res[0].DocID) 304 require.Equal(t, uint64(5), res[1].DocID) 305 require.Equal(t, uint64(6), res[2].DocID) 306 require.Equal(t, uint64(2), res[3].DocID) 307 require.Equal(t, uint64(3), res[4].DocID) 308 require.Equal(t, uint64(0), res[5].DocID) 309 require.Equal(t, uint64(1), res[6].DocID) 310 }) 311 312 t.Run("bm25f journey boosted", func(t *testing.T) { 313 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title^3", "description"}, Query: "journey"} 314 res, scores, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 315 316 require.Nil(t, err) 317 // Print results 318 t.Log("--- Start results for boosted search ---") 319 for i, r := range res { 320 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 321 } 322 323 // Check results in correct order 324 require.Equal(t, uint64(4), res[0].DocID) 325 require.Equal(t, uint64(5), res[1].DocID) 326 require.Equal(t, uint64(6), res[2].DocID) 327 require.Equal(t, uint64(0), res[3].DocID) 328 require.Equal(t, uint64(1), res[4].DocID) 329 require.Equal(t, uint64(2), res[5].DocID) 330 require.Equal(t, uint64(3), res[6].DocID) 331 }) 332 333 t.Run("Check search with two terms", func(t *testing.T) { 334 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title", "description"}, Query: "journey somewhere"} 335 res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 336 require.Nil(t, err) 337 // Check results in correct order 338 require.Equal(t, uint64(1), res[0].DocID) 339 require.Equal(t, uint64(4), res[1].DocID) 340 require.Equal(t, uint64(5), res[2].DocID) 341 require.Equal(t, uint64(6), res[3].DocID) 342 require.Equal(t, uint64(2), res[4].DocID) 343 }) 344 345 t.Run("bm25f journey somewhere no properties", func(t *testing.T) { 346 // Check search with no properties (should include all properties) 347 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{}, Query: "journey somewhere"} 348 res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 349 require.Nil(t, err) 350 351 // Check results in correct order 352 require.Equal(t, uint64(1), res[0].DocID) 353 require.Equal(t, uint64(4), res[1].DocID) 354 require.Equal(t, uint64(5), res[2].DocID) 355 require.Equal(t, uint64(6), res[3].DocID) 356 }) 357 358 t.Run("bm25f non alphanums", func(t *testing.T) { 359 // Check search with no properties (should include all properties) 360 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{}, Query: "*&^$@#$%^&*()(Offtopic!!!!"} 361 res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 362 require.Nil(t, err) 363 require.Equal(t, uint64(7), res[0].DocID) 364 }) 365 366 t.Run("First result has high score", func(t *testing.T) { 367 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "about BM25F"} 368 res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0) 369 require.Nil(t, err) 370 371 require.Equal(t, uint64(0), res[0].DocID) 372 require.Len(t, res, 4) // four results have one of the terms 373 }) 374 375 t.Run("More results than limit", func(t *testing.T) { 376 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "journey"} 377 res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0) 378 require.Nil(t, err) 379 380 require.Equal(t, uint64(4), res[0].DocID) 381 require.Equal(t, uint64(5), res[1].DocID) 382 require.Equal(t, uint64(6), res[2].DocID) 383 require.Equal(t, uint64(3), res[3].DocID) 384 require.Equal(t, uint64(2), res[4].DocID) 385 require.Len(t, res, 5) // four results have one of the terms 386 }) 387 388 t.Run("Results from three properties", func(t *testing.T) { 389 kwr := &searchparams.KeywordRanking{Type: "bm25", Query: "none"} 390 res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0) 391 require.Nil(t, err) 392 393 require.Equal(t, uint64(9), res[0].DocID) 394 require.Equal(t, uint64(0), res[1].DocID) 395 require.Equal(t, uint64(8), res[2].DocID) 396 require.Len(t, res, 3) 397 }) 398 399 t.Run("Include additional explanations", func(t *testing.T) { 400 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "journey", AdditionalExplanations: true} 401 res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0) 402 require.Nil(t, err) 403 404 // With additionalExplanations explainScore entry should be present 405 require.Contains(t, res[0].Object.Additional, "explainScore") 406 require.Contains(t, res[0].Object.Additional["explainScore"], "BM25") 407 }) 408 409 t.Run("Array fields text", func(t *testing.T) { 410 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"multiTitles"}, Query: "dinner"} 411 res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0) 412 require.Nil(t, err) 413 414 require.Len(t, res, 2) 415 require.Equal(t, uint64(0), res[0].DocID) 416 require.Equal(t, uint64(1), res[1].DocID) 417 }) 418 419 t.Run("Array fields string", func(t *testing.T) { 420 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"multiTextWhitespace"}, Query: "MuuultiYell!"} 421 res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0) 422 require.Nil(t, err) 423 424 require.Len(t, res, 2) 425 require.Equal(t, uint64(6), res[0].DocID) 426 require.Equal(t, uint64(5), res[1].DocID) 427 }) 428 429 t.Run("With autocut", func(t *testing.T) { 430 kwr := &searchparams.KeywordRanking{Type: "bm25", Query: "journey", Properties: []string{"description"}} 431 resNoAutoCut, noautocutscores, err := idx.objectSearch(context.TODO(), 10, nil, kwr, nil, nil, addit, nil, "", 0) 432 require.Nil(t, err) 433 434 resAutoCut, autocutscores, err := idx.objectSearch(context.TODO(), 10, nil, kwr, nil, nil, addit, nil, "", 1) 435 require.Nil(t, err) 436 437 require.Less(t, len(resAutoCut), len(resNoAutoCut)) 438 439 require.EqualValues(t, float32(0.5868752), noautocutscores[0]) 440 require.EqualValues(t, float32(0.5450892), noautocutscores[1]) // <= autocut last element 441 require.EqualValues(t, float32(0.34149727), noautocutscores[2]) 442 require.EqualValues(t, float32(0.3049518), noautocutscores[3]) 443 require.EqualValues(t, float32(0.27547202), noautocutscores[4]) 444 445 require.Len(t, resAutoCut, 2) 446 require.EqualValues(t, float32(0.5868752), autocutscores[0]) 447 require.EqualValues(t, float32(0.5450892), autocutscores[1]) 448 }) 449 } 450 451 func TestBM25FSingleProp(t *testing.T) { 452 dirName := t.TempDir() 453 454 logger := logrus.New() 455 schemaGetter := &fakeSchemaGetter{ 456 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 457 shardState: singleShardState(), 458 } 459 repo, err := New(logger, Config{ 460 MemtablesFlushDirtyAfter: 60, 461 RootPath: dirName, 462 QueryMaximumResults: 10000, 463 MaxImportGoroutinesFactor: 1, 464 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil) 465 require.Nil(t, err) 466 repo.SetSchemaGetter(schemaGetter) 467 require.Nil(t, repo.WaitForStartup(context.TODO())) 468 defer repo.Shutdown(context.Background()) 469 470 SetupClass(t, repo, schemaGetter, logger, 0.5, 100) 471 472 idx := repo.GetIndex("MyClass") 473 require.NotNil(t, idx) 474 475 // Check boosted 476 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "journey"} 477 addit := additional.Properties{} 478 res, scores, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 479 t.Log("--- Start results for singleprop search ---") 480 for i, r := range res { 481 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 482 } 483 require.Nil(t, err) 484 // Check results in correct order 485 require.Equal(t, uint64(3), res[0].DocID) 486 require.Equal(t, uint64(4), res[3].DocID) 487 488 // Check scores 489 EqualFloats(t, float32(0.1248), scores[0], 5) 490 EqualFloats(t, float32(0.0363), scores[1], 5) 491 } 492 493 func TestBM25FWithFilters(t *testing.T) { 494 dirName := t.TempDir() 495 496 logger := logrus.New() 497 schemaGetter := &fakeSchemaGetter{ 498 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 499 shardState: singleShardState(), 500 } 501 repo, err := New(logger, Config{ 502 MemtablesFlushDirtyAfter: 60, 503 RootPath: dirName, 504 QueryMaximumResults: 10000, 505 MaxImportGoroutinesFactor: 1, 506 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil) 507 require.Nil(t, err) 508 repo.SetSchemaGetter(schemaGetter) 509 require.Nil(t, repo.WaitForStartup(context.TODO())) 510 defer repo.Shutdown(context.Background()) 511 512 SetupClass(t, repo, schemaGetter, logger, 0.5, 100) 513 514 idx := repo.GetIndex("MyClass") 515 require.NotNil(t, idx) 516 517 filter := &filters.LocalFilter{ 518 Root: &filters.Clause{ 519 Operator: filters.OperatorOr, 520 Operands: []filters.Clause{ 521 { 522 Operator: filters.OperatorEqual, 523 On: &filters.Path{ 524 Class: schema.ClassName("MyClass"), 525 Property: schema.PropertyName("title"), 526 }, 527 Value: &filters.Value{ 528 Value: "My", 529 Type: schema.DataType("text"), 530 }, 531 }, 532 { 533 Operator: filters.OperatorEqual, 534 On: &filters.Path{ 535 Class: schema.ClassName("MyClass"), 536 Property: schema.PropertyName("title"), 537 }, 538 Value: &filters.Value{ 539 Value: "journeys", 540 Type: schema.DataType("text"), 541 }, 542 }, 543 }, 544 }, 545 } 546 547 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "journey"} 548 addit := additional.Properties{} 549 res, _, err := idx.objectSearch(context.TODO(), 1000, filter, kwr, nil, nil, addit, nil, "", 0) 550 551 require.Nil(t, err) 552 require.True(t, len(res) == 1) 553 require.Equal(t, uint64(2), res[0].DocID) 554 } 555 556 func TestBM25FWithFilters_ScoreIsIdenticalWithOrWithoutFilter(t *testing.T) { 557 dirName := t.TempDir() 558 559 logger := logrus.New() 560 schemaGetter := &fakeSchemaGetter{ 561 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 562 shardState: singleShardState(), 563 } 564 repo, err := New(logger, Config{ 565 MemtablesFlushDirtyAfter: 60, 566 RootPath: dirName, 567 QueryMaximumResults: 10000, 568 MaxImportGoroutinesFactor: 1, 569 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil) 570 require.Nil(t, err) 571 repo.SetSchemaGetter(schemaGetter) 572 require.Nil(t, repo.WaitForStartup(context.TODO())) 573 defer repo.Shutdown(context.Background()) 574 575 SetupClassForFilterScoringTest(t, repo, schemaGetter, logger, 1.2, 0.75) 576 577 idx := repo.GetIndex("FilterClass") 578 require.NotNil(t, idx) 579 580 filter := &filters.LocalFilter{ 581 Root: &filters.Clause{ 582 On: &filters.Path{ 583 Class: schema.ClassName("FilterClass"), 584 Property: schema.PropertyName("relatedToGolf"), 585 }, 586 Operator: filters.OperatorEqual, 587 Value: &filters.Value{ 588 Value: true, 589 Type: dtBool, 590 }, 591 }, 592 } 593 594 kwr := &searchparams.KeywordRanking{ 595 Type: "bm25", 596 Properties: []string{"description"}, 597 Query: "koepka golf", 598 } 599 600 addit := additional.Properties{} 601 filtered, filteredScores, err := idx.objectSearch(context.TODO(), 1000, filter, kwr, nil, nil, addit, nil, "", 0) 602 require.Nil(t, err) 603 unfiltered, unfilteredScores, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 604 require.Nil(t, err) 605 606 require.Len(t, filtered, 1) // should match exactly one element 607 require.Len(t, unfiltered, 2) // contains irrelevant result 608 609 assert.Equal(t, uint64(0), filtered[0].DocID) // brooks koepka result 610 assert.Equal(t, uint64(0), unfiltered[0].DocID) // brooks koepka result 611 612 assert.Equal(t, filteredScores[0], unfilteredScores[0]) 613 } 614 615 func TestBM25FDifferentParamsJourney(t *testing.T) { 616 dirName := t.TempDir() 617 618 logger := logrus.New() 619 schemaGetter := &fakeSchemaGetter{ 620 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 621 shardState: singleShardState(), 622 } 623 repo, err := New(logger, Config{ 624 MemtablesFlushDirtyAfter: 60, 625 RootPath: dirName, 626 QueryMaximumResults: 10000, 627 MaxImportGoroutinesFactor: 1, 628 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil) 629 require.Nil(t, err) 630 repo.SetSchemaGetter(schemaGetter) 631 require.Nil(t, repo.WaitForStartup(context.TODO())) 632 defer repo.Shutdown(context.Background()) 633 634 SetupClass(t, repo, schemaGetter, logger, 0.5, 100) 635 636 idx := repo.GetIndex("MyClass") 637 require.NotNil(t, idx) 638 639 // Check boosted 640 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title^2", "description"}, Query: "journey"} 641 addit := additional.Properties{} 642 res, scores, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0) 643 644 // Print results 645 t.Log("--- Start results for boosted search ---") 646 for i, r := range res { 647 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 648 } 649 650 require.Nil(t, err) 651 652 // Check results in correct order 653 require.Equal(t, uint64(6), res[0].DocID) 654 require.Equal(t, uint64(1), res[3].DocID) 655 656 // Print results 657 t.Log("--- Start results for boosted search ---") 658 for i, r := range res { 659 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 660 } 661 662 // Check scores 663 EqualFloats(t, float32(0.06023), scores[0], 6) 664 EqualFloats(t, float32(0.04238), scores[1], 6) 665 } 666 667 func EqualFloats(t *testing.T, expected, actual float32, significantFigures int) { 668 s1 := fmt.Sprintf("%v", expected) 669 s2 := fmt.Sprintf("%v", actual) 670 if len(s1) < 2 || len(s2) < 2 { 671 t.Fail() 672 } 673 if len(s1) <= significantFigures { 674 significantFigures = len(s1) - 1 675 } 676 if len(s2) <= significantFigures { 677 significantFigures = len(s2) - 1 678 } 679 require.Equal(t, s1[:significantFigures+1], s2[:significantFigures+1]) 680 } 681 682 // Compare with previous BM25 version to ensure the algorithm functions correctly 683 func TestBM25FCompare(t *testing.T) { 684 dirName := t.TempDir() 685 686 logger := logrus.New() 687 schemaGetter := &fakeSchemaGetter{ 688 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 689 shardState: singleShardState(), 690 } 691 repo, err := New(logger, Config{ 692 MemtablesFlushDirtyAfter: 60, 693 RootPath: dirName, 694 QueryMaximumResults: 10000, 695 MaxImportGoroutinesFactor: 1, 696 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil) 697 require.Nil(t, err) 698 repo.SetSchemaGetter(schemaGetter) 699 require.Nil(t, repo.WaitForStartup(context.TODO())) 700 defer repo.Shutdown(context.Background()) 701 702 SetupClass(t, repo, schemaGetter, logger, 0.5, 100) 703 704 idx := repo.GetIndex("MyClass") 705 require.NotNil(t, idx) 706 707 shardNames := idx.getSchema.CopyShardingState(idx.Config.ClassName.String()).AllPhysicalShards() 708 709 for _, shardName := range shardNames { 710 shard := idx.shards.Load(shardName) 711 t.Logf("------ BM25F --------\n") 712 kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title"}, Query: "journey"} 713 addit := additional.Properties{} 714 715 withBM25Fobjs, withBM25Fscores, err := shard.ObjectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit) 716 require.Nil(t, err) 717 718 for i, r := range withBM25Fobjs { 719 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, withBM25Fscores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 720 } 721 722 t.Logf("------ BM25 --------\n") 723 kwr.Type = "" 724 725 objs, scores, err := shard.ObjectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit) 726 require.Nil(t, err) 727 728 for i, r := range objs { 729 t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID, scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional) 730 } 731 732 require.Equal(t, len(withBM25Fobjs), len(objs)) 733 for i := range objs { 734 t.Logf("%v: BM25F score: %v, BM25 score: %v", i, withBM25Fscores[i], scores[i]) 735 EqualFloats(t, withBM25Fscores[i], scores[i], 9) 736 } 737 738 // Not all the scores are unique and the search is not stable, so pick ones that don't move 739 require.Equal(t, uint64(4), objs[0].DocID) 740 require.Equal(t, uint64(5), objs[1].DocID) 741 require.Equal(t, uint64(6), objs[2].DocID) 742 require.Equal(t, uint64(1), objs[3].DocID) 743 require.Equal(t, uint64(2), objs[4].DocID) 744 require.Equal(t, uint64(0), objs[5].DocID) 745 746 require.Equal(t, uint64(4), withBM25Fobjs[0].DocID) 747 require.Equal(t, uint64(5), withBM25Fobjs[1].DocID) 748 require.Equal(t, uint64(6), withBM25Fobjs[2].DocID) 749 require.Equal(t, uint64(1), withBM25Fobjs[3].DocID) 750 require.Equal(t, uint64(2), withBM25Fobjs[4].DocID) 751 require.Equal(t, uint64(0), withBM25Fobjs[5].DocID) 752 753 } 754 } 755 756 func Test_propertyHasSearchableIndex(t *testing.T) { 757 vFalse := false 758 vTrue := true 759 760 class := &models.Class{ 761 VectorIndexConfig: enthnsw.NewDefaultUserConfig(), 762 InvertedIndexConfig: BM25FinvertedConfig(1, 1, "none"), 763 Class: "MyClass", 764 765 Properties: []*models.Property{ 766 { 767 Name: "title", 768 DataType: schema.DataTypeText.PropString(), 769 Tokenization: models.PropertyTokenizationWord, 770 IndexFilterable: &vFalse, 771 IndexSearchable: nil, 772 }, 773 { 774 Name: "description", 775 DataType: schema.DataTypeText.PropString(), 776 Tokenization: models.PropertyTokenizationWord, 777 IndexFilterable: &vFalse, 778 IndexSearchable: &vTrue, 779 }, 780 { 781 Name: "textField", 782 DataType: schema.DataTypeText.PropString(), 783 Tokenization: models.PropertyTokenizationField, 784 IndexFilterable: &vFalse, 785 IndexSearchable: &vFalse, 786 }, 787 }, 788 } 789 790 ClassSchema := &models.Schema{ 791 Classes: []*models.Class{class}, 792 } 793 t.Run("Property index", func(t *testing.T) { 794 if got := inverted.PropertyHasSearchableIndex(ClassSchema, "MyClass", "description"); got != true { 795 t.Errorf("PropertyHasSearchableIndex() = %v, want %v", got, true) 796 } 797 798 if got := inverted.PropertyHasSearchableIndex(ClassSchema, "MyClass", "description^2"); got != true { 799 t.Errorf("PropertyHasSearchableIndex() = %v, want %v", got, true) 800 } 801 802 if got := inverted.PropertyHasSearchableIndex(ClassSchema, "MyClass", "textField"); got != false { 803 t.Errorf("PropertyHasSearchableIndex() = %v, want %v", got, false) 804 } 805 806 if got := inverted.PropertyHasSearchableIndex(ClassSchema, "MyClass", "title"); got != true { 807 t.Errorf("PropertyHasSearchableIndex() = %v, want %v", got, true) 808 } 809 }) 810 } 811 812 func SetupClassDocuments(t require.TestingT, repo *DB, schemaGetter *fakeSchemaGetter, logger logrus.FieldLogger, k1, b float32, preset string, 813 ) string { 814 vFalse := false 815 vTrue := true 816 817 className := "DocumentsPreset_" + preset 818 class := &models.Class{ 819 VectorIndexConfig: enthnsw.NewDefaultUserConfig(), 820 InvertedIndexConfig: BM25FinvertedConfig(k1, b, preset), 821 Class: className, 822 823 Properties: []*models.Property{ 824 { 825 Name: "document", 826 DataType: schema.DataTypeText.PropString(), 827 Tokenization: models.PropertyTokenizationWord, 828 IndexFilterable: &vFalse, 829 IndexSearchable: &vTrue, 830 }, 831 }, 832 } 833 schemaGetter.schema = schema.Schema{ 834 Objects: &models.Schema{ 835 Classes: []*models.Class{class}, 836 }, 837 } 838 839 migrator := NewMigrator(repo, logger) 840 migrator.AddClass(context.Background(), class, schemaGetter.shardState) 841 842 testData := []map[string]interface{}{} 843 testData = append(testData, map[string]interface{}{"document": "No matter what you do, the question of \"\"what is income\"\" is *always* going to be an extremely complex question. To use this particular example, is paying a royalty fee to an external party a legitimate business expense that is part of the cost of doing business and which subtracts from your \"\"income\"\"?"}) 844 testData = append(testData, map[string]interface{}{"document": "test"}) 845 testData = append(testData, map[string]interface{}{"document": "As long as the losing business is not considered \"\"passive activity\"\" or \"\"hobby\"\", then yes. Passive Activity is an activity where you do not have to actively do anything to generate income. For example - royalties or rentals. Hobby is an activity that doesn't generate profit. Generally, if your business doesn't consistently generate profit (the IRS looks at 3 out of the last 5 years), it may be characterized as hobby. For hobby, loss deduction is limited by the hobby income and the 2% AGI threshold."}) 846 testData = append(testData, map[string]interface{}{"document": "So you're basically saying that average market fluctuations have an affect on individual stocks, because individual stocks are often priced in relation to the growth of the market as a whole? Also, what kinds of investments would be considered \"\"risk free\"\" in this nomenclature?"}) 847 848 for i, data := range testData { 849 id := strfmt.UUID(uuid.MustParse(fmt.Sprintf("%032d", i)).String()) 850 851 obj := &models.Object{Class: className, ID: id, Properties: data, CreationTimeUnix: 1565612833955, LastUpdateTimeUnix: 10000020} 852 vector := []float32{1, 3, 5, 0.4} 853 //{title: "Our journey to BM25F", description: " This is how we get to BM25F"}} 854 err := repo.PutObject(context.Background(), obj, vector, nil, nil) 855 require.Nil(t, err) 856 } 857 return className 858 } 859 860 func TestBM25F_ComplexDocuments(t *testing.T) { 861 dirName := t.TempDir() 862 863 logger := logrus.New() 864 schemaGetter := &fakeSchemaGetter{ 865 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 866 shardState: singleShardState(), 867 } 868 schemaGetter.schema = schema.Schema{ 869 Objects: &models.Schema{ 870 Classes: []*models.Class{}, 871 }, 872 } 873 repo, err := New(logger, Config{ 874 MemtablesFlushDirtyAfter: 60, 875 RootPath: dirName, 876 QueryMaximumResults: 10000, 877 MaxImportGoroutinesFactor: 1, 878 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil) 879 require.Nil(t, err) 880 repo.SetSchemaGetter(schemaGetter) 881 require.Nil(t, repo.WaitForStartup(context.TODO())) 882 defer repo.Shutdown(context.Background()) 883 884 classNone := SetupClassDocuments(t, repo, schemaGetter, logger, 0.5, 0.75, "none") 885 idxNone := repo.GetIndex(schema.ClassName(classNone)) 886 require.NotNil(t, idxNone) 887 888 addit := additional.Properties{} 889 890 t.Run("single term", func(t *testing.T) { 891 kwr := &searchparams.KeywordRanking{Type: "bm25", Query: "considered a"} 892 res, scores, err := idxNone.objectSearch(context.TODO(), 10, nil, kwr, nil, nil, addit, nil, "", 0) 893 require.Nil(t, err) 894 895 // Print results 896 t.Log("--- Start results for boosted search ---") 897 for i, r := range res { 898 t.Logf("Result id: %v, score: %v, \n", r.DocID, scores[i]) 899 } 900 901 // Check results in correct order 902 require.Equal(t, uint64(3), res[0].DocID) 903 require.Equal(t, uint64(0), res[1].DocID) 904 require.Equal(t, uint64(2), res[2].DocID) 905 require.Len(t, res, 3) 906 907 // Check scores 908 EqualFloats(t, float32(0.8914), scores[0], 5) 909 EqualFloats(t, float32(0.5425), scores[1], 5) 910 EqualFloats(t, float32(0.3952), scores[2], 5) 911 }) 912 913 t.Run("Results without stopwords", func(t *testing.T) { 914 kwrNoStopwords := &searchparams.KeywordRanking{Type: "bm25", Query: "example losing business"} 915 resNoStopwords, resNoScores, err := idxNone.objectSearch(context.TODO(), 10, nil, kwrNoStopwords, nil, nil, addit, nil, "", 0) 916 require.Nil(t, err) 917 918 classEn := SetupClassDocuments(t, repo, schemaGetter, logger, 0.5, 0.75, "en") 919 idxEn := repo.GetIndex(schema.ClassName(classEn)) 920 require.NotNil(t, idxEn) 921 kwrStopwords := &searchparams.KeywordRanking{Type: "bm25", Query: "an example on losing the business"} 922 resStopwords, resScores, err := idxEn.objectSearch(context.TODO(), 10, nil, kwrStopwords, nil, nil, addit, nil, "", 0) 923 require.Nil(t, err) 924 925 require.Equal(t, len(resNoStopwords), len(resStopwords)) 926 for i, resNo := range resNoStopwords { 927 resYes := resStopwords[i] 928 require.Equal(t, resNo.DocID, resYes.DocID) 929 require.Equal(t, resNoScores[i], resScores[i]) 930 } 931 932 kwrStopwordsDuplicate := &searchparams.KeywordRanking{Type: "bm25", Query: "on an example on losing the business on"} 933 resStopwordsDuplicate, duplicateScores, err := idxEn.objectSearch(context.TODO(), 10, nil, kwrStopwordsDuplicate, nil, nil, addit, nil, "", 0) 934 require.Nil(t, err) 935 require.Equal(t, len(resNoStopwords), len(resStopwordsDuplicate)) 936 for i, resNo := range resNoStopwords { 937 resYes := resStopwordsDuplicate[i] 938 require.Equal(t, resNo.DocID, resYes.DocID) 939 require.Equal(t, resNoScores[i], duplicateScores[i]) 940 } 941 }) 942 } 943 944 func MultiPropClass(t require.TestingT, repo *DB, schemaGetter *fakeSchemaGetter, logger logrus.FieldLogger, k1, b float32) string { 945 vFalse := false 946 vTrue := true 947 948 className := "MultiProps" 949 class := &models.Class{ 950 VectorIndexConfig: enthnsw.NewDefaultUserConfig(), 951 InvertedIndexConfig: BM25FinvertedConfig(k1, b, "none"), 952 Class: className, 953 954 Properties: []*models.Property{ 955 { 956 Name: "document", 957 DataType: schema.DataTypeText.PropString(), 958 Tokenization: models.PropertyTokenizationWord, 959 IndexFilterable: &vFalse, 960 IndexSearchable: &vTrue, 961 }, 962 { 963 Name: "title", 964 DataType: schema.DataTypeText.PropString(), 965 Tokenization: models.PropertyTokenizationWord, 966 IndexFilterable: &vFalse, 967 IndexSearchable: &vTrue, 968 }, 969 }, 970 } 971 schemaGetter.schema = schema.Schema{ 972 Objects: &models.Schema{ 973 Classes: []*models.Class{class}, 974 }, 975 } 976 977 migrator := NewMigrator(repo, logger) 978 migrator.AddClass(context.Background(), class, schemaGetter.shardState) 979 980 testData := []map[string]interface{}{} 981 testData = append(testData, map[string]interface{}{"document": "test", "title": "pepper"}) 982 testData = append(testData, map[string]interface{}{"document": "banana", "title": "pepper"}) 983 testData = append(testData, map[string]interface{}{"document": "apple", "title": "banana taste great"}) 984 testData = append(testData, map[string]interface{}{"document": "banana burger", "title": "test"}) 985 testData = append(testData, map[string]interface{}{"document": "carotte", "title": "great"}) 986 987 for i, data := range testData { 988 id := strfmt.UUID(uuid.MustParse(fmt.Sprintf("%032d", i)).String()) 989 990 obj := &models.Object{Class: className, ID: id, Properties: data, CreationTimeUnix: 1565612833955, LastUpdateTimeUnix: 10000020} 991 vector := []float32{1, 3, 5, 0.4} 992 err := repo.PutObject(context.Background(), obj, vector, nil, nil) 993 require.Nil(t, err) 994 } 995 return className 996 } 997 998 func TestBM25F_SortMultiProp(t *testing.T) { 999 t.Skip("Currently failing") 1000 dirName := t.TempDir() 1001 1002 logger := logrus.New() 1003 schemaGetter := &fakeSchemaGetter{ 1004 schema: schema.Schema{Objects: &models.Schema{Classes: nil}}, 1005 shardState: singleShardState(), 1006 } 1007 schemaGetter.schema = schema.Schema{ 1008 Objects: &models.Schema{ 1009 Classes: []*models.Class{}, 1010 }, 1011 } 1012 repo, err := New(logger, Config{ 1013 MemtablesFlushDirtyAfter: 60, 1014 RootPath: dirName, 1015 QueryMaximumResults: 10000, 1016 MaxImportGoroutinesFactor: 1, 1017 }, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil) 1018 require.Nil(t, err) 1019 repo.SetSchemaGetter(schemaGetter) 1020 require.Nil(t, repo.WaitForStartup(context.TODO())) 1021 defer repo.Shutdown(context.Background()) 1022 1023 idx := repo.GetIndex(schema.ClassName(MultiPropClass(t, repo, schemaGetter, logger, 0.5, 0.75))) 1024 require.NotNil(t, idx) 1025 1026 addit := additional.Properties{} 1027 1028 t.Run("single term", func(t *testing.T) { 1029 kwr := &searchparams.KeywordRanking{Type: "bm25", Query: "pepper banana"} 1030 res, scores, err := idx.objectSearch(context.TODO(), 1, nil, kwr, nil, nil, addit, nil, "", 0) 1031 require.Nil(t, err) 1032 1033 // Print results 1034 t.Log("--- Start results for boosted search ---") 1035 for i, r := range res { 1036 t.Logf("Result id: %v, score: %v, \n", r.DocID, scores[i]) 1037 } 1038 1039 // Document 1 is a result for both terms 1040 require.Len(t, res, 1) 1041 require.Equal(t, uint64(1), res[0].DocID) 1042 }) 1043 }