github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/analyzer_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package inverted 13 14 import ( 15 "bytes" 16 "math" 17 "sort" 18 "testing" 19 20 "github.com/stretchr/testify/assert" 21 "github.com/stretchr/testify/require" 22 "github.com/weaviate/weaviate/entities/models" 23 ) 24 25 func TestAnalyzer(t *testing.T) { 26 a := NewAnalyzer(nil) 27 28 countable := func(data []string, freq []int) []Countable { 29 countable := make([]Countable, len(data)) 30 for i := range data { 31 countable[i] = Countable{ 32 Data: []byte(data[i]), 33 TermFrequency: float32(freq[i]), 34 } 35 } 36 return countable 37 } 38 39 t.Run("with text", func(t *testing.T) { 40 type testCase struct { 41 name string 42 input string 43 tokenization string 44 expectedCountable []Countable 45 } 46 47 testCases := []testCase{ 48 { 49 name: "tokenization word, unique words", 50 input: "Hello, my name is John Doe", 51 tokenization: models.PropertyTokenizationWord, 52 expectedCountable: countable( 53 []string{"hello", "my", "name", "is", "john", "doe"}, 54 []int{1, 1, 1, 1, 1, 1}, 55 ), 56 }, 57 { 58 name: "tokenization word, duplicated words", 59 input: "Du. Du hast. Du hast. Du hast mich gefragt.", 60 tokenization: models.PropertyTokenizationWord, 61 expectedCountable: countable( 62 []string{"du", "hast", "mich", "gefragt"}, 63 []int{4, 3, 1, 1}, 64 ), 65 }, 66 { 67 name: "tokenization lowercase, unique words", 68 input: "My email is john-thats-jay.ohh.age.n+alloneword@doe.com", 69 tokenization: models.PropertyTokenizationLowercase, 70 expectedCountable: countable( 71 []string{"my", "email", "is", "john-thats-jay.ohh.age.n+alloneword@doe.com"}, 72 []int{1, 1, 1, 1}, 73 ), 74 }, 75 { 76 name: "tokenization lowercase, duplicated words", 77 input: "Du. Du hast. Du hast. Du hast mich gefragt.", 78 tokenization: models.PropertyTokenizationLowercase, 79 expectedCountable: countable( 80 []string{"du.", "du", "hast.", "hast", "mich", "gefragt."}, 81 []int{1, 3, 2, 1, 1, 1}, 82 ), 83 }, 84 { 85 name: "tokenization whitespace, unique words", 86 input: "My email is john-thats-jay.ohh.age.n+alloneword@doe.com", 87 tokenization: models.PropertyTokenizationWhitespace, 88 expectedCountable: countable( 89 []string{"My", "email", "is", "john-thats-jay.ohh.age.n+alloneword@doe.com"}, 90 []int{1, 1, 1, 1}, 91 ), 92 }, 93 { 94 name: "tokenization whitespace, duplicated words", 95 input: "Du. Du hast. Du hast. Du hast mich gefragt.", 96 tokenization: models.PropertyTokenizationWhitespace, 97 expectedCountable: countable( 98 []string{"Du.", "Du", "hast.", "hast", "mich", "gefragt."}, 99 []int{1, 3, 2, 1, 1, 1}, 100 ), 101 }, 102 { 103 name: "tokenization field", 104 input: "\n Du. Du hast. Du hast. Du hast mich gefragt.\t ", 105 tokenization: models.PropertyTokenizationField, 106 expectedCountable: countable( 107 []string{"Du. Du hast. Du hast. Du hast mich gefragt."}, 108 []int{1}, 109 ), 110 }, 111 { 112 name: "non existing tokenization", 113 input: "Du. Du hast. Du hast. Du hast mich gefragt.", 114 tokenization: "non_existing", 115 expectedCountable: []Countable{}, 116 }, 117 } 118 119 for _, tc := range testCases { 120 t.Run(tc.name, func(t *testing.T) { 121 countable := a.Text(tc.tokenization, tc.input) 122 assert.ElementsMatch(t, tc.expectedCountable, countable) 123 }) 124 } 125 }) 126 127 t.Run("with text array", func(t *testing.T) { 128 type testCase struct { 129 name string 130 input []string 131 tokenization string 132 expectedCountable []Countable 133 } 134 135 testCases := []testCase{ 136 { 137 name: "tokenization word, unique words", 138 input: []string{"Hello,", "my name is John Doe"}, 139 tokenization: models.PropertyTokenizationWord, 140 expectedCountable: countable( 141 []string{"hello", "my", "name", "is", "john", "doe"}, 142 []int{1, 1, 1, 1, 1, 1}, 143 ), 144 }, 145 { 146 name: "tokenization word, duplicated words", 147 input: []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."}, 148 tokenization: models.PropertyTokenizationWord, 149 expectedCountable: countable( 150 []string{"du", "hast", "mich", "gefragt"}, 151 []int{4, 3, 1, 1}, 152 ), 153 }, 154 { 155 name: "tokenization lowercase, unique words", 156 input: []string{"My email", "is john-thats-jay.ohh.age.n+alloneword@doe.com"}, 157 tokenization: models.PropertyTokenizationLowercase, 158 expectedCountable: countable( 159 []string{"my", "email", "is", "john-thats-jay.ohh.age.n+alloneword@doe.com"}, 160 []int{1, 1, 1, 1}, 161 ), 162 }, 163 { 164 name: "tokenization lowercase, duplicated words", 165 input: []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."}, 166 tokenization: models.PropertyTokenizationLowercase, 167 expectedCountable: countable( 168 []string{"du.", "du", "hast.", "hast", "mich", "gefragt."}, 169 []int{1, 3, 2, 1, 1, 1}, 170 ), 171 }, 172 { 173 name: "tokenization whitespace, unique words", 174 input: []string{"My email", "is john-thats-jay.ohh.age.n+alloneword@doe.com"}, 175 tokenization: models.PropertyTokenizationWhitespace, 176 expectedCountable: countable( 177 []string{"My", "email", "is", "john-thats-jay.ohh.age.n+alloneword@doe.com"}, 178 []int{1, 1, 1, 1}, 179 ), 180 }, 181 { 182 name: "tokenization whitespace, duplicated words", 183 input: []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."}, 184 tokenization: models.PropertyTokenizationWhitespace, 185 expectedCountable: countable( 186 []string{"Du.", "Du", "hast.", "hast", "mich", "gefragt."}, 187 []int{1, 3, 2, 1, 1, 1}, 188 ), 189 }, 190 { 191 name: "tokenization field", 192 input: []string{"\n Du. Du hast. Du hast.", "Du hast mich gefragt.\t "}, 193 tokenization: models.PropertyTokenizationField, 194 expectedCountable: countable( 195 []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."}, 196 []int{1, 1}, 197 ), 198 }, 199 { 200 name: "non existing tokenization", 201 input: []string{"Du. Du hast. Du hast.", "Du hast mich gefragt."}, 202 tokenization: "non_existing", 203 expectedCountable: []Countable{}, 204 }, 205 } 206 207 for _, tc := range testCases { 208 t.Run(tc.name, func(t *testing.T) { 209 countable := a.TextArray(tc.tokenization, tc.input) 210 assert.ElementsMatch(t, tc.expectedCountable, countable) 211 }) 212 } 213 }) 214 215 t.Run("with int it stays sortable", func(t *testing.T) { 216 getData := func(in []Countable, err error) []byte { 217 require.Nil(t, err) 218 return in[0].Data 219 } 220 221 results := [][]byte{ 222 getData(a.Float(math.MinInt64)), 223 getData(a.Int(-1000000)), 224 getData(a.Int(-400000)), 225 getData(a.Int(-20000)), 226 getData(a.Int(-9000)), 227 getData(a.Int(-301)), 228 getData(a.Int(-300)), 229 getData(a.Int(-299)), 230 getData(a.Int(-1)), 231 getData(a.Int(0)), 232 getData(a.Int(1)), 233 getData(a.Int(299)), 234 getData(a.Int(300)), 235 getData(a.Int(301)), 236 getData(a.Int(9000)), 237 getData(a.Int(20000)), 238 getData(a.Int(400000)), 239 getData(a.Int(1000000)), 240 getData(a.Float(math.MaxInt64)), 241 } 242 243 afterSort := make([][]byte, len(results)) 244 copy(afterSort, results) 245 sort.Slice(afterSort, func(a, b int) bool { return bytes.Compare(afterSort[a], afterSort[b]) == -1 }) 246 assert.Equal(t, results, afterSort) 247 }) 248 249 t.Run("with float it stays sortable", func(t *testing.T) { 250 getData := func(in []Countable, err error) []byte { 251 require.Nil(t, err) 252 return in[0].Data 253 } 254 255 results := [][]byte{ 256 getData(a.Float(-math.MaxFloat64)), 257 getData(a.Float(-1000000)), 258 getData(a.Float(-400000)), 259 getData(a.Float(-20000)), 260 getData(a.Float(-9000.9)), 261 getData(a.Float(-9000.8999)), 262 getData(a.Float(-9000.8998)), 263 getData(a.Float(-9000.79999)), 264 getData(a.Float(-301)), 265 getData(a.Float(-300)), 266 getData(a.Float(-299)), 267 getData(a.Float(-1)), 268 getData(a.Float(-0.09)), 269 getData(a.Float(-0.01)), 270 getData(a.Float(-0.009)), 271 getData(a.Float(0)), 272 getData(a.Float(math.SmallestNonzeroFloat64)), 273 getData(a.Float(0.009)), 274 getData(a.Float(0.01)), 275 getData(a.Float(0.09)), 276 getData(a.Float(0.1)), 277 getData(a.Float(0.9)), 278 getData(a.Float(1)), 279 getData(a.Float(299)), 280 getData(a.Float(300)), 281 getData(a.Float(301)), 282 getData(a.Float(9000)), 283 getData(a.Float(20000)), 284 getData(a.Float(400000)), 285 getData(a.Float(1000000)), 286 getData(a.Float(math.MaxFloat64)), 287 } 288 289 afterSort := make([][]byte, len(results)) 290 copy(afterSort, results) 291 sort.Slice(afterSort, func(a, b int) bool { return bytes.Compare(afterSort[a], afterSort[b]) == -1 }) 292 assert.Equal(t, results, afterSort) 293 }) 294 295 t.Run("with refCount it stays sortable", func(t *testing.T) { 296 getData := func(in []Countable, err error) []byte { 297 require.Nil(t, err) 298 return in[0].Data 299 } 300 301 results := [][]byte{ 302 getData(a.RefCount(make(models.MultipleRef, 0))), 303 getData(a.RefCount(make(models.MultipleRef, 1))), 304 getData(a.RefCount(make(models.MultipleRef, 2))), 305 getData(a.RefCount(make(models.MultipleRef, 99))), 306 getData(a.RefCount(make(models.MultipleRef, 100))), 307 getData(a.RefCount(make(models.MultipleRef, 101))), 308 getData(a.RefCount(make(models.MultipleRef, 256))), 309 getData(a.RefCount(make(models.MultipleRef, 300))), 310 getData(a.RefCount(make(models.MultipleRef, 456))), 311 } 312 313 afterSort := make([][]byte, len(results)) 314 copy(afterSort, results) 315 sort.Slice(afterSort, func(a, b int) bool { return bytes.Compare(afterSort[a], afterSort[b]) == -1 }) 316 assert.Equal(t, results, afterSort) 317 }) 318 319 byteTrue := []byte{0x1} 320 byteFalse := []byte{0x0} 321 322 t.Run("analyze bool", func(t *testing.T) { 323 t.Run("true", func(t *testing.T) { 324 countable, err := a.Bool(true) 325 require.Nil(t, err) 326 require.Len(t, countable, 1) 327 328 c := countable[0] 329 assert.Equal(t, byteTrue, c.Data) 330 assert.Equal(t, float32(0), c.TermFrequency) 331 }) 332 333 t.Run("false", func(t *testing.T) { 334 countable, err := a.Bool(false) 335 require.Nil(t, err) 336 require.Len(t, countable, 1) 337 338 c := countable[0] 339 assert.Equal(t, byteFalse, c.Data) 340 assert.Equal(t, float32(0), c.TermFrequency) 341 }) 342 }) 343 344 t.Run("analyze bool array", func(t *testing.T) { 345 type testCase struct { 346 name string 347 values []bool 348 expected [][]byte 349 } 350 351 testCases := []testCase{ 352 { 353 name: "[true]", 354 values: []bool{true}, 355 expected: [][]byte{byteTrue}, 356 }, 357 { 358 name: "[false]", 359 values: []bool{false}, 360 expected: [][]byte{byteFalse}, 361 }, 362 { 363 name: "[true, true, true]", 364 values: []bool{true, true, true}, 365 expected: [][]byte{byteTrue, byteTrue, byteTrue}, 366 }, 367 { 368 name: "[false, false, false]", 369 values: []bool{false, false, false}, 370 expected: [][]byte{byteFalse, byteFalse, byteFalse}, 371 }, 372 { 373 name: "[false, true, false, true]", 374 values: []bool{false, true, false, true}, 375 expected: [][]byte{byteFalse, byteTrue, byteFalse, byteTrue}, 376 }, 377 { 378 name: "[]", 379 values: []bool{}, 380 expected: [][]byte{}, 381 }, 382 } 383 384 for _, tc := range testCases { 385 t.Run(tc.name, func(t *testing.T) { 386 countable, err := a.BoolArray(tc.values) 387 require.Nil(t, err) 388 require.Len(t, countable, len(tc.expected)) 389 390 for i := range countable { 391 assert.Equal(t, tc.expected[i], countable[i].Data) 392 assert.Equal(t, float32(0), countable[i].TermFrequency) 393 } 394 }) 395 } 396 }) 397 } 398 399 func TestAnalyzer_DefaultEngPreset(t *testing.T) { 400 countable := func(data []string, freq []int) []Countable { 401 countable := make([]Countable, len(data)) 402 for i := range data { 403 countable[i] = Countable{ 404 Data: []byte(data[i]), 405 TermFrequency: float32(freq[i]), 406 } 407 } 408 return countable 409 } 410 411 a := NewAnalyzer(nil) 412 input := "Hello you-beautiful_World" 413 414 t.Run("with text", func(t *testing.T) { 415 type testCase struct { 416 name string 417 tokenization string 418 input string 419 expectedCountable []Countable 420 } 421 422 testCases := []testCase{ 423 { 424 name: "tokenization word", 425 tokenization: models.PropertyTokenizationWord, 426 input: input, 427 expectedCountable: countable( 428 []string{"hello", "you", "beautiful", "world"}, 429 []int{1, 1, 1, 1}, 430 ), 431 }, 432 { 433 name: "tokenization lowercase", 434 tokenization: models.PropertyTokenizationLowercase, 435 input: input, 436 expectedCountable: countable( 437 []string{"hello", "you-beautiful_world"}, 438 []int{1, 1}, 439 ), 440 }, 441 { 442 name: "tokenization whitespace", 443 tokenization: models.PropertyTokenizationWhitespace, 444 input: input, 445 expectedCountable: countable( 446 []string{"Hello", "you-beautiful_World"}, 447 []int{1, 1}, 448 ), 449 }, 450 { 451 name: "tokenization field", 452 tokenization: models.PropertyTokenizationField, 453 input: input, 454 expectedCountable: countable( 455 []string{"Hello you-beautiful_World"}, 456 []int{1}, 457 ), 458 }, 459 { 460 name: "non existing tokenization", 461 tokenization: "non_existing", 462 input: input, 463 expectedCountable: []Countable{}, 464 }, 465 } 466 467 for _, tc := range testCases { 468 countable := a.Text(tc.tokenization, tc.input) 469 assert.ElementsMatch(t, tc.expectedCountable, countable) 470 } 471 }) 472 473 t.Run("with text array", func(t *testing.T) { 474 type testCase struct { 475 name string 476 tokenization string 477 input []string 478 expectedCountable []Countable 479 } 480 481 testCases := []testCase{ 482 { 483 name: "tokenization word", 484 tokenization: models.PropertyTokenizationWord, 485 input: []string{input, input}, 486 expectedCountable: countable( 487 []string{"hello", "you", "beautiful", "world"}, 488 []int{2, 2, 2, 2}, 489 ), 490 }, 491 { 492 name: "tokenization lowercase", 493 tokenization: models.PropertyTokenizationLowercase, 494 input: []string{input, input}, 495 expectedCountable: countable( 496 []string{"hello", "you-beautiful_world"}, 497 []int{2, 2}, 498 ), 499 }, 500 { 501 name: "tokenization whitespace", 502 tokenization: models.PropertyTokenizationWhitespace, 503 input: []string{input, input}, 504 expectedCountable: countable( 505 []string{"Hello", "you-beautiful_World"}, 506 []int{2, 2}, 507 ), 508 }, 509 { 510 name: "tokenization field", 511 tokenization: models.PropertyTokenizationField, 512 input: []string{input, input}, 513 expectedCountable: countable( 514 []string{"Hello you-beautiful_World"}, 515 []int{2}, 516 ), 517 }, 518 { 519 name: "non existing tokenization", 520 tokenization: "non_existing", 521 input: []string{input, input}, 522 expectedCountable: []Countable{}, 523 }, 524 } 525 526 for _, tc := range testCases { 527 countable := a.TextArray(tc.tokenization, tc.input) 528 assert.ElementsMatch(t, tc.expectedCountable, countable) 529 } 530 }) 531 } 532 533 type fakeStopwordDetector struct{} 534 535 func (fsd fakeStopwordDetector) IsStopword(word string) bool { 536 return false 537 } 538 539 func TestDedupItems(t *testing.T) { 540 props := []Property{ 541 { 542 Name: "propNothingToDo", 543 Items: []Countable{ 544 {Data: []byte("fff"), TermFrequency: 3}, 545 {Data: []byte("eee"), TermFrequency: 2}, 546 {Data: []byte("ddd"), TermFrequency: 1}, 547 }, 548 }, 549 { 550 Name: "propToDedup1", 551 Items: []Countable{ 552 {Data: []byte("aaa"), TermFrequency: 1}, 553 {Data: []byte("bbb"), TermFrequency: 2}, 554 {Data: []byte("ccc"), TermFrequency: 3}, 555 {Data: []byte("aaa"), TermFrequency: 4}, 556 {Data: []byte("ccc"), TermFrequency: 0}, 557 }, 558 }, 559 { 560 Name: "propToDedup2", 561 Items: []Countable{ 562 {Data: []uint8{1}, TermFrequency: 5}, 563 {Data: []uint8{1}, TermFrequency: 4}, 564 {Data: []uint8{1}, TermFrequency: 3}, 565 {Data: []uint8{1}, TermFrequency: 2}, 566 {Data: []uint8{1}, TermFrequency: 1}, 567 }, 568 }, 569 } 570 571 expectedProps := []Property{ 572 { 573 Name: "propNothingToDo", 574 Items: []Countable{ 575 {Data: []byte("fff"), TermFrequency: 3}, 576 {Data: []byte("eee"), TermFrequency: 2}, 577 {Data: []byte("ddd"), TermFrequency: 1}, 578 }, 579 }, 580 { 581 Name: "propToDedup1", 582 Items: []Countable{ 583 {Data: []byte("bbb"), TermFrequency: 2}, 584 {Data: []byte("aaa"), TermFrequency: 4}, 585 {Data: []byte("ccc"), TermFrequency: 0}, 586 }, 587 }, 588 { 589 Name: "propToDedup2", 590 Items: []Countable{ 591 {Data: []uint8{1}, TermFrequency: 1}, 592 }, 593 }, 594 } 595 596 dedupProps := DedupItems(props) 597 assert.Equal(t, expectedProps, dedupProps) 598 }