github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/vectorizer/schema_config_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package vectorizer 13 14 import ( 15 "context" 16 "testing" 17 18 "github.com/sirupsen/logrus" 19 ltest "github.com/sirupsen/logrus/hooks/test" 20 "github.com/stretchr/testify/assert" 21 "github.com/stretchr/testify/require" 22 "github.com/weaviate/weaviate/entities/models" 23 "github.com/weaviate/weaviate/entities/schema" 24 ) 25 26 func TestConfigValidator(t *testing.T) { 27 t.Run("validate class names", func(t *testing.T) { 28 type testCase struct { 29 input string 30 valid bool 31 name string 32 vectorize bool 33 } 34 35 // for all test cases keep in mind that the word "carrot" is not present in 36 // the fake c11y, but every other word is. 37 // 38 // Additionally, the word "the" is a stopword 39 // 40 // all inputs represent class names (!) 41 tests := []testCase{ 42 // valid names 43 { 44 name: "Single uppercase word present in the c11y", 45 input: "Car", 46 valid: true, 47 vectorize: true, 48 }, 49 { 50 name: "Single lowercase word present in the c11y, stored as uppercase", 51 input: "car", 52 valid: true, 53 vectorize: true, 54 }, 55 { 56 name: "combination of valid words starting with uppercase letter", 57 input: "CarGarage", 58 valid: true, 59 vectorize: true, 60 }, 61 { 62 name: "combination of valid words starting with lowercase letter, stored as uppercase", 63 input: "carGarage", 64 valid: true, 65 vectorize: true, 66 }, 67 { 68 name: "combination of valid words and stopwords, starting with uppercase", 69 input: "TheCarGarage", 70 valid: true, 71 vectorize: true, 72 }, 73 { 74 name: "combination of valid words and stopwords starting with lowercase letter, stored as uppercase", 75 input: "carTheGarage", 76 valid: true, 77 vectorize: true, 78 }, 79 80 // invalid names 81 { 82 name: "Single uppercase word NOT present in the c11y", 83 input: "Carrot", 84 valid: false, 85 vectorize: true, 86 }, 87 { 88 name: "Single lowercase word NOT present in the c11y", 89 input: "carrot", 90 valid: false, 91 vectorize: true, 92 }, 93 { 94 name: "Single uppercase stopword", 95 input: "The", 96 valid: false, 97 vectorize: true, 98 }, 99 { 100 name: "Single lowercase stopword", 101 input: "the", 102 valid: false, 103 vectorize: true, 104 }, 105 { 106 name: "combination of valid and invalid words, valid word first lowercased", 107 input: "potatoCarrot", 108 valid: false, 109 vectorize: true, 110 }, 111 { 112 name: "combination of valid and invalid words, valid word first uppercased", 113 input: "PotatoCarrot", 114 valid: false, 115 vectorize: true, 116 }, 117 { 118 name: "combination of valid and invalid words, invalid word first lowercased", 119 input: "carrotPotato", 120 valid: false, 121 vectorize: true, 122 }, 123 { 124 name: "combination of valid and invalid words, invalid word first uppercased", 125 input: "CarrotPotato", 126 valid: false, 127 vectorize: true, 128 }, 129 { 130 name: "combination of only stopwords, starting with lowercase", 131 input: "theThe", 132 valid: false, 133 vectorize: true, 134 }, 135 { 136 name: "combination of only stopwords, starting with uppercase", 137 input: "TheThe", 138 valid: false, 139 vectorize: true, 140 }, 141 142 // vectorize turned off 143 { 144 name: "non-vectorized: combination of only stopwords, starting with uppercase", 145 input: "TheThe", 146 valid: true, 147 vectorize: false, 148 }, 149 { 150 name: "non-vectorized: excluded word", 151 input: "carrot", 152 valid: true, 153 vectorize: false, 154 }, 155 } 156 157 for _, test := range tests { 158 t.Run(test.name+" object class", func(t *testing.T) { 159 class := &models.Class{ 160 Class: test.input, 161 Properties: []*models.Property{{ 162 Name: "dummyPropSoWeDontRunIntoAllNoindexedError", 163 DataType: schema.DataTypeText.PropString(), 164 Tokenization: models.PropertyTokenizationWhitespace, 165 }}, 166 } 167 168 logger, _ := ltest.NewNullLogger() 169 v := NewConfigValidator(&fakeRemote{}, logger) 170 err := v.Do(context.Background(), class, nil, &fakeIndexChecker{ 171 vectorizeClassName: test.vectorize, 172 propertyIndexed: true, 173 }) 174 assert.Equal(t, test.valid, err == nil) 175 176 // only proceed if input was supposed to be valid 177 if test.valid == false { 178 return 179 } 180 }) 181 } 182 }) 183 184 t.Run("validate property names", func(t *testing.T) { 185 type testCase struct { 186 input string 187 valid bool 188 name string 189 vectorize bool 190 } 191 192 // for all test cases keep in mind that the word "carrot" is not present in 193 // the fake c11y, but every other word is 194 // 195 // all inputs represent property names (!) 196 tests := []testCase{ 197 // valid names 198 { 199 name: "Single uppercase word present in the c11y, stored as lowercase", 200 input: "Brand", 201 valid: true, 202 vectorize: true, 203 }, 204 { 205 name: "Single lowercase word present in the c11y", 206 input: "brand", 207 valid: true, 208 vectorize: true, 209 }, 210 { 211 name: "combination of valid words starting with uppercase letter, stored as lowercase", 212 input: "BrandGarage", 213 valid: true, 214 vectorize: true, 215 }, 216 { 217 name: "combination of valid words starting with lowercase letter", 218 input: "brandGarage", 219 valid: true, 220 vectorize: true, 221 }, 222 { 223 name: "combination of valid words and stop words starting with uppercase letter, stored as lowercase", 224 input: "TheGarage", 225 valid: true, 226 vectorize: true, 227 }, 228 { 229 name: "combination of valid words and stop words starting with lowercase letter", 230 input: "theGarage", 231 valid: true, 232 vectorize: true, 233 }, 234 235 // invalid names 236 { 237 name: "Single uppercase word NOT present in the c11y", 238 input: "Carrot", 239 valid: false, 240 vectorize: true, 241 }, 242 { 243 name: "Single lowercase word NOT present in the c11y", 244 input: "carrot", 245 valid: false, 246 vectorize: true, 247 }, 248 { 249 name: "Single lowercase stop word", 250 input: "the", 251 valid: false, 252 vectorize: true, 253 }, 254 { 255 name: "combination of valid and invalid words, valid word first lowercased", 256 input: "potatoCarrot", 257 valid: false, 258 vectorize: true, 259 }, 260 { 261 name: "combination of valid and invalid words, valid word first uppercased", 262 input: "PotatoCarrot", 263 valid: false, 264 vectorize: true, 265 }, 266 { 267 name: "combination of valid and invalid words, invalid word first lowercased", 268 input: "carrotPotato", 269 valid: false, 270 vectorize: true, 271 }, 272 { 273 name: "combination of valid and invalid words, invalid word first uppercased", 274 input: "CarrotPotato", 275 valid: false, 276 vectorize: true, 277 }, 278 { 279 name: "combination of only stop words, first lowercased", 280 input: "theThe", 281 valid: false, 282 vectorize: true, 283 }, 284 { 285 name: "combination of only stop words, first uppercased", 286 input: "TheThe", 287 valid: false, 288 vectorize: true, 289 }, 290 291 // without vectorizing 292 { 293 name: "non-vectorizing: combination of only stop words, first uppercased", 294 input: "TheThe", 295 valid: true, 296 vectorize: false, 297 }, 298 { 299 name: "non-vectorizing: combination of only stop words, first uppercased", 300 input: "carrot", 301 valid: true, 302 vectorize: false, 303 }, 304 } 305 306 for _, test := range tests { 307 t.Run(test.name+" object class", func(t *testing.T) { 308 class := &models.Class{ 309 Class: "ValidName", 310 Properties: []*models.Property{{ 311 DataType: schema.DataTypeText.PropString(), 312 Tokenization: models.PropertyTokenizationWhitespace, 313 Name: test.input, 314 }}, 315 } 316 317 logger, _ := ltest.NewNullLogger() 318 v := NewConfigValidator(&fakeRemote{}, logger) 319 err := v.Do(context.Background(), class, nil, &fakeIndexChecker{ 320 vectorizePropertyName: test.vectorize, 321 propertyIndexed: true, 322 }) 323 assert.Equal(t, test.valid, err == nil) 324 }) 325 } 326 }) 327 328 t.Run("all usable props no-indexed", func(t *testing.T) { 329 t.Run("all schema vectorization turned off", func(t *testing.T) { 330 class := &models.Class{ 331 Vectorizer: "text2vec-contextionary", 332 Class: "ValidName", 333 Properties: []*models.Property{ 334 { 335 DataType: []string{"text"}, 336 Name: "description", 337 }, 338 { 339 DataType: schema.DataTypeText.PropString(), 340 Tokenization: models.PropertyTokenizationWhitespace, 341 Name: "name", 342 }, 343 { 344 DataType: []string{"int"}, 345 Name: "amount", 346 }, 347 }, 348 } 349 350 logger, _ := ltest.NewNullLogger() 351 v := NewConfigValidator(&fakeRemote{}, logger) 352 err := v.Do(context.Background(), class, nil, &fakeIndexChecker{ 353 vectorizePropertyName: false, 354 vectorizeClassName: false, 355 propertyIndexed: false, 356 }) 357 assert.NotNil(t, err) 358 }) 359 }) 360 361 t.Run("with only array types", func(t *testing.T) { 362 class := &models.Class{ 363 Vectorizer: "text2vec-contextionary", 364 Class: "ValidName", 365 Properties: []*models.Property{ 366 { 367 DataType: []string{"text[]"}, 368 Name: "descriptions", 369 }, 370 { 371 DataType: schema.DataTypeTextArray.PropString(), 372 Tokenization: models.PropertyTokenizationWhitespace, 373 Name: "names", 374 }, 375 }, 376 } 377 378 logger, _ := ltest.NewNullLogger() 379 v := NewConfigValidator(&fakeRemote{}, logger) 380 err := v.Do(context.Background(), class, nil, &fakeIndexChecker{ 381 vectorizePropertyName: false, 382 vectorizeClassName: false, 383 propertyIndexed: true, 384 }) 385 assert.Nil(t, err) 386 }) 387 } 388 389 func TestConfigValidator_RiskOfDuplicateVectors(t *testing.T) { 390 type test struct { 391 name string 392 in *models.Class 393 expectWarning bool 394 indexChecker *fakeIndexChecker 395 } 396 397 tests := []test{ 398 { 399 name: "usable properties", 400 in: &models.Class{ 401 Class: "ValidName", 402 Properties: []*models.Property{ 403 { 404 DataType: []string{string(schema.DataTypeText)}, 405 Name: "textProp", 406 }, 407 }, 408 }, 409 expectWarning: false, 410 indexChecker: &fakeIndexChecker{ 411 vectorizePropertyName: false, 412 vectorizeClassName: true, 413 propertyIndexed: true, 414 }, 415 }, 416 { 417 name: "no properties", 418 in: &models.Class{ 419 Class: "ValidName", 420 }, 421 expectWarning: true, 422 indexChecker: &fakeIndexChecker{ 423 vectorizePropertyName: false, 424 vectorizeClassName: true, 425 propertyIndexed: false, 426 }, 427 }, 428 { 429 name: "usable properties, but they are no-indexed", 430 in: &models.Class{ 431 Class: "ValidName", 432 Properties: []*models.Property{ 433 { 434 DataType: []string{string(schema.DataTypeText)}, 435 Name: "textProp", 436 }, 437 }, 438 }, 439 expectWarning: true, 440 indexChecker: &fakeIndexChecker{ 441 vectorizePropertyName: false, 442 vectorizeClassName: true, 443 propertyIndexed: false, 444 }, 445 }, 446 { 447 name: "only unusable properties", 448 in: &models.Class{ 449 Class: "ValidName", 450 Properties: []*models.Property{ 451 { 452 DataType: []string{string(schema.DataTypeInt)}, 453 Name: "intProp", 454 }, 455 }, 456 }, 457 expectWarning: true, 458 indexChecker: &fakeIndexChecker{ 459 vectorizePropertyName: false, 460 vectorizeClassName: true, 461 propertyIndexed: false, 462 }, 463 }, 464 } 465 466 for _, test := range tests { 467 t.Run(test.name, func(t *testing.T) { 468 logger, hook := ltest.NewNullLogger() 469 v := NewConfigValidator(&fakeRemote{}, logger) 470 err := v.Do(context.Background(), test.in, nil, test.indexChecker) 471 require.Nil(t, err) 472 473 entry := hook.LastEntry() 474 if test.expectWarning { 475 require.NotNil(t, entry) 476 assert.Equal(t, logrus.WarnLevel, entry.Level) 477 } else { 478 assert.Nil(t, entry) 479 } 480 }) 481 } 482 } 483 484 type fakeIndexChecker struct { 485 vectorizeClassName bool 486 vectorizePropertyName bool 487 propertyIndexed bool 488 } 489 490 func (f *fakeIndexChecker) VectorizeClassName() bool { 491 return f.vectorizeClassName 492 } 493 494 func (f *fakeIndexChecker) VectorizePropertyName(propName string) bool { 495 return f.vectorizePropertyName 496 } 497 498 func (f *fakeIndexChecker) PropertyIndexed(propName string) bool { 499 return f.propertyIndexed 500 } 501 502 // Every word in this fake c11y remote client is present except for the word 503 // Carrot which is not present 504 type fakeRemote struct{} 505 506 func (f *fakeRemote) IsWordPresent(ctx context.Context, word string) (bool, error) { 507 if word == "carrot" || word == "the" { 508 return false, nil 509 } 510 return true, nil 511 } 512 513 func (f *fakeRemote) IsStopWord(ctx context.Context, word string) (bool, error) { 514 return word == "the", nil 515 }