github.com/weaviate/weaviate@v1.24.6/modules/text2vec-transformers/config.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package modtransformers 13 14 import ( 15 "context" 16 "fmt" 17 18 "github.com/pkg/errors" 19 "github.com/sirupsen/logrus" 20 "github.com/weaviate/weaviate/entities/models" 21 "github.com/weaviate/weaviate/entities/modulecapabilities" 22 "github.com/weaviate/weaviate/entities/moduletools" 23 "github.com/weaviate/weaviate/entities/schema" 24 "github.com/weaviate/weaviate/modules/text2vec-transformers/vectorizer" 25 ) 26 27 func (m *TransformersModule) ClassConfigDefaults() map[string]interface{} { 28 return map[string]interface{}{ 29 "vectorizeClassName": vectorizer.DefaultVectorizeClassName, 30 "poolingStrategy": vectorizer.DefaultPoolingStrategy, 31 } 32 } 33 34 func (m *TransformersModule) PropertyConfigDefaults( 35 dt *schema.DataType, 36 ) map[string]interface{} { 37 return map[string]interface{}{ 38 "skip": !vectorizer.DefaultPropertyIndexed, 39 "vectorizePropertyName": vectorizer.DefaultVectorizePropertyName, 40 } 41 } 42 43 func (m *TransformersModule) ValidateClass(ctx context.Context, 44 class *models.Class, cfg moduletools.ClassConfig, 45 ) error { 46 settings := vectorizer.NewClassSettings(cfg) 47 if err := settings.Validate(class); err != nil { 48 return err 49 } 50 return NewConfigValidator(m.logger).Do(ctx, class, cfg, settings) 51 } 52 53 var _ = modulecapabilities.ClassConfigurator(New()) 54 55 type ConfigValidator struct { 56 logger logrus.FieldLogger 57 } 58 59 type ClassSettings interface { 60 VectorizeClassName() bool 61 VectorizePropertyName(propName string) bool 62 PropertyIndexed(propName string) bool 63 } 64 65 func NewConfigValidator(logger logrus.FieldLogger) *ConfigValidator { 66 return &ConfigValidator{logger: logger} 67 } 68 69 func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class, 70 cfg moduletools.ClassConfig, settings ClassSettings, 71 ) error { 72 // In text2vec-transformers (as opposed to e.g. text2vec-contextionary) the 73 // assumption is that the models will be able to deal with any words, even 74 // previously unseen ones. Therefore we do not need to validate individual 75 // properties, but only the overall "index state" 76 77 if err := cv.validateIndexState(ctx, class, settings); err != nil { 78 return errors.Errorf("invalid combination of properties") 79 } 80 81 cv.checkForPossibilityOfDuplicateVectors(ctx, class, settings) 82 83 return nil 84 } 85 86 func (cv *ConfigValidator) validateIndexState(ctx context.Context, 87 class *models.Class, settings ClassSettings, 88 ) error { 89 if settings.VectorizeClassName() { 90 // if the user chooses to vectorize the classname, vector-building will 91 // always be possible, no need to investigate further 92 93 return nil 94 } 95 96 // search if there is at least one indexed, string/text prop. If found pass 97 // validation 98 for _, prop := range class.Properties { 99 if len(prop.DataType) < 1 { 100 return errors.Errorf("property %s must have at least one datatype: "+ 101 "got %v", prop.Name, prop.DataType) 102 } 103 104 if prop.DataType[0] != string(schema.DataTypeText) { 105 // we can only vectorize text-like props 106 continue 107 } 108 109 if settings.PropertyIndexed(prop.Name) { 110 // found at least one, this is a valid schema 111 return nil 112 } 113 } 114 115 return fmt.Errorf("invalid properties: didn't find a single property which is " + 116 "of type string or text and is not excluded from indexing. In addition the " + 117 "class name is excluded from vectorization as well, meaning that it cannot be " + 118 "used to determine the vector position. To fix this, set 'vectorizeClassName' " + 119 "to true if the class name is contextionary-valid. Alternatively add at least " + 120 "contextionary-valid text/string property which is not excluded from " + 121 "indexing.") 122 } 123 124 func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors( 125 ctx context.Context, class *models.Class, settings ClassSettings, 126 ) { 127 if !settings.VectorizeClassName() { 128 // if the user choses not to vectorize the class name, this means they must 129 // have chosen something else to vectorize, otherwise the validation would 130 // have error'd before we ever got here. We can skip further checking. 131 132 return 133 } 134 135 // search if there is at least one indexed, string/text prop. If found exit 136 for _, prop := range class.Properties { 137 // length check skipped, because validation has already passed 138 if prop.DataType[0] != string(schema.DataTypeText) { 139 // we can only vectorize text-like props 140 continue 141 } 142 143 if settings.PropertyIndexed(prop.Name) { 144 // found at least one 145 return 146 } 147 } 148 149 cv.logger.WithField("module", "text2vec-transformers"). 150 WithField("class", class.Class). 151 Warnf("text2vec-contextionary: Class %q does not have any properties "+ 152 "indexed (or only non text-properties indexed) and the vector position is "+ 153 "only determined by the class name. Each object will end up with the same "+ 154 "vector which leads to a severe performance penalty on imports. Consider "+ 155 "setting vectorIndexConfig.skip=true for this property", class.Class) 156 }