github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/vectorizer/schema_config.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package vectorizer 13 14 import ( 15 "context" 16 "fmt" 17 "strings" 18 19 "github.com/fatih/camelcase" 20 "github.com/pkg/errors" 21 "github.com/sirupsen/logrus" 22 "github.com/weaviate/weaviate/entities/models" 23 "github.com/weaviate/weaviate/entities/moduletools" 24 "github.com/weaviate/weaviate/entities/schema" 25 ) 26 27 type ConfigValidator struct { 28 remote RemoteClient 29 logger logrus.FieldLogger 30 } 31 32 type IndexChecker interface { 33 VectorizeClassName() bool 34 VectorizePropertyName(propName string) bool 35 PropertyIndexed(propName string) bool 36 } 37 38 type RemoteClient interface { 39 IsStopWord(ctx context.Context, word string) (bool, error) 40 IsWordPresent(ctx context.Context, word string) (bool, error) 41 } 42 43 func NewConfigValidator(rc RemoteClient, 44 logger logrus.FieldLogger, 45 ) *ConfigValidator { 46 return &ConfigValidator{remote: rc, logger: logger} 47 } 48 49 func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class, 50 cfg moduletools.ClassConfig, icheck IndexChecker, 51 ) error { 52 err := cv.validateClassName(ctx, class.Class, icheck.VectorizeClassName()) 53 if err != nil { 54 return fmt.Errorf("invalid class name: %w", err) 55 } 56 57 for _, prop := range class.Properties { 58 if !icheck.PropertyIndexed(prop.Name) { 59 continue 60 } 61 62 err = cv.validatePropertyName(ctx, prop.Name, 63 icheck.VectorizePropertyName(prop.Name)) 64 if err != nil { 65 return errors.Wrapf(err, "class %q: invalid property name", class.Class) 66 } 67 } 68 69 if err := cv.validateIndexState(ctx, class, icheck); err != nil { 70 return errors.Wrap(err, "invalid combination of properties") 71 } 72 73 cv.checkForPossibilityOfDuplicateVectors(ctx, class, icheck) 74 75 return nil 76 } 77 78 func (cv *ConfigValidator) validateClassName(ctx context.Context, className string, 79 vectorizeClass bool, 80 ) error { 81 // class name 82 if !vectorizeClass { 83 // if the user chooses not to vectorize the class, we don't need to check 84 // if its c11y-valid or not 85 return nil 86 } 87 88 camelParts := camelcase.Split(className) 89 stopWordsFound := 0 90 for _, part := range camelParts { 91 word := strings.ToLower(part) 92 sw, err := cv.remote.IsStopWord(ctx, word) 93 if err != nil { 94 return fmt.Errorf("check stopword: %v", err) 95 } 96 97 if sw { 98 stopWordsFound++ 99 continue 100 } 101 102 present, err := cv.remote.IsWordPresent(ctx, word) 103 if err != nil { 104 return fmt.Errorf("check word presence: %v", err) 105 } 106 107 if !present { 108 return fmt.Errorf("could not find the word '%s' from the class name '%s' "+ 109 "in the contextionary", word, className) 110 } 111 } 112 113 if len(camelParts) == stopWordsFound { 114 return fmt.Errorf("className '%s' consists of only stopwords and is therefore "+ 115 "not a contextionary-valid class name, make sure at least one word in the "+ 116 "classname is not a stop word", className) 117 } 118 119 return nil 120 } 121 122 func (cv *ConfigValidator) validatePropertyName(ctx context.Context, 123 propertyName string, vectorize bool, 124 ) error { 125 if !vectorize { 126 // user does not want to vectorize this property name, so we don't have to 127 // validate it 128 return nil 129 } 130 131 camelParts := camelcase.Split(propertyName) 132 stopWordsFound := 0 133 for _, part := range camelParts { 134 word := strings.ToLower(part) 135 sw, err := cv.remote.IsStopWord(ctx, word) 136 if err != nil { 137 return fmt.Errorf("check stopword: %v", err) 138 } 139 140 if sw { 141 stopWordsFound++ 142 continue 143 } 144 145 present, err := cv.remote.IsWordPresent(ctx, word) 146 if err != nil { 147 return fmt.Errorf("check word presence: %v", err) 148 } 149 150 if !present { 151 return fmt.Errorf("could not find word '%s' of the property '%s' in the "+ 152 "contextionary", word, propertyName) 153 } 154 } 155 156 if len(camelParts) == stopWordsFound { 157 return fmt.Errorf("the propertyName '%s' consists of only stopwords and is "+ 158 "therefore not a contextionary-valid property name, make sure at least one word "+ 159 "in the property name is not a stop word", propertyName) 160 } 161 162 return nil 163 } 164 165 func (cv *ConfigValidator) validateIndexState(ctx context.Context, 166 class *models.Class, icheck IndexChecker, 167 ) error { 168 if icheck.VectorizeClassName() { 169 // if the user chooses to vectorize the classname, vector-building will 170 // always be possible, no need to investigate further 171 172 return nil 173 } 174 175 // search if there is at least one indexed, string/text or string/text[] 176 // prop. If found pass validation 177 for _, prop := range class.Properties { 178 if len(prop.DataType) < 1 { 179 return errors.Errorf("property %s must have at least one datatype: "+ 180 "got %v", prop.Name, prop.DataType) 181 } 182 183 if prop.DataType[0] != string(schema.DataTypeText) && 184 prop.DataType[0] != string(schema.DataTypeTextArray) { 185 // we can only vectorize text-like props 186 continue 187 } 188 189 if icheck.PropertyIndexed(prop.Name) { 190 // found at least one, this is a valid schema 191 return nil 192 } 193 } 194 195 return fmt.Errorf("invalid properties: didn't find a single property which is " + 196 "of type string or text and is not excluded from indexing. In addition the " + 197 "class name is excluded from vectorization as well, meaning that it cannot be " + 198 "used to determine the vector position. To fix this, set 'vectorizeClassName' " + 199 "to true if the class name is contextionary-valid. Alternatively add at least " + 200 "contextionary-valid text/string property which is not excluded from " + 201 "indexing.") 202 } 203 204 func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors( 205 ctx context.Context, class *models.Class, icheck IndexChecker, 206 ) { 207 if !icheck.VectorizeClassName() { 208 // if the user choses not to vectorize the class name, this means they must 209 // have chosen something else to vectorize, otherwise the validation would 210 // have error'd before we ever got here. We can skip further checking. 211 212 return 213 } 214 215 // search if there is at least one indexed, string/text prop. If found exit 216 for _, prop := range class.Properties { 217 // length check skipped, because validation has already passed 218 if prop.DataType[0] != string(schema.DataTypeText) { 219 // we can only vectorize text-like props 220 continue 221 } 222 223 if icheck.PropertyIndexed(prop.Name) { 224 // found at least one 225 return 226 } 227 } 228 229 cv.logger.WithField("module", "text2vec-contextionary"). 230 WithField("class", class.Class). 231 Warnf("text2vec-contextionary: Class %q does not have any properties "+ 232 "indexed (or only non text-properties indexed) and the vector position is "+ 233 "only determined by the class name. Each object will end up with the same "+ 234 "vector which leads to a severe performance penalty on imports. Consider "+ 235 "setting vectorIndexConfig.skip=true for this property", class.Class) 236 }