github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/vectorizer/vectorizer.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package vectorizer 13 14 // TODO: This entire package should be part of the text2vec-contextionary 15 // module, if methods/objects in here are used from non-modular code, they 16 // probably shouldn't be in here 17 18 import ( 19 "context" 20 "fmt" 21 "strings" 22 23 "github.com/fatih/camelcase" 24 "github.com/weaviate/weaviate/entities/models" 25 "github.com/weaviate/weaviate/entities/moduletools" 26 txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models" 27 objectsvectorizer "github.com/weaviate/weaviate/usecases/modulecomponents/vectorizer" 28 ) 29 30 // Vectorizer turns objects into vectors 31 type Vectorizer struct { 32 client client 33 objectVectorizer *objectsvectorizer.ObjectVectorizer 34 } 35 36 type ErrNoUsableWords struct { 37 Err error 38 } 39 40 func (e ErrNoUsableWords) Error() string { 41 return e.Err.Error() 42 } 43 44 func NewErrNoUsableWordsf(pattern string, args ...interface{}) ErrNoUsableWords { 45 return ErrNoUsableWords{Err: fmt.Errorf(pattern, args...)} 46 } 47 48 type client interface { 49 VectorForCorpi(ctx context.Context, corpi []string, 50 overrides map[string]string) ([]float32, []txt2vecmodels.InterpretationSource, error) 51 } 52 53 // IndexCheck returns whether a property of a class should be indexed 54 type ClassIndexCheck interface { 55 PropertyIndexed(property string) bool 56 VectorizeClassName() bool 57 VectorizePropertyName(propertyName string) bool 58 } 59 60 // New from c11y client 61 func New(client client) *Vectorizer { 62 return &Vectorizer{ 63 client: client, 64 objectVectorizer: objectsvectorizer.New(), 65 } 66 } 67 68 func (v *Vectorizer) Texts(ctx context.Context, inputs []string, 69 cfg moduletools.ClassConfig, 70 ) ([]float32, error) { 71 return v.Corpi(ctx, inputs) 72 } 73 74 // Object object to vector 75 func (v *Vectorizer) Object(ctx context.Context, object *models.Object, 76 comp moduletools.VectorizablePropsComparator, cfg moduletools.ClassConfig, 77 ) ([]float32, models.AdditionalProperties, error) { 78 var overrides map[string]string 79 if object.VectorWeights != nil { 80 overrides = object.VectorWeights.(map[string]string) 81 } 82 83 vec, sources, err := v.object(ctx, object.Class, comp, overrides, cfg) 84 if err != nil { 85 return nil, nil, err 86 } 87 88 additional := models.AdditionalProperties{} 89 additional["interpretation"] = &txt2vecmodels.Interpretation{ 90 Source: sourceFromInputElements(sources), 91 } 92 93 return vec, additional, nil 94 } 95 96 func (v *Vectorizer) object(ctx context.Context, className string, 97 comp moduletools.VectorizablePropsComparator, overrides map[string]string, 98 cfg moduletools.ClassConfig, 99 ) ([]float32, []txt2vecmodels.InterpretationSource, error) { 100 icheck := NewIndexChecker(cfg) 101 corpi, vector := v.objectVectorizer.TextsOrVector(ctx, className, comp, icheck, cfg.TargetVector()) 102 // no property was changed, old vector can be used 103 if vector != nil { 104 // dont' re-vectorize 105 return vector, []txt2vecmodels.InterpretationSource{}, nil 106 } 107 // vectorize text 108 vector, ie, err := v.client.VectorForCorpi(ctx, []string{corpi}, overrides) 109 if err != nil { 110 switch err.(type) { 111 case ErrNoUsableWords: 112 return nil, nil, fmt.Errorf("The object is invalid, as weaviate could not extract "+ 113 "any contextionary-valid words from it. This is the case when you have "+ 114 "set the options 'vectorizeClassName: false' and 'vectorizePropertyName: false' in this class' schema definition "+ 115 "and not a single property's value "+ 116 "contains at least one contextionary-valid word. To fix this, you have several "+ 117 "options:\n\n1.) Make sure that the schema class name or the set properties are "+ 118 "a contextionary-valid term and include them in vectorization using the "+ 119 "'vectorizeClassName' or 'vectorizePropertyName' setting. In this case the vector position "+ 120 "will be composed of both the class/property names and the values for those fields. "+ 121 "Even if no property values are contextionary-valid, the overall word corpus is still valid "+ 122 "due to the contextionary-valid class/property names."+ 123 "\n\n2.) Alternatively, if you do not want to include schema class/property names "+ 124 "in vectorization, you must make sure that at least one text/string property contains "+ 125 "at least one contextionary-valid word."+ 126 "\n\n3.) If the word corpus weaviate extracted from your object "+ 127 "(see below) does contain enough meaning to build a vector position, but the contextionary "+ 128 "did not recognize the words, you can extend the contextionary using the "+ 129 "REST API. This is the case when you use mostly industry-specific terms which are "+ 130 "not known to the common language contextionary. Once extended, simply reimport this object."+ 131 "\n\nThe following words were extracted from your object: %v"+ 132 "\n\nTo learn more about the contextionary and how it behaves, check out: https://www.semi.technology/documentation/weaviate/current/contextionary.html"+ 133 "\n\nOriginal error: %v", corpi, err) 134 default: 135 return nil, nil, fmt.Errorf("vectorizing object with corpus '%+v': %v", corpi, err) 136 } 137 } 138 139 return vector, ie, nil 140 } 141 142 // Corpi takes any list of strings and builds a common vector for all of them 143 func (v *Vectorizer) Corpi(ctx context.Context, corpi []string, 144 ) ([]float32, error) { 145 for i, corpus := range corpi { 146 corpi[i] = camelCaseToLower(corpus) 147 } 148 149 vector, _, err := v.client.VectorForCorpi(ctx, corpi, nil) 150 if err != nil { 151 return nil, fmt.Errorf("vectorizing corpus '%+v': %v", corpi, err) 152 } 153 154 return vector, nil 155 } 156 157 func camelCaseToLower(in string) string { 158 parts := camelcase.Split(in) 159 var sb strings.Builder 160 for i, part := range parts { 161 if part == " " { 162 continue 163 } 164 165 if i > 0 { 166 sb.WriteString(" ") 167 } 168 169 sb.WriteString(strings.ToLower(part)) 170 } 171 172 return sb.String() 173 } 174 175 func sourceFromInputElements(in []txt2vecmodels.InterpretationSource) []*txt2vecmodels.InterpretationSource { 176 out := make([]*txt2vecmodels.InterpretationSource, len(in)) 177 for i, elem := range in { 178 out[i] = &txt2vecmodels.InterpretationSource{ 179 Concept: elem.Concept, 180 Occurrence: elem.Occurrence, 181 Weight: float64(elem.Weight), 182 } 183 } 184 185 return out 186 }