github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/vectorizer/inspector.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package vectorizer 13 14 import ( 15 "context" 16 "fmt" 17 "strings" 18 "unicode" 19 "unicode/utf8" 20 21 "github.com/weaviate/weaviate/entities/models" 22 txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models" 23 ) 24 25 type InspectorClient interface { 26 VectorForWord(ctx context.Context, word string) ([]float32, error) 27 VectorForCorpi(ctx context.Context, words []string, 28 overrides map[string]string) ([]float32, []txt2vecmodels.InterpretationSource, error) 29 NearestWordsByVector(ctx context.Context, vector []float32, n int, k int) ([]string, []float32, error) 30 IsWordPresent(ctx context.Context, word string) (bool, error) 31 } 32 33 type Inspector struct { 34 client InspectorClient 35 } 36 37 func NewInspector(client InspectorClient) *Inspector { 38 return &Inspector{client: client} 39 } 40 41 func (i *Inspector) GetWords(ctx context.Context, words string) (*models.C11yWordsResponse, error) { 42 wordArray, err := i.validateAndSplit(words) 43 if err != nil { 44 return nil, err 45 } 46 47 concatWord, err := i.concatWord(ctx, words, wordArray) 48 if err != nil { 49 return nil, err 50 } 51 52 individualWords, err := i.individualWords(ctx, wordArray) 53 if err != nil { 54 return nil, err 55 } 56 57 return &models.C11yWordsResponse{ 58 ConcatenatedWord: concatWord, 59 IndividualWords: individualWords, 60 }, nil 61 } 62 63 func (i *Inspector) validateAndSplit(words string) ([]string, error) { 64 // set first character to lowercase 65 wordChars := []rune(words) 66 wordChars[0] = unicode.ToLower(wordChars[0]) 67 words = 
string(wordChars) 68 69 for _, r := range words { 70 if !unicode.IsLetter(r) && !unicode.IsNumber(r) { 71 return nil, fmt.Errorf("invalid word input: words must only contain unicode letters and digits") 72 } 73 } 74 75 return split(words), nil 76 } 77 78 func (i *Inspector) concatWord(ctx context.Context, words string, 79 wordArray []string, 80 ) (*models.C11yWordsResponseConcatenatedWord, error) { 81 if len(wordArray) < 2 { 82 // only build a concat response if we have more than a single word 83 return nil, nil 84 } 85 86 // join the words into a single corpus. While the contextionary also supports 87 // building a centroid from multiple corpi (thus []string for Corpi, an 88 // occurrence-based weighing can only happen within a corpus. It is thus - by 89 // far - preferable in this case, to concat the words into one corpus, rather 90 // than treating each word as its own. 91 corpus := strings.Join(wordArray, " ") 92 vector, _, err := i.client.VectorForCorpi(ctx, []string{corpus}, nil) 93 if err != nil { 94 return nil, err 95 } 96 97 nearestNeighbors, err := i.nearestNeighbors(ctx, vector) 98 if err != nil { 99 return nil, err 100 } 101 102 return &models.C11yWordsResponseConcatenatedWord{ 103 ConcatenatedWord: words, 104 SingleWords: wordArray, 105 ConcatenatedVector: vector, 106 ConcatenatedNearestNeighbors: nearestNeighbors, 107 }, nil 108 } 109 110 func (i *Inspector) nearestNeighbors(ctx context.Context, 111 vector []float32, 112 ) ([]*models.C11yNearestNeighborsItems0, error) { 113 // relate words of centroid 114 words, dists, err := i.client.NearestWordsByVector(ctx, vector, 12, 32) 115 if err != nil { 116 return nil, err 117 } 118 119 nearestNeighbors := []*models.C11yNearestNeighborsItems0{} 120 121 // loop over NN Idx' and append to the return object 122 for i, word := range words { 123 item := models.C11yNearestNeighborsItems0{ 124 Word: word, 125 Distance: dists[i], 126 } 127 128 nearestNeighbors = append(nearestNeighbors, &item) 129 } 130 131 return 
nearestNeighbors, nil 132 } 133 134 func (i *Inspector) individualWords(ctx context.Context, 135 wordArray []string, 136 ) ([]*models.C11yWordsResponseIndividualWordsItems0, error) { 137 var res []*models.C11yWordsResponseIndividualWordsItems0 138 139 for _, word := range wordArray { 140 iw, err := i.individualWord(ctx, word) 141 if err != nil { 142 return nil, fmt.Errorf("word '%s': %v", word, err) 143 } 144 145 res = append(res, iw) 146 } 147 148 return res, nil 149 } 150 151 func (i *Inspector) individualWord(ctx context.Context, 152 word string, 153 ) (*models.C11yWordsResponseIndividualWordsItems0, error) { 154 ok, err := i.client.IsWordPresent(ctx, word) 155 if err != nil { 156 return nil, fmt.Errorf("could not check word presence: %v", err) 157 } 158 159 if !ok { 160 return i.individualWordNotPresent(word), nil 161 } 162 163 return i.individualWordPresent(ctx, word) 164 } 165 166 func (i *Inspector) individualWordNotPresent(word string) *models.C11yWordsResponseIndividualWordsItems0 { 167 return &models.C11yWordsResponseIndividualWordsItems0{ 168 Word: word, 169 Present: false, 170 } 171 } 172 173 func (i *Inspector) individualWordPresent(ctx context.Context, 174 word string, 175 ) (*models.C11yWordsResponseIndividualWordsItems0, error) { 176 info, err := i.individualWordInfo(ctx, word) 177 if err != nil { 178 return nil, err 179 } 180 181 return &models.C11yWordsResponseIndividualWordsItems0{ 182 Word: word, 183 Present: true, 184 Info: info, 185 }, nil 186 } 187 188 func (i *Inspector) individualWordInfo(ctx context.Context, 189 word string, 190 ) (*models.C11yWordsResponseIndividualWordsItems0Info, error) { 191 vector, err := i.client.VectorForWord(ctx, word) 192 if err != nil { 193 return nil, err 194 } 195 196 nns, err := i.nearestNeighbors(ctx, vector) 197 if err != nil { 198 return nil, err 199 } 200 201 return &models.C11yWordsResponseIndividualWordsItems0Info{ 202 Vector: vector, 203 NearestNeighbors: nns, 204 }, nil 205 } 206 207 // Splits a 
// split splits a CamelCase string into an array of lowercased parts, e.g.
// "PDFLoader" becomes ["pdf", "loader"] and "carPark" becomes ["car", "park"].
//
// Consecutive runes of the same class form one part. Lowercase letters and
// digits share a class (so "word2vec" stays one part), uppercase letters form
// a second class and every other rune a third. A string that is not valid
// UTF-8 is returned unchanged as the only element. For valid input the result
// is always non-nil, and empty for an empty string.
//
// Based on: https://github.com/fatih/camelcase
func split(src string) []string {
	// don't split invalid utf8
	if !utf8.ValidString(src) {
		return []string{src}
	}

	// Rune classes; only equality between them matters.
	const (
		classNone  = iota // no rune seen yet
		classLower        // lowercase letters and digits
		classUpper        // uppercase letters
		classOther        // everything else
	)

	// split into fields based on class of unicode character
	var runs [][]rune
	lastClass := classNone
	for _, r := range src {
		var class int
		switch {
		case unicode.IsLower(r), unicode.IsDigit(r):
			class = classLower
		case unicode.IsUpper(r):
			class = classUpper
		default:
			class = classOther
		}

		if class == lastClass {
			runs[len(runs)-1] = append(runs[len(runs)-1], r)
		} else {
			runs = append(runs, []rune{r})
		}
		lastClass = class
	}

	// handle upper case -> lower case sequences, e.g.
	// "PDFL", "oader" -> "PDF", "Loader"
	for i := 0; i < len(runs)-1; i++ {
		if unicode.IsUpper(runs[i][0]) && unicode.IsLower(runs[i+1][0]) {
			last := len(runs[i]) - 1
			runs[i+1] = append([]rune{runs[i][last]}, runs[i+1]...)
			runs[i] = runs[i][:last]
		}
	}

	// construct []string from results; a run emptied by the step above
	// (a single uppercase rune moved to its successor) is skipped
	entries := make([]string, 0, len(runs))
	for _, run := range runs {
		if len(run) > 0 {
			entries = append(entries, strings.ToLower(string(run)))
		}
	}

	return entries
}