github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/helpers/tokenizer.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package helpers 13 14 import ( 15 "os" 16 "strings" 17 "sync" 18 "unicode" 19 20 "github.com/go-ego/gse" 21 "github.com/weaviate/weaviate/entities/models" 22 ) 23 24 var ( 25 gseTokenizer *gse.Segmenter 26 gseTokenizerLock = &sync.Mutex{} 27 UseGse = false 28 ) 29 30 var Tokenizations []string = []string{ 31 models.PropertyTokenizationWord, 32 models.PropertyTokenizationLowercase, 33 models.PropertyTokenizationWhitespace, 34 models.PropertyTokenizationField, 35 models.PropertyTokenizationTrigram, 36 models.PropertyTokenizationGse, 37 } 38 39 func init() { 40 init_gse() 41 } 42 43 func init_gse() { 44 if os.Getenv("USE_GSE") == "true" { 45 UseGse = true 46 } 47 if UseGse { 48 gseTokenizerLock.Lock() 49 defer gseTokenizerLock.Unlock() 50 if gseTokenizer == nil { 51 seg, err := gse.New("ja") 52 if err != nil { 53 return //[]string{} 54 } 55 gseTokenizer = &seg 56 } 57 } 58 } 59 60 func Tokenize(tokenization string, in string) []string { 61 switch tokenization { 62 case models.PropertyTokenizationWord: 63 return tokenizeWord(in) 64 case models.PropertyTokenizationLowercase: 65 return tokenizeLowercase(in) 66 case models.PropertyTokenizationWhitespace: 67 return tokenizeWhitespace(in) 68 case models.PropertyTokenizationField: 69 return tokenizeField(in) 70 case models.PropertyTokenizationTrigram: 71 return tokenizetrigram(in) 72 case models.PropertyTokenizationGse: 73 return tokenizeGSE(in) 74 default: 75 return []string{} 76 } 77 } 78 79 func TokenizeWithWildcards(tokenization string, in string) []string { 80 switch tokenization { 81 case models.PropertyTokenizationWord: 82 return tokenizeWordWithWildcards(in) 83 case models.PropertyTokenizationLowercase: 84 return tokenizeLowercase(in) 85 case models.PropertyTokenizationWhitespace: 86 return tokenizeWhitespace(in) 87 case models.PropertyTokenizationField: 88 return tokenizeField(in) 89 case models.PropertyTokenizationTrigram: 90 return tokenizetrigramWithWildcards(in) 91 case models.PropertyTokenizationGse: 92 return tokenizeGSE(in) 93 default: 94 return []string{} 95 } 96 } 97 98 // tokenizeField trims white spaces 99 // (former DataTypeString/Field) 100 func tokenizeField(in string) []string { 101 return []string{strings.TrimFunc(in, unicode.IsSpace)} 102 } 103 104 // tokenizeWhitespace splits on white spaces, does not alter casing 105 // (former DataTypeString/Word) 106 func tokenizeWhitespace(in string) []string { 107 return strings.FieldsFunc(in, unicode.IsSpace) 108 } 109 110 // tokenizeLowercase splits on white spaces and lowercases the words 111 func tokenizeLowercase(in string) []string { 112 terms := tokenizeWhitespace(in) 113 return lowercase(terms) 114 } 115 116 // tokenizeWord splits on any non-alphanumerical and lowercases the words 117 // (former DataTypeText/Word) 118 func tokenizeWord(in string) []string { 119 terms := strings.FieldsFunc(in, func(r rune) bool { 120 return !unicode.IsLetter(r) && !unicode.IsNumber(r) 121 }) 122 return lowercase(terms) 123 } 124 125 // tokenizetrigram splits on any non-alphanumerical and lowercases the words, joins them together, then groups them into trigrams 126 func tokenizetrigram(in string) []string { 127 // Strip whitespace and punctuation from the input string 128 inputString := strings.ToLower(strings.Join(strings.FieldsFunc(in, func(r rune) bool { 129 return !unicode.IsLetter(r) && !unicode.IsNumber(r) 130 }), "")) 131 runes := []rune(inputString) 132 var trirunes [][]rune 133 for i := 0; i < len(runes)-2; i++ { 134 trirunes = append(trirunes, runes[i:i+3]) 135 } 136 137 var trigrams []string 138 for _, trirune := range trirunes { 139 trigrams = append(trigrams, string(trirune)) 140 } 141 return trigrams 142 } 143 144 // tokenizeGSE uses the gse tokenizer to tokenise Chinese and Japanese 145 func tokenizeGSE(in string) []string { 146 if !UseGse { 147 return []string{} 148 } 149 gseTokenizerLock.Lock() 150 defer gseTokenizerLock.Unlock() 151 terms := gseTokenizer.CutAll(in) 152 153 // Remove empty strings from terms 154 for i := 0; i < len(terms); i++ { 155 if terms[i] == "" || terms[i] == " " { 156 terms = append(terms[:i], terms[i+1:]...) 157 i-- 158 } 159 } 160 161 alpha := tokenizeWord(in) 162 return append(terms, alpha...) 163 } 164 165 // tokenizeWordWithWildcards splits on any non-alphanumerical except wildcard-symbols and 166 // lowercases the words 167 func tokenizeWordWithWildcards(in string) []string { 168 terms := strings.FieldsFunc(in, func(r rune) bool { 169 return !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '?' && r != '*' 170 }) 171 return lowercase(terms) 172 } 173 174 // tokenizetrigramWithWildcards splits on any non-alphanumerical and lowercases the words, applies any wildcards, then joins them together, then groups them into trigrams 175 // this is unlikely to be useful, but is included for completeness 176 func tokenizetrigramWithWildcards(in string) []string { 177 terms := tokenizeWordWithWildcards(in) 178 inputString := strings.Join(terms, "") 179 var trigrams []string 180 for i := 0; i < len(inputString)-2; i++ { 181 trigrams = append(trigrams, inputString[i:i+3]) 182 } 183 return trigrams 184 } 185 186 func lowercase(terms []string) []string { 187 for i := range terms { 188 terms[i] = strings.ToLower(terms[i]) 189 } 190 return terms 191 } 192 193 func TokenizeAndCountDuplicates(tokenization string, in string) ([]string, []int) { 194 counts := map[string]int{} 195 for _, term := range Tokenize(tokenization, in) { 196 counts[term]++ 197 } 198 199 unique := make([]string, len(counts)) 200 boosts := make([]int, len(counts)) 201 202 i := 0 203 for term, boost := range counts { 204 unique[i] = term 205 boosts[i] = boost 206 i++ 207 } 208 209 return unique, boosts 210 }