github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/analyzer.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package inverted 13 14 import ( 15 "bytes" 16 "encoding/binary" 17 18 "github.com/google/uuid" 19 "github.com/weaviate/weaviate/adapters/repos/db/helpers" 20 "github.com/weaviate/weaviate/entities/models" 21 ) 22 23 type IsFallbackToSearchable func() bool 24 25 type Countable struct { 26 Data []byte 27 TermFrequency float32 28 } 29 30 type Property struct { 31 Name string 32 Items []Countable 33 Length int 34 HasFilterableIndex bool // roaring set index 35 HasSearchableIndex bool // map index (with frequencies) 36 } 37 38 type NilProperty struct { 39 Name string 40 AddToPropertyLength bool 41 } 42 43 func DedupItems(props []Property) []Property { 44 for i := range props { 45 seen := map[string]struct{}{} 46 items := props[i].Items 47 48 var key string 49 // reverse order to keep latest elements 50 for j := len(items) - 1; j >= 0; j-- { 51 key = string(items[j].Data) 52 if _, ok := seen[key]; ok { 53 // remove element already seen 54 items = append(items[:j], items[j+1:]...) 55 } 56 seen[key] = struct{}{} 57 } 58 props[i].Items = items 59 } 60 return props 61 } 62 63 type Analyzer struct { 64 isFallbackToSearchable IsFallbackToSearchable 65 } 66 67 // Text tokenizes given input according to selected tokenization, 68 // then aggregates duplicates 69 func (a *Analyzer) Text(tokenization, in string) []Countable { 70 return a.TextArray(tokenization, []string{in}) 71 } 72 73 // TextArray tokenizes given input according to selected tokenization, 74 // then aggregates duplicates 75 func (a *Analyzer) TextArray(tokenization string, inArr []string) []Countable { 76 var terms []string 77 for _, in := range inArr { 78 terms = append(terms, helpers.Tokenize(tokenization, in)...) 79 } 80 81 counts := map[string]uint64{} 82 for _, term := range terms { 83 counts[term]++ 84 } 85 86 countable := make([]Countable, len(counts)) 87 i := 0 88 for term, count := range counts { 89 countable[i] = Countable{ 90 Data: []byte(term), 91 TermFrequency: float32(count), 92 } 93 i++ 94 } 95 return countable 96 } 97 98 // Int requires no analysis, so it's actually just a simple conversion to a 99 // string-formatted byte slice of the int 100 func (a *Analyzer) Int(in int64) ([]Countable, error) { 101 data, err := LexicographicallySortableInt64(in) 102 if err != nil { 103 return nil, err 104 } 105 106 return []Countable{ 107 { 108 Data: data, 109 }, 110 }, nil 111 } 112 113 // UUID requires no analysis, so it's just dumping the raw binary representation 114 func (a *Analyzer) UUID(in uuid.UUID) ([]Countable, error) { 115 return []Countable{ 116 { 117 Data: in[:], 118 }, 119 }, nil 120 } 121 122 // UUID array requires no analysis, so it's just dumping the raw binary 123 // representation of each contained element 124 func (a *Analyzer) UUIDArray(in []uuid.UUID) ([]Countable, error) { 125 out := make([]Countable, len(in)) 126 for i := range in { 127 out[i] = Countable{ 128 Data: in[i][:], 129 } 130 } 131 132 return out, nil 133 } 134 135 // Int array requires no analysis, so it's actually just a simple conversion to a 136 // string-formatted byte slice of the int 137 func (a *Analyzer) IntArray(in []int64) ([]Countable, error) { 138 out := make([]Countable, len(in)) 139 for i := range in { 140 data, err := LexicographicallySortableInt64(in[i]) 141 if err != nil { 142 return nil, err 143 } 144 out[i] = Countable{Data: data} 145 } 146 147 return out, nil 148 } 149 150 // Float requires no analysis, so it's actually just a simple conversion to a 151 // lexicographically sortable byte slice. 152 func (a *Analyzer) Float(in float64) ([]Countable, error) { 153 data, err := LexicographicallySortableFloat64(in) 154 if err != nil { 155 return nil, err 156 } 157 158 return []Countable{ 159 { 160 Data: data, 161 }, 162 }, nil 163 } 164 165 // Float array requires no analysis, so it's actually just a simple conversion to a 166 // lexicographically sortable byte slice. 167 func (a *Analyzer) FloatArray(in []float64) ([]Countable, error) { 168 out := make([]Countable, len(in)) 169 for i := range in { 170 data, err := LexicographicallySortableFloat64(in[i]) 171 if err != nil { 172 return nil, err 173 } 174 out[i] = Countable{Data: data} 175 } 176 177 return out, nil 178 } 179 180 // BoolArray requires no analysis, so it's actually just a simple conversion to a 181 // little-endian ordered byte slice 182 func (a *Analyzer) BoolArray(in []bool) ([]Countable, error) { 183 out := make([]Countable, len(in)) 184 for i := range in { 185 b := bytes.NewBuffer(nil) 186 err := binary.Write(b, binary.LittleEndian, &in[i]) 187 if err != nil { 188 return nil, err 189 } 190 out[i] = Countable{Data: b.Bytes()} 191 } 192 193 return out, nil 194 } 195 196 // Bool requires no analysis, so it's actually just a simple conversion to a 197 // little-endian ordered byte slice 198 func (a *Analyzer) Bool(in bool) ([]Countable, error) { 199 b := bytes.NewBuffer(nil) 200 err := binary.Write(b, binary.LittleEndian, &in) 201 if err != nil { 202 return nil, err 203 } 204 205 return []Countable{ 206 { 207 Data: b.Bytes(), 208 }, 209 }, nil 210 } 211 212 // RefCount does not index the content of the refs, but only the count with 0 213 // being an explicitly allowed value as well. 214 func (a *Analyzer) RefCount(in models.MultipleRef) ([]Countable, error) { 215 length := uint64(len(in)) 216 data, err := LexicographicallySortableUint64(length) 217 if err != nil { 218 return nil, err 219 } 220 221 return []Countable{ 222 { 223 Data: data, 224 }, 225 }, nil 226 } 227 228 // Ref indexes references as beacon-strings 229 func (a *Analyzer) Ref(in models.MultipleRef) ([]Countable, error) { 230 out := make([]Countable, len(in)) 231 232 for i, ref := range in { 233 out[i] = Countable{ 234 Data: []byte(ref.Beacon), 235 } 236 } 237 238 return out, nil 239 } 240 241 func NewAnalyzer(isFallbackToSearchable IsFallbackToSearchable) *Analyzer { 242 if isFallbackToSearchable == nil { 243 isFallbackToSearchable = func() bool { return false } 244 } 245 return &Analyzer{isFallbackToSearchable: isFallbackToSearchable} 246 }