github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/row_reader_frequency.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package inverted 13 14 import ( 15 "bytes" 16 "context" 17 "encoding/binary" 18 "fmt" 19 20 "github.com/weaviate/sroar" 21 "github.com/weaviate/weaviate/adapters/repos/db/lsmkv" 22 "github.com/weaviate/weaviate/adapters/repos/db/roaringset" 23 "github.com/weaviate/weaviate/entities/filters" 24 ) 25 26 // RowReaderFrequency reads one or many row(s) depending on the specified operator 27 type RowReaderFrequency struct { 28 value []byte 29 bucket *lsmkv.Bucket 30 operator filters.Operator 31 keyOnly bool 32 shardVersion uint16 33 bitmapFactory *roaringset.BitmapFactory 34 } 35 36 func NewRowReaderFrequency(bucket *lsmkv.Bucket, value []byte, 37 operator filters.Operator, keyOnly bool, shardVersion uint16, 38 bitmapFactory *roaringset.BitmapFactory, 39 ) *RowReaderFrequency { 40 return &RowReaderFrequency{ 41 bucket: bucket, 42 value: value, 43 operator: operator, 44 keyOnly: keyOnly, 45 shardVersion: shardVersion, 46 bitmapFactory: bitmapFactory, 47 } 48 } 49 50 func (rr *RowReaderFrequency) Read(ctx context.Context, readFn ReadFn) error { 51 switch rr.operator { 52 case filters.OperatorEqual: 53 return rr.equal(ctx, readFn) 54 case filters.OperatorNotEqual: 55 return rr.notEqual(ctx, readFn) 56 case filters.OperatorGreaterThan: 57 return rr.greaterThan(ctx, readFn, false) 58 case filters.OperatorGreaterThanEqual: 59 return rr.greaterThan(ctx, readFn, true) 60 case filters.OperatorLessThan: 61 return rr.lessThan(ctx, readFn, false) 62 case filters.OperatorLessThanEqual: 63 return rr.lessThan(ctx, readFn, true) 64 case filters.OperatorLike: 65 return rr.like(ctx, readFn) 66 default: 67 return fmt.Errorf("operator %v supported", rr.operator) 68 } 69 } 70 71 // equal is a special case, as we don't need to iterate, but just read a single 72 // row 73 func (rr *RowReaderFrequency) equal(ctx context.Context, readFn ReadFn) error { 74 v, err := rr.equalHelper(ctx) 75 if err != nil { 76 return err 77 } 78 79 _, err = readFn(rr.value, rr.transformToBitmap(v)) 80 return err 81 } 82 83 func (rr *RowReaderFrequency) notEqual(ctx context.Context, readFn ReadFn) error { 84 v, err := rr.equalHelper(ctx) 85 if err != nil { 86 return err 87 } 88 89 // Invert the Equal results for an efficient NotEqual 90 inverted := rr.bitmapFactory.GetBitmap() 91 inverted.AndNot(rr.transformToBitmap(v)) 92 _, err = readFn(rr.value, inverted) 93 return err 94 } 95 96 // greaterThan reads from the specified value to the end. The first row is only 97 // included if allowEqual==true, otherwise it starts with the next one 98 func (rr *RowReaderFrequency) greaterThan(ctx context.Context, readFn ReadFn, 99 allowEqual bool, 100 ) error { 101 c := rr.newCursor() 102 defer c.Close() 103 104 for k, v := c.Seek(rr.value); k != nil; k, v = c.Next() { 105 if err := ctx.Err(); err != nil { 106 return err 107 } 108 109 if bytes.Equal(k, rr.value) && !allowEqual { 110 continue 111 } 112 113 continueReading, err := readFn(k, rr.transformToBitmap(v)) 114 if err != nil { 115 return err 116 } 117 118 if !continueReading { 119 break 120 } 121 } 122 123 return nil 124 } 125 126 // lessThan reads from the very begging to the specified value. The last 127 // matching row is only included if allowEqual==true, otherwise it ends one 128 // prior to that. 129 func (rr *RowReaderFrequency) lessThan(ctx context.Context, readFn ReadFn, 130 allowEqual bool, 131 ) error { 132 c := rr.newCursor() 133 defer c.Close() 134 135 for k, v := c.First(); k != nil && bytes.Compare(k, rr.value) != 1; k, v = c.Next() { 136 if err := ctx.Err(); err != nil { 137 return err 138 } 139 140 if bytes.Equal(k, rr.value) && !allowEqual { 141 continue 142 } 143 144 continueReading, err := readFn(k, rr.transformToBitmap(v)) 145 if err != nil { 146 return err 147 } 148 149 if !continueReading { 150 break 151 } 152 } 153 154 return nil 155 } 156 157 func (rr *RowReaderFrequency) like(ctx context.Context, readFn ReadFn) error { 158 like, err := parseLikeRegexp(rr.value) 159 if err != nil { 160 return fmt.Errorf("parse like value: %w", err) 161 } 162 163 // TODO: don't we need to check here if this is a doc id vs a object search? 164 // Or is this not a problem because the latter removes duplicates anyway? 165 c := rr.newCursor(lsmkv.MapListAcceptDuplicates()) 166 defer c.Close() 167 168 var ( 169 initialK []byte 170 initialV []lsmkv.MapPair 171 ) 172 173 if like.optimizable { 174 initialK, initialV = c.Seek(like.min) 175 } else { 176 initialK, initialV = c.First() 177 } 178 179 for k, v := initialK, initialV; k != nil; k, v = c.Next() { 180 if err := ctx.Err(); err != nil { 181 return err 182 } 183 184 if like.optimizable { 185 // if the query is optimizable, i.e. it doesn't start with a wildcard, we 186 // can abort once we've moved past the point where the fixed characters 187 // no longer match 188 if len(k) < len(like.min) { 189 break 190 } 191 192 if bytes.Compare(like.min, k[:len(like.min)]) == -1 { 193 break 194 } 195 } 196 197 if !like.regexp.Match(k) { 198 continue 199 } 200 201 continueReading, err := readFn(k, rr.transformToBitmap(v)) 202 if err != nil { 203 return err 204 } 205 206 if !continueReading { 207 break 208 } 209 } 210 211 return nil 212 } 213 214 // newCursor will either return a regular cursor - or a key-only cursor if 215 // keyOnly==true 216 func (rr *RowReaderFrequency) newCursor( 217 opts ...lsmkv.MapListOption, 218 ) *lsmkv.CursorMap { 219 if rr.shardVersion < 2 { 220 opts = append(opts, lsmkv.MapListLegacySortingRequired()) 221 } 222 223 if rr.keyOnly { 224 return rr.bucket.MapCursorKeyOnly(opts...) 225 } 226 227 return rr.bucket.MapCursor(opts...) 228 } 229 230 func (rr *RowReaderFrequency) transformToBitmap(pairs []lsmkv.MapPair) *sroar.Bitmap { 231 out := sroar.NewBitmap() 232 for _, pair := range pairs { 233 // this entry has a frequency, but that's only used for bm25, not for 234 // pure filtering, so we can ignore it here 235 if rr.shardVersion < 2 { 236 out.Set(binary.LittleEndian.Uint64(pair.Key)) 237 } else { 238 out.Set(binary.BigEndian.Uint64(pair.Key)) 239 } 240 } 241 return out 242 } 243 244 // equalHelper exists, because the Equal and NotEqual operators share this functionality 245 func (rr *RowReaderFrequency) equalHelper(ctx context.Context) (v []lsmkv.MapPair, err error) { 246 if err = ctx.Err(); err != nil { 247 return 248 } 249 250 if rr.shardVersion < 2 { 251 v, err = rr.bucket.MapList(rr.value, lsmkv.MapListAcceptDuplicates(), 252 lsmkv.MapListLegacySortingRequired()) 253 if err != nil { 254 return 255 } 256 } else { 257 v, err = rr.bucket.MapList(rr.value, lsmkv.MapListAcceptDuplicates()) 258 if err != nil { 259 return 260 } 261 } 262 return 263 }