github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/shard_read.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package db 13 14 import ( 15 "bytes" 16 "context" 17 "encoding/binary" 18 "fmt" 19 "time" 20 21 "github.com/go-openapi/strfmt" 22 "github.com/google/uuid" 23 "github.com/pkg/errors" 24 "github.com/sirupsen/logrus" 25 "github.com/weaviate/weaviate/adapters/repos/db/helpers" 26 "github.com/weaviate/weaviate/adapters/repos/db/inverted" 27 "github.com/weaviate/weaviate/adapters/repos/db/sorter" 28 "github.com/weaviate/weaviate/adapters/repos/db/vector/common" 29 "github.com/weaviate/weaviate/entities/additional" 30 "github.com/weaviate/weaviate/entities/filters" 31 "github.com/weaviate/weaviate/entities/multi" 32 "github.com/weaviate/weaviate/entities/schema" 33 "github.com/weaviate/weaviate/entities/search" 34 "github.com/weaviate/weaviate/entities/searchparams" 35 "github.com/weaviate/weaviate/entities/storobj" 36 ) 37 38 func (s *Shard) ObjectByID(ctx context.Context, id strfmt.UUID, props search.SelectProperties, additional additional.Properties) (*storobj.Object, error) { 39 idBytes, err := uuid.MustParse(id.String()).MarshalBinary() 40 if err != nil { 41 return nil, err 42 } 43 44 bytes, err := s.store.Bucket(helpers.ObjectsBucketLSM).Get(idBytes) 45 if err != nil { 46 return nil, err 47 } 48 49 if bytes == nil { 50 return nil, nil 51 } 52 53 obj, err := storobj.FromBinary(bytes) 54 if err != nil { 55 return nil, errors.Wrap(err, "unmarshal object") 56 } 57 58 return obj, nil 59 } 60 61 func (s *Shard) MultiObjectByID(ctx context.Context, query []multi.Identifier) ([]*storobj.Object, error) { 62 objects := make([]*storobj.Object, len(query)) 63 64 ids := make([][]byte, len(query)) 65 for i, q := range query { 66 idBytes, err := uuid.MustParse(q.ID).MarshalBinary() 67 if err != nil { 68 return nil, err 69 } 70 71 ids[i] = idBytes 72 } 73 74 bucket := s.store.Bucket(helpers.ObjectsBucketLSM) 75 for i, id := range ids { 76 bytes, err := bucket.Get(id) 77 if err != nil { 78 return nil, err 79 } 80 81 if bytes == nil { 82 continue 83 } 84 85 obj, err := storobj.FromBinary(bytes) 86 if err != nil { 87 return nil, errors.Wrap(err, "unmarshal kind object") 88 } 89 objects[i] = obj 90 } 91 92 return objects, nil 93 } 94 95 // TODO: This does an actual read which is not really needed, if we see this 96 // come up in profiling, we could optimize this by adding an explicit Exists() 97 // on the LSMKV which only checks the bloom filters, which at least in the case 98 // of a true negative would be considerably faster. For a (false) positive, 99 // we'd still need to check, though. 100 func (s *Shard) Exists(ctx context.Context, id strfmt.UUID) (bool, error) { 101 idBytes, err := uuid.MustParse(id.String()).MarshalBinary() 102 if err != nil { 103 return false, err 104 } 105 106 bytes, err := s.store.Bucket(helpers.ObjectsBucketLSM).Get(idBytes) 107 if err != nil { 108 return false, errors.Wrap(err, "read request") 109 } 110 111 if bytes == nil { 112 return false, nil 113 } 114 115 return true, nil 116 } 117 118 func (s *Shard) objectByIndexID(ctx context.Context, indexID uint64, acceptDeleted bool) (*storobj.Object, error) { 119 keyBuf := make([]byte, 8) 120 binary.LittleEndian.PutUint64(keyBuf, indexID) 121 122 bytes, err := s.store.Bucket(helpers.ObjectsBucketLSM). 123 GetBySecondary(0, keyBuf) 124 if err != nil { 125 return nil, err 126 } 127 128 if bytes == nil { 129 return nil, storobj.NewErrNotFoundf(indexID, 130 "uuid found for docID, but object is nil") 131 } 132 133 obj, err := storobj.FromBinary(bytes) 134 if err != nil { 135 return nil, errors.Wrap(err, "unmarshal kind object") 136 } 137 138 return obj, nil 139 } 140 141 func (s *Shard) vectorByIndexID(ctx context.Context, indexID uint64) ([]float32, error) { 142 keyBuf := make([]byte, 8) 143 return s.readVectorByIndexIDIntoSlice(ctx, indexID, &common.VectorSlice{Buff8: keyBuf}) 144 } 145 146 func (s *Shard) readVectorByIndexIDIntoSlice(ctx context.Context, indexID uint64, container *common.VectorSlice) ([]float32, error) { 147 binary.LittleEndian.PutUint64(container.Buff8, indexID) 148 149 bytes, newBuff, err := s.store.Bucket(helpers.ObjectsBucketLSM). 150 GetBySecondaryIntoMemory(0, container.Buff8, container.Buff) 151 if err != nil { 152 return nil, err 153 } 154 155 if bytes == nil { 156 return nil, storobj.NewErrNotFoundf(indexID, 157 "no object for doc id, it could have been deleted") 158 } 159 160 container.Buff = newBuff 161 return storobj.VectorFromBinary(bytes, container.Slice) 162 } 163 164 func (s *Shard) ObjectSearch(ctx context.Context, limit int, filters *filters.LocalFilter, 165 keywordRanking *searchparams.KeywordRanking, sort []filters.Sort, cursor *filters.Cursor, 166 additional additional.Properties, 167 ) ([]*storobj.Object, []float32, error) { 168 if keywordRanking != nil { 169 if v := s.versioner.Version(); v < 2 { 170 return nil, nil, errors.Errorf( 171 "shard was built with an older version of " + 172 "Weaviate which does not yet support BM25 search") 173 } 174 175 var bm25objs []*storobj.Object 176 var bm25count []float32 177 var err error 178 var objs helpers.AllowList 179 var filterDocIds helpers.AllowList 180 181 if filters != nil { 182 objs, err = inverted.NewSearcher(s.index.logger, s.store, 183 s.index.getSchema.GetSchemaSkipAuth(), s.propertyIndices, 184 s.index.classSearcher, s.index.stopwords, s.versioner.Version(), 185 s.isFallbackToSearchable, s.tenant(), s.index.Config.QueryNestedRefLimit, 186 s.bitmapFactory). 187 DocIDs(ctx, filters, additional, s.index.Config.ClassName) 188 if err != nil { 189 return nil, nil, err 190 } 191 192 filterDocIds = objs 193 } 194 195 className := s.index.Config.ClassName 196 bm25Config := s.index.getInvertedIndexConfig().BM25 197 logger := s.index.logger.WithFields(logrus.Fields{"class": s.index.Config.ClassName, "shard": s.name}) 198 bm25searcher := inverted.NewBM25Searcher(bm25Config, s.store, 199 s.index.getSchema.GetSchemaSkipAuth(), s.propertyIndices, s.index.classSearcher, 200 s.GetPropertyLengthTracker(), logger, s.versioner.Version()) 201 bm25objs, bm25count, err = bm25searcher.BM25F(ctx, filterDocIds, className, limit, *keywordRanking) 202 if err != nil { 203 return nil, nil, err 204 } 205 206 return bm25objs, bm25count, nil 207 } 208 209 if filters == nil { 210 objs, err := s.ObjectList(ctx, limit, sort, 211 cursor, additional, s.index.Config.ClassName) 212 return objs, nil, err 213 } 214 objs, err := inverted.NewSearcher(s.index.logger, s.store, s.index.getSchema.GetSchemaSkipAuth(), 215 s.propertyIndices, s.index.classSearcher, s.index.stopwords, s.versioner.Version(), 216 s.isFallbackToSearchable, s.tenant(), s.index.Config.QueryNestedRefLimit, s.bitmapFactory). 217 Objects(ctx, limit, filters, sort, additional, s.index.Config.ClassName) 218 return objs, nil, err 219 } 220 221 func (s *Shard) getIndexQueue(targetVector string) (*IndexQueue, error) { 222 if s.hasTargetVectors() { 223 if targetVector == "" { 224 return nil, fmt.Errorf("index queue: missing target vector") 225 } 226 queue, ok := s.queues[targetVector] 227 if !ok { 228 return nil, fmt.Errorf("index queue for target vector: %s doesn't exist", targetVector) 229 } 230 return queue, nil 231 } 232 return s.queue, nil 233 } 234 235 func (s *Shard) ObjectVectorSearch(ctx context.Context, searchVector []float32, targetVector string, targetDist float32, limit int, filters *filters.LocalFilter, sort []filters.Sort, groupBy *searchparams.GroupBy, additional additional.Properties) ([]*storobj.Object, []float32, error) { 236 var ( 237 ids []uint64 238 dists []float32 239 err error 240 allowList helpers.AllowList 241 ) 242 243 if filters != nil { 244 beforeFilter := time.Now() 245 list, err := s.buildAllowList(ctx, filters, additional) 246 if err != nil { 247 return nil, nil, err 248 } 249 allowList = list 250 s.metrics.FilteredVectorFilter(time.Since(beforeFilter)) 251 } 252 253 queue, err := s.getIndexQueue(targetVector) 254 if err != nil { 255 return nil, nil, err 256 } 257 258 beforeVector := time.Now() 259 if limit < 0 { 260 ids, dists, err = queue.SearchByVectorDistance( 261 searchVector, targetDist, s.index.Config.QueryMaximumResults, allowList) 262 if err != nil { 263 return nil, nil, errors.Wrap(err, "vector search by distance") 264 } 265 } else { 266 ids, dists, err = queue.SearchByVector(searchVector, limit, allowList) 267 if err != nil { 268 return nil, nil, errors.Wrap(err, "vector search") 269 } 270 } 271 if len(ids) == 0 { 272 return nil, nil, nil 273 } 274 275 if filters != nil { 276 s.metrics.FilteredVectorVector(time.Since(beforeVector)) 277 } 278 279 if groupBy != nil { 280 return s.groupResults(ctx, ids, dists, groupBy, additional) 281 } 282 283 if len(sort) > 0 { 284 beforeSort := time.Now() 285 ids, dists, err = s.sortDocIDsAndDists(ctx, limit, sort, 286 s.index.Config.ClassName, ids, dists) 287 if err != nil { 288 return nil, nil, errors.Wrap(err, "vector search sort") 289 } 290 if filters != nil { 291 s.metrics.FilteredVectorSort(time.Since(beforeSort)) 292 } 293 } 294 295 beforeObjects := time.Now() 296 297 bucket := s.store.Bucket(helpers.ObjectsBucketLSM) 298 objs, err := storobj.ObjectsByDocID(bucket, ids, additional) 299 if err != nil { 300 return nil, nil, err 301 } 302 303 if filters != nil { 304 s.metrics.FilteredVectorObjects(time.Since(beforeObjects)) 305 } 306 307 return objs, dists, nil 308 } 309 310 func (s *Shard) ObjectList(ctx context.Context, limit int, sort []filters.Sort, cursor *filters.Cursor, additional additional.Properties, className schema.ClassName) ([]*storobj.Object, error) { 311 if len(sort) > 0 { 312 docIDs, err := s.sortedObjectList(ctx, limit, sort, className) 313 if err != nil { 314 return nil, err 315 } 316 bucket := s.store.Bucket(helpers.ObjectsBucketLSM) 317 return storobj.ObjectsByDocID(bucket, docIDs, additional) 318 } 319 320 if cursor == nil { 321 cursor = &filters.Cursor{After: "", Limit: limit} 322 } 323 return s.cursorObjectList(ctx, cursor, additional, className) 324 } 325 326 func (s *Shard) cursorObjectList(ctx context.Context, c *filters.Cursor, 327 additional additional.Properties, 328 className schema.ClassName, 329 ) ([]*storobj.Object, error) { 330 cursor := s.store.Bucket(helpers.ObjectsBucketLSM).Cursor() 331 defer cursor.Close() 332 333 var key, val []byte 334 if c.After == "" { 335 key, val = cursor.First() 336 } else { 337 uuidBytes, err := uuid.MustParse(c.After).MarshalBinary() 338 if err != nil { 339 return nil, errors.Wrap(err, "after argument is not a valid uuid") 340 } 341 key, val = cursor.Seek(uuidBytes) 342 if bytes.Equal(key, uuidBytes) { 343 // move cursor by one if it's the same ID 344 key, val = cursor.Next() 345 } 346 } 347 348 i := 0 349 out := make([]*storobj.Object, c.Limit) 350 351 for ; key != nil && i < c.Limit; key, val = cursor.Next() { 352 obj, err := storobj.FromBinary(val) 353 if err != nil { 354 return nil, errors.Wrapf(err, "unmarhsal item %d", i) 355 } 356 357 out[i] = obj 358 i++ 359 } 360 361 return out[:i], nil 362 } 363 364 func (s *Shard) sortedObjectList(ctx context.Context, limit int, sort []filters.Sort, className schema.ClassName) ([]uint64, error) { 365 lsmSorter, err := sorter.NewLSMSorter(s.store, s.index.getSchema.GetSchemaSkipAuth(), className) 366 if err != nil { 367 return nil, errors.Wrap(err, "sort object list") 368 } 369 docIDs, err := lsmSorter.Sort(ctx, limit, sort) 370 if err != nil { 371 return nil, errors.Wrap(err, "sort object list") 372 } 373 return docIDs, nil 374 } 375 376 func (s *Shard) sortDocIDsAndDists(ctx context.Context, limit int, sort []filters.Sort, className schema.ClassName, docIDs []uint64, dists []float32) ([]uint64, []float32, error) { 377 lsmSorter, err := sorter.NewLSMSorter(s.store, s.index.getSchema.GetSchemaSkipAuth(), className) 378 if err != nil { 379 return nil, nil, errors.Wrap(err, "sort objects with distances") 380 } 381 sortedDocIDs, sortedDists, err := lsmSorter.SortDocIDsAndDists(ctx, limit, sort, docIDs, dists) 382 if err != nil { 383 return nil, nil, errors.Wrap(err, "sort objects with distances") 384 } 385 return sortedDocIDs, sortedDists, nil 386 } 387 388 func (s *Shard) buildAllowList(ctx context.Context, filters *filters.LocalFilter, addl additional.Properties) (helpers.AllowList, error) { 389 list, err := inverted.NewSearcher(s.index.logger, s.store, s.index.getSchema.GetSchemaSkipAuth(), 390 s.propertyIndices, s.index.classSearcher, s.index.stopwords, s.versioner.Version(), 391 s.isFallbackToSearchable, s.tenant(), s.index.Config.QueryNestedRefLimit, s.bitmapFactory). 392 DocIDs(ctx, filters, addl, s.index.Config.ClassName) 393 if err != nil { 394 return nil, errors.Wrap(err, "build inverted filter allow list") 395 } 396 397 return list, nil 398 } 399 400 func (s *Shard) uuidFromDocID(docID uint64) (strfmt.UUID, error) { 401 bucket := s.store.Bucket(helpers.ObjectsBucketLSM) 402 if bucket == nil { 403 return "", errors.Errorf("objects bucket not found") 404 } 405 406 keyBuf := bytes.NewBuffer(nil) 407 binary.Write(keyBuf, binary.LittleEndian, &docID) 408 docIDBytes := keyBuf.Bytes() 409 res, err := bucket.GetBySecondary(0, docIDBytes) 410 if err != nil { 411 return "", err 412 } 413 414 prop, _, err := storobj.ParseAndExtractProperty(res, "id") 415 if err != nil { 416 return "", err 417 } 418 419 return strfmt.UUID(prop[0]), nil 420 } 421 422 func (s *Shard) batchDeleteObject(ctx context.Context, id strfmt.UUID) error { 423 idBytes, err := uuid.MustParse(id.String()).MarshalBinary() 424 if err != nil { 425 return err 426 } 427 428 var docID uint64 429 bucket := s.store.Bucket(helpers.ObjectsBucketLSM) 430 existing, err := bucket.Get(idBytes) 431 if err != nil { 432 return errors.Wrap(err, "unexpected error on previous lookup") 433 } 434 435 if existing == nil { 436 // nothing to do 437 return nil 438 } 439 440 // we need the doc ID so we can clean up inverted indices currently 441 // pointing to this object 442 docID, err = storobj.DocIDFromBinary(existing) 443 if err != nil { 444 return errors.Wrap(err, "get existing doc id from object binary") 445 } 446 447 err = bucket.Delete(idBytes) 448 if err != nil { 449 return errors.Wrap(err, "delete object from bucket") 450 } 451 452 err = s.cleanupInvertedIndexOnDelete(existing, docID) 453 if err != nil { 454 return errors.Wrap(err, "delete object from bucket") 455 } 456 457 if s.hasTargetVectors() { 458 for targetVector, queue := range s.queues { 459 if err = queue.Delete(docID); err != nil { 460 return fmt.Errorf("delete from vector index queue of vector %q: %w", targetVector, err) 461 } 462 } 463 } else { 464 if err = s.queue.Delete(docID); err != nil { 465 return errors.Wrap(err, "delete from vector index queue") 466 } 467 } 468 469 return nil 470 } 471 472 func (s *Shard) WasDeleted(ctx context.Context, id strfmt.UUID) (bool, error) { 473 idBytes, err := uuid.MustParse(id.String()).MarshalBinary() 474 if err != nil { 475 return false, err 476 } 477 478 bucket := s.store.Bucket(helpers.ObjectsBucketLSM) 479 return bucket.WasDeleted(idBytes) 480 }