github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/shard_read.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package db
    13  
    14  import (
    15  	"bytes"
    16  	"context"
    17  	"encoding/binary"
    18  	"fmt"
    19  	"time"
    20  
    21  	"github.com/go-openapi/strfmt"
    22  	"github.com/google/uuid"
    23  	"github.com/pkg/errors"
    24  	"github.com/sirupsen/logrus"
    25  	"github.com/weaviate/weaviate/adapters/repos/db/helpers"
    26  	"github.com/weaviate/weaviate/adapters/repos/db/inverted"
    27  	"github.com/weaviate/weaviate/adapters/repos/db/sorter"
    28  	"github.com/weaviate/weaviate/adapters/repos/db/vector/common"
    29  	"github.com/weaviate/weaviate/entities/additional"
    30  	"github.com/weaviate/weaviate/entities/filters"
    31  	"github.com/weaviate/weaviate/entities/multi"
    32  	"github.com/weaviate/weaviate/entities/schema"
    33  	"github.com/weaviate/weaviate/entities/search"
    34  	"github.com/weaviate/weaviate/entities/searchparams"
    35  	"github.com/weaviate/weaviate/entities/storobj"
    36  )
    37  
    38  func (s *Shard) ObjectByID(ctx context.Context, id strfmt.UUID, props search.SelectProperties, additional additional.Properties) (*storobj.Object, error) {
    39  	idBytes, err := uuid.MustParse(id.String()).MarshalBinary()
    40  	if err != nil {
    41  		return nil, err
    42  	}
    43  
    44  	bytes, err := s.store.Bucket(helpers.ObjectsBucketLSM).Get(idBytes)
    45  	if err != nil {
    46  		return nil, err
    47  	}
    48  
    49  	if bytes == nil {
    50  		return nil, nil
    51  	}
    52  
    53  	obj, err := storobj.FromBinary(bytes)
    54  	if err != nil {
    55  		return nil, errors.Wrap(err, "unmarshal object")
    56  	}
    57  
    58  	return obj, nil
    59  }
    60  
    61  func (s *Shard) MultiObjectByID(ctx context.Context, query []multi.Identifier) ([]*storobj.Object, error) {
    62  	objects := make([]*storobj.Object, len(query))
    63  
    64  	ids := make([][]byte, len(query))
    65  	for i, q := range query {
    66  		idBytes, err := uuid.MustParse(q.ID).MarshalBinary()
    67  		if err != nil {
    68  			return nil, err
    69  		}
    70  
    71  		ids[i] = idBytes
    72  	}
    73  
    74  	bucket := s.store.Bucket(helpers.ObjectsBucketLSM)
    75  	for i, id := range ids {
    76  		bytes, err := bucket.Get(id)
    77  		if err != nil {
    78  			return nil, err
    79  		}
    80  
    81  		if bytes == nil {
    82  			continue
    83  		}
    84  
    85  		obj, err := storobj.FromBinary(bytes)
    86  		if err != nil {
    87  			return nil, errors.Wrap(err, "unmarshal kind object")
    88  		}
    89  		objects[i] = obj
    90  	}
    91  
    92  	return objects, nil
    93  }
    94  
    95  // TODO: This does an actual read which is not really needed, if we see this
    96  // come up in profiling, we could optimize this by adding an explicit Exists()
    97  // on the LSMKV which only checks the bloom filters, which at least in the case
    98  // of a true negative would be considerably faster. For a (false) positive,
    99  // we'd still need to check, though.
   100  func (s *Shard) Exists(ctx context.Context, id strfmt.UUID) (bool, error) {
   101  	idBytes, err := uuid.MustParse(id.String()).MarshalBinary()
   102  	if err != nil {
   103  		return false, err
   104  	}
   105  
   106  	bytes, err := s.store.Bucket(helpers.ObjectsBucketLSM).Get(idBytes)
   107  	if err != nil {
   108  		return false, errors.Wrap(err, "read request")
   109  	}
   110  
   111  	if bytes == nil {
   112  		return false, nil
   113  	}
   114  
   115  	return true, nil
   116  }
   117  
   118  func (s *Shard) objectByIndexID(ctx context.Context, indexID uint64, acceptDeleted bool) (*storobj.Object, error) {
   119  	keyBuf := make([]byte, 8)
   120  	binary.LittleEndian.PutUint64(keyBuf, indexID)
   121  
   122  	bytes, err := s.store.Bucket(helpers.ObjectsBucketLSM).
   123  		GetBySecondary(0, keyBuf)
   124  	if err != nil {
   125  		return nil, err
   126  	}
   127  
   128  	if bytes == nil {
   129  		return nil, storobj.NewErrNotFoundf(indexID,
   130  			"uuid found for docID, but object is nil")
   131  	}
   132  
   133  	obj, err := storobj.FromBinary(bytes)
   134  	if err != nil {
   135  		return nil, errors.Wrap(err, "unmarshal kind object")
   136  	}
   137  
   138  	return obj, nil
   139  }
   140  
   141  func (s *Shard) vectorByIndexID(ctx context.Context, indexID uint64) ([]float32, error) {
   142  	keyBuf := make([]byte, 8)
   143  	return s.readVectorByIndexIDIntoSlice(ctx, indexID, &common.VectorSlice{Buff8: keyBuf})
   144  }
   145  
   146  func (s *Shard) readVectorByIndexIDIntoSlice(ctx context.Context, indexID uint64, container *common.VectorSlice) ([]float32, error) {
   147  	binary.LittleEndian.PutUint64(container.Buff8, indexID)
   148  
   149  	bytes, newBuff, err := s.store.Bucket(helpers.ObjectsBucketLSM).
   150  		GetBySecondaryIntoMemory(0, container.Buff8, container.Buff)
   151  	if err != nil {
   152  		return nil, err
   153  	}
   154  
   155  	if bytes == nil {
   156  		return nil, storobj.NewErrNotFoundf(indexID,
   157  			"no object for doc id, it could have been deleted")
   158  	}
   159  
   160  	container.Buff = newBuff
   161  	return storobj.VectorFromBinary(bytes, container.Slice)
   162  }
   163  
   164  func (s *Shard) ObjectSearch(ctx context.Context, limit int, filters *filters.LocalFilter,
   165  	keywordRanking *searchparams.KeywordRanking, sort []filters.Sort, cursor *filters.Cursor,
   166  	additional additional.Properties,
   167  ) ([]*storobj.Object, []float32, error) {
   168  	if keywordRanking != nil {
   169  		if v := s.versioner.Version(); v < 2 {
   170  			return nil, nil, errors.Errorf(
   171  				"shard was built with an older version of " +
   172  					"Weaviate which does not yet support BM25 search")
   173  		}
   174  
   175  		var bm25objs []*storobj.Object
   176  		var bm25count []float32
   177  		var err error
   178  		var objs helpers.AllowList
   179  		var filterDocIds helpers.AllowList
   180  
   181  		if filters != nil {
   182  			objs, err = inverted.NewSearcher(s.index.logger, s.store,
   183  				s.index.getSchema.GetSchemaSkipAuth(), s.propertyIndices,
   184  				s.index.classSearcher, s.index.stopwords, s.versioner.Version(),
   185  				s.isFallbackToSearchable, s.tenant(), s.index.Config.QueryNestedRefLimit,
   186  				s.bitmapFactory).
   187  				DocIDs(ctx, filters, additional, s.index.Config.ClassName)
   188  			if err != nil {
   189  				return nil, nil, err
   190  			}
   191  
   192  			filterDocIds = objs
   193  		}
   194  
   195  		className := s.index.Config.ClassName
   196  		bm25Config := s.index.getInvertedIndexConfig().BM25
   197  		logger := s.index.logger.WithFields(logrus.Fields{"class": s.index.Config.ClassName, "shard": s.name})
   198  		bm25searcher := inverted.NewBM25Searcher(bm25Config, s.store,
   199  			s.index.getSchema.GetSchemaSkipAuth(), s.propertyIndices, s.index.classSearcher,
   200  			s.GetPropertyLengthTracker(), logger, s.versioner.Version())
   201  		bm25objs, bm25count, err = bm25searcher.BM25F(ctx, filterDocIds, className, limit, *keywordRanking)
   202  		if err != nil {
   203  			return nil, nil, err
   204  		}
   205  
   206  		return bm25objs, bm25count, nil
   207  	}
   208  
   209  	if filters == nil {
   210  		objs, err := s.ObjectList(ctx, limit, sort,
   211  			cursor, additional, s.index.Config.ClassName)
   212  		return objs, nil, err
   213  	}
   214  	objs, err := inverted.NewSearcher(s.index.logger, s.store, s.index.getSchema.GetSchemaSkipAuth(),
   215  		s.propertyIndices, s.index.classSearcher, s.index.stopwords, s.versioner.Version(),
   216  		s.isFallbackToSearchable, s.tenant(), s.index.Config.QueryNestedRefLimit, s.bitmapFactory).
   217  		Objects(ctx, limit, filters, sort, additional, s.index.Config.ClassName)
   218  	return objs, nil, err
   219  }
   220  
   221  func (s *Shard) getIndexQueue(targetVector string) (*IndexQueue, error) {
   222  	if s.hasTargetVectors() {
   223  		if targetVector == "" {
   224  			return nil, fmt.Errorf("index queue: missing target vector")
   225  		}
   226  		queue, ok := s.queues[targetVector]
   227  		if !ok {
   228  			return nil, fmt.Errorf("index queue for target vector: %s doesn't exist", targetVector)
   229  		}
   230  		return queue, nil
   231  	}
   232  	return s.queue, nil
   233  }
   234  
   235  func (s *Shard) ObjectVectorSearch(ctx context.Context, searchVector []float32, targetVector string, targetDist float32, limit int, filters *filters.LocalFilter, sort []filters.Sort, groupBy *searchparams.GroupBy, additional additional.Properties) ([]*storobj.Object, []float32, error) {
   236  	var (
   237  		ids       []uint64
   238  		dists     []float32
   239  		err       error
   240  		allowList helpers.AllowList
   241  	)
   242  
   243  	if filters != nil {
   244  		beforeFilter := time.Now()
   245  		list, err := s.buildAllowList(ctx, filters, additional)
   246  		if err != nil {
   247  			return nil, nil, err
   248  		}
   249  		allowList = list
   250  		s.metrics.FilteredVectorFilter(time.Since(beforeFilter))
   251  	}
   252  
   253  	queue, err := s.getIndexQueue(targetVector)
   254  	if err != nil {
   255  		return nil, nil, err
   256  	}
   257  
   258  	beforeVector := time.Now()
   259  	if limit < 0 {
   260  		ids, dists, err = queue.SearchByVectorDistance(
   261  			searchVector, targetDist, s.index.Config.QueryMaximumResults, allowList)
   262  		if err != nil {
   263  			return nil, nil, errors.Wrap(err, "vector search by distance")
   264  		}
   265  	} else {
   266  		ids, dists, err = queue.SearchByVector(searchVector, limit, allowList)
   267  		if err != nil {
   268  			return nil, nil, errors.Wrap(err, "vector search")
   269  		}
   270  	}
   271  	if len(ids) == 0 {
   272  		return nil, nil, nil
   273  	}
   274  
   275  	if filters != nil {
   276  		s.metrics.FilteredVectorVector(time.Since(beforeVector))
   277  	}
   278  
   279  	if groupBy != nil {
   280  		return s.groupResults(ctx, ids, dists, groupBy, additional)
   281  	}
   282  
   283  	if len(sort) > 0 {
   284  		beforeSort := time.Now()
   285  		ids, dists, err = s.sortDocIDsAndDists(ctx, limit, sort,
   286  			s.index.Config.ClassName, ids, dists)
   287  		if err != nil {
   288  			return nil, nil, errors.Wrap(err, "vector search sort")
   289  		}
   290  		if filters != nil {
   291  			s.metrics.FilteredVectorSort(time.Since(beforeSort))
   292  		}
   293  	}
   294  
   295  	beforeObjects := time.Now()
   296  
   297  	bucket := s.store.Bucket(helpers.ObjectsBucketLSM)
   298  	objs, err := storobj.ObjectsByDocID(bucket, ids, additional)
   299  	if err != nil {
   300  		return nil, nil, err
   301  	}
   302  
   303  	if filters != nil {
   304  		s.metrics.FilteredVectorObjects(time.Since(beforeObjects))
   305  	}
   306  
   307  	return objs, dists, nil
   308  }
   309  
   310  func (s *Shard) ObjectList(ctx context.Context, limit int, sort []filters.Sort, cursor *filters.Cursor, additional additional.Properties, className schema.ClassName) ([]*storobj.Object, error) {
   311  	if len(sort) > 0 {
   312  		docIDs, err := s.sortedObjectList(ctx, limit, sort, className)
   313  		if err != nil {
   314  			return nil, err
   315  		}
   316  		bucket := s.store.Bucket(helpers.ObjectsBucketLSM)
   317  		return storobj.ObjectsByDocID(bucket, docIDs, additional)
   318  	}
   319  
   320  	if cursor == nil {
   321  		cursor = &filters.Cursor{After: "", Limit: limit}
   322  	}
   323  	return s.cursorObjectList(ctx, cursor, additional, className)
   324  }
   325  
   326  func (s *Shard) cursorObjectList(ctx context.Context, c *filters.Cursor,
   327  	additional additional.Properties,
   328  	className schema.ClassName,
   329  ) ([]*storobj.Object, error) {
   330  	cursor := s.store.Bucket(helpers.ObjectsBucketLSM).Cursor()
   331  	defer cursor.Close()
   332  
   333  	var key, val []byte
   334  	if c.After == "" {
   335  		key, val = cursor.First()
   336  	} else {
   337  		uuidBytes, err := uuid.MustParse(c.After).MarshalBinary()
   338  		if err != nil {
   339  			return nil, errors.Wrap(err, "after argument is not a valid uuid")
   340  		}
   341  		key, val = cursor.Seek(uuidBytes)
   342  		if bytes.Equal(key, uuidBytes) {
   343  			// move cursor by one if it's the same ID
   344  			key, val = cursor.Next()
   345  		}
   346  	}
   347  
   348  	i := 0
   349  	out := make([]*storobj.Object, c.Limit)
   350  
   351  	for ; key != nil && i < c.Limit; key, val = cursor.Next() {
   352  		obj, err := storobj.FromBinary(val)
   353  		if err != nil {
   354  			return nil, errors.Wrapf(err, "unmarhsal item %d", i)
   355  		}
   356  
   357  		out[i] = obj
   358  		i++
   359  	}
   360  
   361  	return out[:i], nil
   362  }
   363  
   364  func (s *Shard) sortedObjectList(ctx context.Context, limit int, sort []filters.Sort, className schema.ClassName) ([]uint64, error) {
   365  	lsmSorter, err := sorter.NewLSMSorter(s.store, s.index.getSchema.GetSchemaSkipAuth(), className)
   366  	if err != nil {
   367  		return nil, errors.Wrap(err, "sort object list")
   368  	}
   369  	docIDs, err := lsmSorter.Sort(ctx, limit, sort)
   370  	if err != nil {
   371  		return nil, errors.Wrap(err, "sort object list")
   372  	}
   373  	return docIDs, nil
   374  }
   375  
   376  func (s *Shard) sortDocIDsAndDists(ctx context.Context, limit int, sort []filters.Sort, className schema.ClassName, docIDs []uint64, dists []float32) ([]uint64, []float32, error) {
   377  	lsmSorter, err := sorter.NewLSMSorter(s.store, s.index.getSchema.GetSchemaSkipAuth(), className)
   378  	if err != nil {
   379  		return nil, nil, errors.Wrap(err, "sort objects with distances")
   380  	}
   381  	sortedDocIDs, sortedDists, err := lsmSorter.SortDocIDsAndDists(ctx, limit, sort, docIDs, dists)
   382  	if err != nil {
   383  		return nil, nil, errors.Wrap(err, "sort objects with distances")
   384  	}
   385  	return sortedDocIDs, sortedDists, nil
   386  }
   387  
   388  func (s *Shard) buildAllowList(ctx context.Context, filters *filters.LocalFilter, addl additional.Properties) (helpers.AllowList, error) {
   389  	list, err := inverted.NewSearcher(s.index.logger, s.store, s.index.getSchema.GetSchemaSkipAuth(),
   390  		s.propertyIndices, s.index.classSearcher, s.index.stopwords, s.versioner.Version(),
   391  		s.isFallbackToSearchable, s.tenant(), s.index.Config.QueryNestedRefLimit, s.bitmapFactory).
   392  		DocIDs(ctx, filters, addl, s.index.Config.ClassName)
   393  	if err != nil {
   394  		return nil, errors.Wrap(err, "build inverted filter allow list")
   395  	}
   396  
   397  	return list, nil
   398  }
   399  
   400  func (s *Shard) uuidFromDocID(docID uint64) (strfmt.UUID, error) {
   401  	bucket := s.store.Bucket(helpers.ObjectsBucketLSM)
   402  	if bucket == nil {
   403  		return "", errors.Errorf("objects bucket not found")
   404  	}
   405  
   406  	keyBuf := bytes.NewBuffer(nil)
   407  	binary.Write(keyBuf, binary.LittleEndian, &docID)
   408  	docIDBytes := keyBuf.Bytes()
   409  	res, err := bucket.GetBySecondary(0, docIDBytes)
   410  	if err != nil {
   411  		return "", err
   412  	}
   413  
   414  	prop, _, err := storobj.ParseAndExtractProperty(res, "id")
   415  	if err != nil {
   416  		return "", err
   417  	}
   418  
   419  	return strfmt.UUID(prop[0]), nil
   420  }
   421  
   422  func (s *Shard) batchDeleteObject(ctx context.Context, id strfmt.UUID) error {
   423  	idBytes, err := uuid.MustParse(id.String()).MarshalBinary()
   424  	if err != nil {
   425  		return err
   426  	}
   427  
   428  	var docID uint64
   429  	bucket := s.store.Bucket(helpers.ObjectsBucketLSM)
   430  	existing, err := bucket.Get(idBytes)
   431  	if err != nil {
   432  		return errors.Wrap(err, "unexpected error on previous lookup")
   433  	}
   434  
   435  	if existing == nil {
   436  		// nothing to do
   437  		return nil
   438  	}
   439  
   440  	// we need the doc ID so we can clean up inverted indices currently
   441  	// pointing to this object
   442  	docID, err = storobj.DocIDFromBinary(existing)
   443  	if err != nil {
   444  		return errors.Wrap(err, "get existing doc id from object binary")
   445  	}
   446  
   447  	err = bucket.Delete(idBytes)
   448  	if err != nil {
   449  		return errors.Wrap(err, "delete object from bucket")
   450  	}
   451  
   452  	err = s.cleanupInvertedIndexOnDelete(existing, docID)
   453  	if err != nil {
   454  		return errors.Wrap(err, "delete object from bucket")
   455  	}
   456  
   457  	if s.hasTargetVectors() {
   458  		for targetVector, queue := range s.queues {
   459  			if err = queue.Delete(docID); err != nil {
   460  				return fmt.Errorf("delete from vector index queue of vector %q: %w", targetVector, err)
   461  			}
   462  		}
   463  	} else {
   464  		if err = s.queue.Delete(docID); err != nil {
   465  			return errors.Wrap(err, "delete from vector index queue")
   466  		}
   467  	}
   468  
   469  	return nil
   470  }
   471  
   472  func (s *Shard) WasDeleted(ctx context.Context, id strfmt.UUID) (bool, error) {
   473  	idBytes, err := uuid.MustParse(id.String()).MarshalBinary()
   474  	if err != nil {
   475  		return false, err
   476  	}
   477  
   478  	bucket := s.store.Bucket(helpers.ObjectsBucketLSM)
   479  	return bucket.WasDeleted(idBytes)
   480  }