github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/shard_write_merge.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package db
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  
    18  	"github.com/google/uuid"
    19  	"github.com/pkg/errors"
    20  	"github.com/weaviate/weaviate/adapters/repos/db/helpers"
    21  	"github.com/weaviate/weaviate/entities/models"
    22  	"github.com/weaviate/weaviate/entities/storagestate"
    23  	"github.com/weaviate/weaviate/entities/storobj"
    24  	"github.com/weaviate/weaviate/usecases/objects"
    25  )
    26  
    27  func (s *Shard) MergeObject(ctx context.Context, merge objects.MergeDocument) error {
    28  	if s.isReadOnly() {
    29  		return storagestate.ErrStatusReadOnly
    30  	}
    31  
    32  	if s.hasTargetVectors() {
    33  		for targetVector, vector := range merge.Vectors {
    34  			// validation needs to happen before any changes are done. Otherwise, insertion is aborted somewhere in-between.
    35  			vectorIndex := s.VectorIndexForName(targetVector)
    36  			if vectorIndex == nil {
    37  				return errors.Errorf("Validate vector index for update of %v for target vector %s: vector index not found", merge.ID, targetVector)
    38  			}
    39  			err := vectorIndex.ValidateBeforeInsert(vector)
    40  			if err != nil {
    41  				return errors.Wrapf(err, "Validate vector index for update of %v for target vector %s", merge.ID, targetVector)
    42  			}
    43  		}
    44  	} else {
    45  		if merge.Vector != nil {
    46  			// validation needs to happen before any changes are done. Otherwise, insertion is aborted somewhere in-between.
    47  			err := s.vectorIndex.ValidateBeforeInsert(merge.Vector)
    48  			if err != nil {
    49  				return errors.Wrapf(err, "Validate vector index for update of %v", merge.ID)
    50  			}
    51  		}
    52  	}
    53  
    54  	idBytes, err := uuid.MustParse(merge.ID.String()).MarshalBinary()
    55  	if err != nil {
    56  		return err
    57  	}
    58  
    59  	return s.merge(ctx, idBytes, merge)
    60  }
    61  
    62  func (s *Shard) merge(ctx context.Context, idBytes []byte, doc objects.MergeDocument) error {
    63  	obj, status, err := s.mergeObjectInStorage(doc, idBytes)
    64  	if err != nil {
    65  		return err
    66  	}
    67  
    68  	// object was not changed, no further updates are required
    69  	// https://github.com/weaviate/weaviate/issues/3949
    70  	if status.skipUpsert {
    71  		return nil
    72  	}
    73  
    74  	if s.hasTargetVectors() {
    75  		for targetVector, vector := range obj.Vectors {
    76  			if err := s.updateVectorIndexForName(vector, status, targetVector); err != nil {
    77  				return errors.Wrapf(err, "update vector index for target vector %s", targetVector)
    78  			}
    79  		}
    80  	} else {
    81  		if err := s.updateVectorIndex(obj.Vector, status); err != nil {
    82  			return errors.Wrap(err, "update vector index")
    83  		}
    84  	}
    85  
    86  	if err := s.updatePropertySpecificIndices(obj, status); err != nil {
    87  		return errors.Wrap(err, "update property-specific indices")
    88  	}
    89  
    90  	if err := s.store.WriteWALs(); err != nil {
    91  		return errors.Wrap(err, "flush all buffered WALs")
    92  	}
    93  
    94  	return nil
    95  }
    96  
    97  func (s *Shard) mergeObjectInStorage(merge objects.MergeDocument,
    98  	idBytes []byte,
    99  ) (*storobj.Object, objectInsertStatus, error) {
   100  	bucket := s.store.Bucket(helpers.ObjectsBucketLSM)
   101  
   102  	var prevObj, obj *storobj.Object
   103  	var status objectInsertStatus
   104  
   105  	// see comment in shard_write_put.go::putObjectLSM
   106  	lock := &s.docIdLock[s.uuidToIdLockPoolId(idBytes)]
   107  
   108  	// wrapped in function to handle lock/unlock
   109  	if err := func() error {
   110  		lock.Lock()
   111  		defer lock.Unlock()
   112  
   113  		var err error
   114  		prevObj, err = fetchObject(bucket, idBytes)
   115  		if err != nil {
   116  			return errors.Wrap(err, "get bucket")
   117  		}
   118  
   119  		obj, _, err = s.mergeObjectData(prevObj, merge)
   120  		if err != nil {
   121  			return errors.Wrap(err, "merge object data")
   122  		}
   123  
   124  		status, err = s.determineInsertStatus(prevObj, obj)
   125  		if err != nil {
   126  			return errors.Wrap(err, "check insert/update status")
   127  		}
   128  
   129  		obj.DocID = status.docID
   130  		if status.skipUpsert {
   131  			return nil
   132  		}
   133  
   134  		objBytes, err := obj.MarshalBinary()
   135  		if err != nil {
   136  			return errors.Wrapf(err, "marshal object %s to binary", obj.ID())
   137  		}
   138  
   139  		if err := s.upsertObjectDataLSM(bucket, idBytes, objBytes, status.docID); err != nil {
   140  			return errors.Wrap(err, "upsert object data")
   141  		}
   142  
   143  		return nil
   144  	}(); err != nil {
   145  		return nil, objectInsertStatus{}, err
   146  	} else if status.skipUpsert {
   147  		return obj, status, nil
   148  	}
   149  
   150  	if err := s.updateInvertedIndexLSM(obj, status, prevObj); err != nil {
   151  		return nil, status, errors.Wrap(err, "update inverted indices")
   152  	}
   153  
   154  	return obj, status, nil
   155  }
   156  
   157  // mutableMergeObjectLSM is a special version of mergeObjectInTx where no doc
   158  // id increases will be made, but instead the old doc ID will be re-used. This
   159  // is only possible if the following two conditions are met:
   160  //
   161  //  1. We only add to the inverted index, but there is nothing which requires
   162  //     cleaning up. Example `name: "John"` is updated to `name: "John Doe"`,
   163  //     this is valid because we only add new entry for "Doe", but do not alter
   164  //     the existing entry for "John"
   165  //     An invalid update would be `name:"John"` is updated to `name:"Diane"`,
   166  //     this would require a cleanup for the existing link from "John" to this
   167  //     doc id, which is not possible. The only way to clean up is to increase
   168  //     the doc id and delete all entries for the old one
   169  //
   170  //  2. The vector position is not altered. Vector Indices cannot be mutated
   171  //     therefore a vector update would not be reflected
   172  //
   173  // The above makes this a perfect candidate for a batch reference update as
   174  // this alters neither the vector position, nor does it remove anything from
   175  // the inverted index
   176  func (s *Shard) mutableMergeObjectLSM(merge objects.MergeDocument,
   177  	idBytes []byte,
   178  ) (mutableMergeResult, error) {
   179  	bucket := s.store.Bucket(helpers.ObjectsBucketLSM)
   180  	out := mutableMergeResult{}
   181  
   182  	// see comment in shard_write_put.go::putObjectLSM
   183  	lock := &s.docIdLock[s.uuidToIdLockPoolId(idBytes)]
   184  	lock.Lock()
   185  	defer lock.Unlock()
   186  
   187  	prevObj, err := fetchObject(bucket, idBytes)
   188  	if err != nil {
   189  		return out, err
   190  	}
   191  
   192  	if prevObj == nil {
   193  		uid := uuid.UUID{}
   194  		uid.UnmarshalBinary(idBytes)
   195  		return out, fmt.Errorf("object with id %s not found", uid)
   196  	}
   197  
   198  	obj, notEmptyPrevObj, err := s.mergeObjectData(prevObj, merge)
   199  	if err != nil {
   200  		return out, errors.Wrap(err, "merge object data")
   201  	}
   202  
   203  	out.next = obj
   204  	out.previous = notEmptyPrevObj
   205  
   206  	status, err := s.determineMutableInsertStatus(prevObj, obj)
   207  	if err != nil {
   208  		return out, errors.Wrap(err, "check insert/update status")
   209  	}
   210  	out.status = status
   211  
   212  	obj.DocID = status.docID // is not changed
   213  	objBytes, err := obj.MarshalBinary()
   214  	if err != nil {
   215  		return out, errors.Wrapf(err, "marshal object %s to binary", obj.ID())
   216  	}
   217  
   218  	if err := s.upsertObjectDataLSM(bucket, idBytes, objBytes, status.docID); err != nil {
   219  		return out, errors.Wrap(err, "upsert object data")
   220  	}
   221  
   222  	// do not updated inverted index, since this requires delta analysis, which
   223  	// must be done by the caller!
   224  
   225  	return out, nil
   226  }
   227  
   228  type mutableMergeResult struct {
   229  	next     *storobj.Object
   230  	previous *storobj.Object
   231  	status   objectInsertStatus
   232  }
   233  
   234  func (s *Shard) mergeObjectData(prevObj *storobj.Object,
   235  	merge objects.MergeDocument,
   236  ) (*storobj.Object, *storobj.Object, error) {
   237  	if prevObj == nil {
   238  		// DocID must be overwritten after status check, simply set to initial
   239  		// value
   240  		prevObj = storobj.New(0)
   241  		prevObj.SetClass(merge.Class)
   242  		prevObj.SetID(merge.ID)
   243  	}
   244  
   245  	return mergeProps(prevObj, merge), prevObj, nil
   246  }
   247  
   248  func mergeProps(previous *storobj.Object,
   249  	merge objects.MergeDocument,
   250  ) *storobj.Object {
   251  	next := previous.DeepCopyDangerous()
   252  	properties, ok := next.Properties().(map[string]interface{})
   253  	if !ok || properties == nil {
   254  		properties = map[string]interface{}{}
   255  	}
   256  
   257  	// remove properties from object that have been set to nil
   258  	for _, propToDelete := range merge.PropertiesToDelete {
   259  		delete(properties, propToDelete)
   260  	}
   261  
   262  	for propName, value := range merge.PrimitiveSchema {
   263  		// for primitive props, we simply need to overwrite
   264  		properties[propName] = value
   265  	}
   266  
   267  	for _, ref := range merge.References {
   268  		propName := ref.From.Property.String()
   269  		prop := properties[propName]
   270  		propParsed, ok := prop.(models.MultipleRef)
   271  		if !ok {
   272  			propParsed = models.MultipleRef{}
   273  		}
   274  		propParsed = append(propParsed, ref.To.SingleRef())
   275  		properties[propName] = propParsed
   276  	}
   277  
   278  	if merge.Vector == nil {
   279  		next.Vector = previous.Vector
   280  	} else {
   281  		next.Vector = merge.Vector
   282  	}
   283  
   284  	if len(merge.Vectors) == 0 {
   285  		next.Vectors = previous.Vectors
   286  	} else {
   287  		next.Vectors = vectorsAsMap(merge.Vectors)
   288  	}
   289  
   290  	next.Object.LastUpdateTimeUnix = merge.UpdateTime
   291  	next.SetProperties(properties)
   292  
   293  	return next
   294  }
   295  
   296  func vectorsAsMap(in models.Vectors) map[string][]float32 {
   297  	if len(in) > 0 {
   298  		out := make(map[string][]float32)
   299  		for targetVector, vector := range in {
   300  			out[targetVector] = vector
   301  		}
   302  		return out
   303  	}
   304  	return nil
   305  }