github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/shard_write_batch_objects.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package db
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  	"os"
    18  	"runtime/debug"
    19  	"sync"
    20  	"time"
    21  
    22  	enterrors "github.com/weaviate/weaviate/entities/errors"
    23  	"github.com/weaviate/weaviate/usecases/configbase"
    24  
    25  	"github.com/go-openapi/strfmt"
    26  	"github.com/google/uuid"
    27  	"github.com/pkg/errors"
    28  	"github.com/weaviate/weaviate/entities/storagestate"
    29  	"github.com/weaviate/weaviate/entities/storobj"
    30  )
    31  
// PutObjectBatch imports a batch of objects into the shard. On the normal
// path the returned slice is aligned with the input: entry i holds the error
// for objects[i], or nil on success. If the shard is read-only, a
// single-element slice containing storagestate.ErrStatusReadOnly is returned
// instead (NOTE(review): not aligned with the input length — confirm callers
// handle this shape).
func (s *Shard) PutObjectBatch(ctx context.Context,
	objects []*storobj.Object,
) []error {
	if s.isReadOnly() {
		return []error{storagestate.ErrStatusReadOnly}
	}

	return s.putBatch(ctx, objects)
}
    42  
    43  // asyncEnabled is a quick and dirty way to create a feature flag for async
    44  // indexing.
    45  func asyncEnabled() bool {
    46  	return configbase.Enabled(os.Getenv("ASYNC_INDEXING"))
    47  }
    48  
    49  // Workers are started with the first batch and keep working as there are objects to add from any batch. Each batch
    50  // adds its jobs (that contain the respective object) to a single queue that is then processed by the workers.
    51  // When the last batch finishes, all workers receive a shutdown signal and exit
    52  func (s *Shard) putBatch(ctx context.Context,
    53  	objects []*storobj.Object,
    54  ) []error {
    55  	if asyncEnabled() {
    56  		return s.putBatchAsync(ctx, objects)
    57  	}
    58  	// Workers are started with the first batch and keep working as there are objects to add from any batch. Each batch
    59  	// adds its jobs (that contain the respective object) to a single queue that is then processed by the workers.
    60  	// When the last batch finishes, all workers receive a shutdown signal and exit
    61  	batcher := newObjectsBatcher(s)
    62  	err := batcher.Objects(ctx, objects)
    63  
    64  	// block until all objects of batch have been added
    65  	batcher.wg.Wait()
    66  	s.metrics.VectorIndex(batcher.batchStartTime)
    67  
    68  	return err
    69  }
    70  
    71  func (s *Shard) putBatchAsync(ctx context.Context, objects []*storobj.Object) []error {
    72  	beforeBatch := time.Now()
    73  	defer s.metrics.BatchObject(beforeBatch, len(objects))
    74  
    75  	batcher := newObjectsBatcher(s)
    76  
    77  	batcher.init(objects)
    78  	batcher.storeInObjectStore(ctx)
    79  	batcher.markDeletedInVectorStorage(ctx)
    80  	batcher.storeAdditionalStorageWithAsyncQueue(ctx)
    81  	batcher.flushWALs(ctx)
    82  
    83  	return batcher.errs
    84  }
    85  
// objectsBatcher is a helper type wrapping around an underlying shard that can
// execute objects batch operations on a shard (as opposed to references batch
// operations)
type objectsBatcher struct {
	// the embedded mutex guards concurrent writes to statuses and errs
	sync.Mutex
	// the shard all operations are executed against
	shard          ShardLike
	// per-object insert result, keyed by object ID; written via setStatusForID
	statuses       map[strfmt.UUID]objectInsertStatus
	// per-object errors, aligned with objects by position
	errs           []error
	// batch positions to ignore because a later entry carries the same ID
	duplicates     map[int]struct{}
	// the batch currently being processed
	objects        []*storobj.Object
	// incremented per job dispatched to the worker queue; waited on in
	// putBatch (Done is presumably called by the workers — not visible here)
	wg             sync.WaitGroup
	// start of the additional-storage phase, consumed by the VectorIndex metric
	batchStartTime time.Time
}
    99  
   100  func newObjectsBatcher(s ShardLike) *objectsBatcher {
   101  	return &objectsBatcher{shard: s}
   102  }
   103  
   104  // Objects imports the specified objects in parallel in a batch-fashion
   105  func (ob *objectsBatcher) Objects(ctx context.Context,
   106  	objects []*storobj.Object,
   107  ) []error {
   108  	beforeBatch := time.Now()
   109  	defer ob.shard.Metrics().BatchObject(beforeBatch, len(objects))
   110  
   111  	ob.init(objects)
   112  	ob.storeInObjectStore(ctx)
   113  	ob.markDeletedInVectorStorage(ctx)
   114  	ob.storeAdditionalStorageWithWorkers(ctx)
   115  	ob.flushWALs(ctx)
   116  	return ob.errs
   117  }
   118  
   119  func (ob *objectsBatcher) init(objects []*storobj.Object) {
   120  	ob.objects = objects
   121  	ob.statuses = map[strfmt.UUID]objectInsertStatus{}
   122  	ob.errs = make([]error, len(objects))
   123  	ob.duplicates = findDuplicatesInBatchObjects(objects)
   124  }
   125  
   126  // storeInObjectStore performs all storage operations on the underlying
   127  // key/value store, this is they object-by-id store, the docID-lookup tables,
   128  // as well as all inverted indices.
   129  func (ob *objectsBatcher) storeInObjectStore(ctx context.Context) {
   130  	beforeObjectStore := time.Now()
   131  
   132  	errs := ob.storeSingleBatchInLSM(ctx, ob.objects)
   133  	for i, err := range errs {
   134  		if err != nil {
   135  			ob.setErrorAtIndex(err, i)
   136  		}
   137  	}
   138  
   139  	ob.shard.Metrics().ObjectStore(beforeObjectStore)
   140  }
   141  
// storeSingleBatchInLSM writes one batch of objects into the LSM store in
// parallel. It returns an error slice aligned with the batch positions: a
// nil entry means the object at that position was stored successfully. If
// the context is already expired, every position fails immediately.
func (ob *objectsBatcher) storeSingleBatchInLSM(ctx context.Context,
	batch []*storobj.Object,
) []error {
	errs := make([]error, len(batch))
	errLock := &sync.Mutex{}

	// if the context is expired fail all
	if err := ctx.Err(); err != nil {
		for i := range errs {
			errs[i] = errors.Wrap(err, "begin batch")
		}
		return errs
	}

	wg := &sync.WaitGroup{}
	// semaphore bounding how many goroutines do actual work at once
	concurrencyLimit := make(chan struct{}, _NUMCPU)

	for j, object := range batch {
		wg.Add(1)
		object := object // copy loop variables for the closure (pre-Go-1.22 capture semantics)
		index := j
		f := func() {
			defer wg.Done()

			// Acquire a semaphore to control the concurrency. Note that one
			// goroutine per object is still spawned; the semaphore only bounds
			// how many of them perform work concurrently. With very large batch
			// sizes (e.g. 1000 or 10000+), unbounded concurrency isn't helpful
			// and just leads to more lock contention down the line – especially
			// when there's lots of text to be indexed in the inverted index.
			concurrencyLimit <- struct{}{}
			defer func() {
				// Release the semaphore when the goroutine is done.
				<-concurrencyLimit
			}()

			if err := ob.storeObjectOfBatchInLSM(ctx, index, object); err != nil {
				errLock.Lock()
				errs[index] = err
				errLock.Unlock()
			}
		}
		enterrors.GoWrapper(f, ob.shard.Index().logger)

	}
	wg.Wait()

	return errs
}
   190  
   191  func (ob *objectsBatcher) storeObjectOfBatchInLSM(ctx context.Context,
   192  	objectIndex int, object *storobj.Object,
   193  ) error {
   194  	if _, ok := ob.duplicates[objectIndex]; ok {
   195  		return nil
   196  	}
   197  	uuidParsed, err := uuid.Parse(object.ID().String())
   198  	if err != nil {
   199  		return errors.Wrap(err, "invalid id")
   200  	}
   201  
   202  	idBytes, err := uuidParsed.MarshalBinary()
   203  	if err != nil {
   204  		return err
   205  	}
   206  
   207  	status, err := ob.shard.putObjectLSM(object, idBytes)
   208  	if err != nil {
   209  		return err
   210  	}
   211  
   212  	ob.setStatusForID(status, object.ID())
   213  
   214  	if err := ctx.Err(); err != nil {
   215  		return errors.Wrapf(err, "end store object %d of batch", objectIndex)
   216  	}
   217  	return nil
   218  }
   219  
   220  // setStatusForID is thread-safe as it uses the underlying mutex to lock the
   221  // statuses map when writing into it
   222  func (ob *objectsBatcher) setStatusForID(status objectInsertStatus, id strfmt.UUID) {
   223  	ob.Lock()
   224  	defer ob.Unlock()
   225  	ob.statuses[id] = status
   226  }
   227  
   228  func (ob *objectsBatcher) markDeletedInVectorStorage(ctx context.Context) {
   229  	var docIDsToDelete []uint64
   230  	var positions []int
   231  	for pos, object := range ob.objects {
   232  		status := ob.statuses[object.ID()]
   233  		if status.docIDChanged {
   234  			docIDsToDelete = append(docIDsToDelete, status.oldDocID)
   235  			positions = append(positions, pos)
   236  		}
   237  	}
   238  
   239  	if len(docIDsToDelete) == 0 {
   240  		return
   241  	}
   242  
   243  	if ob.shard.hasTargetVectors() {
   244  		for targetVector, queue := range ob.shard.Queues() {
   245  			if err := queue.Delete(docIDsToDelete...); err != nil {
   246  				for _, pos := range positions {
   247  					ob.setErrorAtIndex(fmt.Errorf("target vector %s: %w", targetVector, err), pos)
   248  				}
   249  			}
   250  		}
   251  	} else {
   252  		if err := ob.shard.Queue().Delete(docIDsToDelete...); err != nil {
   253  			for _, pos := range positions {
   254  				ob.setErrorAtIndex(err, pos)
   255  			}
   256  		}
   257  	}
   258  }
   259  
   260  // storeAdditionalStorageWithWorkers stores the object in all non-key-value
   261  // stores, such as the main vector index as well as the property-specific
   262  // indices, such as the geo-index.
   263  func (ob *objectsBatcher) storeAdditionalStorageWithWorkers(ctx context.Context) {
   264  	if ok := ob.checkContext(ctx); !ok {
   265  		// if the context is no longer OK, there's no point in continuing - abort
   266  		// early
   267  		return
   268  	}
   269  
   270  	ob.batchStartTime = time.Now()
   271  
   272  	for i, object := range ob.objects {
   273  		status := ob.statuses[object.ID()]
   274  		if ob.shouldSkipInAdditionalStorage(i, status) {
   275  			continue
   276  		}
   277  
   278  		ob.wg.Add(1)
   279  		ob.shard.addJobToQueue(job{
   280  			object:  object,
   281  			status:  status,
   282  			index:   i,
   283  			ctx:     ctx,
   284  			batcher: ob,
   285  		})
   286  	}
   287  }
   288  
// storeAdditionalStorageWithAsyncQueue is the async-indexing counterpart of
// storeAdditionalStorageWithWorkers: instead of dispatching per-object jobs,
// it collects all changed vectors and pushes them to the shard's async
// queue(s) in bulk. Property-specific (geo) indices are still updated inline
// here, as they are not served by the vector queues.
func (ob *objectsBatcher) storeAdditionalStorageWithAsyncQueue(ctx context.Context) {
	if ok := ob.checkContext(ctx); !ok {
		// if the context is no longer OK, there's no point in continuing - abort
		// early
		return
	}

	ob.batchStartTime = time.Now()
	shouldGeoIndex := ob.shard.hasGeoIndex()

	// depending on shard configuration, vectors either go into one queue
	// (single-vector setup) or into one queue per named target vector
	var vectors []vectorDescriptor
	var targetVectors map[string][]vectorDescriptor
	hasTargetVectors := ob.shard.hasTargetVectors()
	if hasTargetVectors {
		targetVectors = make(map[string][]vectorDescriptor)
	} else {
		vectors = make([]vectorDescriptor, 0, len(ob.objects))
	}

	for i, object := range ob.objects {
		status := ob.statuses[object.ID()]

		// skip errored positions, unchanged upserts and duplicates
		if ob.shouldSkipInAdditionalStorage(i, status) {
			continue
		}

		if shouldGeoIndex {
			if err := ob.shard.updatePropertySpecificIndices(object, status); err != nil {
				ob.setErrorAtIndex(errors.Wrap(err, "update prop-specific indices"), i)
				continue
			}
		}

		// skip vector update, as vector was not changed
		// https://github.com/weaviate/weaviate/issues/3948
		if status.docIDPreserved {
			continue
		}

		// nothing to index for vector-less objects
		if len(object.Vector) == 0 && len(object.Vectors) == 0 {
			continue
		}

		if hasTargetVectors {
			for targetVector, vector := range object.Vectors {
				targetVectors[targetVector] = append(targetVectors[targetVector], vectorDescriptor{
					id:     status.docID,
					vector: vector,
				})
			}
		} else {
			if len(object.Vector) > 0 {
				vectors = append(vectors, vectorDescriptor{
					id:     status.docID,
					vector: object.Vector,
				})
			}
		}
	}

	// push the collected vectors; queue-level failures affect the whole batch
	// and are attributed to index 0 rather than a single object
	if hasTargetVectors {
		for targetVector, vectors := range targetVectors {
			queue, ok := ob.shard.Queues()[targetVector]
			if !ok {
				ob.setErrorAtIndex(fmt.Errorf("queue not found for target vector %s", targetVector), 0)
			} else {
				err := queue.Push(ctx, vectors...)
				if err != nil {
					ob.setErrorAtIndex(err, 0)
				}
			}
		}
	} else {
		err := ob.shard.Queue().Push(ctx, vectors...)
		if err != nil {
			ob.setErrorAtIndex(err, 0)
		}
	}
}
   368  
   369  func (ob *objectsBatcher) shouldSkipInAdditionalStorage(i int, status objectInsertStatus) bool {
   370  	if ok := ob.hasErrorAtIndex(i); ok {
   371  		// had an error prior, ignore
   372  		return true
   373  	}
   374  
   375  	// object was not changed, skip further updates
   376  	// https://github.com/weaviate/weaviate/issues/3949
   377  	if status.skipUpsert {
   378  		return true
   379  	}
   380  
   381  	// no need to lock the mutex for a duplicate check, as we only ever write
   382  	// during init() in there - not concurrently
   383  	if _, ok := ob.duplicates[i]; ok {
   384  		// is a duplicate, ignore
   385  		return true
   386  	}
   387  
   388  	return false
   389  }
   390  
// storeSingleObjectInAdditionalStorage inserts one object's vector(s) into
// the vector index and updates property-specific indices. It never returns
// an error; failures are recorded at the object's batch index instead
// (presumably because it runs inside the shard's job workers — the jobs are
// enqueued in storeAdditionalStorageWithWorkers).
func (ob *objectsBatcher) storeSingleObjectInAdditionalStorage(ctx context.Context,
	object *storobj.Object, status objectInsertStatus, index int,
) {
	defer func() {
		// a panic in index code must not crash the worker: convert it into a
		// per-object error and log the stack for debugging
		err := recover()
		if err != nil {
			ob.setErrorAtIndex(fmt.Errorf("an unexpected error occurred: %s", err), index)
			fmt.Fprintf(os.Stderr, "panic: %s\n", err)
			debug.PrintStack()
		}
	}()

	// fail fast if the context already expired before any work was done
	if err := ctx.Err(); err != nil {
		ob.setErrorAtIndex(errors.Wrap(err, "insert to vector index"), index)
		return
	}

	if object.Vector != nil || len(object.Vectors) > 0 {
		// By this time all required deletes (e.g. because of DocID changes) have
		// already been grouped and performed in bulk. Only the insertions are
		// left. The motivation for this change is explained in
		// https://github.com/weaviate/weaviate/pull/2697.
		//
		// Before this change, two identical batches in sequence would lead to
		// massive lock contention in the hnsw index, as each individual delete
		// requires a costly RW.Lock() operation which first drains all "readers"
		// which represent the regular imports. See "deleteVsInsertLock" inside the
		// hnsw store.
		//
		// With the improved logic, we group all batches up front in a single call,
		// so this highly concurrent method no longer needs to compete for those
		// expensive locks.
		//
		// Since this behavior is exclusive to batching, we can no longer call
		// shard.updateVectorIndex which would also handle the delete as required
		// for a non-batch update. Instead a new method has been introduced that
		// ignores deletes.
		if ob.shard.hasTargetVectors() {
			if len(object.Vectors) > 0 {
				if err := ob.shard.updateVectorIndexesIgnoreDelete(object.Vectors, status); err != nil {
					ob.setErrorAtIndex(errors.Wrap(err, "insert to vector index"), index)
					return
				}
			}
		} else {
			if object.Vector != nil {
				if err := ob.shard.updateVectorIndexIgnoreDelete(object.Vector, status); err != nil {
					ob.setErrorAtIndex(errors.Wrap(err, "insert to vector index"), index)
					return
				}
			}
		}
	}

	if err := ob.shard.updatePropertySpecificIndices(object, status); err != nil {
		ob.setErrorAtIndex(errors.Wrap(err, "update prop-specific indices"), index)
		return
	}
}
   450  
   451  // hasErrorAtIndex is thread-safe as it uses the underlying mutex to lock
   452  // before reading from the errs map
   453  func (ob *objectsBatcher) hasErrorAtIndex(i int) bool {
   454  	ob.Lock()
   455  	defer ob.Unlock()
   456  	return ob.errs[i] != nil
   457  }
   458  
   459  // setErrorAtIndex is thread-safe as it uses the underlying mutex to lock
   460  // writing into the errs map
   461  func (ob *objectsBatcher) setErrorAtIndex(err error, index int) {
   462  	ob.Lock()
   463  	defer ob.Unlock()
   464  	ob.errs[index] = err
   465  }
   466  
   467  // checkContext does nothing if the context is still active. But if the context
   468  // has error'd, it marks all objects which have not previously error'd yet with
   469  // the ctx error
   470  func (ob *objectsBatcher) checkContext(ctx context.Context) bool {
   471  	if err := ctx.Err(); err != nil {
   472  		for i, err := range ob.errs {
   473  			if err == nil {
   474  				// already has an error, ignore
   475  				continue
   476  			}
   477  
   478  			ob.errs[i] = errors.Wrapf(err,
   479  				"inverted indexing complete, about to start vector indexing")
   480  		}
   481  
   482  		return false
   483  	}
   484  
   485  	return true
   486  }
   487  
   488  func (ob *objectsBatcher) flushWALs(ctx context.Context) {
   489  	if err := ob.shard.Store().WriteWALs(); err != nil {
   490  		for i := range ob.objects {
   491  			ob.setErrorAtIndex(err, i)
   492  		}
   493  	}
   494  
   495  	if ob.shard.hasTargetVectors() {
   496  		for targetVector, vectorIndex := range ob.shard.VectorIndexes() {
   497  			if err := vectorIndex.Flush(); err != nil {
   498  				for i := range ob.objects {
   499  					ob.setErrorAtIndex(fmt.Errorf("target vector %s: %w", targetVector, err), i)
   500  				}
   501  			}
   502  		}
   503  	} else {
   504  		if err := ob.shard.VectorIndex().Flush(); err != nil {
   505  			for i := range ob.objects {
   506  				ob.setErrorAtIndex(err, i)
   507  			}
   508  		}
   509  	}
   510  
   511  	if err := ob.shard.GetPropertyLengthTracker().Flush(false); err != nil {
   512  		for i := range ob.objects {
   513  			ob.setErrorAtIndex(err, i)
   514  		}
   515  	}
   516  }
   517  
   518  // returns the originalIndexIDs to be ignored
   519  func findDuplicatesInBatchObjects(in []*storobj.Object) map[int]struct{} {
   520  	count := map[strfmt.UUID]int{}
   521  	for _, obj := range in {
   522  		count[obj.ID()] = count[obj.ID()] + 1
   523  	}
   524  
   525  	ignore := map[int]struct{}{}
   526  	for i, obj := range in {
   527  		if c := count[obj.ID()]; c > 1 {
   528  			count[obj.ID()] = c - 1
   529  			ignore[i] = struct{}{}
   530  		}
   531  	}
   532  
   533  	return ignore
   534  }