github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/batch.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package db
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  
    18  	"github.com/go-openapi/strfmt"
    19  	"github.com/pkg/errors"
    20  	"github.com/weaviate/weaviate/entities/additional"
    21  	"github.com/weaviate/weaviate/entities/schema"
    22  	"github.com/weaviate/weaviate/entities/storobj"
    23  	"github.com/weaviate/weaviate/usecases/objects"
    24  )
    25  
// batchQueue groups the storage objects of a single class together with the
// position each object occupied in the incoming batch request, so per-object
// errors can later be written back to the correct slot of the request slice.
type batchQueue struct {
	objects       []*storobj.Object
	originalIndex []int // originalIndex[i] is the request-slice index of objects[i]
}
    30  
    31  func (db *DB) BatchPutObjects(ctx context.Context, objs objects.BatchObjects,
    32  	repl *additional.ReplicationProperties,
    33  ) (objects.BatchObjects, error) {
    34  	objectByClass := make(map[string]batchQueue)
    35  	indexByClass := make(map[string]*Index)
    36  
    37  	if err := db.memMonitor.CheckAlloc(estimateBatchMemory(objs)); err != nil {
    38  		return nil, fmt.Errorf("cannot process batch: %w", err)
    39  	}
    40  
    41  	for _, item := range objs {
    42  		if item.Err != nil {
    43  			// item has a validation error or another reason to ignore
    44  			continue
    45  		}
    46  		queue := objectByClass[item.Object.Class]
    47  		queue.objects = append(queue.objects, storobj.FromObject(item.Object, item.Object.Vector, item.Object.Vectors))
    48  		queue.originalIndex = append(queue.originalIndex, item.OriginalIndex)
    49  		objectByClass[item.Object.Class] = queue
    50  	}
    51  
    52  	// wrapped by func to acquire and safely release indexLock only for duration of loop
    53  	func() {
    54  		db.indexLock.RLock()
    55  		defer db.indexLock.RUnlock()
    56  
    57  		for class, queue := range objectByClass {
    58  			index, ok := db.indices[indexID(schema.ClassName(class))]
    59  			if !ok {
    60  				msg := fmt.Sprintf("could not find index for class %v. It might have been deleted in the meantime", class)
    61  				db.logger.Warn(msg)
    62  				for _, origIdx := range queue.originalIndex {
    63  					if origIdx >= len(objs) {
    64  						db.logger.Errorf(
    65  							"batch add queue index out of bounds. len(objs) == %d, queue.originalIndex == %d",
    66  							len(objs), origIdx)
    67  						break
    68  					}
    69  					objs[origIdx].Err = fmt.Errorf(msg)
    70  				}
    71  				continue
    72  			}
    73  			index.dropIndex.RLock()
    74  			indexByClass[class] = index
    75  		}
    76  	}()
    77  
    78  	// safely release remaining locks (in case of panic)
    79  	defer func() {
    80  		for _, index := range indexByClass {
    81  			if index != nil {
    82  				index.dropIndex.RUnlock()
    83  			}
    84  		}
    85  	}()
    86  
    87  	for class, index := range indexByClass {
    88  		queue := objectByClass[class]
    89  		errs := index.putObjectBatch(ctx, queue.objects, repl)
    90  		// remove index from map to skip releasing its lock in defer
    91  		indexByClass[class] = nil
    92  		index.dropIndex.RUnlock()
    93  		for i, err := range errs {
    94  			if err != nil {
    95  				objs[queue.originalIndex[i]].Err = err
    96  			}
    97  		}
    98  	}
    99  
   100  	return objs, nil
   101  }
   102  
// AddBatchReferences adds a batch of cross-references, dispatching each item
// to the index of its source class. Items that already carry a validation
// error are skipped. Per-item failures are reported by setting the Err field
// on the corresponding entry of references; the returned slice is references
// itself.
func (db *DB) AddBatchReferences(ctx context.Context, references objects.BatchReferences,
	repl *additional.ReplicationProperties,
) (objects.BatchReferences, error) {
	refByClass := make(map[schema.ClassName]objects.BatchReferences)
	indexByClass := make(map[schema.ClassName]*Index)

	// group valid items by the class of their source object; OriginalIndex on
	// each item maps results back to the request slice
	for _, item := range references {
		if item.Err != nil {
			// item has a validation error or another reason to ignore
			continue
		}
		refByClass[item.From.Class] = append(refByClass[item.From.Class], item)
	}

	// wrapped by func to acquire and safely release indexLock only for duration of loop
	func() {
		db.indexLock.RLock()
		defer db.indexLock.RUnlock()

		for class, queue := range refByClass {
			index, ok := db.indices[indexID(class)]
			if !ok {
				// index vanished (e.g. class deleted concurrently): flag every
				// item destined for it instead of failing the whole batch
				for _, item := range queue {
					references[item.OriginalIndex].Err = fmt.Errorf("could not find index for class %v. It might have been deleted in the meantime", class)
				}
				continue
			}
			// hold the per-index read-lock until its sub-batch is processed,
			// so the index cannot be dropped out from under us
			index.dropIndex.RLock()
			indexByClass[class] = index
		}
	}()

	// safely release remaining locks (in case of panic)
	defer func() {
		for _, index := range indexByClass {
			if index != nil {
				index.dropIndex.RUnlock()
			}
		}
	}()

	for class, index := range indexByClass {
		queue := refByClass[class]
		errs := index.AddReferencesBatch(ctx, queue, repl)
		// remove index from map to skip releasing its lock in defer
		indexByClass[class] = nil
		index.dropIndex.RUnlock()
		for i, err := range errs {
			if err != nil {
				references[queue[i].OriginalIndex].Err = err
			}
		}
	}

	return references, nil
}
   159  
   160  func (db *DB) BatchDeleteObjects(ctx context.Context, params objects.BatchDeleteParams,
   161  	repl *additional.ReplicationProperties, tenant string,
   162  ) (objects.BatchDeleteResult, error) {
   163  	// get index for a given class
   164  	className := params.ClassName
   165  	idx := db.GetIndex(className)
   166  	if idx == nil {
   167  		return objects.BatchDeleteResult{}, errors.Errorf("cannot find index for class %v", className)
   168  	}
   169  
   170  	// find all DocIDs in all shards that match the filter
   171  	shardDocIDs, err := idx.findUUIDs(ctx, params.Filters, tenant)
   172  	if err != nil {
   173  		return objects.BatchDeleteResult{}, errors.Wrapf(err, "cannot find objects")
   174  	}
   175  	// prepare to be deleted list of DocIDs from all shards
   176  	toDelete := map[string][]strfmt.UUID{}
   177  	limit := db.config.QueryMaximumResults
   178  
   179  	matches := int64(0)
   180  	for shardName, docIDs := range shardDocIDs {
   181  		docIDsLength := int64(len(docIDs))
   182  		if matches <= limit {
   183  			if matches+docIDsLength <= limit {
   184  				toDelete[shardName] = docIDs
   185  			} else {
   186  				toDelete[shardName] = docIDs[:limit-matches]
   187  			}
   188  		}
   189  		matches += docIDsLength
   190  	}
   191  	// delete the DocIDs in given shards
   192  	deletedObjects, err := idx.batchDeleteObjects(ctx, toDelete, params.DryRun, repl)
   193  	if err != nil {
   194  		return objects.BatchDeleteResult{}, errors.Wrapf(err, "cannot delete objects")
   195  	}
   196  
   197  	result := objects.BatchDeleteResult{
   198  		Matches: matches,
   199  		Limit:   db.config.QueryMaximumResults,
   200  		DryRun:  params.DryRun,
   201  		Objects: deletedObjects,
   202  	}
   203  	return result, nil
   204  }
   205  
   206  func estimateBatchMemory(objs objects.BatchObjects) int64 {
   207  	var sum int64
   208  	for _, item := range objs {
   209  		// Note: This is very much oversimplified. It assumes that we always need
   210  		// the footprint of the full vector and it assumes a fixed overhead of 30B
   211  		// per vector. In reality this depends on the HNSW settings - and possibly
   212  		// in the future we might have completely different index types.
   213  		//
   214  		// However, in the meantime this should be a fairly reasonable estimate, as
   215  		// it's not meant to fail exactly on the last available byte, but rather
   216  		// prevent OOM crashes. Given the fuzziness and async style of the
   217  		// memtrackinga somewhat decent estimate should be good enough.
   218  		sum += int64(len(item.Object.Vector)*4 + 30)
   219  	}
   220  
   221  	return sum
   222  }