github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/shard_write_batch_objects.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package db 13 14 import ( 15 "context" 16 "fmt" 17 "os" 18 "runtime/debug" 19 "sync" 20 "time" 21 22 enterrors "github.com/weaviate/weaviate/entities/errors" 23 "github.com/weaviate/weaviate/usecases/configbase" 24 25 "github.com/go-openapi/strfmt" 26 "github.com/google/uuid" 27 "github.com/pkg/errors" 28 "github.com/weaviate/weaviate/entities/storagestate" 29 "github.com/weaviate/weaviate/entities/storobj" 30 ) 31 32 // return value map[int]error gives the error for the index as it received it 33 func (s *Shard) PutObjectBatch(ctx context.Context, 34 objects []*storobj.Object, 35 ) []error { 36 if s.isReadOnly() { 37 return []error{storagestate.ErrStatusReadOnly} 38 } 39 40 return s.putBatch(ctx, objects) 41 } 42 43 // asyncEnabled is a quick and dirty way to create a feature flag for async 44 // indexing. 45 func asyncEnabled() bool { 46 return configbase.Enabled(os.Getenv("ASYNC_INDEXING")) 47 } 48 49 // Workers are started with the first batch and keep working as there are objects to add from any batch. Each batch 50 // adds its jobs (that contain the respective object) to a single queue that is then processed by the workers. 51 // When the last batch finishes, all workers receive a shutdown signal and exit 52 func (s *Shard) putBatch(ctx context.Context, 53 objects []*storobj.Object, 54 ) []error { 55 if asyncEnabled() { 56 return s.putBatchAsync(ctx, objects) 57 } 58 // Workers are started with the first batch and keep working as there are objects to add from any batch. Each batch 59 // adds its jobs (that contain the respective object) to a single queue that is then processed by the workers. 60 // When the last batch finishes, all workers receive a shutdown signal and exit 61 batcher := newObjectsBatcher(s) 62 err := batcher.Objects(ctx, objects) 63 64 // block until all objects of batch have been added 65 batcher.wg.Wait() 66 s.metrics.VectorIndex(batcher.batchStartTime) 67 68 return err 69 } 70 71 func (s *Shard) putBatchAsync(ctx context.Context, objects []*storobj.Object) []error { 72 beforeBatch := time.Now() 73 defer s.metrics.BatchObject(beforeBatch, len(objects)) 74 75 batcher := newObjectsBatcher(s) 76 77 batcher.init(objects) 78 batcher.storeInObjectStore(ctx) 79 batcher.markDeletedInVectorStorage(ctx) 80 batcher.storeAdditionalStorageWithAsyncQueue(ctx) 81 batcher.flushWALs(ctx) 82 83 return batcher.errs 84 } 85 86 // objectsBatcher is a helper type wrapping around an underlying shard that can 87 // execute objects batch operations on a shard (as opposed to references batch 88 // operations) 89 type objectsBatcher struct { 90 sync.Mutex 91 shard ShardLike 92 statuses map[strfmt.UUID]objectInsertStatus 93 errs []error 94 duplicates map[int]struct{} 95 objects []*storobj.Object 96 wg sync.WaitGroup 97 batchStartTime time.Time 98 } 99 100 func newObjectsBatcher(s ShardLike) *objectsBatcher { 101 return &objectsBatcher{shard: s} 102 } 103 104 // Objects imports the specified objects in parallel in a batch-fashion 105 func (ob *objectsBatcher) Objects(ctx context.Context, 106 objects []*storobj.Object, 107 ) []error { 108 beforeBatch := time.Now() 109 defer ob.shard.Metrics().BatchObject(beforeBatch, len(objects)) 110 111 ob.init(objects) 112 ob.storeInObjectStore(ctx) 113 ob.markDeletedInVectorStorage(ctx) 114 ob.storeAdditionalStorageWithWorkers(ctx) 115 ob.flushWALs(ctx) 116 return ob.errs 117 } 118 119 func (ob *objectsBatcher) init(objects []*storobj.Object) { 120 ob.objects = objects 121 ob.statuses = map[strfmt.UUID]objectInsertStatus{} 122 ob.errs = make([]error, len(objects)) 123 ob.duplicates = findDuplicatesInBatchObjects(objects) 124 } 125 126 // storeInObjectStore performs all storage operations on the underlying 127 // key/value store, this is they object-by-id store, the docID-lookup tables, 128 // as well as all inverted indices. 129 func (ob *objectsBatcher) storeInObjectStore(ctx context.Context) { 130 beforeObjectStore := time.Now() 131 132 errs := ob.storeSingleBatchInLSM(ctx, ob.objects) 133 for i, err := range errs { 134 if err != nil { 135 ob.setErrorAtIndex(err, i) 136 } 137 } 138 139 ob.shard.Metrics().ObjectStore(beforeObjectStore) 140 } 141 142 func (ob *objectsBatcher) storeSingleBatchInLSM(ctx context.Context, 143 batch []*storobj.Object, 144 ) []error { 145 errs := make([]error, len(batch)) 146 errLock := &sync.Mutex{} 147 148 // if the context is expired fail all 149 if err := ctx.Err(); err != nil { 150 for i := range errs { 151 errs[i] = errors.Wrap(err, "begin batch") 152 } 153 return errs 154 } 155 156 wg := &sync.WaitGroup{} 157 concurrencyLimit := make(chan struct{}, _NUMCPU) 158 159 for j, object := range batch { 160 wg.Add(1) 161 object := object 162 index := j 163 f := func() { 164 defer wg.Done() 165 166 // Acquire a semaphore to control the concurrency. Otherwise we would 167 // spawn one routine per object here. With very large batch sizes (e.g. 168 // 1000 or 10000+), this isn't helpuful and just leads to more lock 169 // contention down the line – especially when there's lots of text to be 170 // indexed in the inverted index. 171 concurrencyLimit <- struct{}{} 172 defer func() { 173 // Release the semaphore when the goroutine is done. 174 <-concurrencyLimit 175 }() 176 177 if err := ob.storeObjectOfBatchInLSM(ctx, index, object); err != nil { 178 errLock.Lock() 179 errs[index] = err 180 errLock.Unlock() 181 } 182 } 183 enterrors.GoWrapper(f, ob.shard.Index().logger) 184 185 } 186 wg.Wait() 187 188 return errs 189 } 190 191 func (ob *objectsBatcher) storeObjectOfBatchInLSM(ctx context.Context, 192 objectIndex int, object *storobj.Object, 193 ) error { 194 if _, ok := ob.duplicates[objectIndex]; ok { 195 return nil 196 } 197 uuidParsed, err := uuid.Parse(object.ID().String()) 198 if err != nil { 199 return errors.Wrap(err, "invalid id") 200 } 201 202 idBytes, err := uuidParsed.MarshalBinary() 203 if err != nil { 204 return err 205 } 206 207 status, err := ob.shard.putObjectLSM(object, idBytes) 208 if err != nil { 209 return err 210 } 211 212 ob.setStatusForID(status, object.ID()) 213 214 if err := ctx.Err(); err != nil { 215 return errors.Wrapf(err, "end store object %d of batch", objectIndex) 216 } 217 return nil 218 } 219 220 // setStatusForID is thread-safe as it uses the underlying mutex to lock the 221 // statuses map when writing into it 222 func (ob *objectsBatcher) setStatusForID(status objectInsertStatus, id strfmt.UUID) { 223 ob.Lock() 224 defer ob.Unlock() 225 ob.statuses[id] = status 226 } 227 228 func (ob *objectsBatcher) markDeletedInVectorStorage(ctx context.Context) { 229 var docIDsToDelete []uint64 230 var positions []int 231 for pos, object := range ob.objects { 232 status := ob.statuses[object.ID()] 233 if status.docIDChanged { 234 docIDsToDelete = append(docIDsToDelete, status.oldDocID) 235 positions = append(positions, pos) 236 } 237 } 238 239 if len(docIDsToDelete) == 0 { 240 return 241 } 242 243 if ob.shard.hasTargetVectors() { 244 for targetVector, queue := range ob.shard.Queues() { 245 if err := queue.Delete(docIDsToDelete...); err != nil { 246 for _, pos := range positions { 247 ob.setErrorAtIndex(fmt.Errorf("target vector %s: %w", targetVector, err), pos) 248 } 249 } 250 } 251 } else { 252 if err := ob.shard.Queue().Delete(docIDsToDelete...); err != nil { 253 for _, pos := range positions { 254 ob.setErrorAtIndex(err, pos) 255 } 256 } 257 } 258 } 259 260 // storeAdditionalStorageWithWorkers stores the object in all non-key-value 261 // stores, such as the main vector index as well as the property-specific 262 // indices, such as the geo-index. 263 func (ob *objectsBatcher) storeAdditionalStorageWithWorkers(ctx context.Context) { 264 if ok := ob.checkContext(ctx); !ok { 265 // if the context is no longer OK, there's no point in continuing - abort 266 // early 267 return 268 } 269 270 ob.batchStartTime = time.Now() 271 272 for i, object := range ob.objects { 273 status := ob.statuses[object.ID()] 274 if ob.shouldSkipInAdditionalStorage(i, status) { 275 continue 276 } 277 278 ob.wg.Add(1) 279 ob.shard.addJobToQueue(job{ 280 object: object, 281 status: status, 282 index: i, 283 ctx: ctx, 284 batcher: ob, 285 }) 286 } 287 } 288 289 func (ob *objectsBatcher) storeAdditionalStorageWithAsyncQueue(ctx context.Context) { 290 if ok := ob.checkContext(ctx); !ok { 291 // if the context is no longer OK, there's no point in continuing - abort 292 // early 293 return 294 } 295 296 ob.batchStartTime = time.Now() 297 shouldGeoIndex := ob.shard.hasGeoIndex() 298 299 var vectors []vectorDescriptor 300 var targetVectors map[string][]vectorDescriptor 301 hasTargetVectors := ob.shard.hasTargetVectors() 302 if hasTargetVectors { 303 targetVectors = make(map[string][]vectorDescriptor) 304 } else { 305 vectors = make([]vectorDescriptor, 0, len(ob.objects)) 306 } 307 308 for i, object := range ob.objects { 309 status := ob.statuses[object.ID()] 310 311 if ob.shouldSkipInAdditionalStorage(i, status) { 312 continue 313 } 314 315 if shouldGeoIndex { 316 if err := ob.shard.updatePropertySpecificIndices(object, status); err != nil { 317 ob.setErrorAtIndex(errors.Wrap(err, "update prop-specific indices"), i) 318 continue 319 } 320 } 321 322 // skip vector update, as vector was not changed 323 // https://github.com/weaviate/weaviate/issues/3948 324 if status.docIDPreserved { 325 continue 326 } 327 328 if len(object.Vector) == 0 && len(object.Vectors) == 0 { 329 continue 330 } 331 332 if hasTargetVectors { 333 for targetVector, vector := range object.Vectors { 334 targetVectors[targetVector] = append(targetVectors[targetVector], vectorDescriptor{ 335 id: status.docID, 336 vector: vector, 337 }) 338 } 339 } else { 340 if len(object.Vector) > 0 { 341 vectors = append(vectors, vectorDescriptor{ 342 id: status.docID, 343 vector: object.Vector, 344 }) 345 } 346 } 347 } 348 349 if hasTargetVectors { 350 for targetVector, vectors := range targetVectors { 351 queue, ok := ob.shard.Queues()[targetVector] 352 if !ok { 353 ob.setErrorAtIndex(fmt.Errorf("queue not found for target vector %s", targetVector), 0) 354 } else { 355 err := queue.Push(ctx, vectors...) 356 if err != nil { 357 ob.setErrorAtIndex(err, 0) 358 } 359 } 360 } 361 } else { 362 err := ob.shard.Queue().Push(ctx, vectors...) 363 if err != nil { 364 ob.setErrorAtIndex(err, 0) 365 } 366 } 367 } 368 369 func (ob *objectsBatcher) shouldSkipInAdditionalStorage(i int, status objectInsertStatus) bool { 370 if ok := ob.hasErrorAtIndex(i); ok { 371 // had an error prior, ignore 372 return true 373 } 374 375 // object was not changed, skip further updates 376 // https://github.com/weaviate/weaviate/issues/3949 377 if status.skipUpsert { 378 return true 379 } 380 381 // no need to lock the mutex for a duplicate check, as we only ever write 382 // during init() in there - not concurrently 383 if _, ok := ob.duplicates[i]; ok { 384 // is a duplicate, ignore 385 return true 386 } 387 388 return false 389 } 390 391 func (ob *objectsBatcher) storeSingleObjectInAdditionalStorage(ctx context.Context, 392 object *storobj.Object, status objectInsertStatus, index int, 393 ) { 394 defer func() { 395 err := recover() 396 if err != nil { 397 ob.setErrorAtIndex(fmt.Errorf("an unexpected error occurred: %s", err), index) 398 fmt.Fprintf(os.Stderr, "panic: %s\n", err) 399 debug.PrintStack() 400 } 401 }() 402 403 if err := ctx.Err(); err != nil { 404 ob.setErrorAtIndex(errors.Wrap(err, "insert to vector index"), index) 405 return 406 } 407 408 if object.Vector != nil || len(object.Vectors) > 0 { 409 // By this time all required deletes (e.g. because of DocID changes) have 410 // already been grouped and performed in bulk. Only the insertions are 411 // left. The motivation for this change is explained in 412 // https://github.com/weaviate/weaviate/pull/2697. 413 // 414 // Before this change, two identical batches in sequence would lead to 415 // massive lock contention in the hnsw index, as each individual delete 416 // requires a costly RW.Lock() operation which first drains all "readers" 417 // which represent the regular imports. See "deleteVsInsertLock" inside the 418 // hnsw store. 419 // 420 // With the improved logic, we group all batches up front in a single call, 421 // so this highly concurrent method no longer needs to compete for those 422 // expensive locks. 423 // 424 // Since this behavior is exclusive to batching, we can no longer call 425 // shard.updateVectorIndex which would also handle the delete as required 426 // for a non-batch update. Instead a new method has been introduced that 427 // ignores deletes. 428 if ob.shard.hasTargetVectors() { 429 if len(object.Vectors) > 0 { 430 if err := ob.shard.updateVectorIndexesIgnoreDelete(object.Vectors, status); err != nil { 431 ob.setErrorAtIndex(errors.Wrap(err, "insert to vector index"), index) 432 return 433 } 434 } 435 } else { 436 if object.Vector != nil { 437 if err := ob.shard.updateVectorIndexIgnoreDelete(object.Vector, status); err != nil { 438 ob.setErrorAtIndex(errors.Wrap(err, "insert to vector index"), index) 439 return 440 } 441 } 442 } 443 } 444 445 if err := ob.shard.updatePropertySpecificIndices(object, status); err != nil { 446 ob.setErrorAtIndex(errors.Wrap(err, "update prop-specific indices"), index) 447 return 448 } 449 } 450 451 // hasErrorAtIndex is thread-safe as it uses the underlying mutex to lock 452 // before reading from the errs map 453 func (ob *objectsBatcher) hasErrorAtIndex(i int) bool { 454 ob.Lock() 455 defer ob.Unlock() 456 return ob.errs[i] != nil 457 } 458 459 // setErrorAtIndex is thread-safe as it uses the underlying mutex to lock 460 // writing into the errs map 461 func (ob *objectsBatcher) setErrorAtIndex(err error, index int) { 462 ob.Lock() 463 defer ob.Unlock() 464 ob.errs[index] = err 465 } 466 467 // checkContext does nothing if the context is still active. But if the context 468 // has error'd, it marks all objects which have not previously error'd yet with 469 // the ctx error 470 func (ob *objectsBatcher) checkContext(ctx context.Context) bool { 471 if err := ctx.Err(); err != nil { 472 for i, err := range ob.errs { 473 if err == nil { 474 // already has an error, ignore 475 continue 476 } 477 478 ob.errs[i] = errors.Wrapf(err, 479 "inverted indexing complete, about to start vector indexing") 480 } 481 482 return false 483 } 484 485 return true 486 } 487 488 func (ob *objectsBatcher) flushWALs(ctx context.Context) { 489 if err := ob.shard.Store().WriteWALs(); err != nil { 490 for i := range ob.objects { 491 ob.setErrorAtIndex(err, i) 492 } 493 } 494 495 if ob.shard.hasTargetVectors() { 496 for targetVector, vectorIndex := range ob.shard.VectorIndexes() { 497 if err := vectorIndex.Flush(); err != nil { 498 for i := range ob.objects { 499 ob.setErrorAtIndex(fmt.Errorf("target vector %s: %w", targetVector, err), i) 500 } 501 } 502 } 503 } else { 504 if err := ob.shard.VectorIndex().Flush(); err != nil { 505 for i := range ob.objects { 506 ob.setErrorAtIndex(err, i) 507 } 508 } 509 } 510 511 if err := ob.shard.GetPropertyLengthTracker().Flush(false); err != nil { 512 for i := range ob.objects { 513 ob.setErrorAtIndex(err, i) 514 } 515 } 516 } 517 518 // returns the originalIndexIDs to be ignored 519 func findDuplicatesInBatchObjects(in []*storobj.Object) map[int]struct{} { 520 count := map[strfmt.UUID]int{} 521 for _, obj := range in { 522 count[obj.ID()] = count[obj.ID()] + 1 523 } 524 525 ignore := map[int]struct{}{} 526 for i, obj := range in { 527 if c := count[obj.ID()]; c > 1 { 528 count[obj.ID()] = c - 1 529 ignore[i] = struct{}{} 530 } 531 } 532 533 return ignore 534 }