github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/shard_write_put.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package db 13 14 import ( 15 "bytes" 16 "context" 17 "encoding/binary" 18 "fmt" 19 "reflect" 20 "time" 21 22 "github.com/google/uuid" 23 "github.com/pkg/errors" 24 "github.com/weaviate/weaviate/adapters/repos/db/helpers" 25 "github.com/weaviate/weaviate/adapters/repos/db/inverted" 26 "github.com/weaviate/weaviate/adapters/repos/db/lsmkv" 27 "github.com/weaviate/weaviate/adapters/repos/db/vector/common" 28 "github.com/weaviate/weaviate/entities/models" 29 "github.com/weaviate/weaviate/entities/storagestate" 30 "github.com/weaviate/weaviate/entities/storobj" 31 ) 32 33 func (s *Shard) PutObject(ctx context.Context, object *storobj.Object) error { 34 if s.isReadOnly() { 35 return storagestate.ErrStatusReadOnly 36 } 37 uuid, err := uuid.MustParse(object.ID().String()).MarshalBinary() 38 if err != nil { 39 return err 40 } 41 return s.putOne(ctx, uuid, object) 42 } 43 44 func (s *Shard) putOne(ctx context.Context, uuid []byte, object *storobj.Object) error { 45 if s.hasTargetVectors() { 46 if len(object.Vectors) > 0 { 47 for targetVector, vector := range object.Vectors { 48 if vectorIndex := s.VectorIndexForName(targetVector); vectorIndex != nil { 49 if err := vectorIndex.ValidateBeforeInsert(vector); err != nil { 50 return errors.Wrapf(err, "Validate vector index %s for target vector %s", targetVector, object.ID()) 51 } 52 } 53 } 54 } 55 } else { 56 if object.Vector != nil { 57 // validation needs to happen before any changes are done. Otherwise, insertion is aborted somewhere in-between. 58 err := s.vectorIndex.ValidateBeforeInsert(object.Vector) 59 if err != nil { 60 return errors.Wrapf(err, "Validate vector index for %s", object.ID()) 61 } 62 } 63 } 64 65 status, err := s.putObjectLSM(object, uuid) 66 if err != nil { 67 return errors.Wrap(err, "store object in LSM store") 68 } 69 70 // object was not changed, no further updates are required 71 // https://github.com/weaviate/weaviate/issues/3949 72 if status.skipUpsert { 73 return nil 74 } 75 76 if s.hasTargetVectors() { 77 for targetVector, vector := range object.Vectors { 78 if err := s.updateVectorIndexForName(vector, status, targetVector); err != nil { 79 return errors.Wrapf(err, "update vector index for target vector %s", targetVector) 80 } 81 } 82 } else { 83 if err := s.updateVectorIndex(object.Vector, status); err != nil { 84 return errors.Wrap(err, "update vector index") 85 } 86 } 87 88 if err := s.updatePropertySpecificIndices(object, status); err != nil { 89 return errors.Wrap(err, "update property-specific indices") 90 } 91 92 if err := s.store.WriteWALs(); err != nil { 93 return errors.Wrap(err, "flush all buffered WALs") 94 } 95 96 if err := s.GetPropertyLengthTracker().Flush(false); err != nil { 97 return errors.Wrap(err, "flush prop length tracker to disk") 98 } 99 100 return nil 101 } 102 103 // as the name implies this method only performs the insertions, but completely 104 // ignores any deletes. It thus assumes that the caller has already taken care 105 // of all the deletes in another way 106 func (s *Shard) updateVectorIndexIgnoreDelete(vector []float32, 107 status objectInsertStatus, 108 ) error { 109 // vector was not changed, object was not changed or changed without changing vector 110 // https://github.com/weaviate/weaviate/issues/3948 111 // https://github.com/weaviate/weaviate/issues/3949 112 if status.docIDPreserved || status.skipUpsert { 113 return nil 114 } 115 116 // vector is now optional as of 117 // https://github.com/weaviate/weaviate/issues/1800 118 if len(vector) == 0 { 119 return nil 120 } 121 122 if err := s.vectorIndex.Add(status.docID, vector); err != nil { 123 return errors.Wrapf(err, "insert doc id %d to vector index", status.docID) 124 } 125 126 return nil 127 } 128 129 // as the name implies this method only performs the insertions, but completely 130 // ignores any deletes. It thus assumes that the caller has already taken care 131 // of all the deletes in another way 132 func (s *Shard) updateVectorIndexesIgnoreDelete(vectors map[string][]float32, 133 status objectInsertStatus, 134 ) error { 135 // vector was not changed, object was not changed or changed without changing vector 136 // https://github.com/weaviate/weaviate/issues/3948 137 // https://github.com/weaviate/weaviate/issues/3949 138 if status.docIDPreserved || status.skipUpsert { 139 return nil 140 } 141 142 // vector is now optional as of 143 // https://github.com/weaviate/weaviate/issues/1800 144 if len(vectors) == 0 { 145 return nil 146 } 147 148 for targetVector, vector := range vectors { 149 if vectorIndex := s.VectorIndexForName(targetVector); vectorIndex != nil { 150 if err := vectorIndex.Add(status.docID, vector); err != nil { 151 return errors.Wrapf(err, "insert doc id %d to vector index for target vector %s", status.docID, targetVector) 152 } 153 } 154 } 155 156 return nil 157 } 158 159 func (s *Shard) updateVectorIndex(vector []float32, 160 status objectInsertStatus, 161 ) error { 162 return s.updateVectorInVectorIndex(vector, status, s.queue, s.vectorIndex) 163 } 164 165 func (s *Shard) updateVectorIndexForName(vector []float32, 166 status objectInsertStatus, targetVector string, 167 ) error { 168 queue, ok := s.queues[targetVector] 169 if !ok { 170 return fmt.Errorf("vector queue not found for target vector %s", targetVector) 171 } 172 vectorIndex := s.VectorIndexForName(targetVector) 173 if vectorIndex == nil { 174 return fmt.Errorf("vector index not found for target vector %s", targetVector) 175 } 176 return s.updateVectorInVectorIndex(vector, status, queue, vectorIndex) 177 } 178 179 func (s *Shard) updateVectorInVectorIndex(vector []float32, 180 status objectInsertStatus, queue *IndexQueue, vectorIndex VectorIndex, 181 ) error { 182 // even if no vector is provided in an update, we still need 183 // to delete the previous vector from the index, if it 184 // exists. otherwise, the associated doc id is left dangling, 185 // resulting in failed attempts to merge an object on restarts. 186 if status.docIDChanged { 187 if err := queue.Delete(status.oldDocID); err != nil { 188 return errors.Wrapf(err, "delete doc id %d from vector index", status.oldDocID) 189 } 190 } 191 192 // vector was not changed, object was updated without changing docID 193 // https://github.com/weaviate/weaviate/issues/3948 194 if status.docIDPreserved { 195 return nil 196 } 197 198 // vector is now optional as of 199 // https://github.com/weaviate/weaviate/issues/1800 200 if len(vector) == 0 { 201 return nil 202 } 203 204 if err := vectorIndex.Add(status.docID, vector); err != nil { 205 return errors.Wrapf(err, "insert doc id %d to vector index", status.docID) 206 } 207 208 if err := vectorIndex.Flush(); err != nil { 209 return errors.Wrap(err, "flush all vector index buffered WALs") 210 } 211 212 return nil 213 } 214 215 func fetchObject(bucket *lsmkv.Bucket, idBytes []byte) (*storobj.Object, error) { 216 objBytes, err := bucket.Get(idBytes) 217 if err != nil { 218 return nil, err 219 } 220 if len(objBytes) == 0 { 221 return nil, nil 222 } 223 224 obj, err := storobj.FromBinary(objBytes) 225 if err != nil { 226 return nil, err 227 } 228 229 return obj, nil 230 } 231 232 func (s *Shard) putObjectLSM(obj *storobj.Object, idBytes []byte, 233 ) (objectInsertStatus, error) { 234 before := time.Now() 235 defer s.metrics.PutObject(before) 236 237 bucket := s.store.Bucket(helpers.ObjectsBucketLSM) 238 var prevObj *storobj.Object 239 var status objectInsertStatus 240 241 // First the object bucket is checked if an object with the same uuid is alreadypresent, 242 // to determine if it is insert or an update. 243 // Afterwards the bucket is updated. To avoid races, only one goroutine can do this at once. 244 lock := &s.docIdLock[s.uuidToIdLockPoolId(idBytes)] 245 246 // wrapped in function to handle lock/unlock 247 if err := func() error { 248 lock.Lock() 249 defer lock.Unlock() 250 251 var err error 252 253 before = time.Now() 254 prevObj, err = fetchObject(bucket, idBytes) 255 if err != nil { 256 return err 257 } 258 259 status, err = s.determineInsertStatus(prevObj, obj) 260 if err != nil { 261 return err 262 } 263 s.metrics.PutObjectDetermineStatus(before) 264 265 obj.DocID = status.docID 266 if status.skipUpsert { 267 return nil 268 } 269 270 objBinary, err := obj.MarshalBinary() 271 if err != nil { 272 return errors.Wrapf(err, "marshal object %s to binary", obj.ID()) 273 } 274 275 before = time.Now() 276 if err := s.upsertObjectDataLSM(bucket, idBytes, objBinary, status.docID); err != nil { 277 return errors.Wrap(err, "upsert object data") 278 } 279 s.metrics.PutObjectUpsertObject(before) 280 281 return nil 282 }(); err != nil { 283 return objectInsertStatus{}, err 284 } else if status.skipUpsert { 285 return status, nil 286 } 287 288 before = time.Now() 289 if err := s.updateInvertedIndexLSM(obj, status, prevObj); err != nil { 290 return objectInsertStatus{}, errors.Wrap(err, "update inverted indices") 291 } 292 s.metrics.PutObjectUpdateInverted(before) 293 294 return status, nil 295 } 296 297 type objectInsertStatus struct { 298 docID uint64 299 docIDChanged bool 300 oldDocID uint64 301 // docID was not changed, although object itself did. DocID can be preserved if 302 // object's vector remain the same, allowing to omit vector index update which is time 303 // consuming operation. New object is saved and inverted indexes updated if required. 304 docIDPreserved bool 305 // object was not changed, all properties and additional properties are the same as in 306 // the one already stored. No object update, inverted indexes update and vector index 307 // update is required. 308 skipUpsert bool 309 } 310 311 // to be called with the current contents of a row, if the row is empty (i.e. 312 // didn't exist before), we will get a new docID from the central counter. 313 // Otherwise, we will reuse the previous docID and mark this as an update 314 func (s *Shard) determineInsertStatus(prevObj, nextObj *storobj.Object) (objectInsertStatus, error) { 315 var out objectInsertStatus 316 317 if prevObj == nil { 318 docID, err := s.counter.GetAndInc() 319 if err != nil { 320 return out, errors.Wrap(err, "initial doc id: get new doc id from counter") 321 } 322 out.docID = docID 323 return out, nil 324 } 325 326 out.oldDocID = prevObj.DocID 327 328 // If object was not changed (props and additional props of prev and next objects are the same) 329 // skip updates of object, inverted indexes and vector index. 330 // https://github.com/weaviate/weaviate/issues/3949 331 // 332 // If object was changed (props or additional props of prev and next objects differ) 333 // update objects and inverted indexes, skip update of vector index. 334 // https://github.com/weaviate/weaviate/issues/3948 335 // 336 // Due to geo index's (using HNSW vector index) requirement new docID for delete+insert 337 // (delete initially adds tombstone, which "overwrite" following insert of the same docID) 338 // any update of geo property needs new docID for updating geo index. 339 if preserve, skip := compareObjsForInsertStatus(prevObj, nextObj); preserve || skip { 340 out.docID = prevObj.DocID 341 out.docIDPreserved = preserve 342 out.skipUpsert = skip 343 return out, nil 344 } 345 346 docID, err := s.counter.GetAndInc() 347 if err != nil { 348 return out, errors.Wrap(err, "doc id update: get new doc id from counter") 349 } 350 out.docID = docID 351 out.docIDChanged = true 352 353 return out, nil 354 } 355 356 // determineMutableInsertStatus is a special version of determineInsertStatus 357 // where it does not alter the doc id if one already exists. Calling this 358 // method only makes sense under very special conditions, such as those 359 // outlined in mutableMergeObjectInTx 360 func (s *Shard) determineMutableInsertStatus(previous, next *storobj.Object) (objectInsertStatus, error) { 361 var out objectInsertStatus 362 363 if previous == nil { 364 docID, err := s.counter.GetAndInc() 365 if err != nil { 366 return out, errors.Wrap(err, "initial doc id: get new doc id from counter") 367 } 368 out.docID = docID 369 return out, nil 370 } 371 372 out.docID = previous.DocID 373 374 // we are planning on mutating and thus not altering the doc id 375 return out, nil 376 } 377 378 func (s *Shard) upsertObjectDataLSM(bucket *lsmkv.Bucket, id []byte, data []byte, 379 docID uint64, 380 ) error { 381 keyBuf := bytes.NewBuffer(nil) 382 binary.Write(keyBuf, binary.LittleEndian, &docID) 383 docIDBytes := keyBuf.Bytes() 384 385 return bucket.Put(id, data, lsmkv.WithSecondaryKey(0, docIDBytes)) 386 } 387 388 func (s *Shard) updateInvertedIndexLSM(object *storobj.Object, 389 status objectInsertStatus, prevObject *storobj.Object, 390 ) error { 391 props, nilprops, err := s.AnalyzeObject(object) 392 if err != nil { 393 return errors.Wrap(err, "analyze next object") 394 } 395 396 var prevProps []inverted.Property 397 var prevNilprops []inverted.NilProperty 398 399 if prevObject != nil { 400 prevProps, prevNilprops, err = s.AnalyzeObject(prevObject) 401 if err != nil { 402 return fmt.Errorf("analyze previous object: %w", err) 403 } 404 } 405 406 // if object updated (with or without docID changed) 407 if status.docIDChanged || status.docIDPreserved { 408 if err := s.subtractPropLengths(prevProps); err != nil { 409 s.index.logger.WithField("action", "subtractPropLengths").WithError(err).Error("could not subtract prop lengths") 410 } 411 } 412 413 if err := s.SetPropertyLengths(props); err != nil { 414 return errors.Wrap(err, "store field length values for props") 415 } 416 417 var propsToAdd []inverted.Property 418 var propsToDel []inverted.Property 419 var nilpropsToAdd []inverted.NilProperty 420 var nilpropsToDel []inverted.NilProperty 421 422 // determine only changed properties to avoid unnecessary updates of inverted indexes 423 if status.docIDPreserved { 424 delta := inverted.Delta(prevProps, props) 425 propsToAdd = delta.ToAdd 426 propsToDel = delta.ToDelete 427 deltaNil := inverted.DeltaNil(prevNilprops, nilprops) 428 nilpropsToAdd = deltaNil.ToAdd 429 nilpropsToDel = deltaNil.ToDelete 430 } else { 431 propsToAdd = inverted.DedupItems(props) 432 propsToDel = inverted.DedupItems(prevProps) 433 nilpropsToAdd = nilprops 434 nilpropsToDel = prevNilprops 435 } 436 437 if prevObject != nil { 438 // TODO: metrics 439 if err := s.deleteFromInvertedIndicesLSM(propsToDel, nilpropsToDel, status.oldDocID); err != nil { 440 return fmt.Errorf("delete inverted indices props: %w", err) 441 } 442 if s.index.Config.TrackVectorDimensions { 443 if s.hasTargetVectors() { 444 for vecName, vec := range prevObject.Vectors { 445 if err := s.removeDimensionsForVecLSM(len(vec), status.oldDocID, vecName); err != nil { 446 return fmt.Errorf("track dimensions of '%s' (delete): %w", vecName, err) 447 } 448 } 449 } else { 450 if err := s.removeDimensionsLSM(len(prevObject.Vector), status.oldDocID); err != nil { 451 return fmt.Errorf("track dimensions (delete): %w", err) 452 } 453 } 454 } 455 } 456 457 before := time.Now() 458 if err := s.extendInvertedIndicesLSM(propsToAdd, nilpropsToAdd, status.docID); err != nil { 459 return fmt.Errorf("put inverted indices props: %w", err) 460 } 461 s.metrics.InvertedExtend(before, len(propsToAdd)) 462 463 if s.index.Config.TrackVectorDimensions { 464 if s.hasTargetVectors() { 465 for vecName, vec := range object.Vectors { 466 if err := s.extendDimensionTrackerForVecLSM(len(vec), status.docID, vecName); err != nil { 467 return fmt.Errorf("track dimensions of '%s': %w", vecName, err) 468 } 469 } 470 } else { 471 if err := s.extendDimensionTrackerLSM(len(object.Vector), status.docID); err != nil { 472 return fmt.Errorf("track dimensions: %w", err) 473 } 474 } 475 } 476 477 return nil 478 } 479 480 func compareObjsForInsertStatus(prevObj, nextObj *storobj.Object) (preserve, skip bool) { 481 prevProps, ok := prevObj.Object.Properties.(map[string]interface{}) 482 if !ok { 483 return false, false 484 } 485 nextProps, ok := nextObj.Object.Properties.(map[string]interface{}) 486 if !ok { 487 return false, false 488 } 489 if !geoPropsEqual(prevProps, nextProps) { 490 return false, false 491 } 492 if !common.VectorsEqual(prevObj.Vector, nextObj.Vector) { 493 return false, false 494 } 495 if !targetVectorsEqual(prevObj.Vectors, nextObj.Vectors) { 496 return false, false 497 } 498 if !addPropsEqual(prevObj.Object.Additional, nextObj.Object.Additional) { 499 return true, false 500 } 501 if !propsEqual(prevProps, nextProps) { 502 return true, false 503 } 504 return false, true 505 } 506 507 func geoPropsEqual(prevProps, nextProps map[string]interface{}) bool { 508 geoPropsCompared := map[string]struct{}{} 509 510 for name, prevVal := range prevProps { 511 switch prevGeoVal := prevVal.(type) { 512 case *models.GeoCoordinates: 513 nextVal, ok := nextProps[name] 514 if !ok { 515 // matching prop does not exist in next 516 return false 517 } 518 519 switch nextGeoVal := nextVal.(type) { 520 case *models.GeoCoordinates: 521 if !reflect.DeepEqual(prevGeoVal, nextGeoVal) { 522 // matching geo props in prev and next differ 523 return false 524 } 525 default: 526 // matching prop in next is not geo 527 return false 528 } 529 geoPropsCompared[name] = struct{}{} 530 } 531 } 532 533 for name, nextVal := range nextProps { 534 switch nextVal.(type) { 535 case *models.GeoCoordinates: 536 if _, ok := geoPropsCompared[name]; !ok { 537 // matching geo prop does not exist in prev 538 return false 539 } 540 } 541 } 542 543 return true 544 } 545 546 func timeToString(t time.Time) string { 547 if b, err := t.MarshalText(); err == nil { 548 return string(b) 549 } 550 return "" 551 } 552 553 func uuidToString(u uuid.UUID) string { 554 if b, err := u.MarshalText(); err == nil { 555 return string(b) 556 } 557 return "" 558 } 559 560 func targetVectorsEqual(prevTargetVectors, nextTargetVectors map[string][]float32) bool { 561 if len(prevTargetVectors) == 0 && len(nextTargetVectors) == 0 { 562 return true 563 } 564 565 visited := map[string]struct{}{} 566 for vecName, vec := range prevTargetVectors { 567 if !common.VectorsEqual(vec, nextTargetVectors[vecName]) { 568 return false 569 } 570 visited[vecName] = struct{}{} 571 } 572 for vecName, vec := range nextTargetVectors { 573 if _, ok := visited[vecName]; !ok { 574 if !common.VectorsEqual(vec, prevTargetVectors[vecName]) { 575 return false 576 } 577 } 578 } 579 580 return true 581 } 582 583 func addPropsEqual(prevAddProps, nextAddProps models.AdditionalProperties) bool { 584 return reflect.DeepEqual(prevAddProps, nextAddProps) 585 } 586 587 func propsEqual(prevProps, nextProps map[string]interface{}) bool { 588 if len(prevProps) != len(nextProps) { 589 return false 590 } 591 592 for name := range nextProps { 593 if _, ok := prevProps[name]; !ok { 594 return false 595 } 596 597 switch nextVal := nextProps[name].(type) { 598 case time.Time: 599 if timeToString(nextVal) != prevProps[name] { 600 return false 601 } 602 603 case []time.Time: 604 prevVal, ok := prevProps[name].([]string) 605 if !ok { 606 return false 607 } 608 if len(nextVal) != len(prevVal) { 609 return false 610 } 611 for i := range nextVal { 612 if timeToString(nextVal[i]) != prevVal[i] { 613 return false 614 } 615 } 616 617 case uuid.UUID: 618 if uuidToString(nextVal) != prevProps[name] { 619 return false 620 } 621 622 case []uuid.UUID: 623 prevVal, ok := prevProps[name].([]string) 624 if !ok { 625 return false 626 } 627 if len(nextVal) != len(prevVal) { 628 return false 629 } 630 for i := range nextVal { 631 if uuidToString(nextVal[i]) != prevVal[i] { 632 return false 633 } 634 } 635 636 case map[string]interface{}: // data type "object" 637 prevVal, ok := prevProps[name].(map[string]interface{}) 638 if !ok { 639 return false 640 } 641 if !propsEqual(prevVal, nextVal) { 642 return false 643 } 644 645 case []interface{}: // data type "objects" 646 prevVal, ok := prevProps[name].([]interface{}) 647 if !ok { 648 return false 649 } 650 if len(nextVal) != len(prevVal) { 651 return false 652 } 653 for i := range nextVal { 654 nextValI, ok := nextVal[i].(map[string]interface{}) 655 if !ok { 656 return false 657 } 658 prevValI, ok := prevVal[i].(map[string]interface{}) 659 if !ok { 660 return false 661 } 662 if !propsEqual(prevValI, nextValI) { 663 return false 664 } 665 } 666 667 default: 668 if !reflect.DeepEqual(nextProps[name], prevProps[name]) { 669 return false 670 } 671 } 672 } 673 674 return true 675 }