github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/shard_write_batch_references.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package db 13 14 import ( 15 "context" 16 "fmt" 17 "sync" 18 "time" 19 20 "github.com/google/uuid" 21 "github.com/pkg/errors" 22 "github.com/weaviate/weaviate/adapters/repos/db/helpers" 23 "github.com/weaviate/weaviate/adapters/repos/db/inverted" 24 "github.com/weaviate/weaviate/entities/models" 25 "github.com/weaviate/weaviate/entities/schema" 26 "github.com/weaviate/weaviate/entities/storobj" 27 "github.com/weaviate/weaviate/usecases/objects" 28 ) 29 30 // return value map[int]error gives the error for the index as it received it 31 func (s *Shard) AddReferencesBatch(ctx context.Context, refs objects.BatchReferences) []error { 32 if s.isReadOnly() { 33 return []error{errors.Errorf("shard is read-only")} 34 } 35 36 return newReferencesBatcher(s).References(ctx, refs) 37 } 38 39 // referencesBatcher is a helper type wrapping around an underlying shard that can 40 // execute references batch operations on a shard (as opposed to object batch 41 // operations) 42 type referencesBatcher struct { 43 sync.Mutex 44 shard ShardLike 45 errs []error 46 refs objects.BatchReferences 47 } 48 49 func newReferencesBatcher(s ShardLike) *referencesBatcher { 50 return &referencesBatcher{ 51 shard: s, 52 } 53 } 54 55 func (b *referencesBatcher) References(ctx context.Context, 56 refs objects.BatchReferences, 57 ) []error { 58 b.init(refs) 59 b.storeInObjectStore(ctx) 60 b.flushWALs(ctx) 61 return b.errs 62 } 63 64 func (b *referencesBatcher) init(refs objects.BatchReferences) { 65 b.refs = refs 66 b.errs = make([]error, len(refs)) 67 } 68 69 func (b *referencesBatcher) storeInObjectStore( 70 ctx context.Context, 71 ) { 72 errs := b.storeSingleBatchInLSM(ctx, b.refs) 73 for i, err := range errs { 74 if err != nil { 75 b.setErrorAtIndex(err, i) 76 } 77 } 78 79 // adding references can not alter the vector position, so no need to alter 80 // the vector index 81 } 82 83 func (b *referencesBatcher) storeSingleBatchInLSM(ctx context.Context, batch objects.BatchReferences) []error { 84 errs := make([]error, len(batch)) 85 errLock := &sync.Mutex{} 86 87 // if the context is expired fail all 88 if err := ctx.Err(); err != nil { 89 for i := range errs { 90 errs[i] = errors.Wrap(err, "begin batch") 91 } 92 return errs 93 } 94 95 invertedMerger := inverted.NewDeltaMerger() 96 propsByName, err := b.getSchemaPropsByName() 97 if err != nil { 98 for i := range errs { 99 errs[i] = errors.Wrap(err, "getting schema properties") 100 } 101 return errs 102 } 103 104 // TODO: is there any benefit in having this parallelized? if so, don't forget to lock before assigning errors 105 // If we want them to run in parallel we need to look individual objects, 106 // otherwise we have a race inside the merge functions 107 // wg := &sync.WaitGroup{} 108 for i, ref := range batch { 109 // wg.Add(1) 110 // go func(index int, reference objects.BatchReference) { 111 // defer wg.Done() 112 uuidParsed, err := uuid.Parse(ref.From.TargetID.String()) 113 if err != nil { 114 errLock.Lock() 115 errs[i] = errors.Wrap(err, "invalid id") 116 errLock.Unlock() 117 continue 118 } 119 120 idBytes, err := uuidParsed.MarshalBinary() 121 if err != nil { 122 errLock.Lock() 123 errs[i] = err 124 errLock.Unlock() 125 continue 126 } 127 128 mergeDoc := mergeDocFromBatchReference(ref) 129 res, err := b.shard.mutableMergeObjectLSM(mergeDoc, idBytes) 130 if err != nil { 131 errLock.Lock() 132 errs[i] = err 133 errLock.Unlock() 134 continue 135 } 136 137 prop, ok := propsByName[ref.From.Property.String()] 138 if !ok { 139 errLock.Lock() 140 errs[i] = fmt.Errorf("property '%s' not found in schema", ref.From.Property) 141 errLock.Unlock() 142 continue 143 } 144 145 // generally the batch ref is an append only change which does not alter 146 // the vector position. There is however one inverted index link that needs 147 // to be cleanup: the ref count 148 if err := b.analyzeInverted(invertedMerger, res, ref, prop); err != nil { 149 errLock.Lock() 150 errs[i] = err 151 errLock.Unlock() 152 continue 153 } 154 } 155 156 if err := b.writeInverted(invertedMerger.Merge()); err != nil { 157 for i := range errs { 158 errs[i] = errors.Wrap(err, "write inverted batch") 159 } 160 return errs 161 } 162 163 return errs 164 } 165 166 func (b *referencesBatcher) analyzeInverted(invertedMerger *inverted.DeltaMerger, mergeResult mutableMergeResult, ref objects.BatchReference, prop *models.Property) error { 167 prevProps, err := b.analyzeRef(mergeResult.previous, ref, prop) 168 if err != nil { 169 return err 170 } 171 172 nextProps, err := b.analyzeRef(mergeResult.next, ref, prop) 173 if err != nil { 174 return err 175 } 176 177 delta := inverted.Delta(prevProps, nextProps) 178 invertedMerger.AddAdditions(delta.ToAdd, mergeResult.status.docID) 179 invertedMerger.AddDeletions(delta.ToDelete, mergeResult.status.docID) 180 181 return nil 182 } 183 184 func (b *referencesBatcher) writeInverted(in inverted.DeltaMergeResult) error { 185 before := time.Now() 186 if err := b.writeInvertedAdditions(in.Additions); err != nil { 187 return errors.Wrap(err, "write additions") 188 } 189 b.shard.Metrics().InvertedExtend(before, len(in.Additions)) 190 191 before = time.Now() 192 if err := b.writeInvertedDeletions(in.Deletions); err != nil { 193 return errors.Wrap(err, "write deletions") 194 } 195 b.shard.Metrics().InvertedDeleteDelta(before) 196 197 return nil 198 } 199 200 // TODO text_rbm_inverted_index unify bucket write 201 func (b *referencesBatcher) writeInvertedDeletions(in []inverted.MergeProperty) error { 202 for _, prop := range in { 203 // in the references batcher we can only ever write ref count entire which 204 // are guaranteed to be not have a frequency, meaning they will use the 205 // "Set" strategy in the lsmkv store 206 if prop.HasFilterableIndex { 207 bucket := b.shard.Store().Bucket(helpers.BucketFromPropNameLSM(prop.Name)) 208 if bucket == nil { 209 return errors.Errorf("no bucket for prop '%s' found", prop.Name) 210 } 211 212 for _, item := range prop.MergeItems { 213 for _, id := range item.DocIDs { 214 err := b.shard.deleteFromPropertySetBucket(bucket, id.DocID, item.Data) 215 if err != nil { 216 return err 217 } 218 } 219 } 220 } 221 } 222 223 return nil 224 } 225 226 // TODO text_rbm_inverted_index unify bucket write 227 func (b *referencesBatcher) writeInvertedAdditions(in []inverted.MergeProperty) error { 228 for _, prop := range in { 229 // in the references batcher we can only ever write ref count entire which 230 // are guaranteed to be not have a frequency, meaning they will use the 231 // "Set" strategy in the lsmkv store 232 if prop.HasFilterableIndex { 233 bucket := b.shard.Store().Bucket(helpers.BucketFromPropNameLSM(prop.Name)) 234 if bucket == nil { 235 return errors.Errorf("no bucket for prop '%s' found", prop.Name) 236 } 237 238 for _, item := range prop.MergeItems { 239 err := b.shard.batchExtendInvertedIndexItemsLSMNoFrequency(bucket, item) 240 if err != nil { 241 return err 242 } 243 } 244 } 245 } 246 247 return nil 248 } 249 250 func (b *referencesBatcher) analyzeRef(obj *storobj.Object, ref objects.BatchReference, prop *models.Property) ([]inverted.Property, error) { 251 if prop == nil { 252 return nil, fmt.Errorf("analyzeRef: property %q not found in schema", ref.From.Property) 253 } 254 255 props := obj.Properties() 256 if props == nil { 257 return nil, nil 258 } 259 260 propMap, ok := props.(map[string]interface{}) 261 if !ok { 262 return nil, nil 263 } 264 265 var refs models.MultipleRef 266 refProp, ok := propMap[ref.From.Property.String()] 267 if !ok { 268 refs = make(models.MultipleRef, 0) // explicitly mark as length zero 269 } else { 270 parsed, ok := refProp.(models.MultipleRef) 271 if !ok { 272 return nil, errors.Errorf("prop %s is present, but not a ref, got: %T", 273 ref.From.Property.String(), refProp) 274 } 275 refs = parsed 276 } 277 278 a := inverted.NewAnalyzer(nil) 279 280 countItems, err := a.RefCount(refs) 281 if err != nil { 282 return nil, err 283 } 284 285 valueItems, err := a.Ref(refs) 286 if err != nil { 287 return nil, err 288 } 289 290 return []inverted.Property{{ 291 Name: helpers.MetaCountProp(ref.From.Property.String()), 292 Items: countItems, 293 HasFilterableIndex: inverted.HasFilterableIndexMetaCount && inverted.HasInvertedIndex(prop), 294 HasSearchableIndex: inverted.HasSearchableIndexMetaCount && inverted.HasInvertedIndex(prop), 295 }, { 296 Name: ref.From.Property.String(), 297 Items: valueItems, 298 HasFilterableIndex: inverted.HasFilterableIndex(prop), 299 HasSearchableIndex: inverted.HasSearchableIndex(prop), 300 }}, nil 301 } 302 303 func (b *referencesBatcher) setErrorAtIndex(err error, i int) { 304 b.Lock() 305 defer b.Unlock() 306 307 err = errors.Wrap(err, "ref batch") 308 b.errs[i] = err 309 } 310 311 func mergeDocFromBatchReference(ref objects.BatchReference) objects.MergeDocument { 312 return objects.MergeDocument{ 313 Class: ref.From.Class.String(), 314 ID: ref.From.TargetID, 315 UpdateTime: time.Now().UnixMilli(), 316 References: objects.BatchReferences{ref}, 317 } 318 } 319 320 func (b *referencesBatcher) flushWALs(ctx context.Context) { 321 if err := b.shard.Store().WriteWALs(); err != nil { 322 for i := range b.refs { 323 b.setErrorAtIndex(err, i) 324 } 325 } 326 327 if b.shard.hasTargetVectors() { 328 for targetVector, vectorIndex := range b.shard.VectorIndexes() { 329 if err := vectorIndex.Flush(); err != nil { 330 for i := range b.refs { 331 b.setErrorAtIndex(fmt.Errorf("target vector %s: %w", targetVector, err), i) 332 } 333 } 334 } 335 } else { 336 if err := b.shard.VectorIndex().Flush(); err != nil { 337 for i := range b.refs { 338 b.setErrorAtIndex(err, i) 339 } 340 } 341 } 342 } 343 344 func (b *referencesBatcher) getSchemaPropsByName() (map[string]*models.Property, error) { 345 idx := b.shard.Index() 346 sch := idx.getSchema.GetSchemaSkipAuth().Objects 347 class, err := schema.GetClassByName(sch, idx.Config.ClassName.String()) 348 if err != nil { 349 return nil, err 350 } 351 352 propsByName := map[string]*models.Property{} 353 for _, prop := range class.Properties { 354 propsByName[prop.Name] = prop 355 } 356 return propsByName, nil 357 }