github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/shard.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package db 13 14 import ( 15 "context" 16 "fmt" 17 "io" 18 "os" 19 "path" 20 "sync" 21 "time" 22 23 enterrors "github.com/weaviate/weaviate/entities/errors" 24 25 "github.com/go-openapi/strfmt" 26 "github.com/pkg/errors" 27 "github.com/sirupsen/logrus" 28 "github.com/weaviate/weaviate/adapters/repos/db/helpers" 29 "github.com/weaviate/weaviate/adapters/repos/db/indexcheckpoint" 30 "github.com/weaviate/weaviate/adapters/repos/db/indexcounter" 31 "github.com/weaviate/weaviate/adapters/repos/db/inverted" 32 "github.com/weaviate/weaviate/adapters/repos/db/lsmkv" 33 "github.com/weaviate/weaviate/adapters/repos/db/propertyspecific" 34 "github.com/weaviate/weaviate/adapters/repos/db/roaringset" 35 "github.com/weaviate/weaviate/adapters/repos/db/vector/flat" 36 "github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw" 37 "github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer" 38 "github.com/weaviate/weaviate/adapters/repos/db/vector/noop" 39 "github.com/weaviate/weaviate/entities/additional" 40 "github.com/weaviate/weaviate/entities/aggregation" 41 "github.com/weaviate/weaviate/entities/backup" 42 "github.com/weaviate/weaviate/entities/cyclemanager" 43 "github.com/weaviate/weaviate/entities/filters" 44 "github.com/weaviate/weaviate/entities/models" 45 "github.com/weaviate/weaviate/entities/multi" 46 "github.com/weaviate/weaviate/entities/schema" 47 "github.com/weaviate/weaviate/entities/search" 48 "github.com/weaviate/weaviate/entities/searchparams" 49 "github.com/weaviate/weaviate/entities/storagestate" 50 "github.com/weaviate/weaviate/entities/storobj" 51 
	"github.com/weaviate/weaviate/entities/vectorindex"
	"github.com/weaviate/weaviate/entities/vectorindex/common"
	flatent "github.com/weaviate/weaviate/entities/vectorindex/flat"
	hnswent "github.com/weaviate/weaviate/entities/vectorindex/hnsw"
	"github.com/weaviate/weaviate/usecases/monitoring"
	"github.com/weaviate/weaviate/usecases/objects"
	"github.com/weaviate/weaviate/usecases/replica"
)

// IdLockPoolSize is the number of striped per-object-ID locks held in
// Shard.docIdLock. Lock selection is done by uuidToIdLockPoolId below.
const IdLockPoolSize = 128

// ShardLike is the contract implemented by a shard. It bundles object CRUD,
// search, replication preparation, schema/property index management and
// lifecycle (init/drop/shutdown) operations for a single shard.
type ShardLike interface {
	Index() *Index                    // Get the parent index
	Name() string                     // Get the shard name
	Store() *lsmkv.Store              // Get the underlying store
	NotifyReady()                     // Set shard status to ready
	GetStatus() storagestate.Status   // Return the shard status
	UpdateStatus(status string) error // Set shard status
	FindUUIDs(ctx context.Context, filters *filters.LocalFilter) ([]strfmt.UUID, error) // Search and return document ids

	Counter() *indexcounter.Counter
	ObjectCount() int
	ObjectCountAsync() int
	GetPropertyLengthTracker() *inverted.JsonPropertyLengthTracker

	PutObject(context.Context, *storobj.Object) error
	PutObjectBatch(context.Context, []*storobj.Object) []error
	ObjectByID(ctx context.Context, id strfmt.UUID, props search.SelectProperties, additional additional.Properties) (*storobj.Object, error)
	Exists(ctx context.Context, id strfmt.UUID) (bool, error)
	ObjectSearch(ctx context.Context, limit int, filters *filters.LocalFilter, keywordRanking *searchparams.KeywordRanking, sort []filters.Sort, cursor *filters.Cursor, additional additional.Properties) ([]*storobj.Object, []float32, error)
	ObjectVectorSearch(ctx context.Context, searchVector []float32, targetVector string, targetDist float32, limit int, filters *filters.LocalFilter, sort []filters.Sort, groupBy *searchparams.GroupBy, additional additional.Properties) ([]*storobj.Object, []float32, error)
	UpdateVectorIndexConfig(ctx context.Context, updated schema.VectorIndexConfig) error
	UpdateVectorIndexConfigs(ctx context.Context, updated map[string]schema.VectorIndexConfig) error
	AddReferencesBatch(ctx context.Context, refs objects.BatchReferences) []error
	DeleteObjectBatch(ctx context.Context, ids []strfmt.UUID, dryRun bool) objects.BatchSimpleObjects // Delete many objects by id
	DeleteObject(ctx context.Context, id strfmt.UUID) error                                           // Delete object by id
	MultiObjectByID(ctx context.Context, query []multi.Identifier) ([]*storobj.Object, error)
	ID() string // Get the shard id
	drop() error
	addIDProperty(ctx context.Context) error
	addDimensionsProperty(ctx context.Context) error
	addTimestampProperties(ctx context.Context) error
	createPropertyIndex(ctx context.Context, prop *models.Property, eg *enterrors.ErrorGroupWrapper)
	BeginBackup(ctx context.Context) error
	ListBackupFiles(ctx context.Context, ret *backup.ShardDescriptor) error
	resumeMaintenanceCycles(ctx context.Context) error
	SetPropertyLengths(props []inverted.Property) error
	AnalyzeObject(*storobj.Object) ([]inverted.Property, []inverted.NilProperty, error)

	Aggregate(ctx context.Context, params aggregation.Params) (*aggregation.Result, error)
	MergeObject(ctx context.Context, object objects.MergeDocument) error
	Queue() *IndexQueue
	Queues() map[string]*IndexQueue
	Shutdown(context.Context) error // Shutdown the shard
	// TODO tests only
	ObjectList(ctx context.Context, limit int, sort []filters.Sort, cursor *filters.Cursor,
		additional additional.Properties, className schema.ClassName) ([]*storobj.Object, error) // Search and return objects
	WasDeleted(ctx context.Context, id strfmt.UUID) (bool, error) // Check if an object was deleted
	VectorIndex() VectorIndex                                     // Get the vector index
	VectorIndexes() map[string]VectorIndex                        // Get the vector indexes
	hasTargetVectors() bool
	// TODO tests only
	Versioner() *shardVersioner // Get the shard versioner

	isReadOnly() bool

	// replication two-phase helpers: prepare* stage a mutation, which is then
	// either committed (commitReplication) or rolled back (abortReplication)
	preparePutObject(context.Context, string, *storobj.Object) replica.SimpleResponse
	preparePutObjects(context.Context, string, []*storobj.Object) replica.SimpleResponse
	prepareMergeObject(context.Context, string, *objects.MergeDocument) replica.SimpleResponse
	prepareDeleteObject(context.Context, string, strfmt.UUID) replica.SimpleResponse
	prepareDeleteObjects(context.Context, string, []strfmt.UUID, bool) replica.SimpleResponse
	prepareAddReferences(context.Context, string, []objects.BatchReference) replica.SimpleResponse

	commitReplication(context.Context, string, *backupMutex) interface{}
	abortReplication(context.Context, string) replica.SimpleResponse
	reinit(context.Context) error
	filePutter(context.Context, string) (io.WriteCloser, error)

	// TODO tests only
	Dimensions() int // dim(vector)*number vectors
	// TODO tests only
	QuantizedDimensions(segments int) int
	extendDimensionTrackerLSM(dimLength int, docID uint64) error
	extendDimensionTrackerForVecLSM(dimLength int, docID uint64, vecName string) error
	publishDimensionMetrics()

	addToPropertySetBucket(bucket *lsmkv.Bucket, docID uint64, key []byte) error
	addToPropertyMapBucket(bucket *lsmkv.Bucket, pair lsmkv.MapPair, key []byte) error
	pairPropertyWithFrequency(docID uint64, freq, propLen float32) lsmkv.MapPair

	setFallbackToSearchable(fallback bool)
	addJobToQueue(job job)
	uuidFromDocID(docID uint64) (strfmt.UUID, error)
	batchDeleteObject(ctx context.Context, id strfmt.UUID) error
	putObjectLSM(object *storobj.Object, idBytes []byte) (objectInsertStatus, error)
	mutableMergeObjectLSM(merge objects.MergeDocument, idBytes []byte) (mutableMergeResult, error)
	deleteFromPropertySetBucket(bucket *lsmkv.Bucket, docID uint64, key []byte) error
	batchExtendInvertedIndexItemsLSMNoFrequency(b *lsmkv.Bucket, item inverted.MergeItem) error
	updatePropertySpecificIndices(object *storobj.Object, status objectInsertStatus) error
	updateVectorIndexIgnoreDelete(vector []float32, status objectInsertStatus) error
	updateVectorIndexesIgnoreDelete(vectors map[string][]float32, status objectInsertStatus) error
	hasGeoIndex() bool

	Metrics() *Metrics
}

// Shard is the smallest completely-contained index unit. A shard manages
// database files for all the objects it owns. How a shard is determined for a
// target object (e.g. Murmur hash, etc.) is still open at this point
type Shard struct {
	index            *Index // a reference to the underlying index, which in turn contains schema information
	queue            *IndexQueue            // async-indexing queue for the legacy (single) vector
	queues           map[string]*IndexQueue // async-indexing queues, one per named target vector
	name             string
	store            *lsmkv.Store
	counter          *indexcounter.Counter
	indexCheckpoints *indexcheckpoint.Checkpoints
	vectorIndex      VectorIndex            // legacy (single) vector index; exclusive with vectorIndexes
	vectorIndexes    map[string]VectorIndex // named target-vector indexes; exclusive with vectorIndex
	metrics          *Metrics
	promMetrics      *monitoring.PrometheusMetrics
	propertyIndices  propertyspecific.Indices
	propLenTracker   *inverted.JsonPropertyLengthTracker
	versioner        *shardVersioner

	status              storagestate.Status
	statusLock          sync.Mutex
	propertyIndicesLock sync.RWMutex
	// stopMetrics signals the dimension-tracking goroutine to stop; only
	// written to when TrackVectorDimensions is enabled (see drop/Shutdown)
	stopMetrics chan struct{}

	centralJobQueue chan job // reference to queue used by all shards

	// striped locks for per-object-ID serialization, indexed by
	// uuidToIdLockPoolId; sized IdLockPoolSize in NewShard
	docIdLock []sync.Mutex
	// replication
	replicationMap pendingReplicaTasks

	// Indicates whether searchable buckets should be used
	// when filterable buckets are missing for text/text[] properties
	// This can happen for db created before v1.19, where
	// only map (now called searchable) buckets were created as inverted
	// indexes for text/text[] props.
	// Now roaring set (filterable) and map (searchable) buckets can
	// coexists for text/text[] props, and by default both are enabled.
	// So despite property's IndexFilterable and IndexSearchable settings
	// being enabled, only searchable bucket exists
	fallbackToSearchable bool

	cycleCallbacks *shardCycleCallbacks
	bitmapFactory  *roaringset.BitmapFactory
}

// NewShard creates (or re-opens, if its directory already exists) a shard:
// LSM store and non-vector state first, then either the per-target-vector
// indexes/queues or the single legacy index/queue, then dimension tracking.
// When async indexing is enabled, unindexed objects are preloaded in a
// background goroutine before the shard is marked ready.
func NewShard(ctx context.Context, promMetrics *monitoring.PrometheusMetrics,
	shardName string, index *Index, class *models.Class, jobQueueCh chan job,
	indexCheckpoints *indexcheckpoint.Checkpoints,
) (*Shard, error) {
	before := time.Now()
	var err error
	s := &Shard{
		index:       index,
		name:        shardName,
		promMetrics: promMetrics,
		metrics: NewMetrics(index.logger, promMetrics,
			string(index.Config.ClassName), shardName),
		stopMetrics:      make(chan struct{}),
		replicationMap:   pendingReplicaTasks{Tasks: make(map[string]replicaTask, 32)},
		centralJobQueue:  jobQueueCh,
		indexCheckpoints: indexCheckpoints,
	}
	s.initCycleCallbacks()

	s.docIdLock = make([]sync.Mutex, IdLockPoolSize)

	defer s.metrics.ShardStartup(before)

	// existence of the shard directory distinguishes "load" from "create"
	// for the log message at the end
	_, err = os.Stat(s.path())
	exists := false
	if err == nil {
		exists = true
	}

	if err := os.MkdirAll(s.path(), os.ModePerm); err != nil {
		return nil, err
	}

	if err := s.initNonVector(ctx, class); err != nil {
		return nil, errors.Wrapf(err, "init shard %q", s.ID())
	}

	if s.hasTargetVectors() {
		if err := s.initTargetVectors(ctx); err != nil {
			return nil, err
		}
		if err := s.initTargetQueues(); err != nil {
			return nil, err
		}
	} else {
		if err := s.initLegacyVector(ctx); err != nil {
			return nil, err
		}
		if err := s.initLegacyQueue(); err != nil {
			return nil, err
		}
	}

	s.initDimensionTracking()

	if asyncEnabled() {
		f := func() {
			// preload unindexed objects in the background
			if s.hasTargetVectors() {
				for targetVector, queue := range s.queues {
					err := queue.PreloadShard(s)
					if err != nil {
						queue.Logger.WithError(err).Errorf("preload shard for target vector: %s", targetVector)
					}
				}
			} else {
				err := s.queue.PreloadShard(s)
				if err != nil {
					s.queue.Logger.WithError(err).Error("preload shard")
				}
			}
		}
		enterrors.GoWrapper(f, s.index.logger)
	}
	s.NotifyReady()

	if exists {
		s.index.logger.Printf("Completed loading shard %s in %s", s.ID(), time.Since(before))
	} else {
		s.index.logger.Printf("Created shard %s in %s", s.ID(), time.Since(before))
	}
	return s, nil
}

// hasTargetVectors reports whether this shard's class is configured with
// named target vectors (as opposed to the single legacy vector).
func (s *Shard) hasTargetVectors() bool {
	return hasTargetVectors(s.index.vectorIndexUserConfig, s.index.vectorIndexUserConfigs)
}

// target vectors and legacy vector are (supposed to be) exclusive
// method allows to distinguish which of them is configured for the class
// (note: only the presence of target configs is checked; the legacy cfg
// parameter is currently unused)
func hasTargetVectors(cfg schema.VectorIndexConfig, targetCfgs map[string]schema.VectorIndexConfig) bool {
	return len(targetCfgs) != 0
}

// initTargetVectors creates one vector index per configured target vector
// and stores them in s.vectorIndexes keyed by target-vector name.
func (s *Shard) initTargetVectors(ctx context.Context) error {
	s.vectorIndexes = make(map[string]VectorIndex)
	for targetVector, vectorIndexConfig := range s.index.vectorIndexUserConfigs {
		vectorIndex, err := s.initVectorIndex(ctx, targetVector, vectorIndexConfig)
		if err != nil {
			return fmt.Errorf("cannot create vector index for %q: %w", targetVector, err)
		}
		s.vectorIndexes[targetVector] = vectorIndex
	}
	return nil
}

// initTargetQueues creates one async-indexing queue per target-vector index
// created by initTargetVectors; must run after it.
func (s *Shard) initTargetQueues() error {
	s.queues = make(map[string]*IndexQueue)
	for targetVector, vectorIndex := range s.vectorIndexes {
		queue, err := NewIndexQueue(s.ID(), targetVector, s, vectorIndex, s.centralJobQueue,
			s.indexCheckpoints, IndexQueueOptions{Logger: s.index.logger})
		if err != nil {
			return fmt.Errorf("cannot create index queue for %q: %w", targetVector, err)
		}
		s.queues[targetVector] = queue
	}
	return nil
}

// initLegacyVector creates the single (unnamed) vector index for classes
// without target vectors.
func (s *Shard) initLegacyVector(ctx context.Context) error {
	vectorindex, err := s.initVectorIndex(ctx, "", s.index.vectorIndexUserConfig)
	if err != nil {
		return err
	}
	s.vectorIndex = vectorindex
	return nil
}

// initLegacyQueue creates the async-indexing queue for the legacy vector
// index; must run after initLegacyVector.
func (s *Shard) initLegacyQueue() error {
	queue, err := NewIndexQueue(s.ID(), "", s, s.vectorIndex, s.centralJobQueue,
		s.indexCheckpoints, IndexQueueOptions{Logger: s.index.logger})
	if err != nil {
		return err
	}
	s.queue = queue
	return nil
}

// initVectorIndex builds a vector index (hnsw, flat, or noop when hnsw is
// configured with Skip) for the given target vector ("" for the legacy
// vector), resolving the configured distance metric first. It also starts the
// index-level maintenance cycles that the chosen index type needs.
func (s *Shard) initVectorIndex(ctx context.Context,
	targetVector string, vectorIndexUserConfig schema.VectorIndexConfig,
) (VectorIndex, error) {
	var distProv distancer.Provider

	// empty distance name defaults to cosine
	switch vectorIndexUserConfig.DistanceName() {
	case "", common.DistanceCosine:
		distProv = distancer.NewCosineDistanceProvider()
	case common.DistanceDot:
		distProv = distancer.NewDotProductProvider()
	case common.DistanceL2Squared:
		distProv = distancer.NewL2SquaredProvider()
	case common.DistanceManhattan:
		distProv = distancer.NewManhattanProvider()
	case common.DistanceHamming:
		distProv = distancer.NewHammingProvider()
	default:
		return nil, fmt.Errorf("init vector index: %w",
			errors.Errorf("unrecognized distance metric %q,"+
				"choose one of [\"cosine\", \"dot\", \"l2-squared\", \"manhattan\",\"hamming\"]", vectorIndexUserConfig.DistanceName()))
	}

	var vectorIndex VectorIndex

	switch vectorIndexUserConfig.IndexType() {
	case vectorindex.VectorIndexTypeHNSW:
		hnswUserConfig, ok := vectorIndexUserConfig.(hnswent.UserConfig)
		if !ok {
			return nil, errors.Errorf("hnsw vector index: config is not hnsw.UserConfig: %T",
				vectorIndexUserConfig)
		}

		if hnswUserConfig.Skip {
			vectorIndex = noop.NewIndex()
		} else {
			// starts vector cycles if vector is configured
			s.index.cycleCallbacks.vectorCommitLoggerCycle.Start()
			s.index.cycleCallbacks.vectorTombstoneCleanupCycle.Start()

			// a shard can actually have multiple vector indexes:
			// - the main index, which is used for all normal object vectors
			// - a geo property index for each geo prop in the schema
			//
			// here we label the main vector index as such.
			vecIdxID := s.vectorIndexID(targetVector)

			vi, err := hnsw.New(hnsw.Config{
				Logger:               s.index.logger,
				RootPath:             s.path(),
				ID:                   vecIdxID,
				ShardName:            s.name,
				ClassName:            s.index.Config.ClassName.String(),
				PrometheusMetrics:    s.promMetrics,
				VectorForIDThunk:     s.vectorByIndexID,
				TempVectorForIDThunk: s.readVectorByIndexIDIntoSlice,
				DistanceProvider:     distProv,
				MakeCommitLoggerThunk: func() (hnsw.CommitLogger, error) {
					return hnsw.NewCommitLogger(s.path(), vecIdxID,
						s.index.logger, s.cycleCallbacks.vectorCommitLoggerCallbacks)
				},
			}, hnswUserConfig, s.cycleCallbacks.vectorTombstoneCleanupCallbacks,
				s.cycleCallbacks.compactionCallbacks, s.cycleCallbacks.flushCallbacks, s.store)
			if err != nil {
				return nil, errors.Wrapf(err, "init shard %q: hnsw index", s.ID())
			}
			vectorIndex = vi
		}
	case vectorindex.VectorIndexTypeFLAT:
		flatUserConfig, ok := vectorIndexUserConfig.(flatent.UserConfig)
		if !ok {
			return nil, errors.Errorf("flat vector index: config is not flat.UserConfig: %T",
				vectorIndexUserConfig)
		}
		s.index.cycleCallbacks.vectorCommitLoggerCycle.Start()

		// a shard can actually have multiple vector indexes:
		// - the main index, which is used for all normal object vectors
		// - a geo property index for each geo prop in the schema
		//
		// here we label the main vector index as such.
		vecIdxID := s.vectorIndexID(targetVector)

		vi, err := flat.New(flat.Config{
			ID:               vecIdxID,
			TargetVector:     targetVector,
			Logger:           s.index.logger,
			DistanceProvider: distProv,
		}, flatUserConfig, s.store)
		if err != nil {
			return nil, errors.Wrapf(err, "init shard %q: flat index", s.ID())
		}
		vectorIndex = vi
	default:
		return nil, fmt.Errorf("Unknown vector index type: %q. Choose one from [\"%s\", \"%s\"]",
			vectorIndexUserConfig.IndexType(), vectorindex.VectorIndexTypeHNSW, vectorindex.VectorIndexTypeFLAT)
	}
	// deferred so it runs right after the successful return value is set
	defer vectorIndex.PostStartup()
	return vectorIndex, nil
}

// initNonVector initializes everything except the vector index/queues: the
// LSM store, the doc-ID counter (and bitmap factory derived from it), the
// shard versioner, the property-length tracker, and per-property indices.
func (s *Shard) initNonVector(ctx context.Context, class *models.Class) error {
	err := s.initLSMStore(ctx)
	if err != nil {
		return errors.Wrapf(err, "init shard %q: shard db", s.ID())
	}

	counter, err := indexcounter.New(s.path())
	if err != nil {
		return errors.Wrapf(err, "init shard %q: index counter", s.ID())
	}
	s.counter = counter
	s.bitmapFactory = roaringset.NewBitmapFactory(s.counter.Get, s.index.logger)

	// a non-zero next doc ID means the shard already holds data, which the
	// versioner uses to decide how to treat a missing version file
	dataPresent := s.counter.PreviewNext() != 0
	versionPath := path.Join(s.path(), "version")
	versioner, err := newShardVersioner(versionPath, dataPresent)
	if err != nil {
		return errors.Wrapf(err, "init shard %q: check versions", s.ID())
	}
	s.versioner = versioner

	plPath := path.Join(s.path(), "proplengths")
	tracker, err := inverted.NewJsonPropertyLengthTracker(plPath, s.index.logger)
	if err != nil {
		return errors.Wrapf(err, "init shard %q: prop length tracker", s.ID())
	}

	s.propLenTracker = tracker

	if err := s.initProperties(class); err != nil {
		return errors.Wrapf(err, "init shard %q: init per property indices", s.ID())
	}

	return nil
}

// ID returns the unique shard identifier "<indexID>_<shardName>".
func (s *Shard) ID() string {
	return shardId(s.index.ID(), s.name)
}

// path returns the shard's root directory on disk.
func (s *Shard) path() string {
	return shardPath(s.index.path(), s.name)
}

// pathLSM returns the directory of the shard's LSM store.
func (s *Shard) pathLSM() string {
	return path.Join(s.path(), "lsm")
}

// vectorIndexID returns the on-disk identifier for a vector index: "main"
// for the legacy vector, "vectors_<name>" for a named target vector.
func (s *Shard) vectorIndexID(targetVector string) string {
	if targetVector != "" {
		return fmt.Sprintf("vectors_%s", targetVector)
	}
	return "main"
}

func (s *Shard) uuidToIdLockPoolId(idBytes []byte) uint8 {
	// use the last byte of the uuid to determine which locking-pool a given object should use. The last byte is used
	// as uuids probably often have some kind of order and the last byte will in general be the one that changes the most
	return idBytes[15] % IdLockPoolSize
}

// initLSMStore opens (or creates) the shard's lsmkv store and its objects
// bucket. Per-object count monitoring is only enabled when metrics are not
// grouped, as a grouped metric would be overwritten by each shard.
func (s *Shard) initLSMStore(ctx context.Context) error {
	annotatedLogger := s.index.logger.WithFields(logrus.Fields{
		"shard": s.name,
		"index": s.index.ID(),
		"class": s.index.Config.ClassName,
	})
	var metrics *lsmkv.Metrics
	if s.promMetrics != nil {
		metrics = lsmkv.NewMetrics(s.promMetrics, string(s.index.Config.ClassName), s.name)
	}

	store, err := lsmkv.New(s.pathLSM(), s.path(), annotatedLogger, metrics,
		s.cycleCallbacks.compactionCallbacks, s.cycleCallbacks.flushCallbacks)
	if err != nil {
		return errors.Wrapf(err, "init lsmkv store at %s", s.pathLSM())
	}

	opts := []lsmkv.BucketOption{
		lsmkv.WithStrategy(lsmkv.StrategyReplace),
		lsmkv.WithSecondaryIndices(1),
		lsmkv.WithPread(s.index.Config.AvoidMMap),
		lsmkv.WithKeepTombstones(true),
		s.dynamicMemtableSizing(),
		s.memtableDirtyConfig(),
	}

	if s.metrics != nil && !s.metrics.grouped {
		// If metrics are grouped we cannot observe the count of an individual
		// shard's object store because there is just a single metric. We would
		// override it. See https://github.com/weaviate/weaviate/issues/4396 for
		// details.
		opts = append(opts, lsmkv.WithMonitorCount())
	}
	err = store.CreateOrLoadBucket(ctx, helpers.ObjectsBucketLSM, opts...)
	if err != nil {
		return errors.Wrap(err, "create objects bucket")
	}

	s.store = store

	return nil
}

// IMPORTANT:
// Be advised there exists LazyLoadShard::drop() implementation intended
// to drop shard that was not loaded (instantiated) yet.
// It deletes shard by performing required actions and removing entire shard directory.
// If there is any action that needs to be performed beside files/dirs being removed
// from shard directory, it needs to be reflected as well in LazyLoadShard::drop()
// method to keep drop behaviour consistent.
func (s *Shard) drop() error {
	s.metrics.DeleteShardLabels(s.index.Config.ClassName.String(), s.name)
	s.metrics.baseMetrics.StartUnloadingShard(s.index.Config.ClassName.String())
	s.replicationMap.clear()

	if s.index.Config.TrackVectorDimensions {
		// tracking vector dimensions goroutine only works when tracking is enabled
		// that's why we are trying to stop it only in this case
		s.stopMetrics <- struct{}{}
		// send 0 in when index gets dropped
		s.clearDimensionMetrics()
	}

	ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second)
	defer cancel()

	// unregister all callbacks at once, in parallel
	if err := cyclemanager.NewCombinedCallbackCtrl(0, s.index.logger,
		s.cycleCallbacks.compactionCallbacksCtrl,
		s.cycleCallbacks.flushCallbacksCtrl,
		s.cycleCallbacks.vectorCombinedCallbacksCtrl,
		s.cycleCallbacks.geoPropsCombinedCallbacksCtrl,
	).Unregister(ctx); err != nil {
		return err
	}

	if err := s.store.Shutdown(ctx); err != nil {
		return errors.Wrap(err, "stop lsmkv store")
	}

	if _, err := os.Stat(s.pathLSM()); err == nil {
		err := os.RemoveAll(s.pathLSM())
		if err != nil {
			return errors.Wrapf(err, "remove lsm store at %s", s.pathLSM())
		}
	}
	// delete indexcount
	err := s.counter.Drop()
	if err != nil {
		return errors.Wrapf(err, "remove indexcount at %s", s.path())
	}

	// delete version
	err = s.versioner.Drop()
	if err != nil {
		return errors.Wrapf(err, "remove version at %s", s.path())
	}

	if s.hasTargetVectors() {
		// TODO run in parallel?
		for targetVector, queue := range s.queues {
			if err = queue.Drop(); err != nil {
				return fmt.Errorf("close queue of vector %q at %s: %w", targetVector, s.path(), err)
			}
		}
		for targetVector, vectorIndex := range s.vectorIndexes {
			if err = vectorIndex.Drop(ctx); err != nil {
				return fmt.Errorf("remove vector index of vector %q at %s: %w", targetVector, s.path(), err)
			}
		}
	} else {
		// delete queue cursor
		if err = s.queue.Drop(); err != nil {
			return errors.Wrapf(err, "close queue at %s", s.path())
		}
		// remove vector index
		if err = s.vectorIndex.Drop(ctx); err != nil {
			return errors.Wrapf(err, "remove vector index at %s", s.path())
		}
	}

	// delete property length tracker
	err = s.GetPropertyLengthTracker().Drop()
	if err != nil {
		return errors.Wrapf(err, "remove prop length tracker at %s", s.path())
	}

	s.propertyIndicesLock.Lock()
	err = s.propertyIndices.DropAll(ctx)
	s.propertyIndicesLock.Unlock()
	if err != nil {
		return errors.Wrapf(err, "remove property specific indices at %s", s.path())
	}

	s.metrics.baseMetrics.FinishUnloadingShard(s.index.Config.ClassName.String())

	return nil
}

// addIDProperty creates the inverted-index bucket for the internal _id
// property.
func (s *Shard) addIDProperty(ctx context.Context) error {
	if s.isReadOnly() {
		return storagestate.ErrStatusReadOnly
	}

	return s.store.CreateOrLoadBucket(ctx,
		helpers.BucketFromPropNameLSM(filters.InternalPropID),
		s.memtableDirtyConfig(),
		lsmkv.WithStrategy(lsmkv.StrategySetCollection),
		lsmkv.WithPread(s.index.Config.AvoidMMap))
}

// addDimensionsProperty creates the bucket used for tracking vector
// dimensions.
func (s *Shard) addDimensionsProperty(ctx context.Context) error {
	if s.isReadOnly() {
		return storagestate.ErrStatusReadOnly
	}

	// Note: this data would fit the "Set" type better, but since the "Map" type
	// is currently optimized better, it is more efficient to use a Map here.
	err := s.store.CreateOrLoadBucket(ctx,
		helpers.DimensionsBucketLSM,
		lsmkv.WithStrategy(lsmkv.StrategyMapCollection),
		lsmkv.WithPread(s.index.Config.AvoidMMap))
	if err != nil {
		return err
	}

	return nil
}

// addTimestampProperties creates the buckets for the internal creation- and
// last-update-timestamp properties.
func (s *Shard) addTimestampProperties(ctx context.Context) error {
	if s.isReadOnly() {
		return storagestate.ErrStatusReadOnly
	}

	if err := s.addCreationTimeUnixProperty(ctx); err != nil {
		return err
	}
	if err := s.addLastUpdateTimeUnixProperty(ctx); err != nil {
		return err
	}

	return nil
}

func (s *Shard) addCreationTimeUnixProperty(ctx context.Context) error {
	return s.store.CreateOrLoadBucket(ctx,
		helpers.BucketFromPropNameLSM(filters.InternalPropCreationTimeUnix),
		s.memtableDirtyConfig(),
		lsmkv.WithStrategy(lsmkv.StrategyRoaringSet),
		lsmkv.WithPread(s.index.Config.AvoidMMap))
}

func (s *Shard) addLastUpdateTimeUnixProperty(ctx context.Context) error {
	return s.store.CreateOrLoadBucket(ctx,
		helpers.BucketFromPropNameLSM(filters.InternalPropLastUpdateTimeUnix),
		s.memtableDirtyConfig(),
		lsmkv.WithStrategy(lsmkv.StrategyRoaringSet),
		lsmkv.WithPread(s.index.Config.AvoidMMap))
}

// memtableDirtyConfig translates the configured flush-after-dirty interval
// (seconds) into a bucket option.
func (s *Shard) memtableDirtyConfig() lsmkv.BucketOption {
	return lsmkv.WithDirtyThreshold(
		time.Duration(s.index.Config.MemtablesFlushDirtyAfter) * time.Second)
}

// dynamicMemtableSizing forwards the index-level memtable sizing limits as a
// bucket option.
func (s *Shard) dynamicMemtableSizing() lsmkv.BucketOption {
	return lsmkv.WithDynamicMemtableSizing(
		s.index.Config.MemtablesInitialSizeMB,
		s.index.Config.MemtablesMaxSizeMB,
		s.index.Config.MemtablesMinActiveSeconds,
		s.index.Config.MemtablesMaxActiveSeconds,
	)
}

// createPropertyIndex schedules creation of all inverted-index buckets for a
// property (value index, plus optional null and length indexes depending on
// the inverted-index config) on the given error group.
func (s *Shard) createPropertyIndex(ctx context.Context, prop *models.Property, eg *enterrors.ErrorGroupWrapper) {
	if !inverted.HasInvertedIndex(prop) {
		return
	}

	eg.Go(func() error {
		if err := s.createPropertyValueIndex(ctx, prop); err != nil {
			return errors.Wrapf(err, "create property '%s' value index on shard '%s'", prop.Name, s.ID())
		}

		if s.index.invertedIndexConfig.IndexNullState {
			eg.Go(func() error {
				if err := s.createPropertyNullIndex(ctx, prop); err != nil {
					return errors.Wrapf(err, "create property '%s' null index on shard '%s'", prop.Name, s.ID())
				}
				return nil
			})
		}

		if s.index.invertedIndexConfig.IndexPropertyLength {
			eg.Go(func() error {
				if err := s.createPropertyLengthIndex(ctx, prop); err != nil {
					return errors.Wrapf(err, "create property '%s' length index on shard '%s'", prop.Name, s.ID())
				}
				return nil
			})
		}

		return nil
	})
}

// createPropertyValueIndex creates the filterable (roaring set) and/or
// searchable (map) buckets for a property, depending on its index settings.
// Geo properties are delegated to initGeoProp; ref properties additionally
// get a meta-count bucket.
func (s *Shard) createPropertyValueIndex(ctx context.Context, prop *models.Property) error {
	if s.isReadOnly() {
		return storagestate.ErrStatusReadOnly
	}

	bucketOpts := []lsmkv.BucketOption{
		s.memtableDirtyConfig(),
		s.dynamicMemtableSizing(),
		lsmkv.WithPread(s.index.Config.AvoidMMap),
	}

	if inverted.HasFilterableIndex(prop) {
		if dt, _ := schema.AsPrimitive(prop.DataType); dt == schema.DataTypeGeoCoordinates {
			return s.initGeoProp(prop)
		}

		if schema.IsRefDataType(prop.DataType) {
			if err := s.store.CreateOrLoadBucket(ctx,
				helpers.BucketFromPropNameMetaCountLSM(prop.Name),
				append(bucketOpts, lsmkv.WithStrategy(lsmkv.StrategyRoaringSet))...,
			); err != nil {
				return err
			}
		}

		if err := s.store.CreateOrLoadBucket(ctx,
			helpers.BucketFromPropNameLSM(prop.Name),
			append(bucketOpts, lsmkv.WithStrategy(lsmkv.StrategyRoaringSet))...,
		); err != nil {
			return err
		}
	}

	if inverted.HasSearchableIndex(prop) {
		searchableBucketOpts := append(bucketOpts,
			lsmkv.WithStrategy(lsmkv.StrategyMapCollection), lsmkv.WithPread(s.index.Config.AvoidMMap))
		// pre-v2 shards used a different map sort order
		if s.versioner.Version() < 2 {
			searchableBucketOpts = append(searchableBucketOpts, lsmkv.WithLegacyMapSorting())
		}

		if err := s.store.CreateOrLoadBucket(ctx,
			helpers.BucketSearchableFromPropNameLSM(prop.Name),
			searchableBucketOpts...,
		); err != nil {
			return err
		}
	}

	return nil
}

// createPropertyLengthIndex creates the length bucket for a property, unless
// its data type is one for which lengths are not indexed.
func (s *Shard) createPropertyLengthIndex(ctx context.Context, prop *models.Property) error {
	if s.isReadOnly() {
		return storagestate.ErrStatusReadOnly
	}

	// some datatypes are not added to the inverted index, so we can skip them here
	switch schema.DataType(prop.DataType[0]) {
	case schema.DataTypeGeoCoordinates, schema.DataTypePhoneNumber, schema.DataTypeBlob, schema.DataTypeInt,
		schema.DataTypeNumber, schema.DataTypeBoolean, schema.DataTypeDate:
		return nil
	default:
	}

	return s.store.CreateOrLoadBucket(ctx,
		helpers.BucketFromPropNameLengthLSM(prop.Name),
		lsmkv.WithStrategy(lsmkv.StrategyRoaringSet),
		lsmkv.WithPread(s.index.Config.AvoidMMap))
}

// createPropertyNullIndex creates the null-state bucket for a property.
func (s *Shard) createPropertyNullIndex(ctx context.Context, prop *models.Property) error {
	if s.isReadOnly() {
		return storagestate.ErrStatusReadOnly
	}

	return s.store.CreateOrLoadBucket(ctx,
		helpers.BucketFromPropNameNullLSM(prop.Name),
		lsmkv.WithStrategy(lsmkv.StrategyRoaringSet),
		lsmkv.WithPread(s.index.Config.AvoidMMap))
}

// UpdateVectorIndexConfig applies an updated config to the (legacy) vector
// index. The shard is marked read-only for the duration of the update and set
// back to ready by the callback once the index has applied the change.
func (s *Shard) UpdateVectorIndexConfig(ctx context.Context, updated schema.VectorIndexConfig) error {
	if s.isReadOnly() {
		return storagestate.ErrStatusReadOnly
	}

	err := s.UpdateStatus(storagestate.StatusReadOnly.String())
	if err != nil {
		return fmt.Errorf("attempt to mark read-only: %w", err)
	}

	return s.VectorIndex().UpdateUserConfig(updated, func() {
		s.UpdateStatus(storagestate.StatusReady.String())
	})
}
// UpdateVectorIndexConfigs applies updated configs to the named target-vector
// indexes. The shard is marked read-only up front; a background goroutine
// waits for all index updates to complete and then restores ready status.
// Note: on the first failing index the loop stops, but the goroutine still
// waits for the updates already started.
func (s *Shard) UpdateVectorIndexConfigs(ctx context.Context, updated map[string]schema.VectorIndexConfig) error {
	if s.isReadOnly() {
		return storagestate.ErrStatusReadOnly
	}
	if err := s.UpdateStatus(storagestate.StatusReadOnly.String()); err != nil {
		return fmt.Errorf("attempt to mark read-only: %w", err)
	}

	wg := new(sync.WaitGroup)
	var err error
	for targetName, targetCfg := range updated {
		wg.Add(1)
		// wg.Done is passed as the per-index completion callback
		if err = s.VectorIndexForName(targetName).UpdateUserConfig(targetCfg, wg.Done); err != nil {
			break
		}
	}

	f := func() {
		wg.Wait()
		s.UpdateStatus(storagestate.StatusReady.String())
	}
	enterrors.GoWrapper(f, s.index.logger)

	return err
}

// Shutdown stops the shard in order: dimension tracking (if enabled), prop
// length tracker, vector queues/indexes (flush before shutdown), cycle
// callbacks, and finally the LSM store. It returns on the first error.
func (s *Shard) Shutdown(ctx context.Context) error {
	if s.index.Config.TrackVectorDimensions {
		// tracking vector dimensions goroutine only works when tracking is enabled
		// that's why we are trying to stop it only in this case
		s.stopMetrics <- struct{}{}
	}

	var err error
	if err = s.GetPropertyLengthTracker().Close(); err != nil {
		return errors.Wrap(err, "close prop length tracker")
	}

	if s.hasTargetVectors() {
		// TODO run in parallel?
		for targetVector, queue := range s.queues {
			if err = queue.Close(); err != nil {
				return fmt.Errorf("shut down vector index queue of vector %q: %w", targetVector, err)
			}
		}
		for targetVector, vectorIndex := range s.vectorIndexes {
			if err = vectorIndex.Flush(); err != nil {
				return fmt.Errorf("flush vector index commitlog of vector %q: %w", targetVector, err)
			}
			if err = vectorIndex.Shutdown(ctx); err != nil {
				return fmt.Errorf("shut down vector index of vector %q: %w", targetVector, err)
			}
		}
	} else {
		if err = s.queue.Close(); err != nil {
			return errors.Wrap(err, "shut down vector index queue")
		}
		// to ensure that all commitlog entries are written to disk.
		// otherwise in some cases the tombstone cleanup process'
		// 'RemoveTombstone' entry is not picked up on restarts
		// resulting in perpetually attempting to remove a tombstone
		// which doesn't actually exist anymore
		if err = s.vectorIndex.Flush(); err != nil {
			return errors.Wrap(err, "flush vector index commitlog")
		}
		if err = s.vectorIndex.Shutdown(ctx); err != nil {
			return errors.Wrap(err, "shut down vector index")
		}
	}

	// unregister all callbacks at once, in parallel
	if err = cyclemanager.NewCombinedCallbackCtrl(0, s.index.logger,
		s.cycleCallbacks.compactionCallbacksCtrl,
		s.cycleCallbacks.flushCallbacksCtrl,
		s.cycleCallbacks.vectorCombinedCallbacksCtrl,
		s.cycleCallbacks.geoPropsCombinedCallbacksCtrl,
	).Unregister(ctx); err != nil {
		return err
	}

	if err = s.store.Shutdown(ctx); err != nil {
		return errors.Wrap(err, "stop lsmkv store")
	}

	return nil
}

// NotifyReady initializes the shard status and logs readiness.
func (s *Shard) NotifyReady() {
	s.initStatus()
	s.index.logger.
		WithField("action", "startup").
		Debugf("shard=%s is ready", s.name)
}

// ObjectCount returns the exact count at any moment
func (s *Shard) ObjectCount() int {
	b := s.store.Bucket(helpers.ObjectsBucketLSM)
	if b == nil {
		return 0
	}

	return b.Count()
}

// ObjectCountAsync returns the eventually consistent "async" count which is
// much cheaper to obtain
func (s *Shard) ObjectCountAsync() int {
	b := s.store.Bucket(helpers.ObjectsBucketLSM)
	if b == nil {
		return 0
	}

	return b.CountAsync()
}

// isFallbackToSearchable reports whether searchable buckets should stand in
// for missing filterable buckets (see the field comment on Shard).
func (s *Shard) isFallbackToSearchable() bool {
	return s.fallbackToSearchable
}

// tenant returns the tenant name for multi-tenant (partitioned) indexes; for
// non-partitioned indexes it returns "".
func (s *Shard) tenant() string {
	// TODO provide better impl
	if s.index.partitioningEnabled {
		return s.name
	}
	return ""
}

// shardId builds the canonical "<indexId>_<shardName>" identifier.
func shardId(indexId, shardName string) string {
	return fmt.Sprintf("%s_%s", indexId, shardName)
}

// shardPath builds the shard directory path under the index path.
func shardPath(indexPath, shardName string) string {
	return path.Join(indexPath, shardName)
}

// bucketKeyPropertyLength encodes a property length as a lexicographically
// sortable key for the length bucket.
func bucketKeyPropertyLength(length int) ([]byte, error) {
	return inverted.LexicographicallySortableInt64(int64(length))
}

// bucketKeyPropertyNull encodes the null/not-null state as a single-byte key
// for the null bucket.
func bucketKeyPropertyNull(isNull bool) ([]byte, error) {
	if isNull {
		return []byte{uint8(filters.InternalNullState)}, nil
	}
	return []byte{uint8(filters.InternalNotNullState)}, nil
}