github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted_reindexer.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package db
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  
    18  	"github.com/pkg/errors"
    19  	"github.com/sirupsen/logrus"
    20  	"github.com/weaviate/weaviate/adapters/repos/db/helpers"
    21  	"github.com/weaviate/weaviate/adapters/repos/db/inverted"
    22  	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv"
    23  	"github.com/weaviate/weaviate/entities/models"
    24  	"github.com/weaviate/weaviate/entities/schema"
    25  	"github.com/weaviate/weaviate/entities/storagestate"
    26  	"github.com/weaviate/weaviate/entities/storobj"
    27  )
    28  
// ShardInvertedReindexTask describes one reindexing job that a
// ShardInvertedReindexer runs against its shard.
type ShardInvertedReindexTask interface {
	// GetPropertiesToReindex returns the properties (together with the index
	// type and desired bucket settings) that should be rebuilt for the given
	// shard. When it returns an empty list the reindexer does nothing for
	// this task.
	GetPropertiesToReindex(ctx context.Context, shard ShardLike,
	) ([]ReindexableProperty, error)
	// OnPostResumeStore is called by the reindexer after compaction has been
	// resumed and the buckets' status has been set back to ready.
	//
	// Right now only OnPostResumeStore is needed, but in the future more
	// callbacks could be added
	// (like OnPrePauseStore, OnPostPauseStore, OnPreResumeStore, etc)
	OnPostResumeStore(ctx context.Context, shard ShardLike) error
}
    37  
// ReindexableProperty identifies one inverted index of one property that
// should be rebuilt, and how the rebuilt bucket should be created.
type ReindexableProperty struct {
	// PropertyName is the property whose index is rebuilt; together with
	// IndexType it determines the name of the bucket.
	PropertyName string
	// IndexType selects which of the property's indexes is rebuilt
	// (value, searchable value, length or null state).
	IndexType PropertyIndexType
	// NewIndex marks an index that does not exist yet: the temporary bucket
	// is renamed into place instead of replacing an existing bucket.
	NewIndex bool // is new index, there is no bucket to replace with
	// DesiredStrategy is the lsmkv strategy the temporary bucket is created with.
	DesiredStrategy string
	// BucketOptions are additional options passed when creating the
	// temporary bucket.
	BucketOptions []lsmkv.BucketOption
}
    45  
// ShardInvertedReindexer rebuilds inverted index buckets of a single shard by
// running the registered tasks one after another.
type ShardInvertedReindexer struct {
	// logger receives the debug and error messages emitted while reindexing.
	logger logrus.FieldLogger
	// shard is the shard whose store is reindexed.
	shard ShardLike

	// tasks are executed by Do in the order they were added.
	tasks []ShardInvertedReindexTask
	// class is the schema class looked up for the shard's index when the
	// reindexer was constructed.
	class *models.Class
}
    53  
    54  func NewShardInvertedReindexer(shard ShardLike, logger logrus.FieldLogger) *ShardInvertedReindexer {
    55  	class, _ := schema.GetClassByName(shard.Index().getSchema.GetSchemaSkipAuth().Objects,
    56  		shard.Index().Config.ClassName.String())
    57  
    58  	return &ShardInvertedReindexer{
    59  		logger: logger,
    60  		shard:  shard,
    61  		tasks:  []ShardInvertedReindexTask{},
    62  		class:  class,
    63  	}
    64  }
    65  
// AddTask registers a task to be executed by Do. Tasks run in the order in
// which they were added.
func (r *ShardInvertedReindexer) AddTask(task ShardInvertedReindexTask) {
	r.tasks = append(r.tasks, task)
}
    69  
    70  func (r *ShardInvertedReindexer) Do(ctx context.Context) error {
    71  	for _, task := range r.tasks {
    72  		if err := r.checkContextExpired(ctx, "remaining tasks skipped due to context canceled"); err != nil {
    73  			return err
    74  		}
    75  		if err := r.doTask(ctx, task); err != nil {
    76  			return err
    77  		}
    78  	}
    79  	return nil
    80  }
    81  
    82  func (r *ShardInvertedReindexer) doTask(ctx context.Context, task ShardInvertedReindexTask) error {
    83  	reindexProperties, err := task.GetPropertiesToReindex(ctx, r.shard)
    84  	if err != nil {
    85  		r.logError(err, "failed getting reindex properties")
    86  		return errors.Wrapf(err, "failed getting reindex properties")
    87  	}
    88  	if len(reindexProperties) == 0 {
    89  		r.logger.
    90  			WithField("action", "inverted reindex").
    91  			WithField("index", r.shard.Index().ID()).
    92  			WithField("shard", r.shard.ID()).
    93  			Debug("no properties to reindex")
    94  		return nil
    95  	}
    96  
    97  	if err := r.checkContextExpired(ctx, "pausing store stopped due to context canceled"); err != nil {
    98  		return err
    99  	}
   100  
   101  	if err := r.pauseStoreActivity(ctx); err != nil {
   102  		r.logError(err, "failed pausing store activity")
   103  		return err
   104  	}
   105  
   106  	bucketsToReindex := make([]string, len(reindexProperties))
   107  	for i, reindexProperty := range reindexProperties {
   108  		if err := r.checkContextExpired(ctx, "creating temp buckets stopped due to context canceled"); err != nil {
   109  			return err
   110  		}
   111  
   112  		if !isIndexTypeSupportedByStrategy(reindexProperty.IndexType, reindexProperty.DesiredStrategy) {
   113  			err := fmt.Errorf("strategy '%s' is not supported for given index type '%d",
   114  				reindexProperty.DesiredStrategy, reindexProperty.IndexType)
   115  			r.logError(err, "invalid strategy")
   116  			return err
   117  		}
   118  
   119  		// TODO verify if property indeed need reindex before creating buckets
   120  		// (is filterable / is searchable / null or prop length index enabled)
   121  		bucketsToReindex[i] = r.bucketName(reindexProperty.PropertyName, reindexProperty.IndexType)
   122  		if err := r.createTempBucket(ctx, bucketsToReindex[i], reindexProperty.DesiredStrategy,
   123  			reindexProperty.BucketOptions...); err != nil {
   124  			r.logError(err, "failed creating temporary bucket")
   125  			return err
   126  		}
   127  		r.logger.
   128  			WithField("action", "inverted reindex").
   129  			WithField("shard", r.shard.Name()).
   130  			WithField("property", reindexProperty.PropertyName).
   131  			WithField("strategy", reindexProperty.DesiredStrategy).
   132  			WithField("index_type", reindexProperty.IndexType).
   133  			Debug("created temporary bucket")
   134  	}
   135  
   136  	if err := r.reindexProperties(ctx, reindexProperties); err != nil {
   137  		r.logError(err, "failed reindexing properties")
   138  		return errors.Wrapf(err, "failed reindexing properties on shard '%s'", r.shard.Name())
   139  	}
   140  
   141  	for i := range bucketsToReindex {
   142  		if err := r.checkContextExpired(ctx, "replacing buckets stopped due to context canceled"); err != nil {
   143  			return err
   144  		}
   145  		tempBucketName := helpers.TempBucketFromBucketName(bucketsToReindex[i])
   146  		tempBucket := r.shard.Store().Bucket(tempBucketName)
   147  		tempBucket.FlushMemtable()
   148  		tempBucket.UpdateStatus(storagestate.StatusReadOnly)
   149  
   150  		if reindexProperties[i].NewIndex {
   151  			if err := r.shard.Store().RenameBucket(ctx, tempBucketName, bucketsToReindex[i]); err != nil {
   152  				r.logError(err, "failed renaming buckets")
   153  				return err
   154  			}
   155  
   156  			r.logger.
   157  				WithField("action", "inverted reindex").
   158  				WithField("shard", r.shard.Name()).
   159  				WithField("bucket", bucketsToReindex[i]).
   160  				WithField("temp_bucket", tempBucketName).
   161  				Debug("renamed bucket")
   162  		} else {
   163  			if err := r.shard.Store().ReplaceBuckets(ctx, bucketsToReindex[i], tempBucketName); err != nil {
   164  				r.logError(err, "failed replacing buckets")
   165  				return err
   166  			}
   167  
   168  			r.logger.
   169  				WithField("action", "inverted reindex").
   170  				WithField("shard", r.shard.Name()).
   171  				WithField("bucket", bucketsToReindex[i]).
   172  				WithField("temp_bucket", tempBucketName).
   173  				Debug("replaced buckets")
   174  		}
   175  	}
   176  
   177  	if err := r.checkContextExpired(ctx, "resuming store stopped due to context canceled"); err != nil {
   178  		return err
   179  	}
   180  
   181  	if err := r.resumeStoreActivity(ctx, task); err != nil {
   182  		r.logError(err, "failed resuming store activity")
   183  		return err
   184  	}
   185  
   186  	return nil
   187  }
   188  
   189  func (r *ShardInvertedReindexer) pauseStoreActivity(ctx context.Context) error {
   190  	if err := r.shard.Store().PauseCompaction(ctx); err != nil {
   191  		return errors.Wrapf(err, "failed pausing compaction for shard '%s'", r.shard.Name())
   192  	}
   193  	if err := r.shard.Store().FlushMemtables(ctx); err != nil {
   194  		return errors.Wrapf(err, "failed flushing memtables for shard '%s'", r.shard.Name())
   195  	}
   196  	r.shard.Store().UpdateBucketsStatus(storagestate.StatusReadOnly)
   197  
   198  	r.logger.
   199  		WithField("action", "inverted reindex").
   200  		WithField("shard", r.shard.Name()).
   201  		Debug("paused store activity")
   202  
   203  	return nil
   204  }
   205  
   206  func (r *ShardInvertedReindexer) resumeStoreActivity(ctx context.Context, task ShardInvertedReindexTask) error {
   207  	if err := r.shard.Store().ResumeCompaction(ctx); err != nil {
   208  		return errors.Wrapf(err, "failed resuming compaction for shard '%s'", r.shard.Name())
   209  	}
   210  	r.shard.Store().UpdateBucketsStatus(storagestate.StatusReady)
   211  	if err := task.OnPostResumeStore(ctx, r.shard); err != nil {
   212  		return errors.Wrap(err, "failed OnPostResumeStore")
   213  	}
   214  
   215  	r.logger.
   216  		WithField("action", "inverted reindex").
   217  		WithField("shard", r.shard.Name()).
   218  		Debug("resumed store activity")
   219  
   220  	return nil
   221  }
   222  
   223  func (r *ShardInvertedReindexer) createTempBucket(ctx context.Context, name string,
   224  	strategy string, options ...lsmkv.BucketOption,
   225  ) error {
   226  	tempName := helpers.TempBucketFromBucketName(name)
   227  	bucketOptions := append(options, lsmkv.WithStrategy(strategy))
   228  
   229  	if err := r.shard.Store().CreateBucket(ctx, tempName, bucketOptions...); err != nil {
   230  		return errors.Wrapf(err, "failed creating temp bucket '%s'", tempName)
   231  	}
   232  	return nil
   233  }
   234  
   235  func (r *ShardInvertedReindexer) reindexProperties(ctx context.Context, reindexableProperties []ReindexableProperty) error {
   236  	checker := newReindexablePropertyChecker(reindexableProperties, r.class)
   237  	objectsBucket := r.shard.Store().Bucket(helpers.ObjectsBucketLSM)
   238  
   239  	r.logger.
   240  		WithField("action", "inverted reindex").
   241  		WithField("shard", r.shard.Name()).
   242  		Debug("starting populating indexes")
   243  
   244  	i := 0
   245  	if err := objectsBucket.IterateObjects(ctx, func(object *storobj.Object) error {
   246  		// check context expired every 100k objects
   247  		if i%100_000 == 0 && i != 0 {
   248  			if err := r.checkContextExpired(ctx, "iterating through objects stopped due to context canceled"); err != nil {
   249  				return err
   250  			}
   251  			r.logger.
   252  				WithField("action", "inverted reindex").
   253  				WithField("shard", r.shard.Name()).
   254  				Debugf("iterating through objects: %d done", i)
   255  		}
   256  		docID := object.DocID
   257  		properties, nilProperties, err := r.shard.AnalyzeObject(object)
   258  		if err != nil {
   259  			return errors.Wrapf(err, "failed analyzying object")
   260  		}
   261  
   262  		for _, property := range properties {
   263  			if err := r.handleProperty(ctx, checker, docID, property); err != nil {
   264  				return errors.Wrapf(err, "failed reindexing property '%s' of object '%d'", property.Name, docID)
   265  			}
   266  		}
   267  		for _, nilProperty := range nilProperties {
   268  			if err := r.handleNilProperty(ctx, checker, docID, nilProperty); err != nil {
   269  				return errors.Wrapf(err, "failed reindexing property '%s' of object '%d'", nilProperty.Name, docID)
   270  			}
   271  		}
   272  
   273  		i++
   274  		return nil
   275  	}); err != nil {
   276  		return err
   277  	}
   278  
   279  	r.logger.
   280  		WithField("action", "inverted reindex").
   281  		WithField("shard", r.shard.Name()).
   282  		Debugf("iterating through objects: %d done", i)
   283  
   284  	return nil
   285  }
   286  
   287  func (r *ShardInvertedReindexer) handleProperty(ctx context.Context, checker *reindexablePropertyChecker,
   288  	docID uint64, property inverted.Property,
   289  ) error {
   290  	reindexablePropValue := checker.isReindexable(property.Name, IndexTypePropValue)
   291  	reindexablePropSearchableValue := checker.isReindexable(property.Name, IndexTypePropSearchableValue)
   292  
   293  	if reindexablePropValue || reindexablePropSearchableValue {
   294  		schemaProp := checker.getSchemaProp(property.Name)
   295  
   296  		var bucketValue, bucketSearchableValue *lsmkv.Bucket
   297  
   298  		if reindexablePropValue {
   299  			bucketValue = r.tempBucket(property.Name, IndexTypePropValue)
   300  			if bucketValue == nil {
   301  				return fmt.Errorf("no bucket for prop '%s' value found", property.Name)
   302  			}
   303  		}
   304  		if reindexablePropSearchableValue {
   305  			bucketSearchableValue = r.tempBucket(property.Name, IndexTypePropSearchableValue)
   306  			if bucketSearchableValue == nil {
   307  				return fmt.Errorf("no bucket searchable for prop '%s' value found", property.Name)
   308  			}
   309  		}
   310  
   311  		propLen := float32(len(property.Items))
   312  		for _, item := range property.Items {
   313  			key := item.Data
   314  			if reindexablePropSearchableValue && inverted.HasSearchableIndex(schemaProp) {
   315  				pair := r.shard.pairPropertyWithFrequency(docID, item.TermFrequency, propLen)
   316  				if err := r.shard.addToPropertyMapBucket(bucketSearchableValue, pair, key); err != nil {
   317  					return errors.Wrapf(err, "failed adding to prop '%s' value bucket", property.Name)
   318  				}
   319  			}
   320  			if reindexablePropValue && inverted.HasFilterableIndex(schemaProp) {
   321  				if err := r.shard.addToPropertySetBucket(bucketValue, docID, key); err != nil {
   322  					return errors.Wrapf(err, "failed adding to prop '%s' value bucket", property.Name)
   323  				}
   324  			}
   325  		}
   326  	}
   327  
   328  	// add non-nil properties to the null-state inverted index,
   329  	// but skip internal properties (__meta_count, _id etc)
   330  	if isMetaCountProperty(property) || isInternalProperty(property) {
   331  		return nil
   332  	}
   333  
   334  	// properties where defining a length does not make sense (floats etc.) have a negative entry as length
   335  	if r.shard.Index().invertedIndexConfig.IndexPropertyLength && property.Length >= 0 {
   336  		key, err := bucketKeyPropertyLength(property.Length)
   337  		if err != nil {
   338  			return errors.Wrapf(err, "failed creating key for prop '%s' length", property.Name)
   339  		}
   340  		if checker.isReindexable(property.Name, IndexTypePropLength) {
   341  			bucketLength := r.tempBucket(property.Name, IndexTypePropLength)
   342  			if bucketLength == nil {
   343  				return fmt.Errorf("no bucket for prop '%s' length found", property.Name)
   344  			}
   345  			if err := r.shard.addToPropertySetBucket(bucketLength, docID, key); err != nil {
   346  				return errors.Wrapf(err, "failed adding to prop '%s' length bucket", property.Name)
   347  			}
   348  		}
   349  	}
   350  
   351  	if r.shard.Index().invertedIndexConfig.IndexNullState {
   352  		key, err := bucketKeyPropertyNull(property.Length == 0)
   353  		if err != nil {
   354  			return errors.Wrapf(err, "failed creating key for prop '%s' null", property.Name)
   355  		}
   356  		if checker.isReindexable(property.Name, IndexTypePropNull) {
   357  			bucketNull := r.tempBucket(property.Name, IndexTypePropNull)
   358  			if bucketNull == nil {
   359  				return fmt.Errorf("no bucket for prop '%s' null found", property.Name)
   360  			}
   361  			if err := r.shard.addToPropertySetBucket(bucketNull, docID, key); err != nil {
   362  				return errors.Wrapf(err, "failed adding to prop '%s' null bucket", property.Name)
   363  			}
   364  		}
   365  	}
   366  
   367  	return nil
   368  }
   369  
   370  func (r *ShardInvertedReindexer) handleNilProperty(ctx context.Context, checker *reindexablePropertyChecker,
   371  	docID uint64, nilProperty inverted.NilProperty,
   372  ) error {
   373  	if r.shard.Index().invertedIndexConfig.IndexPropertyLength && nilProperty.AddToPropertyLength {
   374  		key, err := bucketKeyPropertyLength(0)
   375  		if err != nil {
   376  			return errors.Wrapf(err, "failed creating key for prop '%s' length", nilProperty.Name)
   377  		}
   378  		if checker.isReindexable(nilProperty.Name, IndexTypePropLength) {
   379  			bucketLength := r.tempBucket(nilProperty.Name, IndexTypePropLength)
   380  			if bucketLength == nil {
   381  				return fmt.Errorf("no bucket for prop '%s' length found", nilProperty.Name)
   382  			}
   383  			if err := r.shard.addToPropertySetBucket(bucketLength, docID, key); err != nil {
   384  				return errors.Wrapf(err, "failed adding to prop '%s' length bucket", nilProperty.Name)
   385  			}
   386  		}
   387  	}
   388  
   389  	if r.shard.Index().invertedIndexConfig.IndexNullState {
   390  		key, err := bucketKeyPropertyNull(true)
   391  		if err != nil {
   392  			return errors.Wrapf(err, "failed creating key for prop '%s' null", nilProperty.Name)
   393  		}
   394  		if checker.isReindexable(nilProperty.Name, IndexTypePropNull) {
   395  			bucketNull := r.tempBucket(nilProperty.Name, IndexTypePropNull)
   396  			if bucketNull == nil {
   397  				return fmt.Errorf("no bucket for prop '%s' null found", nilProperty.Name)
   398  			}
   399  			if err := r.shard.addToPropertySetBucket(bucketNull, docID, key); err != nil {
   400  				return errors.Wrapf(err, "failed adding to prop '%s' null bucket", nilProperty.Name)
   401  			}
   402  		}
   403  	}
   404  
   405  	return nil
   406  }
   407  
   408  func (r *ShardInvertedReindexer) bucketName(propName string, indexType PropertyIndexType) string {
   409  	checkSupportedPropertyIndexType(indexType)
   410  
   411  	switch indexType {
   412  	case IndexTypePropValue:
   413  		return helpers.BucketFromPropNameLSM(propName)
   414  	case IndexTypePropSearchableValue:
   415  		return helpers.BucketSearchableFromPropNameLSM(propName)
   416  	case IndexTypePropLength:
   417  		return helpers.BucketFromPropNameLengthLSM(propName)
   418  	case IndexTypePropNull:
   419  		return helpers.BucketFromPropNameNullLSM(propName)
   420  	default:
   421  		return ""
   422  	}
   423  }
   424  
   425  func (r *ShardInvertedReindexer) tempBucket(propName string, indexType PropertyIndexType) *lsmkv.Bucket {
   426  	tempBucketName := helpers.TempBucketFromBucketName(r.bucketName(propName, indexType))
   427  	return r.shard.Store().Bucket(tempBucketName)
   428  }
   429  
   430  func (r *ShardInvertedReindexer) checkContextExpired(ctx context.Context, msg string) error {
   431  	if ctx.Err() != nil {
   432  		r.logError(ctx.Err(), msg)
   433  		return errors.Wrapf(ctx.Err(), msg)
   434  	}
   435  	return nil
   436  }
   437  
   438  func (r *ShardInvertedReindexer) logError(err error, msg string, args ...interface{}) {
   439  	r.logger.
   440  		WithField("action", "inverted reindex").
   441  		WithField("shard", r.shard.Name()).
   442  		WithError(err).
   443  		Errorf(msg, args...)
   444  }