github.com/weaviate/weaviate@v1.24.6/usecases/schema/manager.go

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package schema
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  	"sync"
    18  
    19  	enterrors "github.com/weaviate/weaviate/entities/errors"
    20  
    21  	"github.com/pkg/errors"
    22  	"github.com/sirupsen/logrus"
    23  	"github.com/weaviate/weaviate/entities/models"
    24  	"github.com/weaviate/weaviate/entities/replication"
    25  	"github.com/weaviate/weaviate/entities/schema"
    26  	"github.com/weaviate/weaviate/usecases/cluster"
    27  	"github.com/weaviate/weaviate/usecases/config"
    28  	"github.com/weaviate/weaviate/usecases/replica"
    29  	"github.com/weaviate/weaviate/usecases/scaler"
    30  	"github.com/weaviate/weaviate/usecases/schema/migrate"
    31  	"github.com/weaviate/weaviate/usecases/sharding"
    32  )
    33  
    34  // Manager manages schema changes at a use-case level, i.e. agnostic of
    35  // underlying databases or storage providers.
    36  type Manager struct {
    37  	migrator                migrate.Migrator
    38  	repo                    SchemaStore
    39  	callbacks               []func(updatedSchema schema.Schema)
    40  	logger                  logrus.FieldLogger
    41  	Authorizer              authorizer
    42  	config                  config.Config
    43  	vectorizerValidator     VectorizerValidator
    44  	moduleConfig            ModuleConfig
    45  	cluster                 *cluster.TxManager
    46  	clusterState            clusterState
    47  	configParser            VectorConfigParser
    48  	invertedConfigValidator InvertedConfigValidator
    49  	scaleOut                scaleOut
    50  	RestoreStatus           sync.Map
    51  	RestoreError            sync.Map
    52  	sync.RWMutex
    53  
    54  	// As outlined in [*cluster.TxManager.TryResumeDanglingTxs], the current
    55  	// implementation isn't perfect. It does not actually know whether a tx was
    56  	// meant to be committed or not. Instead we use a simple workaround: we check
    57  	// whether the schema is out of sync and only then try to resume transactions.
    58  	shouldTryToResumeTx bool
    59  
    60  	schemaCache
    61  }
    62  
    63  type VectorConfigParser func(in interface{}, vectorIndexType string) (schema.VectorIndexConfig, error)
    64  
    65  type InvertedConfigValidator func(in *models.InvertedIndexConfig) error
    66  
    67  type SchemaGetter interface {
    68  	GetSchemaSkipAuth() schema.Schema
    69  	Nodes() []string
    70  	NodeName() string
    71  	ClusterHealthScore() int
    72  	ResolveParentNodes(string, string) (map[string]string, error)
    73  
    74  	CopyShardingState(class string) *sharding.State
    75  	ShardOwner(class, shard string) (string, error)
    76  	TenantShard(class, tenant string) (string, string)
    77  	ShardFromUUID(class string, uuid []byte) string
    78  	ShardReplicas(class, shard string) ([]string, error)
    79  }
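// Illustrative usage sketch (an assumption, not part of the original file): a
// consumer of SchemaGetter might resolve which node owns a shard before routing
// a request to it. The class and shard names below are hypothetical.
//
//	owner, err := getter.ShardOwner("Article", "aZg3kLqW9bX1")
//	if err != nil {
//		return fmt.Errorf("resolve shard owner: %w", err)
//	}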
    80  
    81  type VectorizerValidator interface {
    82  	ValidateVectorizer(moduleName string) error
    83  }
    84  
    85  type ModuleConfig interface {
    86  	SetClassDefaults(class *models.Class)
    87  	SetSinglePropertyDefaults(class *models.Class, prop *models.Property)
    88  	ValidateClass(ctx context.Context, class *models.Class) error
    89  }
    90  
    91  // SchemaStore is responsible for persisting the schema
    92  // by providing support for both partial and complete schema updates
    93  type SchemaStore interface {
    94  	// Save saves the complete schema to the persistent storage
    95  	Save(ctx context.Context, schema State) error
    96  
    97  	// Load loads the complete schema from the persistent storage
    98  	Load(context.Context) (State, error)
    99  
   100  	// NewClass creates a new class if it doesn't exist; otherwise it returns an error
   101  	NewClass(context.Context, ClassPayload) error
   102  
   103  	// UpdateClass updates a class if it exists; otherwise it returns an error
   104  	UpdateClass(context.Context, ClassPayload) error
   105  
   106  	// DeleteClass deletes a class
   107  	DeleteClass(ctx context.Context, class string) error
   108  
   109  	// NewShards creates new shards of an existing class
   110  	NewShards(ctx context.Context, class string, shards []KeyValuePair) error
   111  
   112  	// UpdateShards updates (replaces) shards of an existing class.
   113  	// An error is returned if the class or a shard does not exist.
   114  	UpdateShards(ctx context.Context, class string, shards []KeyValuePair) error
   115  
   116  	// DeleteShards deletes shards from a class
   117  	// If the class or a shard does not exist then nothing is done and a nil error is returned
   118  	DeleteShards(ctx context.Context, class string, shards []string) error
   119  }
   120  
   121  // KeyValuePair is used to serialize shard updates
   122  type KeyValuePair struct {
   123  	Key   string
   124  	Value []byte
   125  }
   126  
   127  // ClassPayload is used to serialize class updates
   128  type ClassPayload struct {
   129  	Name          string
   130  	Metadata      []byte
   131  	ShardingState []byte
   132  	Shards        []KeyValuePair
   133  	ReplaceShards bool
   134  	Error         error
   135  }
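// Illustrative sketch (an assumption, not part of the original file): a caller
// persisting a newly created class might marshal the class metadata and its
// sharding state and hand both to the SchemaStore as a ClassPayload. The
// marshalled variables below are hypothetical.
//
//	payload := ClassPayload{
//		Name:          class.Class,
//		Metadata:      classJSON,      // e.g. the JSON-encoded *models.Class
//		ShardingState: shardStateJSON, // e.g. the JSON-encoded *sharding.State
//	}
//	if err := store.NewClass(ctx, payload); err != nil {
//		return fmt.Errorf("persist class %q: %w", class.Class, err)
//	}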
   136  
   137  type clusterState interface {
   138  	// Hostnames returns the hostnames used to initialize a broadcast
   139  	Hostnames() []string
   140  
   141  	// AllNames returns all node names; used to initialize shard distribution across nodes
   142  	AllNames() []string
   143  	Candidates() []string
   144  	LocalName() string
   145  	NodeCount() int
   146  	NodeHostname(nodeName string) (string, bool)
   147  
   148  	// ClusterHealthScore gets the whole cluster health; the lower the number, the better
   149  	ClusterHealthScore() int
   150  
   151  	SchemaSyncIgnored() bool
   152  	SkipSchemaRepair() bool
   153  }
   154  
   155  type scaleOut interface {
   156  	SetSchemaManager(sm scaler.SchemaManager)
   157  	Scale(ctx context.Context, className string,
   158  		updated sharding.Config, prevReplFactor, newReplFactor int64) (*sharding.State, error)
   159  }
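// Illustrative call shape (an assumption, not derived from this file): when a
// class's replication factor is raised, the schema manager can hand the updated
// sharding config to the scaler to produce a new sharding state. Variable names
// are hypothetical.
//
//	newState, err := m.scaleOut.Scale(ctx, class.Class,
//		updatedShardingConfig, oldFactor, newFactor)
//	if err != nil {
//		return fmt.Errorf("scale class %q: %w", class.Class, err)
//	}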
   160  
   161  // NewManager creates a new manager
   162  func NewManager(migrator migrate.Migrator, repo SchemaStore,
   163  	logger logrus.FieldLogger, authorizer authorizer, config config.Config,
   164  	configParser VectorConfigParser, vectorizerValidator VectorizerValidator,
   165  	invertedConfigValidator InvertedConfigValidator,
   166  	moduleConfig ModuleConfig, clusterState clusterState,
   167  	txClient cluster.Client, txPersistence cluster.Persistence,
   168  	scaleoutManager scaleOut,
   169  ) (*Manager, error) {
   170  	txBroadcaster := cluster.NewTxBroadcaster(clusterState, txClient, logger)
   171  	m := &Manager{
   172  		config:                  config,
   173  		migrator:                migrator,
   174  		repo:                    repo,
   175  		schemaCache:             schemaCache{State: State{}},
   176  		logger:                  logger,
   177  		Authorizer:              authorizer,
   178  		configParser:            configParser,
   179  		vectorizerValidator:     vectorizerValidator,
   180  		invertedConfigValidator: invertedConfigValidator,
   181  		moduleConfig:            moduleConfig,
   182  		cluster:                 cluster.NewTxManager(txBroadcaster, txPersistence, logger),
   183  		clusterState:            clusterState,
   184  		scaleOut:                scaleoutManager,
   185  	}
   186  
   187  	m.scaleOut.SetSchemaManager(m)
   188  
   189  	m.cluster.SetCommitFn(m.handleCommit)
   190  	m.cluster.SetResponseFn(m.handleTxResponse)
   191  	m.cluster.SetAllowUnready(allowUnreadyTxs)
   192  	txBroadcaster.SetConsensusFunction(newReadConsensus(m.parseConfigs, m.logger))
   193  
   194  	err := m.loadOrInitializeSchema(context.Background())
   195  	if err != nil {
   196  		return nil, fmt.Errorf("could not load or initialize schema: %v", err)
   197  	}
   198  
   199  	return m, nil
   200  }
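// Illustrative wiring sketch (an assumption, not part of the original file):
// NewManager is typically constructed once at startup with concrete
// implementations of its dependencies; all variables below are placeholders.
//
//	mgr, err := schema.NewManager(migrator, repo, logger, authorizer, cfg,
//		configParser, vectorizerValidator, invertedConfigValidator,
//		moduleConfig, clusterState, txClient, txPersistence, scaleoutManager)
//	if err != nil {
//		return fmt.Errorf("init schema manager: %w", err)
//	}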
   201  
   202  func (m *Manager) Shutdown(ctx context.Context) error {
   203  	allCommitsDone := make(chan struct{})
   204  	enterrors.GoWrapper(func() {
   205  		m.cluster.Shutdown()
   206  		allCommitsDone <- struct{}{}
   207  	}, m.logger)
   208  
   209  	select {
   210  	case <-ctx.Done():
   211  		return fmt.Errorf("waiting for transactions to commit: %w", ctx.Err())
   212  	case <-allCommitsDone:
   213  		return nil
   214  	}
   215  }
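// Illustrative shutdown sketch (an assumption, not part of the original file):
// callers typically bound the wait for open transactions with a context
// deadline; the timeout value below is arbitrary.
//
//	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
//	defer cancel()
//	if err := mgr.Shutdown(ctx); err != nil {
//		logger.WithError(err).Error("schema manager did not shut down cleanly")
//	}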
   216  
   217  func (m *Manager) TxManager() *cluster.TxManager {
   218  	return m.cluster
   219  }
   220  
   221  type authorizer interface {
   222  	Authorize(principal *models.Principal, verb, resource string) error
   223  }
   224  
   225  func (m *Manager) saveSchema(ctx context.Context, st State) error {
   226  	m.logger.
   227  		WithField("action", "schema.save").
   228  		Debug("saving updated schema to configuration store")
   229  
   230  	if err := m.repo.Save(ctx, st); err != nil {
   231  		return err
   232  	}
   233  	m.triggerSchemaUpdateCallbacks()
   234  	return nil
   235  }
   236  
   237  // RegisterSchemaUpdateCallback allows other use cases to register a schema
   238  // update callback. The callbacks will be called any time we persist a
   239  // schema update.
   240  func (m *Manager) RegisterSchemaUpdateCallback(callback func(updatedSchema schema.Schema)) {
   241  	m.callbacks = append(m.callbacks, callback)
   242  }
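// Illustrative usage sketch (an assumption, not part of the original file):
// another use case can subscribe to schema updates, e.g. to refresh a local
// cache whenever the schema is persisted. The cache and its Refresh method are
// hypothetical.
//
//	mgr.RegisterSchemaUpdateCallback(func(updated schema.Schema) {
//		cache.Refresh(updated)
//	})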
   243  
   244  func (m *Manager) triggerSchemaUpdateCallbacks() {
   245  	schema := m.getSchema()
   246  
   247  	for _, cb := range m.callbacks {
   248  		cb(schema)
   249  	}
   250  }
   251  
   252  func (m *Manager) loadOrInitializeSchema(ctx context.Context) error {
   253  	localSchema, err := m.repo.Load(ctx)
   254  	if err != nil {
   255  		return fmt.Errorf("could not load schema: %v", err)
   256  	}
   257  	if err := m.parseConfigs(ctx, &localSchema); err != nil {
   258  		return errors.Wrap(err, "load schema")
   259  	}
   260  
   261  	if err := m.migrateSchemaIfNecessary(ctx, &localSchema); err != nil {
   262  		return fmt.Errorf("migrate schema: %w", err)
   263  	}
   264  
   265  	// There was a bug that allowed adding the same prop multiple times. This
   266  	// leads to a race at startup. If an instance is already affected by this,
   267  	// this step can remove the duplicate ones.
   268  	//
   269  	// See https://github.com/weaviate/weaviate/issues/2609
   270  	for _, c := range localSchema.ObjectSchema.Classes {
   271  		c.Properties = m.deduplicateProps(c.Properties, c.Class)
   272  	}
   273  
   274  	// set internal state since it is used by startupClusterSync
   275  	m.schemaCache.setState(localSchema)
   276  
   277  	// make sure that all migrations have completed before checking sync,
   278  	// otherwise two identical schemas might fail the check based on form rather
   279  	// than content
   280  
   281  	if err := m.startupClusterSync(ctx); err != nil {
   282  		return errors.Wrap(err, "sync schema with other nodes in the cluster")
   283  	}
   284  
   285  	// store in persistent storage
   286  	// TODO: investigate if save() is redundant because it is called in startupClusterSync()
   287  	err = m.RLockGuard(func() error { return m.repo.Save(ctx, m.schemaCache.State) })
   288  	if err != nil {
   289  		return fmt.Errorf("store to persistent storage: %v", err)
   290  	}
   291  
   292  	return nil
   293  }
   294  
   295  // StartServing indicates that the schema manager is ready to accept incoming
   296  // connections in cluster mode, i.e. it will accept requests to open transactions.
   297  //
   298  // Some transactions are exempt, such as ReadSchema, which is required for nodes
   299  // to start up.
   300  //
   301  // This method should be called when all backends, primarily the DB, are ready
   302  // to serve.
   303  func (m *Manager) StartServing(ctx context.Context) error {
   304  	if err := m.resumeDanglingTransactions(ctx); err != nil {
   305  		return err
   306  	}
   307  
   308  	// only start accepting incoming connections when dangling txs have been
   309  	// resumed, otherwise there is potential for conflict
   310  	m.cluster.StartAcceptIncoming()
   311  
   312  	return nil
   313  }
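// Illustrative startup ordering (an assumption, not part of the original file):
// StartServing should only be called once the DB and other backends are ready,
// because resuming dangling transactions may trigger side effects against them.
// The readiness check below is hypothetical.
//
//	if err := db.WaitForStartup(ctx); err != nil {
//		return fmt.Errorf("db not ready: %w", err)
//	}
//	if err := mgr.StartServing(ctx); err != nil {
//		return fmt.Errorf("start schema manager: %w", err)
//	}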
   314  
   315  // resumeDanglingTransactions iterates over any transactions that may have been
   316  // left dangling after a restart and tries to commit them if appropriate.
   317  //
   318  // This can only be called when all areas responding to side effects of
   319  // committing a transaction are ready. In practice this means the DB must be
   320  // ready before this method is called.
   321  func (m *Manager) resumeDanglingTransactions(ctx context.Context) error {
   322  	var shouldResume bool
   323  	m.RLockGuard(func() error {
   324  		shouldResume = m.shouldTryToResumeTx
   325  		return nil
   326  	})
   327  
   328  	if !shouldResume {
   329  		// nothing to do for us
   330  		return nil
   331  	}
   332  
   333  	ok, err := m.cluster.TryResumeDanglingTxs(ctx, resumableTxs)
   334  	if err != nil {
   335  		return fmt.Errorf("try resuming dangling transactions: %w", err)
   336  	}
   337  
   338  	if !ok {
   339  		// no tx was applied, we are done
   340  		return nil
   341  	}
   342  
   343  	// a tx was applied, which means the previous schema check was skipped; we
   344  	// now need to check the schema again
   345  	err = m.validateSchemaCorruption(ctx)
   346  	if err == nil {
   347  		// all is fine, continue as normal
   348  		return nil
   349  	}
   350  
   351  	if m.clusterState.SchemaSyncIgnored() {
   352  		m.logger.WithError(err).WithFields(logrusStartupSyncFields()).
   353  			Warning("schema out of sync, but ignored because " +
   354  				"CLUSTER_IGNORE_SCHEMA_SYNC=true")
   355  		return nil
   356  	}
   357  
   358  	return fmt.Errorf(
   359  		"applied dangling tx, but schema still out of sync: %w", err)
   360  }
   361  
   362  func (m *Manager) migrateSchemaIfNecessary(ctx context.Context, localSchema *State) error {
   363  	// introduced when Weaviate started supporting multiple shards per class in v1.8
   364  	if err := m.checkSingleShardMigration(ctx, localSchema); err != nil {
   365  		return errors.Wrap(err, "migrating sharding state from previous version")
   366  	}
   367  
   368  	// introduced when Weaviate started supporting replication in v1.17
   369  	if err := m.checkShardingStateForReplication(ctx, localSchema); err != nil {
   370  		return errors.Wrap(err, "migrating sharding state from previous version (before replication)")
   371  	}
   372  
   373  	// if other migrations become necessary in the future, you can add them here.
   374  	return nil
   375  }
   376  
   377  func (m *Manager) checkSingleShardMigration(ctx context.Context, localSchema *State) error {
   378  	for _, c := range localSchema.ObjectSchema.Classes {
   379  		if _, ok := localSchema.ShardingState[c.Class]; ok { // there is sharding state for this class. Nothing to do
   380  			continue
   381  		}
   382  
   383  		m.logger.WithField("className", c.Class).WithField("action", "initialize_schema").
   384  			Warningf("No sharding state found for class %q, initializing new state. "+
   385  				"This is expected behavior if the schema was created with an older Weaviate "+
   386  				"version, prior to supporting multi-shard indices.", c.Class)
   387  
   388  		// there is no sharding state for this class, let's create the correct
   389  		// config. This class must have been created prior to the sharding feature,
   390  		// so we know that shardCount==1 - we do not care about any of the other
   391  		// parameters and simply use the defaults for those
   392  		c.ShardingConfig = map[string]interface{}{
   393  			"desiredCount": 1,
   394  		}
   395  		if err := m.parseShardingConfig(ctx, c); err != nil {
   396  			return err
   397  		}
   398  
   399  		if err := replica.ValidateConfig(c, m.config.Replication); err != nil {
   400  			return fmt.Errorf("validate replication config: %w", err)
   401  		}
   402  		shardState, err := sharding.InitState(c.Class,
   403  			c.ShardingConfig.(sharding.Config),
   404  			m.clusterState, c.ReplicationConfig.Factor,
   405  			schema.MultiTenancyEnabled(c))
   406  		if err != nil {
   407  			return errors.Wrap(err, "init sharding state")
   408  		}
   409  
   410  		if localSchema.ShardingState == nil {
   411  			localSchema.ShardingState = map[string]*sharding.State{}
   412  		}
   413  		localSchema.ShardingState[c.Class] = shardState
   414  
   415  	}
   416  
   417  	return nil
   418  }
   419  
   420  func (m *Manager) checkShardingStateForReplication(ctx context.Context, localSchema *State) error {
   421  	for _, classState := range localSchema.ShardingState {
   422  		classState.MigrateFromOldFormat()
   423  	}
   424  	return nil
   425  }
   426  
   427  func newSchema() *State {
   428  	return &State{
   429  		ObjectSchema: &models.Schema{
   430  			Classes: []*models.Class{},
   431  		},
   432  		ShardingState: map[string]*sharding.State{},
   433  	}
   434  }
   435  
   436  func (m *Manager) parseConfigs(ctx context.Context, schema *State) error {
   437  	for _, class := range schema.ObjectSchema.Classes {
   438  		for _, prop := range class.Properties {
   439  			setPropertyDefaults(prop)
   440  			migratePropertySettings(prop)
   441  		}
   442  
   443  		if err := m.parseVectorIndexConfig(ctx, class); err != nil {
   444  			return errors.Wrapf(err, "class %s: vector index config", class.Class)
   445  		}
   446  
   447  		if err := m.parseShardingConfig(ctx, class); err != nil {
   448  			return errors.Wrapf(err, "class %s: sharding config", class.Class)
   449  		}
   450  
   451  		// Pass a dummy replication config with minimum factor 1. Otherwise the
   452  		// setting is not backward-compatible. The user may have created a class
   453  		// with factor=1 before the change was introduced, and their setup would no
   454  		// longer start up if the required minimum were higher than 1. We want
   455  		// the required minimum to only apply to newly created classes - not block
   456  		// loading existing ones.
   457  		if err := replica.ValidateConfig(class, replication.GlobalConfig{MinimumFactor: 1}); err != nil {
   458  			return fmt.Errorf("replication config: %w", err)
   459  		}
   460  	}
   461  	m.schemaCache.LockGuard(func() {
   462  		for _, shardState := range schema.ShardingState {
   463  			shardState.SetLocalName(m.clusterState.LocalName())
   464  		}
   465  	})
   466  
   467  	return nil
   468  }