vitess.io/vitess@v0.16.2/go/vt/schemamanager/tablet_executor.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package schemamanager
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"vitess.io/vitess/go/sync2"
    26  	"vitess.io/vitess/go/timer"
    27  	"vitess.io/vitess/go/vt/logutil"
    28  	"vitess.io/vitess/go/vt/schema"
    29  	"vitess.io/vitess/go/vt/sqlparser"
    30  	"vitess.io/vitess/go/vt/topo"
    31  	"vitess.io/vitess/go/vt/vtctl/schematools"
    32  	"vitess.io/vitess/go/vt/vterrors"
    33  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    34  
    35  	querypb "vitess.io/vitess/go/vt/proto/query"
    36  	tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata"
    37  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    38  )
    39  
// TabletExecutor applies schema changes to all tablets.
//
// An executor starts out closed (see NewTabletExecutor); Open resolves one
// primary tablet per shard of a keyspace, after which Validate and Execute
// may be used. Close returns it to the closed state.
type TabletExecutor struct {
	// migrationContext is handed to schema.NewOnlineDDLs / schema.NewOnlineDDL
	// when online DDL requests are built.
	migrationContext     string
	// ts is the topology server used to list shards, look up primary tablets
	// and lock the keyspace.
	ts                   *topo.Server
	// tmc is the tablet manager client used for every call made to a tablet.
	tmc                  tmclient.TabletManagerClient
	// logger receives warnings, informational messages and submitted UUIDs.
	logger               logutil.Logger
	// tablets holds one primary tablet per shard; populated by Open and
	// cleared by Close.
	tablets              []*topodatapb.Tablet
	// isClosed is true until Open succeeds and again after Close.
	isClosed             bool
	// allowBigSchemaChange makes Validate accept big schema changes (with a
	// warning) instead of returning an error.
	allowBigSchemaChange bool
	// keyspace is the keyspace name given to Open.
	keyspace             string
	// waitReplicasTimeout bounds the schema reload step at the end of Execute.
	waitReplicasTimeout  time.Duration
	// ddlStrategySetting is the parsed DDL strategy; nil until SetDDLStrategy
	// has been called successfully.
	ddlStrategySetting   *schema.DDLStrategySetting
	// uuids is the optional list of caller-provided migration UUIDs, matched
	// to the SQL statements by index in Execute.
	uuids                []string
	// skipPreflight disables the PreflightSchema call made by Execute.
	skipPreflight        bool
}
    55  
    56  // NewTabletExecutor creates a new TabletExecutor instance
    57  func NewTabletExecutor(migrationContext string, ts *topo.Server, tmc tmclient.TabletManagerClient, logger logutil.Logger, waitReplicasTimeout time.Duration) *TabletExecutor {
    58  	return &TabletExecutor{
    59  		ts:                   ts,
    60  		tmc:                  tmc,
    61  		logger:               logger,
    62  		isClosed:             true,
    63  		allowBigSchemaChange: false,
    64  		waitReplicasTimeout:  waitReplicasTimeout,
    65  		migrationContext:     migrationContext,
    66  	}
    67  }
    68  
    69  // AllowBigSchemaChange changes TabletExecutor such that big schema changes
    70  // will no longer be rejected.
    71  func (exec *TabletExecutor) AllowBigSchemaChange() {
    72  	exec.allowBigSchemaChange = true
    73  }
    74  
    75  // DisallowBigSchemaChange enables the check for big schema changes such that
    76  // TabletExecutor will reject these.
    77  func (exec *TabletExecutor) DisallowBigSchemaChange() {
    78  	exec.allowBigSchemaChange = false
    79  }
    80  
    81  // SetDDLStrategy applies ddl_strategy from command line flags
    82  func (exec *TabletExecutor) SetDDLStrategy(ddlStrategy string) error {
    83  	ddlStrategySetting, err := schema.ParseDDLStrategy(ddlStrategy)
    84  	if err != nil {
    85  		return err
    86  	}
    87  	exec.ddlStrategySetting = ddlStrategySetting
    88  	return nil
    89  }
    90  
    91  // SetUUIDList sets a (possibly empty) list of provided UUIDs for schema migrations
    92  func (exec *TabletExecutor) SetUUIDList(uuids []string) error {
    93  	uuidsMap := map[string]bool{}
    94  	for _, uuid := range uuids {
    95  		if !schema.IsOnlineDDLUUID(uuid) {
    96  			return fmt.Errorf("Not a valid UUID: %s", uuid)
    97  		}
    98  		uuidsMap[uuid] = true
    99  	}
   100  	if len(uuidsMap) != len(uuids) {
   101  		return fmt.Errorf("UUID values must be unique")
   102  	}
   103  	exec.uuids = uuids
   104  	return nil
   105  }
   106  
   107  // hasProvidedUUIDs returns true when UUIDs were provided
   108  func (exec *TabletExecutor) hasProvidedUUIDs() bool {
   109  	return len(exec.uuids) != 0
   110  }
   111  
   112  // SkipPreflight disables preflight checks
   113  func (exec *TabletExecutor) SkipPreflight() {
   114  	exec.skipPreflight = true
   115  }
   116  
   117  // Open opens a connection to the primary for every shard.
   118  func (exec *TabletExecutor) Open(ctx context.Context, keyspace string) error {
   119  	if !exec.isClosed {
   120  		return nil
   121  	}
   122  	exec.keyspace = keyspace
   123  	shardNames, err := exec.ts.GetShardNames(ctx, keyspace)
   124  	if err != nil {
   125  		return fmt.Errorf("unable to get shard names for keyspace: %s, error: %v", keyspace, err)
   126  	}
   127  	exec.tablets = make([]*topodatapb.Tablet, len(shardNames))
   128  	for i, shardName := range shardNames {
   129  		shardInfo, err := exec.ts.GetShard(ctx, keyspace, shardName)
   130  		if err != nil {
   131  			return fmt.Errorf("unable to get shard info, keyspace: %s, shard: %s, error: %v", keyspace, shardName, err)
   132  		}
   133  		if !shardInfo.HasPrimary() {
   134  			return fmt.Errorf("shard: %s does not have a primary", shardName)
   135  		}
   136  		tabletInfo, err := exec.ts.GetTablet(ctx, shardInfo.PrimaryAlias)
   137  		if err != nil {
   138  			return fmt.Errorf("unable to get primary tablet info, keyspace: %s, shard: %s, error: %v", keyspace, shardName, err)
   139  		}
   140  		exec.tablets[i] = tabletInfo.Tablet
   141  	}
   142  
   143  	if len(exec.tablets) == 0 {
   144  		return fmt.Errorf("keyspace: %s does not contain any primary tablets", keyspace)
   145  	}
   146  	exec.isClosed = false
   147  	return nil
   148  }
   149  
   150  // Validate validates a list of sql statements.
   151  func (exec *TabletExecutor) Validate(ctx context.Context, sqls []string) error {
   152  	if exec.isClosed {
   153  		return fmt.Errorf("executor is closed")
   154  	}
   155  
   156  	// We ignore DATABASE-level DDLs here because detectBigSchemaChanges doesn't
   157  	// look at them anyway.
   158  	parsedDDLs, _, _, _, err := exec.parseDDLs(sqls)
   159  	if err != nil {
   160  		return err
   161  	}
   162  
   163  	bigSchemaChange, err := exec.detectBigSchemaChanges(ctx, parsedDDLs)
   164  	if bigSchemaChange && exec.allowBigSchemaChange {
   165  		exec.logger.Warningf("Processing big schema change. This may cause visible MySQL downtime.")
   166  		return nil
   167  	}
   168  	return err
   169  }
   170  
   171  func (exec *TabletExecutor) parseDDLs(sqls []string) ([]sqlparser.DDLStatement, []sqlparser.DBDDLStatement, [](*sqlparser.RevertMigration), [](*sqlparser.AlterMigration), error) {
   172  	parsedDDLs := make([]sqlparser.DDLStatement, 0)
   173  	parsedDBDDLs := make([]sqlparser.DBDDLStatement, 0)
   174  	revertStatements := make([](*sqlparser.RevertMigration), 0)
   175  	alterMigrationStatements := make([](*sqlparser.AlterMigration), 0)
   176  	for _, sql := range sqls {
   177  		stmt, err := sqlparser.Parse(sql)
   178  		if err != nil {
   179  			return nil, nil, nil, nil, fmt.Errorf("failed to parse sql: %s, got error: %v", sql, err)
   180  		}
   181  		switch stmt := stmt.(type) {
   182  		case sqlparser.DDLStatement:
   183  			parsedDDLs = append(parsedDDLs, stmt)
   184  		case sqlparser.DBDDLStatement:
   185  			parsedDBDDLs = append(parsedDBDDLs, stmt)
   186  		case *sqlparser.RevertMigration:
   187  			revertStatements = append(revertStatements, stmt)
   188  		case *sqlparser.AlterMigration:
   189  			alterMigrationStatements = append(alterMigrationStatements, stmt)
   190  		default:
   191  			if len(exec.tablets) != 1 {
   192  				return nil, nil, nil, nil, fmt.Errorf("non-ddl statements can only be executed for single shard keyspaces: %s", sql)
   193  			}
   194  		}
   195  	}
   196  	return parsedDDLs, parsedDBDDLs, revertStatements, alterMigrationStatements, nil
   197  }
   198  
   199  // IsOnlineSchemaDDL returns true if we expect to run a online schema change DDL
   200  func (exec *TabletExecutor) isOnlineSchemaDDL(stmt sqlparser.Statement) (isOnline bool) {
   201  	switch stmt := stmt.(type) {
   202  	case sqlparser.DDLStatement:
   203  		if exec.ddlStrategySetting == nil {
   204  			return false
   205  		}
   206  		if exec.ddlStrategySetting.Strategy.IsDirect() {
   207  			return false
   208  		}
   209  		switch stmt.GetAction() {
   210  		case sqlparser.CreateDDLAction, sqlparser.DropDDLAction, sqlparser.AlterDDLAction:
   211  			return true
   212  		}
   213  	case *sqlparser.RevertMigration:
   214  		return true
   215  	}
   216  	return false
   217  }
   218  
   219  // a schema change that satisfies any following condition is considered
   220  // to be a big schema change and will be rejected.
   221  //  1. Alter more than 100,000 rows.
   222  //  2. Change a table with more than 2,000,000 rows (Drops are fine).
   223  func (exec *TabletExecutor) detectBigSchemaChanges(ctx context.Context, parsedDDLs []sqlparser.DDLStatement) (bool, error) {
   224  	// We want to avoid any overhead if possible. If all DDLs are online schema changes, then we want to
   225  	// skip GetSchema altogether.
   226  	foundAnyNonOnlineDDL := false
   227  	for _, ddl := range parsedDDLs {
   228  		if !exec.isOnlineSchemaDDL(ddl) {
   229  			foundAnyNonOnlineDDL = true
   230  		}
   231  	}
   232  	if !foundAnyNonOnlineDDL {
   233  		return false, nil
   234  	}
   235  	// exec.tablets is guaranteed to have at least one element;
   236  	// Otherwise, Open should fail and executor should fail.
   237  	primaryTabletInfo := exec.tablets[0]
   238  	// get database schema, excluding views.
   239  	req := &tabletmanagerdatapb.GetSchemaRequest{Tables: []string{}, ExcludeTables: []string{}, TableSchemaOnly: true}
   240  	dbSchema, err := exec.tmc.GetSchema(ctx, primaryTabletInfo, req)
   241  	if err != nil {
   242  		return false, fmt.Errorf("unable to get database schema, error: %v", err)
   243  	}
   244  	tableWithCount := make(map[string]uint64, len(dbSchema.TableDefinitions))
   245  	for _, tableSchema := range dbSchema.TableDefinitions {
   246  		tableWithCount[tableSchema.Name] = tableSchema.RowCount
   247  	}
   248  	for _, ddl := range parsedDDLs {
   249  		if exec.isOnlineSchemaDDL(ddl) {
   250  			// Since this is an online schema change, there is no need to worry about big changes
   251  			continue
   252  		}
   253  		switch ddl.GetAction() {
   254  		case sqlparser.DropDDLAction, sqlparser.CreateDDLAction, sqlparser.TruncateDDLAction, sqlparser.RenameDDLAction:
   255  			continue
   256  		}
   257  		tableName := ddl.GetTable().Name.String()
   258  		if rowCount, ok := tableWithCount[tableName]; ok {
   259  			if rowCount > 100000 && ddl.GetAction() == sqlparser.AlterDDLAction {
   260  				return true, fmt.Errorf(
   261  					"big schema change detected. Disable check with -allow_long_unavailability. ddl: %s alters a table with more than 100 thousand rows", sqlparser.String(ddl))
   262  			}
   263  			if rowCount > 2000000 {
   264  				return true, fmt.Errorf(
   265  					"big schema change detected. Disable check with -allow_long_unavailability. ddl: %s changes a table with more than 2 million rows", sqlparser.String(ddl))
   266  			}
   267  		}
   268  	}
   269  	return false, nil
   270  }
   271  
   272  func (exec *TabletExecutor) preflightSchemaChanges(ctx context.Context, sqls []string) error {
   273  	if exec.skipPreflight {
   274  		return nil
   275  	}
   276  	_, err := exec.tmc.PreflightSchema(ctx, exec.tablets[0], sqls)
   277  	return err
   278  }
   279  
   280  // executeSQL executes a single SQL statement either as online DDL or synchronously on all tablets.
   281  // In online DDL case, the query may be exploded into multiple queries during
   282  func (exec *TabletExecutor) executeSQL(ctx context.Context, sql string, providedUUID string, execResult *ExecuteResult) (executedAsynchronously bool, err error) {
   283  	stmt, err := sqlparser.Parse(sql)
   284  	if err != nil {
   285  		return false, err
   286  	}
   287  	switch stmt := stmt.(type) {
   288  	case sqlparser.DDLStatement:
   289  		if exec.isOnlineSchemaDDL(stmt) {
   290  			onlineDDLs, err := schema.NewOnlineDDLs(exec.keyspace, sql, stmt, exec.ddlStrategySetting, exec.migrationContext, providedUUID)
   291  			if err != nil {
   292  				execResult.ExecutorErr = err.Error()
   293  				return false, err
   294  			}
   295  			for _, onlineDDL := range onlineDDLs {
   296  				exec.executeOnAllTablets(ctx, execResult, onlineDDL.SQL, true)
   297  				if len(execResult.SuccessShards) > 0 {
   298  					execResult.UUIDs = append(execResult.UUIDs, onlineDDL.UUID)
   299  					exec.logger.Printf("%s\n", onlineDDL.UUID)
   300  				}
   301  			}
   302  			return true, nil
   303  		}
   304  	case *sqlparser.RevertMigration:
   305  		strategySetting := schema.NewDDLStrategySetting(schema.DDLStrategyOnline, exec.ddlStrategySetting.Options)
   306  		onlineDDL, err := schema.NewOnlineDDL(exec.keyspace, "", sqlparser.String(stmt), strategySetting, exec.migrationContext, providedUUID)
   307  		if err != nil {
   308  			execResult.ExecutorErr = err.Error()
   309  			return false, err
   310  		}
   311  		exec.executeOnAllTablets(ctx, execResult, onlineDDL.SQL, true)
   312  		execResult.UUIDs = append(execResult.UUIDs, onlineDDL.UUID)
   313  		exec.logger.Printf("%s\n", onlineDDL.UUID)
   314  		return true, nil
   315  	case *sqlparser.AlterMigration:
   316  		exec.executeOnAllTablets(ctx, execResult, sql, true)
   317  		return true, nil
   318  	}
   319  	exec.executeOnAllTablets(ctx, execResult, sql, false)
   320  	return false, nil
   321  }
   322  
   323  // Execute applies schema changes
   324  func (exec *TabletExecutor) Execute(ctx context.Context, sqls []string) *ExecuteResult {
   325  	execResult := ExecuteResult{}
   326  	execResult.Sqls = sqls
   327  	if exec.isClosed {
   328  		execResult.ExecutorErr = "executor is closed"
   329  		return &execResult
   330  	}
   331  	startTime := time.Now()
   332  	defer func() { execResult.TotalTimeSpent = time.Since(startTime) }()
   333  
   334  	// Lock the keyspace so our schema change doesn't overlap with other
   335  	// keyspace-wide operations like resharding migrations.
   336  	ctx, unlock, lockErr := exec.ts.LockKeyspace(ctx, exec.keyspace, "ApplySchemaKeyspace")
   337  	if lockErr != nil {
   338  		execResult.ExecutorErr = vterrors.Wrapf(lockErr, "lockErr in ApplySchemaKeyspace %v", exec.keyspace).Error()
   339  		return &execResult
   340  	}
   341  	defer func() {
   342  		// This is complicated because execResult.ExecutorErr
   343  		// is not of type error.
   344  		var unlockErr error
   345  		unlock(&unlockErr)
   346  		if execResult.ExecutorErr == "" && unlockErr != nil {
   347  			execResult.ExecutorErr = vterrors.Wrapf(unlockErr, "unlockErr in ApplySchemaKeyspace %v", exec.keyspace).Error()
   348  		}
   349  	}()
   350  
   351  	// Make sure the schema changes introduce a table definition change.
   352  	if err := exec.preflightSchemaChanges(ctx, sqls); err != nil {
   353  		execResult.ExecutorErr = err.Error()
   354  		return &execResult
   355  	}
   356  
   357  	if exec.hasProvidedUUIDs() && len(exec.uuids) != len(sqls) {
   358  		execResult.ExecutorErr = fmt.Sprintf("provided %v UUIDs do not match number of DDLs %v", len(exec.uuids), len(sqls))
   359  		return &execResult
   360  	}
   361  	providedUUID := ""
   362  
   363  	rl := timer.NewRateLimiter(topo.RemoteOperationTimeout / 4)
   364  	defer rl.Stop()
   365  
   366  	syncOperationExecuted := false
   367  
   368  	// ReloadSchema once. Do it even if we do an early return on error
   369  	defer func() {
   370  		if !syncOperationExecuted {
   371  			exec.logger.Infof("Skipped ReloadSchema since all SQLs executed asynchronously")
   372  			return
   373  		}
   374  		// same shards will appear multiple times in execResult.SuccessShards when there are
   375  		// multiple SQLs
   376  		uniqueShards := map[string]*ShardResult{}
   377  		for i := range execResult.SuccessShards {
   378  			// Please do not change the above iteration to "for result := range ...".
   379  			// This is because we want to end up grabbing a pointer to the result. But golang's "for"
   380  			// implementation reuses the iteration parameter, and we end up reusing the same pointer.
   381  			result := &execResult.SuccessShards[i]
   382  			uniqueShards[result.Shard] = result
   383  		}
   384  		var wg sync.WaitGroup
   385  		// If all shards succeeded, wait (up to waitReplicasTimeout) for replicas to
   386  		// execute the schema change via replication. This is best-effort, meaning
   387  		// we still return overall success if the timeout expires.
   388  		concurrency := sync2.NewSemaphore(10, 0)
   389  		reloadCtx, cancel := context.WithTimeout(ctx, exec.waitReplicasTimeout)
   390  		defer cancel()
   391  		for _, result := range uniqueShards {
   392  			wg.Add(1)
   393  			go func(result *ShardResult) {
   394  				defer wg.Done()
   395  				exec.logger.Infof("ReloadSchema on shard: %s", result.Shard)
   396  				schematools.ReloadShard(
   397  					reloadCtx,
   398  					exec.ts,
   399  					exec.tmc,
   400  					exec.logger,
   401  					exec.keyspace,
   402  					result.Shard,
   403  					result.Position,
   404  					concurrency,
   405  					true, /* includePrimary */
   406  				)
   407  			}(result)
   408  		}
   409  		wg.Wait()
   410  	}()
   411  
   412  	for index, sql := range sqls {
   413  		// Attempt to renew lease:
   414  		if err := rl.Do(func() error { return topo.CheckKeyspaceLockedAndRenew(ctx, exec.keyspace) }); err != nil {
   415  			execResult.ExecutorErr = vterrors.Wrapf(err, "CheckKeyspaceLocked in ApplySchemaKeyspace %v", exec.keyspace).Error()
   416  			return &execResult
   417  		}
   418  		execResult.CurSQLIndex = index
   419  		if exec.hasProvidedUUIDs() {
   420  			providedUUID = exec.uuids[index]
   421  		}
   422  		executedAsynchronously, err := exec.executeSQL(ctx, sql, providedUUID, &execResult)
   423  		if err != nil {
   424  			execResult.ExecutorErr = err.Error()
   425  			return &execResult
   426  		}
   427  		if !executedAsynchronously {
   428  			syncOperationExecuted = true
   429  		}
   430  		if len(execResult.FailedShards) > 0 {
   431  			break
   432  		}
   433  	}
   434  
   435  	return &execResult
   436  }
   437  
   438  // executeOnAllTablets runs a query on all tablets, synchronously. This can be a long running operation.
   439  func (exec *TabletExecutor) executeOnAllTablets(ctx context.Context, execResult *ExecuteResult, sql string, viaQueryService bool) {
   440  	var wg sync.WaitGroup
   441  	numOfPrimaryTablets := len(exec.tablets)
   442  	wg.Add(numOfPrimaryTablets)
   443  	errChan := make(chan ShardWithError, numOfPrimaryTablets)
   444  	successChan := make(chan ShardResult, numOfPrimaryTablets)
   445  	for _, tablet := range exec.tablets {
   446  		go func(tablet *topodatapb.Tablet) {
   447  			defer wg.Done()
   448  			exec.executeOneTablet(ctx, tablet, sql, viaQueryService, errChan, successChan)
   449  		}(tablet)
   450  	}
   451  	wg.Wait()
   452  	close(errChan)
   453  	close(successChan)
   454  	execResult.FailedShards = make([]ShardWithError, 0, len(errChan))
   455  	execResult.SuccessShards = make([]ShardResult, 0, len(successChan))
   456  	for e := range errChan {
   457  		execResult.FailedShards = append(execResult.FailedShards, e)
   458  	}
   459  	for r := range successChan {
   460  		execResult.SuccessShards = append(execResult.SuccessShards, r)
   461  	}
   462  
   463  	if len(execResult.FailedShards) > 0 {
   464  		return
   465  	}
   466  }
   467  
   468  func (exec *TabletExecutor) executeOneTablet(
   469  	ctx context.Context,
   470  	tablet *topodatapb.Tablet,
   471  	sql string,
   472  	viaQueryService bool,
   473  	errChan chan ShardWithError,
   474  	successChan chan ShardResult) {
   475  
   476  	var result *querypb.QueryResult
   477  	var err error
   478  	if viaQueryService {
   479  		result, err = exec.tmc.ExecuteQuery(ctx, tablet, &tabletmanagerdatapb.ExecuteQueryRequest{
   480  			Query:   []byte(sql),
   481  			MaxRows: 10,
   482  		})
   483  	} else {
   484  		if exec.ddlStrategySetting != nil && exec.ddlStrategySetting.IsAllowZeroInDateFlag() {
   485  			// --allow-zero-in-date Applies to DDLs
   486  			stmt, err := sqlparser.Parse(string(sql))
   487  			if err != nil {
   488  				errChan <- ShardWithError{Shard: tablet.Shard, Err: err.Error()}
   489  				return
   490  			}
   491  			if ddlStmt, ok := stmt.(sqlparser.DDLStatement); ok {
   492  				// Add comments directive to allow zero in date
   493  				const directive = `/*vt+ allowZeroInDate=true */`
   494  				ddlStmt.SetComments(ddlStmt.GetParsedComments().Prepend(directive))
   495  				sql = sqlparser.String(ddlStmt)
   496  			}
   497  		}
   498  		result, err = exec.tmc.ExecuteFetchAsDba(ctx, tablet, false, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{
   499  			Query:   []byte(sql),
   500  			MaxRows: 10,
   501  		})
   502  	}
   503  	if err != nil {
   504  		errChan <- ShardWithError{Shard: tablet.Shard, Err: err.Error()}
   505  		return
   506  	}
   507  	// Get a replication position that's guaranteed to be after the schema change
   508  	// was applied on the primary.
   509  	pos, err := exec.tmc.PrimaryPosition(ctx, tablet)
   510  	if err != nil {
   511  		errChan <- ShardWithError{
   512  			Shard: tablet.Shard,
   513  			Err:   fmt.Sprintf("couldn't get replication position after applying schema change on primary: %v", err),
   514  		}
   515  		return
   516  	}
   517  	successChan <- ShardResult{
   518  		Shard:    tablet.Shard,
   519  		Result:   result,
   520  		Position: pos,
   521  	}
   522  }
   523  
   524  // Close clears tablet executor states
   525  func (exec *TabletExecutor) Close() {
   526  	if !exec.isClosed {
   527  		exec.tablets = nil
   528  		exec.isClosed = true
   529  	}
   530  }
   531  
// Compile-time check that *TabletExecutor satisfies the Executor interface.
var _ Executor = (*TabletExecutor)(nil)