/*
Copyright 2019 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
Functionality of this Executor is tested in go/test/endtoend/onlineddl/...
*/

package onlineddl

import (
	"context"
	"errors"
	"fmt"
	"math"
	"os"
	"path"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"syscall"
	"time"

	"github.com/spf13/pflag"

	"google.golang.org/protobuf/proto"

	"google.golang.org/protobuf/encoding/prototext"

	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/sqlescape"
	"vitess.io/vitess/go/sqltypes"
	"vitess.io/vitess/go/textutil"
	"vitess.io/vitess/go/timer"
	"vitess.io/vitess/go/vt/binlog/binlogplayer"
	"vitess.io/vitess/go/vt/dbconnpool"
	"vitess.io/vitess/go/vt/log"
	binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
	querypb "vitess.io/vitess/go/vt/proto/query"
	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
	"vitess.io/vitess/go/vt/schema"
	"vitess.io/vitess/go/vt/schemadiff"
	"vitess.io/vitess/go/vt/servenv"
	"vitess.io/vitess/go/vt/sqlparser"
	"vitess.io/vitess/go/vt/topo"
	"vitess.io/vitess/go/vt/topo/topoproto"
	"vitess.io/vitess/go/vt/vterrors"
	"vitess.io/vitess/go/vt/vttablet/tabletserver/connpool"
	"vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv"
	"vitess.io/vitess/go/vt/vttablet/tabletserver/throttle"
	"vitess.io/vitess/go/vt/vttablet/tmclient"
	"vitess.io/vitess/go/vt/vttablet/vexec"
)

var (
	// ErrExecutorNotWritableTablet is generated when executor is asked to run gh-ost on a read-only server
	ErrExecutorNotWritableTablet = errors.New("cannot run migration on non-writable tablet")
	// ErrExecutorMigrationAlreadyRunning is generated when an attempt is made to run an operation that conflicts with a running migration
	ErrExecutorMigrationAlreadyRunning = errors.New("cannot run migration since a migration is already running")
	// ErrMigrationNotFound is returned by readMigration when the given UUID cannot be found
	ErrMigrationNotFound = errors.New("migration not found")
)

// vexecUpdateTemplates lists the UPDATE statement shapes on _vt.schema_migrations
// that incoming queries are matched against (see sqlparser.QueryMatchesTemplates in Open()).
var vexecUpdateTemplates = []string{
	`update _vt.schema_migrations set migration_status='val1' where mysql_schema='val2'`,
	`update _vt.schema_migrations set migration_status='val1' where migration_uuid='val2' and mysql_schema='val3'`,
	`update _vt.schema_migrations set migration_status='val1' where migration_uuid='val2' and mysql_schema='val3' and shard='val4'`,
}

// vexecInsertTemplates lists the INSERT statement shapes on _vt.schema_migrations.
// NOTE(review): presumably matched the same way as vexecUpdateTemplates; the consumer is outside this view.
var vexecInsertTemplates = []string{
	`INSERT IGNORE INTO _vt.schema_migrations (
		migration_uuid,
		keyspace,
		shard,
		mysql_schema,
		mysql_table,
		migration_statement,
		strategy,
		options,
		ddl_action,
		requested_timestamp,
		migration_context,
		migration_status
	) VALUES (
		'val1', 'val2', 'val3', 'val4', 'val5', 'val6', 'val7', 'val8', 'val9', FROM_UNIXTIME(0), 'vala', 'valb'
	)`,
}

var emptyResult = &sqltypes.Result{}

// acceptableDropTableIfExistsErrorCodes holds MySQL error numbers for "file not found" and "no such table".
var acceptableDropTableIfExistsErrorCodes = []int{mysql.ERCantFindFile, mysql.ERNoSuchTable}
var copyAlgorithm = sqlparser.AlgorithmValue(sqlparser.CopyStr)

// Flag-backed settings; the values below are the defaults registered in registerOnlineDDLFlags.
var (
	ghostOverridePath       string
	ptOSCOverridePath       string
	migrationCheckInterval  = 1 * time.Minute
	retainOnlineDDLTables   = 24 * time.Hour
	maxConcurrentOnlineDDLs = 256
)

// init registers the Online DDL flags for the vtcombo and vttablet binaries.
func init() {
	servenv.OnParseFor("vtcombo", registerOnlineDDLFlags)
	servenv.OnParseFor("vttablet", registerOnlineDDLFlags)
}

// registerOnlineDDLFlags binds the Online DDL command line flags to their package-level variables.
func registerOnlineDDLFlags(fs *pflag.FlagSet) {
	fs.StringVar(&ghostOverridePath, "gh-ost-path", ghostOverridePath, "override default gh-ost binary full path")
	fs.StringVar(&ptOSCOverridePath, "pt-osc-path", ptOSCOverridePath, "override default pt-online-schema-change binary full path")
	fs.DurationVar(&migrationCheckInterval, "migration_check_interval", migrationCheckInterval, "Interval between migration checks")
	fs.DurationVar(&retainOnlineDDLTables, "retain_online_ddl_tables", retainOnlineDDLTables, "How long should vttablet keep an old migrated table before purging it")
	fs.IntVar(&maxConcurrentOnlineDDLs, "max_concurrent_online_ddl", maxConcurrentOnlineDDLs, "Maximum number of online DDL changes that may run concurrently")
}

// migrationNextCheckIntervals are the delays at which triggerNextCheckInterval schedules extra ticks.
var migrationNextCheckIntervals = []time.Duration{1 * time.Second, 5 * time.Second, 10 * time.Second, 20 * time.Second}
var maxConstraintNameLength = 64

const (
	maxPasswordLength                        = 32 // MySQL's *replication* password may not exceed 32 characters
	staleMigrationMinutes                    = 180
	progressPctStarted               float64 = 0
	progressPctFull                  float64 = 100.0
	etaSecondsUnknown                        = -1
	etaSecondsNow                            = 0
	rowsCopiedUnknown                        = 0
	emptyHint                                = ""
	readyToCompleteHint                      = "ready_to_complete"
	databasePoolSize                         = 3
	vreplicationCutOverThreshold             = 5 * time.Second
	vreplicationTestSuiteWaitSeconds         = 5
)

var (
	migrationLogFileName     = "migration.log"
	migrationFailureFileName = "migration-failure.log"
	onlineDDLUser            = "vt-online-ddl-internal"
	// onlineDDLGrant is the quoted 'user'@'host' form of onlineDDLUser, with a '%' host.
	onlineDDLGrant        = fmt.Sprintf("'%s'@'%s'", onlineDDLUser, "%")
	throttlerOnlineDDLApp = "online-ddl"
	throttleCheckFlags    = &throttle.CheckFlags{}
)

// ConstraintType enumerates the kinds of table constraints this package distinguishes.
type ConstraintType int

const (
	UnknownConstraintType ConstraintType = iota
	CheckConstraintType
	ForeignKeyConstraintType
)

var (
	// constraintIndicatorMap maps a ConstraintType (as int) to a short textual indicator.
	constraintIndicatorMap = map[int]string{
		int(CheckConstraintType):      "chk",
		int(ForeignKeyConstraintType): "fk",
	}
)

// GetConstraintType classifies a parsed constraint definition. Anything that is neither a
// CHECK constraint nor a FOREIGN KEY definition yields UnknownConstraintType.
func GetConstraintType(constraintInfo sqlparser.ConstraintInfo) ConstraintType {
	if _, ok := constraintInfo.(*sqlparser.CheckConstraintDefinition); ok {
		return CheckConstraintType
	}
	if _, ok := constraintInfo.(*sqlparser.ForeignKeyDefinition); ok {
		return ForeignKeyConstraintType
	}
	return UnknownConstraintType
}

// mysqlVariables holds the backend MySQL settings collected by readMySQLVariables.
type mysqlVariables struct {
	host           string
	port           int
	readOnly       bool
	version        string
	versionComment string
}

// Executor wraps and manages the execution of a gh-ost migration.
type Executor struct {
	env                   tabletenv.Env
	pool                  *connpool.Pool
	tabletTypeFunc        func() topodatapb.TabletType
	ts                    *topo.Server
	lagThrottler          *throttle.Throttler
	toggleBufferTableFunc func(cancelCtx context.Context, tableName string, bufferQueries bool)
	tabletAlias           *topodatapb.TabletAlias

	keyspace string
	shard    string
	dbName   string

	initMutex      sync.Mutex // held by Open() and Close()
	migrationMutex sync.Mutex
	submitMutex    sync.Mutex // used when submitting migrations
	// ownedRunningMigrations lists UUIDs owned by this executor. The methods that iterate this map
	// type-assert its values to *schema.OnlineDDL.
	// A UUID listed in this map stands for a migration that is executing, and that this executor can control.
	// Migrations found to be running which are not listed in this map will either:
	// - be adopted by this executor (possible for vreplication migrations), or
	// - be terminated (example: pt-osc migration gone rogue, process still running even as the migration failed)
	// The Executor auto-reviews the map and cleans up migrations thought to be running which are not running.
	ownedRunningMigrations        sync.Map
	vreplicationLastError         map[string]*vterrors.LastError
	tickReentranceFlag            int64
	reviewedRunningMigrationsFlag bool

	ticks             *timer.Timer
	isOpen            int64 // accessed atomically; non-zero while the executor is open
	schemaInitialized bool

	initVreplicationDDLOnce sync.Once
}

// cancellableMigration pairs a migration UUID with a message.
type cancellableMigration struct {
	uuid    string
	message string
}

// newCancellableMigration returns a cancellableMigration for the given UUID and message.
func newCancellableMigration(uuid string, message string) *cancellableMigration {
	return &cancellableMigration{uuid: uuid, message: message}
}

// GhostBinaryFileName returns the full path+name of the gh-ost binary.
// isOverride is true when the path comes from the --gh-ost-path flag.
func GhostBinaryFileName() (fileName string, isOverride bool) {
	if ghostOverridePath != "" {
		return ghostOverridePath, true
	}
	return path.Join(os.TempDir(), "vt-gh-ost"), false
}

// PTOSCFileName returns the full path+name of the pt-online-schema-change binary
// Note that vttablet does not include pt-online-schema-change
// isOverride is true when the path comes from the --pt-osc-path flag.
func PTOSCFileName() (fileName string, isOverride bool) {
	if ptOSCOverridePath != "" {
		return ptOSCOverridePath, true
	}
	return "/usr/bin/pt-online-schema-change", false
}

// newGCTableRetainTime returns the time until which a new GC table is to be retained
// (now, in UTC, plus retainOnlineDDLTables).
func newGCTableRetainTime() time.Time {
	return time.Now().UTC().Add(retainOnlineDDLTables)
}

// NewExecutor creates a new gh-ost executor.
255 func NewExecutor(env tabletenv.Env, tabletAlias *topodatapb.TabletAlias, ts *topo.Server, 256 lagThrottler *throttle.Throttler, 257 tabletTypeFunc func() topodatapb.TabletType, 258 toggleBufferTableFunc func(cancelCtx context.Context, tableName string, bufferQueries bool), 259 ) *Executor { 260 // sanitize flags 261 if maxConcurrentOnlineDDLs < 1 { 262 maxConcurrentOnlineDDLs = 1 // or else nothing will ever run 263 } 264 return &Executor{ 265 env: env, 266 tabletAlias: proto.Clone(tabletAlias).(*topodatapb.TabletAlias), 267 268 pool: connpool.NewPool(env, "OnlineDDLExecutorPool", tabletenv.ConnPoolConfig{ 269 Size: databasePoolSize, 270 IdleTimeoutSeconds: env.Config().OltpReadPool.IdleTimeoutSeconds, 271 }), 272 tabletTypeFunc: tabletTypeFunc, 273 ts: ts, 274 lagThrottler: lagThrottler, 275 toggleBufferTableFunc: toggleBufferTableFunc, 276 ticks: timer.NewTimer(migrationCheckInterval), 277 } 278 } 279 280 func (e *Executor) execQuery(ctx context.Context, query string) (result *sqltypes.Result, err error) { 281 defer e.env.LogError() 282 283 conn, err := e.pool.Get(ctx, nil) 284 if err != nil { 285 return result, err 286 } 287 defer conn.Recycle() 288 return conn.Exec(ctx, query, math.MaxInt32, true) 289 } 290 291 // TabletAliasString returns tablet alias as string (duh) 292 func (e *Executor) TabletAliasString() string { 293 return topoproto.TabletAliasString(e.tabletAlias) 294 } 295 296 // InitDBConfig initializes keysapce 297 func (e *Executor) InitDBConfig(keyspace, shard, dbName string) { 298 e.keyspace = keyspace 299 e.shard = shard 300 e.dbName = dbName 301 } 302 303 // Open opens database pool and initializes the schema 304 func (e *Executor) Open() error { 305 e.initMutex.Lock() 306 defer e.initMutex.Unlock() 307 if atomic.LoadInt64(&e.isOpen) > 0 || !e.env.Config().EnableOnlineDDL { 308 return nil 309 } 310 log.Infof("onlineDDL Executor Open()") 311 312 e.reviewedRunningMigrationsFlag = false // will be set as "true" by reviewRunningMigrations() 313 
e.ownedRunningMigrations.Range(func(k, _ any) bool { 314 e.ownedRunningMigrations.Delete(k) 315 return true 316 }) 317 e.vreplicationLastError = make(map[string]*vterrors.LastError) 318 319 e.pool.Open(e.env.Config().DB.AppWithDB(), e.env.Config().DB.DbaWithDB(), e.env.Config().DB.AppDebugWithDB()) 320 e.ticks.Start(e.onMigrationCheckTick) 321 e.triggerNextCheckInterval() 322 323 if _, err := sqlparser.QueryMatchesTemplates("select 1 from dual", vexecUpdateTemplates); err != nil { 324 // this validates vexecUpdateTemplates 325 return err 326 } 327 atomic.StoreInt64(&e.isOpen, 1) 328 329 return nil 330 } 331 332 // Close frees resources 333 func (e *Executor) Close() { 334 e.initMutex.Lock() 335 defer e.initMutex.Unlock() 336 if atomic.LoadInt64(&e.isOpen) == 0 { 337 return 338 } 339 log.Infof("onlineDDL Executor Close()") 340 341 e.ticks.Stop() 342 e.pool.Close() 343 atomic.StoreInt64(&e.isOpen, 0) 344 } 345 346 // triggerNextCheckInterval the next tick sooner than normal 347 func (e *Executor) triggerNextCheckInterval() { 348 for _, interval := range migrationNextCheckIntervals { 349 e.ticks.TriggerAfter(interval) 350 } 351 } 352 353 // matchesShards checks whether given comma delimited shard names include this tablet's shard. If the input param is empty then 354 // that implicitly means "true" 355 func (e *Executor) matchesShards(commaDelimitedShards string) bool { 356 shards := textutil.SplitDelimitedList(commaDelimitedShards) 357 if len(shards) == 0 { 358 // Nothing explicitly defined, so implicitly all shards are allowed 359 return true 360 } 361 for _, shard := range shards { 362 if shard == e.shard { 363 return true 364 } 365 } 366 return false 367 } 368 369 // countOwnedRunningMigrations returns an estimate of current count of running migrations; this is 370 // normally an accurate number, but can be inexact because the exdcutor peridocially reviews 371 // e.ownedRunningMigrations and adds/removes migrations based on actual migration state. 
372 func (e *Executor) countOwnedRunningMigrations() (count int) { 373 e.ownedRunningMigrations.Range(func(_, val any) bool { 374 if _, ok := val.(*schema.OnlineDDL); ok { 375 count++ 376 } 377 return true // continue iteration 378 }) 379 return count 380 } 381 382 // allowConcurrentMigration checks if the given migration is allowed to run concurrently. 383 // First, the migration itself must declare --allow-concurrent. But then, there's also some 384 // restrictions on which migrations exactly are allowed such concurrency. 385 func (e *Executor) allowConcurrentMigration(onlineDDL *schema.OnlineDDL) (action sqlparser.DDLAction, allowConcurrent bool) { 386 if !onlineDDL.StrategySetting().IsAllowConcurrent() { 387 return action, false 388 } 389 390 var err error 391 action, err = onlineDDL.GetAction() 392 if err != nil { 393 return action, false 394 } 395 switch action { 396 case sqlparser.CreateDDLAction, sqlparser.DropDDLAction: 397 // CREATE TABLE, DROP TABLE are allowed to run concurrently. 398 return action, true 399 case sqlparser.AlterDDLAction: 400 // ALTER is only allowed concurrent execution if this is a Vitess migration 401 strategy := onlineDDL.StrategySetting().Strategy 402 return action, (strategy == schema.DDLStrategyOnline || strategy == schema.DDLStrategyVitess) 403 case sqlparser.RevertDDLAction: 404 // REVERT is allowed to run concurrently. 
405 // Reminder that REVERT is supported for CREATE, DROP and for 'vitess' ALTER, but never for 406 // 'gh-ost' or 'pt-osc' ALTERs 407 return action, true 408 } 409 return action, false 410 } 411 412 func (e *Executor) proposedMigrationConflictsWithRunningMigration(runningMigration, proposedMigration *schema.OnlineDDL) bool { 413 if runningMigration.Table == proposedMigration.Table { 414 // migrations operate on same table 415 return true 416 } 417 _, isRunningMigrationAllowConcurrent := e.allowConcurrentMigration(runningMigration) 418 proposedMigrationAction, isProposedMigrationAllowConcurrent := e.allowConcurrentMigration(proposedMigration) 419 if !isRunningMigrationAllowConcurrent && !isProposedMigrationAllowConcurrent { 420 // neither allowed concurrently 421 return true 422 } 423 if proposedMigrationAction == sqlparser.AlterDDLAction { 424 // A new ALTER migration conflicts with an existing migration if the existing migration is still not ready to complete. 425 // Specifically, if the running migration is an ALTER, and is still busy with copying rows (copy_state), then 426 // we consider the two to be conflicting. But, if the running migration is done copying rows, and is now only 427 // applying binary logs, and is up-to-date, then we consider a new ALTER migration to be non-conflicting. 428 return atomic.LoadInt64(&runningMigration.ReadyToComplete) == 0 429 } 430 return false 431 } 432 433 // isAnyConflictingMigrationRunning checks if there's any running migration that conflicts with the 434 // given migration, such that they can't both run concurrently. 
435 func (e *Executor) isAnyConflictingMigrationRunning(onlineDDL *schema.OnlineDDL) (conflictFound bool, conflictingMigration *schema.OnlineDDL) { 436 e.ownedRunningMigrations.Range(func(_, val any) bool { 437 runningMigration, ok := val.(*schema.OnlineDDL) 438 if !ok { 439 return true // continue iteration 440 } 441 if e.proposedMigrationConflictsWithRunningMigration(runningMigration, onlineDDL) { 442 conflictingMigration = runningMigration 443 return false // stop iteration, no need to review other migrations 444 } 445 return true // continue iteration 446 }) 447 return (conflictingMigration != nil), conflictingMigration 448 } 449 450 func (e *Executor) ghostPanicFlagFileName(uuid string) string { 451 return path.Join(os.TempDir(), fmt.Sprintf("ghost.%s.panic.flag", uuid)) 452 } 453 454 func (e *Executor) createGhostPanicFlagFile(uuid string) error { 455 _, err := os.Create(e.ghostPanicFlagFileName(uuid)) 456 return err 457 } 458 459 func (e *Executor) deleteGhostPanicFlagFile(uuid string) error { 460 // We use RemoveAll because if the file does not exist that's fine. Remove will return an error 461 // if file does not exist; RemoveAll does not. 462 return os.RemoveAll(e.ghostPanicFlagFileName(uuid)) 463 } 464 465 func (e *Executor) ghostPostponeFlagFileName(uuid string) string { 466 return path.Join(os.TempDir(), fmt.Sprintf("ghost.%s.postpone.flag", uuid)) 467 } 468 469 func (e *Executor) deleteGhostPostponeFlagFile(uuid string) error { 470 // We use RemoveAll because if the file does not exist that's fine. Remove will return an error 471 // if file does not exist; RemoveAll does not. 
472 return os.RemoveAll(e.ghostPostponeFlagFileName(uuid)) 473 } 474 475 func (e *Executor) ptPidFileName(uuid string) string { 476 return path.Join(os.TempDir(), fmt.Sprintf("pt-online-schema-change.%s.pid", uuid)) 477 } 478 479 // readMySQLVariables contacts the backend MySQL server to read some of its configuration 480 func (e *Executor) readMySQLVariables(ctx context.Context) (variables *mysqlVariables, err error) { 481 conn, err := e.pool.Get(ctx, nil) 482 if err != nil { 483 return nil, err 484 } 485 defer conn.Recycle() 486 487 tm, err := conn.Exec(ctx, `select 488 @@global.hostname as hostname, 489 @@global.port as port, 490 @@global.read_only as read_only, 491 @@global.version AS version, 492 @@global.version_comment AS version_comment 493 from dual`, 1, true) 494 if err != nil { 495 return nil, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "could not read MySQL variables: %v", err) 496 } 497 row := tm.Named().Row() 498 if row == nil { 499 return nil, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "unexpected result for MySQL variables: %+v", tm.Rows) 500 } 501 variables = &mysqlVariables{} 502 503 if e.env.Config().DB.Host != "" { 504 variables.host = e.env.Config().DB.Host 505 } else { 506 variables.host = row["hostname"].ToString() 507 } 508 509 if e.env.Config().DB.Port != 0 { 510 variables.port = e.env.Config().DB.Port 511 } else if port, err := row.ToInt64("port"); err != nil { 512 return nil, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "could not parse @@global.port %v: %v", tm, err) 513 } else { 514 variables.port = int(port) 515 } 516 if variables.readOnly, err = row.ToBool("read_only"); err != nil { 517 return nil, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "could not parse @@global.read_only %v: %v", tm, err) 518 } 519 520 variables.version = row["version"].ToString() 521 variables.versionComment = row["version_comment"].ToString() 522 523 return variables, nil 524 } 525 526 // createOnlineDDLUser creates a gh-ost or pt-osc user account with all 527 // neccessary 
privileges and with a random password 528 func (e *Executor) createOnlineDDLUser(ctx context.Context) (password string, err error) { 529 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaConnector()) 530 if err != nil { 531 return password, err 532 } 533 defer conn.Close() 534 535 password = RandomHash()[0:maxPasswordLength] 536 537 for _, query := range sqlCreateOnlineDDLUser { 538 parsed := sqlparser.BuildParsedQuery(query, onlineDDLGrant, password) 539 if _, err := conn.ExecuteFetch(parsed.Query, 0, false); err != nil { 540 return password, err 541 } 542 } 543 for _, query := range sqlGrantOnlineDDLSuper { 544 parsed := sqlparser.BuildParsedQuery(query, onlineDDLGrant) 545 conn.ExecuteFetch(parsed.Query, 0, false) 546 // We ignore failure, since we might not be able to grant 547 // SUPER privs (e.g. Aurora) 548 } 549 for _, query := range sqlGrantOnlineDDLUser { 550 parsed := sqlparser.BuildParsedQuery(query, onlineDDLGrant) 551 if _, err := conn.ExecuteFetch(parsed.Query, 0, false); err != nil { 552 return password, err 553 } 554 } 555 return password, err 556 } 557 558 // dropOnlineDDLUser drops the given ddl user account at the end of migration 559 func (e *Executor) dropOnlineDDLUser(ctx context.Context) error { 560 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaConnector()) 561 if err != nil { 562 return err 563 } 564 defer conn.Close() 565 566 parsed := sqlparser.BuildParsedQuery(sqlDropOnlineDDLUser, onlineDDLGrant) 567 _, err = conn.ExecuteFetch(parsed.Query, 0, false) 568 return err 569 } 570 571 // tableExists checks if a given table exists. 
572 func (e *Executor) tableExists(ctx context.Context, tableName string) (bool, error) { 573 tableName = strings.ReplaceAll(tableName, `_`, `\_`) 574 parsed := sqlparser.BuildParsedQuery(sqlShowTablesLike, tableName) 575 rs, err := e.execQuery(ctx, parsed.Query) 576 if err != nil { 577 return false, err 578 } 579 row := rs.Named().Row() 580 return (row != nil), nil 581 } 582 583 // showCreateTable returns the SHOW CREATE statement for a table or a view 584 func (e *Executor) showCreateTable(ctx context.Context, tableName string) (string, error) { 585 parsed := sqlparser.BuildParsedQuery(sqlShowCreateTable, tableName) 586 rs, err := e.execQuery(ctx, parsed.Query) 587 if err != nil { 588 return "", err 589 } 590 if len(rs.Rows) == 0 { 591 return "", nil 592 } 593 row := rs.Rows[0] 594 return row[1].ToString(), nil 595 } 596 597 func (e *Executor) parseAlterOptions(ctx context.Context, onlineDDL *schema.OnlineDDL) string { 598 // Temporary hack (2020-08-11) 599 // Because sqlparser does not do full blown ALTER TABLE parsing, 600 // and because we don't want gh-ost to know about WITH_GHOST and WITH_PT syntax, 601 // we resort to regexp-based parsing of the query. 602 // TODO(shlomi): generate _alter options_ via sqlparser when it full supports ALTER TABLE syntax. 
603 _, _, alterOptions := schema.ParseAlterTableOptions(onlineDDL.SQL) 604 return alterOptions 605 } 606 607 // executeDirectly runs a DDL query directly on the backend MySQL server 608 func (e *Executor) executeDirectly(ctx context.Context, onlineDDL *schema.OnlineDDL, acceptableMySQLErrorCodes ...int) (acceptableErrorCodeFound bool, err error) { 609 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaWithDB()) 610 if err != nil { 611 return false, err 612 } 613 defer conn.Close() 614 615 restoreSQLModeFunc, err := e.initMigrationSQLMode(ctx, onlineDDL, conn) 616 defer restoreSQLModeFunc() 617 if err != nil { 618 return false, err 619 } 620 621 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusRunning, false, progressPctStarted, etaSecondsUnknown, rowsCopiedUnknown, emptyHint) 622 _, err = conn.ExecuteFetch(onlineDDL.SQL, 0, false) 623 624 if err != nil { 625 // let's see if this error is actually acceptable 626 if merr, ok := err.(*mysql.SQLError); ok { 627 for _, acceptableCode := range acceptableMySQLErrorCodes { 628 if merr.Num == acceptableCode { 629 // we don't consider this to be an error. 
630 acceptableErrorCodeFound = true 631 err = nil 632 break 633 } 634 } 635 } 636 } 637 if err != nil { 638 return false, err 639 } 640 defer e.reloadSchema(ctx) 641 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, rowsCopiedUnknown, emptyHint) 642 643 return acceptableErrorCodeFound, nil 644 } 645 646 // doesConnectionInfoMatch checks if theres a MySQL connection in PROCESSLIST whose Info matches given text 647 func (e *Executor) doesConnectionInfoMatch(ctx context.Context, connID int64, submatch string) (bool, error) { 648 findProcessQuery, err := sqlparser.ParseAndBind(sqlFindProcess, 649 sqltypes.Int64BindVariable(connID), 650 sqltypes.StringBindVariable("%"+submatch+"%"), 651 ) 652 if err != nil { 653 return false, err 654 } 655 rs, err := e.execQuery(ctx, findProcessQuery) 656 if err != nil { 657 return false, err 658 } 659 return len(rs.Rows) == 1, nil 660 } 661 662 // tableParticipatesInForeignKeyRelationship checks if a given table is either a parent or a child in at least one foreign key constraint 663 func (e *Executor) tableParticipatesInForeignKeyRelationship(ctx context.Context, schema string, table string) (bool, error) { 664 for _, fkQuery := range []string{selSelectCountFKParentConstraints, selSelectCountFKChildConstraints} { 665 query, err := sqlparser.ParseAndBind(fkQuery, 666 sqltypes.StringBindVariable(schema), 667 sqltypes.StringBindVariable(table), 668 ) 669 if err != nil { 670 return false, err 671 } 672 r, err := e.execQuery(ctx, query) 673 if err != nil { 674 return false, err 675 } 676 row := r.Named().Row() 677 if row == nil { 678 return false, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "unexpected result from INFORMATION_SCHEMA.KEY_COLUMN_USAGE query: %s", query) 679 } 680 countFKConstraints := row.AsInt64("num_fk_constraints", 0) 681 if countFKConstraints > 0 { 682 return true, nil 683 } 684 } 685 return false, nil 686 } 687 688 // validateTableForAlterAction checks 
whether a table is good to undergo a ALTER operation. It returns detailed error if not. 689 func (e *Executor) validateTableForAlterAction(ctx context.Context, onlineDDL *schema.OnlineDDL) (err error) { 690 if !onlineDDL.StrategySetting().IsAllowForeignKeysFlag() { 691 // Validate table does not participate in foreign key relationship: 692 participates, err := e.tableParticipatesInForeignKeyRelationship(ctx, onlineDDL.Schema, onlineDDL.Table) 693 if err != nil { 694 return vterrors.Wrapf(err, "error while attempting to validate whether table %s participates in FOREIGN KEY constraint", onlineDDL.Table) 695 } 696 if participates { 697 return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "table %s participates in a FOREIGN KEY constraint and FOREIGN KEY constraints are not supported in Online DDL unless the *experimental and unsafe* --unsafe-allow-foreign-keys strategy flag is specified", onlineDDL.Table) 698 } 699 } 700 return nil 701 } 702 703 // primaryPosition returns the MySQL/MariaDB position (typically GTID pos) on the tablet 704 func (e *Executor) primaryPosition(ctx context.Context) (pos mysql.Position, err error) { 705 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaWithDB()) 706 if err != nil { 707 return pos, err 708 } 709 defer conn.Close() 710 711 pos, err = conn.PrimaryPosition() 712 return pos, err 713 } 714 715 // terminateVReplMigration stops vreplication, then removes the _vt.vreplication entry for the given migration 716 func (e *Executor) terminateVReplMigration(ctx context.Context, uuid string) error { 717 tmClient := e.tabletManagerClient() 718 defer tmClient.Close() 719 720 tablet, err := e.ts.GetTablet(ctx, e.tabletAlias) 721 if err != nil { 722 return err 723 } 724 query, err := sqlparser.ParseAndBind(sqlStopVReplStream, 725 sqltypes.StringBindVariable(e.dbName), 726 sqltypes.StringBindVariable(uuid), 727 ) 728 if err != nil { 729 return err 730 } 731 // silently skip error; stopping the stream is just a graceful act; later 
deleting it is more important 732 if _, err := e.vreplicationExec(ctx, tablet.Tablet, query); err != nil { 733 log.Errorf("FAIL vreplicationExec: uuid=%s, query=%v, error=%v", uuid, query, err) 734 } 735 736 if err := e.deleteVReplicationEntry(ctx, uuid); err != nil { 737 return err 738 } 739 return nil 740 } 741 742 // cutOverVReplMigration stops vreplication, then removes the _vt.vreplication entry for the given migration 743 func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream) error { 744 if err := e.incrementCutoverAttempts(ctx, s.workflow); err != nil { 745 return err 746 } 747 748 tmClient := e.tabletManagerClient() 749 defer tmClient.Close() 750 751 // sanity checks: 752 vreplTable, err := getVreplTable(ctx, s) 753 if err != nil { 754 return err 755 } 756 757 // get topology client & entities: 758 tablet, err := e.ts.GetTablet(ctx, e.tabletAlias) 759 if err != nil { 760 return err 761 } 762 763 // information about source tablet 764 onlineDDL, _, err := e.readMigration(ctx, s.workflow) 765 if err != nil { 766 return err 767 } 768 isVreplicationTestSuite := onlineDDL.StrategySetting().IsVreplicationTestSuite() 769 e.updateMigrationStage(ctx, onlineDDL.UUID, "starting cut-over") 770 771 var sentryTableName string 772 773 waitForPos := func(s *VReplStream, pos mysql.Position) error { 774 ctx, cancel := context.WithTimeout(ctx, vreplicationCutOverThreshold) 775 defer cancel() 776 // Wait for target to reach the up-to-date pos 777 if err := tmClient.VReplicationWaitForPos(ctx, tablet.Tablet, int(s.id), mysql.EncodePosition(pos)); err != nil { 778 return err 779 } 780 // Target is now in sync with source! 781 return nil 782 } 783 784 if !isVreplicationTestSuite { 785 // A bit early on, we generate names for stowaway and temporary tables 786 // We do this here because right now we're in a safe place where nothing happened yet. If there's an error now, bail out 787 // and no harm done. 
788 // Later on, when traffic is blocked and tables renamed, that's a more dangerous place to be in; we want as little logic 789 // in that place as possible. 790 sentryTableName, err = schema.GenerateGCTableName(schema.HoldTableGCState, newGCTableRetainTime()) 791 if err != nil { 792 return nil 793 } 794 795 // We create the sentry table before toggling writes, because this involves a WaitForPos, which takes some time. We 796 // don't want to overload the buffering time with this excessive wait. 797 798 if err := e.updateArtifacts(ctx, onlineDDL.UUID, sentryTableName); err != nil { 799 return err 800 } 801 parsed := sqlparser.BuildParsedQuery(sqlCreateSentryTable, sentryTableName) 802 if _, err := e.execQuery(ctx, parsed.Query); err != nil { 803 return err 804 } 805 e.updateMigrationStage(ctx, onlineDDL.UUID, "sentry table created: %s", sentryTableName) 806 807 postSentryPos, err := e.primaryPosition(ctx) 808 if err != nil { 809 return err 810 } 811 e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-sentry pos: %v", mysql.EncodePosition(postSentryPos)) 812 if err := waitForPos(s, postSentryPos); err != nil { 813 return err 814 } 815 e.updateMigrationStage(ctx, onlineDDL.UUID, "post-sentry pos reached") 816 } 817 818 lockConn, err := e.pool.Get(ctx, nil) 819 if err != nil { 820 return err 821 } 822 defer lockConn.Recycle() 823 defer lockConn.Exec(ctx, sqlUnlockTables, 1, false) 824 825 renameConn, err := e.pool.Get(ctx, nil) 826 if err != nil { 827 return err 828 } 829 defer renameConn.Recycle() 830 defer renameConn.Kill("premature exit while renaming tables", 0) 831 renameQuery := sqlparser.BuildParsedQuery(sqlSwapTables, onlineDDL.Table, sentryTableName, vreplTable, onlineDDL.Table, sentryTableName, vreplTable) 832 833 waitForRenameProcess := func() error { 834 // This function waits until it finds the RENAME TABLE... 
query running in MySQL's PROCESSLIST, or until timeout 835 // The function assumes that one of the renamed tables is locked, thus causing the RENAME to block. If nothing 836 // is locked, then the RENAME will be near-instantaneious and it's unlikely that the function will find it. 837 renameWaitCtx, cancel := context.WithTimeout(ctx, vreplicationCutOverThreshold) 838 defer cancel() 839 840 for { 841 renameProcessFound, err := e.doesConnectionInfoMatch(renameWaitCtx, renameConn.ID(), "rename") 842 if err != nil { 843 return err 844 } 845 if renameProcessFound { 846 return nil 847 } 848 select { 849 case <-renameWaitCtx.Done(): 850 return vterrors.Errorf(vtrpcpb.Code_ABORTED, "timeout for rename query: %s", renameQuery.Query) 851 case <-time.After(time.Second): 852 // sleep 853 } 854 } 855 } 856 857 renameCompleteChan := make(chan error) 858 859 bufferingCtx, bufferingContextCancel := context.WithCancel(ctx) 860 defer bufferingContextCancel() 861 // Preparation is complete. We proceed to cut-over. 862 toggleBuffering := func(bufferQueries bool) error { 863 log.Infof("toggling buffering: %t in migration %v", bufferQueries, onlineDDL.UUID) 864 e.toggleBufferTableFunc(bufferingCtx, onlineDDL.Table, bufferQueries) 865 if !bufferQueries { 866 // called after new table is in place. 
867 // unbuffer existing queries: 868 bufferingContextCancel() 869 // force re-read of tables 870 if err := tmClient.RefreshState(ctx, tablet.Tablet); err != nil { 871 return err 872 } 873 } 874 log.Infof("toggled buffering: %t in migration %v", bufferQueries, onlineDDL.UUID) 875 return nil 876 } 877 878 var reenableOnce sync.Once 879 reenableWritesOnce := func() { 880 reenableOnce.Do(func() { 881 log.Infof("re-enabling writes in migration %v", onlineDDL.UUID) 882 toggleBuffering(false) 883 go log.Infof("cutOverVReplMigration %v: unbuffered queries", s.workflow) 884 }) 885 } 886 e.updateMigrationStage(ctx, onlineDDL.UUID, "buffering queries") 887 // stop writes on source: 888 err = toggleBuffering(true) 889 defer reenableWritesOnce() 890 if err != nil { 891 return err 892 } 893 // Give a fraction of a second for a scenario where a query is in 894 // query executor, it passed the ACLs and is _about to_ execute. This will be nicer to those queries: 895 // they will be able to complete before the rename, rather than block briefly on the rename only to find 896 // the table no longer exists. 897 e.updateMigrationStage(ctx, onlineDDL.UUID, "graceful wait for buffering") 898 time.Sleep(100 * time.Millisecond) 899 900 if isVreplicationTestSuite { 901 // The testing suite may inject queries internally from the server via a recurring EVENT. 902 // Those queries are unaffected by query rules (ACLs) because they don't go through Vitess. 903 // We therefore hard-rename the table into an agreed upon name, and we won't swap it with 904 // the original table. We will actually make the table disappear, creating a void. 
905 testSuiteBeforeTableName := fmt.Sprintf("%s_before", onlineDDL.Table) 906 parsed := sqlparser.BuildParsedQuery(sqlRenameTable, onlineDDL.Table, testSuiteBeforeTableName) 907 if _, err := e.execQuery(ctx, parsed.Query); err != nil { 908 return err 909 } 910 e.updateMigrationStage(ctx, onlineDDL.UUID, "test suite 'before' table renamed") 911 } else { 912 // real production 913 914 e.updateMigrationStage(ctx, onlineDDL.UUID, "locking tables") 915 lockCtx, cancel := context.WithTimeout(ctx, vreplicationCutOverThreshold) 916 defer cancel() 917 lockTableQuery := sqlparser.BuildParsedQuery(sqlLockTwoTablesWrite, sentryTableName, onlineDDL.Table) 918 if _, err := lockConn.Exec(lockCtx, lockTableQuery.Query, 1, false); err != nil { 919 return err 920 } 921 922 e.updateMigrationStage(ctx, onlineDDL.UUID, "renaming tables") 923 go func() { 924 _, err := renameConn.Exec(ctx, renameQuery.Query, 1, false) 925 renameCompleteChan <- err 926 }() 927 // the rename should block, because of the LOCK. Wait for it to show up. 928 e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for RENAME to block") 929 if err := waitForRenameProcess(); err != nil { 930 return err 931 } 932 e.updateMigrationStage(ctx, onlineDDL.UUID, "RENAME found") 933 } 934 935 e.updateMigrationStage(ctx, onlineDDL.UUID, "reading post-lock pos") 936 postWritesPos, err := e.primaryPosition(ctx) 937 if err != nil { 938 return err 939 } 940 941 // Right now: new queries are buffered, any existing query will have executed, and worst case scenario is 942 // that some leftover query finds the table is not actually there anymore... 943 // At any case, there's definitely no more writes to the table since it does not exist. We can 944 // safely take the (GTID) pos now. 945 _ = e.updateMigrationTimestamp(ctx, "liveness_timestamp", s.workflow) 946 947 // Writes are now disabled on table. 
Read up-to-date vreplication info, specifically to get latest (and fixed) pos: 948 s, err = e.readVReplStream(ctx, s.workflow, false) 949 if err != nil { 950 return err 951 } 952 953 e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-lock pos: %v", mysql.EncodePosition(postWritesPos)) 954 if err := waitForPos(s, postWritesPos); err != nil { 955 e.updateMigrationStage(ctx, onlineDDL.UUID, "timeout while waiting for post-lock pos: %v", err) 956 return err 957 } 958 go log.Infof("cutOverVReplMigration %v: done waiting for position %v", s.workflow, mysql.EncodePosition(postWritesPos)) 959 // Stop vreplication 960 e.updateMigrationStage(ctx, onlineDDL.UUID, "stopping vreplication") 961 if _, err := e.vreplicationExec(ctx, tablet.Tablet, binlogplayer.StopVReplication(uint32(s.id), "stopped for online DDL cutover")); err != nil { 962 return err 963 } 964 go log.Infof("cutOverVReplMigration %v: stopped vreplication", s.workflow) 965 966 // rename tables atomically (remember, writes on source tables are stopped) 967 { 968 if isVreplicationTestSuite { 969 // this is used in Vitess endtoend testing suite 970 testSuiteAfterTableName := fmt.Sprintf("%s_after", onlineDDL.Table) 971 parsed := sqlparser.BuildParsedQuery(sqlRenameTable, vreplTable, testSuiteAfterTableName) 972 if _, err := e.execQuery(ctx, parsed.Query); err != nil { 973 return err 974 } 975 e.updateMigrationStage(ctx, onlineDDL.UUID, "test suite 'after' table renamed") 976 } else { 977 e.updateMigrationStage(ctx, onlineDDL.UUID, "validating rename is still in place") 978 if err := waitForRenameProcess(); err != nil { 979 return err 980 } 981 982 // Normal (non-testing) alter table 983 e.updateMigrationStage(ctx, onlineDDL.UUID, "dropping sentry table") 984 985 { 986 dropTableQuery := sqlparser.BuildParsedQuery(sqlDropTable, sentryTableName) 987 lockCtx, cancel := context.WithTimeout(ctx, vreplicationCutOverThreshold) 988 defer cancel() 989 if _, err := lockConn.Exec(lockCtx, dropTableQuery.Query, 1, 
false); err != nil { 990 return err 991 } 992 } 993 { 994 lockCtx, cancel := context.WithTimeout(ctx, vreplicationCutOverThreshold) 995 defer cancel() 996 e.updateMigrationStage(ctx, onlineDDL.UUID, "unlocking tables") 997 if _, err := lockConn.Exec(lockCtx, sqlUnlockTables, 1, false); err != nil { 998 return err 999 } 1000 } 1001 { 1002 lockCtx, cancel := context.WithTimeout(ctx, vreplicationCutOverThreshold) 1003 defer cancel() 1004 e.updateMigrationStage(lockCtx, onlineDDL.UUID, "waiting for RENAME to complete") 1005 if err := <-renameCompleteChan; err != nil { 1006 return err 1007 } 1008 } 1009 } 1010 } 1011 e.updateMigrationStage(ctx, onlineDDL.UUID, "cut-over complete") 1012 e.ownedRunningMigrations.Delete(onlineDDL.UUID) 1013 1014 go func() { 1015 // Tables are swapped! Let's take the opportunity to ReloadSchema now 1016 // We do this in a goroutine because it might take time on a schema with thousands of tables, and we don't want to delay 1017 // the cut-over. 1018 // this means ReloadSchema is not in sync with the actual schema change. Users will still need to run tracker if they want to sync. 1019 // In the future, we will want to reload the single table, instead of reloading the schema. 1020 if err := e.reloadSchema(ctx); err != nil { 1021 vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "Error on ReloadSchema while cutting over vreplication migration UUID: %+v", onlineDDL.UUID) 1022 } 1023 }() 1024 1025 // Tables are now swapped! 
Migration is successful 1026 e.updateMigrationStage(ctx, onlineDDL.UUID, "re-enabling writes") 1027 reenableWritesOnce() // this function is also deferred, in case of early return; but now would be a good time to resume writes, before we publish the migration as "complete" 1028 go log.Infof("cutOverVReplMigration %v: marking as complete", s.workflow) 1029 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, s.rowsCopied, emptyHint) 1030 return nil 1031 1032 // deferred function will re-enable writes now 1033 } 1034 1035 // initMigrationSQLMode sets sql_mode according to DDL strategy, and returns a function that 1036 // restores sql_mode to original state 1037 func (e *Executor) initMigrationSQLMode(ctx context.Context, onlineDDL *schema.OnlineDDL, conn *dbconnpool.DBConnection) (deferFunc func(), err error) { 1038 deferFunc = func() {} 1039 if !onlineDDL.StrategySetting().IsAllowZeroInDateFlag() { 1040 // No need to change sql_mode. 
1041 return deferFunc, nil 1042 } 1043 1044 // Grab current sql_mode value 1045 rs, err := conn.ExecuteFetch(`select @@session.sql_mode as sql_mode`, 1, true) 1046 if err != nil { 1047 return deferFunc, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "could not read sql_mode: %v", err) 1048 } 1049 sqlMode, err := rs.Named().Row().ToString("sql_mode") 1050 if err != nil { 1051 return deferFunc, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "could not read sql_mode: %v", err) 1052 } 1053 // Pre-calculate restore function 1054 deferFunc = func() { 1055 restoreSQLModeQuery := fmt.Sprintf("set @@session.sql_mode='%s'", sqlMode) 1056 conn.ExecuteFetch(restoreSQLModeQuery, 0, false) 1057 } 1058 // Change sql_mode 1059 changeSQLModeQuery := fmt.Sprintf("set @@session.sql_mode=REPLACE(REPLACE('%s', 'NO_ZERO_DATE', ''), 'NO_ZERO_IN_DATE', '')", sqlMode) 1060 if _, err := conn.ExecuteFetch(changeSQLModeQuery, 0, false); err != nil { 1061 return deferFunc, err 1062 } 1063 return deferFunc, nil 1064 } 1065 1066 // newConstraintName generates a new, unique name for a constraint. Our problem is that a MySQL 1067 // constraint's name is unique in the schema (!). And so as we duplicate the original table, we must 1068 // create completely new names for all constraints. 1069 // Moreover, we really want this name to be consistent across all shards. We therefore use a deterministic 1070 // UUIDv5 (SHA) function over the migration UUID, table name, and constraint's _contents_. 1071 // We _also_ include the original constraint name as prefix, as room allows 1072 // for example, if the original constraint name is "check_1", 1073 // we might generate "check_1_cps1okb4uafunfqusi2lp22u3". 
1074 // If we then again migrate a table whose constraint name is "check_1_cps1okb4uafunfqusi2lp22u3 " we 1075 // get for example "check_1_19l09s37kbhj4axnzmi10e18k" (hash changes, and we still try to preserve original name) 1076 // 1077 // Furthermore, per bug report https://bugs.mysql.com/bug.php?id=107772, if the user doesn't provide a name for 1078 // their CHECK constraint, then MySQL picks a name in this format <tablename>_chk_<number>. 1079 // Example: sometable_chk_1 1080 // Next, when MySQL is asked to RENAME TABLE and sees a constraint with this format, it attempts to rename 1081 // the constraint with the new table's name. This is problematic for Vitess, because we often rename tables to 1082 // very long names, such as _vt_HOLD_394f9e6dfc3d11eca0390a43f95f28a3_20220706091048. 1083 // As we rename the constraint to e.g. `sometable_chk_1_cps1okb4uafunfqusi2lp22u3`, this makes MySQL want to 1084 // call the new constraint something like _vt_HOLD_394f9e6dfc3d11eca0390a43f95f28a3_20220706091048_chk_1_cps1okb4uafunfqusi2lp22u3, 1085 // which exceeds the 64 character limit for table names. Long story short, we also trim down <tablename> if the constraint seems 1086 // to be auto-generated. 
1087 func (e *Executor) newConstraintName(onlineDDL *schema.OnlineDDL, constraintType ConstraintType, hashExists map[string]bool, seed string, oldName string) string { 1088 constraintIndicator := constraintIndicatorMap[int(constraintType)] 1089 oldName = schemadiff.ExtractConstraintOriginalName(oldName) 1090 autoGeneratedName := fmt.Sprintf("%s_%s_", onlineDDL.Table, constraintIndicator) 1091 if strings.HasPrefix(oldName, autoGeneratedName) { 1092 // strip out table name 1093 oldName = constraintIndicator + "_" + oldName[len(autoGeneratedName):] 1094 } 1095 1096 hash := textutil.UUIDv5Base36(onlineDDL.UUID, onlineDDL.Table, seed) 1097 for i := 1; hashExists[hash]; i++ { 1098 hash = textutil.UUIDv5Base36(onlineDDL.UUID, onlineDDL.Table, seed, fmt.Sprintf("%d", i)) 1099 } 1100 hashExists[hash] = true 1101 suffix := "_" + hash 1102 maxAllowedNameLength := maxConstraintNameLength - len(suffix) 1103 newName := oldName 1104 if newName == "" { 1105 newName = constraintIndicator // start with something that looks consistent with MySQL's naming 1106 } 1107 if len(newName) > maxAllowedNameLength { 1108 newName = newName[0:maxAllowedNameLength] 1109 } 1110 newName = newName + suffix 1111 return newName 1112 } 1113 1114 // validateAndEditCreateTableStatement inspects the CreateTable AST and does the following: 1115 // - extra validation (no FKs for now...) 
1116 // - generate new and unique names for all constraints (CHECK and FK; yes, why not handle FK names; even as we don't support FKs today, we may in the future) 1117 func (e *Executor) validateAndEditCreateTableStatement(ctx context.Context, onlineDDL *schema.OnlineDDL, createTable *sqlparser.CreateTable) (constraintMap map[string]string, err error) { 1118 constraintMap = map[string]string{} 1119 hashExists := map[string]bool{} 1120 1121 validateWalk := func(node sqlparser.SQLNode) (kontinue bool, err error) { 1122 switch node := node.(type) { 1123 case *sqlparser.ForeignKeyDefinition: 1124 if !onlineDDL.StrategySetting().IsAllowForeignKeysFlag() { 1125 return false, schema.ErrForeignKeyFound 1126 } 1127 case *sqlparser.ConstraintDefinition: 1128 oldName := node.Name.String() 1129 newName := e.newConstraintName(onlineDDL, GetConstraintType(node.Details), hashExists, sqlparser.CanonicalString(node.Details), oldName) 1130 node.Name = sqlparser.NewIdentifierCI(newName) 1131 constraintMap[oldName] = newName 1132 } 1133 return true, nil 1134 } 1135 if err := sqlparser.Walk(validateWalk, createTable); err != nil { 1136 return constraintMap, err 1137 } 1138 return constraintMap, nil 1139 } 1140 1141 // validateAndEditAlterTableStatement inspects the AlterTable statement and: 1142 // - modifies any CONSTRAINT name according to given name mapping 1143 // - explode ADD FULLTEXT KEY into multiple statements 1144 func (e *Executor) validateAndEditAlterTableStatement(ctx context.Context, onlineDDL *schema.OnlineDDL, alterTable *sqlparser.AlterTable, constraintMap map[string]string) (alters []*sqlparser.AlterTable, err error) { 1145 hashExists := map[string]bool{} 1146 validateWalk := func(node sqlparser.SQLNode) (kontinue bool, err error) { 1147 switch node := node.(type) { 1148 case *sqlparser.DropKey: 1149 if node.Type == sqlparser.CheckKeyType { 1150 // drop a check constraint 1151 mappedName, ok := constraintMap[node.Name.String()] 1152 if !ok { 1153 return false, 
vterrors.Errorf(vtrpcpb.Code_INTERNAL, "Found DROP CONSTRAINT: %v, but could not find constraint name in map", sqlparser.CanonicalString(node)) 1154 } 1155 node.Name = sqlparser.NewIdentifierCI(mappedName) 1156 } 1157 case *sqlparser.AddConstraintDefinition: 1158 oldName := node.ConstraintDefinition.Name.String() 1159 newName := e.newConstraintName(onlineDDL, GetConstraintType(node.ConstraintDefinition.Details), hashExists, sqlparser.CanonicalString(node.ConstraintDefinition.Details), oldName) 1160 node.ConstraintDefinition.Name = sqlparser.NewIdentifierCI(newName) 1161 constraintMap[oldName] = newName 1162 } 1163 return true, nil 1164 } 1165 if err := sqlparser.Walk(validateWalk, alterTable); err != nil { 1166 return alters, err 1167 } 1168 alters = append(alters, alterTable) 1169 // Handle ADD FULLTEXT KEY statements 1170 countAddFullTextStatements := 0 1171 redactedOptions := make([]sqlparser.AlterOption, 0, len(alterTable.AlterOptions)) 1172 for i := range alterTable.AlterOptions { 1173 opt := alterTable.AlterOptions[i] 1174 switch opt := opt.(type) { 1175 case sqlparser.AlgorithmValue: 1176 // we do not pass ALGORITHM. We choose our own ALGORITHM. 1177 continue 1178 case *sqlparser.AddIndexDefinition: 1179 if opt.IndexDefinition.Info.Fulltext { 1180 countAddFullTextStatements++ 1181 if countAddFullTextStatements > 1 { 1182 // We've already got one ADD FULLTEXT KEY. 
We can't have another 1183 // in the same statement 1184 extraAlterTable := &sqlparser.AlterTable{ 1185 Table: alterTable.Table, 1186 AlterOptions: []sqlparser.AlterOption{opt, copyAlgorithm}, 1187 } 1188 alters = append(alters, extraAlterTable) 1189 continue 1190 } 1191 } 1192 } 1193 redactedOptions = append(redactedOptions, opt) 1194 } 1195 alterTable.AlterOptions = redactedOptions 1196 alterTable.AlterOptions = append(alterTable.AlterOptions, copyAlgorithm) 1197 return alters, nil 1198 } 1199 1200 // createTableLike creates the table named by `newTableName` in the likeness of onlineDDL.Table 1201 // This function emulates MySQL's `CREATE TABLE LIKE ...` statement. The difference is that this function takes control over the generated CONSTRAINT names, 1202 // if any, such that they are detrministic across shards, as well as preserve original names where possible. 1203 func (e *Executor) createTableLike(ctx context.Context, newTableName string, onlineDDL *schema.OnlineDDL, conn *dbconnpool.DBConnection) (constraintMap map[string]string, err error) { 1204 existingShowCreateTable, err := e.showCreateTable(ctx, onlineDDL.Table) 1205 if err != nil { 1206 return nil, vterrors.Wrapf(err, "in createTableLike(), newTableName=%s", newTableName) 1207 } 1208 stmt, err := sqlparser.ParseStrictDDL(existingShowCreateTable) 1209 if err != nil { 1210 return nil, err 1211 } 1212 createTable, ok := stmt.(*sqlparser.CreateTable) 1213 if !ok { 1214 return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "expected CreateTable statement, got: %v", sqlparser.CanonicalString(stmt)) 1215 } 1216 createTable.SetTable(createTable.GetTable().Qualifier.CompliantName(), newTableName) 1217 // manipulate CreateTable statement: take care of constraints names which have to be 1218 // unique across the schema 1219 constraintMap, err = e.validateAndEditCreateTableStatement(ctx, onlineDDL, createTable) 1220 if err != nil { 1221 return nil, err 1222 } 1223 // Create the table 1224 if _, err := 
conn.ExecuteFetch(sqlparser.CanonicalString(createTable), 0, false); err != nil { 1225 return nil, err 1226 } 1227 return constraintMap, nil 1228 } 1229 1230 // initVreplicationOriginalMigration performs the first steps towards running a VRepl ALTER migration: 1231 // - analyze the original table 1232 // - formalize a new CreateTable statement 1233 // - inspect the ALTER TABLE query 1234 // - formalize an AlterTable statement 1235 // - create the vrepl table 1236 // - modify the vrepl table 1237 // - Create and return a VRepl instance 1238 func (e *Executor) initVreplicationOriginalMigration(ctx context.Context, onlineDDL *schema.OnlineDDL, conn *dbconnpool.DBConnection) (v *VRepl, err error) { 1239 restoreSQLModeFunc, err := e.initMigrationSQLMode(ctx, onlineDDL, conn) 1240 defer restoreSQLModeFunc() 1241 if err != nil { 1242 return v, err 1243 } 1244 1245 vreplTableName := fmt.Sprintf("_%s_%s_vrepl", onlineDDL.UUID, ReadableTimestamp()) 1246 if err := e.updateArtifacts(ctx, onlineDDL.UUID, vreplTableName); err != nil { 1247 return v, err 1248 } 1249 constraintMap, err := e.createTableLike(ctx, vreplTableName, onlineDDL, conn) 1250 if err != nil { 1251 return nil, err 1252 } 1253 { 1254 stmt, err := sqlparser.ParseStrictDDL(onlineDDL.SQL) 1255 if err != nil { 1256 return nil, err 1257 } 1258 alterTable, ok := stmt.(*sqlparser.AlterTable) 1259 if !ok { 1260 return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "expected AlterTable statement, got: %v", sqlparser.CanonicalString(stmt)) 1261 } 1262 // ALTER TABLE should apply to the vrepl table 1263 alterTable.SetTable(alterTable.GetTable().Qualifier.CompliantName(), vreplTableName) 1264 // Also, change any constraint names: 1265 alters, err := e.validateAndEditAlterTableStatement(ctx, onlineDDL, alterTable, constraintMap) 1266 if err != nil { 1267 return v, err 1268 } 1269 // Apply ALTER TABLE to materialized table 1270 for _, alter := range alters { 1271 if _, err := conn.ExecuteFetch(sqlparser.CanonicalString(alter), 
0, false); err != nil { 1272 return v, err 1273 } 1274 } 1275 } 1276 v = NewVRepl(onlineDDL.UUID, e.keyspace, e.shard, e.dbName, onlineDDL.Table, vreplTableName, onlineDDL.SQL) 1277 return v, nil 1278 } 1279 1280 // postInitVreplicationOriginalMigration runs extra changes after a vreplication online DDL has been initialized. 1281 // This function is called after both source and target tables have been analyzed, so there's more information 1282 // about the two, and about the transition between the two. 1283 func (e *Executor) postInitVreplicationOriginalMigration(ctx context.Context, onlineDDL *schema.OnlineDDL, v *VRepl, conn *dbconnpool.DBConnection) (err error) { 1284 if v.sourceAutoIncrement > 0 && !v.parser.IsAutoIncrementDefined() { 1285 restoreSQLModeFunc, err := e.initMigrationSQLMode(ctx, onlineDDL, conn) 1286 defer restoreSQLModeFunc() 1287 if err != nil { 1288 return err 1289 } 1290 1291 // Apply ALTER TABLE AUTO_INCREMENT=? 1292 parsed := sqlparser.BuildParsedQuery(sqlAlterTableAutoIncrement, v.targetTable, ":auto_increment") 1293 bindVars := map[string]*querypb.BindVariable{ 1294 "auto_increment": sqltypes.Uint64BindVariable(v.sourceAutoIncrement), 1295 } 1296 bound, err := parsed.GenerateQuery(bindVars, nil) 1297 if err != nil { 1298 return err 1299 } 1300 if _, err := conn.ExecuteFetch(bound, 0, false); err != nil { 1301 return err 1302 } 1303 } 1304 return nil 1305 } 1306 1307 func (e *Executor) initVreplicationRevertMigration(ctx context.Context, onlineDDL *schema.OnlineDDL, revertMigration *schema.OnlineDDL) (v *VRepl, err error) { 1308 // Getting here we've already validated that migration is revertible 1309 1310 // Validation: vreplication still exists for reverted migration 1311 revertStream, err := e.readVReplStream(ctx, revertMigration.UUID, false) 1312 if err != nil { 1313 // cannot read the vreplication stream which we want to revert 1314 return nil, fmt.Errorf("can not revert vreplication migration %s because vreplication stream %s was not 
found", revertMigration.UUID, revertMigration.UUID) 1315 } 1316 1317 onlineDDL.Table = revertMigration.Table 1318 if err := e.updateMySQLTable(ctx, onlineDDL.UUID, onlineDDL.Table); err != nil { 1319 return nil, err 1320 } 1321 1322 vreplTableName, err := getVreplTable(ctx, revertStream) 1323 if err != nil { 1324 return nil, err 1325 } 1326 1327 if err := e.updateArtifacts(ctx, onlineDDL.UUID, vreplTableName); err != nil { 1328 return v, err 1329 } 1330 v = NewVRepl(onlineDDL.UUID, e.keyspace, e.shard, e.dbName, onlineDDL.Table, vreplTableName, "") 1331 v.pos = revertStream.pos 1332 return v, nil 1333 } 1334 1335 // ExecuteWithVReplication sets up the grounds for a vreplication schema migration 1336 func (e *Executor) ExecuteWithVReplication(ctx context.Context, onlineDDL *schema.OnlineDDL, revertMigration *schema.OnlineDDL) error { 1337 // make sure there's no vreplication workflow running under same name 1338 _ = e.terminateVReplMigration(ctx, onlineDDL.UUID) 1339 1340 if conflictFound, conflictingMigration := e.isAnyConflictingMigrationRunning(onlineDDL); conflictFound { 1341 return vterrors.Wrapf(ErrExecutorMigrationAlreadyRunning, "conflicting migration: %v over table: %v", conflictingMigration.UUID, conflictingMigration.Table) 1342 } 1343 1344 if e.tabletTypeFunc() != topodatapb.TabletType_PRIMARY { 1345 return ErrExecutorNotWritableTablet 1346 } 1347 1348 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaWithDB()) 1349 if err != nil { 1350 return err 1351 } 1352 defer conn.Close() 1353 1354 e.ownedRunningMigrations.Store(onlineDDL.UUID, onlineDDL) 1355 if err := e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusRunning, false, progressPctStarted, etaSecondsUnknown, rowsCopiedUnknown, emptyHint); err != nil { 1356 return err 1357 } 1358 1359 var v *VRepl 1360 if revertMigration == nil { 1361 // Original ALTER TABLE request for vreplication 1362 v, err = e.initVreplicationOriginalMigration(ctx, onlineDDL, conn) 1363 } else { 
1364 // this is a revert request 1365 v, err = e.initVreplicationRevertMigration(ctx, onlineDDL, revertMigration) 1366 } 1367 if err != nil { 1368 return err 1369 } 1370 if err := v.analyze(ctx, conn); err != nil { 1371 return err 1372 } 1373 if err := e.updateMigrationTableRows(ctx, onlineDDL.UUID, v.tableRows); err != nil { 1374 return err 1375 } 1376 removedUniqueKeyNames := []string{} 1377 for _, uniqueKey := range v.removedUniqueKeys { 1378 removedUniqueKeyNames = append(removedUniqueKeyNames, uniqueKey.Name) 1379 } 1380 1381 if err := e.updateSchemaAnalysis(ctx, onlineDDL.UUID, 1382 len(v.addedUniqueKeys), 1383 len(v.removedUniqueKeys), 1384 strings.Join(sqlescape.EscapeIDs(removedUniqueKeyNames), ","), 1385 strings.Join(sqlescape.EscapeIDs(v.droppedNoDefaultColumnNames), ","), 1386 strings.Join(sqlescape.EscapeIDs(v.expandedColumnNames), ","), 1387 v.revertibleNotes, 1388 ); err != nil { 1389 return err 1390 } 1391 if revertMigration == nil { 1392 // Original ALTER TABLE request for vreplication 1393 if err := e.validateTableForAlterAction(ctx, onlineDDL); err != nil { 1394 return err 1395 } 1396 if err := e.postInitVreplicationOriginalMigration(ctx, onlineDDL, v, conn); err != nil { 1397 return err 1398 } 1399 } 1400 1401 { 1402 // We need to talk to tabletmanager's VREngine. But we're on TabletServer. While we live in the same 1403 // process as VREngine, it is actually simpler to get hold of it via gRPC, just like wrangler does. 1404 tablet, err := e.ts.GetTablet(ctx, e.tabletAlias) 1405 if err != nil { 1406 return err 1407 } 1408 1409 // reload schema before migration 1410 if err := e.reloadSchema(ctx); err != nil { 1411 return err 1412 } 1413 1414 // create vreplication entry 1415 insertVReplicationQuery, err := v.generateInsertStatement(ctx) 1416 if err != nil { 1417 return err 1418 } 1419 if _, err := e.vreplicationExec(ctx, tablet.Tablet, insertVReplicationQuery); err != nil { 1420 return err 1421 } 1422 1423 { 1424 // temporary hack. 
todo: this should be done when inserting any _vt.vreplication record across all workflow types 1425 query := fmt.Sprintf("update _vt.vreplication set workflow_type = %d where workflow = '%s'", 1426 binlogdatapb.VReplicationWorkflowType_OnlineDDL, v.workflow) 1427 if _, err := e.vreplicationExec(ctx, tablet.Tablet, query); err != nil { 1428 return vterrors.Wrapf(err, "VReplicationExec(%v, %s)", tablet.Tablet, query) 1429 } 1430 } 1431 // start stream! 1432 startVReplicationQuery, err := v.generateStartStatement(ctx) 1433 if err != nil { 1434 return err 1435 } 1436 if _, err := e.vreplicationExec(ctx, tablet.Tablet, startVReplicationQuery); err != nil { 1437 return err 1438 } 1439 } 1440 return nil 1441 } 1442 1443 // ExecuteWithGhost validates and runs a gh-ost process. 1444 // Validation included testing the backend MySQL server and the gh-ost binary itself 1445 // Execution runs first a dry run, then an actual migration 1446 func (e *Executor) ExecuteWithGhost(ctx context.Context, onlineDDL *schema.OnlineDDL) error { 1447 if conflictFound, conflictingMigration := e.isAnyConflictingMigrationRunning(onlineDDL); conflictFound { 1448 return vterrors.Wrapf(ErrExecutorMigrationAlreadyRunning, "conflicting migration: %v over table: %v", conflictingMigration.UUID, conflictingMigration.Table) 1449 } 1450 1451 if e.tabletTypeFunc() != topodatapb.TabletType_PRIMARY { 1452 return ErrExecutorNotWritableTablet 1453 } 1454 variables, err := e.readMySQLVariables(ctx) 1455 if err != nil { 1456 log.Errorf("Error before running gh-ost: %+v", err) 1457 return err 1458 } 1459 if variables.readOnly { 1460 err := fmt.Errorf("Error before running gh-ost: MySQL server is read_only") 1461 log.Errorf(err.Error()) 1462 return err 1463 } 1464 onlineDDLPassword, err := e.createOnlineDDLUser(ctx) 1465 if err != nil { 1466 err := fmt.Errorf("Error creating gh-ost user: %+v", err) 1467 log.Errorf(err.Error()) 1468 return err 1469 } 1470 tempDir, err := createTempDir(onlineDDL.UUID) 1471 if err != 
nil { 1472 log.Errorf("Error creating temporary directory: %+v", err) 1473 return err 1474 } 1475 binaryFileName, _ := GhostBinaryFileName() 1476 credentialsConfigFileContent := fmt.Sprintf(`[client] 1477 user=%s 1478 password=${ONLINE_DDL_PASSWORD} 1479 `, onlineDDLUser) 1480 credentialsConfigFileName, err := createTempScript(tempDir, "gh-ost-conf.cfg", credentialsConfigFileContent) 1481 if err != nil { 1482 log.Errorf("Error creating config file: %+v", err) 1483 return err 1484 } 1485 wrapperScriptContent := fmt.Sprintf(`#!/bin/bash 1486 ghost_log_path="%s" 1487 ghost_log_file="%s" 1488 ghost_log_failure_file="%s" 1489 1490 mkdir -p "$ghost_log_path" 1491 1492 export ONLINE_DDL_PASSWORD 1493 %s "$@" > "$ghost_log_path/$ghost_log_file" 2>&1 1494 exit_code=$? 1495 grep -o '\bFATAL\b.*' "$ghost_log_path/$ghost_log_file" | tail -1 > "$ghost_log_path/$ghost_log_failure_file" 1496 exit $exit_code 1497 `, tempDir, migrationLogFileName, migrationFailureFileName, binaryFileName, 1498 ) 1499 wrapperScriptFileName, err := createTempScript(tempDir, "gh-ost-wrapper.sh", wrapperScriptContent) 1500 if err != nil { 1501 log.Errorf("Error creating wrapper script: %+v", err) 1502 return err 1503 } 1504 onHookContent := func(status schema.OnlineDDLStatus, hint string) string { 1505 return fmt.Sprintf(`#!/bin/bash 1506 curl --max-time 10 -s 'http://localhost:%d/schema-migration/report-status?uuid=%s&status=%s&hint=%s&dryrun='"$GH_OST_DRY_RUN"'&progress='"$GH_OST_PROGRESS"'&eta='"$GH_OST_ETA_SECONDS"'&rowscopied='"$GH_OST_COPIED_ROWS" 1507 `, servenv.Port(), onlineDDL.UUID, string(status), hint) 1508 } 1509 if _, err := createTempScript(tempDir, "gh-ost-on-startup", onHookContent(schema.OnlineDDLStatusRunning, emptyHint)); err != nil { 1510 log.Errorf("Error creating script: %+v", err) 1511 return err 1512 } 1513 if _, err := createTempScript(tempDir, "gh-ost-on-status", onHookContent(schema.OnlineDDLStatusRunning, emptyHint)); err != nil { 1514 log.Errorf("Error creating script: 
%+v", err) 1515 return err 1516 } 1517 if _, err := createTempScript(tempDir, "gh-ost-on-success", onHookContent(schema.OnlineDDLStatusComplete, emptyHint)); err != nil { 1518 log.Errorf("Error creating script: %+v", err) 1519 return err 1520 } 1521 if _, err := createTempScript(tempDir, "gh-ost-on-failure", onHookContent(schema.OnlineDDLStatusFailed, emptyHint)); err != nil { 1522 log.Errorf("Error creating script: %+v", err) 1523 return err 1524 } 1525 if _, err := createTempScript(tempDir, "gh-ost-on-begin-postponed", onHookContent(schema.OnlineDDLStatusRunning, readyToCompleteHint)); err != nil { 1526 log.Errorf("Error creating script: %+v", err) 1527 return err 1528 } 1529 serveSocketFile := path.Join(tempDir, "serve.sock") 1530 1531 if err := e.deleteGhostPanicFlagFile(onlineDDL.UUID); err != nil { 1532 log.Errorf("Error removing gh-ost panic flag file %s: %+v", e.ghostPanicFlagFileName(onlineDDL.UUID), err) 1533 return err 1534 } 1535 if err := e.deleteGhostPostponeFlagFile(onlineDDL.UUID); err != nil { 1536 log.Errorf("Error removing gh-ost postpone flag file %s before migration: %+v", e.ghostPostponeFlagFileName(onlineDDL.UUID), err) 1537 return err 1538 } 1539 // Validate gh-ost binary: 1540 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, "validating gh-ost --version") 1541 log.Infof("Will now validate gh-ost binary") 1542 _, err = execCmd( 1543 "bash", 1544 []string{ 1545 wrapperScriptFileName, 1546 "--version", 1547 }, 1548 os.Environ(), 1549 "/tmp", 1550 nil, 1551 nil, 1552 ) 1553 if err != nil { 1554 log.Errorf("Error testing gh-ost binary: %+v", err) 1555 return err 1556 } 1557 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, "validated gh-ost --version") 1558 log.Infof("+ OK") 1559 1560 if err := e.updateMigrationLogPath(ctx, onlineDDL.UUID, variables.host, tempDir); err != nil { 1561 return err 1562 } 1563 1564 runGhost := func(execute bool) error { 1565 alterOptions := e.parseAlterOptions(ctx, onlineDDL) 1566 forceTableNames := 
fmt.Sprintf("%s_%s", onlineDDL.UUID, ReadableTimestamp()) 1567 1568 if err := e.updateArtifacts(ctx, onlineDDL.UUID, 1569 fmt.Sprintf("_%s_gho", forceTableNames), 1570 fmt.Sprintf("_%s_ghc", forceTableNames), 1571 fmt.Sprintf("_%s_del", forceTableNames), 1572 ); err != nil { 1573 return err 1574 } 1575 1576 os.Setenv("ONLINE_DDL_PASSWORD", onlineDDLPassword) 1577 args := []string{ 1578 wrapperScriptFileName, 1579 fmt.Sprintf(`--host=%s`, variables.host), 1580 fmt.Sprintf(`--port=%d`, variables.port), 1581 fmt.Sprintf(`--conf=%s`, credentialsConfigFileName), // user & password found here 1582 `--allow-on-master`, 1583 `--max-load=Threads_running=900`, 1584 `--critical-load=Threads_running=1000`, 1585 `--critical-load-hibernate-seconds=60`, 1586 `--approve-renamed-columns`, 1587 `--debug`, 1588 `--exact-rowcount`, 1589 `--default-retries=120`, 1590 fmt.Sprintf("--force-table-names=%s", forceTableNames), 1591 fmt.Sprintf("--serve-socket-file=%s", serveSocketFile), 1592 fmt.Sprintf("--hooks-path=%s", tempDir), 1593 fmt.Sprintf(`--hooks-hint-token=%s`, onlineDDL.UUID), 1594 fmt.Sprintf(`--throttle-http=http://localhost:%d/throttler/check?app=%s:gh-ost:%s&p=low`, servenv.Port(), throttlerOnlineDDLApp, onlineDDL.UUID), 1595 fmt.Sprintf(`--database=%s`, e.dbName), 1596 fmt.Sprintf(`--table=%s`, onlineDDL.Table), 1597 fmt.Sprintf(`--alter=%s`, alterOptions), 1598 fmt.Sprintf(`--panic-flag-file=%s`, e.ghostPanicFlagFileName(onlineDDL.UUID)), 1599 fmt.Sprintf(`--execute=%t`, execute), 1600 } 1601 if onlineDDL.StrategySetting().IsAllowZeroInDateFlag() { 1602 args = append(args, "--allow-zero-in-date") 1603 } 1604 if execute && onlineDDL.StrategySetting().IsPostponeCompletion() { 1605 args = append(args, "--postpone-cut-over-flag-file", e.ghostPostponeFlagFileName(onlineDDL.UUID)) 1606 } 1607 1608 args = append(args, onlineDDL.StrategySetting().RuntimeOptions()...) 
1609 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, fmt.Sprintf("executing gh-ost --execute=%v", execute)) 1610 _, err := execCmd("bash", args, os.Environ(), "/tmp", nil, nil) 1611 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, fmt.Sprintf("executed gh-ost --execute=%v, err=%v", execute, err)) 1612 if err != nil { 1613 // See if we can get more info from the failure file 1614 if content, ferr := os.ReadFile(path.Join(tempDir, migrationFailureFileName)); ferr == nil { 1615 failureMessage := strings.TrimSpace(string(content)) 1616 if failureMessage != "" { 1617 // This message was produced by gh-ost itself. It is more informative than the default "migration failed..." message. Overwrite. 1618 return errors.New(failureMessage) 1619 } 1620 } 1621 } 1622 return err 1623 } 1624 1625 e.ownedRunningMigrations.Store(onlineDDL.UUID, onlineDDL) 1626 1627 go func() error { 1628 defer e.ownedRunningMigrations.Delete(onlineDDL.UUID) 1629 defer e.deleteGhostPostponeFlagFile(onlineDDL.UUID) // irrespective whether the file was in fact in use or not 1630 defer e.dropOnlineDDLUser(ctx) 1631 defer e.gcArtifacts(ctx) 1632 1633 log.Infof("Will now dry-run gh-ost on: %s:%d", variables.host, variables.port) 1634 if err := runGhost(false); err != nil { 1635 // perhaps gh-ost was interrupted midway and didn't have the chance to send a "failed" status 1636 _ = e.failMigration(ctx, onlineDDL, err) 1637 1638 log.Errorf("Error executing gh-ost dry run: %+v", err) 1639 return err 1640 } 1641 log.Infof("+ OK") 1642 1643 log.Infof("Will now run gh-ost on: %s:%d", variables.host, variables.port) 1644 startedMigrations.Add(1) 1645 if err := runGhost(true); err != nil { 1646 // perhaps gh-ost was interrupted midway and didn't have the chance to send a "failes" status 1647 _ = e.failMigration(ctx, onlineDDL, err) 1648 failedMigrations.Add(1) 1649 log.Errorf("Error running gh-ost: %+v", err) 1650 return err 1651 } 1652 // Migration successful! 
1653 defer e.reloadSchema(ctx) 1654 successfulMigrations.Add(1) 1655 log.Infof("+ OK") 1656 return nil 1657 }() 1658 return nil 1659 } 1660 1661 // ExecuteWithPTOSC validates and runs a pt-online-schema-change process. 1662 // Validation included testing the backend MySQL server and the pt-online-schema-change binary itself 1663 // Execution runs first a dry run, then an actual migration 1664 func (e *Executor) ExecuteWithPTOSC(ctx context.Context, onlineDDL *schema.OnlineDDL) error { 1665 if conflictFound, conflictingMigration := e.isAnyConflictingMigrationRunning(onlineDDL); conflictFound { 1666 return vterrors.Wrapf(ErrExecutorMigrationAlreadyRunning, "conflicting migration: %v over table: %v", conflictingMigration.UUID, conflictingMigration.Table) 1667 } 1668 1669 if e.tabletTypeFunc() != topodatapb.TabletType_PRIMARY { 1670 return ErrExecutorNotWritableTablet 1671 } 1672 variables, err := e.readMySQLVariables(ctx) 1673 if err != nil { 1674 log.Errorf("Error before running pt-online-schema-change: %+v", err) 1675 return err 1676 } 1677 if variables.readOnly { 1678 err := fmt.Errorf("Error before running pt-online-schema-change: MySQL server is read_only") 1679 log.Errorf(err.Error()) 1680 return err 1681 } 1682 onlineDDLPassword, err := e.createOnlineDDLUser(ctx) 1683 if err != nil { 1684 err := fmt.Errorf("Error creating pt-online-schema-change user: %+v", err) 1685 log.Errorf(err.Error()) 1686 return err 1687 } 1688 tempDir, err := createTempDir(onlineDDL.UUID) 1689 if err != nil { 1690 log.Errorf("Error creating temporary directory: %+v", err) 1691 return err 1692 } 1693 1694 binaryFileName, _ := PTOSCFileName() 1695 wrapperScriptContent := fmt.Sprintf(`#!/bin/bash 1696 pt_log_path="%s" 1697 pt_log_file="%s" 1698 1699 mkdir -p "$pt_log_path" 1700 1701 export MYSQL_PWD 1702 %s "$@" > "$pt_log_path/$pt_log_file" 2>&1 1703 `, tempDir, migrationLogFileName, binaryFileName, 1704 ) 1705 wrapperScriptFileName, err := createTempScript(tempDir, 
"pt-online-schema-change-wrapper.sh", wrapperScriptContent) 1706 if err != nil { 1707 log.Errorf("Error creating wrapper script: %+v", err) 1708 return err 1709 } 1710 pluginCode := ` 1711 package pt_online_schema_change_plugin; 1712 1713 use strict; 1714 use LWP::Simple; 1715 1716 sub new { 1717 my($class, % args) = @_; 1718 my $self = { %args }; 1719 return bless $self, $class; 1720 } 1721 1722 sub init { 1723 my($self, % args) = @_; 1724 } 1725 1726 sub before_create_new_table { 1727 my($self, % args) = @_; 1728 get("http://localhost:{{VTTABLET_PORT}}/schema-migration/report-status?uuid={{MIGRATION_UUID}}&status={{OnlineDDLStatusRunning}}&hint=&dryrun={{DRYRUN}}"); 1729 } 1730 1731 sub before_exit { 1732 my($self, % args) = @_; 1733 my $exit_status = $args{exit_status}; 1734 if ($exit_status == 0) { 1735 get("http://localhost:{{VTTABLET_PORT}}/schema-migration/report-status?uuid={{MIGRATION_UUID}}&status={{OnlineDDLStatusComplete}}&hint=&dryrun={{DRYRUN}}"); 1736 } else { 1737 get("http://localhost:{{VTTABLET_PORT}}/schema-migration/report-status?uuid={{MIGRATION_UUID}}&status={{OnlineDDLStatusFailed}}&hint=&dryrun={{DRYRUN}}"); 1738 } 1739 } 1740 1741 sub get_slave_lag { 1742 my ($self, %args) = @_; 1743 1744 return sub { 1745 if (head("http://localhost:{{VTTABLET_PORT}}/throttler/check?app={{THROTTLER_ONLINE_DDL_APP}}:pt-osc:{{MIGRATION_UUID}}&p=low")) { 1746 # Got HTTP 200 OK, means throttler is happy 1747 return 0; 1748 } else { 1749 # Throttler requests to hold back 1750 return 2147483647; # maxint, report *very* high lag 1751 } 1752 }; 1753 } 1754 1755 1; 1756 ` 1757 pluginCode = strings.ReplaceAll(pluginCode, "{{VTTABLET_PORT}}", fmt.Sprintf("%d", servenv.Port())) 1758 pluginCode = strings.ReplaceAll(pluginCode, "{{MIGRATION_UUID}}", onlineDDL.UUID) 1759 pluginCode = strings.ReplaceAll(pluginCode, "{{THROTTLER_ONLINE_DDL_APP}}", throttlerOnlineDDLApp) 1760 1761 pluginCode = strings.ReplaceAll(pluginCode, "{{OnlineDDLStatusRunning}}", 
string(schema.OnlineDDLStatusRunning)) 1762 pluginCode = strings.ReplaceAll(pluginCode, "{{OnlineDDLStatusComplete}}", string(schema.OnlineDDLStatusComplete)) 1763 pluginCode = strings.ReplaceAll(pluginCode, "{{OnlineDDLStatusFailed}}", string(schema.OnlineDDLStatusFailed)) 1764 1765 // Validate pt-online-schema-change binary: 1766 log.Infof("Will now validate pt-online-schema-change binary") 1767 _, err = execCmd( 1768 "bash", 1769 []string{ 1770 wrapperScriptFileName, 1771 "--version", 1772 }, 1773 os.Environ(), 1774 "/tmp", 1775 nil, 1776 nil, 1777 ) 1778 if err != nil { 1779 log.Errorf("Error testing pt-online-schema-change binary: %+v", err) 1780 return err 1781 } 1782 log.Infof("+ OK") 1783 1784 if err := e.updateMigrationLogPath(ctx, onlineDDL.UUID, variables.host, tempDir); err != nil { 1785 return err 1786 } 1787 1788 alterOptions := e.parseAlterOptions(ctx, onlineDDL) 1789 1790 // The following sleep() is temporary and artificial. Because we create a new user for this 1791 // migration, and because we throttle by replicas, we need to wait for the replicas to be 1792 // caught up with the new user creation. Otherwise, the OSC tools will fail connecting to the replicas... 1793 // Once we have a built in throttling service , we will no longe rneed to have the OSC tools probe the 1794 // replicas. Instead, they will consult with our throttling service. 
1795 // TODO(shlomi): replace/remove this when we have a proper throttling solution 1796 time.Sleep(time.Second) 1797 1798 runPTOSC := func(execute bool) error { 1799 os.Setenv("MYSQL_PWD", onlineDDLPassword) 1800 newTableName := fmt.Sprintf("_%s_%s_new", onlineDDL.UUID, ReadableTimestamp()) 1801 1802 if err := e.updateArtifacts(ctx, onlineDDL.UUID, 1803 fmt.Sprintf("_%s_old", onlineDDL.Table), 1804 fmt.Sprintf("__%s_old", onlineDDL.Table), 1805 newTableName, 1806 ); err != nil { 1807 return err 1808 } 1809 1810 executeFlag := "--dry-run" 1811 if execute { 1812 executeFlag = "--execute" 1813 } 1814 finalPluginCode := strings.ReplaceAll(pluginCode, "{{DRYRUN}}", fmt.Sprintf("%t", !execute)) 1815 pluginFile, err := createTempScript(tempDir, "pt-online-schema-change-plugin", finalPluginCode) 1816 if err != nil { 1817 log.Errorf("Error creating script: %+v", err) 1818 return err 1819 } 1820 args := []string{ 1821 wrapperScriptFileName, 1822 `--pid`, 1823 e.ptPidFileName(onlineDDL.UUID), 1824 `--plugin`, 1825 pluginFile, 1826 `--new-table-name`, 1827 newTableName, 1828 `--alter`, 1829 alterOptions, 1830 `--check-slave-lag`, // We use primary's identity so that pt-online-schema-change calls our lag plugin for exactly 1 server 1831 fmt.Sprintf(`h=%s,P=%d,D=%s,t=%s,u=%s`, variables.host, variables.port, e.dbName, onlineDDL.Table, onlineDDLUser), 1832 executeFlag, 1833 fmt.Sprintf(`h=%s,P=%d,D=%s,t=%s,u=%s`, variables.host, variables.port, e.dbName, onlineDDL.Table, onlineDDLUser), 1834 } 1835 1836 if execute { 1837 args = append(args, 1838 `--no-drop-new-table`, 1839 `--no-drop-old-table`, 1840 ) 1841 } 1842 args = append(args, onlineDDL.StrategySetting().RuntimeOptions()...) 
1843 _, err = execCmd("bash", args, os.Environ(), "/tmp", nil, nil) 1844 return err 1845 } 1846 1847 e.ownedRunningMigrations.Store(onlineDDL.UUID, onlineDDL) 1848 1849 go func() error { 1850 defer e.ownedRunningMigrations.Delete(onlineDDL.UUID) 1851 defer e.dropOnlineDDLUser(ctx) 1852 defer e.gcArtifacts(ctx) 1853 1854 log.Infof("Will now dry-run pt-online-schema-change on: %s:%d", variables.host, variables.port) 1855 if err := runPTOSC(false); err != nil { 1856 // perhaps pt-osc was interrupted midway and didn't have the chance to send a "failes" status 1857 _ = e.failMigration(ctx, onlineDDL, err) 1858 _ = e.updateMigrationTimestamp(ctx, "completed_timestamp", onlineDDL.UUID) 1859 log.Errorf("Error executing pt-online-schema-change dry run: %+v", err) 1860 return err 1861 } 1862 log.Infof("+ OK") 1863 1864 log.Infof("Will now run pt-online-schema-change on: %s:%d", variables.host, variables.port) 1865 startedMigrations.Add(1) 1866 if err := runPTOSC(true); err != nil { 1867 // perhaps pt-osc was interrupted midway and didn't have the chance to send a "failes" status 1868 _ = e.failMigration(ctx, onlineDDL, err) 1869 _ = e.updateMigrationTimestamp(ctx, "completed_timestamp", onlineDDL.UUID) 1870 _ = e.dropPTOSCMigrationTriggers(ctx, onlineDDL) 1871 failedMigrations.Add(1) 1872 log.Errorf("Error running pt-online-schema-change: %+v", err) 1873 return err 1874 } 1875 // Migration successful! 
1876 defer e.reloadSchema(ctx) 1877 successfulMigrations.Add(1) 1878 log.Infof("+ OK") 1879 return nil 1880 }() 1881 return nil 1882 } 1883 1884 func (e *Executor) readMigration(ctx context.Context, uuid string) (onlineDDL *schema.OnlineDDL, row sqltypes.RowNamedValues, err error) { 1885 1886 parsed := sqlparser.BuildParsedQuery(sqlSelectMigration, ":migration_uuid") 1887 bindVars := map[string]*querypb.BindVariable{ 1888 "migration_uuid": sqltypes.StringBindVariable(uuid), 1889 } 1890 bound, err := parsed.GenerateQuery(bindVars, nil) 1891 if err != nil { 1892 return onlineDDL, nil, err 1893 } 1894 r, err := e.execQuery(ctx, bound) 1895 if err != nil { 1896 return onlineDDL, nil, err 1897 } 1898 row = r.Named().Row() 1899 if row == nil { 1900 // No results 1901 return nil, nil, ErrMigrationNotFound 1902 } 1903 onlineDDL = &schema.OnlineDDL{ 1904 Keyspace: row["keyspace"].ToString(), 1905 Table: row["mysql_table"].ToString(), 1906 Schema: row["mysql_schema"].ToString(), 1907 SQL: row["migration_statement"].ToString(), 1908 UUID: row["migration_uuid"].ToString(), 1909 Strategy: schema.DDLStrategy(row["strategy"].ToString()), 1910 Options: row["options"].ToString(), 1911 Status: schema.OnlineDDLStatus(row["migration_status"].ToString()), 1912 Retries: row.AsInt64("retries", 0), 1913 ReadyToComplete: row.AsInt64("ready_to_complete", 0), 1914 TabletAlias: row["tablet"].ToString(), 1915 MigrationContext: row["migration_context"].ToString(), 1916 } 1917 return onlineDDL, row, nil 1918 } 1919 1920 // readPendingMigrationsUUIDs returns UUIDs for migrations in pending state (queued/ready/running) 1921 func (e *Executor) readPendingMigrationsUUIDs(ctx context.Context) (uuids []string, err error) { 1922 r, err := e.execQuery(ctx, sqlSelectPendingMigrations) 1923 if err != nil { 1924 return uuids, err 1925 } 1926 for _, row := range r.Named().Rows { 1927 uuid := row["migration_uuid"].ToString() 1928 uuids = append(uuids, uuid) 1929 } 1930 return uuids, err 1931 } 1932 1933 // 
terminateMigration attempts to interrupt and hard-stop a running migration 1934 func (e *Executor) terminateMigration(ctx context.Context, onlineDDL *schema.OnlineDDL) (foundRunning bool, err error) { 1935 log.Infof("terminateMigration: request to terminate %s", onlineDDL.UUID) 1936 // It's possible the killing the migration fails for whatever reason, in which case 1937 // the logic will retry killing it later on. 1938 // Whatever happens in this function, this executor stops owning the given migration. 1939 defer e.ownedRunningMigrations.Delete(onlineDDL.UUID) 1940 1941 switch onlineDDL.Strategy { 1942 case schema.DDLStrategyOnline, schema.DDLStrategyVitess: 1943 // migration could have started by a different tablet. We need to actively verify if it is running 1944 s, _ := e.readVReplStream(ctx, onlineDDL.UUID, true) 1945 foundRunning = (s != nil && s.isRunning()) 1946 if err := e.terminateVReplMigration(ctx, onlineDDL.UUID); err != nil { 1947 return foundRunning, fmt.Errorf("Error terminating migration, vreplication exec error: %+v", err) 1948 } 1949 case schema.DDLStrategyPTOSC: 1950 // see if pt-osc is running (could have been executed by this vttablet or one that crashed in the past) 1951 if running, pid, _ := e.isPTOSCMigrationRunning(ctx, onlineDDL.UUID); running { 1952 foundRunning = true 1953 // Because pt-osc doesn't offer much control, we take a brute force approach to killing it, 1954 // revoking its privileges, and cleaning up its triggers. 
1955 if err := syscall.Kill(pid, syscall.SIGTERM); err != nil { 1956 return foundRunning, nil 1957 } 1958 if err := syscall.Kill(pid, syscall.SIGKILL); err != nil { 1959 return foundRunning, nil 1960 } 1961 if err := e.dropOnlineDDLUser(ctx); err != nil { 1962 return foundRunning, nil 1963 } 1964 if err := e.dropPTOSCMigrationTriggers(ctx, onlineDDL); err != nil { 1965 return foundRunning, nil 1966 } 1967 } 1968 case schema.DDLStrategyGhost: 1969 // double check: is the running migration the very same one we wish to cancel? 1970 if _, ok := e.ownedRunningMigrations.Load(onlineDDL.UUID); ok { 1971 // assuming all goes well in next steps, we can already report that there has indeed been a migration 1972 foundRunning = true 1973 } 1974 // gh-ost migrations are easy to kill: just touch their specific panic flag files. We trust 1975 // gh-ost to terminate. No need to KILL it. And there's no trigger cleanup. 1976 if err := e.createGhostPanicFlagFile(onlineDDL.UUID); err != nil { 1977 return foundRunning, fmt.Errorf("Error terminating gh-ost migration, flag file error: %+v", err) 1978 } 1979 } 1980 return foundRunning, nil 1981 } 1982 1983 // CancelMigration attempts to abort a scheduled or a running migration 1984 func (e *Executor) CancelMigration(ctx context.Context, uuid string, message string, issuedByUser bool) (result *sqltypes.Result, err error) { 1985 if atomic.LoadInt64(&e.isOpen) == 0 { 1986 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 1987 } 1988 log.Infof("CancelMigration: request to cancel %s with message: %v", uuid, message) 1989 1990 e.migrationMutex.Lock() 1991 defer e.migrationMutex.Unlock() 1992 1993 var rowsAffected uint64 1994 1995 onlineDDL, _, err := e.readMigration(ctx, uuid) 1996 if err != nil { 1997 return nil, err 1998 } 1999 2000 switch onlineDDL.Status { 2001 case schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed, schema.OnlineDDLStatusCancelled: 2002 log.Infof("CancelMigration: migration %s 
is in non-cancellable status: %v", uuid, onlineDDL.Status) 2003 return emptyResult, nil 2004 } 2005 // From this point on, we're actually cancelling a migration 2006 if issuedByUser { 2007 // if this was issued by the user, then we mark the `cancelled_timestamp`, and based on that, 2008 // the migration state will be 'cancelled'. 2009 // If this was not issued by the user, then this is an internal state machine cancellation of the 2010 // migration, e.g. because it is stale or has an unrecoverable error. In this case we do not mark 2011 // the timestamp, and as result, the state will transition to 'failed' 2012 if err := e.updateMigrationTimestamp(ctx, "cancelled_timestamp", uuid); err != nil { 2013 return nil, err 2014 } 2015 } 2016 defer e.failMigration(ctx, onlineDDL, errors.New(message)) 2017 defer e.triggerNextCheckInterval() 2018 2019 switch onlineDDL.Status { 2020 case schema.OnlineDDLStatusQueued, schema.OnlineDDLStatusReady: 2021 log.Infof("CancelMigration: cancelling %s with status: %v", uuid, onlineDDL.Status) 2022 return &sqltypes.Result{RowsAffected: 1}, nil 2023 } 2024 2025 migrationFound, err := e.terminateMigration(ctx, onlineDDL) 2026 if migrationFound { 2027 log.Infof("CancelMigration: terminated %s with status: %v", uuid, onlineDDL.Status) 2028 rowsAffected = 1 2029 } else { 2030 log.Infof("CancelMigration: migration %s wasn't found to be running", uuid) 2031 } 2032 if err != nil { 2033 return result, err 2034 } 2035 2036 result = &sqltypes.Result{ 2037 RowsAffected: rowsAffected, 2038 } 2039 return result, nil 2040 } 2041 2042 // cancelMigrations attempts to abort a list of migrations 2043 func (e *Executor) cancelMigrations(ctx context.Context, cancellable []*cancellableMigration, issuedByUser bool) (err error) { 2044 for _, migration := range cancellable { 2045 log.Infof("cancelMigrations: cancelling %s; reason: %s", migration.uuid, migration.message) 2046 if _, err := e.CancelMigration(ctx, migration.uuid, migration.message, issuedByUser); 
err != nil { 2047 return err 2048 } 2049 } 2050 return nil 2051 } 2052 2053 // CancelPendingMigrations cancels all pending migrations (that are expected to run or are running) 2054 // for this keyspace 2055 func (e *Executor) CancelPendingMigrations(ctx context.Context, message string, issuedByUser bool) (result *sqltypes.Result, err error) { 2056 if atomic.LoadInt64(&e.isOpen) == 0 { 2057 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 2058 } 2059 2060 uuids, err := e.readPendingMigrationsUUIDs(ctx) 2061 if err != nil { 2062 return result, err 2063 } 2064 log.Infof("CancelPendingMigrations: iterating %v migrations %s", len(uuids)) 2065 2066 result = &sqltypes.Result{} 2067 for _, uuid := range uuids { 2068 log.Infof("CancelPendingMigrations: cancelling %s", uuid) 2069 res, err := e.CancelMigration(ctx, uuid, message, issuedByUser) 2070 if err != nil { 2071 return result, err 2072 } 2073 result.AppendResult(res) 2074 } 2075 log.Infof("CancelPendingMigrations: done iterating %v migrations %s", len(uuids)) 2076 return result, nil 2077 } 2078 2079 func (e *Executor) validateThrottleParams(ctx context.Context, expireString string, ratioLiteral *sqlparser.Literal) (duration time.Duration, ratio float64, err error) { 2080 duration = time.Hour * 24 * 365 * 100 2081 if expireString != "" { 2082 duration, err = time.ParseDuration(expireString) 2083 if err != nil || duration < 0 { 2084 return duration, ratio, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid EXPIRE value: %s. Try '120s', '30m', '1h', etc. Allowed units are (s)ec, (m)in, (h)hour", expireString) 2085 } 2086 } 2087 ratio = 1.0 2088 if ratioLiteral != nil { 2089 ratio, err = strconv.ParseFloat(ratioLiteral.Val, 64) 2090 if err != nil || ratio < 0 || ratio > 1 { 2091 return duration, ratio, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid RATIO value: %s. 
Try any decimal number between '0.0' (no throttle) and `1.0` (fully throttled)", ratioLiteral.Val) 2092 } 2093 } 2094 return duration, ratio, nil 2095 } 2096 2097 // ThrottleMigration 2098 func (e *Executor) ThrottleMigration(ctx context.Context, uuid string, expireString string, ratioLiteral *sqlparser.Literal) (result *sqltypes.Result, err error) { 2099 duration, ratio, err := e.validateThrottleParams(ctx, expireString, ratioLiteral) 2100 if err != nil { 2101 return nil, err 2102 } 2103 if err := e.lagThrottler.CheckIsReady(); err != nil { 2104 return nil, err 2105 } 2106 _ = e.lagThrottler.ThrottleApp(uuid, time.Now().Add(duration), ratio) 2107 return emptyResult, nil 2108 } 2109 2110 // ThrottleAllMigrations 2111 func (e *Executor) ThrottleAllMigrations(ctx context.Context, expireString string, ratioLiteral *sqlparser.Literal) (result *sqltypes.Result, err error) { 2112 duration, ratio, err := e.validateThrottleParams(ctx, expireString, ratioLiteral) 2113 if err != nil { 2114 return nil, err 2115 } 2116 if err := e.lagThrottler.CheckIsReady(); err != nil { 2117 return nil, err 2118 } 2119 _ = e.lagThrottler.ThrottleApp(throttlerOnlineDDLApp, time.Now().Add(duration), ratio) 2120 return emptyResult, nil 2121 } 2122 2123 // UnthrottleMigration 2124 func (e *Executor) UnthrottleMigration(ctx context.Context, uuid string) (result *sqltypes.Result, err error) { 2125 if err := e.lagThrottler.CheckIsReady(); err != nil { 2126 return nil, err 2127 } 2128 defer e.triggerNextCheckInterval() 2129 _ = e.lagThrottler.UnthrottleApp(uuid) 2130 return emptyResult, nil 2131 } 2132 2133 // UnthrottleAllMigrations 2134 func (e *Executor) UnthrottleAllMigrations(ctx context.Context) (result *sqltypes.Result, err error) { 2135 if err := e.lagThrottler.CheckIsReady(); err != nil { 2136 return nil, err 2137 } 2138 defer e.triggerNextCheckInterval() 2139 _ = e.lagThrottler.UnthrottleApp(throttlerOnlineDDLApp) 2140 return emptyResult, nil 2141 } 2142 2143 // scheduleNextMigration 
attemps to schedule a single migration to run next. 2144 // possibly there are migrations to run. 2145 // The effect of this function is to move a migration from 'queued' state to 'ready' state, is all. 2146 func (e *Executor) scheduleNextMigration(ctx context.Context) error { 2147 e.migrationMutex.Lock() 2148 defer e.migrationMutex.Unlock() 2149 2150 var onlyScheduleOneMigration sync.Once 2151 2152 r, err := e.execQuery(ctx, sqlSelectQueuedMigrations) 2153 if err != nil { 2154 return err 2155 } 2156 for _, row := range r.Named().Rows { 2157 uuid := row["migration_uuid"].ToString() 2158 postponeLaunch := row.AsBool("postpone_launch", false) 2159 postponeCompletion := row.AsBool("postpone_completion", false) 2160 readyToComplete := row.AsBool("ready_to_complete", false) 2161 isImmediateOperation := row.AsBool("is_immediate_operation", false) 2162 2163 if postponeLaunch { 2164 // We don't even look into this migration until its postpone_launch flag is cleared 2165 continue 2166 } 2167 2168 if !readyToComplete { 2169 // see if we need to update ready_to_complete 2170 if isImmediateOperation { 2171 // Whether postponsed or not, CREATE and DROP operations, as well as VIEW operations, 2172 // are inherently "ready to complete" because their operation is immediate. 
2173 if err := e.updateMigrationReadyToComplete(ctx, uuid, true); err != nil { 2174 return err 2175 } 2176 } 2177 } 2178 2179 if !(isImmediateOperation && postponeCompletion) { 2180 // Any non-postponed migration can be scheduled 2181 // postponed ALTER can be scheduled (because gh-ost or vreplication will postpone the cut-over) 2182 // We only schedule a single migration in the execution of this function 2183 onlyScheduleOneMigration.Do(func() { 2184 err = e.updateMigrationStatus(ctx, uuid, schema.OnlineDDLStatusReady) 2185 log.Infof("Executor.scheduleNextMigration: scheduling migration %s; err: %v", uuid, err) 2186 e.triggerNextCheckInterval() 2187 }) 2188 if err != nil { 2189 return err 2190 } 2191 } 2192 } 2193 return err 2194 } 2195 2196 // reviewEmptyTableRevertMigrations reviews a queued REVERT migration. Such a migration has the following SQL: 2197 // "REVERT VITESS_MIGRATION '...'" 2198 // There's nothing in this SQL to indicate: 2199 // - which table is involved? 2200 // - is this a table or a view? 2201 // - Are we reverting a CREATE? A DROP? An ALTER? 2202 // This function fills in the blanks and updates the database row. 2203 func (e *Executor) reviewEmptyTableRevertMigrations(ctx context.Context, onlineDDL *schema.OnlineDDL) (changesMade bool, err error) { 2204 if onlineDDL.Table != "" { 2205 return false, nil 2206 } 2207 // Table name is empty. Let's populate it. 
2208 2209 // Try to update table name and ddl_action 2210 // Failure to do so fails the migration 2211 revertUUID, err := onlineDDL.GetRevertUUID() 2212 if err != nil { 2213 return false, e.failMigration(ctx, onlineDDL, fmt.Errorf("cannot analyze revert UUID for revert migration %s: %v", onlineDDL.UUID, err)) 2214 } 2215 revertedMigration, revertedRow, err := e.readMigration(ctx, revertUUID) 2216 if err != nil { 2217 return false, e.failMigration(ctx, onlineDDL, fmt.Errorf("cannot read migration %s reverted by migration %s: %s", revertUUID, onlineDDL.UUID, err)) 2218 } 2219 revertedActionStr := revertedRow["ddl_action"].ToString() 2220 2221 mimickedActionStr := "" 2222 switch revertedActionStr { 2223 case sqlparser.CreateStr: 2224 mimickedActionStr = sqlparser.DropStr 2225 case sqlparser.DropStr: 2226 mimickedActionStr = sqlparser.CreateStr 2227 case sqlparser.AlterStr: 2228 mimickedActionStr = sqlparser.AlterStr 2229 default: 2230 return false, e.failMigration(ctx, onlineDDL, fmt.Errorf("cannot run migration %s reverting %s: unexpected action %s", onlineDDL.UUID, revertedMigration.UUID, revertedActionStr)) 2231 } 2232 if err := e.updateDDLAction(ctx, onlineDDL.UUID, mimickedActionStr); err != nil { 2233 return false, err 2234 } 2235 if err := e.updateMigrationIsView(ctx, onlineDDL.UUID, revertedRow.AsBool("is_view", false)); err != nil { 2236 return false, err 2237 } 2238 if err := e.updateMySQLTable(ctx, onlineDDL.UUID, revertedMigration.Table); err != nil { 2239 return false, err 2240 } 2241 return true, nil 2242 } 2243 2244 // reviewImmediateOperations reviews a queued migration and determines whether it is an "immediate operation". 2245 // Immediate operations are ones that can be performed within a split second, or rather, do not require long 2246 // running processes. 
Immediate operations are: 2247 // - CREATE TABLE 2248 // - DROP TABLE (which we convert into RENAME) 2249 // - All VIEW operations 2250 // - An INSTANT DDL accompanied by relevant ddl strategy flags 2251 // Non immediate operations are: 2252 // - A gh-ost migration 2253 // - A vitess (vreplication) migration 2254 func (e *Executor) reviewImmediateOperations(ctx context.Context, capableOf mysql.CapableOf, onlineDDL *schema.OnlineDDL, ddlAction string, isRevert bool, isView bool) (bool, error) { 2255 switch ddlAction { 2256 case sqlparser.CreateStr, sqlparser.DropStr: 2257 return true, nil 2258 case sqlparser.AlterStr: 2259 switch { 2260 case isView: 2261 return true, nil 2262 case isRevert: 2263 // REVERT for a true ALTER TABLE. not an immediate operation 2264 return false, nil 2265 default: 2266 specialPlan, err := e.analyzeSpecialAlterPlan(ctx, onlineDDL, capableOf) 2267 if err != nil { 2268 return false, err 2269 } 2270 return (specialPlan != nil), nil 2271 } 2272 } 2273 return false, nil 2274 } 2275 2276 // reviewQueuedMigrations iterates through queued migrations and sees if any information needs to be updated. 2277 // The function analyzes the queued migration and fills in some blanks: 2278 // - If this is a REVERT migration, what table is affected? What's the operation? 2279 // - Is this migration an "immediate operation"? 
2280 func (e *Executor) reviewQueuedMigrations(ctx context.Context) error { 2281 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaWithDB()) 2282 if err != nil { 2283 return err 2284 } 2285 defer conn.Close() 2286 _, capableOf, _ := mysql.GetFlavor(conn.ServerVersion, nil) 2287 2288 e.migrationMutex.Lock() 2289 defer e.migrationMutex.Unlock() 2290 2291 r, err := e.execQuery(ctx, sqlSelectQueuedUnreviewedMigrations) 2292 if err != nil { 2293 return err 2294 } 2295 2296 for _, uuidRow := range r.Named().Rows { 2297 uuid := uuidRow["migration_uuid"].ToString() 2298 onlineDDL, row, err := e.readMigration(ctx, uuid) 2299 if err != nil { 2300 return err 2301 } 2302 // handle REVERT migrations: populate table name and update ddl action and is_view: 2303 ddlAction := row["ddl_action"].ToString() 2304 isRevert := false 2305 if ddlAction == schema.RevertActionStr { 2306 isRevert = true 2307 rowModified, err := e.reviewEmptyTableRevertMigrations(ctx, onlineDDL) 2308 if err != nil { 2309 return err 2310 } 2311 if rowModified { 2312 // re-read migration and entire row 2313 onlineDDL, row, err = e.readMigration(ctx, uuid) 2314 if err != nil { 2315 return err 2316 } 2317 ddlAction = row["ddl_action"].ToString() 2318 } 2319 } 2320 isView := row.AsBool("is_view", false) 2321 isImmediate, err := e.reviewImmediateOperations(ctx, capableOf, onlineDDL, ddlAction, isRevert, isView) 2322 if err != nil { 2323 return err 2324 } 2325 if isImmediate { 2326 if err := e.updateMigrationSetImmediateOperation(ctx, onlineDDL.UUID); err != nil { 2327 return err 2328 } 2329 } 2330 // Find conditions where the migration cannot take place: 2331 switch onlineDDL.Strategy { 2332 case schema.DDLStrategyMySQL: 2333 strategySetting := onlineDDL.StrategySetting() 2334 if strategySetting.IsPostponeCompletion() { 2335 e.failMigration(ctx, onlineDDL, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "--postpone-completion not supported in 'mysql' strategy")) 2336 } 2337 if 
strategySetting.IsAllowZeroInDateFlag() { 2338 e.failMigration(ctx, onlineDDL, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "--allow-zero-in-date not supported in 'mysql' strategy")) 2339 } 2340 } 2341 2342 // The review is complete. We've backfilled details on the migration row. We mark 2343 // the migration as having been reviewed. The function scheduleNextMigration() will then 2344 // have access to this row. 2345 if err := e.updateMigrationTimestamp(ctx, "reviewed_timestamp", uuid); err != nil { 2346 return err 2347 } 2348 2349 } 2350 return nil 2351 } 2352 2353 func (e *Executor) validateMigrationRevertible(ctx context.Context, revertMigration *schema.OnlineDDL, revertingMigrationUUID string) (err error) { 2354 // Validation: migration to revert exists and is in complete state 2355 action, actionStr, err := revertMigration.GetActionStr() 2356 if err != nil { 2357 return err 2358 } 2359 switch action { 2360 case sqlparser.AlterDDLAction: 2361 if revertMigration.Strategy != schema.DDLStrategyOnline && revertMigration.Strategy != schema.DDLStrategyVitess { 2362 return fmt.Errorf("can only revert a %s strategy migration. Migration %s has %s strategy", schema.DDLStrategyOnline, revertMigration.UUID, revertMigration.Strategy) 2363 } 2364 case sqlparser.RevertDDLAction: 2365 case sqlparser.CreateDDLAction: 2366 case sqlparser.DropDDLAction: 2367 default: 2368 return fmt.Errorf("cannot revert migration %s: unexpected action %s", revertMigration.UUID, actionStr) 2369 } 2370 if revertMigration.Status != schema.OnlineDDLStatusComplete { 2371 return fmt.Errorf("can only revert a migration in a '%s' state. 
Migration %s is in '%s' state", schema.OnlineDDLStatusComplete, revertMigration.UUID, revertMigration.Status) 2372 } 2373 { 2374 // Validation: see if there's a pending migration on this table: 2375 r, err := e.execQuery(ctx, sqlSelectPendingMigrations) 2376 if err != nil { 2377 return err 2378 } 2379 // we identify running migrations on requested table 2380 for _, row := range r.Named().Rows { 2381 pendingUUID := row["migration_uuid"].ToString() 2382 if pendingUUID == revertingMigrationUUID { 2383 // that's fine; the migration we're looking at is the very one that's trying to issue this revert 2384 continue 2385 } 2386 keyspace := row["keyspace"].ToString() 2387 table := row["mysql_table"].ToString() 2388 status := schema.OnlineDDLStatus(row["migration_status"].ToString()) 2389 2390 if keyspace == e.keyspace && table == revertMigration.Table { 2391 return fmt.Errorf("can not revert migration %s on table %s because migration %s is in %s status. May only revert if all migrations on this table are completed or failed", revertMigration.UUID, revertMigration.Table, pendingUUID, status) 2392 } 2393 } 2394 { 2395 // Validation: see that we're reverting the last successful migration on this table: 2396 query, err := sqlparser.ParseAndBind(sqlSelectCompleteMigrationsOnTable, 2397 sqltypes.StringBindVariable(e.keyspace), 2398 sqltypes.StringBindVariable(revertMigration.Table), 2399 ) 2400 if err != nil { 2401 return err 2402 } 2403 r, err := e.execQuery(ctx, query) 2404 if err != nil { 2405 return err 2406 } 2407 for _, row := range r.Named().Rows { 2408 completeUUID := row["migration_uuid"].ToString() 2409 if completeUUID != revertMigration.UUID { 2410 return fmt.Errorf("can not revert migration %s on table %s because it is not the last migration to complete on that table. 
The last migration to complete was %s", revertMigration.UUID, revertMigration.Table, completeUUID) 2411 } 2412 } 2413 } 2414 } 2415 return nil 2416 } 2417 2418 // executeRevert is called for 'revert' migrations (SQL is of the form "revert 99caeca2_74e2_11eb_a693_f875a4d24e90", not a real SQL of course). 2419 // In this function we: 2420 // - figure out whether the revert is valid: can we really revert requested migration? 2421 // - what type of migration we're reverting? (CREATE/DROP/ALTER) 2422 // - revert appropriately to the type of migration 2423 func (e *Executor) executeRevert(ctx context.Context, onlineDDL *schema.OnlineDDL) (err error) { 2424 revertUUID, err := onlineDDL.GetRevertUUID() 2425 if err != nil { 2426 return fmt.Errorf("cannot run a revert migration %v: %+v", onlineDDL.UUID, err) 2427 } 2428 2429 revertMigration, row, err := e.readMigration(ctx, revertUUID) 2430 if err != nil { 2431 return err 2432 } 2433 if err := e.validateMigrationRevertible(ctx, revertMigration, onlineDDL.UUID); err != nil { 2434 return err 2435 } 2436 revertedActionStr := row["ddl_action"].ToString() 2437 if onlineDDL.Table == "" { 2438 // table name should be populated by reviewQueuedMigrations 2439 // but this was a newly added functionality. To be backwards compatible, 2440 // we double check here, and populate table name and ddl_action. 
2441 2442 // TODO: remove in v14 2443 mimickedActionStr := "" 2444 2445 switch revertedActionStr { 2446 case sqlparser.CreateStr: 2447 mimickedActionStr = sqlparser.DropStr 2448 case sqlparser.DropStr: 2449 mimickedActionStr = sqlparser.CreateStr 2450 case sqlparser.AlterStr: 2451 mimickedActionStr = sqlparser.AlterStr 2452 default: 2453 return fmt.Errorf("cannot run migration %s reverting %s: unexpected action %s", onlineDDL.UUID, revertMigration.UUID, revertedActionStr) 2454 } 2455 if err := e.updateDDLAction(ctx, onlineDDL.UUID, mimickedActionStr); err != nil { 2456 return err 2457 } 2458 if err := e.updateMySQLTable(ctx, onlineDDL.UUID, revertMigration.Table); err != nil { 2459 return err 2460 } 2461 } 2462 2463 switch revertedActionStr { 2464 case sqlparser.CreateStr: 2465 { 2466 // We are reverting a CREATE migration. The revert is to DROP, only we don't actually 2467 // drop the table, we rename it into lifecycle 2468 // Possibly this was a CREATE TABLE IF NOT EXISTS, and possibly the table already existed 2469 // before the DDL, in which case the CREATE was a noop. In that scenario we _do not_ drop 2470 // the table. 2471 // We can tell the difference by looking at the artifacts. A successful CREATE TABLE, where 2472 // a table actually gets created, has a sentry, dummy artifact. A noop has not. 2473 2474 artifacts := row["artifacts"].ToString() 2475 artifactTables := textutil.SplitDelimitedList(artifacts) 2476 if len(artifactTables) > 1 { 2477 return fmt.Errorf("cannot run migration %s reverting %s: found %d artifact tables, expected maximum 1", onlineDDL.UUID, revertMigration.UUID, len(artifactTables)) 2478 } 2479 if len(artifactTables) == 0 { 2480 // This indicates no table was actually created. this must have been a CREATE TABLE IF NOT EXISTS where the table already existed. 
2481 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, rowsCopiedUnknown, emptyHint) 2482 } 2483 2484 for _, artifactTable := range artifactTables { 2485 if err := e.updateArtifacts(ctx, onlineDDL.UUID, artifactTable); err != nil { 2486 return err 2487 } 2488 onlineDDL.SQL = sqlparser.BuildParsedQuery(sqlRenameTable, revertMigration.Table, artifactTable).Query 2489 if _, err := e.executeDirectly(ctx, onlineDDL); err != nil { 2490 return err 2491 } 2492 } 2493 } 2494 case sqlparser.DropStr: 2495 { 2496 // We are reverting a DROP migration. But the table wasn't really dropped, because that's not how 2497 // we run DROP migrations. It was renamed. So we need to rename it back. 2498 // But we impose as if we are now CREATE-ing the table. 2499 2500 artifacts := row["artifacts"].ToString() 2501 artifactTables := textutil.SplitDelimitedList(artifacts) 2502 if len(artifactTables) > 1 { 2503 return fmt.Errorf("cannot run migration %s reverting %s: found %d artifact tables, expected maximum 1", onlineDDL.UUID, revertMigration.UUID, len(artifactTables)) 2504 } 2505 if len(artifactTables) == 0 { 2506 // Could happen on `DROP TABLE IF EXISTS` where the table did not exist... 
2507 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, rowsCopiedUnknown, emptyHint) 2508 } 2509 for _, artifactTable := range artifactTables { 2510 if err := e.updateArtifacts(ctx, onlineDDL.UUID, artifactTable); err != nil { 2511 return err 2512 } 2513 onlineDDL.SQL = sqlparser.BuildParsedQuery(sqlRenameTable, artifactTable, revertMigration.Table).Query 2514 if _, err := e.executeDirectly(ctx, onlineDDL); err != nil { 2515 return err 2516 } 2517 } 2518 } 2519 case sqlparser.AlterStr: 2520 { 2521 if row.AsBool("is_view", false) { 2522 artifacts := row["artifacts"].ToString() 2523 artifactTables := textutil.SplitDelimitedList(artifacts) 2524 if len(artifactTables) > 1 { 2525 return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot run migration %s reverting %s: found %d artifact tables, expected maximum 1", onlineDDL.UUID, revertMigration.UUID, len(artifactTables)) 2526 } 2527 if len(artifactTables) == 0 { 2528 return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot run migration %s reverting %s: found %d artifact tables, expected 1", onlineDDL.UUID, revertMigration.UUID, len(artifactTables)) 2529 } 2530 for _, artifactTable := range artifactTables { 2531 if err := e.updateArtifacts(ctx, onlineDDL.UUID, artifactTable); err != nil { 2532 return err 2533 } 2534 onlineDDL.SQL, _, err = e.generateSwapTablesStatement(ctx, onlineDDL.Table, artifactTable) 2535 if err != nil { 2536 return err 2537 } 2538 if _, err := e.executeDirectly(ctx, onlineDDL); err != nil { 2539 return err 2540 } 2541 } 2542 return nil 2543 } 2544 // Real table 2545 if err := e.ExecuteWithVReplication(ctx, onlineDDL, revertMigration); err != nil { 2546 return err 2547 } 2548 } 2549 default: 2550 return fmt.Errorf("cannot run migration %s reverting %s: unexpected action %s", onlineDDL.UUID, revertMigration.UUID, revertedActionStr) 2551 } 2552 2553 return nil 2554 } 2555 2556 // evaluateDeclarativeDiff is called 
for -declarative CREATE statements, where the table already exists. The function generates a SQL diff, which can be: 2557 // - empty, in which case the migration is noop and implicitly successful, or 2558 // - non-empty, in which case the migration turns to be an ALTER 2559 func (e *Executor) evaluateDeclarativeDiff(ctx context.Context, onlineDDL *schema.OnlineDDL) (diff schemadiff.EntityDiff, err error) { 2560 2561 // Modify the CREATE TABLE statement to indicate a different, made up table name, known as the "comparison table" 2562 ddlStmt, _, err := schema.ParseOnlineDDLStatement(onlineDDL.SQL) 2563 if err != nil { 2564 return nil, err 2565 } 2566 // Is this CREATE TABLE or CREATE VIEW? 2567 comparisonTableName, err := schema.GenerateGCTableName(schema.HoldTableGCState, newGCTableRetainTime()) 2568 if err != nil { 2569 return nil, err 2570 } 2571 2572 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaWithDB()) 2573 if err != nil { 2574 return nil, err 2575 } 2576 defer conn.Close() 2577 2578 { 2579 // Create the comparison table 2580 ddlStmt.SetTable("", comparisonTableName) 2581 modifiedCreateSQL := sqlparser.String(ddlStmt) 2582 2583 restoreSQLModeFunc, err := e.initMigrationSQLMode(ctx, onlineDDL, conn) 2584 defer restoreSQLModeFunc() 2585 if err != nil { 2586 return nil, err 2587 } 2588 2589 if _, err := conn.ExecuteFetch(modifiedCreateSQL, 0, false); err != nil { 2590 return nil, err 2591 } 2592 2593 defer func() { 2594 // Drop the comparison table 2595 parsed := sqlparser.BuildParsedQuery(sqlDropTable, comparisonTableName) 2596 _, _ = conn.ExecuteFetch(parsed.Query, 0, false) 2597 // Nothing bad happens for not checking the error code. The table is GC/HOLD. 
If we 2598 // can't drop it now, it still gets collected later by tablegc mechanism 2599 }() 2600 } 2601 2602 existingShowCreateTable, err := e.showCreateTable(ctx, onlineDDL.Table) 2603 if err != nil { 2604 return nil, vterrors.Wrapf(err, "in evaluateDeclarativeDiff(), for onlineDDL.Table") 2605 } 2606 if existingShowCreateTable == "" { 2607 return nil, vterrors.Errorf(vtrpcpb.Code_NOT_FOUND, "unexpected: cannot find table or view %v", onlineDDL.Table) 2608 } 2609 newShowCreateTable, err := e.showCreateTable(ctx, comparisonTableName) 2610 if err != nil { 2611 return nil, vterrors.Wrapf(err, "in evaluateDeclarativeDiff(), for comparisonTableName") 2612 } 2613 if newShowCreateTable == "" { 2614 return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "unexpected: cannot find table or view even as it was just created: %v", onlineDDL.Table) 2615 } 2616 hints := &schemadiff.DiffHints{AutoIncrementStrategy: schemadiff.AutoIncrementApplyHigher} 2617 switch ddlStmt.(type) { 2618 case *sqlparser.CreateTable: 2619 diff, err = schemadiff.DiffCreateTablesQueries(existingShowCreateTable, newShowCreateTable, hints) 2620 case *sqlparser.CreateView: 2621 diff, err = schemadiff.DiffCreateViewsQueries(existingShowCreateTable, newShowCreateTable, hints) 2622 default: 2623 return nil, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "expected CREATE TABLE or CREATE VIEW in online DDL statement: %v", onlineDDL.SQL) 2624 } 2625 if err != nil { 2626 return nil, err 2627 } 2628 return diff, nil 2629 } 2630 2631 // getCompletedMigrationByContextAndSQL chceks if there exists a completed migration with exact same 2632 // context and SQL as given migration. If so, it returns its UUID. 
2633 func (e *Executor) getCompletedMigrationByContextAndSQL(ctx context.Context, onlineDDL *schema.OnlineDDL) (completedUUID string, err error) { 2634 if onlineDDL.MigrationContext == "" { 2635 // only applies to migrations with an explicit context 2636 return "", nil 2637 } 2638 query, err := sqlparser.ParseAndBind(sqlSelectCompleteMigrationsByContextAndSQL, 2639 sqltypes.StringBindVariable(e.keyspace), 2640 sqltypes.StringBindVariable(onlineDDL.MigrationContext), 2641 sqltypes.StringBindVariable(onlineDDL.SQL), 2642 ) 2643 if err != nil { 2644 return "", err 2645 } 2646 r, err := e.execQuery(ctx, query) 2647 if err != nil { 2648 return "", err 2649 } 2650 for _, row := range r.Named().Rows { 2651 completedUUID = row["migration_uuid"].ToString() 2652 } 2653 return completedUUID, nil 2654 } 2655 2656 // failMigration marks a migration as failed 2657 func (e *Executor) failMigration(ctx context.Context, onlineDDL *schema.OnlineDDL, withError error) error { 2658 defer e.triggerNextCheckInterval() 2659 _ = e.updateMigrationStatusFailedOrCancelled(ctx, onlineDDL.UUID) 2660 if withError != nil { 2661 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, withError.Error()) 2662 } 2663 e.ownedRunningMigrations.Delete(onlineDDL.UUID) 2664 return withError 2665 } 2666 2667 func (e *Executor) executeDropDDLActionMigration(ctx context.Context, onlineDDL *schema.OnlineDDL) error { 2668 failMigration := func(err error) error { 2669 return e.failMigration(ctx, onlineDDL, err) 2670 } 2671 e.migrationMutex.Lock() 2672 defer e.migrationMutex.Unlock() 2673 2674 // Drop statement. 2675 // Normally, we're going to modify DROP to RENAME (see later on). But if table name is 2676 // already a GC-lifecycle table, then we don't put it through yet another GC lifecycle, 2677 // we just drop it. 
2678 if schema.IsGCTableName(onlineDDL.Table) { 2679 if _, err := e.executeDirectly(ctx, onlineDDL); err != nil { 2680 return failMigration(err) 2681 } 2682 return nil 2683 } 2684 2685 // We transform a DROP TABLE into a RENAME TABLE statement, so as to remove the table safely and asynchronously. 2686 2687 ddlStmt, _, err := schema.ParseOnlineDDLStatement(onlineDDL.SQL) 2688 if err != nil { 2689 return failMigration(err) 2690 } 2691 2692 var toTableName string 2693 onlineDDL.SQL, toTableName, err = schema.GenerateRenameStatementWithUUID(onlineDDL.Table, schema.HoldTableGCState, onlineDDL.GetGCUUID(), newGCTableRetainTime()) 2694 if err != nil { 2695 return failMigration(err) 2696 } 2697 if err := e.updateArtifacts(ctx, onlineDDL.UUID, toTableName); err != nil { 2698 return err 2699 } 2700 2701 acceptableErrorCodes := []int{} 2702 if ddlStmt.GetIfExists() { 2703 acceptableErrorCodes = acceptableDropTableIfExistsErrorCodes 2704 } 2705 acceptableErrCodeFound, err := e.executeDirectly(ctx, onlineDDL, acceptableErrorCodes...) 2706 if err != nil { 2707 return failMigration(err) 2708 } 2709 if acceptableErrCodeFound { 2710 // Table did not exist after all. 
There is no artifact 2711 if err := e.clearArtifacts(ctx, onlineDDL.UUID); err != nil { 2712 return err 2713 } 2714 } 2715 2716 return nil 2717 } 2718 2719 func (e *Executor) executeCreateDDLActionMigration(ctx context.Context, onlineDDL *schema.OnlineDDL) error { 2720 failMigration := func(err error) error { 2721 return e.failMigration(ctx, onlineDDL, err) 2722 } 2723 e.migrationMutex.Lock() 2724 defer e.migrationMutex.Unlock() 2725 2726 ddlStmt, _, err := schema.ParseOnlineDDLStatement(onlineDDL.SQL) 2727 if err != nil { 2728 return failMigration(err) 2729 } 2730 if _, isCreateView := ddlStmt.(*sqlparser.CreateView); isCreateView { 2731 if ddlStmt.GetIsReplace() { 2732 // This is a CREATE OR REPLACE VIEW 2733 exists, err := e.tableExists(ctx, onlineDDL.Table) 2734 if err != nil { 2735 return failMigration(err) 2736 } 2737 if exists { 2738 // the view already exists. This CREATE OR REPLACE VIEW statement should 2739 // actually turn into an ALTER 2740 if err := e.executeAlterViewOnline(ctx, onlineDDL); err != nil { 2741 return failMigration(err) 2742 } 2743 return nil 2744 } 2745 } 2746 } 2747 // from now on, whether a VIEW or a TABLE, they get the same treatment 2748 2749 sentryArtifactTableName, err := schema.GenerateGCTableName(schema.HoldTableGCState, newGCTableRetainTime()) 2750 if err != nil { 2751 return failMigration(err) 2752 } 2753 // we create a dummy artifact. Its existence means the table was created by this migration. 2754 // It will be read by the revert operation. 2755 if err := e.updateArtifacts(ctx, onlineDDL.UUID, sentryArtifactTableName); err != nil { 2756 return err 2757 } 2758 2759 if ddlStmt.GetIfNotExists() { 2760 // This is a CREATE TABLE IF NOT EXISTS 2761 // We want to know if the table actually exists before running this migration. 2762 // If so, then the operation is noop, and when we revert the migration, we also do a noop. 
2763 exists, err := e.tableExists(ctx, onlineDDL.Table) 2764 if err != nil { 2765 return failMigration(err) 2766 } 2767 if exists { 2768 // the table already exists. This CREATE TABLE IF NOT EXISTS statement is a noop. 2769 // We therefore clear the artifact field. A revert operation will use this as a hint. 2770 if err := e.clearArtifacts(ctx, onlineDDL.UUID); err != nil { 2771 return failMigration(err) 2772 } 2773 } 2774 } 2775 if _, err := e.executeDirectly(ctx, onlineDDL); err != nil { 2776 return failMigration(err) 2777 } 2778 return nil 2779 } 2780 2781 // generateSwapTablesStatement creates a RENAME statement that swaps two tables, with assistance 2782 // of temporary third table. It returns the name of generated third table, though normally 2783 // that table should not exist before & after operation, only _during_ operation time. 2784 func (e *Executor) generateSwapTablesStatement(ctx context.Context, tableName1, tableName2 string) (query string, swapTableName string, err error) { 2785 swapTableName, err = schema.GenerateGCTableName(schema.HoldTableGCState, newGCTableRetainTime()) 2786 if err != nil { 2787 return "", swapTableName, err 2788 } 2789 parsed := sqlparser.BuildParsedQuery(sqlSwapTables, 2790 tableName1, swapTableName, 2791 tableName2, tableName1, 2792 swapTableName, tableName2, 2793 ) 2794 return parsed.Query, swapTableName, nil 2795 } 2796 2797 // renameTableIfApplicable renames a table, assuming it exists and that the target does not exist. 
2798 func (e *Executor) renameTableIfApplicable(ctx context.Context, fromTableName, toTableName string) (attemptMade bool, err error) { 2799 if fromTableName == "" { 2800 return false, nil 2801 } 2802 exists, err := e.tableExists(ctx, fromTableName) 2803 if err != nil { 2804 return false, err 2805 } 2806 if !exists { 2807 // can't rename from table when it does not exist 2808 return false, nil 2809 } 2810 exists, err = e.tableExists(ctx, toTableName) 2811 if err != nil { 2812 return false, err 2813 } 2814 if exists { 2815 // target table exists, abort. 2816 return false, nil 2817 } 2818 parsed := sqlparser.BuildParsedQuery(sqlRenameTable, fromTableName, toTableName) 2819 _, err = e.execQuery(ctx, parsed.Query) 2820 return true, err 2821 } 2822 2823 func (e *Executor) executeAlterViewOnline(ctx context.Context, onlineDDL *schema.OnlineDDL) (err error) { 2824 artifactViewName, err := schema.GenerateGCTableName(schema.HoldTableGCState, newGCTableRetainTime()) 2825 if err != nil { 2826 return err 2827 } 2828 stmt, _, err := schema.ParseOnlineDDLStatement(onlineDDL.SQL) 2829 if err != nil { 2830 return err 2831 } 2832 switch viewStmt := stmt.(type) { 2833 case *sqlparser.CreateView: 2834 stmt.SetTable("", artifactViewName) 2835 case *sqlparser.AlterView: 2836 // consolidate the logic. We treat ALTER like we treat CREATE OR REPLACE 2837 // it actually easier for us to issue a CREATE OR REPLACE, because it 2838 // actually creates a view... 2839 stmt = &sqlparser.CreateView{ 2840 Algorithm: viewStmt.Algorithm, 2841 Definer: viewStmt.Definer, 2842 Security: viewStmt.Security, 2843 Columns: viewStmt.Columns, 2844 Select: viewStmt.Select, 2845 CheckOption: viewStmt.CheckOption, 2846 IsReplace: true, 2847 Comments: viewStmt.Comments, 2848 } 2849 stmt.SetTable("", artifactViewName) 2850 default: 2851 return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "executeAlterViewOnline only supports CreateView and AlterView statements. 
Got: %v", sqlparser.String(viewStmt)) 2852 } 2853 artifactViewCreateSQL := sqlparser.String(stmt) 2854 2855 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaWithDB()) 2856 if err != nil { 2857 return err 2858 } 2859 defer conn.Close() 2860 2861 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusRunning, false, progressPctStarted, etaSecondsUnknown, rowsCopiedUnknown, emptyHint) 2862 2863 if _, err := conn.ExecuteFetch(artifactViewCreateSQL, 0, false); err != nil { 2864 return err 2865 } 2866 if err := e.clearArtifacts(ctx, onlineDDL.UUID); err != nil { 2867 return err 2868 } 2869 if err := e.updateArtifacts(ctx, onlineDDL.UUID, artifactViewName); err != nil { 2870 return err 2871 } 2872 2873 // view created in requested format, but under different name. We now swap the views 2874 swapQuery, _, err := e.generateSwapTablesStatement(ctx, onlineDDL.Table, artifactViewName) 2875 if err != nil { 2876 return err 2877 } 2878 if _, err := conn.ExecuteFetch(swapQuery, 0, false); err != nil { 2879 return err 2880 } 2881 // Make sure this is considered as an ALTER. 
2882 // Either the user issued a ALTER VIEW, and the action is trivially ALTER, 2883 // or the user issues a CREATE OR REPLACE, and the view existed, in which case this is implicitly an ALTER 2884 if err := e.updateDDLAction(ctx, onlineDDL.UUID, sqlparser.AlterStr); err != nil { 2885 return err 2886 } 2887 2888 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, rowsCopiedUnknown, emptyHint) 2889 2890 return nil 2891 } 2892 2893 // addInstantAlgorithm adds or modifies the AlterTable's ALGORITHM to INSTANT 2894 func (e *Executor) addInstantAlgorithm(alterTable *sqlparser.AlterTable) { 2895 instantOpt := sqlparser.AlgorithmValue("INSTANT") 2896 for i, opt := range alterTable.AlterOptions { 2897 if _, ok := opt.(sqlparser.AlgorithmValue); ok { 2898 // replace an existing algorithm 2899 alterTable.AlterOptions[i] = instantOpt 2900 return 2901 } 2902 } 2903 // append an algorithm 2904 alterTable.AlterOptions = append(alterTable.AlterOptions, instantOpt) 2905 } 2906 2907 // executeSpecialAlterDDLActionMigrationIfApplicable sees if the given migration can be executed via special execution path, that isn't a full blown online schema change process. 2908 func (e *Executor) executeSpecialAlterDDLActionMigrationIfApplicable(ctx context.Context, onlineDDL *schema.OnlineDDL) (specialMigrationExecuted bool, err error) { 2909 // Before we jump on to strategies... Some ALTERs can be optimized without having to run through 2910 // a full online schema change process. Let's find out if this is the case! 
2911 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaWithDB()) 2912 if err != nil { 2913 return false, err 2914 } 2915 defer conn.Close() 2916 _, capableOf, _ := mysql.GetFlavor(conn.ServerVersion, nil) 2917 2918 specialPlan, err := e.analyzeSpecialAlterPlan(ctx, onlineDDL, capableOf) 2919 if err != nil { 2920 return false, err 2921 } 2922 if specialPlan == nil { 2923 return false, nil 2924 } 2925 2926 switch specialPlan.operation { 2927 case instantDDLSpecialOperation: 2928 e.addInstantAlgorithm(specialPlan.alterTable) 2929 onlineDDL.SQL = sqlparser.CanonicalString(specialPlan.alterTable) 2930 if _, err := e.executeDirectly(ctx, onlineDDL); err != nil { 2931 return false, err 2932 } 2933 case dropRangePartitionSpecialOperation: 2934 dropPartition := func() error { 2935 artifactTableName, err := schema.GenerateGCTableName(schema.HoldTableGCState, newGCTableRetainTime()) 2936 if err != nil { 2937 return err 2938 } 2939 if err := e.updateArtifacts(ctx, onlineDDL.UUID, artifactTableName); err != nil { 2940 return err 2941 } 2942 2943 // Apply CREATE TABLE for artifact table 2944 if _, err := e.createTableLike(ctx, artifactTableName, onlineDDL, conn); err != nil { 2945 return err 2946 } 2947 // Remove partitioning 2948 parsed := sqlparser.BuildParsedQuery(sqlAlterTableRemovePartitioning, artifactTableName) 2949 if _, err := conn.ExecuteFetch(parsed.Query, 0, false); err != nil { 2950 return err 2951 } 2952 // Exchange with partition 2953 partitionName := specialPlan.Detail("partition_name") 2954 parsed = sqlparser.BuildParsedQuery(sqlAlterTableExchangePartition, onlineDDL.Table, partitionName, artifactTableName) 2955 if _, err := conn.ExecuteFetch(parsed.Query, 0, false); err != nil { 2956 return err 2957 } 2958 // Drop table's partition 2959 parsed = sqlparser.BuildParsedQuery(sqlAlterTableDropPartition, onlineDDL.Table, partitionName) 2960 if _, err := conn.ExecuteFetch(parsed.Query, 0, false); err != nil { 2961 return err 2962 } 2963 return nil 
2964 } 2965 if err := dropPartition(); err != nil { 2966 return false, err 2967 } 2968 case addRangePartitionSpecialOperation: 2969 if _, err := e.executeDirectly(ctx, onlineDDL); err != nil { 2970 return false, err 2971 } 2972 default: 2973 return false, nil 2974 } 2975 if err := e.updateMigrationSpecialPlan(ctx, onlineDDL.UUID, specialPlan.String()); err != nil { 2976 return true, err 2977 } 2978 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, rowsCopiedUnknown, emptyHint) 2979 return true, nil 2980 } 2981 2982 // executeAlterDDLActionMigration 2983 func (e *Executor) executeAlterDDLActionMigration(ctx context.Context, onlineDDL *schema.OnlineDDL) error { 2984 failMigration := func(err error) error { 2985 return e.failMigration(ctx, onlineDDL, err) 2986 } 2987 ddlStmt, _, err := schema.ParseOnlineDDLStatement(onlineDDL.SQL) 2988 if err != nil { 2989 return failMigration(err) 2990 } 2991 if _, isAlterView := ddlStmt.(*sqlparser.AlterView); isAlterView { 2992 // Same treatment for all online strategies 2993 exists, err := e.tableExists(ctx, onlineDDL.Table) 2994 if err != nil { 2995 return failMigration(err) 2996 } 2997 if !exists { 2998 // We cannot ALTER VIEW if the view does not exist. We could bail out directly here, 2999 // but we prefer to actually get an authentic MySQL error. We know MySQL will fail running 3000 // this statement. 3001 _, err := e.executeDirectly(ctx, onlineDDL) 3002 return failMigration(err) 3003 } 3004 // OK, view exists 3005 if err := e.executeAlterViewOnline(ctx, onlineDDL); err != nil { 3006 return failMigration(err) 3007 } 3008 return nil 3009 } 3010 // This is a real TABLE and not a VIEW 3011 3012 // Before we jump on to strategies... Some ALTERs can be optimized without having to run through 3013 // a full online schema change process. Let's find out if this is the case! 
3014 specialMigrationExecuted, err := e.executeSpecialAlterDDLActionMigrationIfApplicable(ctx, onlineDDL) 3015 if err != nil { 3016 return failMigration(err) 3017 } 3018 if specialMigrationExecuted { 3019 return nil 3020 } 3021 3022 // OK, nothing special about this ALTER. Let's go ahead and execute it. 3023 switch onlineDDL.Strategy { 3024 case schema.DDLStrategyOnline, schema.DDLStrategyVitess: 3025 go func() { 3026 e.migrationMutex.Lock() 3027 defer e.migrationMutex.Unlock() 3028 3029 if err := e.ExecuteWithVReplication(ctx, onlineDDL, nil); err != nil { 3030 failMigration(err) 3031 } 3032 }() 3033 case schema.DDLStrategyGhost: 3034 go func() { 3035 e.migrationMutex.Lock() 3036 defer e.migrationMutex.Unlock() 3037 3038 if err := e.ExecuteWithGhost(ctx, onlineDDL); err != nil { 3039 failMigration(err) 3040 } 3041 }() 3042 case schema.DDLStrategyPTOSC: 3043 go func() { 3044 e.migrationMutex.Lock() 3045 defer e.migrationMutex.Unlock() 3046 3047 if err := e.ExecuteWithPTOSC(ctx, onlineDDL); err != nil { 3048 failMigration(err) 3049 } 3050 }() 3051 case schema.DDLStrategyMySQL: 3052 go func() { 3053 e.migrationMutex.Lock() 3054 defer e.migrationMutex.Unlock() 3055 3056 if _, err := e.executeDirectly(ctx, onlineDDL); err != nil { 3057 failMigration(err) 3058 } 3059 }() 3060 default: 3061 { 3062 return failMigration(fmt.Errorf("Unsupported strategy: %+v", onlineDDL.Strategy)) 3063 } 3064 } 3065 return nil 3066 } 3067 3068 // executeMigration executes a single migration. It analyzes the migration type: 3069 // - is it declarative? 3070 // - is it CREATE / DROP / ALTER? 3071 // - it is a Revert request? 3072 // - what's the migration strategy? 3073 // The function invokes the appropriate handlers for each of those cases. 
3074 func (e *Executor) executeMigration(ctx context.Context, onlineDDL *schema.OnlineDDL) error { 3075 defer e.triggerNextCheckInterval() 3076 failMigration := func(err error) error { 3077 return e.failMigration(ctx, onlineDDL, err) 3078 } 3079 3080 ddlAction, err := onlineDDL.GetAction() 3081 if err != nil { 3082 return failMigration(err) 3083 } 3084 3085 // See if this is a duplicate submission. A submission is considered duplicate if it has the exact same 3086 // migration context and DDL as a previous one. We are only interested in our scenario in a duplicate 3087 // whose predecessor is "complete". If this is the case, then we can mark our own migration as 3088 // implicitly "complete", too. 3089 { 3090 completedUUID, err := e.getCompletedMigrationByContextAndSQL(ctx, onlineDDL) 3091 if err != nil { 3092 return err 3093 } 3094 if completedUUID != "" { 3095 // Yep. We mark this migration as implicitly complete, and we're done with it! 3096 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, rowsCopiedUnknown, emptyHint) 3097 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, fmt.Sprintf("duplicate DDL as %s for migration context %s", completedUUID, onlineDDL.MigrationContext)) 3098 return nil 3099 } 3100 } 3101 3102 if onlineDDL.StrategySetting().IsDeclarative() { 3103 switch ddlAction { 3104 case sqlparser.RevertDDLAction: 3105 // No special action. Declarative Revert migrations are handled like any normal Revert migration. 3106 case sqlparser.AlterDDLAction: 3107 return failMigration(vterrors.Errorf(vtrpcpb.Code_UNIMPLEMENTED, "strategy is declarative. 
ALTER cannot run in declarative mode for migration %v", onlineDDL.UUID)) 3108 case sqlparser.DropDDLAction: 3109 // This DROP is declarative, meaning it may: 3110 // - actually DROP a table, if that table exists, or 3111 // - Implicitly do nothing, if the table does not exist 3112 { 3113 // Sanity: reject IF NOT EXISTS statements, because they don't make sense (or are ambiguous) in declarative mode 3114 ddlStmt, _, err := schema.ParseOnlineDDLStatement(onlineDDL.SQL) 3115 if err != nil { 3116 return failMigration(err) 3117 } 3118 if ddlStmt.GetIfExists() { 3119 return failMigration(vterrors.Errorf(vtrpcpb.Code_UNIMPLEMENTED, "strategy is declarative. IF EXISTS does not work in declarative mode for migration %v", onlineDDL.UUID)) 3120 } 3121 } 3122 exists, err := e.tableExists(ctx, onlineDDL.Table) 3123 if err != nil { 3124 return failMigration(err) 3125 } 3126 if exists { 3127 // table does exist, so this declarative DROP turns out to really be an actual DROP. No further action is needed here 3128 } else { 3129 // table does not exist. 
We mark this DROP as implicitly sucessful 3130 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, rowsCopiedUnknown, emptyHint) 3131 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, "no change") 3132 return nil 3133 } 3134 case sqlparser.CreateDDLAction: 3135 // This CREATE is declarative, meaning it may: 3136 // - actually CREATE a table, if that table does not exist, or 3137 // - ALTER the table, if it exists and is different, or 3138 // - Implicitly do nothing, if the table exists and is identical to CREATE statement 3139 3140 // Sanity: reject IF NOT EXISTS statements, because they don't make sense (or are ambiguous) in declarative mode 3141 ddlStmt, _, err := schema.ParseOnlineDDLStatement(onlineDDL.SQL) 3142 if err != nil { 3143 return failMigration(err) 3144 } 3145 if ddlStmt.GetIfNotExists() { 3146 return failMigration(vterrors.Errorf(vtrpcpb.Code_UNIMPLEMENTED, "strategy is declarative. IF NOT EXISTS does not work in declarative mode for migration %v", onlineDDL.UUID)) 3147 } 3148 if ddlStmt.GetIsReplace() { 3149 return failMigration(vterrors.Errorf(vtrpcpb.Code_UNIMPLEMENTED, "strategy is declarative. OR REPLACE does not work in declarative mode for migration %v", onlineDDL.UUID)) 3150 } 3151 3152 exists, err := e.tableExists(ctx, onlineDDL.Table) 3153 if err != nil { 3154 return failMigration(err) 3155 } 3156 if exists { 3157 diff, err := e.evaluateDeclarativeDiff(ctx, onlineDDL) 3158 if err != nil { 3159 return failMigration(err) 3160 } 3161 if diff == nil || diff.IsEmpty() { 3162 // No diff! We mark this CREATE as implicitly sucessful 3163 _ = e.onSchemaMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusComplete, false, progressPctFull, etaSecondsNow, rowsCopiedUnknown, emptyHint) 3164 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, "no change") 3165 return nil 3166 } 3167 // alterClause is non empty. 
We convert this migration into an ALTER 3168 if err := e.updateDDLAction(ctx, onlineDDL.UUID, sqlparser.AlterStr); err != nil { 3169 return failMigration(err) 3170 } 3171 if createViewStmt, isCreateView := ddlStmt.(*sqlparser.CreateView); isCreateView { 3172 // Rewrite as CREATE OR REPLACE 3173 // this will be handled later on. 3174 createViewStmt.IsReplace = true 3175 onlineDDL.SQL = sqlparser.String(createViewStmt) 3176 } else { 3177 // a TABLE 3178 ddlAction = sqlparser.AlterDDLAction 3179 onlineDDL.SQL = diff.CanonicalStatementString() 3180 } 3181 _ = e.updateMigrationMessage(ctx, onlineDDL.UUID, diff.CanonicalStatementString()) 3182 } else { 3183 { 3184 // table does not exist, so this declarative CREATE turns out to really be an actual CREATE. No further action is needed here. 3185 // the statement is empty, but I want to keep the 'else' clause here just for sake of this comment. 3186 } 3187 } 3188 } 3189 } // endif onlineDDL.IsDeclarative() 3190 // Noting that if the migration is declarative, then it may have been modified in the above block, to meet the next operations. 3191 3192 switch ddlAction { 3193 case sqlparser.DropDDLAction: 3194 go func() error { 3195 return e.executeDropDDLActionMigration(ctx, onlineDDL) 3196 }() 3197 case sqlparser.CreateDDLAction: 3198 go func() error { 3199 return e.executeCreateDDLActionMigration(ctx, onlineDDL) 3200 }() 3201 case sqlparser.AlterDDLAction: 3202 return e.executeAlterDDLActionMigration(ctx, onlineDDL) 3203 case sqlparser.RevertDDLAction: 3204 go func() { 3205 e.migrationMutex.Lock() 3206 defer e.migrationMutex.Unlock() 3207 3208 if err := e.executeRevert(ctx, onlineDDL); err != nil { 3209 failMigration(err) 3210 } 3211 }() 3212 } 3213 return nil 3214 } 3215 3216 // runNextMigration picks up to one 'ready' migration that is able to run, and executes it. 
3217 // Possible scenarios: 3218 // - no migration is in 'ready' state -- nothing to be done 3219 // - a migration is 'ready', but conflicts with other running migrations -- try another 'ready' migration 3220 // - multiple migrations are 'ready' -- we just handle one here 3221 // Note that per the above breakdown, and due to potential conflicts, it is possible to have one or 3222 // more 'ready' migration, and still none is executed. 3223 func (e *Executor) runNextMigration(ctx context.Context) error { 3224 e.migrationMutex.Lock() 3225 defer e.migrationMutex.Unlock() 3226 3227 if !e.reviewedRunningMigrationsFlag { 3228 // Since Open(), we havent's once executed reviewRunningMigrations() successfully. 3229 // This means we may not have a good picture of what is actually running. Perhaps there's 3230 // a vreplication migration from a pre-PRS/ERS that we still need to learn about? 3231 // We're going to be careful here, and avoid running new migrations until we have 3232 // a better picture. It will likely take a couple seconds till next iteration. 3233 // This delay only takes place shortly after Open(). 3234 return nil 3235 } 3236 3237 // getNonConflictingMigration finds a single 'ready' migration which does not conflict with running migrations. 
3238 // Conflicts are: 3239 // - a migration is 'ready' but is not set to run _concurrently_, and there's a running migration that is also non-concurrent 3240 // - a migration is 'ready' but there's another migration 'running' on the exact same table 3241 getNonConflictingMigration := func() (*schema.OnlineDDL, error) { 3242 pendingMigrationsUUIDs, err := e.readPendingMigrationsUUIDs(ctx) 3243 if err != nil { 3244 return nil, err 3245 } 3246 r, err := e.execQuery(ctx, sqlSelectReadyMigrations) 3247 if err != nil { 3248 return nil, err 3249 } 3250 for _, row := range r.Named().Rows { 3251 uuid := row["migration_uuid"].ToString() 3252 onlineDDL, migrationRow, err := e.readMigration(ctx, uuid) 3253 if err != nil { 3254 return nil, err 3255 } 3256 isImmediateOperation := migrationRow.AsBool("is_immediate_operation", false) 3257 3258 if conflictFound, _ := e.isAnyConflictingMigrationRunning(onlineDDL); conflictFound { 3259 continue // this migration conflicts with a running one 3260 } 3261 if e.countOwnedRunningMigrations() >= maxConcurrentOnlineDDLs { 3262 continue // too many running migrations 3263 } 3264 if isImmediateOperation && onlineDDL.StrategySetting().IsInOrderCompletion() { 3265 // This migration is immediate: if we run it now, it will complete within a second or two at most. 3266 if len(pendingMigrationsUUIDs) > 0 && pendingMigrationsUUIDs[0] != onlineDDL.UUID { 3267 continue 3268 } 3269 } 3270 // This migration seems good to go 3271 return onlineDDL, err 3272 } 3273 // no non-conflicting migration found... 3274 // Either all ready migrations are conflicting, or there are no ready migrations... 
3275 return nil, nil 3276 } 3277 onlineDDL, err := getNonConflictingMigration() 3278 if err != nil { 3279 return err 3280 } 3281 if onlineDDL == nil { 3282 // nothing to do 3283 return nil 3284 } 3285 { 3286 // We strip out any VT query comments because our simplified parser doesn't work well with comments 3287 ddlStmt, _, err := schema.ParseOnlineDDLStatement(onlineDDL.SQL) 3288 if err == nil { 3289 ddlStmt.SetComments(sqlparser.Comments{}) 3290 onlineDDL.SQL = sqlparser.String(ddlStmt) 3291 } 3292 } 3293 log.Infof("Executor.runNextMigration: migration %s is non conflicting and will be executed next", onlineDDL.UUID) 3294 e.executeMigration(ctx, onlineDDL) 3295 return nil 3296 } 3297 3298 // isPTOSCMigrationRunning sees if pt-online-schema-change is running a specific migration, 3299 // by examining its PID file 3300 func (e *Executor) isPTOSCMigrationRunning(ctx context.Context, uuid string) (isRunning bool, pid int, err error) { 3301 // Try and read its PID file: 3302 content, err := os.ReadFile(e.ptPidFileName(uuid)) 3303 if err != nil { 3304 // file probably does not exist (migration not running) 3305 // or any other issue --> we can't confirm that the migration is actually running 3306 return false, pid, err 3307 } 3308 contentString := strings.TrimSpace(string(content)) 3309 // 3310 pid, err = strconv.Atoi(contentString) 3311 if err != nil { 3312 // can't get the PID right. Can't confirm migration is running. 3313 return false, pid, err 3314 } 3315 p, err := os.FindProcess(pid) 3316 if err != nil { 3317 // can't find the process. Can't confirm migration is running. 3318 return false, pid, err 3319 } 3320 err = p.Signal(syscall.Signal(0)) 3321 if err != nil { 3322 // can't verify process is running. Can't confirm migration is running. 3323 return false, pid, err 3324 } 3325 // AHA! We are able to confirm this pt-osc migration is actually running! 
3326 return true, pid, nil 3327 } 3328 3329 // dropOnlineDDLUser drops the given ddl user account at the end of migration 3330 func (e *Executor) dropPTOSCMigrationTriggers(ctx context.Context, onlineDDL *schema.OnlineDDL) error { 3331 conn, err := dbconnpool.NewDBConnection(ctx, e.env.Config().DB.DbaConnector()) 3332 if err != nil { 3333 return err 3334 } 3335 defer conn.Close() 3336 3337 parsed := sqlparser.BuildParsedQuery(sqlSelectPTOSCMigrationTriggers, ":mysql_schema", ":mysql_table") 3338 bindVars := map[string]*querypb.BindVariable{ 3339 "mysql_schema": sqltypes.StringBindVariable(onlineDDL.Schema), 3340 "mysql_table": sqltypes.StringBindVariable(onlineDDL.Table), 3341 } 3342 bound, err := parsed.GenerateQuery(bindVars, nil) 3343 if err != nil { 3344 return err 3345 } 3346 r, err := e.execQuery(ctx, bound) 3347 if err != nil { 3348 return err 3349 } 3350 for _, row := range r.Named().Rows { 3351 // iterate pt-osc triggers and drop them 3352 triggerSchema := row.AsString("trigger_schema", "") 3353 triggerName := row.AsString("trigger_name", "") 3354 3355 dropParsed := sqlparser.BuildParsedQuery(sqlDropTrigger, triggerSchema, triggerName) 3356 if _, err := conn.ExecuteFetch(dropParsed.Query, 0, false); err != nil { 3357 return err 3358 } 3359 } 3360 3361 return err 3362 } 3363 3364 // readVReplStream reads _vt.vreplication entries for given workflow 3365 func (e *Executor) readVReplStream(ctx context.Context, uuid string, okIfMissing bool) (*VReplStream, error) { 3366 query, err := sqlparser.ParseAndBind(sqlReadVReplStream, 3367 sqltypes.StringBindVariable(uuid), 3368 ) 3369 if err != nil { 3370 return nil, err 3371 } 3372 r, err := e.execQuery(ctx, query) 3373 if err != nil { 3374 return nil, err 3375 } 3376 if len(r.Rows) == 0 && okIfMissing { 3377 return nil, nil 3378 } 3379 row := r.Named().Row() 3380 if row == nil { 3381 return nil, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "Cannot find unique workflow for UUID: %+v", uuid) 3382 } 3383 s := &VReplStream{ 
3384 id: row.AsInt64("id", 0), 3385 workflow: row.AsString("workflow", ""), 3386 source: row.AsString("source", ""), 3387 pos: row.AsString("pos", ""), 3388 timeUpdated: row.AsInt64("time_updated", 0), 3389 timeHeartbeat: row.AsInt64("time_heartbeat", 0), 3390 timeThrottled: row.AsInt64("time_throttled", 0), 3391 componentThrottled: row.AsString("component_throttled", ""), 3392 transactionTimestamp: row.AsInt64("transaction_timestamp", 0), 3393 state: row.AsString("state", ""), 3394 message: row.AsString("message", ""), 3395 rowsCopied: row.AsInt64("rows_copied", 0), 3396 bls: &binlogdatapb.BinlogSource{}, 3397 } 3398 if err := prototext.Unmarshal([]byte(s.source), s.bls); err != nil { 3399 return nil, err 3400 } 3401 return s, nil 3402 } 3403 3404 // isVReplMigrationReadyToCutOver sees if the vreplication migration has completed the row copy 3405 // and is up to date with the binlogs. 3406 func (e *Executor) isVReplMigrationReadyToCutOver(ctx context.Context, s *VReplStream) (isReady bool, err error) { 3407 // Check all the cases where migration is still running: 3408 { 3409 // when ready to cut-over, pos must have some value 3410 if s.pos == "" { 3411 return false, nil 3412 } 3413 } 3414 { 3415 // Both time_updated and transaction_timestamp must be in close priximity to each 3416 // other and to the time now, otherwise that means we're lagging and it's not a good time 3417 // to cut-over 3418 durationDiff := func(t1, t2 time.Time) time.Duration { 3419 diff := t1.Sub(t2) 3420 if diff < 0 { 3421 diff = -diff 3422 } 3423 return diff 3424 } 3425 timeNow := time.Now() 3426 timeUpdated := time.Unix(s.timeUpdated, 0) 3427 if durationDiff(timeNow, timeUpdated) > vreplicationCutOverThreshold { 3428 return false, nil 3429 } 3430 // Let's look at transaction timestamp. 
This gets written by any ongoing 3431 // writes on the server (whether on this table or any other table) 3432 transactionTimestamp := time.Unix(s.transactionTimestamp, 0) 3433 if durationDiff(timeNow, transactionTimestamp) > vreplicationCutOverThreshold { 3434 return false, nil 3435 } 3436 } 3437 { 3438 // copy_state must have no entries for this vreplication id: if entries are 3439 // present that means copy is still in progress 3440 query, err := sqlparser.ParseAndBind(sqlReadCountCopyState, 3441 sqltypes.Int64BindVariable(s.id), 3442 ) 3443 if err != nil { 3444 return false, err 3445 } 3446 r, err := e.execQuery(ctx, query) 3447 if err != nil { 3448 return false, err 3449 } 3450 csRow := r.Named().Row() 3451 if csRow == nil { 3452 return false, err 3453 } 3454 count := csRow.AsInt64("cnt", 0) 3455 if count > 0 { 3456 // Still copying 3457 return false, nil 3458 } 3459 } 3460 3461 return true, nil 3462 } 3463 3464 // isVReplMigrationRunning sees if there is a VReplication migration actively running 3465 func (e *Executor) isVReplMigrationRunning(ctx context.Context, uuid string) (isRunning bool, s *VReplStream, err error) { 3466 s, err = e.readVReplStream(ctx, uuid, true) 3467 if err != nil { 3468 return false, s, err 3469 } 3470 if s == nil { 3471 return false, s, nil 3472 } 3473 switch s.state { 3474 case binlogplayer.BlpError: 3475 return false, s, nil 3476 case binlogplayer.VReplicationInit, binlogplayer.VReplicationCopying, binlogplayer.BlpRunning: 3477 return true, s, nil 3478 } 3479 if strings.Contains(strings.ToLower(s.message), "error") { 3480 return false, s, nil 3481 } 3482 return false, s, nil 3483 } 3484 3485 // reviewRunningMigrations iterates migrations in 'running' state. Normally there's only one running, which was 3486 // spawned by this tablet; but vreplication migrations could also resume from failure. 
func (e *Executor) reviewRunningMigrations(ctx context.Context) (countRunnning int, cancellable []*cancellableMigration, err error) {
	// Returns:
	//   - countRunnning: number of 'running' rows fully processed by the loop below. Rows skipped via
	//     `continue` (vreplication migrations with no stream yet) are not counted.
	//   - cancellable: migrations the caller should cancel (vreplication errors, or gh-ost/pt-osc
	//     migrations not owned by this executor).
	//   - err: first error encountered; processing stops at that point.
	// The whole function runs under migrationMutex.
	e.migrationMutex.Lock()
	defer e.migrationMutex.Unlock()

	// Nothing to review while the executor is closed.
	if atomic.LoadInt64(&e.isOpen) == 0 {
		return countRunnning, cancellable, nil
	}

	var currentUserThrottleRatio float64
	if err := e.lagThrottler.CheckIsReady(); err == nil {
		// No point in reviewing throttler info if it's not enabled&open
		for _, app := range e.lagThrottler.ThrottledApps() {
			if app.AppName == throttlerOnlineDDLApp {
				currentUserThrottleRatio = app.Ratio
				break
			}
		}
	}

	// throttlerOnce ensures the throttler "kick" further below happens at most once per review.
	var throttlerOnce sync.Once
	r, err := e.execQuery(ctx, sqlSelectRunningMigrations)
	if err != nil {
		return countRunnning, cancellable, err
	}
	pendingMigrationsUUIDs, err := e.readPendingMigrationsUUIDs(ctx)
	if err != nil {
		return countRunnning, cancellable, err
	}
	uuidsFoundRunning := map[string]bool{}
	for _, row := range r.Named().Rows {
		uuid := row["migration_uuid"].ToString()
		onlineDDL, migrationRow, err := e.readMigration(ctx, uuid)
		if err != nil {
			return countRunnning, cancellable, err
		}
		postponeCompletion := row.AsBool("postpone_completion", false)
		elapsedSeconds := row.AsInt64("elapsed_seconds", 0)

		if stowawayTable := row.AsString("stowaway_table", ""); stowawayTable != "" {
			// whoa
			// stowawayTable is an original table stowed away while cutting over a vrepl migration, see call to cutOverVReplMigration() down below in this function.
			// In a normal operation, the table should not exist outside the scope of cutOverVReplMigration
			// If it exists, that means a tablet crashed while running a cut-over, and left the database in a bad state, where the migrated table does not exist.
			// thankfully, we have tracked this situation and just realized what happened. Now, first thing to do is to restore the original table.
			log.Infof("found stowaway table %s journal in migration %s for table %s", stowawayTable, uuid, onlineDDL.Table)
			attemptMade, err := e.renameTableIfApplicable(ctx, stowawayTable, onlineDDL.Table)
			if err != nil {
				// unable to restore table; we bail out, and we will try again next round.
				return countRunnning, cancellable, err
			}
			// success
			if attemptMade {
				log.Infof("stowaway table %s restored back into %s", stowawayTable, onlineDDL.Table)
			} else {
				log.Infof("stowaway table %s did not exist and there was no need to restore it", stowawayTable)
			}
			// OK good, table restored. We can remove the record.
			if err := e.updateMigrationStowawayTable(ctx, uuid, ""); err != nil {
				return countRunnning, cancellable, err
			}
		}

		uuidsFoundRunning[uuid] = true

		// Best effort: a failure to record the throttle ratio does not stop the review.
		_ = e.updateMigrationUserThrottleRatio(ctx, uuid, currentUserThrottleRatio)
		switch onlineDDL.StrategySetting().Strategy {
		case schema.DDLStrategyOnline, schema.DDLStrategyVitess:
			{
				// We check the _vt.vreplication table
				s, err := e.readVReplStream(ctx, uuid, true)
				if err != nil {
					return countRunnning, cancellable, err
				}
				isVreplicationTestSuite := onlineDDL.StrategySetting().IsVreplicationTestSuite()
				if isVreplicationTestSuite {
					e.triggerNextCheckInterval()
				}
				if s == nil {
					// No vreplication stream for this migration; move on to the next row.
					// Note this also skips the countRunnning increment at the end of the loop body.
					continue
				}
				// Let's see if vreplication indicates an error. Many errors are recoverable, and
				// we do not wish to fail on first sight. We will use LastError to repeatedly
				// check if this error persists, until finally, after some timeout, we give up.
				if _, ok := e.vreplicationLastError[uuid]; !ok {
					e.vreplicationLastError[uuid] = vterrors.NewLastError(
						fmt.Sprintf("Online DDL migration %v", uuid),
						staleMigrationMinutes*time.Minute,
					)
				}
				lastError := e.vreplicationLastError[uuid]
				isTerminal, vreplError := s.hasError()
				lastError.Record(vreplError)
				if isTerminal || !lastError.ShouldRetry() {
					cancellable = append(cancellable, newCancellableMigration(uuid, s.message))
				}
				if s.isRunning() {
					// This VRepl migration may have started from outside this tablet, so
					// this executor may not own the migration _yet_. We make sure to own it.
					// VReplication migrations are unique in this respect: we are able to complete
					// a vreplication migration started by another tablet.
					e.ownedRunningMigrations.Store(uuid, onlineDDL)
					if lastVitessLivenessIndicator := migrationRow.AsInt64("vitess_liveness_indicator", 0); lastVitessLivenessIndicator < s.livenessTimeIndicator() {
						_ = e.updateMigrationTimestamp(ctx, "liveness_timestamp", uuid)
						_ = e.updateVitessLivenessIndicator(ctx, uuid, s.livenessTimeIndicator())
					}
					if onlineDDL.TabletAlias != e.TabletAliasString() {
						_ = e.updateMigrationTablet(ctx, uuid)
						log.Infof("migration %s adopted by tablet %s", uuid, e.TabletAliasString())
					}
					// Progress bookkeeping; all best effort.
					_ = e.updateRowsCopied(ctx, uuid, s.rowsCopied)
					_ = e.updateMigrationProgressByRowsCopied(ctx, uuid, s.rowsCopied)
					_ = e.updateMigrationETASecondsByProgress(ctx, uuid)
					_ = e.updateMigrationLastThrottled(ctx, uuid, s.timeThrottled, s.componentThrottled)

					isReady, err := e.isVReplMigrationReadyToCutOver(ctx, s)
					if err != nil {
						_ = e.updateMigrationMessage(ctx, uuid, err.Error())
						return countRunnning, cancellable, err
					}
					if isReady && isVreplicationTestSuite {
						// This is a endtoend test suite execution. We intentionally delay it by at least
						// vreplicationTestSuiteWaitSeconds
						if elapsedSeconds < vreplicationTestSuiteWaitSeconds {
							isReady = false
						}
					}
					// Indicate to outside observers whether the migration is generally ready to complete.
					// In the case of a postponed migration, we will not complete it, but the user will
					// understand whether "now is a good time" or "not there yet"
					_ = e.updateMigrationReadyToComplete(ctx, uuid, isReady)
					if postponeCompletion {
						// override. Even if migration is ready, we do not complete it.
						isReady = false
					}
					if isReady && onlineDDL.StrategySetting().IsInOrderCompletion() {
						if len(pendingMigrationsUUIDs) > 0 && pendingMigrationsUUIDs[0] != onlineDDL.UUID {
							// wait for earlier pending migrations to complete
							isReady = false
						}
					}
					if isReady {
						if err := e.cutOverVReplMigration(ctx, s); err != nil {
							_ = e.updateMigrationMessage(ctx, uuid, err.Error())
							log.Errorf("cutOverVReplMigration failed: err=%v", err)
							// A too-long identifier error leads to cancelling the migration. The
							// cancellation runs in its own goroutine; this function returns the error right after.
							if merr, ok := err.(*mysql.SQLError); ok {
								switch merr.Num {
								case mysql.ERTooLongIdent:
									go e.CancelMigration(ctx, uuid, err.Error(), false)
								}
							}
							return countRunnning, cancellable, err
						}
					}
					// The throttler check runs in a separate goroutine so it does not hold up the review.
					go throttlerOnce.Do(func() {
						if e.lagThrottler.CheckIsReady() != nil {
							return
						}
						// Self healing: in the following scenario:
						// - a vitess migration
						// - with on demand heartbeats
						// - the streamer running on a replica
						// - the streamer was throttled for long enough
						// - then vplayer and vcopier are locked, waiting for the streamer to do something
						// - since they are blocked, they're not running throttler checks
						// - since streamer runs on replica, it only checks that replica
						// - therefore no one asking for on-demand heartbeats
						// - then, if the conditions for the streamer's throttling are done, the streamer then thinks there's replication lag, with nothing to remediate it.
						// - it's a deadlock.
						// And so, once per reviewRunningMigrations(), and assuming there _are_ running migrations, we ensure to hit a throttler check. This will kick
						// on-demand heartbeats, unlocking the deadlock.
						e.lagThrottler.CheckByType(ctx, throttlerOnlineDDLApp, "", throttleCheckFlags, throttle.ThrottleCheckPrimaryWrite)
					})
				}
			}
		case schema.DDLStrategyPTOSC:
			{
				// Since pt-osc doesn't have a "liveness" plugin entry point, we do it externally:
				// if the process is alive, we update the `liveness_timestamp` for this migration.
				// NOTE(review): isPTOSCMigrationRunning also returns an error when the PID file is
				// missing, which aborts the whole review here -- confirm this is intended.
				running, _, err := e.isPTOSCMigrationRunning(ctx, uuid)
				if err != nil {
					return countRunnning, cancellable, err
				}
				if running {
					_ = e.updateMigrationTimestamp(ctx, "liveness_timestamp", uuid)
				}
				if _, ok := e.ownedRunningMigrations.Load(uuid); !ok {
					// Ummm, the migration is running but we don't own it. This means the migration
					// is rogue. Maybe executed by another tablet. Anyway, if we don't own it, we can't
					// complete the migration. Even if it runs, the logic around announcing it as complete
					// is missing. So we may as well cancel it.
					message := fmt.Sprintf("cancelling a pt-osc running migration %s which is not owned (not started, or is assumed to be terminated) by this executor", uuid)
					cancellable = append(cancellable, newCancellableMigration(uuid, message))
				}
			}
		case schema.DDLStrategyGhost:
			{
				if _, ok := e.ownedRunningMigrations.Load(uuid); !ok {
					// Ummm, the migration is running but we don't own it. This means the migration
					// is rogue. Maybe executed by another tablet. Anyway, if we don't own it, we can't
					// complete the migration. Even if it runs, the logic around announcing it as complete
					// is missing. So we may as well cancel it.
					message := fmt.Sprintf("cancelling a gh-ost running migration %s which is not owned (not started, or is assumed to be terminated) by this executor", uuid)
					cancellable = append(cancellable, newCancellableMigration(uuid, message))
				}
			}
		}
		countRunnning++
	}
	{
		// now, let's look at UUIDs we own and _think_ should be running, and see which of them _isn't_ actually running or pending...
		uuidsFoundPending := map[string]bool{}
		for _, uuid := range pendingMigrationsUUIDs {
			uuidsFoundPending[uuid] = true
		}

		e.ownedRunningMigrations.Range(func(k, _ any) bool {
			uuid, ok := k.(string)
			if !ok {
				// Not a string key; ignore it and keep iterating.
				return true
			}
			// due to race condition, it's possible that ownedRunningMigrations will list a migration
			// that is _just about to run_ but is still, in fact, in `ready` state. This is fine.
			// If we find such a migration, we do nothing. We're only looking for migrations we really
			// don't have any information of.
			if !uuidsFoundRunning[uuid] && !uuidsFoundPending[uuid] {
				log.Infof("removing migration %s from ownedRunningMigrations because it's not running and not pending", uuid)
				e.ownedRunningMigrations.Delete(uuid)
			}
			return true
		})
	}

	// Mark that a full review has completed; runNextMigration refuses to run until this is set.
	e.reviewedRunningMigrationsFlag = true
	return countRunnning, cancellable, nil
}

// reviewStaleMigrations marks as 'failed' migrations whose status is 'running' but which have
// shown no liveness in past X minutes.
It also attempts to terminate them 3725 func (e *Executor) reviewStaleMigrations(ctx context.Context) error { 3726 e.migrationMutex.Lock() 3727 defer e.migrationMutex.Unlock() 3728 3729 query, err := sqlparser.ParseAndBind(sqlSelectStaleMigrations, 3730 sqltypes.Int64BindVariable(staleMigrationMinutes), 3731 ) 3732 if err != nil { 3733 return err 3734 } 3735 r, err := e.execQuery(ctx, query) 3736 if err != nil { 3737 return err 3738 } 3739 for _, row := range r.Named().Rows { 3740 uuid := row["migration_uuid"].ToString() 3741 3742 onlineDDL, _, err := e.readMigration(ctx, uuid) 3743 if err != nil { 3744 return err 3745 } 3746 log.Infof("reviewStaleMigrations: stale migration found: %s", onlineDDL.UUID) 3747 message := fmt.Sprintf("stale migration %s: found running but indicates no liveness in the past %v minutes", onlineDDL.UUID, staleMigrationMinutes) 3748 if onlineDDL.TabletAlias != e.TabletAliasString() { 3749 // This means another tablet started the migration, and the migration has failed due to the tablet failure (e.g. primary failover) 3750 if err := e.updateTabletFailure(ctx, onlineDDL.UUID); err != nil { 3751 return err 3752 } 3753 message = fmt.Sprintf("%s; executed by different tablet %s", message, onlineDDL.TabletAlias) 3754 } 3755 if _, err := e.terminateMigration(ctx, onlineDDL); err != nil { 3756 message = fmt.Sprintf("error terminating migration (%v): %v", message, err) 3757 e.updateMigrationMessage(ctx, onlineDDL.UUID, message) 3758 continue // we still want to handle rest of migrations 3759 } 3760 if err := e.updateMigrationMessage(ctx, onlineDDL.UUID, message); err != nil { 3761 return err 3762 } 3763 if err := e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusFailed); err != nil { 3764 return err 3765 } 3766 defer e.triggerNextCheckInterval() 3767 _ = e.updateMigrationStartedTimestamp(ctx, uuid) 3768 // Because the migration is stale, it may not update completed_timestamp. 
It is essential to set completed_timestamp 3769 // as this is then used when cleaning artifacts 3770 if err := e.updateMigrationTimestamp(ctx, "completed_timestamp", onlineDDL.UUID); err != nil { 3771 return err 3772 } 3773 } 3774 3775 return nil 3776 } 3777 3778 // retryTabletFailureMigrations looks for migrations failed by tablet failure (e.g. by failover) 3779 // and retry them (put them back in the queue) 3780 func (e *Executor) retryTabletFailureMigrations(ctx context.Context) error { 3781 _, err := e.retryMigrationWhere(ctx, sqlWhereTabletFailure) 3782 return err 3783 } 3784 3785 func (e *Executor) tabletManagerClient() tmclient.TabletManagerClient { 3786 return tmclient.NewTabletManagerClient() 3787 } 3788 3789 // vreplicationExec runs a vreplication query, and makes sure to initialize vreplication 3790 func (e *Executor) vreplicationExec(ctx context.Context, tablet *topodatapb.Tablet, query string) (*querypb.QueryResult, error) { 3791 tmClient := e.tabletManagerClient() 3792 defer tmClient.Close() 3793 3794 return tmClient.VReplicationExec(ctx, tablet, query) 3795 } 3796 3797 // reloadSchema issues a ReloadSchema on this tablet 3798 func (e *Executor) reloadSchema(ctx context.Context) error { 3799 tmClient := e.tabletManagerClient() 3800 defer tmClient.Close() 3801 3802 tablet, err := e.ts.GetTablet(ctx, e.tabletAlias) 3803 if err != nil { 3804 return err 3805 } 3806 return tmClient.ReloadSchema(ctx, tablet.Tablet, "") 3807 } 3808 3809 // deleteVReplicationEntry cleans up a _vt.vreplication entry; this function is called as part of 3810 // migration termination and as part of artifact cleanup 3811 func (e *Executor) deleteVReplicationEntry(ctx context.Context, uuid string) error { 3812 query, err := sqlparser.ParseAndBind(sqlDeleteVReplStream, 3813 sqltypes.StringBindVariable(e.dbName), 3814 sqltypes.StringBindVariable(uuid), 3815 ) 3816 if err != nil { 3817 return err 3818 } 3819 tablet, err := e.ts.GetTablet(ctx, e.tabletAlias) 3820 if err != nil { 3821 
return err 3822 } 3823 3824 if _, err := e.vreplicationExec(ctx, tablet.Tablet, query); err != nil { 3825 return err 3826 } 3827 return nil 3828 } 3829 3830 // gcArtifactTable garbage-collects a single table 3831 func (e *Executor) gcArtifactTable(ctx context.Context, artifactTable, uuid string, t time.Time) (string, error) { 3832 tableExists, err := e.tableExists(ctx, artifactTable) 3833 if err != nil { 3834 return "", err 3835 } 3836 if !tableExists { 3837 return "", nil 3838 } 3839 // We've already concluded in gcArtifacts() that this table was held for long enough. 3840 // We therefore move it into PURGE state. 3841 renameStatement, toTableName, err := schema.GenerateRenameStatementWithUUID(artifactTable, schema.PurgeTableGCState, schema.OnlineDDLToGCUUID(uuid), t) 3842 if err != nil { 3843 return toTableName, err 3844 } 3845 _, err = e.execQuery(ctx, renameStatement) 3846 return toTableName, err 3847 } 3848 3849 // gcArtifacts garbage-collects migration artifacts from completed/failed migrations 3850 func (e *Executor) gcArtifacts(ctx context.Context) error { 3851 e.migrationMutex.Lock() 3852 defer e.migrationMutex.Unlock() 3853 3854 if _, err := e.execQuery(ctx, sqlFixCompletedTimestamp); err != nil { 3855 // This query fixes a bug where stale migrations were marked as 'failed' without updating 'completed_timestamp' 3856 // see https://github.com/vitessio/vitess/issues/8499 3857 // Running this query retroactively sets completed_timestamp 3858 // This 'if' clause can be removed in version v13 3859 return err 3860 } 3861 query, err := sqlparser.ParseAndBind(sqlSelectUncollectedArtifacts, 3862 sqltypes.Int64BindVariable(int64((retainOnlineDDLTables).Seconds())), 3863 ) 3864 if err != nil { 3865 return err 3866 } 3867 r, err := e.execQuery(ctx, query) 3868 if err != nil { 3869 return err 3870 } 3871 for _, row := range r.Named().Rows { 3872 uuid := row["migration_uuid"].ToString() 3873 artifacts := row["artifacts"].ToString() 3874 logPath := 
row["log_path"].ToString() 3875 3876 log.Infof("Executor.gcArtifacts: will GC artifacts for migration %s", uuid) 3877 // Remove tables: 3878 artifactTables := textutil.SplitDelimitedList(artifacts) 3879 3880 timeNow := time.Now() 3881 for i, artifactTable := range artifactTables { 3882 // We wish to generate distinct timestamp values for each table in this UUID, 3883 // because all tables will be renamed as _something_UUID_timestamp. Since UUID 3884 // is shared for all artifacts in this loop, we differentiate via timestamp 3885 log.Infof("Executor.gcArtifacts: will GC artifact %s for migration %s", artifactTable, uuid) 3886 t := timeNow.Add(time.Duration(i) * time.Second).UTC() 3887 toTableName, err := e.gcArtifactTable(ctx, artifactTable, uuid, t) 3888 if err != nil { 3889 return vterrors.Wrapf(err, "in gcArtifacts() for %s", artifactTable) 3890 } 3891 log.Infof("Executor.gcArtifacts: renamed away artifact %s to %s", artifactTable, toTableName) 3892 } 3893 3894 // Remove logs: 3895 { 3896 // logPath is in 'hostname:/path/to/logs' format 3897 tokens := strings.SplitN(logPath, ":", 2) 3898 logPath = tokens[len(tokens)-1] 3899 if err := os.RemoveAll(logPath); err != nil { 3900 return err 3901 } 3902 } 3903 3904 // while the next function only applies to 'online' strategy ALTER and REVERT, there is no 3905 // harm in invoking it for other migrations. 3906 if err := e.deleteVReplicationEntry(ctx, uuid); err != nil { 3907 return err 3908 } 3909 3910 if err := e.updateMigrationTimestamp(ctx, "cleanup_timestamp", uuid); err != nil { 3911 return err 3912 } 3913 log.Infof("Executor.gcArtifacts: done migration %s", uuid) 3914 } 3915 3916 return nil 3917 } 3918 3919 // onMigrationCheckTick runs all migrations life cycle 3920 func (e *Executor) onMigrationCheckTick() { 3921 // This function can be called by multiple triggers. First, there's the normal ticker. 3922 // Then, any time a migration completes, we set a timer to trigger this function. 
3923 // also, any time a new INSERT arrives, we set a timer to trigger this function. 3924 // Some of these may be correlated. To avoid spamming of this function we: 3925 // - ensure the function is non-reentrant, using tickReentranceFlag 3926 // - clean up tickReentranceFlag 1 second after function completes; this throttles calls to 3927 // this function at no more than 1/sec rate. 3928 if atomic.CompareAndSwapInt64(&e.tickReentranceFlag, 0, 1) { 3929 defer time.AfterFunc(time.Second, func() { atomic.StoreInt64(&e.tickReentranceFlag, 0) }) 3930 } else { 3931 // An instance of this function is already running 3932 return 3933 } 3934 3935 if e.tabletTypeFunc() != topodatapb.TabletType_PRIMARY { 3936 return 3937 } 3938 if e.keyspace == "" { 3939 log.Errorf("Executor.onMigrationCheckTick(): empty keyspace") 3940 return 3941 } 3942 3943 ctx := context.Background() 3944 if err := e.retryTabletFailureMigrations(ctx); err != nil { 3945 log.Error(err) 3946 } 3947 if err := e.reviewQueuedMigrations(ctx); err != nil { 3948 log.Error(err) 3949 } 3950 if err := e.scheduleNextMigration(ctx); err != nil { 3951 log.Error(err) 3952 } 3953 if err := e.runNextMigration(ctx); err != nil { 3954 log.Error(err) 3955 } 3956 if _, cancellable, err := e.reviewRunningMigrations(ctx); err != nil { 3957 log.Error(err) 3958 } else if err := e.cancelMigrations(ctx, cancellable, false); err != nil { 3959 log.Error(err) 3960 } 3961 if err := e.reviewStaleMigrations(ctx); err != nil { 3962 log.Error(err) 3963 } 3964 if err := e.gcArtifacts(ctx); err != nil { 3965 log.Error(err) 3966 } 3967 } 3968 3969 func (e *Executor) updateMigrationStartedTimestamp(ctx context.Context, uuid string) error { 3970 parsed := sqlparser.BuildParsedQuery(sqlUpdateMigrationStartedTimestamp, 3971 ":migration_uuid", 3972 ) 3973 bindVars := map[string]*querypb.BindVariable{ 3974 "migration_uuid": sqltypes.StringBindVariable(uuid), 3975 } 3976 bound, err := parsed.GenerateQuery(bindVars, nil) 3977 if err != nil { 3978 
return err 3979 } 3980 _, err = e.execQuery(ctx, bound) 3981 if err != nil { 3982 log.Errorf("FAIL updateMigrationStartedTimestamp: uuid=%s, error=%v", uuid, err) 3983 } 3984 return err 3985 } 3986 3987 func (e *Executor) updateMigrationTimestamp(ctx context.Context, timestampColumn string, uuid string) error { 3988 parsed := sqlparser.BuildParsedQuery(sqlUpdateMigrationTimestamp, timestampColumn, 3989 ":migration_uuid", 3990 ) 3991 bindVars := map[string]*querypb.BindVariable{ 3992 "migration_uuid": sqltypes.StringBindVariable(uuid), 3993 } 3994 bound, err := parsed.GenerateQuery(bindVars, nil) 3995 if err != nil { 3996 return err 3997 } 3998 _, err = e.execQuery(ctx, bound) 3999 if err != nil { 4000 log.Errorf("FAIL updateMigrationStartedTimestamp: uuid=%s, timestampColumn=%v, error=%v", uuid, timestampColumn, err) 4001 } 4002 return err 4003 } 4004 4005 func (e *Executor) updateMigrationLogPath(ctx context.Context, uuid string, hostname, logPath string) error { 4006 logFile := path.Join(logPath, migrationLogFileName) 4007 hostLogPath := fmt.Sprintf("%s:%s", hostname, logPath) 4008 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationLogPath, 4009 sqltypes.StringBindVariable(hostLogPath), 4010 sqltypes.StringBindVariable(logFile), 4011 sqltypes.StringBindVariable(uuid), 4012 ) 4013 if err != nil { 4014 return err 4015 } 4016 _, err = e.execQuery(ctx, query) 4017 return err 4018 } 4019 4020 func (e *Executor) updateArtifacts(ctx context.Context, uuid string, artifacts ...string) error { 4021 bindArtifacts := strings.Join(artifacts, ",") 4022 query, err := sqlparser.ParseAndBind(sqlUpdateArtifacts, 4023 sqltypes.StringBindVariable(bindArtifacts), 4024 sqltypes.StringBindVariable(uuid), 4025 ) 4026 if err != nil { 4027 return err 4028 } 4029 _, err = e.execQuery(ctx, query) 4030 return err 4031 } 4032 4033 func (e *Executor) clearArtifacts(ctx context.Context, uuid string) error { 4034 query, err := sqlparser.ParseAndBind(sqlClearArtifacts, 4035 
sqltypes.StringBindVariable(uuid), 4036 ) 4037 if err != nil { 4038 return err 4039 } 4040 _, err = e.execQuery(ctx, query) 4041 return err 4042 } 4043 4044 func (e *Executor) updateMigrationSpecialPlan(ctx context.Context, uuid string, specialPlan string) error { 4045 query, err := sqlparser.ParseAndBind(sqlUpdateSpecialPlan, 4046 sqltypes.StringBindVariable(specialPlan), 4047 sqltypes.StringBindVariable(uuid), 4048 ) 4049 if err != nil { 4050 return err 4051 } 4052 _, err = e.execQuery(ctx, query) 4053 return err 4054 } 4055 4056 func (e *Executor) updateMigrationStage(ctx context.Context, uuid string, stage string, args ...interface{}) error { 4057 msg := fmt.Sprintf(stage, args...) 4058 log.Infof("updateMigrationStage: uuid=%s, stage=%s", uuid, msg) 4059 query, err := sqlparser.ParseAndBind(sqlUpdateStage, 4060 sqltypes.StringBindVariable(msg), 4061 sqltypes.StringBindVariable(uuid), 4062 ) 4063 if err != nil { 4064 return err 4065 } 4066 _, err = e.execQuery(ctx, query) 4067 return err 4068 } 4069 4070 func (e *Executor) incrementCutoverAttempts(ctx context.Context, uuid string) error { 4071 query, err := sqlparser.ParseAndBind(sqlIncrementCutoverAttempts, 4072 sqltypes.StringBindVariable(uuid), 4073 ) 4074 if err != nil { 4075 return err 4076 } 4077 _, err = e.execQuery(ctx, query) 4078 return err 4079 } 4080 4081 // updateMigrationTablet sets 'tablet' column to be this executor's tablet alias for given migration 4082 func (e *Executor) updateMigrationTablet(ctx context.Context, uuid string) error { 4083 query, err := sqlparser.ParseAndBind(sqlUpdateTablet, 4084 sqltypes.StringBindVariable(e.TabletAliasString()), 4085 sqltypes.StringBindVariable(uuid), 4086 ) 4087 if err != nil { 4088 return err 4089 } 4090 _, err = e.execQuery(ctx, query) 4091 return err 4092 } 4093 4094 // updateTabletFailure marks a given migration as "tablet_failed" 4095 func (e *Executor) updateTabletFailure(ctx context.Context, uuid string) error { 4096 parsed := 
sqlparser.BuildParsedQuery(sqlUpdateTabletFailure, 4097 ":migration_uuid", 4098 ) 4099 bindVars := map[string]*querypb.BindVariable{ 4100 "migration_uuid": sqltypes.StringBindVariable(uuid), 4101 } 4102 bound, err := parsed.GenerateQuery(bindVars, nil) 4103 if err != nil { 4104 return err 4105 } 4106 _, err = e.execQuery(ctx, bound) 4107 return err 4108 } 4109 4110 func (e *Executor) updateMigrationStatusFailedOrCancelled(ctx context.Context, uuid string) error { 4111 log.Infof("updateMigrationStatus: transitioning migration: %s into status failed or cancelled", uuid) 4112 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationStatusFailedOrCancelled, 4113 sqltypes.StringBindVariable(uuid), 4114 ) 4115 if err != nil { 4116 return err 4117 } 4118 _, err = e.execQuery(ctx, query) 4119 return err 4120 } 4121 4122 func (e *Executor) updateMigrationStatus(ctx context.Context, uuid string, status schema.OnlineDDLStatus) error { 4123 log.Infof("updateMigrationStatus: transitioning migration: %s into status: %s", uuid, string(status)) 4124 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationStatus, 4125 sqltypes.StringBindVariable(string(status)), 4126 sqltypes.StringBindVariable(uuid), 4127 ) 4128 if err != nil { 4129 return err 4130 } 4131 _, err = e.execQuery(ctx, query) 4132 if err != nil { 4133 log.Errorf("FAIL updateMigrationStatus: uuid=%s, query=%v, error=%v", uuid, query, err) 4134 } 4135 return err 4136 } 4137 4138 func (e *Executor) updateDDLAction(ctx context.Context, uuid string, actionStr string) error { 4139 query, err := sqlparser.ParseAndBind(sqlUpdateDDLAction, 4140 sqltypes.StringBindVariable(actionStr), 4141 sqltypes.StringBindVariable(uuid), 4142 ) 4143 if err != nil { 4144 return err 4145 } 4146 _, err = e.execQuery(ctx, query) 4147 return err 4148 } 4149 4150 func (e *Executor) updateMigrationMessage(ctx context.Context, uuid string, message string) error { 4151 log.Infof("updateMigrationMessage: uuid=%s, message=%s", uuid, message) 4152 4153 
maxlen := 16383 4154 update := func(message string) error { 4155 if len(message) > maxlen { 4156 message = message[0:maxlen] 4157 } 4158 message = strings.ToValidUTF8(message, "�") 4159 query, err := sqlparser.ParseAndBind(sqlUpdateMessage, 4160 sqltypes.StringBindVariable(message), 4161 sqltypes.StringBindVariable(uuid), 4162 ) 4163 if err != nil { 4164 return err 4165 } 4166 _, err = e.execQuery(ctx, query) 4167 return err 4168 } 4169 err := update(message) 4170 if err != nil { 4171 // If, for some reason, we're unable to update the error message, let's write a generic message 4172 err = update("unable to update with original migration error message") 4173 } 4174 return err 4175 } 4176 4177 func (e *Executor) updateSchemaAnalysis(ctx context.Context, uuid string, 4178 addedUniqueKeys, removedUnqiueKeys int, removedUniqueKeyNames string, 4179 droppedNoDefaultColumnNames string, expandedColumnNames string, 4180 revertibleNotes string) error { 4181 query, err := sqlparser.ParseAndBind(sqlUpdateSchemaAnalysis, 4182 sqltypes.Int64BindVariable(int64(addedUniqueKeys)), 4183 sqltypes.Int64BindVariable(int64(removedUnqiueKeys)), 4184 sqltypes.StringBindVariable(removedUniqueKeyNames), 4185 sqltypes.StringBindVariable(droppedNoDefaultColumnNames), 4186 sqltypes.StringBindVariable(expandedColumnNames), 4187 sqltypes.StringBindVariable(revertibleNotes), 4188 sqltypes.StringBindVariable(uuid), 4189 ) 4190 if err != nil { 4191 return err 4192 } 4193 _, err = e.execQuery(ctx, query) 4194 return err 4195 } 4196 4197 func (e *Executor) updateMySQLTable(ctx context.Context, uuid string, tableName string) error { 4198 query, err := sqlparser.ParseAndBind(sqlUpdateMySQLTable, 4199 sqltypes.StringBindVariable(tableName), 4200 sqltypes.StringBindVariable(uuid), 4201 ) 4202 if err != nil { 4203 return err 4204 } 4205 _, err = e.execQuery(ctx, query) 4206 return err 4207 } 4208 4209 func (e *Executor) updateMigrationETASeconds(ctx context.Context, uuid string, etaSeconds int64) error { 
4210 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationETASeconds, 4211 sqltypes.Int64BindVariable(etaSeconds), 4212 sqltypes.StringBindVariable(uuid), 4213 ) 4214 if err != nil { 4215 return err 4216 } 4217 _, err = e.execQuery(ctx, query) 4218 return err 4219 } 4220 4221 func (e *Executor) updateMigrationProgress(ctx context.Context, uuid string, progress float64) error { 4222 if progress <= 0 { 4223 // progress starts at 0, and can only increase. 4224 // A value of "0" either means "This is the actual current progress" or "No information" 4225 // In both cases there's nothing to update 4226 return nil 4227 } 4228 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationProgress, 4229 sqltypes.Float64BindVariable(progress), 4230 sqltypes.StringBindVariable(uuid), 4231 ) 4232 if err != nil { 4233 return err 4234 } 4235 _, err = e.execQuery(ctx, query) 4236 return err 4237 } 4238 4239 func (e *Executor) updateMigrationProgressByRowsCopied(ctx context.Context, uuid string, rowsCopied int64) error { 4240 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationProgressByRowsCopied, 4241 sqltypes.Int64BindVariable(rowsCopied), 4242 sqltypes.StringBindVariable(uuid), 4243 ) 4244 if err != nil { 4245 return err 4246 } 4247 _, err = e.execQuery(ctx, query) 4248 return err 4249 } 4250 4251 func (e *Executor) updateMigrationETASecondsByProgress(ctx context.Context, uuid string) error { 4252 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationETASecondsByProgress, 4253 sqltypes.StringBindVariable(uuid), 4254 ) 4255 if err != nil { 4256 return err 4257 } 4258 _, err = e.execQuery(ctx, query) 4259 return err 4260 } 4261 4262 func (e *Executor) updateMigrationLastThrottled(ctx context.Context, uuid string, lastThrottledUnixTime int64, throttledCompnent string) error { 4263 query, err := sqlparser.ParseAndBind(sqlUpdateLastThrottled, 4264 sqltypes.Int64BindVariable(lastThrottledUnixTime), 4265 sqltypes.StringBindVariable(throttledCompnent), 4266 
sqltypes.StringBindVariable(uuid), 4267 ) 4268 if err != nil { 4269 return err 4270 } 4271 _, err = e.execQuery(ctx, query) 4272 return err 4273 } 4274 4275 func (e *Executor) updateMigrationTableRows(ctx context.Context, uuid string, tableRows int64) error { 4276 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationTableRows, 4277 sqltypes.Int64BindVariable(tableRows), 4278 sqltypes.StringBindVariable(uuid), 4279 ) 4280 if err != nil { 4281 return err 4282 } 4283 _, err = e.execQuery(ctx, query) 4284 return err 4285 } 4286 4287 func (e *Executor) updateRowsCopied(ctx context.Context, uuid string, rowsCopied int64) error { 4288 if rowsCopied <= 0 { 4289 // Number of rows can only be positive. Zero or negative must mean "no information" and 4290 // we don't update the table value. 4291 return nil 4292 } 4293 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationRowsCopied, 4294 sqltypes.Int64BindVariable(rowsCopied), 4295 sqltypes.StringBindVariable(uuid), 4296 ) 4297 if err != nil { 4298 return err 4299 } 4300 _, err = e.execQuery(ctx, query) 4301 return err 4302 } 4303 4304 func (e *Executor) updateVitessLivenessIndicator(ctx context.Context, uuid string, livenessIndicator int64) error { 4305 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationVitessLivenessIndicator, 4306 sqltypes.Int64BindVariable(livenessIndicator), 4307 sqltypes.StringBindVariable(uuid), 4308 ) 4309 if err != nil { 4310 return err 4311 } 4312 _, err = e.execQuery(ctx, query) 4313 return err 4314 } 4315 4316 func (e *Executor) updateMigrationIsView(ctx context.Context, uuid string, isView bool) error { 4317 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationIsView, 4318 sqltypes.BoolBindVariable(isView), 4319 sqltypes.StringBindVariable(uuid), 4320 ) 4321 if err != nil { 4322 return err 4323 } 4324 _, err = e.execQuery(ctx, query) 4325 return err 4326 } 4327 4328 func (e *Executor) updateMigrationSetImmediateOperation(ctx context.Context, uuid string) error { 4329 query, err := 
sqlparser.ParseAndBind(sqlUpdateMigrationSetImmediateOperation, 4330 sqltypes.StringBindVariable(uuid), 4331 ) 4332 if err != nil { 4333 return err 4334 } 4335 _, err = e.execQuery(ctx, query) 4336 return err 4337 } 4338 4339 func (e *Executor) updateMigrationReadyToComplete(ctx context.Context, uuid string, isReady bool) error { 4340 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationReadyToComplete, 4341 sqltypes.BoolBindVariable(isReady), 4342 sqltypes.StringBindVariable(uuid), 4343 ) 4344 if err != nil { 4345 return err 4346 } 4347 if _, err := e.execQuery(ctx, query); err != nil { 4348 return err 4349 } 4350 if val, ok := e.ownedRunningMigrations.Load(uuid); ok { 4351 if runningMigration, ok := val.(*schema.OnlineDDL); ok { 4352 var storeValue int64 4353 if isReady { 4354 storeValue = 1 4355 } 4356 atomic.StoreInt64(&runningMigration.ReadyToComplete, storeValue) 4357 } 4358 } 4359 return nil 4360 } 4361 4362 func (e *Executor) updateMigrationStowawayTable(ctx context.Context, uuid string, tableName string) error { 4363 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationStowawayTable, 4364 sqltypes.StringBindVariable(tableName), 4365 sqltypes.StringBindVariable(uuid), 4366 ) 4367 if err != nil { 4368 return err 4369 } 4370 _, err = e.execQuery(ctx, query) 4371 return err 4372 } 4373 4374 func (e *Executor) updateMigrationUserThrottleRatio(ctx context.Context, uuid string, ratio float64) error { 4375 query, err := sqlparser.ParseAndBind(sqlUpdateMigrationUserThrottleRatio, 4376 sqltypes.Float64BindVariable(ratio), 4377 sqltypes.StringBindVariable(uuid), 4378 ) 4379 if err != nil { 4380 return err 4381 } 4382 _, err = e.execQuery(ctx, query) 4383 return err 4384 } 4385 4386 // retryMigrationWhere retries a migration based on a given WHERE clause 4387 func (e *Executor) retryMigrationWhere(ctx context.Context, whereExpr string) (result *sqltypes.Result, err error) { 4388 e.migrationMutex.Lock() 4389 defer e.migrationMutex.Unlock() 4390 parsed := 
sqlparser.BuildParsedQuery(sqlRetryMigrationWhere, ":tablet", whereExpr) 4391 bindVars := map[string]*querypb.BindVariable{ 4392 "tablet": sqltypes.StringBindVariable(e.TabletAliasString()), 4393 } 4394 bound, err := parsed.GenerateQuery(bindVars, nil) 4395 if err != nil { 4396 return nil, err 4397 } 4398 result, err = e.execQuery(ctx, bound) 4399 return result, err 4400 } 4401 4402 // RetryMigration marks given migration for retry 4403 func (e *Executor) RetryMigration(ctx context.Context, uuid string) (result *sqltypes.Result, err error) { 4404 if atomic.LoadInt64(&e.isOpen) == 0 { 4405 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 4406 } 4407 if !schema.IsOnlineDDLUUID(uuid) { 4408 return nil, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "Not a valid migration ID in RETRY: %s", uuid) 4409 } 4410 e.migrationMutex.Lock() 4411 defer e.migrationMutex.Unlock() 4412 4413 query, err := sqlparser.ParseAndBind(sqlRetryMigration, 4414 sqltypes.StringBindVariable(e.TabletAliasString()), 4415 sqltypes.StringBindVariable(uuid), 4416 ) 4417 if err != nil { 4418 return nil, err 4419 } 4420 defer e.triggerNextCheckInterval() 4421 return e.execQuery(ctx, query) 4422 } 4423 4424 // CleanupMigration sets migration is ready for artifact cleanup. 
Artifacts are not immediately deleted: 4425 // all we do is set retain_artifacts_seconds to a very small number (it's actually a negative) so that the 4426 // next iteration of gcArtifacts() picks up the migration's artifacts and schedules them for deletion 4427 func (e *Executor) CleanupMigration(ctx context.Context, uuid string) (result *sqltypes.Result, err error) { 4428 if atomic.LoadInt64(&e.isOpen) == 0 { 4429 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 4430 } 4431 if !schema.IsOnlineDDLUUID(uuid) { 4432 return nil, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "Not a valid migration ID in CLEANUP: %s", uuid) 4433 } 4434 log.Infof("CleanupMigration: request to cleanup migration %s", uuid) 4435 e.migrationMutex.Lock() 4436 defer e.migrationMutex.Unlock() 4437 4438 query, err := sqlparser.ParseAndBind(sqlUpdateReadyForCleanup, 4439 sqltypes.StringBindVariable(uuid), 4440 ) 4441 if err != nil { 4442 return nil, err 4443 } 4444 rs, err := e.execQuery(ctx, query) 4445 if err != nil { 4446 return nil, err 4447 } 4448 log.Infof("CleanupMigration: migration %s marked as ready to clean up", uuid) 4449 return rs, nil 4450 } 4451 4452 // CompleteMigration clears the postpone_completion flag for a given migration, assuming it was set in the first place 4453 func (e *Executor) CompleteMigration(ctx context.Context, uuid string) (result *sqltypes.Result, err error) { 4454 if atomic.LoadInt64(&e.isOpen) == 0 { 4455 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 4456 } 4457 if !schema.IsOnlineDDLUUID(uuid) { 4458 return nil, vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "Not a valid migration ID in COMPLETE: %s", uuid) 4459 } 4460 log.Infof("CompleteMigration: request to complete migration %s", uuid) 4461 4462 e.migrationMutex.Lock() 4463 defer e.migrationMutex.Unlock() 4464 4465 query, err := sqlparser.ParseAndBind(sqlUpdateCompleteMigration, 4466 sqltypes.StringBindVariable(uuid), 4467 ) 4468 if err != nil { 
4469 return nil, err 4470 } 4471 defer e.triggerNextCheckInterval() 4472 if err := e.deleteGhostPostponeFlagFile(uuid); err != nil { 4473 // This should work without error even if the migration is not a gh-ost migration, and even 4474 // if the file does not exist. An error here indicates a general system error of sorts. 4475 return nil, err 4476 } 4477 rs, err := e.execQuery(ctx, query) 4478 if err != nil { 4479 return nil, err 4480 } 4481 log.Infof("CompleteMigration: migration %s marked as unpostponed", uuid) 4482 return rs, nil 4483 } 4484 4485 // CompletePendingMigrations completes all pending migrations (that are expected to run or are running) 4486 // for this keyspace 4487 func (e *Executor) CompletePendingMigrations(ctx context.Context) (result *sqltypes.Result, err error) { 4488 if atomic.LoadInt64(&e.isOpen) == 0 { 4489 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 4490 } 4491 4492 uuids, err := e.readPendingMigrationsUUIDs(ctx) 4493 if err != nil { 4494 return result, err 4495 } 4496 log.Infof("CompletePendingMigrations: iterating %v migrations %s", len(uuids)) 4497 4498 result = &sqltypes.Result{} 4499 for _, uuid := range uuids { 4500 log.Infof("CompletePendingMigrations: completing %s", uuid) 4501 res, err := e.CompleteMigration(ctx, uuid) 4502 if err != nil { 4503 return result, err 4504 } 4505 result.AppendResult(res) 4506 } 4507 log.Infof("CompletePendingMigrations: done iterating %v migrations %s", len(uuids)) 4508 return result, nil 4509 } 4510 4511 // LaunchMigration clears the postpone_launch flag for a given migration, assuming it was set in the first place 4512 func (e *Executor) LaunchMigration(ctx context.Context, uuid string, shardsArg string) (result *sqltypes.Result, err error) { 4513 if atomic.LoadInt64(&e.isOpen) == 0 { 4514 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 4515 } 4516 if !schema.IsOnlineDDLUUID(uuid) { 4517 return nil, 
vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "Not a valid migration ID in EXECUTE: %s", uuid) 4518 } 4519 if !e.matchesShards(shardsArg) { 4520 // Does not apply to this shard! 4521 return &sqltypes.Result{}, nil 4522 } 4523 log.Infof("LaunchMigration: request to execute migration %s", uuid) 4524 4525 e.migrationMutex.Lock() 4526 defer e.migrationMutex.Unlock() 4527 4528 query, err := sqlparser.ParseAndBind(sqlUpdateLaunchMigration, 4529 sqltypes.StringBindVariable(uuid), 4530 ) 4531 if err != nil { 4532 return nil, err 4533 } 4534 defer e.triggerNextCheckInterval() 4535 rs, err := e.execQuery(ctx, query) 4536 if err != nil { 4537 return nil, err 4538 } 4539 log.Infof("LaunchMigration: migration %s marked as unpostponed", uuid) 4540 return rs, nil 4541 } 4542 4543 // LaunchMigrations launches all launch-postponed queued migrations for this keyspace 4544 func (e *Executor) LaunchMigrations(ctx context.Context) (result *sqltypes.Result, err error) { 4545 if atomic.LoadInt64(&e.isOpen) == 0 { 4546 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 4547 } 4548 4549 uuids, err := e.readPendingMigrationsUUIDs(ctx) 4550 if err != nil { 4551 return result, err 4552 } 4553 r, err := e.execQuery(ctx, sqlSelectQueuedMigrations) 4554 if err != nil { 4555 return result, err 4556 } 4557 rows := r.Named().Rows 4558 log.Infof("LaunchMigrations: iterating %v migrations %s", len(rows)) 4559 result = &sqltypes.Result{} 4560 for _, row := range rows { 4561 uuid := row["migration_uuid"].ToString() 4562 log.Infof("LaunchMigrations: unpostponing %s", uuid) 4563 res, err := e.LaunchMigration(ctx, uuid, "") 4564 if err != nil { 4565 return result, err 4566 } 4567 result.AppendResult(res) 4568 } 4569 log.Infof("LaunchMigrations: done iterating %v migrations %s", len(uuids)) 4570 return result, nil 4571 } 4572 4573 func (e *Executor) submittedMigrationConflictsWithPendingMigrationInSingletonContext( 4574 ctx context.Context, submittedMigration, pendingOnlineDDL 
*schema.OnlineDDL, 4575 ) bool { 4576 if pendingOnlineDDL.MigrationContext == submittedMigration.MigrationContext { 4577 // same migration context. this is obviously allowed 4578 return false 4579 } 4580 // Let's see if the pending migration is a revert: 4581 if _, err := pendingOnlineDDL.GetRevertUUID(); err != nil { 4582 // Not a revert. So the pending migration definitely conflicts with our migration. 4583 return true 4584 } 4585 4586 // The pending migration is a revert 4587 if !pendingOnlineDDL.StrategySetting().IsSingletonContext() { 4588 // Aha! So, our "conflict" is with a REVERT migration, which does _not_ have a -singleton-context 4589 // flag. Because we want to allow REVERT migrations to run as concurrently as possible, we allow this scenario. 4590 return false 4591 } 4592 return true 4593 } 4594 4595 // submitCallbackIfNonConflicting is called internally by SubmitMigration, and is given a callack to execute 4596 // if the given migration does not conflict any terms. Specifically, this function looks for singleton or 4597 // singleton-context conflicts. 4598 // The call back can be an insertion of a new migration, or a retry of an existing migration, or whatnot. 4599 func (e *Executor) submitCallbackIfNonConflicting( 4600 ctx context.Context, 4601 onlineDDL *schema.OnlineDDL, 4602 callback func() (*sqltypes.Result, error), 4603 ) ( 4604 result *sqltypes.Result, err error, 4605 ) { 4606 if !onlineDDL.StrategySetting().IsSingleton() && !onlineDDL.StrategySetting().IsSingletonContext() { 4607 // not a singleton. No conflict 4608 return callback() 4609 } 4610 // This is either singleton or singleton-context 4611 4612 // This entire next logic is wrapped in an anonymous func just to get the migrationMutex released 4613 // before calling the callback function. Reason is: the callback function itself may need to acquire 4614 // the mutex. And specifically, one of the callback functions used is e.RetryMigration(), which does 4615 // lock the mutex... 
4616 err = func() error { 4617 e.migrationMutex.Lock() 4618 defer e.migrationMutex.Unlock() 4619 4620 pendingUUIDs, err := e.readPendingMigrationsUUIDs(ctx) 4621 if err != nil { 4622 return err 4623 } 4624 switch { 4625 case onlineDDL.StrategySetting().IsSingleton(): 4626 // We will reject this migration if there's any pending migration 4627 if len(pendingUUIDs) > 0 { 4628 return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "singleton migration rejected: found pending migrations [%s]", strings.Join(pendingUUIDs, ", ")) 4629 } 4630 case onlineDDL.StrategySetting().IsSingletonContext(): 4631 // We will reject this migration if there's any pending migration within a different context 4632 for _, pendingUUID := range pendingUUIDs { 4633 pendingOnlineDDL, _, err := e.readMigration(ctx, pendingUUID) 4634 if err != nil { 4635 return vterrors.Wrapf(err, "validateSingleton() migration: %s", pendingUUID) 4636 } 4637 if e.submittedMigrationConflictsWithPendingMigrationInSingletonContext(ctx, onlineDDL, pendingOnlineDDL) { 4638 return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "singleton-context migration rejected: found pending migration: %s in different context: %s", pendingUUID, pendingOnlineDDL.MigrationContext) 4639 } 4640 // no conflict? continue looking for other pending migrations 4641 } 4642 } 4643 return nil 4644 }() 4645 if err != nil { 4646 return nil, err 4647 } 4648 // OK to go! 
4649 return callback() 4650 } 4651 4652 // SubmitMigration inserts a new migration request 4653 func (e *Executor) SubmitMigration( 4654 ctx context.Context, 4655 stmt sqlparser.Statement, 4656 ) (*sqltypes.Result, error) { 4657 if atomic.LoadInt64(&e.isOpen) == 0 { 4658 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 4659 } 4660 4661 log.Infof("SubmitMigration: request to submit migration with statement: %0.50s...", sqlparser.CanonicalString(stmt)) 4662 if ddlStmt, ok := stmt.(sqlparser.DDLStatement); ok { 4663 // This validation should have taken place on submission. However, the query may have mutated 4664 // during transfer, and this validation is here to catch any malformed mutation. 4665 if !ddlStmt.IsFullyParsed() { 4666 return nil, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "error parsing statement") 4667 } 4668 } 4669 4670 onlineDDL, err := schema.OnlineDDLFromCommentedStatement(stmt) 4671 if err != nil { 4672 return nil, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "Error submitting migration %s: %v", sqlparser.String(stmt), err) 4673 } 4674 4675 // The logic below has multiple steps. We hence protect the rest of the code with a mutex, only used by this function. 4676 e.submitMutex.Lock() 4677 defer e.submitMutex.Unlock() 4678 4679 // Is there already a migration by this same UUID? 4680 storedMigration, _, err := e.readMigration(ctx, onlineDDL.UUID) 4681 if err != nil && err != ErrMigrationNotFound { 4682 return nil, vterrors.Wrapf(err, "while checking whether migration %s exists", onlineDDL.UUID) 4683 } 4684 if storedMigration != nil { 4685 log.Infof("SubmitMigration: migration %s already exists with migration_context=%s, table=%s", onlineDDL.UUID, storedMigration.MigrationContext, onlineDDL.Table) 4686 // A migration already exists with the same UUID. This is fine, we allow re-submitting migrations 4687 // with the same UUID, as we provide idempotency. 
4688 // So we will _mostly_ ignore the request: we will not submit a new migration. However, we will do 4689 // these things: 4690 4691 // 1. Check that the requested submmited migration macthes the existing one's migration-context, otherwise 4692 // this doesn't seem right, not the idempotency we were looking for 4693 if storedMigration.MigrationContext != onlineDDL.MigrationContext { 4694 return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "migration rejected: found migration %s with different context: %s than submmitted migration's context: %s", onlineDDL.UUID, storedMigration.MigrationContext, onlineDDL.MigrationContext) 4695 } 4696 // 2. Possibly, the existing migration is in 'failed' or 'cancelled' state, in which case this 4697 // resubmission should retry the migration. 4698 return e.submitCallbackIfNonConflicting( 4699 ctx, onlineDDL, 4700 func() (*sqltypes.Result, error) { return e.RetryMigration(ctx, onlineDDL.UUID) }, 4701 ) 4702 } 4703 4704 // OK, this is a new UUID 4705 4706 _, actionStr, err := onlineDDL.GetActionStr() 4707 if err != nil { 4708 return nil, err 4709 } 4710 log.Infof("SubmitMigration: request to submit migration %s; action=%s, table=%s", onlineDDL.UUID, actionStr, onlineDDL.Table) 4711 4712 revertedUUID, _ := onlineDDL.GetRevertUUID() // Empty value if the migration is not actually a REVERT. Safe to ignore error. 
4713 retainArtifactsSeconds := int64((retainOnlineDDLTables).Seconds()) 4714 _, allowConcurrentMigration := e.allowConcurrentMigration(onlineDDL) 4715 submitQuery, err := sqlparser.ParseAndBind(sqlInsertMigration, 4716 sqltypes.StringBindVariable(onlineDDL.UUID), 4717 sqltypes.StringBindVariable(e.keyspace), 4718 sqltypes.StringBindVariable(e.shard), 4719 sqltypes.StringBindVariable(e.dbName), 4720 sqltypes.StringBindVariable(onlineDDL.Table), 4721 sqltypes.StringBindVariable(onlineDDL.SQL), 4722 sqltypes.StringBindVariable(string(onlineDDL.Strategy)), 4723 sqltypes.StringBindVariable(onlineDDL.Options), 4724 sqltypes.StringBindVariable(actionStr), 4725 sqltypes.StringBindVariable(onlineDDL.MigrationContext), 4726 sqltypes.StringBindVariable(string(schema.OnlineDDLStatusQueued)), 4727 sqltypes.StringBindVariable(e.TabletAliasString()), 4728 sqltypes.Int64BindVariable(retainArtifactsSeconds), 4729 sqltypes.BoolBindVariable(onlineDDL.StrategySetting().IsPostponeLaunch()), 4730 sqltypes.BoolBindVariable(onlineDDL.StrategySetting().IsPostponeCompletion()), 4731 sqltypes.BoolBindVariable(allowConcurrentMigration), 4732 sqltypes.StringBindVariable(revertedUUID), 4733 sqltypes.BoolBindVariable(onlineDDL.IsView()), 4734 ) 4735 if err != nil { 4736 return nil, err 4737 } 4738 result, err := e.submitCallbackIfNonConflicting( 4739 ctx, onlineDDL, 4740 func() (*sqltypes.Result, error) { return e.execQuery(ctx, submitQuery) }, 4741 ) 4742 if err != nil { 4743 return nil, vterrors.Wrapf(err, "submitting migration %v", onlineDDL.UUID) 4744 4745 } 4746 log.Infof("SubmitMigration: migration %s submitted", onlineDDL.UUID) 4747 4748 defer e.triggerNextCheckInterval() 4749 4750 return result, nil 4751 } 4752 4753 // ShowMigrationLogs reads the migration log for a given migration 4754 func (e *Executor) ShowMigrationLogs(ctx context.Context, stmt *sqlparser.ShowMigrationLogs) (result *sqltypes.Result, err error) { 4755 if atomic.LoadInt64(&e.isOpen) == 0 { 4756 return nil, 
vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "online ddl is disabled") 4757 } 4758 _, row, err := e.readMigration(ctx, stmt.UUID) 4759 if err != nil { 4760 return nil, err 4761 } 4762 logFile := row["log_file"].ToString() 4763 if logFile == "" { 4764 return nil, vterrors.Errorf(vtrpcpb.Code_NOT_FOUND, "No log file for migration %v", stmt.UUID) 4765 } 4766 content, err := os.ReadFile(logFile) 4767 if err != nil { 4768 return nil, err 4769 } 4770 4771 result = &sqltypes.Result{ 4772 Fields: []*querypb.Field{ 4773 { 4774 Name: "migration_log", 4775 Type: sqltypes.VarChar, 4776 }, 4777 }, 4778 Rows: [][]sqltypes.Value{}, 4779 } 4780 result.Rows = append(result.Rows, []sqltypes.Value{ 4781 sqltypes.NewVarChar(string(content)), 4782 }) 4783 return result, nil 4784 } 4785 4786 // onSchemaMigrationStatus is called when a status is set/changed for a running migration 4787 func (e *Executor) onSchemaMigrationStatus(ctx context.Context, 4788 uuid string, status schema.OnlineDDLStatus, dryRun bool, progressPct float64, etaSeconds int64, rowsCopied int64, hint string) (err error) { 4789 if dryRun && status != schema.OnlineDDLStatusFailed { 4790 // We don't consider dry-run reports unless there's a failure 4791 return nil 4792 } 4793 switch status { 4794 case schema.OnlineDDLStatusReady: 4795 { 4796 err = e.updateMigrationTimestamp(ctx, "ready_timestamp", uuid) 4797 } 4798 case schema.OnlineDDLStatusRunning: 4799 { 4800 _ = e.updateMigrationStartedTimestamp(ctx, uuid) 4801 err = e.updateMigrationTimestamp(ctx, "liveness_timestamp", uuid) 4802 } 4803 case schema.OnlineDDLStatusComplete: 4804 { 4805 progressPct = progressPctFull 4806 _ = e.updateMigrationStartedTimestamp(ctx, uuid) 4807 err = e.updateMigrationTimestamp(ctx, "completed_timestamp", uuid) 4808 } 4809 case schema.OnlineDDLStatusFailed: 4810 { 4811 _ = e.updateMigrationStartedTimestamp(ctx, uuid) 4812 err = e.updateMigrationTimestamp(ctx, "completed_timestamp", uuid) 4813 } 4814 } 4815 if err != nil { 4816 return err 
4817 } 4818 if err = e.updateMigrationStatus(ctx, uuid, status); err != nil { 4819 return err 4820 } 4821 if err = e.updateMigrationProgress(ctx, uuid, progressPct); err != nil { 4822 return err 4823 } 4824 if err = e.updateMigrationETASeconds(ctx, uuid, etaSeconds); err != nil { 4825 return err 4826 } 4827 if err := e.updateRowsCopied(ctx, uuid, rowsCopied); err != nil { 4828 return err 4829 } 4830 if hint == readyToCompleteHint { 4831 if err := e.updateMigrationReadyToComplete(ctx, uuid, true); err != nil { 4832 return err 4833 } 4834 } 4835 if !dryRun { 4836 switch status { 4837 case schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed: 4838 e.triggerNextCheckInterval() 4839 } 4840 } 4841 4842 return nil 4843 } 4844 4845 // OnSchemaMigrationStatus is called by TabletServer's API, which is invoked by a running gh-ost migration's hooks. 4846 func (e *Executor) OnSchemaMigrationStatus(ctx context.Context, 4847 uuidParam, statusParam, dryrunParam, progressParam, etaParam, rowsCopiedParam, hint string) (err error) { 4848 status := schema.OnlineDDLStatus(statusParam) 4849 dryRun := (dryrunParam == "true") 4850 var progressPct float64 4851 if pct, err := strconv.ParseFloat(progressParam, 64); err == nil { 4852 progressPct = pct 4853 } 4854 var etaSeconds int64 = etaSecondsUnknown 4855 if eta, err := strconv.ParseInt(etaParam, 10, 64); err == nil { 4856 etaSeconds = eta 4857 } 4858 var rowsCopied int64 4859 if rows, err := strconv.ParseInt(rowsCopiedParam, 10, 64); err == nil { 4860 rowsCopied = rows 4861 } 4862 4863 return e.onSchemaMigrationStatus(ctx, uuidParam, status, dryRun, progressPct, etaSeconds, rowsCopied, hint) 4864 } 4865 4866 // VExec is called by a VExec invocation 4867 // Implements vitess.io/vitess/go/vt/vttablet/vexec.Executor interface 4868 func (e *Executor) VExec(ctx context.Context, vx *vexec.TabletVExec) (qr *querypb.QueryResult, err error) { 4869 response := func(result *sqltypes.Result, err error) (*querypb.QueryResult, error) { 4870 if 
err != nil { 4871 return nil, err 4872 } 4873 return sqltypes.ResultToProto3(result), nil 4874 } 4875 4876 switch stmt := vx.Stmt.(type) { 4877 case *sqlparser.Delete: 4878 return nil, fmt.Errorf("DELETE statements not supported for this table. query=%s", vx.Query) 4879 case *sqlparser.Select: 4880 return response(e.execQuery(ctx, vx.Query)) 4881 case *sqlparser.Insert: 4882 match, err := sqlparser.QueryMatchesTemplates(vx.Query, vexecInsertTemplates) 4883 if err != nil { 4884 return nil, err 4885 } 4886 if !match { 4887 return nil, fmt.Errorf("Query must match one of these templates: %s", strings.Join(vexecInsertTemplates, "; ")) 4888 } 4889 // Vexec naturally runs outside shard/schema context. It does not supply values for those columns. 4890 // We can fill them in. 4891 vx.ReplaceInsertColumnVal("shard", vx.ToStringVal(e.shard)) 4892 vx.ReplaceInsertColumnVal("mysql_schema", vx.ToStringVal(e.dbName)) 4893 vx.AddOrReplaceInsertColumnVal("tablet", vx.ToStringVal(e.TabletAliasString())) 4894 e.triggerNextCheckInterval() 4895 return response(e.execQuery(ctx, vx.Query)) 4896 case *sqlparser.Update: 4897 match, err := sqlparser.QueryMatchesTemplates(vx.Query, vexecUpdateTemplates) 4898 if err != nil { 4899 return nil, err 4900 } 4901 if !match { 4902 return nil, fmt.Errorf("Query must match one of these templates: %s; query=%s", strings.Join(vexecUpdateTemplates, "; "), vx.Query) 4903 } 4904 if shard, _ := vx.ColumnStringVal(vx.WhereCols, "shard"); shard != "" { 4905 // shard is specified. 4906 if shard != e.shard { 4907 // specified shard is not _this_ shard. 
So we're skipping this UPDATE 4908 return sqltypes.ResultToProto3(emptyResult), nil 4909 } 4910 } 4911 statusVal, err := vx.ColumnStringVal(vx.UpdateCols, "migration_status") 4912 if err != nil { 4913 return nil, err 4914 } 4915 switch statusVal { 4916 case retryMigrationHint: 4917 return response(e.retryMigrationWhere(ctx, sqlparser.String(stmt.Where.Expr))) 4918 case completeMigrationHint: 4919 uuid, err := vx.ColumnStringVal(vx.WhereCols, "migration_uuid") 4920 if err != nil { 4921 return nil, err 4922 } 4923 if !schema.IsOnlineDDLUUID(uuid) { 4924 return nil, fmt.Errorf("Not an Online DDL UUID: %s", uuid) 4925 } 4926 return response(e.CompleteMigration(ctx, uuid)) 4927 case cancelMigrationHint: 4928 uuid, err := vx.ColumnStringVal(vx.WhereCols, "migration_uuid") 4929 if err != nil { 4930 return nil, err 4931 } 4932 if !schema.IsOnlineDDLUUID(uuid) { 4933 return nil, fmt.Errorf("Not an Online DDL UUID: %s", uuid) 4934 } 4935 return response(e.CancelMigration(ctx, uuid, "cancel by user", true)) 4936 case cancelAllMigrationHint: 4937 uuid, _ := vx.ColumnStringVal(vx.WhereCols, "migration_uuid") 4938 if uuid != "" { 4939 return nil, fmt.Errorf("Unexpetced UUID: %s", uuid) 4940 } 4941 return response(e.CancelPendingMigrations(ctx, "cancel-all by user", true)) 4942 default: 4943 return nil, fmt.Errorf("Unexpected value for migration_status: %v. Supported values are: %s, %s", 4944 statusVal, retryMigrationHint, cancelMigrationHint) 4945 } 4946 default: 4947 return nil, fmt.Errorf("No handler for this query: %s", vx.Query) 4948 } 4949 }