vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/restore.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package tabletmanager 18 19 import ( 20 "context" 21 "fmt" 22 "io" 23 "time" 24 25 "github.com/spf13/pflag" 26 27 "vitess.io/vitess/go/mysql" 28 "vitess.io/vitess/go/vt/dbconfigs" 29 "vitess.io/vitess/go/vt/hook" 30 "vitess.io/vitess/go/vt/log" 31 "vitess.io/vitess/go/vt/logutil" 32 "vitess.io/vitess/go/vt/mysqlctl" 33 "vitess.io/vitess/go/vt/proto/vttime" 34 "vitess.io/vitess/go/vt/servenv" 35 "vitess.io/vitess/go/vt/topo" 36 "vitess.io/vitess/go/vt/topo/topoproto" 37 "vitess.io/vitess/go/vt/vterrors" 38 "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication" 39 "vitess.io/vitess/go/vt/vttablet/tmclient" 40 41 binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" 42 tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata" 43 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 44 vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" 45 ) 46 47 // This file handles the initial backup restore upon startup. 48 // It is only enabled if restore_from_backup is set. 49 50 var ( 51 restoreFromBackup bool 52 restoreFromBackupTsStr string 53 restoreConcurrency = 4 54 waitForBackupInterval time.Duration 55 ) 56 57 func registerRestoreFlags(fs *pflag.FlagSet) { 58 fs.BoolVar(&restoreFromBackup, "restore_from_backup", restoreFromBackup, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there") 59 fs.StringVar(&restoreFromBackupTsStr, "restore_from_backup_ts", restoreFromBackupTsStr, "(init restore parameter) if set, restore the latest backup taken at or before this timestamp. Example: '2021-04-29.133050'") 60 fs.IntVar(&restoreConcurrency, "restore_concurrency", restoreConcurrency, "(init restore parameter) how many concurrent files to restore at once") 61 fs.DurationVar(&waitForBackupInterval, "wait_for_backup_interval", waitForBackupInterval, "(init restore parameter) if this is greater than 0, instead of starting up empty when no backups are found, keep checking at this interval for a backup to appear") 62 } 63 64 var ( 65 // Flags for PITR 66 binlogHost string 67 binlogPort int 68 binlogUser string 69 binlogPwd string 70 timeoutForGTIDLookup = 60 * time.Second 71 binlogSslCa string 72 binlogSslCert string 73 binlogSslKey string 74 binlogSslServerName string 75 ) 76 77 func registerPointInTimeRestoreFlags(fs *pflag.FlagSet) { 78 fs.StringVar(&binlogHost, "binlog_host", binlogHost, "PITR restore parameter: hostname/IP of binlog server.") 79 fs.IntVar(&binlogPort, "binlog_port", binlogPort, "PITR restore parameter: port of binlog server.") 80 fs.StringVar(&binlogUser, "binlog_user", binlogUser, "PITR restore parameter: username of binlog server.") 81 fs.StringVar(&binlogPwd, "binlog_password", binlogPwd, "PITR restore parameter: password of binlog server.") 82 fs.DurationVar(&timeoutForGTIDLookup, "pitr_gtid_lookup_timeout", timeoutForGTIDLookup, "PITR restore parameter: timeout for fetching gtid from timestamp.") 83 fs.StringVar(&binlogSslCa, "binlog_ssl_ca", binlogSslCa, "PITR restore parameter: Filename containing TLS CA certificate to verify binlog server TLS certificate against.") 84 fs.StringVar(&binlogSslCert, "binlog_ssl_cert", binlogSslCert, "PITR restore parameter: Filename containing mTLS client certificate to present to binlog server as authentication.") 85 fs.StringVar(&binlogSslKey, "binlog_ssl_key", binlogSslKey, "PITR restore parameter: Filename containing mTLS client private key for use in binlog server authentication.") 86 fs.StringVar(&binlogSslServerName, "binlog_ssl_server_name", binlogSslServerName, "PITR restore parameter: TLS server name (common name) to verify against for the binlog server we are connecting to (If not set: use the hostname or IP supplied in --binlog_host).") 87 } 88 89 func init() { 90 servenv.OnParseFor("vtcombo", registerRestoreFlags) 91 servenv.OnParseFor("vttablet", registerRestoreFlags) 92 93 servenv.OnParseFor("vtcombo", registerPointInTimeRestoreFlags) 94 servenv.OnParseFor("vttablet", registerPointInTimeRestoreFlags) 95 } 96 97 // RestoreData is the main entry point for backup restore. 98 // It will either work, fail gracefully, or return 99 // an error in case of a non-recoverable error. 100 // It takes the action lock so no RPC interferes. 101 func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, backupTime time.Time) error { 102 if err := tm.lock(ctx); err != nil { 103 return err 104 } 105 defer tm.unlock() 106 if tm.Cnf == nil { 107 return fmt.Errorf("cannot perform restore without my.cnf, please restart vttablet with a my.cnf file specified") 108 } 109 110 var ( 111 err error 112 startTime time.Time 113 ) 114 115 defer func() { 116 stopTime := time.Now() 117 118 h := hook.NewSimpleHook("vttablet_restore_done") 119 h.ExtraEnv = tm.hookExtraEnv() 120 h.ExtraEnv["TM_RESTORE_DATA_START_TS"] = startTime.UTC().Format(time.RFC3339) 121 h.ExtraEnv["TM_RESTORE_DATA_STOP_TS"] = stopTime.UTC().Format(time.RFC3339) 122 h.ExtraEnv["TM_RESTORE_DATA_DURATION"] = stopTime.Sub(startTime).String() 123 124 if err != nil { 125 h.ExtraEnv["TM_RESTORE_DATA_ERROR"] = err.Error() 126 } 127 128 // vttablet_restore_done is best-effort (for now?). 129 go func() { 130 // Package vthook already logs the stdout/stderr of hooks when they 131 // are run, so we don't duplicate that here. 132 hr := h.Execute() 133 switch hr.ExitStatus { 134 case hook.HOOK_SUCCESS: 135 case hook.HOOK_DOES_NOT_EXIST: 136 log.Info("No vttablet_restore_done hook.") 137 default: 138 log.Warning("vttablet_restore_done hook failed") 139 } 140 }() 141 }() 142 143 startTime = time.Now() 144 145 req := &tabletmanagerdatapb.RestoreFromBackupRequest{ 146 BackupTime: logutil.TimeToProto(backupTime), 147 } 148 err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, req) 149 if err != nil { 150 return err 151 } 152 return nil 153 } 154 155 func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, request *tabletmanagerdatapb.RestoreFromBackupRequest) error { 156 157 tablet := tm.Tablet() 158 originalType := tablet.Type 159 // Try to restore. Depending on the reason for failure, we may be ok. 160 // If we're not ok, return an error and the tm will log.Fatalf, 161 // causing the process to be restarted and the restore retried. 162 163 keyspace := tablet.Keyspace 164 keyspaceInfo, err := tm.TopoServer.GetKeyspace(ctx, keyspace) 165 if err != nil { 166 return err 167 } 168 169 // For a SNAPSHOT keyspace, we have to look for backups of BaseKeyspace 170 // so we will pass the BaseKeyspace in RestoreParams instead of tablet.Keyspace 171 if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_SNAPSHOT { 172 if keyspaceInfo.BaseKeyspace == "" { 173 return vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("snapshot keyspace %v has no base_keyspace set", tablet.Keyspace)) 174 } 175 keyspace = keyspaceInfo.BaseKeyspace 176 log.Infof("Using base_keyspace %v to restore keyspace %v using a backup time of %v", keyspace, tablet.Keyspace, logutil.ProtoToTime(request.BackupTime)) 177 } 178 179 params := mysqlctl.RestoreParams{ 180 Cnf: tm.Cnf, 181 Mysqld: tm.MysqlDaemon, 182 Logger: logger, 183 Concurrency: restoreConcurrency, 184 HookExtraEnv: tm.hookExtraEnv(), 185 DeleteBeforeRestore: deleteBeforeRestore, 186 DbName: topoproto.TabletDbName(tablet), 187 Keyspace: keyspace, 188 Shard: tablet.Shard, 189 StartTime: logutil.ProtoToTime(request.BackupTime), 190 DryRun: request.DryRun, 191 } 192 if request.RestoreToPos != "" { 193 pos, err := mysql.DecodePosition(request.RestoreToPos) 194 if err != nil { 195 return vterrors.Wrapf(err, "restore failed: unable to decode --restore_to_pos: %s", request.RestoreToPos) 196 } 197 params.RestoreToPos = pos 198 } 199 params.Logger.Infof("Restore: original tablet type=%v", originalType) 200 201 // Check whether we're going to restore before changing to RESTORE type, 202 // so we keep our PrimaryTermStartTime (if any) if we aren't actually restoring. 203 ok, err := mysqlctl.ShouldRestore(ctx, params) 204 if err != nil { 205 return err 206 } 207 if !ok { 208 params.Logger.Infof("Attempting to restore, but mysqld already contains data. Assuming vttablet was just restarted.") 209 return nil 210 } 211 // We should not become primary after restore, because that would incorrectly 212 // start a new primary term, and it's likely our data dir will be out of date. 213 if originalType == topodatapb.TabletType_PRIMARY { 214 originalType = tm.baseTabletType 215 } 216 if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE, DBActionNone); err != nil { 217 return err 218 } 219 // Loop until a backup exists, unless we were told to give up immediately. 220 var backupManifest *mysqlctl.BackupManifest 221 for { 222 backupManifest, err = mysqlctl.Restore(ctx, params) 223 params.Logger.Infof("Restore: got a restore manifest: %v, err=%v, waitForBackupInterval=%v", backupManifest, err, waitForBackupInterval) 224 if waitForBackupInterval == 0 { 225 break 226 } 227 // We only retry a specific set of errors. The rest we return immediately. 228 if err != mysqlctl.ErrNoBackup && err != mysqlctl.ErrNoCompleteBackup { 229 break 230 } 231 232 log.Infof("No backup found. Waiting %v (from -wait_for_backup_interval flag) to check again.", waitForBackupInterval) 233 select { 234 case <-ctx.Done(): 235 return ctx.Err() 236 case <-time.After(waitForBackupInterval): 237 } 238 } 239 240 var pos mysql.Position 241 if backupManifest != nil { 242 pos = backupManifest.Position 243 params.Logger.Infof("Restore: pos=%v", mysql.EncodePosition(pos)) 244 } 245 // If SnapshotTime is set , then apply the incremental change 246 if keyspaceInfo.SnapshotTime != nil { 247 params.Logger.Infof("Restore: Restoring to time %v from binlog", keyspaceInfo.SnapshotTime) 248 err = tm.restoreToTimeFromBinlog(ctx, pos, keyspaceInfo.SnapshotTime) 249 if err != nil { 250 log.Errorf("unable to restore to the specified time %s, error : %v", keyspaceInfo.SnapshotTime.String(), err) 251 return nil 252 } 253 } 254 switch { 255 case err == nil && backupManifest != nil: 256 // Starting from here we won't be able to recover if we get stopped by a cancelled 257 // context. Thus we use the background context to get through to the finish. 258 if params.IsIncrementalRecovery() && !params.DryRun { 259 // The whole point of point-in-time recovery is that we want to restore up to a given position, 260 // and to NOT proceed from that position. We want to disable replication and NOT let the replica catch 261 // up with the primary. 262 params.Logger.Infof("Restore: disabling replication") 263 if err := tm.disableReplication(context.Background()); err != nil { 264 return err 265 } 266 } else if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL { 267 // Reconnect to primary only for "NORMAL" keyspaces 268 params.Logger.Infof("Restore: starting replication at position %v", pos) 269 if err := tm.startReplication(context.Background(), pos, originalType); err != nil { 270 return err 271 } 272 } 273 case err == mysqlctl.ErrNoBackup: 274 // Starting with empty database. 275 // We just need to initialize replication 276 _, err := tm.initializeReplication(ctx, originalType) 277 if err != nil { 278 return err 279 } 280 case err == nil && params.DryRun: 281 // Do nothing here, let the rest of code run 282 params.Logger.Infof("Dry run. No changes made") 283 default: 284 // If anything failed, we should reset the original tablet type 285 if err := tm.tmState.ChangeTabletType(ctx, originalType, DBActionNone); err != nil { 286 log.Errorf("Could not change back to original tablet type %v: %v", originalType, err) 287 } 288 return vterrors.Wrap(err, "Can't restore backup") 289 } 290 291 // If we had type BACKUP or RESTORE it's better to set our type to the init_tablet_type to make result of the restore 292 // similar to completely clean start from scratch. 293 if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && initTabletType != "" { 294 initType, err := topoproto.ParseTabletType(initTabletType) 295 if err == nil { 296 originalType = initType 297 } 298 } 299 if params.IsIncrementalRecovery() && !params.DryRun { 300 // override 301 params.Logger.Infof("Restore: will set tablet type to DRAINED as this is a point in time recovery") 302 originalType = topodatapb.TabletType_DRAINED 303 } 304 params.Logger.Infof("Restore: changing tablet type to %v for %s", originalType, tm.tabletAlias.String()) 305 // Change type back to original type if we're ok to serve. 306 return tm.tmState.ChangeTabletType(ctx, originalType, DBActionNone) 307 } 308 309 // restoreToTimeFromBinlog restores to the snapshot time of the keyspace 310 // currently this works with mysql based database only (as it uses mysql specific queries for restoring) 311 func (tm *TabletManager) restoreToTimeFromBinlog(ctx context.Context, pos mysql.Position, restoreTime *vttime.Time) error { 312 // validate the minimal settings necessary for connecting to binlog server 313 if binlogHost == "" || binlogPort <= 0 || binlogUser == "" { 314 log.Warning("invalid binlog server setting, restoring to last available backup.") 315 return nil 316 } 317 318 timeoutCtx, cancelFnc := context.WithTimeout(ctx, timeoutForGTIDLookup) 319 defer cancelFnc() 320 321 afterGTIDPos, beforeGTIDPos, err := tm.getGTIDFromTimestamp(timeoutCtx, pos, restoreTime.Seconds) 322 if err != nil { 323 return err 324 } 325 326 if afterGTIDPos == "" && beforeGTIDPos == "" { 327 return vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, fmt.Sprintf("unable to fetch the GTID for the specified time - %s", restoreTime.String())) 328 } else if afterGTIDPos == "" && beforeGTIDPos != "" { 329 log.Info("no afterGTIDPos found, which implies we reached the end of all GTID events") 330 } 331 332 log.Infof("going to restore upto the GTID - %s", afterGTIDPos) 333 // when we don't have before GTID, we will take it as current backup pos's last GTID 334 // this is case where someone tries to restore just to the 1st event after backup 335 if beforeGTIDPos == "" { 336 beforeGTIDPos = pos.GTIDSet.Last() 337 } 338 err = tm.catchupToGTID(timeoutCtx, afterGTIDPos, beforeGTIDPos) 339 if err != nil { 340 return vterrors.Wrapf(err, "unable to replicate upto desired GTID : %s", afterGTIDPos) 341 } 342 343 return nil 344 } 345 346 // getGTIDFromTimestamp computes 2 GTIDs based on restoreTime 347 // afterPos is the GTID of the first event at or after restoreTime. 348 // beforePos is the GTID of the last event before restoreTime. This is the GTID upto which replication will be applied 349 // afterPos can be used directly in the query `START SLAVE UNTIL SQL_BEFORE_GTIDS = ”` 350 // beforePos will be used to check if replication was able to catch up from the binlog server 351 func (tm *TabletManager) getGTIDFromTimestamp(ctx context.Context, pos mysql.Position, restoreTime int64) (afterPos string, beforePos string, err error) { 352 connParams := &mysql.ConnParams{ 353 Host: binlogHost, 354 Port: binlogPort, 355 Uname: binlogUser, 356 SslCa: binlogSslCa, 357 SslCert: binlogSslCert, 358 SslKey: binlogSslKey, 359 ServerName: binlogSslServerName, 360 } 361 if binlogPwd != "" { 362 connParams.Pass = binlogPwd 363 } 364 if binlogSslCa != "" || binlogSslCert != "" { 365 connParams.EnableSSL() 366 } 367 dbCfgs := &dbconfigs.DBConfigs{ 368 Host: connParams.Host, 369 Port: connParams.Port, 370 } 371 dbCfgs.SetDbParams(*connParams, *connParams, *connParams) 372 vsClient := vreplication.NewReplicaConnector(connParams) 373 374 filter := &binlogdatapb.Filter{ 375 Rules: []*binlogdatapb.Rule{{ 376 Match: "/.*", 377 }}, 378 } 379 380 // get current lastPos of binlog server, so that if we hit that in vstream, we'll return from there 381 binlogConn, err := mysql.Connect(ctx, connParams) 382 if err != nil { 383 return "", "", err 384 } 385 defer binlogConn.Close() 386 lastPos, err := binlogConn.PrimaryPosition() 387 if err != nil { 388 return "", "", err 389 } 390 391 gtidsChan := make(chan []string, 1) 392 393 go func() { 394 err := vsClient.VStream(ctx, mysql.EncodePosition(pos), filter, func(events []*binlogdatapb.VEvent) error { 395 for _, event := range events { 396 if event.Gtid != "" { 397 // check if we reached the lastPos then return 398 eventPos, err := mysql.DecodePosition(event.Gtid) 399 if err != nil { 400 return err 401 } 402 403 if event.Timestamp >= restoreTime { 404 afterPos = event.Gtid 405 gtidsChan <- []string{event.Gtid, beforePos} 406 return io.EOF 407 } 408 409 if eventPos.AtLeast(lastPos) { 410 gtidsChan <- []string{"", beforePos} 411 return io.EOF 412 } 413 beforePos = event.Gtid 414 } 415 } 416 return nil 417 }) 418 if err != nil && err != io.EOF { 419 log.Warningf("Error using VStream to find timestamp for GTID position: %v error: %v", pos, err) 420 gtidsChan <- []string{"", ""} 421 } 422 }() 423 defer vsClient.Close(ctx) 424 select { 425 case val := <-gtidsChan: 426 return val[0], val[1], nil 427 case <-ctx.Done(): 428 log.Warningf("Can't find the GTID from restore time stamp, exiting.") 429 return "", beforePos, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "unable to find GTID from the snapshot time as context timed out") 430 } 431 } 432 433 // catchupToGTID replicates upto specified GTID from binlog server 434 // 435 // copies the data from binlog server by pointing to as replica 436 // waits till all events to GTID replicated 437 // once done, it will reset the replication 438 func (tm *TabletManager) catchupToGTID(ctx context.Context, afterGTIDPos string, beforeGTIDPos string) error { 439 var afterGTIDStr string 440 if afterGTIDPos != "" { 441 afterGTIDParsed, err := mysql.DecodePosition(afterGTIDPos) 442 if err != nil { 443 return err 444 } 445 afterGTIDStr = afterGTIDParsed.GTIDSet.Last() 446 } 447 448 beforeGTIDPosParsed, err := mysql.DecodePosition(beforeGTIDPos) 449 if err != nil { 450 return err 451 } 452 453 // it uses mysql specific queries here 454 cmds := []string{ 455 "STOP SLAVE FOR CHANNEL '' ", 456 "STOP SLAVE IO_THREAD FOR CHANNEL ''", 457 } 458 459 if binlogSslCa != "" || binlogSslCert != "" { 460 // We need to use TLS 461 cmd := fmt.Sprintf("CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='%s', MASTER_PASSWORD='%s', MASTER_AUTO_POSITION=1, MASTER_SSL=1", binlogHost, binlogPort, binlogUser, binlogPwd) 462 if binlogSslCa != "" { 463 cmd += fmt.Sprintf(", MASTER_SSL_CA='%s'", binlogSslCa) 464 } 465 if binlogSslCert != "" { 466 cmd += fmt.Sprintf(", MASTER_SSL_CERT='%s'", binlogSslCert) 467 } 468 if binlogSslKey != "" { 469 cmd += fmt.Sprintf(", MASTER_SSL_KEY='%s'", binlogSslKey) 470 } 471 cmds = append(cmds, cmd+";") 472 } else { 473 // No TLS 474 cmds = append(cmds, fmt.Sprintf("CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='%s', MASTER_PASSWORD='%s', MASTER_AUTO_POSITION=1;", binlogHost, binlogPort, binlogUser, binlogPwd)) 475 } 476 477 if afterGTIDPos == "" { // when the there is no afterPos, that means need to replicate completely 478 cmds = append(cmds, "START SLAVE") 479 } else { 480 cmds = append(cmds, fmt.Sprintf("START SLAVE UNTIL SQL_BEFORE_GTIDS = '%s'", afterGTIDStr)) 481 } 482 483 if err := tm.MysqlDaemon.ExecuteSuperQueryList(ctx, cmds); err != nil { 484 return vterrors.Wrap(err, fmt.Sprintf("failed to restart the replication until %s GTID", afterGTIDStr)) 485 } 486 log.Infof("Waiting for position to reach", beforeGTIDPosParsed.GTIDSet.Last()) 487 // Could not use `agent.MysqlDaemon.WaitSourcePos` as replication is stopped with `START SLAVE UNTIL SQL_BEFORE_GTIDS` 488 // this is as per https://dev.mysql.com/doc/refman/5.6/en/start-slave.html 489 // We need to wait until replication catches upto the specified afterGTIDPos 490 chGTIDCaughtup := make(chan bool) 491 go func() { 492 timeToWait := time.Now().Add(timeoutForGTIDLookup) 493 for time.Now().Before(timeToWait) { 494 pos, err := tm.MysqlDaemon.PrimaryPosition() 495 if err != nil { 496 chGTIDCaughtup <- false 497 } 498 499 if pos.AtLeast(beforeGTIDPosParsed) { 500 chGTIDCaughtup <- true 501 } 502 select { 503 case <-ctx.Done(): 504 chGTIDCaughtup <- false 505 default: 506 time.Sleep(300 * time.Millisecond) 507 } 508 } 509 }() 510 select { 511 case resp := <-chGTIDCaughtup: 512 if resp { 513 cmds := []string{ 514 "STOP SLAVE", 515 "RESET SLAVE ALL", 516 } 517 if err := tm.MysqlDaemon.ExecuteSuperQueryList(ctx, cmds); err != nil { 518 return vterrors.Wrap(err, "failed to stop replication") 519 } 520 return nil 521 } 522 return vterrors.Wrap(err, "error while fetching the current GTID position") 523 case <-ctx.Done(): 524 log.Warningf("Could not copy up to GTID.") 525 return vterrors.Wrapf(err, "context timeout while restoring up to specified GTID - %s", beforeGTIDPos) 526 } 527 } 528 529 // disableReplication stopes and resets replication on the mysql server. It moreover sets impossible replication 530 // source params, so that the replica can't possibly reconnect. It would take a `CHANGE [MASTER|REPLICATION SOURCE] TO ...` to 531 // make the mysql server replicate again (available via tm.MysqlDaemon.SetReplicationPosition) 532 func (tm *TabletManager) disableReplication(ctx context.Context) error { 533 cmds := []string{ 534 "STOP SLAVE", 535 "RESET SLAVE ALL", // "ALL" makes it forget primary host:port. 536 } 537 if err := tm.MysqlDaemon.ExecuteSuperQueryList(ctx, cmds); err != nil { 538 return vterrors.Wrap(err, "failed to reset replication") 539 } 540 if err := tm.MysqlDaemon.SetReplicationSource(ctx, "//", 0, false /* stopReplicationBefore */, true /* startReplicationAfter */); err != nil { 541 return vterrors.Wrap(err, "failed to disable replication") 542 } 543 544 return nil 545 } 546 547 func (tm *TabletManager) startReplication(ctx context.Context, pos mysql.Position, tabletType topodatapb.TabletType) error { 548 cmds := []string{ 549 "STOP SLAVE", 550 "RESET SLAVE ALL", // "ALL" makes it forget primary host:port. 551 } 552 if err := tm.MysqlDaemon.ExecuteSuperQueryList(ctx, cmds); err != nil { 553 return vterrors.Wrap(err, "failed to reset replication") 554 } 555 556 // Set the position at which to resume from the primary. 557 if err := tm.MysqlDaemon.SetReplicationPosition(ctx, pos); err != nil { 558 return vterrors.Wrap(err, "failed to set replication position") 559 } 560 561 primary, err := tm.initializeReplication(ctx, tabletType) 562 // If we ran into an error while initializing replication, then there is no point in waiting for catch-up. 563 // Also, if there is no primary tablet in the shard, we don't need to proceed further. 564 if err != nil || primary == nil { 565 return err 566 } 567 568 // wait for reliable replication_lag_seconds 569 // we have pos where we want to resume from 570 // if PrimaryPosition is the same, that means no writes 571 // have happened to primary, so we are up-to-date 572 // otherwise, wait for replica's Position to change from 573 // the initial pos before proceeding 574 tmc := tmclient.NewTabletManagerClient() 575 defer tmc.Close() 576 remoteCtx, remoteCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) 577 defer remoteCancel() 578 posStr, err := tmc.PrimaryPosition(remoteCtx, primary.Tablet) 579 if err != nil { 580 // It is possible that though PrimaryAlias is set, the primary tablet is unreachable 581 // Log a warning and let tablet restore in that case 582 // If we had instead considered this fatal, all tablets would crash-loop 583 // until a primary appears, which would make it impossible to elect a primary. 584 log.Warningf("Can't get primary replication position after restore: %v", err) 585 return nil 586 } 587 primaryPos, err := mysql.DecodePosition(posStr) 588 if err != nil { 589 return vterrors.Wrapf(err, "can't decode primary replication position: %q", posStr) 590 } 591 592 if !pos.Equal(primaryPos) { 593 for { 594 if err := ctx.Err(); err != nil { 595 return err 596 } 597 status, err := tm.MysqlDaemon.ReplicationStatus() 598 if err != nil { 599 return vterrors.Wrap(err, "can't get replication status") 600 } 601 newPos := status.Position 602 if !newPos.Equal(pos) { 603 break 604 } 605 time.Sleep(1 * time.Second) 606 } 607 } 608 609 return nil 610 } 611 612 func (tm *TabletManager) getLocalMetadataValues(tabletType topodatapb.TabletType) map[string]string { 613 tablet := tm.Tablet() 614 values := map[string]string{ 615 "Alias": topoproto.TabletAliasString(tablet.Alias), 616 "ClusterAlias": fmt.Sprintf("%s.%s", tablet.Keyspace, tablet.Shard), 617 "DataCenter": tablet.Alias.Cell, 618 "PromotionRule": "must_not", 619 } 620 if isPrimaryEligible(tabletType) { 621 values["PromotionRule"] = "neutral" 622 } 623 return values 624 }