vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/restore.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package tabletmanager
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"io"
    23  	"time"
    24  
    25  	"github.com/spf13/pflag"
    26  
    27  	"vitess.io/vitess/go/mysql"
    28  	"vitess.io/vitess/go/vt/dbconfigs"
    29  	"vitess.io/vitess/go/vt/hook"
    30  	"vitess.io/vitess/go/vt/log"
    31  	"vitess.io/vitess/go/vt/logutil"
    32  	"vitess.io/vitess/go/vt/mysqlctl"
    33  	"vitess.io/vitess/go/vt/proto/vttime"
    34  	"vitess.io/vitess/go/vt/servenv"
    35  	"vitess.io/vitess/go/vt/topo"
    36  	"vitess.io/vitess/go/vt/topo/topoproto"
    37  	"vitess.io/vitess/go/vt/vterrors"
    38  	"vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication"
    39  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    40  
    41  	binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
    42  	tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata"
    43  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    44  	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
    45  )
    46  
    47  // This file handles the initial backup restore upon startup.
    48  // It is only enabled if restore_from_backup is set.
    49  
// Settings for the initial restore performed at startup. Each one is bound to
// a command-line flag by registerRestoreFlags:
//   - restoreFromBackup (--restore_from_backup): check BackupStorage for a
//     recent backup at startup and start from it.
//   - restoreFromBackupTsStr (--restore_from_backup_ts): if set, restore the
//     latest backup taken at or before this timestamp.
//   - restoreConcurrency (--restore_concurrency): how many files to restore
//     concurrently; defaults to 4.
//   - waitForBackupInterval (--wait_for_backup_interval): if greater than 0,
//     keep checking at this interval for a backup to appear instead of
//     starting up empty when none is found.
var (
	restoreFromBackup      bool
	restoreFromBackupTsStr string
	restoreConcurrency     = 4
	waitForBackupInterval  time.Duration
)
    56  
    57  func registerRestoreFlags(fs *pflag.FlagSet) {
    58  	fs.BoolVar(&restoreFromBackup, "restore_from_backup", restoreFromBackup, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there")
    59  	fs.StringVar(&restoreFromBackupTsStr, "restore_from_backup_ts", restoreFromBackupTsStr, "(init restore parameter) if set, restore the latest backup taken at or before this timestamp. Example: '2021-04-29.133050'")
    60  	fs.IntVar(&restoreConcurrency, "restore_concurrency", restoreConcurrency, "(init restore parameter) how many concurrent files to restore at once")
    61  	fs.DurationVar(&waitForBackupInterval, "wait_for_backup_interval", waitForBackupInterval, "(init restore parameter) if this is greater than 0, instead of starting up empty when no backups are found, keep checking at this interval for a backup to appear")
    62  }
    63  
var (
	// Flags for point-in-time recovery (PITR), bound to command-line flags by
	// registerPointInTimeRestoreFlags. binlogHost, binlogPort, binlogUser and
	// binlogPwd identify the binlog server and the credentials used to reach
	// it; the binlogSsl* values are the TLS material and expected server name
	// for that connection. timeoutForGTIDLookup bounds the lookup of a GTID
	// from a timestamp and defaults to 60 seconds.
	binlogHost           string
	binlogPort           int
	binlogUser           string
	binlogPwd            string
	timeoutForGTIDLookup = 60 * time.Second
	binlogSslCa          string
	binlogSslCert        string
	binlogSslKey         string
	binlogSslServerName  string
)
    76  
    77  func registerPointInTimeRestoreFlags(fs *pflag.FlagSet) {
    78  	fs.StringVar(&binlogHost, "binlog_host", binlogHost, "PITR restore parameter: hostname/IP of binlog server.")
    79  	fs.IntVar(&binlogPort, "binlog_port", binlogPort, "PITR restore parameter: port of binlog server.")
    80  	fs.StringVar(&binlogUser, "binlog_user", binlogUser, "PITR restore parameter: username of binlog server.")
    81  	fs.StringVar(&binlogPwd, "binlog_password", binlogPwd, "PITR restore parameter: password of binlog server.")
    82  	fs.DurationVar(&timeoutForGTIDLookup, "pitr_gtid_lookup_timeout", timeoutForGTIDLookup, "PITR restore parameter: timeout for fetching gtid from timestamp.")
    83  	fs.StringVar(&binlogSslCa, "binlog_ssl_ca", binlogSslCa, "PITR restore parameter: Filename containing TLS CA certificate to verify binlog server TLS certificate against.")
    84  	fs.StringVar(&binlogSslCert, "binlog_ssl_cert", binlogSslCert, "PITR restore parameter: Filename containing mTLS client certificate to present to binlog server as authentication.")
    85  	fs.StringVar(&binlogSslKey, "binlog_ssl_key", binlogSslKey, "PITR restore parameter: Filename containing mTLS client private key for use in binlog server authentication.")
    86  	fs.StringVar(&binlogSslServerName, "binlog_ssl_server_name", binlogSslServerName, "PITR restore parameter: TLS server name (common name) to verify against for the binlog server we are connecting to (If not set: use the hostname or IP supplied in --binlog_host).")
    87  }
    88  
    89  func init() {
    90  	servenv.OnParseFor("vtcombo", registerRestoreFlags)
    91  	servenv.OnParseFor("vttablet", registerRestoreFlags)
    92  
    93  	servenv.OnParseFor("vtcombo", registerPointInTimeRestoreFlags)
    94  	servenv.OnParseFor("vttablet", registerPointInTimeRestoreFlags)
    95  }
    96  
    97  // RestoreData is the main entry point for backup restore.
    98  // It will either work, fail gracefully, or return
    99  // an error in case of a non-recoverable error.
   100  // It takes the action lock so no RPC interferes.
   101  func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, backupTime time.Time) error {
   102  	if err := tm.lock(ctx); err != nil {
   103  		return err
   104  	}
   105  	defer tm.unlock()
   106  	if tm.Cnf == nil {
   107  		return fmt.Errorf("cannot perform restore without my.cnf, please restart vttablet with a my.cnf file specified")
   108  	}
   109  
   110  	var (
   111  		err       error
   112  		startTime time.Time
   113  	)
   114  
   115  	defer func() {
   116  		stopTime := time.Now()
   117  
   118  		h := hook.NewSimpleHook("vttablet_restore_done")
   119  		h.ExtraEnv = tm.hookExtraEnv()
   120  		h.ExtraEnv["TM_RESTORE_DATA_START_TS"] = startTime.UTC().Format(time.RFC3339)
   121  		h.ExtraEnv["TM_RESTORE_DATA_STOP_TS"] = stopTime.UTC().Format(time.RFC3339)
   122  		h.ExtraEnv["TM_RESTORE_DATA_DURATION"] = stopTime.Sub(startTime).String()
   123  
   124  		if err != nil {
   125  			h.ExtraEnv["TM_RESTORE_DATA_ERROR"] = err.Error()
   126  		}
   127  
   128  		// vttablet_restore_done is best-effort (for now?).
   129  		go func() {
   130  			// Package vthook already logs the stdout/stderr of hooks when they
   131  			// are run, so we don't duplicate that here.
   132  			hr := h.Execute()
   133  			switch hr.ExitStatus {
   134  			case hook.HOOK_SUCCESS:
   135  			case hook.HOOK_DOES_NOT_EXIST:
   136  				log.Info("No vttablet_restore_done hook.")
   137  			default:
   138  				log.Warning("vttablet_restore_done hook failed")
   139  			}
   140  		}()
   141  	}()
   142  
   143  	startTime = time.Now()
   144  
   145  	req := &tabletmanagerdatapb.RestoreFromBackupRequest{
   146  		BackupTime: logutil.TimeToProto(backupTime),
   147  	}
   148  	err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, req)
   149  	if err != nil {
   150  		return err
   151  	}
   152  	return nil
   153  }
   154  
// restoreDataLocked performs the restore itself; RestoreData calls it after
// taking the action lock.
//
// In order, it:
//  1. resolves the keyspace whose backups are used (the base keyspace for a
//     SNAPSHOT keyspace),
//  2. asks mysqlctl.ShouldRestore whether a restore is needed and returns nil
//     if it is not,
//  3. switches the tablet to type RESTORE and calls mysqlctl.Restore,
//     retrying every waitForBackupInterval (when non-zero) for as long as no
//     backup, or no complete backup, is found,
//  4. if the keyspace has a SnapshotTime, applies binlogs up to that time,
//  5. sets up or disables replication depending on the outcome, and
//  6. moves the tablet to its final type.
//
// request supplies the backup time, the optional --restore_to_pos position and
// the dry-run flag.
func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, request *tabletmanagerdatapb.RestoreFromBackupRequest) error {

	tablet := tm.Tablet()
	originalType := tablet.Type
	// Try to restore. Depending on the reason for failure, we may be ok.
	// If we're not ok, return an error and the tm will log.Fatalf,
	// causing the process to be restarted and the restore retried.

	keyspace := tablet.Keyspace
	keyspaceInfo, err := tm.TopoServer.GetKeyspace(ctx, keyspace)
	if err != nil {
		return err
	}

	// For a SNAPSHOT keyspace, we have to look for backups of BaseKeyspace
	// so we will pass the BaseKeyspace in RestoreParams instead of tablet.Keyspace
	if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_SNAPSHOT {
		if keyspaceInfo.BaseKeyspace == "" {
			return vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("snapshot keyspace %v has no base_keyspace set", tablet.Keyspace))
		}
		keyspace = keyspaceInfo.BaseKeyspace
		log.Infof("Using base_keyspace %v to restore keyspace %v using a backup time of %v", keyspace, tablet.Keyspace, logutil.ProtoToTime(request.BackupTime))
	}

	params := mysqlctl.RestoreParams{
		Cnf:                 tm.Cnf,
		Mysqld:              tm.MysqlDaemon,
		Logger:              logger,
		Concurrency:         restoreConcurrency,
		HookExtraEnv:        tm.hookExtraEnv(),
		DeleteBeforeRestore: deleteBeforeRestore,
		DbName:              topoproto.TabletDbName(tablet),
		Keyspace:            keyspace,
		Shard:               tablet.Shard,
		StartTime:           logutil.ProtoToTime(request.BackupTime),
		DryRun:              request.DryRun,
	}
	// An explicit target position is optional; it is only decoded when given.
	if request.RestoreToPos != "" {
		pos, err := mysql.DecodePosition(request.RestoreToPos)
		if err != nil {
			return vterrors.Wrapf(err, "restore failed: unable to decode --restore_to_pos: %s", request.RestoreToPos)
		}
		params.RestoreToPos = pos
	}
	params.Logger.Infof("Restore: original tablet type=%v", originalType)

	// Check whether we're going to restore before changing to RESTORE type,
	// so we keep our PrimaryTermStartTime (if any) if we aren't actually restoring.
	ok, err := mysqlctl.ShouldRestore(ctx, params)
	if err != nil {
		return err
	}
	if !ok {
		params.Logger.Infof("Attempting to restore, but mysqld already contains data. Assuming vttablet was just restarted.")
		return nil
	}
	// We should not become primary after restore, because that would incorrectly
	// start a new primary term, and it's likely our data dir will be out of date.
	if originalType == topodatapb.TabletType_PRIMARY {
		originalType = tm.baseTabletType
	}
	if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE, DBActionNone); err != nil {
		return err
	}
	// Loop until a backup exists, unless we were told to give up immediately.
	var backupManifest *mysqlctl.BackupManifest
	for {
		backupManifest, err = mysqlctl.Restore(ctx, params)
		params.Logger.Infof("Restore: got a restore manifest: %v, err=%v, waitForBackupInterval=%v", backupManifest, err, waitForBackupInterval)
		// A zero interval means a single attempt, whatever its result.
		if waitForBackupInterval == 0 {
			break
		}
		// We only retry a specific set of errors. The rest we return immediately.
		if err != mysqlctl.ErrNoBackup && err != mysqlctl.ErrNoCompleteBackup {
			break
		}

		log.Infof("No backup found. Waiting %v (from -wait_for_backup_interval flag) to check again.", waitForBackupInterval)
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(waitForBackupInterval):
		}
	}

	// pos stays the zero Position when no manifest was returned.
	var pos mysql.Position
	if backupManifest != nil {
		pos = backupManifest.Position
		params.Logger.Infof("Restore: pos=%v", mysql.EncodePosition(pos))
	}
	// If SnapshotTime is set, then apply the incremental change.
	// NOTE(review): this assignment overwrites the err returned by
	// mysqlctl.Restore above, so the switch below sees the binlog catch-up
	// result rather than the restore result whenever SnapshotTime is set —
	// confirm this is intended.
	if keyspaceInfo.SnapshotTime != nil {
		params.Logger.Infof("Restore: Restoring to time %v from binlog", keyspaceInfo.SnapshotTime)
		err = tm.restoreToTimeFromBinlog(ctx, pos, keyspaceInfo.SnapshotTime)
		if err != nil {
			log.Errorf("unable to restore to the specified time %s, error : %v", keyspaceInfo.SnapshotTime.String(), err)
			// NOTE(review): the failure is only logged; nil is returned while
			// the tablet type set to RESTORE above has not been changed back
			// in this function — confirm this is intended.
			return nil
		}
	}
	switch {
	case err == nil && backupManifest != nil:
		// Starting from here we won't be able to recover if we get stopped by a cancelled
		// context. Thus we use the background context to get through to the finish.
		if params.IsIncrementalRecovery() && !params.DryRun {
			// The whole point of point-in-time recovery is that we want to restore up to a given position,
			// and to NOT proceed from that position. We want to disable replication and NOT let the replica catch
			// up with the primary.
			params.Logger.Infof("Restore: disabling replication")
			if err := tm.disableReplication(context.Background()); err != nil {
				return err
			}
		} else if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL {
			// Reconnect to primary only for "NORMAL" keyspaces
			params.Logger.Infof("Restore: starting replication at position %v", pos)
			if err := tm.startReplication(context.Background(), pos, originalType); err != nil {
				return err
			}
		}
	case err == mysqlctl.ErrNoBackup:
		// Starting with empty database.
		// We just need to initialize replication
		_, err := tm.initializeReplication(ctx, originalType)
		if err != nil {
			return err
		}
	case err == nil && params.DryRun:
		// Do nothing here, let the rest of code run
		params.Logger.Infof("Dry run. No changes made")
	default:
		// If anything failed, we should reset the original tablet type
		if err := tm.tmState.ChangeTabletType(ctx, originalType, DBActionNone); err != nil {
			log.Errorf("Could not change back to original tablet type %v: %v", originalType, err)
		}
		// The err wrapped here is the outer one; the err declared in the if
		// statement above is scoped to that statement only.
		return vterrors.Wrap(err, "Can't restore backup")
	}

	// If we had type BACKUP or RESTORE it's better to set our type to the init_tablet_type to make result of the restore
	// similar to completely clean start from scratch.
	if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && initTabletType != "" {
		initType, err := topoproto.ParseTabletType(initTabletType)
		// A value that fails to parse is ignored and originalType is kept.
		if err == nil {
			originalType = initType
		}
	}
	if params.IsIncrementalRecovery() && !params.DryRun {
		// override
		params.Logger.Infof("Restore: will set tablet type to DRAINED as this is a point in time recovery")
		originalType = topodatapb.TabletType_DRAINED
	}
	params.Logger.Infof("Restore: changing tablet type to %v for %s", originalType, tm.tabletAlias.String())
	// Change type back to original type if we're ok to serve.
	return tm.tmState.ChangeTabletType(ctx, originalType, DBActionNone)
}
   308  
   309  // restoreToTimeFromBinlog restores to the snapshot time of the keyspace
   310  // currently this works with mysql based database only (as it uses mysql specific queries for restoring)
   311  func (tm *TabletManager) restoreToTimeFromBinlog(ctx context.Context, pos mysql.Position, restoreTime *vttime.Time) error {
   312  	// validate the minimal settings necessary for connecting to binlog server
   313  	if binlogHost == "" || binlogPort <= 0 || binlogUser == "" {
   314  		log.Warning("invalid binlog server setting, restoring to last available backup.")
   315  		return nil
   316  	}
   317  
   318  	timeoutCtx, cancelFnc := context.WithTimeout(ctx, timeoutForGTIDLookup)
   319  	defer cancelFnc()
   320  
   321  	afterGTIDPos, beforeGTIDPos, err := tm.getGTIDFromTimestamp(timeoutCtx, pos, restoreTime.Seconds)
   322  	if err != nil {
   323  		return err
   324  	}
   325  
   326  	if afterGTIDPos == "" && beforeGTIDPos == "" {
   327  		return vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, fmt.Sprintf("unable to fetch the GTID for the specified time - %s", restoreTime.String()))
   328  	} else if afterGTIDPos == "" && beforeGTIDPos != "" {
   329  		log.Info("no afterGTIDPos found, which implies we reached the end of all GTID events")
   330  	}
   331  
   332  	log.Infof("going to restore upto the GTID - %s", afterGTIDPos)
   333  	// when we don't have before GTID, we will take it as current backup pos's last GTID
   334  	// this is case where someone tries to restore just to the 1st event after backup
   335  	if beforeGTIDPos == "" {
   336  		beforeGTIDPos = pos.GTIDSet.Last()
   337  	}
   338  	err = tm.catchupToGTID(timeoutCtx, afterGTIDPos, beforeGTIDPos)
   339  	if err != nil {
   340  		return vterrors.Wrapf(err, "unable to replicate upto desired GTID : %s", afterGTIDPos)
   341  	}
   342  
   343  	return nil
   344  }
   345  
   346  // getGTIDFromTimestamp computes 2 GTIDs based on restoreTime
   347  // afterPos is the GTID of the first event at or after restoreTime.
   348  // beforePos is the GTID of the last event before restoreTime. This is the GTID upto which replication will be applied
   349  // afterPos can be used directly in the query `START SLAVE UNTIL SQL_BEFORE_GTIDS = ”`
   350  // beforePos will be used to check if replication was able to catch up from the binlog server
   351  func (tm *TabletManager) getGTIDFromTimestamp(ctx context.Context, pos mysql.Position, restoreTime int64) (afterPos string, beforePos string, err error) {
   352  	connParams := &mysql.ConnParams{
   353  		Host:       binlogHost,
   354  		Port:       binlogPort,
   355  		Uname:      binlogUser,
   356  		SslCa:      binlogSslCa,
   357  		SslCert:    binlogSslCert,
   358  		SslKey:     binlogSslKey,
   359  		ServerName: binlogSslServerName,
   360  	}
   361  	if binlogPwd != "" {
   362  		connParams.Pass = binlogPwd
   363  	}
   364  	if binlogSslCa != "" || binlogSslCert != "" {
   365  		connParams.EnableSSL()
   366  	}
   367  	dbCfgs := &dbconfigs.DBConfigs{
   368  		Host: connParams.Host,
   369  		Port: connParams.Port,
   370  	}
   371  	dbCfgs.SetDbParams(*connParams, *connParams, *connParams)
   372  	vsClient := vreplication.NewReplicaConnector(connParams)
   373  
   374  	filter := &binlogdatapb.Filter{
   375  		Rules: []*binlogdatapb.Rule{{
   376  			Match: "/.*",
   377  		}},
   378  	}
   379  
   380  	// get current lastPos of binlog server, so that if we hit that in vstream, we'll return from there
   381  	binlogConn, err := mysql.Connect(ctx, connParams)
   382  	if err != nil {
   383  		return "", "", err
   384  	}
   385  	defer binlogConn.Close()
   386  	lastPos, err := binlogConn.PrimaryPosition()
   387  	if err != nil {
   388  		return "", "", err
   389  	}
   390  
   391  	gtidsChan := make(chan []string, 1)
   392  
   393  	go func() {
   394  		err := vsClient.VStream(ctx, mysql.EncodePosition(pos), filter, func(events []*binlogdatapb.VEvent) error {
   395  			for _, event := range events {
   396  				if event.Gtid != "" {
   397  					// check if we reached the lastPos then return
   398  					eventPos, err := mysql.DecodePosition(event.Gtid)
   399  					if err != nil {
   400  						return err
   401  					}
   402  
   403  					if event.Timestamp >= restoreTime {
   404  						afterPos = event.Gtid
   405  						gtidsChan <- []string{event.Gtid, beforePos}
   406  						return io.EOF
   407  					}
   408  
   409  					if eventPos.AtLeast(lastPos) {
   410  						gtidsChan <- []string{"", beforePos}
   411  						return io.EOF
   412  					}
   413  					beforePos = event.Gtid
   414  				}
   415  			}
   416  			return nil
   417  		})
   418  		if err != nil && err != io.EOF {
   419  			log.Warningf("Error using VStream to find timestamp for GTID position: %v error: %v", pos, err)
   420  			gtidsChan <- []string{"", ""}
   421  		}
   422  	}()
   423  	defer vsClient.Close(ctx)
   424  	select {
   425  	case val := <-gtidsChan:
   426  		return val[0], val[1], nil
   427  	case <-ctx.Done():
   428  		log.Warningf("Can't find the GTID from restore time stamp, exiting.")
   429  		return "", beforePos, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "unable to find GTID from the snapshot time as context timed out")
   430  	}
   431  }
   432  
   433  // catchupToGTID replicates upto specified GTID from binlog server
   434  //
   435  // copies the data from binlog server by pointing to as replica
   436  // waits till all events to GTID replicated
   437  // once done, it will reset the replication
   438  func (tm *TabletManager) catchupToGTID(ctx context.Context, afterGTIDPos string, beforeGTIDPos string) error {
   439  	var afterGTIDStr string
   440  	if afterGTIDPos != "" {
   441  		afterGTIDParsed, err := mysql.DecodePosition(afterGTIDPos)
   442  		if err != nil {
   443  			return err
   444  		}
   445  		afterGTIDStr = afterGTIDParsed.GTIDSet.Last()
   446  	}
   447  
   448  	beforeGTIDPosParsed, err := mysql.DecodePosition(beforeGTIDPos)
   449  	if err != nil {
   450  		return err
   451  	}
   452  
   453  	// it uses mysql specific queries here
   454  	cmds := []string{
   455  		"STOP SLAVE FOR CHANNEL '' ",
   456  		"STOP SLAVE IO_THREAD FOR CHANNEL ''",
   457  	}
   458  
   459  	if binlogSslCa != "" || binlogSslCert != "" {
   460  		// We need to use TLS
   461  		cmd := fmt.Sprintf("CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='%s', MASTER_PASSWORD='%s', MASTER_AUTO_POSITION=1, MASTER_SSL=1", binlogHost, binlogPort, binlogUser, binlogPwd)
   462  		if binlogSslCa != "" {
   463  			cmd += fmt.Sprintf(", MASTER_SSL_CA='%s'", binlogSslCa)
   464  		}
   465  		if binlogSslCert != "" {
   466  			cmd += fmt.Sprintf(", MASTER_SSL_CERT='%s'", binlogSslCert)
   467  		}
   468  		if binlogSslKey != "" {
   469  			cmd += fmt.Sprintf(", MASTER_SSL_KEY='%s'", binlogSslKey)
   470  		}
   471  		cmds = append(cmds, cmd+";")
   472  	} else {
   473  		// No TLS
   474  		cmds = append(cmds, fmt.Sprintf("CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='%s', MASTER_PASSWORD='%s', MASTER_AUTO_POSITION=1;", binlogHost, binlogPort, binlogUser, binlogPwd))
   475  	}
   476  
   477  	if afterGTIDPos == "" { // when the there is no afterPos, that means need to replicate completely
   478  		cmds = append(cmds, "START SLAVE")
   479  	} else {
   480  		cmds = append(cmds, fmt.Sprintf("START SLAVE UNTIL SQL_BEFORE_GTIDS = '%s'", afterGTIDStr))
   481  	}
   482  
   483  	if err := tm.MysqlDaemon.ExecuteSuperQueryList(ctx, cmds); err != nil {
   484  		return vterrors.Wrap(err, fmt.Sprintf("failed to restart the replication until %s GTID", afterGTIDStr))
   485  	}
   486  	log.Infof("Waiting for position to reach", beforeGTIDPosParsed.GTIDSet.Last())
   487  	// Could not use `agent.MysqlDaemon.WaitSourcePos` as replication is stopped with `START SLAVE UNTIL SQL_BEFORE_GTIDS`
   488  	// this is as per https://dev.mysql.com/doc/refman/5.6/en/start-slave.html
   489  	// We need to wait until replication catches upto the specified afterGTIDPos
   490  	chGTIDCaughtup := make(chan bool)
   491  	go func() {
   492  		timeToWait := time.Now().Add(timeoutForGTIDLookup)
   493  		for time.Now().Before(timeToWait) {
   494  			pos, err := tm.MysqlDaemon.PrimaryPosition()
   495  			if err != nil {
   496  				chGTIDCaughtup <- false
   497  			}
   498  
   499  			if pos.AtLeast(beforeGTIDPosParsed) {
   500  				chGTIDCaughtup <- true
   501  			}
   502  			select {
   503  			case <-ctx.Done():
   504  				chGTIDCaughtup <- false
   505  			default:
   506  				time.Sleep(300 * time.Millisecond)
   507  			}
   508  		}
   509  	}()
   510  	select {
   511  	case resp := <-chGTIDCaughtup:
   512  		if resp {
   513  			cmds := []string{
   514  				"STOP SLAVE",
   515  				"RESET SLAVE ALL",
   516  			}
   517  			if err := tm.MysqlDaemon.ExecuteSuperQueryList(ctx, cmds); err != nil {
   518  				return vterrors.Wrap(err, "failed to stop replication")
   519  			}
   520  			return nil
   521  		}
   522  		return vterrors.Wrap(err, "error while fetching the current GTID position")
   523  	case <-ctx.Done():
   524  		log.Warningf("Could not copy up to GTID.")
   525  		return vterrors.Wrapf(err, "context timeout while restoring up to specified GTID - %s", beforeGTIDPos)
   526  	}
   527  }
   528  
   529  // disableReplication stopes and resets replication on the mysql server. It moreover sets impossible replication
   530  // source params, so that the replica can't possibly reconnect. It would take a `CHANGE [MASTER|REPLICATION SOURCE] TO ...` to
   531  // make the mysql server replicate again (available via tm.MysqlDaemon.SetReplicationPosition)
   532  func (tm *TabletManager) disableReplication(ctx context.Context) error {
   533  	cmds := []string{
   534  		"STOP SLAVE",
   535  		"RESET SLAVE ALL", // "ALL" makes it forget primary host:port.
   536  	}
   537  	if err := tm.MysqlDaemon.ExecuteSuperQueryList(ctx, cmds); err != nil {
   538  		return vterrors.Wrap(err, "failed to reset replication")
   539  	}
   540  	if err := tm.MysqlDaemon.SetReplicationSource(ctx, "//", 0, false /* stopReplicationBefore */, true /* startReplicationAfter */); err != nil {
   541  		return vterrors.Wrap(err, "failed to disable replication")
   542  	}
   543  
   544  	return nil
   545  }
   546  
   547  func (tm *TabletManager) startReplication(ctx context.Context, pos mysql.Position, tabletType topodatapb.TabletType) error {
   548  	cmds := []string{
   549  		"STOP SLAVE",
   550  		"RESET SLAVE ALL", // "ALL" makes it forget primary host:port.
   551  	}
   552  	if err := tm.MysqlDaemon.ExecuteSuperQueryList(ctx, cmds); err != nil {
   553  		return vterrors.Wrap(err, "failed to reset replication")
   554  	}
   555  
   556  	// Set the position at which to resume from the primary.
   557  	if err := tm.MysqlDaemon.SetReplicationPosition(ctx, pos); err != nil {
   558  		return vterrors.Wrap(err, "failed to set replication position")
   559  	}
   560  
   561  	primary, err := tm.initializeReplication(ctx, tabletType)
   562  	// If we ran into an error while initializing replication, then there is no point in waiting for catch-up.
   563  	// Also, if there is no primary tablet in the shard, we don't need to proceed further.
   564  	if err != nil || primary == nil {
   565  		return err
   566  	}
   567  
   568  	// wait for reliable replication_lag_seconds
   569  	// we have pos where we want to resume from
   570  	// if PrimaryPosition is the same, that means no writes
   571  	// have happened to primary, so we are up-to-date
   572  	// otherwise, wait for replica's Position to change from
   573  	// the initial pos before proceeding
   574  	tmc := tmclient.NewTabletManagerClient()
   575  	defer tmc.Close()
   576  	remoteCtx, remoteCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   577  	defer remoteCancel()
   578  	posStr, err := tmc.PrimaryPosition(remoteCtx, primary.Tablet)
   579  	if err != nil {
   580  		// It is possible that though PrimaryAlias is set, the primary tablet is unreachable
   581  		// Log a warning and let tablet restore in that case
   582  		// If we had instead considered this fatal, all tablets would crash-loop
   583  		// until a primary appears, which would make it impossible to elect a primary.
   584  		log.Warningf("Can't get primary replication position after restore: %v", err)
   585  		return nil
   586  	}
   587  	primaryPos, err := mysql.DecodePosition(posStr)
   588  	if err != nil {
   589  		return vterrors.Wrapf(err, "can't decode primary replication position: %q", posStr)
   590  	}
   591  
   592  	if !pos.Equal(primaryPos) {
   593  		for {
   594  			if err := ctx.Err(); err != nil {
   595  				return err
   596  			}
   597  			status, err := tm.MysqlDaemon.ReplicationStatus()
   598  			if err != nil {
   599  				return vterrors.Wrap(err, "can't get replication status")
   600  			}
   601  			newPos := status.Position
   602  			if !newPos.Equal(pos) {
   603  				break
   604  			}
   605  			time.Sleep(1 * time.Second)
   606  		}
   607  	}
   608  
   609  	return nil
   610  }
   611  
   612  func (tm *TabletManager) getLocalMetadataValues(tabletType topodatapb.TabletType) map[string]string {
   613  	tablet := tm.Tablet()
   614  	values := map[string]string{
   615  		"Alias":         topoproto.TabletAliasString(tablet.Alias),
   616  		"ClusterAlias":  fmt.Sprintf("%s.%s", tablet.Keyspace, tablet.Shard),
   617  		"DataCenter":    tablet.Alias.Cell,
   618  		"PromotionRule": "must_not",
   619  	}
   620  	if isPrimaryEligible(tabletType) {
   621  		values["PromotionRule"] = "neutral"
   622  	}
   623  	return values
   624  }