vitess.io/vitess@v0.16.2/go/cmd/vtbackup/vtbackup.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  /*
    18  vtbackup is a batch command to perform a single pass of backup maintenance for a shard.
    19  
    20  When run periodically for each shard, vtbackup can ensure these configurable policies:
    21  * There is always a recent backup for the shard.
    22  * Old backups for the shard are removed.
    23  
    24  Whatever system launches vtbackup is responsible for the following:
    25    - Running vtbackup with similar flags that would be used for a vttablet and
    26      mysqlctld in the target shard to be backed up.
    27    - Provisioning as much disk space for vtbackup as would be given to vttablet.
    28      The data directory MUST be empty at startup. Do NOT reuse a persistent disk.
    29    - Running vtbackup periodically for each shard, for each backup storage location.
    30    - Ensuring that at most one instance runs at a time for a given pair of shard
    31      and backup storage location.
    32    - Retrying vtbackup if it fails.
    33    - Alerting human operators if the failure is persistent.
    34  
    35  The process vtbackup follows to take a new backup is as follows:
    36   1. Restore from the most recent backup.
    37   2. Start a mysqld instance (but no vttablet) from the restored data.
    38   3. Instruct mysqld to connect to the current shard primary and replicate any
    39      transactions that are new since the last backup.
    40   4. Ask the primary for its current replication position and set that as the goal
    41      for catching up on replication before taking the backup, so the goalposts
    42      don't move.
    43   5. Wait until replication is caught up to the goal position or beyond.
    44   6. Stop mysqld and take a new backup.
    45  
    46  Aside from additional replication load while vtbackup's mysqld catches up on
    47  new transactions, the shard should be otherwise unaffected. Existing tablets
    48  will continue to serve, and no new tablets will appear in topology, meaning no
    49  query traffic will ever be routed to vtbackup's mysqld. This silent operation
    50  mode helps make backups minimally disruptive to serving capacity and orthogonal
    51  to the handling of the query path.
    52  
    53  The command-line parameters to vtbackup specify a policy for when a new backup
    54  is needed, and when old backups should be removed. If the existing backups
    55  already satisfy the policy, then vtbackup will do nothing and return success
    56  immediately.
    57  */
    58  package main
    59  
    60  import (
    61  	"context"
    62  	"crypto/rand"
    63  	"fmt"
    64  	"math"
    65  	"math/big"
    66  	"os"
    67  	"strings"
    68  	"syscall"
    69  	"time"
    70  
    71  	"github.com/spf13/pflag"
    72  
    73  	"vitess.io/vitess/go/acl"
    74  	"vitess.io/vitess/go/cmd"
    75  	"vitess.io/vitess/go/exit"
    76  	"vitess.io/vitess/go/mysql"
    77  	"vitess.io/vitess/go/vt/dbconfigs"
    78  	"vitess.io/vitess/go/vt/log"
    79  	"vitess.io/vitess/go/vt/logutil"
    80  	"vitess.io/vitess/go/vt/mysqlctl"
    81  	"vitess.io/vitess/go/vt/mysqlctl/backupstorage"
    82  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    83  	"vitess.io/vitess/go/vt/servenv"
    84  	"vitess.io/vitess/go/vt/topo"
    85  	"vitess.io/vitess/go/vt/topo/topoproto"
    86  	"vitess.io/vitess/go/vt/vterrors"
    87  	_ "vitess.io/vitess/go/vt/vttablet/grpctmclient"
    88  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    89  )
    90  
    91  const (
    92  	// operationTimeout is the timeout for individual operations like fetching
    93  	// the primary position. This does not impose an overall timeout on
    94  	// long-running processes like taking the backup. It only applies to
    95  	// steps along the way that should complete quickly. This ensures we don't
    96  	// place a hard cap on the overall time for a backup, while also not waiting
    97  	// forever for things that should be quick.
    98  	operationTimeout = 1 * time.Minute
    99  )
   100  
   101  var (
   102  	minBackupInterval   time.Duration
   103  	minRetentionTime    time.Duration
   104  	minRetentionCount   = 1
   105  	initialBackup       bool
   106  	allowFirstBackup    bool
   107  	restartBeforeBackup bool
   108  	// vttablet-like flags
   109  	initDbNameOverride string
   110  	initKeyspace       string
   111  	initShard          string
   112  	concurrency        = 4
   113  	incrementalFromPos string
   114  	// mysqlctld-like flags
   115  	mysqlPort        = 3306
   116  	mysqlSocket      string
   117  	mysqlTimeout     = 5 * time.Minute
   118  	initDBSQLFile    string
   119  	detachedMode     bool
   120  	keepAliveTimeout = 0 * time.Second
   121  	disableRedoLog   = false
   122  )
   123  
   124  func registerFlags(fs *pflag.FlagSet) {
   125  	fs.DurationVar(&minBackupInterval, "min_backup_interval", minBackupInterval, "Only take a new backup if it's been at least this long since the most recent backup.")
   126  	fs.DurationVar(&minRetentionTime, "min_retention_time", minRetentionTime, "Keep each old backup for at least this long before removing it. Set to 0 to disable pruning of old backups.")
   127  	fs.IntVar(&minRetentionCount, "min_retention_count", minRetentionCount, "Always keep at least this many of the most recent backups in this backup storage location, even if some are older than the min_retention_time. This must be at least 1 since a backup must always exist to allow new backups to be made")
   128  	fs.BoolVar(&initialBackup, "initial_backup", initialBackup, "Instead of restoring from backup, initialize an empty database with the provided init_db_sql_file and upload a backup of that for the shard, if the shard has no backups yet. This can be used to seed a brand new shard with an initial, empty backup. If any backups already exist for the shard, this will be considered a successful no-op. This can only be done before the shard exists in topology (i.e. before any tablets are deployed).")
   129  	fs.BoolVar(&allowFirstBackup, "allow_first_backup", allowFirstBackup, "Allow this job to take the first backup of an existing shard.")
   130  	fs.BoolVar(&restartBeforeBackup, "restart_before_backup", restartBeforeBackup, "Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs.")
   131  	// vttablet-like flags
   132  	fs.StringVar(&initDbNameOverride, "init_db_name_override", initDbNameOverride, "(init parameter) override the name of the db used by vttablet")
   133  	fs.StringVar(&initKeyspace, "init_keyspace", initKeyspace, "(init parameter) keyspace to use for this tablet")
   134  	fs.StringVar(&initShard, "init_shard", initShard, "(init parameter) shard to use for this tablet")
   135  	fs.IntVar(&concurrency, "concurrency", concurrency, "(init restore parameter) how many concurrent files to restore at once")
   136  	fs.StringVar(&incrementalFromPos, "incremental_from_pos", incrementalFromPos, "Position of previous backup. Default: empty. If given, then this backup becomes an incremental backup from given position. If value is 'auto', backup taken from last successful backup position")
   137  	// mysqlctld-like flags
   138  	fs.IntVar(&mysqlPort, "mysql_port", mysqlPort, "mysql port")
   139  	fs.StringVar(&mysqlSocket, "mysql_socket", mysqlSocket, "path to the mysql socket")
   140  	fs.DurationVar(&mysqlTimeout, "mysql_timeout", mysqlTimeout, "how long to wait for mysqld startup")
   141  	fs.StringVar(&initDBSQLFile, "init_db_sql_file", initDBSQLFile, "path to .sql file to run after mysql_install_db")
   142  	fs.BoolVar(&detachedMode, "detach", detachedMode, "detached mode - run backups detached from the terminal")
   143  	fs.DurationVar(&keepAliveTimeout, "keep-alive-timeout", keepAliveTimeout, "Wait until timeout elapses after a successful backup before shutting down.")
   144  	fs.BoolVar(&disableRedoLog, "disable-redo-log", disableRedoLog, "Disable InnoDB redo log during replication-from-primary phase of backup.")
   145  
   146  	acl.RegisterFlags(fs)
   147  }
   148  
   149  func init() {
   150  	servenv.RegisterDefaultFlags()
   151  	dbconfigs.RegisterFlags(dbconfigs.All...)
   152  	mysqlctl.RegisterFlags()
   153  	servenv.OnParse(registerFlags)
   154  }
   155  
   156  func main() {
   157  	defer exit.Recover()
   158  
   159  	servenv.ParseFlags("vtbackup")
   160  	servenv.Init()
   161  	ctx, cancel := context.WithCancel(context.Background())
   162  	servenv.OnClose(func() {
   163  		cancel()
   164  	})
   165  
   166  	defer func() {
   167  		servenv.ExitChan <- syscall.SIGTERM
   168  		<-ctx.Done()
   169  	}()
   170  
   171  	go servenv.RunDefault()
   172  
   173  	if detachedMode {
   174  		// this method will call os.Exit and kill this process
   175  		cmd.DetachFromTerminalAndExit()
   176  	}
   177  
   178  	defer logutil.Flush()
   179  
   180  	if minRetentionCount < 1 {
   181  		log.Errorf("min_retention_count must be at least 1 to allow restores to succeed")
   182  		exit.Return(1)
   183  	}
   184  
   185  	// Open connection backup storage.
   186  	backupStorage, err := backupstorage.GetBackupStorage()
   187  	if err != nil {
   188  		log.Errorf("Can't get backup storage: %v", err)
   189  		exit.Return(1)
   190  	}
   191  	defer backupStorage.Close()
   192  	// Open connection to topology server.
   193  	topoServer := topo.Open()
   194  	defer topoServer.Close()
   195  
   196  	// Try to take a backup, if it's been long enough since the last one.
   197  	// Skip pruning if backup wasn't fully successful. We don't want to be
   198  	// deleting things if the backup process is not healthy.
   199  	backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard)
   200  	doBackup, err := shouldBackup(ctx, topoServer, backupStorage, backupDir)
   201  	if err != nil {
   202  		log.Errorf("Can't take backup: %v", err)
   203  		exit.Return(1)
   204  	}
   205  	if doBackup {
   206  		if err := takeBackup(ctx, topoServer, backupStorage); err != nil {
   207  			log.Errorf("Failed to take backup: %v", err)
   208  			exit.Return(1)
   209  		}
   210  	}
   211  
   212  	// Prune old backups.
   213  	if err := pruneBackups(ctx, backupStorage, backupDir); err != nil {
   214  		log.Errorf("Couldn't prune old backups: %v", err)
   215  		exit.Return(1)
   216  	}
   217  
   218  	if keepAliveTimeout > 0 {
   219  		log.Infof("Backup was successful, waiting %s before exiting (or until context expires).", keepAliveTimeout)
   220  		select {
   221  		case <-time.After(keepAliveTimeout):
   222  		case <-ctx.Done():
   223  		}
   224  	}
   225  	log.Info("Exiting.")
   226  }
   227  
   228  func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage backupstorage.BackupStorage) error {
   229  	// This is an imaginary tablet alias. The value doesn't matter for anything,
   230  	// except that we generate a random UID to ensure the target backup
   231  	// directory is unique if multiple vtbackup instances are launched for the
   232  	// same shard, at exactly the same second, pointed at the same backup
   233  	// storage location.
   234  	bigN, err := rand.Int(rand.Reader, big.NewInt(math.MaxUint32))
   235  	if err != nil {
   236  		return fmt.Errorf("can't generate random tablet UID: %v", err)
   237  	}
   238  	tabletAlias := &topodatapb.TabletAlias{
   239  		Cell: "vtbackup",
   240  		Uid:  uint32(bigN.Uint64()),
   241  	}
   242  
   243  	// Clean up our temporary data dir if we exit for any reason, to make sure
   244  	// every invocation of vtbackup starts with a clean slate, and it does not
   245  	// accumulate garbage (and run out of disk space) if it's restarted.
   246  	tabletDir := mysqlctl.TabletDir(tabletAlias.Uid)
   247  	defer func() {
   248  		log.Infof("Removing temporary tablet directory: %v", tabletDir)
   249  		if err := os.RemoveAll(tabletDir); err != nil {
   250  			log.Warningf("Failed to remove temporary tablet directory: %v", err)
   251  		}
   252  	}()
   253  
   254  	// Start up mysqld as if we are mysqlctld provisioning a fresh tablet.
   255  	mysqld, mycnf, err := mysqlctl.CreateMysqldAndMycnf(tabletAlias.Uid, mysqlSocket, int32(mysqlPort))
   256  	if err != nil {
   257  		return fmt.Errorf("failed to initialize mysql config: %v", err)
   258  	}
   259  	initCtx, initCancel := context.WithTimeout(ctx, mysqlTimeout)
   260  	defer initCancel()
   261  	if err := mysqld.Init(initCtx, mycnf, initDBSQLFile); err != nil {
   262  		return fmt.Errorf("failed to initialize mysql data dir and start mysqld: %v", err)
   263  	}
   264  	// Shut down mysqld when we're done.
   265  	defer func() {
   266  		// Be careful not to use the original context, because we don't want to
   267  		// skip shutdown just because we timed out waiting for other things.
   268  		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
   269  		defer cancel()
   270  		if err := mysqld.Shutdown(ctx, mycnf, false); err != nil {
   271  			log.Errorf("failed to shutdown mysqld: %v", err)
   272  		}
   273  	}()
   274  
   275  	extraEnv := map[string]string{
   276  		"TABLET_ALIAS": topoproto.TabletAliasString(tabletAlias),
   277  	}
   278  	dbName := initDbNameOverride
   279  	if dbName == "" {
   280  		dbName = fmt.Sprintf("vt_%s", initKeyspace)
   281  	}
   282  
   283  	backupParams := mysqlctl.BackupParams{
   284  		Cnf:                mycnf,
   285  		Mysqld:             mysqld,
   286  		Logger:             logutil.NewConsoleLogger(),
   287  		Concurrency:        concurrency,
   288  		IncrementalFromPos: incrementalFromPos,
   289  		HookExtraEnv:       extraEnv,
   290  		TopoServer:         topoServer,
   291  		Keyspace:           initKeyspace,
   292  		Shard:              initShard,
   293  		TabletAlias:        topoproto.TabletAliasString(tabletAlias),
   294  	}
   295  	// In initial_backup mode, just take a backup of this empty database.
   296  	if initialBackup {
   297  		// Take a backup of this empty DB without restoring anything.
   298  		// First, initialize it the way InitShardPrimary would, so this backup
   299  		// produces a result that can be used to skip InitShardPrimary entirely.
   300  		// This involves resetting replication (to erase any history) and then
   301  		// creating the main database and some Vitess system tables.
   302  		if err := mysqld.ResetReplication(ctx); err != nil {
   303  			return fmt.Errorf("can't reset replication: %v", err)
   304  		}
   305  		cmd := mysqlctl.GenerateInitialBinlogEntry()
   306  		if err := mysqld.ExecuteSuperQueryList(ctx, []string{cmd}); err != nil {
   307  			return err
   308  		}
   309  
   310  		backupParams.BackupTime = time.Now()
   311  		// Now we're ready to take the backup.
   312  		if err := mysqlctl.Backup(ctx, backupParams); err != nil {
   313  			return fmt.Errorf("backup failed: %v", err)
   314  		}
   315  		log.Info("Initial backup successful.")
   316  		return nil
   317  	}
   318  
   319  	backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard)
   320  	log.Infof("Restoring latest backup from directory %v", backupDir)
   321  	params := mysqlctl.RestoreParams{
   322  		Cnf:                 mycnf,
   323  		Mysqld:              mysqld,
   324  		Logger:              logutil.NewConsoleLogger(),
   325  		Concurrency:         concurrency,
   326  		HookExtraEnv:        extraEnv,
   327  		DeleteBeforeRestore: true,
   328  		DbName:              dbName,
   329  		Keyspace:            initKeyspace,
   330  		Shard:               initShard,
   331  	}
   332  	backupManifest, err := mysqlctl.Restore(ctx, params)
   333  	var restorePos mysql.Position
   334  	switch err {
   335  	case nil:
   336  		log.Infof("Successfully restored from backup at replication position %v", restorePos)
   337  		// if err is nil, we expect backupManifest to be non-nil
   338  		restorePos = backupManifest.Position
   339  	case mysqlctl.ErrNoBackup:
   340  		// There is no backup found, but we may be taking the initial backup of a shard
   341  		if !allowFirstBackup {
   342  			return fmt.Errorf("no backup found; not starting up empty since --initial_backup flag was not enabled")
   343  		}
   344  		restorePos = mysql.Position{}
   345  	default:
   346  		return fmt.Errorf("can't restore from backup: %v", err)
   347  	}
   348  
   349  	// Disable redo logging (if we can) before we start replication.
   350  	disabledRedoLog := false
   351  	if disableRedoLog {
   352  		if err := mysqld.DisableRedoLog(ctx); err != nil {
   353  			log.Warningf("Error disabling redo logging: %v", err)
   354  		} else {
   355  			disabledRedoLog = true
   356  		}
   357  	}
   358  
   359  	// We have restored a backup. Now start replication.
   360  	if err := resetReplication(ctx, restorePos, mysqld); err != nil {
   361  		return fmt.Errorf("error resetting replication: %v", err)
   362  	}
   363  	if err := startReplication(ctx, mysqld, topoServer); err != nil {
   364  		return fmt.Errorf("error starting replication: %v", err)
   365  	}
   366  
   367  	// Get the current primary replication position, and wait until we catch up
   368  	// to that point. We do this instead of looking at ReplicationLag
   369  	// because that value can
   370  	// sometimes lie and tell you there's 0 lag when actually replication is
   371  	// stopped. Also, if replication is making progress but is too slow to ever
   372  	// catch up to live changes, we'd rather take a backup of something rather
   373  	// than timing out.
   374  	tmc := tmclient.NewTabletManagerClient()
   375  	// Keep retrying if we can't contact the primary. The primary might be
   376  	// changing, moving, or down temporarily.
   377  	var primaryPos mysql.Position
   378  	err = retryOnError(ctx, func() error {
   379  		// Add a per-operation timeout so we re-read topo if the primary is unreachable.
   380  		opCtx, cancel := context.WithTimeout(ctx, operationTimeout)
   381  		defer cancel()
   382  		pos, err := getPrimaryPosition(opCtx, tmc, topoServer)
   383  		if err != nil {
   384  			return fmt.Errorf("can't get the primary replication position: %v", err)
   385  		}
   386  		primaryPos = pos
   387  		return nil
   388  	})
   389  	if err != nil {
   390  		return err
   391  	}
   392  
   393  	// Remember the time when we fetched the primary position, not when we caught
   394  	// up to it, so the timestamp on our backup is honest (assuming we make it
   395  	// to the goal position).
   396  	backupParams.BackupTime = time.Now()
   397  
   398  	// Wait for replication to catch up.
   399  	waitStartTime := time.Now()
   400  	for {
   401  		select {
   402  		case <-ctx.Done():
   403  			return ctx.Err()
   404  		case <-time.After(time.Second):
   405  		}
   406  
   407  		status, statusErr := mysqld.ReplicationStatus()
   408  		if statusErr != nil {
   409  			log.Warningf("Error getting replication status: %v", statusErr)
   410  			continue
   411  		}
   412  		if status.Position.AtLeast(primaryPos) {
   413  			// We're caught up on replication to at least the point the primary
   414  			// was at when this vtbackup run started.
   415  			log.Infof("Replication caught up to %v after %v", status.Position, time.Since(waitStartTime))
   416  			break
   417  		}
   418  		if !status.Healthy() {
   419  			log.Warning("Replication has stopped before backup could be taken. Trying to restart replication.")
   420  			if err := startReplication(ctx, mysqld, topoServer); err != nil {
   421  				log.Warningf("Failed to restart replication: %v", err)
   422  			}
   423  		}
   424  	}
   425  
   426  	// Stop replication and see where we are.
   427  	if err := mysqld.StopReplication(nil); err != nil {
   428  		return fmt.Errorf("can't stop replication: %v", err)
   429  	}
   430  
   431  	// Did we make any progress?
   432  	status, err := mysqld.ReplicationStatus()
   433  	if err != nil {
   434  		return fmt.Errorf("can't get replication status: %v", err)
   435  	}
   436  	log.Infof("Replication caught up to %v", status.Position)
   437  	if !status.Position.AtLeast(primaryPos) && status.Position.Equal(restorePos) {
   438  		return fmt.Errorf("not taking backup: replication did not make any progress from restore point: %v", restorePos)
   439  	}
   440  
   441  	// Re-enable redo logging.
   442  	if disabledRedoLog {
   443  		if err := mysqld.EnableRedoLog(ctx); err != nil {
   444  			return fmt.Errorf("failed to re-enable redo log: %v", err)
   445  		}
   446  	}
   447  
   448  	if restartBeforeBackup {
   449  		log.Info("Proceeding with clean MySQL shutdown and startup to flush all buffers.")
   450  		// Prep for full/clean shutdown (not typically the default)
   451  		if err := mysqld.ExecuteSuperQuery(ctx, "SET GLOBAL innodb_fast_shutdown=0"); err != nil {
   452  			return fmt.Errorf("Could not prep for full shutdown: %v", err)
   453  		}
   454  		// Shutdown, waiting for it to finish
   455  		if err := mysqld.Shutdown(ctx, mycnf, true); err != nil {
   456  			return fmt.Errorf("Something went wrong during full MySQL shutdown: %v", err)
   457  		}
   458  		// Start MySQL, waiting for it to come up
   459  		if err := mysqld.Start(ctx, mycnf); err != nil {
   460  			return fmt.Errorf("Could not start MySQL after full shutdown: %v", err)
   461  		}
   462  	}
   463  
   464  	// Now we can take a new backup.
   465  	if err := mysqlctl.Backup(ctx, backupParams); err != nil {
   466  		return fmt.Errorf("error taking backup: %v", err)
   467  	}
   468  
   469  	// Return a non-zero exit code if we didn't meet the replication position
   470  	// goal, even though we took a backup that pushes the high-water mark up.
   471  	if !status.Position.AtLeast(primaryPos) {
   472  		return fmt.Errorf("replication caught up to %v but didn't make it to the goal of %v; a backup was taken anyway to save partial progress, but the operation should still be retried since not all expected data is backed up", status.Position, primaryPos)
   473  	}
   474  	log.Info("Backup successful.")
   475  	return nil
   476  }
   477  
   478  func resetReplication(ctx context.Context, pos mysql.Position, mysqld mysqlctl.MysqlDaemon) error {
   479  	cmds := []string{
   480  		"STOP SLAVE",
   481  		"RESET SLAVE ALL", // "ALL" makes it forget replication source host:port.
   482  	}
   483  	if err := mysqld.ExecuteSuperQueryList(ctx, cmds); err != nil {
   484  		return vterrors.Wrap(err, "failed to reset replication")
   485  	}
   486  
   487  	// Check if we have a position to resume from, if not reset to the beginning of time
   488  	if !pos.IsZero() {
   489  		// Set the position at which to resume from the replication source.
   490  		if err := mysqld.SetReplicationPosition(ctx, pos); err != nil {
   491  			return vterrors.Wrap(err, "failed to set replica position")
   492  		}
   493  	} else {
   494  		if err := mysqld.ResetReplication(ctx); err != nil {
   495  			return vterrors.Wrap(err, "failed to reset replication")
   496  		}
   497  	}
   498  	return nil
   499  }
   500  
   501  func startReplication(ctx context.Context, mysqld mysqlctl.MysqlDaemon, topoServer *topo.Server) error {
   502  	si, err := topoServer.GetShard(ctx, initKeyspace, initShard)
   503  	if err != nil {
   504  		return vterrors.Wrap(err, "can't read shard")
   505  	}
   506  	if topoproto.TabletAliasIsZero(si.PrimaryAlias) {
   507  		// Normal tablets will sit around waiting to be reparented in this case.
   508  		// Since vtbackup is a batch job, we just have to fail.
   509  		return fmt.Errorf("can't start replication after restore: shard %v/%v has no primary", initKeyspace, initShard)
   510  	}
   511  	// TODO(enisoc): Support replicating from another replica, preferably in the
   512  	//   same cell, preferably rdonly, to reduce load on the primary.
   513  	ti, err := topoServer.GetTablet(ctx, si.PrimaryAlias)
   514  	if err != nil {
   515  		return vterrors.Wrapf(err, "Cannot read primary tablet %v", si.PrimaryAlias)
   516  	}
   517  
   518  	// Stop replication (in case we're restarting), set replication source, and start replication.
   519  	if err := mysqld.SetReplicationSource(ctx, ti.Tablet.MysqlHostname, int(ti.Tablet.MysqlPort), true /* stopReplicationBefore */, true /* startReplicationAfter */); err != nil {
   520  		return vterrors.Wrap(err, "MysqlDaemon.SetReplicationSource failed")
   521  	}
   522  	return nil
   523  }
   524  
   525  func getPrimaryPosition(ctx context.Context, tmc tmclient.TabletManagerClient, ts *topo.Server) (mysql.Position, error) {
   526  	si, err := ts.GetShard(ctx, initKeyspace, initShard)
   527  	if err != nil {
   528  		return mysql.Position{}, vterrors.Wrap(err, "can't read shard")
   529  	}
   530  	if topoproto.TabletAliasIsZero(si.PrimaryAlias) {
   531  		// Normal tablets will sit around waiting to be reparented in this case.
   532  		// Since vtbackup is a batch job, we just have to fail.
   533  		return mysql.Position{}, fmt.Errorf("shard %v/%v has no primary", initKeyspace, initShard)
   534  	}
   535  	ti, err := ts.GetTablet(ctx, si.PrimaryAlias)
   536  	if err != nil {
   537  		return mysql.Position{}, fmt.Errorf("can't get primary tablet record %v: %v", topoproto.TabletAliasString(si.PrimaryAlias), err)
   538  	}
   539  	posStr, err := tmc.PrimaryPosition(ctx, ti.Tablet)
   540  	if err != nil {
   541  		return mysql.Position{}, fmt.Errorf("can't get primary replication position: %v", err)
   542  	}
   543  	pos, err := mysql.DecodePosition(posStr)
   544  	if err != nil {
   545  		return mysql.Position{}, fmt.Errorf("can't decode primary replication position %q: %v", posStr, err)
   546  	}
   547  	return pos, nil
   548  }
   549  
   550  // retryOnError keeps calling the given function until it succeeds, or the given
   551  // Context is done. It waits an exponentially increasing amount of time between
   552  // retries to avoid hot-looping. The only time this returns an error is if the
   553  // Context is cancelled.
   554  func retryOnError(ctx context.Context, fn func() error) error {
   555  	waitTime := 1 * time.Second
   556  
   557  	for {
   558  		err := fn()
   559  		if err == nil {
   560  			return nil
   561  		}
   562  		log.Errorf("Waiting %v to retry after error: %v", waitTime, err)
   563  
   564  		select {
   565  		case <-ctx.Done():
   566  			log.Errorf("Not retrying after error: %v", ctx.Err())
   567  			return ctx.Err()
   568  		case <-time.After(waitTime):
   569  			waitTime *= 2
   570  		}
   571  	}
   572  }
   573  
   574  func pruneBackups(ctx context.Context, backupStorage backupstorage.BackupStorage, backupDir string) error {
   575  	if minRetentionTime == 0 {
   576  		log.Info("Pruning of old backups is disabled.")
   577  		return nil
   578  	}
   579  	backups, err := backupStorage.ListBackups(ctx, backupDir)
   580  	if err != nil {
   581  		return fmt.Errorf("can't list backups: %v", err)
   582  	}
   583  	numBackups := len(backups)
   584  	if numBackups <= minRetentionCount {
   585  		log.Infof("Found %v backups. Not pruning any since this is within the min_retention_count of %v.", numBackups, minRetentionCount)
   586  		return nil
   587  	}
   588  	// We have more than the minimum retention count, so we could afford to
   589  	// prune some. See if any are beyond the minimum retention time.
   590  	// ListBackups returns them sorted by oldest first.
   591  	for _, backup := range backups {
   592  		backupTime, err := parseBackupTime(backup.Name())
   593  		if err != nil {
   594  			return err
   595  		}
   596  		if time.Since(backupTime) < minRetentionTime {
   597  			// The oldest remaining backup is not old enough to prune.
   598  			log.Infof("Oldest backup taken at %v has not reached min_retention_time of %v. Nothing left to prune.", backupTime, minRetentionTime)
   599  			break
   600  		}
   601  		// Remove the backup.
   602  		log.Infof("Removing old backup %v from %v, since it's older than min_retention_time of %v", backup.Name(), backupDir, minRetentionTime)
   603  		if err := backupStorage.RemoveBackup(ctx, backupDir, backup.Name()); err != nil {
   604  			return fmt.Errorf("couldn't remove backup %v from %v: %v", backup.Name(), backupDir, err)
   605  		}
   606  		// We successfully removed one backup. Can we afford to prune any more?
   607  		numBackups--
   608  		if numBackups == minRetentionCount {
   609  			log.Infof("Successfully pruned backup count to min_retention_count of %v.", minRetentionCount)
   610  			break
   611  		}
   612  	}
   613  	return nil
   614  }
   615  
   616  func parseBackupTime(name string) (time.Time, error) {
   617  	// Backup names are formatted as "date.time.tablet-alias".
   618  	parts := strings.Split(name, ".")
   619  	if len(parts) != 3 {
   620  		return time.Time{}, fmt.Errorf("backup name not in expected format (date.time.tablet-alias): %v", name)
   621  	}
   622  	backupTime, err := time.Parse(mysqlctl.BackupTimestampFormat, fmt.Sprintf("%s.%s", parts[0], parts[1]))
   623  	if err != nil {
   624  		return time.Time{}, fmt.Errorf("can't parse timestamp from backup %q: %v", name, err)
   625  	}
   626  	return backupTime, nil
   627  }
   628  
   629  func shouldBackup(ctx context.Context, topoServer *topo.Server, backupStorage backupstorage.BackupStorage, backupDir string) (bool, error) {
   630  	// Look for the most recent, complete backup.
   631  	backups, err := backupStorage.ListBackups(ctx, backupDir)
   632  	if err != nil {
   633  		return false, fmt.Errorf("can't list backups: %v", err)
   634  	}
   635  	lastBackup := lastCompleteBackup(ctx, backups)
   636  
   637  	// Check preconditions for initial_backup mode.
   638  	if initialBackup {
   639  		// Check if any backups for the shard already exist in this backup storage location.
   640  		if lastBackup != nil {
   641  			log.Infof("At least one complete backup already exists, so there's no need to seed an empty backup. Doing nothing.")
   642  			return false, nil
   643  		}
   644  
   645  		// Check whether the shard exists.
   646  		_, shardErr := topoServer.GetShard(ctx, initKeyspace, initShard)
   647  		switch {
   648  		case shardErr == nil:
   649  			// If the shard exists, we should make sure none of the tablets are
   650  			// already in a serving state, because then they might have data
   651  			// that conflicts with the initial backup we're about to take.
   652  			tablets, err := topoServer.GetTabletMapForShard(ctx, initKeyspace, initShard)
   653  			if err != nil {
   654  				// We don't know for sure whether any tablets are serving,
   655  				// so it's not safe to continue.
   656  				return false, fmt.Errorf("failed to check whether shard %v/%v has serving tablets before doing initial backup: %v", initKeyspace, initShard, err)
   657  			}
   658  			for tabletAlias, tablet := range tablets {
   659  				// Check if any tablet has its type set to one of the serving types.
   660  				// If so, it's too late to do an initial backup.
   661  				if tablet.IsInServingGraph() {
   662  					return false, fmt.Errorf("refusing to upload initial backup of empty database: the shard %v/%v already has at least one tablet that may be serving (%v); you must take a backup from a live tablet instead", initKeyspace, initShard, tabletAlias)
   663  				}
   664  			}
   665  			log.Infof("Shard %v/%v exists but has no serving tablets.", initKeyspace, initShard)
   666  		case topo.IsErrType(shardErr, topo.NoNode):
   667  			// The shard doesn't exist, so we know no tablets are running.
   668  			log.Infof("Shard %v/%v doesn't exist; assuming it has no serving tablets.", initKeyspace, initShard)
   669  		default:
   670  			// If we encounter any other error, we don't know for sure whether
   671  			// the shard exists, so it's not safe to continue.
   672  			return false, fmt.Errorf("failed to check whether shard %v/%v exists before doing initial backup: %v", initKeyspace, initShard, err)
   673  		}
   674  
   675  		log.Infof("Shard %v/%v has no existing backups. Creating initial backup.", initKeyspace, initShard)
   676  		return true, nil
   677  	}
   678  
   679  	// We need at least one backup so we can restore first, unless the user explicitly says we don't
   680  	if len(backups) == 0 && !allowFirstBackup {
   681  		return false, fmt.Errorf("no existing backups to restore from; backup is not possible since --initial_backup flag was not enabled")
   682  	}
   683  	if lastBackup == nil {
   684  		if allowFirstBackup {
   685  			// There's no complete backup, but we were told to take one from scratch anyway.
   686  			return true, nil
   687  		}
   688  		return false, fmt.Errorf("no complete backups to restore from; backup is not possible since --initial_backup flag was not enabled")
   689  	}
   690  
   691  	// Has it been long enough since the last complete backup to need a new one?
   692  	if minBackupInterval == 0 {
   693  		// No minimum interval is set, so always backup.
   694  		return true, nil
   695  	}
   696  	lastBackupTime, err := parseBackupTime(lastBackup.Name())
   697  	if err != nil {
   698  		return false, fmt.Errorf("can't check last backup time: %v", err)
   699  	}
   700  	if elapsedTime := time.Since(lastBackupTime); elapsedTime < minBackupInterval {
   701  		// It hasn't been long enough yet.
   702  		log.Infof("Skipping backup since only %v has elapsed since the last backup at %v, which is less than the min_backup_interval of %v.", elapsedTime, lastBackupTime, minBackupInterval)
   703  		return false, nil
   704  	}
   705  	// It has been long enough.
   706  	log.Infof("The last backup was taken at %v, which is older than the min_backup_interval of %v.", lastBackupTime, minBackupInterval)
   707  	return true, nil
   708  }
   709  
   710  func lastCompleteBackup(ctx context.Context, backups []backupstorage.BackupHandle) backupstorage.BackupHandle {
   711  	if len(backups) == 0 {
   712  		return nil
   713  	}
   714  
   715  	// Backups are sorted in ascending order by start time. Start at the end.
   716  	for i := len(backups) - 1; i >= 0; i-- {
   717  		// Check if this backup is complete by looking for the MANIFEST file,
   718  		// which is written at the end after all files are uploaded.
   719  		backup := backups[i]
   720  		if err := checkBackupComplete(ctx, backup); err != nil {
   721  			log.Warningf("Ignoring backup %v because it's incomplete: %v", backup.Name(), err)
   722  			continue
   723  		}
   724  		return backup
   725  	}
   726  
   727  	return nil
   728  }
   729  
   730  func checkBackupComplete(ctx context.Context, backup backupstorage.BackupHandle) error {
   731  	manifest, err := mysqlctl.GetBackupManifest(ctx, backup)
   732  	if err != nil {
   733  		return fmt.Errorf("can't get backup MANIFEST: %v", err)
   734  	}
   735  
   736  	log.Infof("Found complete backup %v taken at position %v", backup.Name(), manifest.Position.String())
   737  	return nil
   738  }