vitess.io/vitess@v0.16.2/go/cmd/vtbackup/vtbackup.go

/*
Copyright 2019 The Vitess Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
vtbackup is a batch command to perform a single pass of backup maintenance for a shard.

When run periodically for each shard, vtbackup can ensure these configurable policies:
  * There is always a recent backup for the shard.
  * Old backups for the shard are removed.

Whatever system launches vtbackup is responsible for the following:
  - Running vtbackup with similar flags that would be used for a vttablet and
    mysqlctld in the target shard to be backed up.
  - Provisioning as much disk space for vtbackup as would be given to vttablet.
    The data directory MUST be empty at startup. Do NOT reuse a persistent disk.
  - Running vtbackup periodically for each shard, for each backup storage location.
  - Ensuring that at most one instance runs at a time for a given pair of shard
    and backup storage location.
  - Retrying vtbackup if it fails.
  - Alerting human operators if the failure is persistent.

The process vtbackup follows to take a new backup is as follows:
 1. Restore from the most recent backup.
 2. Start a mysqld instance (but no vttablet) from the restored data.
 3. Instruct mysqld to connect to the current shard primary and replicate any
    transactions that are new since the last backup.
 4. Ask the primary for its current replication position and set that as the goal
    for catching up on replication before taking the backup, so the goalposts
    don't move.
 5. Wait until replication is caught up to the goal position or beyond.
 6. Stop mysqld and take a new backup.

Aside from additional replication load while vtbackup's mysqld catches up on
new transactions, the shard should be otherwise unaffected. Existing tablets
will continue to serve, and no new tablets will appear in topology, meaning no
query traffic will ever be routed to vtbackup's mysqld. This silent operation
mode helps make backups minimally disruptive to serving capacity and orthogonal
to the handling of the query path.

The command-line parameters to vtbackup specify a policy for when a new backup
is needed, and when old backups should be removed. If the existing backups
already satisfy the policy, then vtbackup will do nothing and return success
immediately.
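
For example, a periodic run for one shard might be invoked roughly like this
(values are illustrative; backup storage, topology, and mysqld flags come from
the imported backupstorage, topo, dbconfigs, and mysqlctl packages and are not
shown here):

	vtbackup \
		--init_keyspace=commerce \
		--init_shard=0 \
		--init_db_sql_file=init_db.sql \
		--min_backup_interval=23h \
		--min_retention_time=72h \
		--min_retention_count=3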
*/
package main

import (
	"context"
	"crypto/rand"
	"fmt"
	"math"
	"math/big"
	"os"
	"strings"
	"syscall"
	"time"

	"github.com/spf13/pflag"

	"vitess.io/vitess/go/acl"
	"vitess.io/vitess/go/cmd"
	"vitess.io/vitess/go/exit"
	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/vt/dbconfigs"
	"vitess.io/vitess/go/vt/log"
	"vitess.io/vitess/go/vt/logutil"
	"vitess.io/vitess/go/vt/mysqlctl"
	"vitess.io/vitess/go/vt/mysqlctl/backupstorage"
	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
	"vitess.io/vitess/go/vt/servenv"
	"vitess.io/vitess/go/vt/topo"
	"vitess.io/vitess/go/vt/topo/topoproto"
	"vitess.io/vitess/go/vt/vterrors"
	_ "vitess.io/vitess/go/vt/vttablet/grpctmclient"
	"vitess.io/vitess/go/vt/vttablet/tmclient"
)

const (
	// operationTimeout is the timeout for individual operations like fetching
	// the primary position. This does not impose an overall timeout on
	// long-running processes like taking the backup. It only applies to
	// steps along the way that should complete quickly. This ensures we don't
	// place a hard cap on the overall time for a backup, while also not waiting
	// forever for things that should be quick.
	operationTimeout = 1 * time.Minute
)

var (
	minBackupInterval   time.Duration
	minRetentionTime    time.Duration
	minRetentionCount   = 1
	initialBackup       bool
	allowFirstBackup    bool
	restartBeforeBackup bool
	// vttablet-like flags
	initDbNameOverride string
	initKeyspace       string
	initShard          string
	concurrency        = 4
	incrementalFromPos string
	// mysqlctld-like flags
	mysqlPort        = 3306
	mysqlSocket      string
	mysqlTimeout     = 5 * time.Minute
	initDBSQLFile    string
	detachedMode     bool
	keepAliveTimeout = 0 * time.Second
	disableRedoLog   = false
)

func registerFlags(fs *pflag.FlagSet) {
	fs.DurationVar(&minBackupInterval, "min_backup_interval", minBackupInterval, "Only take a new backup if it's been at least this long since the most recent backup.")
	fs.DurationVar(&minRetentionTime, "min_retention_time", minRetentionTime, "Keep each old backup for at least this long before removing it. Set to 0 to disable pruning of old backups.")
	fs.IntVar(&minRetentionCount, "min_retention_count", minRetentionCount, "Always keep at least this many of the most recent backups in this backup storage location, even if some are older than the min_retention_time. This must be at least 1 since a backup must always exist to allow new backups to be made")
	fs.BoolVar(&initialBackup, "initial_backup", initialBackup, "Instead of restoring from backup, initialize an empty database with the provided init_db_sql_file and upload a backup of that for the shard, if the shard has no backups yet. This can be used to seed a brand new shard with an initial, empty backup. If any backups already exist for the shard, this will be considered a successful no-op. This can only be done before the shard exists in topology (i.e. before any tablets are deployed).")
	fs.BoolVar(&allowFirstBackup, "allow_first_backup", allowFirstBackup, "Allow this job to take the first backup of an existing shard.")
	fs.BoolVar(&restartBeforeBackup, "restart_before_backup", restartBeforeBackup, "Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs.")
	// vttablet-like flags
	fs.StringVar(&initDbNameOverride, "init_db_name_override", initDbNameOverride, "(init parameter) override the name of the db used by vttablet")
	fs.StringVar(&initKeyspace, "init_keyspace", initKeyspace, "(init parameter) keyspace to use for this tablet")
	fs.StringVar(&initShard, "init_shard", initShard, "(init parameter) shard to use for this tablet")
	fs.IntVar(&concurrency, "concurrency", concurrency, "(init restore parameter) how many concurrent files to restore at once")
	fs.StringVar(&incrementalFromPos, "incremental_from_pos", incrementalFromPos, "Position of previous backup. Default: empty. If given, then this backup becomes an incremental backup from given position. If value is 'auto', backup taken from last successful backup position")
	// mysqlctld-like flags
	fs.IntVar(&mysqlPort, "mysql_port", mysqlPort, "mysql port")
	fs.StringVar(&mysqlSocket, "mysql_socket", mysqlSocket, "path to the mysql socket")
	fs.DurationVar(&mysqlTimeout, "mysql_timeout", mysqlTimeout, "how long to wait for mysqld startup")
	fs.StringVar(&initDBSQLFile, "init_db_sql_file", initDBSQLFile, "path to .sql file to run after mysql_install_db")
	fs.BoolVar(&detachedMode, "detach", detachedMode, "detached mode - run backups detached from the terminal")
	fs.DurationVar(&keepAliveTimeout, "keep-alive-timeout", keepAliveTimeout, "Wait until timeout elapses after a successful backup before shutting down.")
	fs.BoolVar(&disableRedoLog, "disable-redo-log", disableRedoLog, "Disable InnoDB redo log during replication-from-primary phase of backup.")

	acl.RegisterFlags(fs)
}

func init() {
	servenv.RegisterDefaultFlags()
	dbconfigs.RegisterFlags(dbconfigs.All...)
	mysqlctl.RegisterFlags()
	servenv.OnParse(registerFlags)
}

func main() {
	defer exit.Recover()

	servenv.ParseFlags("vtbackup")
	servenv.Init()
	ctx, cancel := context.WithCancel(context.Background())
	servenv.OnClose(func() {
		cancel()
	})

	defer func() {
		servenv.ExitChan <- syscall.SIGTERM
		<-ctx.Done()
	}()

	go servenv.RunDefault()

	if detachedMode {
		// this method will call os.Exit and kill this process
		cmd.DetachFromTerminalAndExit()
	}

	defer logutil.Flush()

	if minRetentionCount < 1 {
		log.Errorf("min_retention_count must be at least 1 to allow restores to succeed")
		exit.Return(1)
	}

	// Open connection to backup storage.
	backupStorage, err := backupstorage.GetBackupStorage()
	if err != nil {
		log.Errorf("Can't get backup storage: %v", err)
		exit.Return(1)
	}
	defer backupStorage.Close()
	// Open connection to topology server.
	topoServer := topo.Open()
	defer topoServer.Close()

	// Try to take a backup, if it's been long enough since the last one.
	// Skip pruning if the backup wasn't fully successful. We don't want to be
	// deleting things if the backup process is not healthy.
	backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard)
	doBackup, err := shouldBackup(ctx, topoServer, backupStorage, backupDir)
	if err != nil {
		log.Errorf("Can't take backup: %v", err)
		exit.Return(1)
	}
	if doBackup {
		if err := takeBackup(ctx, topoServer, backupStorage); err != nil {
			log.Errorf("Failed to take backup: %v", err)
			exit.Return(1)
		}
	}

	// Prune old backups.
	if err := pruneBackups(ctx, backupStorage, backupDir); err != nil {
		log.Errorf("Couldn't prune old backups: %v", err)
		exit.Return(1)
	}

	if keepAliveTimeout > 0 {
		log.Infof("Backup was successful, waiting %s before exiting (or until context expires).", keepAliveTimeout)
		select {
		case <-time.After(keepAliveTimeout):
		case <-ctx.Done():
		}
	}
	log.Info("Exiting.")
}

func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage backupstorage.BackupStorage) error {
	// This is an imaginary tablet alias. The value doesn't matter for anything,
	// except that we generate a random UID to ensure the target backup
	// directory is unique if multiple vtbackup instances are launched for the
	// same shard, at exactly the same second, pointed at the same backup
	// storage location.
	bigN, err := rand.Int(rand.Reader, big.NewInt(math.MaxUint32))
	if err != nil {
		return fmt.Errorf("can't generate random tablet UID: %v", err)
	}
	tabletAlias := &topodatapb.TabletAlias{
		Cell: "vtbackup",
		Uid:  uint32(bigN.Uint64()),
	}

	// Clean up our temporary data dir if we exit for any reason, to make sure
	// every invocation of vtbackup starts with a clean slate, and it does not
	// accumulate garbage (and run out of disk space) if it's restarted.
	tabletDir := mysqlctl.TabletDir(tabletAlias.Uid)
	defer func() {
		log.Infof("Removing temporary tablet directory: %v", tabletDir)
		if err := os.RemoveAll(tabletDir); err != nil {
			log.Warningf("Failed to remove temporary tablet directory: %v", err)
		}
	}()

	// Start up mysqld as if we are mysqlctld provisioning a fresh tablet.
	mysqld, mycnf, err := mysqlctl.CreateMysqldAndMycnf(tabletAlias.Uid, mysqlSocket, int32(mysqlPort))
	if err != nil {
		return fmt.Errorf("failed to initialize mysql config: %v", err)
	}
	initCtx, initCancel := context.WithTimeout(ctx, mysqlTimeout)
	defer initCancel()
	if err := mysqld.Init(initCtx, mycnf, initDBSQLFile); err != nil {
		return fmt.Errorf("failed to initialize mysql data dir and start mysqld: %v", err)
	}
	// Shut down mysqld when we're done.
	defer func() {
		// Be careful not to use the original context, because we don't want to
		// skip shutdown just because we timed out waiting for other things.
		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		if err := mysqld.Shutdown(ctx, mycnf, false); err != nil {
			log.Errorf("failed to shutdown mysqld: %v", err)
		}
	}()

	extraEnv := map[string]string{
		"TABLET_ALIAS": topoproto.TabletAliasString(tabletAlias),
	}
	dbName := initDbNameOverride
	if dbName == "" {
		dbName = fmt.Sprintf("vt_%s", initKeyspace)
	}

	backupParams := mysqlctl.BackupParams{
		Cnf:                mycnf,
		Mysqld:             mysqld,
		Logger:             logutil.NewConsoleLogger(),
		Concurrency:        concurrency,
		IncrementalFromPos: incrementalFromPos,
		HookExtraEnv:       extraEnv,
		TopoServer:         topoServer,
		Keyspace:           initKeyspace,
		Shard:              initShard,
		TabletAlias:        topoproto.TabletAliasString(tabletAlias),
	}
	// In initial_backup mode, just take a backup of this empty database.
	if initialBackup {
		// Take a backup of this empty DB without restoring anything.
		// First, initialize it the way InitShardPrimary would, so this backup
		// produces a result that can be used to skip InitShardPrimary entirely.
		// This involves resetting replication (to erase any history) and then
		// creating the main database and some Vitess system tables.
		if err := mysqld.ResetReplication(ctx); err != nil {
			return fmt.Errorf("can't reset replication: %v", err)
		}
		cmd := mysqlctl.GenerateInitialBinlogEntry()
		if err := mysqld.ExecuteSuperQueryList(ctx, []string{cmd}); err != nil {
			return err
		}

		backupParams.BackupTime = time.Now()
		// Now we're ready to take the backup.
		if err := mysqlctl.Backup(ctx, backupParams); err != nil {
			return fmt.Errorf("backup failed: %v", err)
		}
		log.Info("Initial backup successful.")
		return nil
	}

	backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard)
	log.Infof("Restoring latest backup from directory %v", backupDir)
	params := mysqlctl.RestoreParams{
		Cnf:                 mycnf,
		Mysqld:              mysqld,
		Logger:              logutil.NewConsoleLogger(),
		Concurrency:         concurrency,
		HookExtraEnv:        extraEnv,
		DeleteBeforeRestore: true,
		DbName:              dbName,
		Keyspace:            initKeyspace,
		Shard:               initShard,
	}
	backupManifest, err := mysqlctl.Restore(ctx, params)
	var restorePos mysql.Position
	switch err {
	case nil:
		// if err is nil, we expect backupManifest to be non-nil
		restorePos = backupManifest.Position
		log.Infof("Successfully restored from backup at replication position %v", restorePos)
	case mysqlctl.ErrNoBackup:
		// There is no backup found, but we may be taking the initial backup of a shard
		if !allowFirstBackup {
			return fmt.Errorf("no backup found; not starting up empty since --initial_backup flag was not enabled")
		}
		restorePos = mysql.Position{}
	default:
		return fmt.Errorf("can't restore from backup: %v", err)
	}

	// Disable redo logging (if we can) before we start replication.
	disabledRedoLog := false
	if disableRedoLog {
		if err := mysqld.DisableRedoLog(ctx); err != nil {
			log.Warningf("Error disabling redo logging: %v", err)
		} else {
			disabledRedoLog = true
		}
	}

	// We have restored a backup. Now start replication.
	if err := resetReplication(ctx, restorePos, mysqld); err != nil {
		return fmt.Errorf("error resetting replication: %v", err)
	}
	if err := startReplication(ctx, mysqld, topoServer); err != nil {
		return fmt.Errorf("error starting replication: %v", err)
	}

	// Get the current primary replication position, and wait until we catch up
	// to that point. We do this instead of looking at ReplicationLag because
	// that value can sometimes lie and tell you there's 0 lag when actually
	// replication is stopped. Also, if replication is making progress but is
	// too slow to ever catch up to live changes, we'd rather take a backup of
	// something than time out.
	tmc := tmclient.NewTabletManagerClient()
	// Keep retrying if we can't contact the primary. The primary might be
	// changing, moving, or down temporarily.
	var primaryPos mysql.Position
	err = retryOnError(ctx, func() error {
		// Add a per-operation timeout so we re-read topo if the primary is unreachable.
		opCtx, cancel := context.WithTimeout(ctx, operationTimeout)
		defer cancel()
		pos, err := getPrimaryPosition(opCtx, tmc, topoServer)
		if err != nil {
			return fmt.Errorf("can't get the primary replication position: %v", err)
		}
		primaryPos = pos
		return nil
	})
	if err != nil {
		return err
	}

	// Remember the time when we fetched the primary position, not when we caught
	// up to it, so the timestamp on our backup is honest (assuming we make it
	// to the goal position).
	backupParams.BackupTime = time.Now()

	// Wait for replication to catch up.
	waitStartTime := time.Now()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second):
		}

		status, statusErr := mysqld.ReplicationStatus()
		if statusErr != nil {
			log.Warningf("Error getting replication status: %v", statusErr)
			continue
		}
		if status.Position.AtLeast(primaryPos) {
			// We're caught up on replication to at least the point the primary
			// was at when this vtbackup run started.
			log.Infof("Replication caught up to %v after %v", status.Position, time.Since(waitStartTime))
			break
		}
		if !status.Healthy() {
			log.Warning("Replication has stopped before backup could be taken. Trying to restart replication.")
			if err := startReplication(ctx, mysqld, topoServer); err != nil {
				log.Warningf("Failed to restart replication: %v", err)
			}
		}
	}

	// Stop replication and see where we are.
	if err := mysqld.StopReplication(nil); err != nil {
		return fmt.Errorf("can't stop replication: %v", err)
	}

	// Did we make any progress?
	status, err := mysqld.ReplicationStatus()
	if err != nil {
		return fmt.Errorf("can't get replication status: %v", err)
	}
	log.Infof("Replication caught up to %v", status.Position)
	if !status.Position.AtLeast(primaryPos) && status.Position.Equal(restorePos) {
		return fmt.Errorf("not taking backup: replication did not make any progress from restore point: %v", restorePos)
	}

	// Re-enable redo logging.
	if disabledRedoLog {
		if err := mysqld.EnableRedoLog(ctx); err != nil {
			return fmt.Errorf("failed to re-enable redo log: %v", err)
		}
	}

	if restartBeforeBackup {
		log.Info("Proceeding with clean MySQL shutdown and startup to flush all buffers.")
		// Prep for full/clean shutdown (not typically the default)
		if err := mysqld.ExecuteSuperQuery(ctx, "SET GLOBAL innodb_fast_shutdown=0"); err != nil {
			return fmt.Errorf("could not prep for full shutdown: %v", err)
		}
		// Shutdown, waiting for it to finish
		if err := mysqld.Shutdown(ctx, mycnf, true); err != nil {
			return fmt.Errorf("something went wrong during full MySQL shutdown: %v", err)
		}
		// Start MySQL, waiting for it to come up
		if err := mysqld.Start(ctx, mycnf); err != nil {
			return fmt.Errorf("could not start MySQL after full shutdown: %v", err)
		}
	}

	// Now we can take a new backup.
	if err := mysqlctl.Backup(ctx, backupParams); err != nil {
		return fmt.Errorf("error taking backup: %v", err)
	}

	// Return a non-zero exit code if we didn't meet the replication position
	// goal, even though we took a backup that pushes the high-water mark up.
	if !status.Position.AtLeast(primaryPos) {
		return fmt.Errorf("replication caught up to %v but didn't make it to the goal of %v; a backup was taken anyway to save partial progress, but the operation should still be retried since not all expected data is backed up", status.Position, primaryPos)
	}
	log.Info("Backup successful.")
	return nil
}

func resetReplication(ctx context.Context, pos mysql.Position, mysqld mysqlctl.MysqlDaemon) error {
	cmds := []string{
		"STOP SLAVE",
		"RESET SLAVE ALL", // "ALL" makes it forget replication source host:port.
	}
	if err := mysqld.ExecuteSuperQueryList(ctx, cmds); err != nil {
		return vterrors.Wrap(err, "failed to reset replication")
	}

	// Check if we have a position to resume from, if not reset to the beginning of time
	if !pos.IsZero() {
		// Set the position at which to resume from the replication source.
		if err := mysqld.SetReplicationPosition(ctx, pos); err != nil {
			return vterrors.Wrap(err, "failed to set replica position")
		}
	} else {
		if err := mysqld.ResetReplication(ctx); err != nil {
			return vterrors.Wrap(err, "failed to reset replication")
		}
	}
	return nil
}

func startReplication(ctx context.Context, mysqld mysqlctl.MysqlDaemon, topoServer *topo.Server) error {
	si, err := topoServer.GetShard(ctx, initKeyspace, initShard)
	if err != nil {
		return vterrors.Wrap(err, "can't read shard")
	}
	if topoproto.TabletAliasIsZero(si.PrimaryAlias) {
		// Normal tablets will sit around waiting to be reparented in this case.
		// Since vtbackup is a batch job, we just have to fail.
		return fmt.Errorf("can't start replication after restore: shard %v/%v has no primary", initKeyspace, initShard)
	}
	// TODO(enisoc): Support replicating from another replica, preferably in the
	// same cell, preferably rdonly, to reduce load on the primary.
	ti, err := topoServer.GetTablet(ctx, si.PrimaryAlias)
	if err != nil {
		return vterrors.Wrapf(err, "Cannot read primary tablet %v", si.PrimaryAlias)
	}

	// Stop replication (in case we're restarting), set replication source, and start replication.
	if err := mysqld.SetReplicationSource(ctx, ti.Tablet.MysqlHostname, int(ti.Tablet.MysqlPort), true /* stopReplicationBefore */, true /* startReplicationAfter */); err != nil {
		return vterrors.Wrap(err, "MysqlDaemon.SetReplicationSource failed")
	}
	return nil
}

func getPrimaryPosition(ctx context.Context, tmc tmclient.TabletManagerClient, ts *topo.Server) (mysql.Position, error) {
	si, err := ts.GetShard(ctx, initKeyspace, initShard)
	if err != nil {
		return mysql.Position{}, vterrors.Wrap(err, "can't read shard")
	}
	if topoproto.TabletAliasIsZero(si.PrimaryAlias) {
		// Normal tablets will sit around waiting to be reparented in this case.
		// Since vtbackup is a batch job, we just have to fail.
		return mysql.Position{}, fmt.Errorf("shard %v/%v has no primary", initKeyspace, initShard)
	}
	ti, err := ts.GetTablet(ctx, si.PrimaryAlias)
	if err != nil {
		return mysql.Position{}, fmt.Errorf("can't get primary tablet record %v: %v", topoproto.TabletAliasString(si.PrimaryAlias), err)
	}
	posStr, err := tmc.PrimaryPosition(ctx, ti.Tablet)
	if err != nil {
		return mysql.Position{}, fmt.Errorf("can't get primary replication position: %v", err)
	}
	pos, err := mysql.DecodePosition(posStr)
	if err != nil {
		return mysql.Position{}, fmt.Errorf("can't decode primary replication position %q: %v", posStr, err)
	}
	return pos, nil
}

// retryOnError keeps calling the given function until it succeeds, or the given
// Context is done. It waits an exponentially increasing amount of time between
// retries to avoid hot-looping. The only time this returns an error is if the
// Context is cancelled.
func retryOnError(ctx context.Context, fn func() error) error {
	waitTime := 1 * time.Second

	for {
		err := fn()
		if err == nil {
			return nil
		}
		log.Errorf("Waiting %v to retry after error: %v", waitTime, err)

		select {
		case <-ctx.Done():
			log.Errorf("Not retrying after error: %v", ctx.Err())
			return ctx.Err()
		case <-time.After(waitTime):
			waitTime *= 2
		}
	}
}

func pruneBackups(ctx context.Context, backupStorage backupstorage.BackupStorage, backupDir string) error {
	if minRetentionTime == 0 {
		log.Info("Pruning of old backups is disabled.")
		return nil
	}
	backups, err := backupStorage.ListBackups(ctx, backupDir)
	if err != nil {
		return fmt.Errorf("can't list backups: %v", err)
	}
	numBackups := len(backups)
	if numBackups <= minRetentionCount {
		log.Infof("Found %v backups. Not pruning any since this is within the min_retention_count of %v.", numBackups, minRetentionCount)
		return nil
	}
	// We have more than the minimum retention count, so we could afford to
	// prune some. See if any are beyond the minimum retention time.
	// ListBackups returns them sorted by oldest first.
	for _, backup := range backups {
		backupTime, err := parseBackupTime(backup.Name())
		if err != nil {
			return err
		}
		if time.Since(backupTime) < minRetentionTime {
			// The oldest remaining backup is not old enough to prune.
			log.Infof("Oldest backup taken at %v has not reached min_retention_time of %v. Nothing left to prune.", backupTime, minRetentionTime)
			break
		}
		// Remove the backup.
		log.Infof("Removing old backup %v from %v, since it's older than min_retention_time of %v", backup.Name(), backupDir, minRetentionTime)
		if err := backupStorage.RemoveBackup(ctx, backupDir, backup.Name()); err != nil {
			return fmt.Errorf("couldn't remove backup %v from %v: %v", backup.Name(), backupDir, err)
		}
		// We successfully removed one backup. Can we afford to prune any more?
		numBackups--
		if numBackups == minRetentionCount {
			log.Infof("Successfully pruned backup count to min_retention_count of %v.", minRetentionCount)
			break
		}
	}
	return nil
}

func parseBackupTime(name string) (time.Time, error) {
	// Backup names are formatted as "date.time.tablet-alias".
	parts := strings.Split(name, ".")
	if len(parts) != 3 {
		return time.Time{}, fmt.Errorf("backup name not in expected format (date.time.tablet-alias): %v", name)
	}
	backupTime, err := time.Parse(mysqlctl.BackupTimestampFormat, fmt.Sprintf("%s.%s", parts[0], parts[1]))
	if err != nil {
		return time.Time{}, fmt.Errorf("can't parse timestamp from backup %q: %v", name, err)
	}
	return backupTime, nil
}

func shouldBackup(ctx context.Context, topoServer *topo.Server, backupStorage backupstorage.BackupStorage, backupDir string) (bool, error) {
	// Look for the most recent, complete backup.
	backups, err := backupStorage.ListBackups(ctx, backupDir)
	if err != nil {
		return false, fmt.Errorf("can't list backups: %v", err)
	}
	lastBackup := lastCompleteBackup(ctx, backups)

	// Check preconditions for initial_backup mode.
	if initialBackup {
		// Check if any backups for the shard already exist in this backup storage location.
		if lastBackup != nil {
			log.Infof("At least one complete backup already exists, so there's no need to seed an empty backup. Doing nothing.")
			return false, nil
		}

		// Check whether the shard exists.
		_, shardErr := topoServer.GetShard(ctx, initKeyspace, initShard)
		switch {
		case shardErr == nil:
			// If the shard exists, we should make sure none of the tablets are
			// already in a serving state, because then they might have data
			// that conflicts with the initial backup we're about to take.
			tablets, err := topoServer.GetTabletMapForShard(ctx, initKeyspace, initShard)
			if err != nil {
				// We don't know for sure whether any tablets are serving,
				// so it's not safe to continue.
				return false, fmt.Errorf("failed to check whether shard %v/%v has serving tablets before doing initial backup: %v", initKeyspace, initShard, err)
			}
			for tabletAlias, tablet := range tablets {
				// Check if any tablet has its type set to one of the serving types.
				// If so, it's too late to do an initial backup.
				if tablet.IsInServingGraph() {
					return false, fmt.Errorf("refusing to upload initial backup of empty database: the shard %v/%v already has at least one tablet that may be serving (%v); you must take a backup from a live tablet instead", initKeyspace, initShard, tabletAlias)
				}
			}
			log.Infof("Shard %v/%v exists but has no serving tablets.", initKeyspace, initShard)
		case topo.IsErrType(shardErr, topo.NoNode):
			// The shard doesn't exist, so we know no tablets are running.
			log.Infof("Shard %v/%v doesn't exist; assuming it has no serving tablets.", initKeyspace, initShard)
		default:
			// If we encounter any other error, we don't know for sure whether
			// the shard exists, so it's not safe to continue.
			return false, fmt.Errorf("failed to check whether shard %v/%v exists before doing initial backup: %v", initKeyspace, initShard, shardErr)
		}

		log.Infof("Shard %v/%v has no existing backups. Creating initial backup.", initKeyspace, initShard)
		return true, nil
	}

	// We need at least one backup so we can restore first, unless the user explicitly says we don't
	if len(backups) == 0 && !allowFirstBackup {
		return false, fmt.Errorf("no existing backups to restore from; backup is not possible since --initial_backup flag was not enabled")
	}
	if lastBackup == nil {
		if allowFirstBackup {
			// There's no complete backup, but we were told to take one from scratch anyway.
			return true, nil
		}
		return false, fmt.Errorf("no complete backups to restore from; backup is not possible since --initial_backup flag was not enabled")
	}

	// Has it been long enough since the last complete backup to need a new one?
	if minBackupInterval == 0 {
		// No minimum interval is set, so always backup.
		return true, nil
	}
	lastBackupTime, err := parseBackupTime(lastBackup.Name())
	if err != nil {
		return false, fmt.Errorf("can't check last backup time: %v", err)
	}
	if elapsedTime := time.Since(lastBackupTime); elapsedTime < minBackupInterval {
		// It hasn't been long enough yet.
		log.Infof("Skipping backup since only %v has elapsed since the last backup at %v, which is less than the min_backup_interval of %v.", elapsedTime, lastBackupTime, minBackupInterval)
		return false, nil
	}
	// It has been long enough.
	log.Infof("The last backup was taken at %v, which is older than the min_backup_interval of %v.", lastBackupTime, minBackupInterval)
	return true, nil
}

func lastCompleteBackup(ctx context.Context, backups []backupstorage.BackupHandle) backupstorage.BackupHandle {
	if len(backups) == 0 {
		return nil
	}

	// Backups are sorted in ascending order by start time. Start at the end.
	for i := len(backups) - 1; i >= 0; i-- {
		// Check if this backup is complete by looking for the MANIFEST file,
		// which is written at the end after all files are uploaded.
		backup := backups[i]
		if err := checkBackupComplete(ctx, backup); err != nil {
			log.Warningf("Ignoring backup %v because it's incomplete: %v", backup.Name(), err)
			continue
		}
		return backup
	}

	return nil
}

func checkBackupComplete(ctx context.Context, backup backupstorage.BackupHandle) error {
	manifest, err := mysqlctl.GetBackupManifest(ctx, backup)
	if err != nil {
		return fmt.Errorf("can't get backup MANIFEST: %v", err)
	}

	log.Infof("Found complete backup %v taken at position %v", backup.Name(), manifest.Position.String())
	return nil
}