vitess.io/vitess@v0.16.2/go/test/endtoend/backup/vtctlbackup/backup_utils.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package vtctlbackup 18 19 import ( 20 "bufio" 21 "encoding/json" 22 "fmt" 23 "os" 24 "os/exec" 25 "path" 26 "strings" 27 "syscall" 28 "testing" 29 "time" 30 31 "vitess.io/vitess/go/mysql" 32 "vitess.io/vitess/go/sqltypes" 33 "vitess.io/vitess/go/textutil" 34 "vitess.io/vitess/go/vt/mysqlctl" 35 "vitess.io/vitess/go/vt/proto/topodata" 36 "vitess.io/vitess/go/vt/proto/vtrpc" 37 "vitess.io/vitess/go/vt/sqlparser" 38 "vitess.io/vitess/go/vt/vterrors" 39 40 "github.com/stretchr/testify/assert" 41 "github.com/stretchr/testify/require" 42 43 "vitess.io/vitess/go/test/endtoend/cluster" 44 ) 45 46 // constants for test variants 47 const ( 48 XtraBackup = iota 49 Backup 50 Mysqlctld 51 timeout = time.Duration(60 * time.Second) 52 ) 53 54 var ( 55 primary *cluster.Vttablet 56 replica1 *cluster.Vttablet 57 replica2 *cluster.Vttablet 58 localCluster *cluster.LocalProcessCluster 59 newInitDBFile string 60 useXtrabackup bool 61 cell = cluster.DefaultCell 62 63 hostname = "localhost" 64 keyspaceName = "ks" 65 dbPassword = "VtDbaPass" 66 shardKsName = fmt.Sprintf("%s/%s", keyspaceName, shardName) 67 dbCredentialFile string 68 shardName = "0" 69 commonTabletArg = []string{ 70 "--vreplication_healthcheck_topology_refresh", "1s", 71 "--vreplication_healthcheck_retry_delay", "1s", 72 "--vreplication_retry_delay", "1s", 73 "--degraded_threshold", "5s", 74 "--lock_tables_timeout", "5s", 75 "--watch_replication_stream", 76 "--enable_replication_reporter", 77 "--serving_state_grace_period", "1s", 78 } 79 80 vtInsertTest = ` 81 create table vt_insert_test ( 82 id bigint auto_increment, 83 msg varchar(64), 84 primary key (id) 85 ) Engine=InnoDB` 86 ) 87 88 type CompressionDetails struct { 89 CompressorEngineName string 90 ExternalCompressorCmd string 91 ExternalCompressorExt string 92 ExternalDecompressorCmd string 93 } 94 95 // LaunchCluster : starts the cluster as per given params. 96 func LaunchCluster(setupType int, streamMode string, stripes int, cDetails *CompressionDetails) (int, error) { 97 localCluster = cluster.NewCluster(cell, hostname) 98 99 // Start topo server 100 err := localCluster.StartTopo() 101 if err != nil { 102 return 1, err 103 } 104 105 // Start keyspace 106 localCluster.Keyspaces = []cluster.Keyspace{ 107 { 108 Name: keyspaceName, 109 Shards: []cluster.Shard{ 110 { 111 Name: shardName, 112 }, 113 }, 114 }, 115 } 116 shard := &localCluster.Keyspaces[0].Shards[0] 117 118 dbCredentialFile = cluster.WriteDbCredentialToTmp(localCluster.TmpDirectory) 119 initDb, _ := os.ReadFile(path.Join(os.Getenv("VTROOT"), "/config/init_db.sql")) 120 sql := string(initDb) 121 newInitDBFile = path.Join(localCluster.TmpDirectory, "init_db_with_passwords.sql") 122 sql = sql + cluster.GetPasswordUpdateSQL(localCluster) 123 err = os.WriteFile(newInitDBFile, []byte(sql), 0666) 124 if err != nil { 125 return 1, err 126 } 127 128 extraArgs := []string{"--db-credentials-file", dbCredentialFile} 129 commonTabletArg = append(commonTabletArg, "--db-credentials-file", dbCredentialFile) 130 131 // Update arguments for xtrabackup 132 if setupType == XtraBackup { 133 useXtrabackup = true 134 135 xtrabackupArgs := []string{ 136 "--backup_engine_implementation", "xtrabackup", 137 fmt.Sprintf("--xtrabackup_stream_mode=%s", streamMode), 138 "--xtrabackup_user=vt_dba", 139 fmt.Sprintf("--xtrabackup_stripes=%d", stripes), 140 "--xtrabackup_backup_flags", fmt.Sprintf("--password=%s", dbPassword), 141 } 142 143 // if streamMode is xbstream, add some additional args to test other xtrabackup flags 144 if streamMode == "xbstream" { 145 xtrabackupArgs = append(xtrabackupArgs, "--xtrabackup_prepare_flags", fmt.Sprintf("--use-memory=100M")) //nolint 146 } 147 148 commonTabletArg = append(commonTabletArg, xtrabackupArgs...) 149 } 150 151 commonTabletArg = append(commonTabletArg, getCompressorArgs(cDetails)...) 152 153 var mysqlProcs []*exec.Cmd 154 for i := 0; i < 3; i++ { 155 tabletType := "replica" 156 if i == 0 { 157 tabletType = "primary" 158 } 159 tablet := localCluster.NewVttabletInstance(tabletType, 0, cell) 160 tablet.VttabletProcess = localCluster.VtprocessInstanceFromVttablet(tablet, shard.Name, keyspaceName) 161 tablet.VttabletProcess.DbPassword = dbPassword 162 tablet.VttabletProcess.ExtraArgs = commonTabletArg 163 tablet.VttabletProcess.SupportsBackup = true 164 165 if setupType == Mysqlctld { 166 tablet.MysqlctldProcess = *cluster.MysqlCtldProcessInstance(tablet.TabletUID, tablet.MySQLPort, localCluster.TmpDirectory) 167 tablet.MysqlctldProcess.InitDBFile = newInitDBFile 168 tablet.MysqlctldProcess.ExtraArgs = extraArgs 169 tablet.MysqlctldProcess.Password = tablet.VttabletProcess.DbPassword 170 if err := tablet.MysqlctldProcess.Start(); err != nil { 171 return 1, err 172 } 173 shard.Vttablets = append(shard.Vttablets, tablet) 174 continue 175 } 176 177 tablet.MysqlctlProcess = *cluster.MysqlCtlProcessInstance(tablet.TabletUID, tablet.MySQLPort, localCluster.TmpDirectory) 178 tablet.MysqlctlProcess.InitDBFile = newInitDBFile 179 tablet.MysqlctlProcess.ExtraArgs = extraArgs 180 proc, err := tablet.MysqlctlProcess.StartProcess() 181 if err != nil { 182 return 1, err 183 } 184 mysqlProcs = append(mysqlProcs, proc) 185 186 shard.Vttablets = append(shard.Vttablets, tablet) 187 } 188 for _, proc := range mysqlProcs { 189 if err := proc.Wait(); err != nil { 190 return 1, err 191 } 192 } 193 primary = shard.Vttablets[0] 194 replica1 = shard.Vttablets[1] 195 replica2 = shard.Vttablets[2] 196 197 if err := localCluster.VtctlclientProcess.InitTablet(primary, cell, keyspaceName, hostname, shard.Name); err != nil { 198 return 1, err 199 } 200 if err := localCluster.VtctlclientProcess.InitTablet(replica1, cell, keyspaceName, hostname, shard.Name); err != nil { 201 return 1, err 202 } 203 vtctldClientProcess := cluster.VtctldClientProcessInstance("localhost", localCluster.VtctldProcess.GrpcPort, localCluster.TmpDirectory) 204 _, err = vtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspaceName, "--durability-policy=semi_sync") 205 if err != nil { 206 return 1, err 207 } 208 209 for _, tablet := range []cluster.Vttablet{*primary, *replica1} { 210 if err := tablet.VttabletProcess.CreateDB(keyspaceName); err != nil { 211 return 1, err 212 } 213 if err := tablet.VttabletProcess.Setup(); err != nil { 214 return 1, err 215 } 216 } 217 218 if err := localCluster.VtctlclientProcess.InitShardPrimary(keyspaceName, shard.Name, cell, primary.TabletUID); err != nil { 219 return 1, err 220 } 221 222 if err := localCluster.StartVTOrc(keyspaceName); err != nil { 223 return 1, err 224 } 225 226 return 0, nil 227 } 228 229 func getCompressorArgs(cDetails *CompressionDetails) []string { 230 var args []string 231 232 if cDetails == nil { 233 return args 234 } 235 236 if cDetails.CompressorEngineName != "" { 237 args = append(args, fmt.Sprintf("--compression-engine-name=%s", cDetails.CompressorEngineName)) 238 } 239 if cDetails.ExternalCompressorCmd != "" { 240 args = append(args, fmt.Sprintf("--external-compressor=%s", cDetails.ExternalCompressorCmd)) 241 } 242 if cDetails.ExternalCompressorExt != "" { 243 args = append(args, fmt.Sprintf("--external-compressor-extension=%s", cDetails.ExternalCompressorExt)) 244 } 245 if cDetails.ExternalDecompressorCmd != "" { 246 args = append(args, fmt.Sprintf("--external-decompressor=%s", cDetails.ExternalDecompressorCmd)) 247 } 248 249 return args 250 251 } 252 253 // update arguments with new values of compressionDetail. 254 func updateCompressorArgs(commonArgs []string, cDetails *CompressionDetails) []string { 255 if cDetails == nil { 256 return commonArgs 257 } 258 259 // remove if any compression flag already exists 260 for i, s := range commonArgs { 261 if strings.Contains(s, "--compression-engine-name") || strings.Contains(s, "--external-compressor") || 262 strings.Contains(s, "--external-compressor-extension") || strings.Contains(s, "--external-decompressor") { 263 commonArgs = append(commonArgs[:i], commonArgs[i+1:]...) 264 } 265 } 266 267 // update it with new values 268 commonArgs = append(commonArgs, getCompressorArgs(cDetails)...) 269 return commonArgs 270 } 271 272 // TearDownCluster shuts down all cluster processes 273 func TearDownCluster() { 274 localCluster.Teardown() 275 } 276 277 // TestBackup runs all the backup tests 278 func TestBackup(t *testing.T, setupType int, streamMode string, stripes int, cDetails *CompressionDetails, runSpecific []string) error { 279 verStr, err := mysqlctl.GetVersionString() 280 require.NoError(t, err) 281 _, vers, err := mysqlctl.ParseVersionString(verStr) 282 require.NoError(t, err) 283 switch streamMode { 284 case "xbstream": 285 if vers.Major < 8 { 286 t.Logf("Skipping xtrabackup tests with --xtrabackup_stream_mode=xbstream as those are only tested on XtraBackup/MySQL 8.0+") 287 return nil 288 } 289 case "", "tar": // streaming method of tar is the default for the vttablet --xtrabackup_stream_mode flag 290 // XtraBackup 8.0 must be used with MySQL 8.0 and it no longer supports tar as a stream method: 291 // https://docs.percona.com/percona-xtrabackup/2.4/innobackupex/streaming_backups_innobackupex.html 292 // https://docs.percona.com/percona-xtrabackup/8.0/xtrabackup_bin/backup.streaming.html 293 if vers.Major > 5 { 294 t.Logf("Skipping xtrabackup tests with --xtrabackup_stream_mode=tar as tar is no longer a streaming option in XtraBackup 8.0") 295 return nil 296 } 297 default: 298 require.FailNow(t, fmt.Sprintf("Unsupported xtrabackup stream mode: %s", streamMode)) 299 } 300 301 testMethods := []struct { 302 name string 303 method func(t *testing.T) 304 }{ 305 { 306 name: "TestReplicaBackup", 307 method: func(t *testing.T) { 308 vtctlBackup(t, "replica") 309 }, 310 }, // 311 { 312 name: "TestRdonlyBackup", 313 method: func(t *testing.T) { 314 vtctlBackup(t, "rdonly") 315 }, 316 }, // 317 { 318 name: "TestPrimaryBackup", 319 method: primaryBackup, 320 }, 321 { 322 name: "TestPrimaryReplicaSameBackup", 323 method: primaryReplicaSameBackup, 324 }, // 325 { 326 name: "primaryReplicaSameBackupModifiedCompressionEngine", 327 method: primaryReplicaSameBackupModifiedCompressionEngine, 328 }, // 329 { 330 name: "TestRestoreOldPrimaryByRestart", 331 method: restoreOldPrimaryByRestart, 332 }, // 333 { 334 name: "TestRestoreOldPrimaryInPlace", 335 method: restoreOldPrimaryInPlace, 336 }, // 337 { 338 name: "TestTerminatedRestore", 339 method: terminatedRestore, 340 }, // 341 } 342 343 defer cluster.PanicHandler(t) 344 // setup cluster for the testing 345 code, err := LaunchCluster(setupType, streamMode, stripes, cDetails) 346 require.Nilf(t, err, "setup failed with status code %d", code) 347 348 // Teardown the cluster 349 defer TearDownCluster() 350 351 // Run all the backup tests 352 for _, test := range testMethods { 353 if len(runSpecific) > 0 && !isRegistered(test.name, runSpecific) { 354 continue 355 } 356 if retVal := t.Run(test.name, test.method); !retVal { 357 return vterrors.Errorf(vtrpc.Code_UNKNOWN, "test failure: %s", test.name) 358 } 359 } 360 return nil 361 } 362 363 func isRegistered(name string, runlist []string) bool { 364 for _, f := range runlist { 365 if f == name { 366 return true 367 } 368 } 369 return false 370 } 371 372 type restoreMethod func(t *testing.T, tablet *cluster.Vttablet) 373 374 // 1. create a shard with primary and replica1 only 375 // 2. run InitShardPrimary 376 // 3. insert some data 377 // 4. take a backup on primary and save the timestamp 378 // 5. bring up tablet_replica2 after the fact, let it restore the (latest/second) backup 379 // 6. check all data is right (before+after backup data) 380 // 7. insert more data on the primary 381 // 8. take another backup 382 // 9. verify that we now have 2 backups 383 // 10. do a PRS to make the original primary a replica so that we can do a restore there 384 // 11. Delete+teardown the new primary so that we can restore the first backup on the original 385 // primary to confirm we don't have the data from #7 386 // 12. restore first backup on the original primary tablet using the first backup timstamp 387 // 13. verify that don't have the data added after the first backup 388 // 14. remove the backups 389 func primaryBackup(t *testing.T) { 390 // Having the VTOrc in this test causes a lot of flakiness. For example when we delete the tablet `replica2` which 391 // is the current primary and then try to restore from backup the old primary (`primary.Alias`), but before that sometimes the VTOrc 392 // promotes the `replica1` to primary right after we delete the replica2 (current primary). 393 // This can result in unexpected behavior. Therefore, disabling the VTOrc in this test to remove flakiness. 394 localCluster.DisableVTOrcRecoveries(t) 395 defer func() { 396 localCluster.EnableVTOrcRecoveries(t) 397 }() 398 verifyInitialReplication(t) 399 400 output, err := localCluster.VtctlclientProcess.ExecuteCommandWithOutput("Backup", primary.Alias) 401 require.Error(t, err) 402 assert.Contains(t, output, "type PRIMARY cannot take backup. if you really need to do this, rerun the backup command with --allow_primary") 403 404 localCluster.VerifyBackupCount(t, shardKsName, 0) 405 406 err = localCluster.VtctlclientProcess.ExecuteCommand("Backup", "--", "--allow_primary=true", primary.Alias) 407 require.Nil(t, err) 408 409 // We'll restore this on the primary later to test restores using a backup timestamp 410 firstBackupTimestamp := time.Now().UTC().Format(mysqlctl.BackupTimestampFormat) 411 412 backups := localCluster.VerifyBackupCount(t, shardKsName, 1) 413 assert.Contains(t, backups[0], primary.Alias) 414 415 _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true) 416 require.Nil(t, err) 417 418 restoreWaitForBackup(t, "replica", nil, true) 419 err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, timeout) 420 require.Nil(t, err) 421 422 // Verify that we have all the new data -- we should have 2 records now... 423 // And only 1 record after we restore using the first backup timestamp 424 cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 2) 425 426 err = localCluster.VtctlclientProcess.ExecuteCommand("Backup", "--", "--allow_primary=true", primary.Alias) 427 require.Nil(t, err) 428 429 backups = localCluster.VerifyBackupCount(t, shardKsName, 2) 430 assert.Contains(t, backups[1], primary.Alias) 431 432 // Perform PRS to demote the primary tablet (primary) so that we can do a restore there and verify we don't have the 433 // data from after the older/first backup 434 err = localCluster.VtctlclientProcess.ExecuteCommand("PlannedReparentShard", "--", 435 "--keyspace_shard", shardKsName, 436 "--new_primary", replica2.Alias) 437 require.Nil(t, err) 438 439 // Delete the current primary tablet (replica2) so that the original primary tablet (primary) can be restored from the 440 // older/first backup w/o it replicating the subsequent insert done after the first backup was taken 441 err = localCluster.VtctlclientProcess.ExecuteCommand("DeleteTablet", "--", "--allow_primary=true", replica2.Alias) 442 require.Nil(t, err) 443 err = replica2.VttabletProcess.TearDown() 444 require.Nil(t, err) 445 446 // Restore the older/first backup -- using the timestamp we saved -- on the original primary tablet (primary) 447 err = localCluster.VtctlclientProcess.ExecuteCommand("RestoreFromBackup", "--", "--backup_timestamp", firstBackupTimestamp, primary.Alias) 448 require.Nil(t, err) 449 450 // Re-init the shard -- making the original primary tablet (primary) primary again -- for subsequent tests 451 err = localCluster.VtctlclientProcess.InitShardPrimary(keyspaceName, shardName, cell, primary.TabletUID) 452 require.Nil(t, err) 453 454 // Verify that we don't have the record created after the older/first backup 455 cluster.VerifyRowsInTablet(t, primary, keyspaceName, 1) 456 457 verifyAfterRemovingBackupNoBackupShouldBePresent(t, backups) 458 require.Nil(t, err) 459 460 _, err = primary.VttabletProcess.QueryTablet("DROP TABLE vt_insert_test", keyspaceName, true) 461 require.Nil(t, err) 462 } 463 464 // Test a primary and replica from the same backup. 465 // 466 // Check that a replica and primary both restored from the same backup 467 // can replicate successfully. 468 func primaryReplicaSameBackup(t *testing.T) { 469 // insert data on primary, wait for replica to get it 470 verifyInitialReplication(t) 471 472 // backup the replica 473 err := localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias) 474 require.Nil(t, err) 475 476 // insert more data on the primary 477 _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true) 478 require.Nil(t, err) 479 480 // now bring up the other replica, letting it restore from backup. 481 restoreWaitForBackup(t, "replica", nil, true) 482 err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, timeout) 483 require.Nil(t, err) 484 485 // check the new replica has the data 486 cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 2) 487 488 // Promote replica2 to primary 489 err = localCluster.VtctlclientProcess.ExecuteCommand("PlannedReparentShard", "--", 490 "--keyspace_shard", shardKsName, 491 "--new_primary", replica2.Alias) 492 require.Nil(t, err) 493 494 // insert more data on replica2 (current primary) 495 _, err = replica2.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test3')", keyspaceName, true) 496 require.Nil(t, err) 497 498 // Force replica1 to restore from backup. 499 verifyRestoreTablet(t, replica1, "SERVING") 500 501 // wait for replica1 to catch up. 502 cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 3) 503 504 // This is to test that replicationPosition is processed correctly 505 // while doing backup/restore after a reparent. 506 // It is written into the MANIFEST and read back from the MANIFEST. 507 // 508 // Take another backup on the replica. 509 err = localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias) 510 require.Nil(t, err) 511 512 // Insert more data on replica2 (current primary). 513 _, err = replica2.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test4')", keyspaceName, true) 514 require.Nil(t, err) 515 516 // Force replica1 to restore from backup. 517 verifyRestoreTablet(t, replica1, "SERVING") 518 519 cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 4) 520 err = replica2.VttabletProcess.TearDown() 521 require.Nil(t, err) 522 restartPrimaryAndReplica(t) 523 } 524 525 // Test a primary and replica from the same backup. 526 // 527 // Check that a replica and primary both restored from the same backup 528 // We change compression alogrithm in between but it should not break any restore functionality 529 func primaryReplicaSameBackupModifiedCompressionEngine(t *testing.T) { 530 // insert data on primary, wait for replica to get it 531 verifyInitialReplication(t) 532 533 // TODO: The following Sleep in introduced as it seems like the previous step doesn't fully complete, causing 534 // this test to be flaky. Sleep seems to solve the problem. Need to fix this in a better way and Wait for 535 // previous test to complete (suspicion: MySQL does not fully start) 536 time.Sleep(5 * time.Second) 537 538 // backup the replica 539 err := localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias) 540 require.Nil(t, err) 541 542 // insert more data on the primary 543 _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true) 544 require.Nil(t, err) 545 546 // now bring up the other replica, with change in compression engine 547 // this is to verify that restore will read engine name from manifest instead of reading the new values 548 cDetails := &CompressionDetails{ 549 CompressorEngineName: "pgzip", 550 ExternalCompressorCmd: "gzip -c", 551 ExternalCompressorExt: ".gz", 552 ExternalDecompressorCmd: "", 553 } 554 restoreWaitForBackup(t, "replica", cDetails, false) 555 err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, timeout) 556 require.Nil(t, err) 557 558 // check the new replica has the data 559 cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 2) 560 561 // Promote replica2 to primary 562 err = localCluster.VtctlclientProcess.ExecuteCommand("PlannedReparentShard", "--", 563 "--keyspace_shard", shardKsName, 564 "--new_primary", replica2.Alias) 565 require.Nil(t, err) 566 567 // insert more data on replica2 (current primary) 568 _, err = replica2.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test3')", keyspaceName, true) 569 require.Nil(t, err) 570 571 // Force replica1 to restore from backup. 572 verifyRestoreTablet(t, replica1, "SERVING") 573 574 // wait for replica1 to catch up. 575 cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 3) 576 577 // Promote replica1 to primary 578 err = localCluster.VtctlclientProcess.ExecuteCommand("PlannedReparentShard", "--", 579 "--keyspace_shard", shardKsName, 580 "--new_primary", replica1.Alias) 581 require.Nil(t, err) 582 583 // Insert more data on replica1 (current primary). 584 _, err = replica1.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test4')", keyspaceName, true) 585 require.Nil(t, err) 586 587 // wait for replica2 to catch up. 588 cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 4) 589 590 // Now take replica2 backup with gzip (new compressor) 591 err = localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica2.Alias) 592 require.Nil(t, err) 593 594 // Force replica2 to restore from backup. 595 verifyRestoreTablet(t, replica2, "SERVING") 596 cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 4) 597 err = replica2.VttabletProcess.TearDown() 598 require.Nil(t, err) 599 restartPrimaryAndReplica(t) 600 } 601 602 func restoreOldPrimaryByRestart(t *testing.T) { 603 testRestoreOldPrimary(t, restoreUsingRestart) 604 } 605 606 func restoreOldPrimaryInPlace(t *testing.T) { 607 testRestoreOldPrimary(t, restoreInPlace) 608 } 609 610 // Test that a former primary replicates correctly after being restored. 611 // 612 // - Take a backup. 613 // - Reparent from old primary to new primary. 614 // - Force old primary to restore from a previous backup using restore_method. 615 // 616 // Args: 617 // restore_method: function accepting one parameter of type tablet.Tablet, 618 // this function is called to force a restore on the provided tablet 619 func testRestoreOldPrimary(t *testing.T, method restoreMethod) { 620 // insert data on primary, wait for replica to get it 621 verifyInitialReplication(t) 622 623 // TODO: The following Sleep in introduced as it seems like the previous step doesn't fully complete, causing 624 // this test to be flaky. Sleep seems to solve the problem. Need to fix this in a better way and Wait for 625 // previous test to complete (suspicion: MySQL does not fully start) 626 time.Sleep(5 * time.Second) 627 628 // backup the replica 629 err := localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias) 630 require.Nil(t, err) 631 632 // insert more data on the primary 633 _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true) 634 require.Nil(t, err) 635 636 // reparent to replica1 637 err = localCluster.VtctlclientProcess.ExecuteCommand("PlannedReparentShard", "--", 638 "--keyspace_shard", shardKsName, 639 "--new_primary", replica1.Alias) 640 require.Nil(t, err) 641 642 // insert more data to new primary 643 _, err = replica1.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test3')", keyspaceName, true) 644 require.Nil(t, err) 645 646 // force the old primary to restore at the latest backup. 647 method(t, primary) 648 649 // wait for it to catch up. 650 cluster.VerifyRowsInTablet(t, primary, keyspaceName, 3) 651 652 // teardown 653 restartPrimaryAndReplica(t) 654 } 655 656 func restoreUsingRestart(t *testing.T, tablet *cluster.Vttablet) { 657 err := tablet.VttabletProcess.TearDown() 658 require.Nil(t, err) 659 verifyRestoreTablet(t, tablet, "SERVING") 660 } 661 662 func restoreInPlace(t *testing.T, tablet *cluster.Vttablet) { 663 err := localCluster.VtctlclientProcess.ExecuteCommand("RestoreFromBackup", tablet.Alias) 664 require.Nil(t, err) 665 } 666 667 func restartPrimaryAndReplica(t *testing.T) { 668 // Stop all primary, replica tablet and mysql instance 669 stopAllTablets() 670 671 // remove all backups 672 localCluster.RemoveAllBackups(t, shardKsName) 673 // start all tablet and mysql instances 674 var mysqlProcs []*exec.Cmd 675 for _, tablet := range []*cluster.Vttablet{primary, replica1, replica2} { 676 if tablet.MysqlctldProcess.TabletUID > 0 { 677 err := tablet.MysqlctldProcess.Start() 678 require.Nilf(t, err, "error while starting mysqlctld, tabletUID %v", tablet.TabletUID) 679 continue 680 } 681 proc, _ := tablet.MysqlctlProcess.StartProcess() 682 mysqlProcs = append(mysqlProcs, proc) 683 } 684 for _, proc := range mysqlProcs { 685 proc.Wait() 686 } 687 for _, tablet := range []*cluster.Vttablet{primary, replica1} { 688 err := localCluster.VtctlclientProcess.InitTablet(tablet, cell, keyspaceName, hostname, shardName) 689 require.Nil(t, err) 690 err = tablet.VttabletProcess.CreateDB(keyspaceName) 691 require.Nil(t, err) 692 err = tablet.VttabletProcess.Setup() 693 require.Nil(t, err) 694 } 695 err := localCluster.VtctlclientProcess.InitShardPrimary(keyspaceName, shardName, cell, primary.TabletUID) 696 require.Nil(t, err) 697 } 698 699 func stopAllTablets() { 700 var mysqlProcs []*exec.Cmd 701 for _, tablet := range []*cluster.Vttablet{primary, replica1, replica2} { 702 tablet.VttabletProcess.TearDown() 703 if tablet.MysqlctldProcess.TabletUID > 0 { 704 tablet.MysqlctldProcess.Stop() 705 localCluster.VtctlclientProcess.ExecuteCommand("DeleteTablet", "--", "--allow_primary", tablet.Alias) 706 continue 707 } 708 proc, _ := tablet.MysqlctlProcess.StopProcess() 709 mysqlProcs = append(mysqlProcs, proc) 710 localCluster.VtctlclientProcess.ExecuteCommand("DeleteTablet", "--", "--allow_primary", tablet.Alias) 711 } 712 for _, proc := range mysqlProcs { 713 proc.Wait() 714 } 715 for _, tablet := range []*cluster.Vttablet{primary, replica1} { 716 os.RemoveAll(tablet.VttabletProcess.Directory) 717 } 718 } 719 720 func terminatedRestore(t *testing.T) { 721 // insert data on primary, wait for replica to get it 722 verifyInitialReplication(t) 723 724 // TODO: The following Sleep in introduced as it seems like the previous step doesn't fully complete, causing 725 // this test to be flaky. Sleep seems to solve the problem. Need to fix this in a better way and Wait for 726 // previous test to complete (suspicion: MySQL does not fully start) 727 time.Sleep(5 * time.Second) 728 729 // backup the replica 730 err := localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias) 731 require.Nil(t, err) 732 733 // insert more data on the primary 734 _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true) 735 require.Nil(t, err) 736 737 // reparent to replica1 738 err = localCluster.VtctlclientProcess.ExecuteCommand("PlannedReparentShard", "--", 739 "--keyspace_shard", shardKsName, 740 "--new_primary", replica1.Alias) 741 require.Nil(t, err) 742 743 // insert more data to new primary 744 _, err = replica1.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test3')", keyspaceName, true) 745 require.Nil(t, err) 746 747 terminateRestore(t) 748 749 err = localCluster.VtctlclientProcess.ExecuteCommand("RestoreFromBackup", primary.Alias) 750 require.Nil(t, err) 751 752 output, err := localCluster.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", primary.Alias) 753 require.Nil(t, err) 754 755 var tabletPB topodata.Tablet 756 err = json.Unmarshal([]byte(output), &tabletPB) 757 require.Nil(t, err) 758 assert.Equal(t, tabletPB.Type, topodata.TabletType_REPLICA) 759 760 _, err = os.Stat(path.Join(primary.VttabletProcess.Directory, "restore_in_progress")) 761 assert.True(t, os.IsNotExist(err)) 762 763 cluster.VerifyRowsInTablet(t, primary, keyspaceName, 3) 764 stopAllTablets() 765 } 766 767 // test_backup will: 768 // - create a shard with primary and replica1 only 769 // - run InitShardPrimary 770 // - bring up tablet_replica2 concurrently, telling it to wait for a backup 771 // - insert some data 772 // - take a backup 773 // - insert more data on the primary 774 // - wait for tablet_replica2 to become SERVING 775 // - check all data is right (before+after backup data) 776 // - list the backup, remove it 777 // 778 // Args: 779 // tablet_type: 'replica' or 'rdonly'. 780 func vtctlBackup(t *testing.T, tabletType string) { 781 // StopReplication on replica1. We verify that the replication works fine later in 782 // verifyInitialReplication. So this will also check that VTOrc is running. 783 err := localCluster.VtctlclientProcess.ExecuteCommand("StopReplication", replica1.Alias) 784 require.Nil(t, err) 785 786 verifyInitialReplication(t) 787 restoreWaitForBackup(t, tabletType, nil, true) 788 789 err = localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias) 790 require.Nil(t, err) 791 792 backups := localCluster.VerifyBackupCount(t, shardKsName, 1) 793 794 _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true) 795 require.Nil(t, err) 796 797 err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 25*time.Second) 798 require.Nil(t, err) 799 cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 2) 800 801 verifyAfterRemovingBackupNoBackupShouldBePresent(t, backups) 802 err = replica2.VttabletProcess.TearDown() 803 require.Nil(t, err) 804 805 err = localCluster.VtctlclientProcess.ExecuteCommand("DeleteTablet", replica2.Alias) 806 require.Nil(t, err) 807 _, err = primary.VttabletProcess.QueryTablet("DROP TABLE vt_insert_test", keyspaceName, true) 808 require.Nil(t, err) 809 810 } 811 812 func InitTestTable(t *testing.T) { 813 _, err := primary.VttabletProcess.QueryTablet("DROP TABLE IF EXISTS vt_insert_test", keyspaceName, true) 814 require.Nil(t, err) 815 _, err = primary.VttabletProcess.QueryTablet(vtInsertTest, keyspaceName, true) 816 require.Nil(t, err) 817 } 818 819 // This will create schema in primary, insert some data to primary and verify the same data in replica 820 func verifyInitialReplication(t *testing.T) { 821 InitTestTable(t) 822 _, err := primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test1')", keyspaceName, true) 823 require.Nil(t, err) 824 cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 1) 825 } 826 827 // Bring up another replica concurrently, telling it to wait until a backup 828 // is available instead of starting up empty. 829 // 830 // Override the backup engine implementation to a non-existent one for restore. 831 // This setting should only matter for taking new backups. We should be able 832 // to restore a previous backup successfully regardless of this setting. 833 func restoreWaitForBackup(t *testing.T, tabletType string, cDetails *CompressionDetails, fakeImpl bool) { 834 replica2.Type = tabletType 835 replica2.ValidateTabletRestart(t) 836 replicaTabletArgs := commonTabletArg 837 if cDetails != nil { 838 replicaTabletArgs = updateCompressorArgs(replicaTabletArgs, cDetails) 839 } 840 if fakeImpl { 841 replicaTabletArgs = append(replicaTabletArgs, "--backup_engine_implementation", "fake_implementation") 842 } 843 replicaTabletArgs = append(replicaTabletArgs, "--wait_for_backup_interval", "1s") 844 replicaTabletArgs = append(replicaTabletArgs, "--init_tablet_type", tabletType) 845 replica2.VttabletProcess.ExtraArgs = replicaTabletArgs 846 replica2.VttabletProcess.ServingStatus = "" 847 err := replica2.VttabletProcess.Setup() 848 require.Nil(t, err) 849 } 850 851 func RemoveBackup(t *testing.T, backupName string) { 852 err := localCluster.VtctlclientProcess.ExecuteCommand("RemoveBackup", shardKsName, backupName) 853 require.Nil(t, err) 854 } 855 856 func verifyAfterRemovingBackupNoBackupShouldBePresent(t *testing.T, backups []string) { 857 // Remove the backup 858 for _, backup := range backups { 859 RemoveBackup(t, backup) 860 } 861 862 // Now, there should not be no backup 863 localCluster.VerifyBackupCount(t, shardKsName, 0) 864 } 865 866 func verifyRestoreTablet(t *testing.T, tablet *cluster.Vttablet, status string) { 867 tablet.ValidateTabletRestart(t) 868 tablet.VttabletProcess.ServingStatus = "" 869 err := tablet.VttabletProcess.Setup() 870 require.Nil(t, err) 871 if status != "" { 872 err = tablet.VttabletProcess.WaitForTabletStatusesForTimeout([]string{status}, 25*time.Second) 873 require.Nil(t, err) 874 } 875 // We restart replication here because semi-sync will not be set correctly on tablet startup since 876 // we deprecated enable_semi_sync. StartReplication RPC fixes the semi-sync settings by consulting the 877 // durability policies set. 878 err = localCluster.VtctlclientProcess.ExecuteCommand("StopReplication", tablet.Alias) 879 require.NoError(t, err) 880 err = localCluster.VtctlclientProcess.ExecuteCommand("StartReplication", tablet.Alias) 881 require.NoError(t, err) 882 883 if tablet.Type == "replica" { 884 verifySemiSyncStatus(t, tablet, "ON") 885 } else if tablet.Type == "rdonly" { 886 verifySemiSyncStatus(t, tablet, "OFF") 887 } 888 } 889 890 func verifySemiSyncStatus(t *testing.T, vttablet *cluster.Vttablet, expectedStatus string) { 891 status, err := vttablet.VttabletProcess.GetDBVar("rpl_semi_sync_slave_enabled", keyspaceName) 892 require.Nil(t, err) 893 assert.Equal(t, status, expectedStatus) 894 status, err = vttablet.VttabletProcess.GetDBStatus("rpl_semi_sync_slave_status", keyspaceName) 895 require.Nil(t, err) 896 assert.Equal(t, status, expectedStatus) 897 } 898 899 func terminateRestore(t *testing.T) { 900 stopRestoreMsg := "Copying file 10" 901 if useXtrabackup { 902 stopRestoreMsg = "Restore: Preparing" 903 useXtrabackup = false 904 } 905 906 args := append([]string{"--server", localCluster.VtctlclientProcess.Server, "--alsologtostderr"}, "RestoreFromBackup", "--", primary.Alias) 907 tmpProcess := exec.Command( 908 "vtctlclient", 909 args..., 910 ) 911 912 reader, _ := tmpProcess.StderrPipe() 913 err := tmpProcess.Start() 914 require.Nil(t, err) 915 found := false 916 917 scanner := bufio.NewScanner(reader) 918 919 for scanner.Scan() { 920 text := scanner.Text() 921 if strings.Contains(text, stopRestoreMsg) { 922 if _, err := os.Stat(path.Join(primary.VttabletProcess.Directory, "restore_in_progress")); os.IsNotExist(err) { 923 assert.Fail(t, "restore in progress file missing") 924 } 925 tmpProcess.Process.Signal(syscall.SIGTERM) 926 found = true //nolint 927 return 928 } 929 } 930 assert.True(t, found, "Restore message not found") 931 } 932 933 func vtctlBackupReplicaNoDestroyNoWrites(t *testing.T, tabletType string) (backups []string, destroy func(t *testing.T)) { 934 restoreWaitForBackup(t, tabletType, nil, true) 935 verifyInitialReplication(t) 936 937 err := localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias) 938 require.Nil(t, err) 939 940 backups = localCluster.VerifyBackupCount(t, shardKsName, 1) 941 942 err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 25*time.Second) 943 require.Nil(t, err) 944 945 err = replica2.VttabletProcess.TearDown() 946 require.Nil(t, err) 947 948 err = localCluster.VtctlclientProcess.ExecuteCommand("DeleteTablet", replica2.Alias) 949 require.Nil(t, err) 950 951 destroy = func(t *testing.T) { 952 verifyAfterRemovingBackupNoBackupShouldBePresent(t, backups) 953 } 954 return backups, destroy 955 } 956 957 func GetReplicaPosition(t *testing.T) string { 958 pos, _ := cluster.GetPrimaryPosition(t, *replica1, hostname) 959 return pos 960 } 961 962 func GetReplicaGtidPurged(t *testing.T) string { 963 query := "select @@global.gtid_purged as gtid_purged" 964 rs, err := replica1.VttabletProcess.QueryTablet(query, keyspaceName, true) 965 require.NoError(t, err) 966 row := rs.Named().Row() 967 require.NotNil(t, row) 968 return row.AsString("gtid_purged", "") 969 } 970 971 func InsertRowOnPrimary(t *testing.T, hint string) { 972 if hint == "" { 973 hint = textutil.RandomHash()[:12] 974 } 975 query, err := sqlparser.ParseAndBind("insert into vt_insert_test (msg) values (%a)", sqltypes.StringBindVariable(hint)) 976 require.NoError(t, err) 977 _, err = primary.VttabletProcess.QueryTablet(query, keyspaceName, true) 978 require.NoError(t, err) 979 } 980 981 func ReadRowsFromTablet(t *testing.T, tablet *cluster.Vttablet) (msgs []string) { 982 query := "select msg from vt_insert_test" 983 rs, err := tablet.VttabletProcess.QueryTablet(query, keyspaceName, true) 984 require.NoError(t, err) 985 for _, row := range rs.Named().Rows { 986 msg, err := row.ToString("msg") 987 require.NoError(t, err) 988 msgs = append(msgs, msg) 989 } 990 return msgs 991 } 992 993 func ReadRowsFromPrimary(t *testing.T) (msgs []string) { 994 return ReadRowsFromTablet(t, primary) 995 } 996 997 func ReadRowsFromReplica(t *testing.T) (msgs []string) { 998 return ReadRowsFromTablet(t, replica1) 999 } 1000 1001 func readManifestFile(t *testing.T, backupLocation string) (manifest *mysqlctl.BackupManifest) { 1002 // reading manifest 1003 data, err := os.ReadFile(backupLocation + "/MANIFEST") 1004 require.NoErrorf(t, err, "error while reading MANIFEST %v", err) 1005 1006 // parsing manifest 1007 err = json.Unmarshal(data, &manifest) 1008 require.NoErrorf(t, err, "error while parsing MANIFEST %v", err) 1009 require.NotNil(t, manifest) 1010 return manifest 1011 } 1012 1013 func TestReplicaFullBackup(t *testing.T) (manifest *mysqlctl.BackupManifest, destroy func(t *testing.T)) { 1014 backups, destroy := vtctlBackupReplicaNoDestroyNoWrites(t, "replica") 1015 1016 backupLocation := localCluster.CurrentVTDATAROOT + "/backups/" + shardKsName + "/" + backups[len(backups)-1] 1017 return readManifestFile(t, backupLocation), destroy 1018 } 1019 1020 func TestReplicaIncrementalBackup(t *testing.T, incrementalFromPos mysql.Position, expectError string) (manifest *mysqlctl.BackupManifest, backupName string) { 1021 incrementalFromPosArg := "auto" 1022 if !incrementalFromPos.IsZero() { 1023 incrementalFromPosArg = mysql.EncodePosition(incrementalFromPos) 1024 } 1025 output, err := localCluster.VtctlclientProcess.ExecuteCommandWithOutput("Backup", "--", "--incremental_from_pos", incrementalFromPosArg, replica1.Alias) 1026 if expectError != "" { 1027 require.Errorf(t, err, "expected: %v", expectError) 1028 require.Contains(t, output, expectError) 1029 return nil, "" 1030 } 1031 require.NoErrorf(t, err, "output: %v", output) 1032 1033 backups, err := localCluster.ListBackups(shardKsName) 1034 require.NoError(t, err) 1035 backupName = backups[len(backups)-1] 1036 backupLocation := localCluster.CurrentVTDATAROOT + "/backups/" + shardKsName + "/" + backupName 1037 return readManifestFile(t, backupLocation), backupName 1038 } 1039 1040 func TestReplicaRestoreToPos(t *testing.T, restoreToPos mysql.Position, expectError string) { 1041 require.False(t, restoreToPos.IsZero()) 1042 restoreToPosArg := mysql.EncodePosition(restoreToPos) 1043 output, err := localCluster.VtctlclientProcess.ExecuteCommandWithOutput("RestoreFromBackup", "--", "--restore_to_pos", restoreToPosArg, replica1.Alias) 1044 if expectError != "" { 1045 require.Errorf(t, err, "expected: %v", expectError) 1046 require.Contains(t, output, expectError) 1047 return 1048 } 1049 require.NoErrorf(t, err, "output: %v", output) 1050 }