vitess.io/vitess@v0.16.2/go/test/endtoend/reparent/emergencyreparent/ers_test.go

/*
Copyright 2019 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package emergencyreparent

import (
	"context"
	"os/exec"
	"testing"
	"time"

	"github.com/stretchr/testify/require"

	"vitess.io/vitess/go/test/endtoend/cluster"
	"vitess.io/vitess/go/test/endtoend/reparent/utils"
	"vitess.io/vitess/go/vt/log"
)

func TestTrivialERS(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	utils.ConfirmReplication(t, tablets[0], tablets[1:])

	// We should be able to do a series of ERS-es, even if nothing
	// is down, without issue
	for i := 1; i <= 4; i++ {
		out, err := utils.Ers(clusterInstance, nil, "60s", "30s")
		log.Infof("ERS loop %d. EmergencyReparentShard Output: %v", i, out)
		require.NoError(t, err)
		time.Sleep(5 * time.Second)
	}
	// We should do the same for vtctl binary
	for i := 1; i <= 4; i++ {
		out, err := utils.ErsWithVtctl(clusterInstance)
		log.Infof("ERS-vtctl loop %d. EmergencyReparentShard Output: %v", i, out)
		require.NoError(t, err)
		time.Sleep(5 * time.Second)
	}
}

func TestReparentIgnoreReplicas(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	var err error

	ctx := context.Background()

	insertVal := utils.ConfirmReplication(t, tablets[0], tablets[1:])

	// Make the current primary agent and database unavailable.
	utils.StopTablet(t, tablets[0], true)

	// Take down a replica - this should cause the emergency reparent to fail.
	utils.StopTablet(t, tablets[2], true)

	// We expect this one to fail because we have an unreachable replica
	out, err := utils.Ers(clusterInstance, nil, "60s", "30s")
	require.NotNil(t, err, out)

	// Now let's run it again, but set the command to ignore the unreachable replica.
	out, err = utils.ErsIgnoreTablet(clusterInstance, nil, "60s", "30s", []*cluster.Vttablet{tablets[2]}, false)
	require.Nil(t, err, out)

	// We'll bring back the replica we took down.
	utils.RestartTablet(t, clusterInstance, tablets[2])

	// Check that old primary tablet is left around for human intervention.
	utils.ConfirmOldPrimaryIsHangingAround(t, clusterInstance)
	utils.DeleteTablet(t, clusterInstance, tablets[0])
	utils.ValidateTopology(t, clusterInstance, false)

	newPrimary := utils.GetNewPrimary(t, clusterInstance)
	// Check new primary has latest transaction.
	err = utils.CheckInsertedValues(ctx, t, newPrimary, insertVal)
	require.Nil(t, err)

	// bring back the old primary as a replica, check that it catches up
	utils.ResurrectTablet(ctx, t, clusterInstance, tablets[0])
}

func TestReparentDownPrimary(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	ctx := context.Background()

	// Make the current primary agent and database unavailable.
	utils.StopTablet(t, tablets[0], true)

	// Perform a planned reparent operation, will try to contact
	// the current primary and fail somewhat quickly
	_, err := utils.PrsWithTimeout(t, clusterInstance, tablets[1], false, "1s", "5s")
	require.Error(t, err)

	utils.ValidateTopology(t, clusterInstance, false)

	// Run forced reparent operation, this should now proceed unimpeded.
	out, err := utils.Ers(clusterInstance, tablets[1], "60s", "30s")
	log.Infof("EmergencyReparentShard Output: %v", out)
	require.NoError(t, err)

	// Check that old primary tablet is left around for human intervention.
	utils.ConfirmOldPrimaryIsHangingAround(t, clusterInstance)

	// Now we'll manually remove it, simulating a human cleaning up a dead primary.
	utils.DeleteTablet(t, clusterInstance, tablets[0])

	// Now validate topo is correct.
	utils.ValidateTopology(t, clusterInstance, false)
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])
	utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[2], tablets[3]})
	utils.ResurrectTablet(ctx, t, clusterInstance, tablets[0])
}

func TestReparentNoChoiceDownPrimary(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	var err error

	ctx := context.Background()

	insertVal := utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	// Make the current primary agent and database unavailable.
	utils.StopTablet(t, tablets[0], true)

	// Run forced reparent operation, this should now proceed unimpeded.
	out, err := utils.Ers(clusterInstance, nil, "120s", "61s")
	require.NoError(t, err, out)

	// Check that old primary tablet is left around for human intervention.
	utils.ConfirmOldPrimaryIsHangingAround(t, clusterInstance)
	// Now we'll manually remove the old primary, simulating a human cleaning up a dead primary.
	utils.DeleteTablet(t, clusterInstance, tablets[0])
	utils.ValidateTopology(t, clusterInstance, false)
	newPrimary := utils.GetNewPrimary(t, clusterInstance)
	// Validate new primary is not old primary.
	require.NotEqual(t, newPrimary.Alias, tablets[0].Alias)

	// Check new primary has latest transaction.
	err = utils.CheckInsertedValues(ctx, t, newPrimary, insertVal)
	require.NoError(t, err)

	// bring back the old primary as a replica, check that it catches up
	utils.ResurrectTablet(ctx, t, clusterInstance, tablets[0])
}

func TestSemiSyncSetupCorrectly(t *testing.T) {
	t.Run("semi-sync enabled", func(t *testing.T) {
		defer cluster.PanicHandler(t)
		clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
		defer utils.TeardownCluster(clusterInstance)
		tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

		utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
		// Run forced reparent operation, this should proceed unimpeded.
		out, err := utils.Ers(clusterInstance, tablets[1], "60s", "30s")
		require.NoError(t, err, out)

		utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[0], tablets[2], tablets[3]})

		for _, tablet := range tablets {
			utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON")
		}

		// Run a planned reparent operation, this should proceed unimpeded.
		out, err = utils.Prs(t, clusterInstance, tablets[0])
		require.NoError(t, err, out)

		utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

		for _, tablet := range tablets {
			utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON")
		}
	})

	t.Run("semi-sync disabled", func(t *testing.T) {
		defer cluster.PanicHandler(t)
		clusterInstance := utils.SetupReparentCluster(t, "none")
		defer utils.TeardownCluster(clusterInstance)
		tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

		utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
		// Run forced reparent operation, this should proceed unimpeded.
		out, err := utils.Ers(clusterInstance, tablets[1], "60s", "30s")
		require.NoError(t, err, out)

		utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[0], tablets[2], tablets[3]})

		for _, tablet := range tablets {
			utils.CheckSemiSyncSetupCorrectly(t, tablet, "OFF")
		}

		// Run a planned reparent operation, this should proceed unimpeded.
		out, err = utils.Prs(t, clusterInstance, tablets[0])
		require.NoError(t, err, out)

		utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

		for _, tablet := range tablets {
			utils.CheckSemiSyncSetupCorrectly(t, tablet, "OFF")
		}
	})
}

// TestERSPromoteRdonly tests that we never end up promoting a rdonly instance as the primary
func TestERSPromoteRdonly(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	var err error

	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "rdonly")
	require.NoError(t, err)

	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[2].Alias, "rdonly")
	require.NoError(t, err)

	utils.ConfirmReplication(t, tablets[0], tablets[1:])

	// Make the current primary agent and database unavailable.
	utils.StopTablet(t, tablets[0], true)

	// We expect this one to fail because we have ignored all the replicas and have only the rdonlys, which should not be promoted
	out, err := utils.ErsIgnoreTablet(clusterInstance, nil, "30s", "30s", []*cluster.Vttablet{tablets[3]}, false)
	require.NotNil(t, err, out)

	out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", utils.KeyspaceShard)
	require.NoError(t, err)
	require.Contains(t, out, `"uid": 101`, "the primary should still be 101 in the shard info")
}

// TestERSPreventCrossCellPromotion tests that we promote a replica in the same cell as the previous primary if the prevent cross cell promotion flag is set
func TestERSPreventCrossCellPromotion(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	var err error

	// confirm that replication is going smoothly
	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	// Make the current primary agent and database unavailable.
	utils.StopTablet(t, tablets[0], true)

	// We expect that tablets[2] will be promoted since it is in the same cell as the previous primary
	out, err := utils.ErsIgnoreTablet(clusterInstance, nil, "60s", "30s", []*cluster.Vttablet{tablets[1]}, true)
	require.NoError(t, err, out)

	newPrimary := utils.GetNewPrimary(t, clusterInstance)
	require.Equal(t, newPrimary.Alias, tablets[2].Alias, "tablets[2] should be the promoted primary")
}

// TestPullFromRdonly tests that if a rdonly tablet is the most advanced, then our promoted primary should have
// caught up to it by pulling transactions from it
func TestPullFromRdonly(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	var err error

	ctx := context.Background()
	// make tablets[1] a rdonly tablet.
	// rename tablet so that the test is not confusing
	rdonly := tablets[1]
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonly.Alias, "rdonly")
	require.NoError(t, err)

	// confirm that all the tablets can replicate successfully right now
	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{rdonly, tablets[2], tablets[3]})

	// stop replication on the other two tablets
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", tablets[2].Alias)
	require.NoError(t, err)
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", tablets[3].Alias)
	require.NoError(t, err)

	// stop semi-sync on the primary so that any transaction now added does not require an ack
	utils.RunSQL(ctx, t, "SET GLOBAL rpl_semi_sync_master_enabled = false", tablets[0])

	// confirm that rdonly is able to replicate from our primary
	// This will also introduce a new transaction into the rdonly tablet which the other 2 replicas don't have
	insertVal := utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{rdonly})

	// Make the current primary agent and database unavailable.
	utils.StopTablet(t, tablets[0], true)

	// start the replication back on the two tablets
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", tablets[2].Alias)
	require.NoError(t, err)
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", tablets[3].Alias)
	require.NoError(t, err)

	// check that tablets[2] and tablets[3] still only have 1 value
	err = utils.CheckCountOfInsertedValues(ctx, t, tablets[2], 1)
	require.NoError(t, err)
	err = utils.CheckCountOfInsertedValues(ctx, t, tablets[3], 1)
	require.NoError(t, err)

	// At this point we have successfully made our rdonly tablet more advanced than tablets[2] and tablets[3] without introducing errant GTIDs
	// We have simulated a network partition in which the primary and rdonly got isolated and then the primary went down leaving the rdonly most advanced

	// We expect that tablets[2] will be promoted since it is in the same cell as the previous primary
	// since we are preventing cross cell promotions
	// Also it must be fully caught up
	out, err := utils.ErsIgnoreTablet(clusterInstance, nil, "60s", "30s", nil, true)
	require.NoError(t, err, out)

	newPrimary := utils.GetNewPrimary(t, clusterInstance)
	require.Equal(t, newPrimary.Alias, tablets[2].Alias, "tablets[2] should be the promoted primary")

	// check that the new primary has the last transaction that only the rdonly had
	err = utils.CheckInsertedValues(ctx, t, newPrimary, insertVal)
	require.NoError(t, err)
}

// TestNoReplicationStatusAndIOThreadStopped checks that ERS is able to fix
// replicas which do not have any replication status and also succeeds if the io thread
// is stopped on the primary elect.
func TestNoReplicationStatusAndIOThreadStopped(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[1].Alias, `STOP SLAVE; RESET SLAVE ALL`)
	require.NoError(t, err)
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[3].Alias, `STOP SLAVE IO_THREAD;`)
	require.NoError(t, err)
	// Run an additional command in the current primary which will only be acked by tablets[2] and be in its relay log.
	insertedVal := utils.ConfirmReplication(t, tablets[0], nil)
	// Failover to tablets[3]
	out, err := utils.Ers(clusterInstance, tablets[3], "60s", "30s")
	require.NoError(t, err, out)
	// Verify that the tablet has the inserted value
	err = utils.CheckInsertedValues(context.Background(), t, tablets[3], insertedVal)
	require.NoError(t, err)
	// Confirm that replication is setup correctly from tablets[3] to tablets[0]
	utils.ConfirmReplication(t, tablets[3], tablets[:1])
	// Confirm that tablets[1] which had no replication status initially now has its replication started
	utils.CheckReplicationStatus(context.Background(), t, tablets[1], true, true)
}

// TestERSForInitialization tests whether calling ERS in the beginning sets up the cluster properly or not
func TestERSForInitialization(t *testing.T) {
	var tablets []*cluster.Vttablet
	clusterInstance := cluster.NewCluster("zone1", "localhost")
	defer clusterInstance.Teardown()
	keyspace := &cluster.Keyspace{Name: utils.KeyspaceName}
	// Start topo server
	err := clusterInstance.StartTopo()
	require.NoError(t, err)
	err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+"zone1")
	require.NoError(t, err)
	for i := 0; i < 4; i++ {
		tablet := clusterInstance.NewVttabletInstance("replica", 100+i, "zone1")
		tablets = append(tablets, tablet)
	}

	shard := &cluster.Shard{Name: utils.ShardName}
	shard.Vttablets = tablets
	clusterInstance.VtTabletExtraArgs = []string{
		"--lock_tables_timeout", "5s",
		"--track_schema_versions=true",
	}

	// Initialize Cluster
	err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard})
	require.NoError(t, err)
	if clusterInstance.VtctlMajorVersion >= 14 {
		vtctldClientProcess := cluster.VtctldClientProcessInstance("localhost", clusterInstance.VtctldProcess.GrpcPort, clusterInstance.TmpDirectory)
		out, err := vtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspace.Name, "--durability-policy=semi_sync")
		require.NoError(t, err, out)
	}

	// Start MySQL
	var mysqlCtlProcessList []*exec.Cmd
	for _, shard := range clusterInstance.Keyspaces[0].Shards {
		for _, tablet := range shard.Vttablets {
			log.Infof("Starting MySql for tablet %v", tablet.Alias)
			proc, err := tablet.MysqlctlProcess.StartProcess()
			require.NoError(t, err)
			mysqlCtlProcessList = append(mysqlCtlProcessList, proc)
		}
	}
	// Wait for mysql processes to start
	for _, proc := range mysqlCtlProcessList {
		if err := proc.Wait(); err != nil {
			t.Fatalf("Error starting mysql: %s", err.Error())
		}
	}

	for _, tablet := range tablets {
		// Start the tablet
		err = tablet.VttabletProcess.Setup()
		require.NoError(t, err)
	}
	for _, tablet := range tablets {
		err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"})
		require.NoError(t, err)
	}

	// Force the replica to reparent assuming that all the datasets are identical.
	res, err := utils.Ers(clusterInstance, tablets[0], "60s", "30s")
	require.NoError(t, err, res)

	utils.ValidateTopology(t, clusterInstance, true)
	// create tables
	utils.RunSQL(context.Background(), t, "create table vt_insert_test (id bigint, msg varchar(64), primary key (id)) Engine=InnoDB", tablets[0])
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[0])
	utils.ValidateTopology(t, clusterInstance, false)
	utils.WaitForReplicationToStart(t, clusterInstance, utils.KeyspaceName, utils.ShardName, len(tablets), true)
	utils.ConfirmReplication(t, tablets[0], tablets[1:])
}

func TestRecoverWithMultipleFailures(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	// make tablets[1] a rdonly tablet.
	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "rdonly")
	require.NoError(t, err)

	// Confirm that replication is still working as intended
	utils.ConfirmReplication(t, tablets[0], tablets[1:])

	// Make the rdonly and primary tablets and databases unavailable.
	utils.StopTablet(t, tablets[1], true)
	utils.StopTablet(t, tablets[0], true)

	// We expect this to succeed since only one primary-eligible tablet (the old primary) is down
	out, err := utils.Ers(clusterInstance, nil, "30s", "10s")
	require.NoError(t, err, out)

	newPrimary := utils.GetNewPrimary(t, clusterInstance)
	utils.ConfirmReplication(t, newPrimary, []*cluster.Vttablet{tablets[2], tablets[3]})
}

// TestERSFailFast tests that ERS fails fast if it cannot find any tablet which can be safely promoted, instead of promoting
// a tablet and then hanging on the reparent journal insert while waiting for semi-sync ACKs
func TestERSFailFast(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	// make tablets[1] a rdonly tablet.
	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "rdonly")
	require.NoError(t, err)

	// Confirm that replication is still working as intended
	utils.ConfirmReplication(t, tablets[0], tablets[1:])

	// Context to be used in the go-routine to cleanly exit it after the test ends
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	strChan := make(chan string)
	go func() {
		// We expect this to fail since we have ignored all replica tablets and only the rdonly is left, which is not capable of sending semi-sync ACKs
		out, err := utils.ErsIgnoreTablet(clusterInstance, tablets[2], "240s", "90s", []*cluster.Vttablet{tablets[0], tablets[3]}, false)
		require.Error(t, err)
		select {
		case strChan <- out:
			return
		case <-ctx.Done():
			return
		}
	}()

	select {
	case out := <-strChan:
		require.Contains(t, out, "proposed primary zone1-0000000103 will not be able to make forward progress on being promoted")
	case <-time.After(60 * time.Second):
		require.Fail(t, "Emergency Reparent Shard did not fail in 60 seconds")
	}
}

// TestReplicationStopped checks that ERS ignores the tablets that have their SQL thread stopped.
// If more than one replica has replication stopped, ERS fails.
func TestReplicationStopped(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[1].Alias, `STOP SLAVE SQL_THREAD;`)
	require.NoError(t, err)
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[2].Alias, `STOP SLAVE;`)
	require.NoError(t, err)
	// Run an additional command on the current primary; it will only be applied by tablets[3], and will sit unapplied in tablets[1]'s relay log since its SQL thread is stopped.
	insertedVal := utils.ConfirmReplication(t, tablets[0], nil)
	// Failover to tablets[3]
	_, err = utils.Ers(clusterInstance, tablets[3], "60s", "30s")
	require.Error(t, err, "ERS should fail with 2 replicas having replication stopped")

	// Start replication back on tablets[1]
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[1].Alias, `START SLAVE;`)
	require.NoError(t, err)
	// Failover to tablets[3] again. This time it should succeed
	out, err := utils.Ers(clusterInstance, tablets[3], "60s", "30s")
	require.NoError(t, err, out)
	// Verify that the tablet has the inserted value
	err = utils.CheckInsertedValues(context.Background(), t, tablets[3], insertedVal)
	require.NoError(t, err)
	// Confirm that replication is setup correctly from tablets[3] to tablets[0]
	utils.ConfirmReplication(t, tablets[3], tablets[:1])
	// Confirm that tablets[2] which had replication stopped initially still has its replication stopped
	utils.CheckReplicationStatus(context.Background(), t, tablets[2], false, false)
}