vitess.io/vitess@v0.16.2/go/test/endtoend/reparent/emergencyreparent/ers_test.go

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package emergencyreparent
    18  
    19  import (
    20  	"context"
    21  	"os/exec"
    22  	"testing"
    23  	"time"
    24  
    25  	"github.com/stretchr/testify/require"
    26  
    27  	"vitess.io/vitess/go/test/endtoend/cluster"
    28  	"vitess.io/vitess/go/test/endtoend/reparent/utils"
    29  	"vitess.io/vitess/go/vt/log"
    30  )
    31  
    32  func TestTrivialERS(t *testing.T) {
    33  	defer cluster.PanicHandler(t)
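        	// The "semi_sync" argument selects the keyspace durability policy for the test cluster;
        	// the "none" policy is exercised in TestSemiSyncSetupCorrectly below.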
    34  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
    35  	defer utils.TeardownCluster(clusterInstance)
    36  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
    37  
    38  	utils.ConfirmReplication(t, tablets[0], tablets[1:])
    39  
    40  	// We should be able to run a series of ERS operations back to back,
    41  	// even when nothing is down, without issue.
    42  	for i := 1; i <= 4; i++ {
    43  		out, err := utils.Ers(clusterInstance, nil, "60s", "30s")
    44  		log.Infof("ERS loop %d.  EmergencyReparentShard Output: %v", i, out)
    45  		require.NoError(t, err)
    46  		time.Sleep(5 * time.Second)
    47  	}
    48  	// We should be able to do the same using the vtctl binary.
    49  	for i := 1; i <= 4; i++ {
    50  		out, err := utils.ErsWithVtctl(clusterInstance)
    51  		log.Infof("ERS-vtctl loop %d.  EmergencyReparentShard Output: %v", i, out)
    52  		require.NoError(t, err)
    53  		time.Sleep(5 * time.Second)
    54  	}
    55  }
    56  
    57  func TestReparentIgnoreReplicas(t *testing.T) {
    58  	defer cluster.PanicHandler(t)
    59  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
    60  	defer utils.TeardownCluster(clusterInstance)
    61  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
    62  	var err error
    63  
    64  	ctx := context.Background()
    65  
    66  	insertVal := utils.ConfirmReplication(t, tablets[0], tablets[1:])
    67  
    68  	// Make the current primary agent and database unavailable.
    69  	utils.StopTablet(t, tablets[0], true)
    70  
    71  	// Take down a replica - this should cause the emergency reparent to fail.
    72  	utils.StopTablet(t, tablets[2], true)
    73  
    74  	// We expect this one to fail because we have an unreachable replica
    75  	out, err := utils.Ers(clusterInstance, nil, "60s", "30s")
    76  	require.NotNil(t, err, out)
    77  
    78  	// Now let's run it again, but set the command to ignore the unreachable replica.
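        	// The trailing boolean of ErsIgnoreTablet is the prevent-cross-cell-promotion flag
        	// (see TestERSPreventCrossCellPromotion); we leave it disabled here.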
    79  	out, err = utils.ErsIgnoreTablet(clusterInstance, nil, "60s", "30s", []*cluster.Vttablet{tablets[2]}, false)
    80  	require.Nil(t, err, out)
    81  
    82  	// We'll bring back the replica we took down.
    83  	utils.RestartTablet(t, clusterInstance, tablets[2])
    84  
    85  	// Check that old primary tablet is left around for human intervention.
    86  	utils.ConfirmOldPrimaryIsHangingAround(t, clusterInstance)
    87  	utils.DeleteTablet(t, clusterInstance, tablets[0])
    88  	utils.ValidateTopology(t, clusterInstance, false)
    89  
    90  	newPrimary := utils.GetNewPrimary(t, clusterInstance)
    91  	// Check new primary has latest transaction.
    92  	err = utils.CheckInsertedValues(ctx, t, newPrimary, insertVal)
    93  	require.Nil(t, err)
    94  
    95  	// bring back the old primary as a replica, check that it catches up
    96  	utils.ResurrectTablet(ctx, t, clusterInstance, tablets[0])
    97  }
    98  
    99  func TestReparentDownPrimary(t *testing.T) {
   100  	defer cluster.PanicHandler(t)
   101  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   102  	defer utils.TeardownCluster(clusterInstance)
   103  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   104  
   105  	ctx := context.Background()
   106  
   107  	// Make the current primary agent and database unavailable.
   108  	utils.StopTablet(t, tablets[0], true)
   109  
   110  	// Perform a planned reparent operation; it will try to contact
   111  	// the current primary and fail somewhat quickly.
   112  	_, err := utils.PrsWithTimeout(t, clusterInstance, tablets[1], false, "1s", "5s")
   113  	require.Error(t, err)
   114  
   115  	utils.ValidateTopology(t, clusterInstance, false)
   116  
   117  	// Run forced reparent operation, this should now proceed unimpeded.
   118  	out, err := utils.Ers(clusterInstance, tablets[1], "60s", "30s")
   119  	log.Infof("EmergencyReparentShard Output: %v", out)
   120  	require.NoError(t, err)
   121  
   122  	// Check that old primary tablet is left around for human intervention.
   123  	utils.ConfirmOldPrimaryIsHangingAround(t, clusterInstance)
   124  
   125  	// Now we'll manually remove it, simulating a human cleaning up a dead primary.
   126  	utils.DeleteTablet(t, clusterInstance, tablets[0])
   127  
   128  	// Now validate topo is correct.
   129  	utils.ValidateTopology(t, clusterInstance, false)
   130  	utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])
   131  	utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[2], tablets[3]})
   132  	utils.ResurrectTablet(ctx, t, clusterInstance, tablets[0])
   133  }
   134  
   135  func TestReparentNoChoiceDownPrimary(t *testing.T) {
   136  	defer cluster.PanicHandler(t)
   137  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   138  	defer utils.TeardownCluster(clusterInstance)
   139  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   140  	var err error
   141  
   142  	ctx := context.Background()
   143  
   144  	insertVal := utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   145  
   146  	// Make the current primary agent and database unavailable.
   147  	utils.StopTablet(t, tablets[0], true)
   148  
   149  	// Run forced reparent operation, this should now proceed unimpeded.
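        	// Passing a nil tablet lets EmergencyReparentShard pick the new primary on its own.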
   150  	out, err := utils.Ers(clusterInstance, nil, "120s", "61s")
   151  	require.NoError(t, err, out)
   152  
   153  	// Check that old primary tablet is left around for human intervention.
   154  	utils.ConfirmOldPrimaryIsHangingAround(t, clusterInstance)
   155  	// Now we'll manually remove the old primary, simulating a human cleaning up a dead primary.
   156  	utils.DeleteTablet(t, clusterInstance, tablets[0])
   157  	utils.ValidateTopology(t, clusterInstance, false)
   158  	newPrimary := utils.GetNewPrimary(t, clusterInstance)
   159  	// Validate new primary is not old primary.
   160  	require.NotEqual(t, newPrimary.Alias, tablets[0].Alias)
   161  
   162  	// Check new primary has latest transaction.
   163  	err = utils.CheckInsertedValues(ctx, t, newPrimary, insertVal)
   164  	require.NoError(t, err)
   165  
   166  	// bring back the old primary as a replica, check that it catches up
   167  	utils.ResurrectTablet(ctx, t, clusterInstance, tablets[0])
   168  }
   169  
   170  func TestSemiSyncSetupCorrectly(t *testing.T) {
   171  	t.Run("semi-sync enabled", func(t *testing.T) {
   172  		defer cluster.PanicHandler(t)
   173  		clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   174  		defer utils.TeardownCluster(clusterInstance)
   175  		tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   176  
   177  		utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   178  		// Run forced reparent operation, this should proceed unimpeded.
   179  		out, err := utils.Ers(clusterInstance, tablets[1], "60s", "30s")
   180  		require.NoError(t, err, out)
   181  
   182  		utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[0], tablets[2], tablets[3]})
   183  
   184  		for _, tablet := range tablets {
   185  			utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON")
   186  		}
   187  
   188  		// Run forced reparent operation, this should proceed unimpeded.
   189  		out, err = utils.Prs(t, clusterInstance, tablets[0])
   190  		require.NoError(t, err, out)
   191  
   192  		utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   193  
   194  		for _, tablet := range tablets {
   195  			utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON")
   196  		}
   197  	})
   198  
   199  	t.Run("semi-sync disabled", func(t *testing.T) {
   200  		defer cluster.PanicHandler(t)
   201  		clusterInstance := utils.SetupReparentCluster(t, "none")
   202  		defer utils.TeardownCluster(clusterInstance)
   203  		tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   204  
   205  		utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   206  		// Run forced reparent operation, this should proceed unimpeded.
   207  		out, err := utils.Ers(clusterInstance, tablets[1], "60s", "30s")
   208  		require.NoError(t, err, out)
   209  
   210  		utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[0], tablets[2], tablets[3]})
   211  
   212  		for _, tablet := range tablets {
   213  			utils.CheckSemiSyncSetupCorrectly(t, tablet, "OFF")
   214  		}
   215  
   216  		// Run forced reparent operation, this should proceed unimpeded.
   217  		out, err = utils.Prs(t, clusterInstance, tablets[0])
   218  		require.NoError(t, err, out)
   219  
   220  		utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   221  
   222  		for _, tablet := range tablets {
   223  			utils.CheckSemiSyncSetupCorrectly(t, tablet, "OFF")
   224  		}
   225  	})
   226  }
   227  
   228  // TestERSPromoteRdonly tests that we never end up promoting a rdonly instance as the primary
   229  func TestERSPromoteRdonly(t *testing.T) {
   230  	defer cluster.PanicHandler(t)
   231  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   232  	defer utils.TeardownCluster(clusterInstance)
   233  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   234  	var err error
   235  
   236  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "rdonly")
   237  	require.NoError(t, err)
   238  
   239  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[2].Alias, "rdonly")
   240  	require.NoError(t, err)
   241  
   242  	utils.ConfirmReplication(t, tablets[0], tablets[1:])
   243  
   244  	// Make the current primary agent and database unavailable.
   245  	utils.StopTablet(t, tablets[0], true)
   246  
   247  	// We expect this one to fail because we have ignored all the replicas and are left with only the rdonly tablets, which should not be promoted
   248  	out, err := utils.ErsIgnoreTablet(clusterInstance, nil, "30s", "30s", []*cluster.Vttablet{tablets[3]}, false)
   249  	require.NotNil(t, err, out)
   250  
   251  	out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", utils.KeyspaceShard)
   252  	require.NoError(t, err)
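        	// Tablet uid 101 corresponds to tablets[0], the old primary.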
   253  	require.Contains(t, out, `"uid": 101`, "the primary should still be 101 in the shard info")
   254  }
   255  
   256  // TestERSPreventCrossCellPromotion tests that we promote a replica in the same cell as the previous primary when the prevent-cross-cell-promotion flag is set
   257  func TestERSPreventCrossCellPromotion(t *testing.T) {
   258  	defer cluster.PanicHandler(t)
   259  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   260  	defer utils.TeardownCluster(clusterInstance)
   261  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   262  	var err error
   263  
   264  	// confirm that replication is going smoothly
   265  	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   266  
   267  	// Make the current primary agent and database unavailable.
   268  	utils.StopTablet(t, tablets[0], true)
   269  
   270  	// We expect that tablets[2] will be promoted since it is in the same cell as the previous primary
   271  	out, err := utils.ErsIgnoreTablet(clusterInstance, nil, "60s", "30s", []*cluster.Vttablet{tablets[1]}, true)
   272  	require.NoError(t, err, out)
   273  
   274  	newPrimary := utils.GetNewPrimary(t, clusterInstance)
   275  	require.Equal(t, newPrimary.Alias, tablets[2].Alias, "tablets[2] should be the promoted primary")
   276  }
   277  
   278  // TestPullFromRdonly tests that if a rdonly tablet is the most advanced, then the promoted primary should have
   279  // caught up to it by pulling transactions from it.
   280  func TestPullFromRdonly(t *testing.T) {
   281  	defer cluster.PanicHandler(t)
   282  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   283  	defer utils.TeardownCluster(clusterInstance)
   284  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   285  	var err error
   286  
   287  	ctx := context.Background()
   288  	// make tablets[1] a rdonly tablet.
   289  	// Alias it as rdonly so that the rest of the test is easier to follow.
   290  	rdonly := tablets[1]
   291  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonly.Alias, "rdonly")
   292  	require.NoError(t, err)
   293  
   294  	// confirm that all the tablets can replicate successfully right now
   295  	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{rdonly, tablets[2], tablets[3]})
   296  
   297  	// stop replication on the other two tablets
   298  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", tablets[2].Alias)
   299  	require.NoError(t, err)
   300  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", tablets[3].Alias)
   301  	require.NoError(t, err)
   302  
   303  	// stop semi-sync on the primary so that any transaction now added does not require an ack
   304  	utils.RunSQL(ctx, t, "SET GLOBAL rpl_semi_sync_master_enabled = false", tablets[0])
   305  
   306  	// confirm that rdonly is able to replicate from our primary
   307  	// This will also introduce a new transaction into the rdonly tablet which the other 2 replicas don't have
   308  	insertVal := utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{rdonly})
   309  
   310  	// Make the current primary agent and database unavailable.
   311  	utils.StopTablet(t, tablets[0], true)
   312  
   313  	// start the replication back on the two tablets
   314  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", tablets[2].Alias)
   315  	require.NoError(t, err)
   316  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", tablets[3].Alias)
   317  	require.NoError(t, err)
   318  
   319  	// check that tablets[2] and tablets[3] still only have 1 value
   320  	err = utils.CheckCountOfInsertedValues(ctx, t, tablets[2], 1)
   321  	require.NoError(t, err)
   322  	err = utils.CheckCountOfInsertedValues(ctx, t, tablets[3], 1)
   323  	require.NoError(t, err)
   324  
   325  	// At this point we have successfully made our rdonly tablet more advanced than tablets[2] and tablets[3] without introducing errant GTIDs
   326  	// We have simulated a network partition in which the primary and rdonly got isolated and then the primary went down leaving the rdonly most advanced
   327  
   328  	// We expect tablets[2] to be promoted since it is in the same cell as the previous primary
   329  	// and we are preventing cross-cell promotions.
   330  	// It must also be fully caught up before being promoted.
   331  	out, err := utils.ErsIgnoreTablet(clusterInstance, nil, "60s", "30s", nil, true)
   332  	require.NoError(t, err, out)
   333  
   334  	newPrimary := utils.GetNewPrimary(t, clusterInstance)
   335  	require.Equal(t, newPrimary.Alias, tablets[2].Alias, "tablets[2] should be the promoted primary")
   336  
   337  	// check that the new primary has the last transaction that only the rdonly had
   338  	err = utils.CheckInsertedValues(ctx, t, newPrimary, insertVal)
   339  	require.NoError(t, err)
   340  }
   341  
   342  // TestNoReplicationStatusAndIOThreadStopped checks that ERS is able to fix
   343  // replicas which do not have any replication status and also succeeds if the IO thread
   344  // is stopped on the primary-elect.
   345  func TestNoReplicationStatusAndIOThreadStopped(t *testing.T) {
   346  	defer cluster.PanicHandler(t)
   347  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   348  	defer utils.TeardownCluster(clusterInstance)
   349  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   350  	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   351  
   352  	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[1].Alias, `STOP SLAVE; RESET SLAVE ALL`)
   353  	require.NoError(t, err)
   354  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[3].Alias, `STOP SLAVE IO_THREAD;`)
   355  	require.NoError(t, err)
   356  	// Run an additional command on the current primary which will only be acked by tablets[2] and be in its relay log.
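        	// ConfirmReplication with no replicas only performs the insert on the primary and returns the value;
        	// it is verified on tablets[3] after the failover below.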
   357  	insertedVal := utils.ConfirmReplication(t, tablets[0], nil)
   358  	// Failover to tablets[3]
   359  	out, err := utils.Ers(clusterInstance, tablets[3], "60s", "30s")
   360  	require.NoError(t, err, out)
   361  	// Verify that the tablet has the inserted value
   362  	err = utils.CheckInsertedValues(context.Background(), t, tablets[3], insertedVal)
   363  	require.NoError(t, err)
   364  	// Confirm that replication is setup correctly from tablets[3] to tablets[0]
   365  	utils.ConfirmReplication(t, tablets[3], tablets[:1])
   366  	// Confirm that tablets[1], which had no replication status initially, now has its replication running
   367  	utils.CheckReplicationStatus(context.Background(), t, tablets[1], true, true)
   368  }
   369  
   370  // TestERSForInitialization tests that calling ERS during initial cluster setup brings the cluster up properly
   371  func TestERSForInitialization(t *testing.T) {
   372  	var tablets []*cluster.Vttablet
   373  	clusterInstance := cluster.NewCluster("zone1", "localhost")
   374  	defer clusterInstance.Teardown()
   375  	keyspace := &cluster.Keyspace{Name: utils.KeyspaceName}
   376  	// Start topo server
   377  	err := clusterInstance.StartTopo()
   378  	require.NoError(t, err)
   379  	err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+"zone1")
   380  	require.NoError(t, err)
   381  	for i := 0; i < 4; i++ {
   382  		tablet := clusterInstance.NewVttabletInstance("replica", 100+i, "zone1")
   383  		tablets = append(tablets, tablet)
   384  	}
   385  
   386  	shard := &cluster.Shard{Name: utils.ShardName}
   387  	shard.Vttablets = tablets
   388  	clusterInstance.VtTabletExtraArgs = []string{
   389  		"--lock_tables_timeout", "5s",
   390  		"--track_schema_versions=true",
   391  	}
   392  
   393  	// Initialize Cluster
   394  	err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard})
   395  	require.NoError(t, err)
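        	// With vtctl v14+ semi-sync is governed by the keyspace durability policy, so set it to semi_sync
        	// before bringing up the tablets.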
   396  	if clusterInstance.VtctlMajorVersion >= 14 {
   397  		vtctldClientProcess := cluster.VtctldClientProcessInstance("localhost", clusterInstance.VtctldProcess.GrpcPort, clusterInstance.TmpDirectory)
   398  		out, err := vtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspace.Name, "--durability-policy=semi_sync")
   399  		require.NoError(t, err, out)
   400  	}
   401  
   402  	// Start MySQL
   403  	var mysqlCtlProcessList []*exec.Cmd
   404  	for _, shard := range clusterInstance.Keyspaces[0].Shards {
   405  		for _, tablet := range shard.Vttablets {
   406  			log.Infof("Starting MySql for tablet %v", tablet.Alias)
   407  			proc, err := tablet.MysqlctlProcess.StartProcess()
   408  			require.NoError(t, err)
   409  			mysqlCtlProcessList = append(mysqlCtlProcessList, proc)
   410  		}
   411  	}
   412  	// Wait for mysql processes to start
   413  	for _, proc := range mysqlCtlProcessList {
   414  		if err := proc.Wait(); err != nil {
   415  			t.Fatalf("Error starting mysql: %s", err.Error())
   416  		}
   417  	}
   418  
   419  	for _, tablet := range tablets {
   420  		// Start the tablet
   421  		err = tablet.VttabletProcess.Setup()
   422  		require.NoError(t, err)
   423  	}
   424  	for _, tablet := range tablets {
   425  		err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"})
   426  		require.NoError(t, err)
   427  	}
   428  
   429  	// Force a reparent to tablets[0], assuming that all the datasets are identical.
   430  	res, err := utils.Ers(clusterInstance, tablets[0], "60s", "30s")
   431  	require.NoError(t, err, res)
   432  
   433  	utils.ValidateTopology(t, clusterInstance, true)
   434  	// Create tables
   435  	utils.RunSQL(context.Background(), t, "create table vt_insert_test (id bigint, msg varchar(64), primary key (id)) Engine=InnoDB", tablets[0])
   436  	utils.CheckPrimaryTablet(t, clusterInstance, tablets[0])
   437  	utils.ValidateTopology(t, clusterInstance, false)
   438  	utils.WaitForReplicationToStart(t, clusterInstance, utils.KeyspaceName, utils.ShardName, len(tablets), true)
   439  	utils.ConfirmReplication(t, tablets[0], tablets[1:])
   440  }
   441  
   442  func TestRecoverWithMultipleFailures(t *testing.T) {
   443  	defer cluster.PanicHandler(t)
   444  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   445  	defer utils.TeardownCluster(clusterInstance)
   446  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   447  	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   448  
   449  	// make tablets[1] a rdonly tablet.
   450  	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "rdonly")
   451  	require.NoError(t, err)
   452  
   453  	// Confirm that replication is still working as intended
   454  	utils.ConfirmReplication(t, tablets[0], tablets[1:])
   455  
   456  	// Make the rdonly and primary tablets and databases unavailable.
   457  	utils.StopTablet(t, tablets[1], true)
   458  	utils.StopTablet(t, tablets[0], true)
   459  
   460  	// We expect this to succeed since the only primary-eligible tablet that is down is the old primary
   461  	out, err := utils.Ers(clusterInstance, nil, "30s", "10s")
   462  	require.NoError(t, err, out)
   463  
   464  	newPrimary := utils.GetNewPrimary(t, clusterInstance)
   465  	utils.ConfirmReplication(t, newPrimary, []*cluster.Vttablet{tablets[2], tablets[3]})
   466  }
   467  
   468  // TestERSFailFast tests that ERS fails fast if it cannot find any tablet that can be safely promoted, instead of promoting
   469  // a tablet and then hanging while inserting a row into the reparent journal, waiting on semi-sync ACKs.
   470  func TestERSFailFast(t *testing.T) {
   471  	defer cluster.PanicHandler(t)
   472  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   473  	defer utils.TeardownCluster(clusterInstance)
   474  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   475  	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   476  
   477  	// make tablets[1] a rdonly tablet.
   478  	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "rdonly")
   479  	require.NoError(t, err)
   480  
   481  	// Confirm that replication is still working as intended
   482  	utils.ConfirmReplication(t, tablets[0], tablets[1:])
   483  
   484  	// Context used by the goroutine below so it can exit cleanly after the test ends
   485  	ctx, cancel := context.WithCancel(context.Background())
   486  	defer cancel()
   487  	strChan := make(chan string)
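        	// The goroutine below sends the ERS output on strChan so that the main test can assert
        	// the fail-fast error message without waiting out the full ERS timeouts.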
   488  	go func() {
   489  		// We expect this to fail since we have ignored all replica tablets and only the rdonly is left, which is not capable of sending semi-sync ACKs
   490  		out, err := utils.ErsIgnoreTablet(clusterInstance, tablets[2], "240s", "90s", []*cluster.Vttablet{tablets[0], tablets[3]}, false)
   491  		require.Error(t, err)
   492  		select {
   493  		case strChan <- out:
   494  			return
   495  		case <-ctx.Done():
   496  			return
   497  		}
   498  	}()
   499  
   500  	select {
   501  	case out := <-strChan:
   502  		require.Contains(t, out, "proposed primary zone1-0000000103 will not be able to make forward progress on being promoted")
   503  	case <-time.After(60 * time.Second):
   504  		require.Fail(t, "Emergency Reparent Shard did not fail in 60 seconds")
   505  	}
   506  }
   507  
   508  // TestReplicationStopped checks that ERS ignores the tablets that have the SQL thread stopped.
   509  // If more than one tablet has replication stopped, ERS fails.
   510  func TestReplicationStopped(t *testing.T) {
   511  	defer cluster.PanicHandler(t)
   512  	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
   513  	defer utils.TeardownCluster(clusterInstance)
   514  	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
   515  	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})
   516  
   517  	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[1].Alias, `STOP SLAVE SQL_THREAD;`)
   518  	require.NoError(t, err)
   519  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[2].Alias, `STOP SLAVE;`)
   520  	require.NoError(t, err)
   521  	// Run an additional command on the current primary which will only be acked by tablets[3] and be in its relay log.
   522  	insertedVal := utils.ConfirmReplication(t, tablets[0], nil)
   523  	// Failover to tablets[3]
   524  	_, err = utils.Ers(clusterInstance, tablets[3], "60s", "30s")
   525  	require.Error(t, err, "ERS should fail with 2 replicas having replication stopped")
   526  
   527  	// Start replication back on tablets[1]
   528  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ExecuteFetchAsDba", tablets[1].Alias, `START SLAVE;`)
   529  	require.NoError(t, err)
   530  	// Failover to tablets[3] again. This time it should succeed
   531  	out, err := utils.Ers(clusterInstance, tablets[3], "60s", "30s")
   532  	require.NoError(t, err, out)
   533  	// Verify that the tablet has the inserted value
   534  	err = utils.CheckInsertedValues(context.Background(), t, tablets[3], insertedVal)
   535  	require.NoError(t, err)
   536  	// Confirm that replication is setup correctly from tablets[3] to tablets[0]
   537  	utils.ConfirmReplication(t, tablets[3], tablets[:1])
   538  	// Confirm that tablets[2] which had replication stopped initially still has its replication stopped
   539  	utils.CheckReplicationStatus(context.Background(), t, tablets[2], false, false)
   540  }