vitess.io/vitess@v0.16.2/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package primaryfailure

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"vitess.io/vitess/go/test/endtoend/cluster"
	"vitess.io/vitess/go/test/endtoend/vtorc/utils"
	"vitess.io/vitess/go/vt/vtorc/logic"
)

// bring down primary, let orc promote replica
// covers the test case master-failover from orchestrator
// Also tests that VTOrc can handle multiple failures, if the durability policies allow it
func TestDownPrimary(t *testing.T) {
	defer cluster.PanicHandler(t)
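	// Set up the shard with two replica-type tablets and one rdonly in the local cell, a single
	// VTOrc instance, and the semi_sync durability policy; the VTOrc configuration below disallows
	// cross-data-center failovers.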
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{
		PreventCrossDataCenterPrimaryFailover: true,
	}, 1, "semi_sync")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")
	vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
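	// wait until VTOrc has registered exactly one successful ElectNewPrimary recovery,
	// i.e. the initial primary election above has been accounted for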
	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)

	// find the replica and rdonly tablets
	var replica, rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// we know we have only two replica tablets, so the one that is not the primary must be the other replica
		if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
			replica = tablet
		}
		if tablet.Type == "rdonly" {
			rdonly = tablet
		}
	}
	assert.NotNil(t, replica, "could not find replica tablet")
	assert.NotNil(t, rdonly, "could not find rdonly tablet")

	// Start a cross-cell replica
	crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)

	// check that the replication is set up correctly before we fail over
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second)

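	// Bring down both the rdonly and the primary at once; per the function comment above, the
	// durability policy in use should let VTOrc recover even with multiple failures.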
	// Make the rdonly vttablet unavailable
	err := rdonly.VttabletProcess.TearDown()
	require.NoError(t, err)
	err = rdonly.MysqlctlProcess.Stop()
	require.NoError(t, err)
	// Make the current primary vttablet unavailable.
	err = curPrimary.VttabletProcess.TearDown()
	require.NoError(t, err)
	err = curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablets from our global list
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
		utils.PermanentlyRemoveVttablet(clusterInfo, rdonly)
	}()

	// check that the replica gets promoted
	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
	// also check that the replication is working correctly after failover
	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
}

// Failover should not go across data centers, according to the configuration file
// covers part of the test case master-failover-lost-replicas from orchestrator
func TestCrossDataCenterFailure(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
		PreventCrossDataCenterPrimaryFailover: true,
	}, 1, "")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")

	// find the replica and rdonly tablets
	var replicaInSameCell, rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// we know we have only two replica tablets, so the one that is not the primary must be the other replica
		if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
			replicaInSameCell = tablet
		}
		if tablet.Type == "rdonly" {
			rdonly = tablet
		}
	}
	assert.NotNil(t, replicaInSameCell, "could not find replica tablet")
	assert.NotNil(t, rdonly, "could not find rdonly tablet")

	crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)
	// the newly started tablet does not replicate from anyone yet; we will allow vtorc to fix this too
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica, replicaInSameCell, rdonly}, 25*time.Second)

	// Make the current primary database unavailable.
	err := curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
	}()

	// we have a replica in the same cell, so that is the one which should be promoted and not the one from another cell
	utils.CheckPrimaryTablet(t, clusterInfo, replicaInSameCell, true)
	// also check that the replication is working correctly after failover
	utils.VerifyWritesSucceed(t, clusterInfo, replicaInSameCell, []*cluster.Vttablet{crossCellReplica, rdonly}, 10*time.Second)
}

// Failover should not go across data centers, according to the configuration file
// In case of no viable candidates, we should error out
func TestCrossDataCenterFailureError(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{
		PreventCrossDataCenterPrimaryFailover: true,
	}, 1, "")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")

	// find the rdonly tablet
	var rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		if tablet.Type == "rdonly" {
			rdonly = tablet
		}
	}
	assert.NotNil(t, rdonly, "could not find rdonly tablet")

	crossCellReplica1 := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)
	crossCellReplica2 := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)
	// the newly started tablets do not replicate from anyone yet; we will allow vtorc to fix this too
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica1, crossCellReplica2, rdonly}, 25*time.Second)

	// Make the current primary database unavailable.
	err := curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
	}()

	// wait for 20 seconds
	time.Sleep(20 * time.Second)

	// the previous primary should still be the primary since recovery of the dead primary should fail
	utils.CheckPrimaryTablet(t, clusterInfo, curPrimary, false)
}

// Failover will sometimes lead to an rdonly which can no longer replicate.
// covers part of the test case master-failover-lost-replicas from orchestrator
func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
	// new version of ERS does not check for lost replicas yet
	// Earlier any replicas that were not able to replicate from the previous primary
	// were detected by vtorc and could be configured to have their sources detached
	t.Skip()
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{
		PreventCrossDataCenterPrimaryFailover: true,
	}, 1, "")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")

	// get the tablets
	var replica, rdonly, aheadRdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// find tablets which are not the primary
		if tablet.Alias != curPrimary.Alias {
			if tablet.Type == "replica" {
				replica = tablet
			} else {
				if rdonly == nil {
					rdonly = tablet
				} else {
					aheadRdonly = tablet
				}
			}
		}
	}
	assert.NotNil(t, replica, "could not find replica tablet")
	assert.NotNil(t, rdonly, "could not find any rdonly tablet")
	assert.NotNil(t, aheadRdonly, "could not find the second rdonly tablet")

	// check that replication is set up correctly
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, aheadRdonly, replica}, 15*time.Second)

	// revoke super privileges from vtorc on replica and rdonly so that it is unable to repair the replication
	utils.ChangePrivileges(t, `REVOKE SUPER ON *.* FROM 'orc_client_user'@'%'`, replica, "orc_client_user")
	utils.ChangePrivileges(t, `REVOKE SUPER ON *.* FROM 'orc_client_user'@'%'`, rdonly, "orc_client_user")

	// stop replication on the replica and rdonly.
	err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", replica.Alias)
	require.NoError(t, err)
	err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", rdonly.Alias)
	require.NoError(t, err)

	// check that aheadRdonly is able to replicate. We also want to add some queries to aheadRdonly which will not be there in replica and rdonly
	utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{aheadRdonly}, 15*time.Second)

	// assert that the replica and rdonly are indeed lagging and do not have the new insertion by checking the count of rows in the tables
	out, err := utils.RunSQL(t, "SELECT * FROM vt_insert_test", replica, "vt_ks")
	require.NoError(t, err)
	require.Equal(t, 1, len(out.Rows))
	out, err = utils.RunSQL(t, "SELECT * FROM vt_insert_test", rdonly, "vt_ks")
	require.NoError(t, err)
	require.Equal(t, 1, len(out.Rows))

	// Make the current primary database unavailable.
	err = curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
	}()

	// grant super privileges back to vtorc on replica and rdonly so that it can repair
	utils.ChangePrivileges(t, `GRANT SUPER ON *.* TO 'orc_client_user'@'%'`, replica, "orc_client_user")
	utils.ChangePrivileges(t, `GRANT SUPER ON *.* TO 'orc_client_user'@'%'`, rdonly, "orc_client_user")

	// vtorc must promote the lagging replica and not the rdonly, since the rdonly has a MustNotPromoteRule promotion rule
	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)

	// also check that the replication is set up correctly
	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 15*time.Second)

	// check that aheadRdonly is lost. The lost replica is detached and its host is prepended with `//`
	out, err = utils.RunSQL(t, "SELECT HOST FROM performance_schema.replication_connection_configuration", aheadRdonly, "")
	require.NoError(t, err)
	require.Equal(t, "//localhost", out.Rows[0][0].ToString())
}

// This test checks that the promotion of a tablet succeeds if it passes the promotion lag test
// covers the test case master-failover-fail-promotion-lag-minutes-success from orchestrator
func TestPromotionLagSuccess(t *testing.T) {
	defer cluster.PanicHandler(t)
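	// The replication lag query below reports a constant lag of 59 seconds, which is under the
	// FailPrimaryPromotionOnLagMinutes threshold of 1 minute, so promotion should be allowed.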
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
		ReplicationLagQuery:              "select 59",
		FailPrimaryPromotionOnLagMinutes: 1,
	}, 1, "")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")

	// find the replica and rdonly tablets
	var replica, rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// we know we have only two replica tablets, so the one that is not the primary must be the other replica
		if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
			replica = tablet
		}
		if tablet.Type == "rdonly" {
			rdonly = tablet
		}
	}
	assert.NotNil(t, replica, "could not find replica tablet")
	assert.NotNil(t, rdonly, "could not find rdonly tablet")

	// check that the replication is set up correctly before we fail over
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second)

	// Make the current primary database unavailable.
	err := curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
	}()

	// check that the replica gets promoted
	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
	// also check that the replication is working correctly after failover
	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
}

// This test checks that the promotion of a tablet fails if it does not pass the promotion lag test
// covers the test case master-failover-fail-promotion-lag-minutes-failure from orchestrator
func TestPromotionLagFailure(t *testing.T) {
	// new version of ERS does not check for promotion lag yet
	// Earlier vtorc used to check that the promotion lag between the new primary and the old one
	// was smaller than the configured value, otherwise it would fail the promotion
	t.Skip()
	defer cluster.PanicHandler(t)
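	// The replication lag query below reports a constant lag of 61 seconds, which exceeds the
	// FailPrimaryPromotionOnLagMinutes threshold of 1 minute, so promotion should be refused.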
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 1, nil, cluster.VTOrcConfiguration{
		ReplicationLagQuery:              "select 61",
		FailPrimaryPromotionOnLagMinutes: 1,
	}, 1, "")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")

	// find the replica and rdonly tablets
	var replica1, replica2, rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// we know we have three replica-type tablets, so the two that are not the primary must be the other replicas
		if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
			if replica1 == nil {
				replica1 = tablet
			} else {
				replica2 = tablet
			}
		}
		if tablet.Type == "rdonly" {
			rdonly = tablet
		}
	}
	assert.NotNil(t, replica1, "could not find replica tablet")
	assert.NotNil(t, replica2, "could not find second replica tablet")
	assert.NotNil(t, rdonly, "could not find rdonly tablet")

	// check that the replication is set up correctly before we fail over
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica1, replica2}, 10*time.Second)

	// Make the current primary database unavailable.
	err := curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
	}()

	// wait for 20 seconds
	time.Sleep(20 * time.Second)

	// the previous primary should still be the primary since recovery of the dead primary should fail
	utils.CheckPrimaryTablet(t, clusterInfo, curPrimary, false)
}

// covers the test case master-failover-candidate from orchestrator
// We explicitly set one of the replicas to the Prefer promotion rule.
// That is the replica which should be promoted in case of primary failure
func TestDownPrimaryPromotionRule(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
		LockShardTimeoutSeconds: 5,
	}, 1, "test")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")

	// find the replica and rdonly tablets
	var replica, rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// we know we have only two replica tablets, so the one that is not the primary must be the other replica
		if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
			replica = tablet
		}
		if tablet.Type == "rdonly" {
			rdonly = tablet
		}
	}
	assert.NotNil(t, replica, "could not find replica tablet")
	assert.NotNil(t, rdonly, "could not find rdonly tablet")

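	// The cross-cell replica started below is the tablet with the Prefer promotion rule,
	// so it is the one we expect to be promoted later in this test.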
	crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)
	// the newly started tablet does not replicate from anyone yet; we will allow vtorc to fix this too
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica, rdonly, replica}, 25*time.Second)

	// Make the current primary database unavailable.
	err := curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
	}()

	// we have a replica with a preferred promotion rule, so that is the one which should be promoted
	utils.CheckPrimaryTablet(t, clusterInfo, crossCellReplica, true)
	// also check that the replication is working correctly after failover
	utils.VerifyWritesSucceed(t, clusterInfo, crossCellReplica, []*cluster.Vttablet{rdonly, replica}, 10*time.Second)
}

// covers the test case master-failover-candidate-lag from orchestrator
// We explicitly set one of the replicas to the Prefer promotion rule and make it lag with respect to the other replicas.
// That is the replica which should be promoted in case of primary failure
// It should also be caught up when it is promoted
func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
		LockShardTimeoutSeconds: 5,
	}, 1, "test")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")

	// get the replicas in the same cell
	var replica, rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// find tablets which are not the primary
		if tablet.Alias != curPrimary.Alias {
			if tablet.Type == "replica" {
				replica = tablet
			} else {
				rdonly = tablet
			}
		}
	}
	assert.NotNil(t, replica, "could not find replica tablet")
	assert.NotNil(t, rdonly, "could not find rdonly tablet")

	crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)
	// the newly started tablet does not replicate from anyone yet; we will allow vtorc to fix this too
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica, replica, rdonly}, 25*time.Second)

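	// Engineer a lagging preferred candidate: while vtorc cannot repair it, stop replication on
	// crossCellReplica, add a write on the primary, and reset the primary's binary logs so that
	// crossCellReplica cannot catch up over normal replication before the failover.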
	// revoke super privileges from vtorc on crossCellReplica so that it is unable to repair the replication
	utils.ChangePrivileges(t, `REVOKE SUPER ON *.* FROM 'orc_client_user'@'%'`, crossCellReplica, "orc_client_user")

	// stop replication on the crossCellReplica.
	err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", crossCellReplica.Alias)
	require.NoError(t, err)

	// check that rdonly and replica are able to replicate. We also want to add some queries to replica which will not be there in crossCellReplica
	utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 15*time.Second)

	// reset the primary logs so that crossCellReplica can never catch up
	utils.ResetPrimaryLogs(t, curPrimary)

	// start replication back on the crossCellReplica.
	err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", crossCellReplica.Alias)
	require.NoError(t, err)

	// grant super privileges back to vtorc on crossCellReplica so that it can repair
	utils.ChangePrivileges(t, `GRANT SUPER ON *.* TO 'orc_client_user'@'%'`, crossCellReplica, "orc_client_user")

	// assert that the crossCellReplica is indeed lagging and does not have the new insertion by checking the count of rows in the table
	out, err := utils.RunSQL(t, "SELECT * FROM vt_insert_test", crossCellReplica, "vt_ks")
	require.NoError(t, err)
	require.Equal(t, 1, len(out.Rows))

	// Make the current primary database unavailable.
	err = curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
	}()

	// the crossCellReplica is set to be preferred according to the durability requirements. So it must be promoted
	utils.CheckPrimaryTablet(t, clusterInfo, crossCellReplica, true)

	// assert that the crossCellReplica has indeed caught up
	out, err = utils.RunSQL(t, "SELECT * FROM vt_insert_test", crossCellReplica, "vt_ks")
	require.NoError(t, err)
	require.Equal(t, 2, len(out.Rows))

	// check that rdonly and replica are able to replicate from the crossCellReplica
	utils.VerifyWritesSucceed(t, clusterInfo, crossCellReplica, []*cluster.Vttablet{replica, rdonly}, 15*time.Second)
}

// covers the test case master-failover-candidate-lag-cross-datacenter from orchestrator
// We explicitly set one of the cross-cell replicas to the Prefer promotion rule, but we prevent cross data center promotions.
// We let a replica in our own cell lag. That is the replica which should be promoted in case of primary failure
// It should also be caught up when it is promoted
func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
		LockShardTimeoutSeconds:               5,
		PreventCrossDataCenterPrimaryFailover: true,
	}, 1, "test")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]
	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	assert.NotNil(t, curPrimary, "should have elected a primary")

	// get the replicas in the same cell
	var replica, rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// find tablets which are not the primary
		if tablet.Alias != curPrimary.Alias {
			if tablet.Type == "replica" {
				replica = tablet
			} else {
				rdonly = tablet
			}
		}
	}
	assert.NotNil(t, replica, "could not find replica tablet")
	assert.NotNil(t, rdonly, "could not find rdonly tablet")

	crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)
	// the newly started tablet does not replicate from anyone yet; we will allow vtorc to fix this too
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica, replica, rdonly}, 25*time.Second)

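	// Same lag setup as the previous test, except that here the in-cell replica is the one left
	// behind; since cross-cell promotions are prevented, it should still be promoted and caught up.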
	// revoke super privileges from vtorc on replica so that it is unable to repair the replication
	utils.ChangePrivileges(t, `REVOKE SUPER ON *.* FROM 'orc_client_user'@'%'`, replica, "orc_client_user")

	// stop replication on the replica.
	err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", replica.Alias)
	require.NoError(t, err)

	// check that rdonly and crossCellReplica are able to replicate. We also want to add some queries to crossCellReplica which will not be there in replica
	utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, crossCellReplica}, 15*time.Second)

	// reset the primary logs so that replica can never catch up
	utils.ResetPrimaryLogs(t, curPrimary)

	// start replication back on the replica.
	err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", replica.Alias)
	require.NoError(t, err)

	// grant super privileges back to vtorc on replica so that it can repair
	utils.ChangePrivileges(t, `GRANT SUPER ON *.* TO 'orc_client_user'@'%'`, replica, "orc_client_user")

	// assert that the replica is indeed lagging and does not have the new insertion by checking the count of rows in the table
	out, err := utils.RunSQL(t, "SELECT * FROM vt_insert_test", replica, "vt_ks")
	require.NoError(t, err)
	require.Equal(t, 1, len(out.Rows))

	// Make the current primary database unavailable.
	err = curPrimary.MysqlctlProcess.Stop()
	require.NoError(t, err)
	defer func() {
		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
	}()

	// the replica should be promoted since we have prevented cross cell promotions
	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)

	// assert that the replica has indeed caught up
	out, err = utils.RunSQL(t, "SELECT * FROM vt_insert_test", replica, "vt_ks")
	require.NoError(t, err)
	require.Equal(t, 2, len(out.Rows))

	// check that rdonly and crossCellReplica are able to replicate from the replica
	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica, rdonly}, 15*time.Second)
}