vitess.io/vitess@v0.16.2/go/test/endtoend/vtorc/general/vtorc_test.go (about)

     1  /*
     2  Copyright 2020 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package general
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"testing"
    23  	"time"
    24  
    25  	"github.com/stretchr/testify/assert"
    26  	"github.com/stretchr/testify/require"
    27  
    28  	"vitess.io/vitess/go/test/endtoend/cluster"
    29  	"vitess.io/vitess/go/test/endtoend/vtorc/utils"
    30  	"vitess.io/vitess/go/vt/log"
    31  	"vitess.io/vitess/go/vt/vtorc/logic"
    32  )
    33  
    34  // Cases to test:
    35  // 1. create cluster with 2 replicas and 1 rdonly, let orc choose primary
    36  // verify rdonly is not elected, only replica
    37  // verify replication is setup
    38  // verify that with multiple vtorc instances, we still only have 1 PlannedReparentShard call
    39  func TestPrimaryElection(t *testing.T) {
    40  	defer cluster.PanicHandler(t)
    41  	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
    42  		PreventCrossDataCenterPrimaryFailover: true,
    43  	}, 2, "")
    44  	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
    45  	shard0 := &keyspace.Shards[0]
    46  
    47  	primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
    48  	assert.NotNil(t, primary, "should have elected a primary")
    49  	utils.CheckReplication(t, clusterInfo, primary, shard0.Vttablets, 10*time.Second)
    50  
    51  	for _, vttablet := range shard0.Vttablets {
    52  		if vttablet.Type == "rdonly" && primary.Alias == vttablet.Alias {
    53  			t.Errorf("Rdonly tablet promoted as primary - %v", primary.Alias)
    54  		}
    55  	}
    56  
    57  	res, err := utils.RunSQL(t, "select * from reparent_journal", primary, "_vt")
    58  	require.NoError(t, err)
    59  	require.Len(t, res.Rows, 1, "There should only be 1 primary tablet which was elected")
    60  }
    61  
    62  // Cases to test:
    63  // 1. create cluster with 1 replica and 1 rdonly, let orc choose primary
    64  // verify rdonly is not elected, only replica
    65  // verify replication is setup
    66  func TestSingleKeyspace(t *testing.T) {
    67  	defer cluster.PanicHandler(t)
    68  	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{
    69  		PreventCrossDataCenterPrimaryFailover: true,
    70  	}, 1, "")
    71  	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
    72  	shard0 := &keyspace.Shards[0]
    73  
    74  	utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
    75  	utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
    76  	utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
    77  }
    78  
    79  // Cases to test:
    80  // 1. create cluster with 1 replica and 1 rdonly, let orc choose primary
    81  // verify rdonly is not elected, only replica
    82  // verify replication is setup
    83  func TestKeyspaceShard(t *testing.T) {
    84  	defer cluster.PanicHandler(t)
    85  	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{
    86  		PreventCrossDataCenterPrimaryFailover: true,
    87  	}, 1, "")
    88  	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
    89  	shard0 := &keyspace.Shards[0]
    90  
    91  	utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
    92  	utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
    93  	utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
    94  }
    95  
    96  // Cases to test:
    97  // 1. make primary readonly, let vtorc repair
    98  // 2. make replica ReadWrite, let vtorc repair
    99  // 3. stop replication, let vtorc repair
   100  // 4. setup replication from non-primary, let vtorc repair
   101  // 5. make instance A replicates from B and B from A, wait for repair
   102  func TestVTOrcRepairs(t *testing.T) {
   103  	defer cluster.PanicHandler(t)
   104  	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{
   105  		PreventCrossDataCenterPrimaryFailover: true,
   106  	}, 1, "")
   107  	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
   108  	shard0 := &keyspace.Shards[0]
   109  
   110  	// find primary from topo
   111  	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
   112  	assert.NotNil(t, curPrimary, "should have elected a primary")
   113  	vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
   114  	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
   115  
   116  	var replica, otherReplica *cluster.Vttablet
   117  	for _, tablet := range shard0.Vttablets {
   118  		// we know we have only two tablets, so the "other" one must be the new primary
   119  		if tablet.Alias != curPrimary.Alias {
   120  			if replica == nil {
   121  				replica = tablet
   122  			} else {
   123  				otherReplica = tablet
   124  			}
   125  		}
   126  	}
   127  	require.NotNil(t, replica, "should be able to find a replica")
   128  	require.NotNil(t, otherReplica, "should be able to find 2nd replica")
   129  
   130  	// check replication is setup correctly
   131  	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second)
   132  
   133  	t.Run("PrimaryReadOnly", func(t *testing.T) {
   134  		// Make the current primary database read-only.
   135  		_, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "")
   136  		require.NoError(t, err)
   137  
   138  		// wait for repair
   139  		match := utils.WaitForReadOnlyValue(t, curPrimary, 0)
   140  		require.True(t, match)
   141  		utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 1)
   142  	})
   143  
   144  	t.Run("ReplicaReadWrite", func(t *testing.T) {
   145  		// Make the replica database read-write.
   146  		_, err := utils.RunSQL(t, "set global read_only=OFF", replica, "")
   147  		require.NoError(t, err)
   148  
   149  		// wait for repair
   150  		match := utils.WaitForReadOnlyValue(t, replica, 1)
   151  		require.True(t, match)
   152  		utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 1)
   153  	})
   154  
   155  	t.Run("StopReplication", func(t *testing.T) {
   156  		// use vtctlclient to stop replication
   157  		_, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias)
   158  		require.NoError(t, err)
   159  
   160  		// check replication is setup correctly
   161  		utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second)
   162  		utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 2)
   163  
   164  		// Stop just the IO thread on the replica
   165  		_, err = utils.RunSQL(t, "STOP SLAVE IO_THREAD", replica, "")
   166  		require.NoError(t, err)
   167  
   168  		// check replication is setup correctly
   169  		utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second)
   170  		utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 3)
   171  
   172  		// Stop just the SQL thread on the replica
   173  		_, err = utils.RunSQL(t, "STOP SLAVE SQL_THREAD", replica, "")
   174  		require.NoError(t, err)
   175  
   176  		// check replication is setup correctly
   177  		utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second)
   178  		utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 4)
   179  	})
   180  
   181  	t.Run("ReplicationFromOtherReplica", func(t *testing.T) {
   182  		// point replica at otherReplica
   183  		changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+
   184  			"CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort)
   185  		_, err := utils.RunSQL(t, changeReplicationSourceCommand, replica, "")
   186  		require.NoError(t, err)
   187  
   188  		// wait until the source port is set back correctly by vtorc
   189  		utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second)
   190  		utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 5)
   191  
   192  		// check that writes succeed
   193  		utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second)
   194  	})
   195  
   196  	t.Run("CircularReplication", func(t *testing.T) {
   197  		// change the replication source on the primary
   198  		changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+
   199  			"CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+
   200  			"START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort)
   201  		_, err := utils.RunSQL(t, changeReplicationSourceCommands, curPrimary, "")
   202  		require.NoError(t, err)
   203  
   204  		// wait for curPrimary to reach stable state
   205  		time.Sleep(1 * time.Second)
   206  
   207  		// wait for repair
   208  		err = utils.WaitForReplicationToStop(t, curPrimary)
   209  		require.NoError(t, err)
   210  		utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 1)
   211  		// check that the writes still succeed
   212  		utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second)
   213  	})
   214  }
   215  
   216  func TestRepairAfterTER(t *testing.T) {
   217  	// test fails intermittently on CI, skip until it can be fixed.
   218  	t.SkipNow()
   219  	defer cluster.PanicHandler(t)
   220  	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{
   221  		PreventCrossDataCenterPrimaryFailover: true,
   222  	}, 1, "")
   223  	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
   224  	shard0 := &keyspace.Shards[0]
   225  
   226  	// find primary from topo
   227  	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
   228  	assert.NotNil(t, curPrimary, "should have elected a primary")
   229  
   230  	// TODO(deepthi): we should not need to do this, the DB should be created automatically
   231  	_, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false)
   232  	require.NoError(t, err)
   233  
   234  	var newPrimary *cluster.Vttablet
   235  	for _, tablet := range shard0.Vttablets {
   236  		// we know we have only two tablets, so the "other" one must be the new primary
   237  		if tablet.Alias != curPrimary.Alias {
   238  			newPrimary = tablet
   239  			break
   240  		}
   241  	}
   242  
   243  	// TER to other tablet
   244  	_, err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("TabletExternallyReparented", newPrimary.Alias)
   245  	require.NoError(t, err)
   246  
   247  	utils.CheckReplication(t, clusterInfo, newPrimary, []*cluster.Vttablet{curPrimary}, 15*time.Second)
   248  }
   249  
   250  // TestSemiSync tests that semi-sync is setup correctly by vtorc if it is incorrectly set
   251  func TestSemiSync(t *testing.T) {
   252  	// stop any vtorc instance running due to a previous test.
   253  	utils.StopVTOrcs(t, clusterInfo)
   254  	newCluster := utils.SetupNewClusterSemiSync(t)
   255  	utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{
   256  		PreventCrossDataCenterPrimaryFailover: true,
   257  	}, 1)
   258  	defer func() {
   259  		utils.StopVTOrcs(t, newCluster)
   260  		newCluster.ClusterInstance.Teardown()
   261  	}()
   262  	keyspace := &newCluster.ClusterInstance.Keyspaces[0]
   263  	shard0 := &keyspace.Shards[0]
   264  
   265  	// find primary from topo
   266  	primary := utils.ShardPrimaryTablet(t, newCluster, keyspace, shard0)
   267  	assert.NotNil(t, primary, "should have elected a primary")
   268  
   269  	var replica1, replica2, rdonly *cluster.Vttablet
   270  	for _, tablet := range shard0.Vttablets {
   271  		if tablet.Alias == primary.Alias {
   272  			continue
   273  		}
   274  		if tablet.Type == "rdonly" {
   275  			rdonly = tablet
   276  		} else {
   277  			if replica1 == nil {
   278  				replica1 = tablet
   279  			} else {
   280  				replica2 = tablet
   281  			}
   282  		}
   283  	}
   284  
   285  	assert.NotNil(t, replica1, "could not find any replica tablet")
   286  	assert.NotNil(t, replica2, "could not find the second replica tablet")
   287  	assert.NotNil(t, rdonly, "could not find rdonly tablet")
   288  
   289  	// check that the replication is setup correctly
   290  	utils.CheckReplication(t, newCluster, primary, []*cluster.Vttablet{rdonly, replica1, replica2}, 10*time.Second)
   291  
   292  	_, err := utils.RunSQL(t, "SET GLOBAL rpl_semi_sync_slave_enabled = 0", replica1, "")
   293  	require.NoError(t, err)
   294  	_, err = utils.RunSQL(t, "SET GLOBAL rpl_semi_sync_slave_enabled = 1", rdonly, "")
   295  	require.NoError(t, err)
   296  	_, err = utils.RunSQL(t, "SET GLOBAL rpl_semi_sync_master_enabled = 0", primary, "")
   297  	require.NoError(t, err)
   298  
   299  	timeout := time.After(20 * time.Second)
   300  	for {
   301  		select {
   302  		case <-timeout:
   303  			require.Fail(t, "timed out waiting for semi sync settings to be fixed")
   304  			return
   305  		default:
   306  			if utils.IsSemiSyncSetupCorrectly(t, replica1, "ON") &&
   307  				utils.IsSemiSyncSetupCorrectly(t, rdonly, "OFF") &&
   308  				utils.IsPrimarySemiSyncSetupCorrectly(t, primary, "ON") {
   309  				return
   310  			}
   311  			log.Warningf("semi sync settings not fixed yet")
   312  			time.Sleep(1 * time.Second)
   313  		}
   314  	}
   315  }
   316  
   317  // TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld
   318  func TestVTOrcWithPrs(t *testing.T) {
   319  	defer cluster.PanicHandler(t)
   320  	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{
   321  		PreventCrossDataCenterPrimaryFailover: true,
   322  	}, 1, "")
   323  	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
   324  	shard0 := &keyspace.Shards[0]
   325  
   326  	// find primary from topo
   327  	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
   328  	assert.NotNil(t, curPrimary, "should have elected a primary")
   329  	vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
   330  	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
   331  
   332  	// find any replica tablet other than the current primary
   333  	var replica *cluster.Vttablet
   334  	for _, tablet := range shard0.Vttablets {
   335  		if tablet.Alias != curPrimary.Alias {
   336  			replica = tablet
   337  			break
   338  		}
   339  	}
   340  	assert.NotNil(t, replica, "could not find any replica tablet")
   341  
   342  	// check that the replication is setup correctly before we failover
   343  	utils.CheckReplication(t, clusterInfo, curPrimary, shard0.Vttablets, 10*time.Second)
   344  
   345  	output, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(
   346  		"PlannedReparentShard", "--",
   347  		"--keyspace_shard", fmt.Sprintf("%s/%s", keyspace.Name, shard0.Name),
   348  		"--wait_replicas_timeout", "31s",
   349  		"--new_primary", replica.Alias)
   350  	require.NoError(t, err, "error in PlannedReparentShard output - %s", output)
   351  
   352  	time.Sleep(40 * time.Second)
   353  
   354  	// check that the replica gets promoted
   355  	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
   356  	// Verify that VTOrc didn't run any other recovery
   357  	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
   358  	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0)
   359  	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0)
   360  	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0)
   361  	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0)
   362  	utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second)
   363  }
   364  
   365  // TestMultipleDurabilities tests that VTOrc works with 2 keyspaces having 2 different durability policies
   366  func TestMultipleDurabilities(t *testing.T) {
   367  	defer cluster.PanicHandler(t)
   368  	// Setup a normal cluster and start vtorc
   369  	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{}, 1, "")
   370  	// Setup a semi-sync cluster
   371  	utils.AddSemiSyncKeyspace(t, clusterInfo)
   372  
   373  	keyspaceNone := &clusterInfo.ClusterInstance.Keyspaces[0]
   374  	shardNone := &keyspaceNone.Shards[0]
   375  	utils.CheckPrimaryTablet(t, clusterInfo, shardNone.Vttablets[0], true)
   376  	utils.CheckReplication(t, clusterInfo, shardNone.Vttablets[0], shardNone.Vttablets[1:], 10*time.Second)
   377  
   378  	keyspaceSemiSync := &clusterInfo.ClusterInstance.Keyspaces[1]
   379  	shardSemiSync := &keyspaceSemiSync.Shards[0]
   380  	// find primary from topo
   381  	primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspaceSemiSync, shardSemiSync)
   382  	assert.NotNil(t, primary, "should have elected a primary")
   383  }
   384  
   385  // TestDurabilityPolicySetLater tests that VTOrc works even if the durability policy of the keyspace is
   386  // set after VTOrc has been started.
   387  func TestDurabilityPolicySetLater(t *testing.T) {
   388  	// stop any vtorc instance running due to a previous test.
   389  	utils.StopVTOrcs(t, clusterInfo)
   390  	newCluster := utils.SetupNewClusterSemiSync(t)
   391  	keyspace := &newCluster.ClusterInstance.Keyspaces[0]
   392  	shard0 := &keyspace.Shards[0]
   393  	// Before starting VTOrc we explicity want to set the durability policy of the keyspace to an empty string
   394  	func() {
   395  		ctx, unlock, lockErr := newCluster.Ts.LockKeyspace(context.Background(), keyspace.Name, "TestDurabilityPolicySetLater")
   396  		require.NoError(t, lockErr)
   397  		defer unlock(&lockErr)
   398  		ki, err := newCluster.Ts.GetKeyspace(ctx, keyspace.Name)
   399  		require.NoError(t, err)
   400  		ki.DurabilityPolicy = ""
   401  		err = newCluster.Ts.UpdateKeyspace(ctx, ki)
   402  		require.NoError(t, err)
   403  	}()
   404  
   405  	// Verify that the durability policy is indeed empty
   406  	ki, err := newCluster.Ts.GetKeyspace(context.Background(), keyspace.Name)
   407  	require.NoError(t, err)
   408  	require.Empty(t, ki.DurabilityPolicy)
   409  
   410  	// Now start the vtorc instances
   411  	utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{
   412  		PreventCrossDataCenterPrimaryFailover: true,
   413  	}, 1)
   414  	defer func() {
   415  		utils.StopVTOrcs(t, newCluster)
   416  		newCluster.ClusterInstance.Teardown()
   417  	}()
   418  
   419  	// Wait for some time to be sure that VTOrc has started.
   420  	// TODO(GuptaManan100): Once we have a debug page for VTOrc, use that instead
   421  	time.Sleep(30 * time.Second)
   422  
   423  	// Now set the correct durability policy
   424  	out, err := newCluster.VtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspace.Name, "--durability-policy=semi_sync")
   425  	require.NoError(t, err, out)
   426  
   427  	// VTOrc should promote a new primary after seeing the durability policy change
   428  	primary := utils.ShardPrimaryTablet(t, newCluster, keyspace, shard0)
   429  	assert.NotNil(t, primary, "should have elected a primary")
   430  	utils.CheckReplication(t, newCluster, primary, shard0.Vttablets, 10*time.Second)
   431  }