vitess.io/vitess@v0.16.2/go/test/endtoend/reparent/plannedreparent/reparent_test.go

/*
Copyright 2019 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plannedreparent

import (
	"context"
	"fmt"
	"strconv"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"google.golang.org/protobuf/encoding/protojson"

	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/test/endtoend/cluster"
	"vitess.io/vitess/go/test/endtoend/reparent/utils"
	"vitess.io/vitess/go/vt/log"
	replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
)

func TestPrimaryToSpareStateChangeImpossible(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	// We cannot change a primary to spare.
	out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("ChangeTabletType", tablets[0].Alias, "spare")
	require.Error(t, err, out)
	require.Contains(t, out, "type change PRIMARY -> SPARE is not an allowed transition for ChangeTabletType")
}

func TestReparentCrossCell(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	// Perform a graceful reparent operation to another cell.
	_, err := utils.Prs(t, clusterInstance, tablets[3])
	require.NoError(t, err)

	utils.ValidateTopology(t, clusterInstance, false)
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[3])
}

func TestReparentGraceful(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	// Make sure replication is running on all tablets before reparenting.
	utils.WaitForReplicationToStart(t, clusterInstance, utils.KeyspaceName, utils.ShardName, len(tablets), true)

	// Perform a graceful reparent operation.
	utils.Prs(t, clusterInstance, tablets[1])
	utils.ValidateTopology(t, clusterInstance, false)
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])

	// A graceful reparent to the same primary should be idempotent.
	utils.Prs(t, clusterInstance, tablets[1])
	utils.ValidateTopology(t, clusterInstance, false)
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])

	utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[0], tablets[2], tablets[3]})
}

// TestPRSWithDrainedLaggingTablet tests that PRS succeeds even if we have a lagging drained tablet.
func TestPRSWithDrainedLaggingTablet(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "drained")
	require.NoError(t, err)

	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	// Make tablets[1] lag behind the other tablets by setting a large replication delay.
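	// MASTER_DELAY makes the replica's SQL thread wait the given number of
	// seconds before applying each event, so tablets[1] stops catching up.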
	utils.RunSQL(context.Background(), t, `stop slave;CHANGE MASTER TO MASTER_DELAY = 1999;start slave;`, tablets[1])

	// Insert another row on the primary; the lagging tablets[1] will not receive it in time.
	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[2], tablets[3]})

	// Assert that there is indeed only 1 row in tablets[1].
	res := utils.RunSQL(context.Background(), t, `select msg from vt_insert_test;`, tablets[1])
	assert.Equal(t, 1, len(res.Rows))

	// Perform a graceful reparent operation.
	utils.Prs(t, clusterInstance, tablets[2])
	utils.ValidateTopology(t, clusterInstance, false)
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[2])
}

func TestReparentReplicaOffline(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	// Kill one tablet so it appears offline.
	utils.StopTablet(t, tablets[3], true)

	// Perform a graceful reparent operation. It returns an error because the dead
	// tablet cannot be given the new replication source, but the promotion itself succeeds.
	out, err := utils.PrsWithTimeout(t, clusterInstance, tablets[1], false, "", "31s")
	require.Error(t, err)
	assert.True(t, utils.SetReplicationSourceFailed(tablets[3], out))

	utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])
}

func TestReparentAvoid(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	utils.DeleteTablet(t, clusterInstance, tablets[2])

	// Perform a reparent operation with avoid_tablet pointing to a non-primary. It
	// should succeed without doing anything.
	_, err := utils.PrsAvoid(t, clusterInstance, tablets[1])
	require.NoError(t, err)

	utils.ValidateTopology(t, clusterInstance, false)
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[0])

	// Perform a reparent operation with avoid_tablet pointing to the primary.
	_, err = utils.PrsAvoid(t, clusterInstance, tablets[0])
	require.NoError(t, err)
	utils.ValidateTopology(t, clusterInstance, false)

	// tablets[1] is in the same cell and tablets[3] is in a different cell, so we must land on tablets[1].
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])

	// If we kill the tablet in the same cell as the primary, then reparent --avoid_tablet will fail.
	utils.StopTablet(t, tablets[0], true)
	out, err := utils.PrsAvoid(t, clusterInstance, tablets[1])
	require.Error(t, err)
	assert.Contains(t, out, "cannot find a tablet to reparent to in the same cell as the current primary")
	utils.ValidateTopology(t, clusterInstance, false)
	utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])
}

func TestReparentFromOutside(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	reparentFromOutside(t, clusterInstance, false)
}

func TestReparentFromOutsideWithNoPrimary(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	reparentFromOutside(t, clusterInstance, true)

	// FIXME: @Deepthi: is this needed? Since we tear down the cluster, does this achieve any additional test coverage?
	// We will have to restart mysql to avoid hanging/locks due to the external Reparent.
	for _, tablet := range tablets {
		log.Infof("Restarting MySQL for tablet %v", tablet.Alias)
		err := tablet.MysqlctlProcess.Stop()
		require.NoError(t, err)
		tablet.MysqlctlProcess.InitMysql = false
		err = tablet.MysqlctlProcess.Start()
		require.NoError(t, err)
	}
}

func reparentFromOutside(t *testing.T, clusterInstance *cluster.LocalProcessCluster, downPrimary bool) {
	// This test will start a primary and 3 replicas.
	// Then:
	// - one replica will be the new primary
	// - one replica will be reparented to that new primary
	// - one replica will be busted and dead in the water, and we'll call TabletExternallyReparented.
	// Args:
	// downPrimary: kills the old primary first
	ctx := context.Background()
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	// Now manually reparent 1 out of 2 tablets:
	// tablets[1] will be the new primary,
	// tablets[2] won't be re-parented, so it will be busted.

	if !downPrimary {
		// Commands to stop the current primary.
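		// read_only blocks new writes, FLUSH TABLES WITH READ LOCK waits for
		// in-flight writes to finish, and UNLOCK TABLES then releases the lock.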
		demoteCommands := "SET GLOBAL read_only = ON; FLUSH TABLES WITH READ LOCK; UNLOCK TABLES"
		utils.RunSQL(ctx, t, demoteCommands, tablets[0])

		// Get the position of the old primary and wait for the new one to catch up.
		err := utils.WaitForReplicationPosition(t, tablets[0], tablets[1])
		require.NoError(t, err)
	}

	// Commands to convert a replica to be writable.
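	// STOP SLAVE halts replication and RESET SLAVE ALL discards the replication
	// configuration; turning read_only off makes the server a standalone primary.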
	promoteReplicaCommands := "STOP SLAVE; RESET SLAVE ALL; SET GLOBAL read_only = OFF;"
	utils.RunSQL(ctx, t, promoteReplicaCommands, tablets[1])

	// Get the primary position.
	_, gtID := cluster.GetPrimaryPosition(t, *tablets[1], utils.Hostname)

	// tablets[0] will now be a replica of tablets[1].
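	// RESET MASTER clears the binary logs and gtid_executed; setting gtid_purged
	// to the new primary's position marks those GTIDs as already applied, so
	// MASTER_AUTO_POSITION = 1 resumes replication from exactly that point.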
	changeReplicationSourceCommands := fmt.Sprintf("RESET MASTER; RESET SLAVE; SET GLOBAL gtid_purged = '%s';"+
		"CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+
		"START SLAVE;", gtID, utils.Hostname, tablets[1].MySQLPort)
	utils.RunSQL(ctx, t, changeReplicationSourceCommands, tablets[0])

	// Capture the time (in seconds) when we made tablets[1] writable.
	baseTime := time.Now().UnixNano() / 1000000000

	// tablets[2] will be a replica of tablets[1].
	changeReplicationSourceCommands = fmt.Sprintf("STOP SLAVE; RESET MASTER; SET GLOBAL gtid_purged = '%s';"+
		"CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+
		"START SLAVE;", gtID, utils.Hostname, tablets[1].MySQLPort)
	utils.RunSQL(ctx, t, changeReplicationSourceCommands, tablets[2])

	// To test downPrimary, we kill the old primary first and delete its tablet record.
	if downPrimary {
		err := tablets[0].VttabletProcess.TearDownWithTimeout(30 * time.Second)
		require.NoError(t, err)
		err = clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", "--",
			"--allow_primary", tablets[0].Alias)
		require.NoError(t, err)
	}

	// Update the topology with the new primary.
	err := clusterInstance.VtctlclientProcess.ExecuteCommand("TabletExternallyReparented",
		tablets[1].Alias)
	require.NoError(t, err)

	utils.CheckReparentFromOutside(t, clusterInstance, tablets[1], downPrimary, baseTime)

	if !downPrimary {
		err := tablets[0].VttabletProcess.TearDownWithTimeout(30 * time.Second)
		require.NoError(t, err)
	}
}

func TestReparentWithDownReplica(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	ctx := context.Background()

	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	// Stop the replica's MySQL process.
	err := tablets[2].MysqlctlProcess.Stop()
	require.NoError(t, err)

	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[3]})

	// Perform a graceful reparent operation. It will fail as one tablet is down.
	out, err := utils.Prs(t, clusterInstance, tablets[1])
	require.Error(t, err)
	assert.True(t, utils.SetReplicationSourceFailed(tablets[2], out))

	// Insert data into the new primary and check that the connected replicas replicate it.
	insertVal := utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[0], tablets[3]})

	// Restart MySQL on the old replica; it will still be trying to connect to the old primary.
	tablets[2].MysqlctlProcess.InitMysql = false
	err = tablets[2].MysqlctlProcess.Start()
	require.NoError(t, err)

	// Use the same PlannedReparentShard command to fix up the tablet.
	_, err = utils.Prs(t, clusterInstance, tablets[1])
	require.NoError(t, err)

	// We have to StartReplication on tablets[2] since its MySQL instance was restarted and does not have replication running.
	// We previously relied on the replicationManager to fix this, but it is disabled in our testing environment for the latest versions of vttablet and vtctl.
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", tablets[2].Alias)
	require.NoError(t, err)

	// Wait until it gets the data.
	err = utils.CheckInsertedValues(ctx, t, tablets[2], insertVal)
	require.NoError(t, err)
}

func TestChangeTypeSemiSync(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	ctx := context.Background()

	// Create new names for the tablets, so this test is less confusing.
	primary, replica, rdonly1, rdonly2 := tablets[0], tablets[1], tablets[2], tablets[3]

	// Set the tablet type of both rdonly tablets to rdonly.
	err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonly1.Alias, "rdonly")
	require.NoError(t, err)
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonly2.Alias, "rdonly")
	require.NoError(t, err)

	utils.ValidateTopology(t, clusterInstance, true)

	utils.CheckPrimaryTablet(t, clusterInstance, primary)

	// Stop replication on rdonly1, to make sure when we make it a replica it doesn't start again.
	// Note we do a similar test for replica -> rdonly below.
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", rdonly1.Alias)
	require.NoError(t, err)

	// Check semi-sync on the replicas.
	// The flag is only an indication of the value to use next time
	// we turn replication on, so also check the status.
	// rdonly1 is not replicating, so its status is off.
	utils.CheckDBvar(ctx, t, replica, "rpl_semi_sync_slave_enabled", "ON")
	utils.CheckDBvar(ctx, t, rdonly1, "rpl_semi_sync_slave_enabled", "OFF")
	utils.CheckDBvar(ctx, t, rdonly2, "rpl_semi_sync_slave_enabled", "OFF")
	utils.CheckDBstatus(ctx, t, replica, "Rpl_semi_sync_slave_status", "ON")
	utils.CheckDBstatus(ctx, t, rdonly1, "Rpl_semi_sync_slave_status", "OFF")
	utils.CheckDBstatus(ctx, t, rdonly2, "Rpl_semi_sync_slave_status", "OFF")

	// Change replica to rdonly while replicating; this should turn off semi-sync and restart replication.
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", replica.Alias, "rdonly")
	require.NoError(t, err)
	utils.CheckDBvar(ctx, t, replica, "rpl_semi_sync_slave_enabled", "OFF")
	utils.CheckDBstatus(ctx, t, replica, "Rpl_semi_sync_slave_status", "OFF")

	// Change rdonly1 to replica; this should turn on semi-sync but not start replication.
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonly1.Alias, "replica")
	require.NoError(t, err)
	utils.CheckDBvar(ctx, t, rdonly1, "rpl_semi_sync_slave_enabled", "ON")
	utils.CheckDBstatus(ctx, t, rdonly1, "Rpl_semi_sync_slave_status", "OFF")
	utils.CheckReplicaStatus(ctx, t, rdonly1)

	// Now change from replica back to rdonly; make sure replication is still not enabled.
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonly1.Alias, "rdonly")
	require.NoError(t, err)
	utils.CheckDBvar(ctx, t, rdonly1, "rpl_semi_sync_slave_enabled", "OFF")
	utils.CheckDBstatus(ctx, t, rdonly1, "Rpl_semi_sync_slave_status", "OFF")
	utils.CheckReplicaStatus(ctx, t, rdonly1)

	// Change rdonly2 to replica; this should turn on semi-sync and restart replication.
	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonly2.Alias, "replica")
	require.NoError(t, err)
	utils.CheckDBvar(ctx, t, rdonly2, "rpl_semi_sync_slave_enabled", "ON")
	utils.CheckDBstatus(ctx, t, rdonly2, "Rpl_semi_sync_slave_status", "ON")
}

// TestCrossCellDurability tests 2 things:
// 1. When PRS is run with the cross_cell durability policy set up, the semi-sync settings on all the tablets are as expected.
// 2. Bringing up a new vttablet should have its replication and semi-sync set up correctly without any manual intervention.
func TestCrossCellDurability(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "cross_cell")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets

	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	// When tablets[0] is the primary, the only tablet in a different cell is tablets[3],
	// so the other two should have semi-sync turned off.
	utils.CheckSemiSyncSetupCorrectly(t, tablets[0], "ON")
	utils.CheckSemiSyncSetupCorrectly(t, tablets[3], "ON")
	utils.CheckSemiSyncSetupCorrectly(t, tablets[1], "OFF")
	utils.CheckSemiSyncSetupCorrectly(t, tablets[2], "OFF")

	// Run a cross-cell reparent operation; this should proceed unimpeded.
	out, err := utils.Prs(t, clusterInstance, tablets[3])
	require.NoError(t, err, out)

	utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2]})

	// All the tablets will have semi-sync set up since tablets[3] is in Cell2 and all
	// others are in Cell1, so all of them are eligible to send semi-sync ACKs.
	for _, tablet := range tablets {
		utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON")
	}

	for i, supportsBackup := range []bool{false, true} {
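		// Two new tablets are brought up, one without backup support and one
		// with it; UIDs 300 and 301 keep them distinct from the shard's original tablets.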
		// Bring up a new replica tablet. In this new tablet, we do not disable
		// active reparents; otherwise replication would not be started.
		newReplica := utils.StartNewVTTablet(t, clusterInstance, 300+i, supportsBackup)
		// Add the tablet to the list of tablets in this shard.
		clusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInstance.Keyspaces[0].Shards[0].Vttablets, newReplica)
		// Check that we can replicate to it and that semi-sync is set up correctly on it.
		utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2], newReplica})
		utils.CheckSemiSyncSetupCorrectly(t, newReplica, "ON")
	}
}

// TestFullStatus tests that the RPC FullStatus works as intended.
func TestFullStatus(t *testing.T) {
	defer cluster.PanicHandler(t)
	clusterInstance := utils.SetupReparentCluster(t, "semi_sync")
	defer utils.TeardownCluster(clusterInstance)
	tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
	utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]})

	// Check that full status gives the correct result for a primary tablet.
	primaryTablet := tablets[0]
	primaryStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", primaryTablet.Alias)
	require.NoError(t, err)
	primaryStatus := &replicationdatapb.FullStatus{}
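	// DiscardUnknown lets unmarshalling tolerate any fields in the JSON output
	// that this client's copy of the proto definition does not know about.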
	opt := protojson.UnmarshalOptions{DiscardUnknown: true}
	err = opt.Unmarshal([]byte(primaryStatusString), primaryStatus)
	require.NoError(t, err)
	assert.NotEmpty(t, primaryStatus.ServerUuid)
	assert.NotEmpty(t, primaryStatus.ServerId)
	// For a primary tablet there is no replication status.
	assert.Nil(t, primaryStatus.ReplicationStatus)
	assert.Contains(t, primaryStatus.PrimaryStatus.String(), "vt-0000000101-bin")
	assert.Equal(t, primaryStatus.GtidPurged, "MySQL56/")
	assert.False(t, primaryStatus.ReadOnly)
	assert.True(t, primaryStatus.SemiSyncPrimaryEnabled)
	assert.True(t, primaryStatus.SemiSyncReplicaEnabled)
	assert.True(t, primaryStatus.SemiSyncPrimaryStatus)
	assert.False(t, primaryStatus.SemiSyncReplicaStatus)
	assert.EqualValues(t, 3, primaryStatus.SemiSyncPrimaryClients)
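	// The effectively-infinite semi-sync timeout below appears to be the value
	// vttablet configures so that a primary never falls back to asynchronous replication.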
	assert.EqualValues(t, 1000000000000000000, primaryStatus.SemiSyncPrimaryTimeout)
	assert.EqualValues(t, 1, primaryStatus.SemiSyncWaitForReplicaCount)
	assert.Equal(t, "ROW", primaryStatus.BinlogFormat)
	assert.Equal(t, "FULL", primaryStatus.BinlogRowImage)
	assert.Equal(t, "ON", primaryStatus.GtidMode)
	assert.True(t, primaryStatus.LogReplicaUpdates)
	assert.True(t, primaryStatus.LogBinEnabled)
	assert.Regexp(t, `[58]\.[07].*`, primaryStatus.Version)
	assert.NotEmpty(t, primaryStatus.VersionComment)

	replicaTablet := tablets[1]

	waitForFilePosition(t, clusterInstance, primaryTablet, replicaTablet, 5*time.Second)

	// Check that full status gives the correct result for a replica tablet.
	replicaStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", replicaTablet.Alias)
	require.NoError(t, err)
	replicaStatus := &replicationdatapb.FullStatus{}
	opt = protojson.UnmarshalOptions{DiscardUnknown: true}
	err = opt.Unmarshal([]byte(replicaStatusString), replicaStatus)
	require.NoError(t, err)
	assert.NotEmpty(t, replicaStatus.ServerUuid)
	assert.NotEmpty(t, replicaStatus.ServerId)
	assert.Contains(t, replicaStatus.ReplicationStatus.Position, "MySQL56/"+replicaStatus.ReplicationStatus.SourceUuid)
	assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.IoState)
	assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.SqlState)
	assert.Equal(t, fileNameFromPosition(replicaStatus.ReplicationStatus.FilePosition), fileNameFromPosition(primaryStatus.PrimaryStatus.FilePosition))
	assert.LessOrEqual(t, rowNumberFromPosition(replicaStatus.ReplicationStatus.FilePosition), rowNumberFromPosition(primaryStatus.PrimaryStatus.FilePosition))
	assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogSourceBinlogEquivalentPosition, primaryStatus.PrimaryStatus.FilePosition)
	assert.Contains(t, replicaStatus.ReplicationStatus.RelayLogFilePosition, "vt-0000000102-relay")
	assert.Equal(t, replicaStatus.ReplicationStatus.Position, primaryStatus.PrimaryStatus.Position)
	assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogPosition, primaryStatus.PrimaryStatus.Position)
	assert.Empty(t, replicaStatus.ReplicationStatus.LastIoError)
	assert.Empty(t, replicaStatus.ReplicationStatus.LastSqlError)
	assert.Equal(t, replicaStatus.ReplicationStatus.SourceUuid, primaryStatus.ServerUuid)
	assert.LessOrEqual(t, int(replicaStatus.ReplicationStatus.ReplicationLagSeconds), 1)
	assert.False(t, replicaStatus.ReplicationStatus.ReplicationLagUnknown)
	assert.EqualValues(t, 0, replicaStatus.ReplicationStatus.SqlDelay)
	assert.False(t, replicaStatus.ReplicationStatus.SslAllowed)
	assert.False(t, replicaStatus.ReplicationStatus.HasReplicationFilters)
	assert.False(t, replicaStatus.ReplicationStatus.UsingGtid)
	assert.True(t, replicaStatus.ReplicationStatus.AutoPosition)
	assert.Equal(t, replicaStatus.ReplicationStatus.SourceHost, utils.Hostname)
	assert.EqualValues(t, replicaStatus.ReplicationStatus.SourcePort, tablets[0].MySQLPort)
	assert.Equal(t, replicaStatus.ReplicationStatus.SourceUser, "vt_repl")
	assert.Contains(t, replicaStatus.PrimaryStatus.String(), "vt-0000000102-bin")
	assert.Equal(t, replicaStatus.GtidPurged, "MySQL56/")
	assert.True(t, replicaStatus.ReadOnly)
	assert.False(t, replicaStatus.SemiSyncPrimaryEnabled)
	assert.True(t, replicaStatus.SemiSyncReplicaEnabled)
	assert.False(t, replicaStatus.SemiSyncPrimaryStatus)
	assert.True(t, replicaStatus.SemiSyncReplicaStatus)
	assert.EqualValues(t, 0, replicaStatus.SemiSyncPrimaryClients)
	assert.EqualValues(t, 1000000000000000000, replicaStatus.SemiSyncPrimaryTimeout)
	assert.EqualValues(t, 1, replicaStatus.SemiSyncWaitForReplicaCount)
	assert.Equal(t, "ROW", replicaStatus.BinlogFormat)
	assert.Equal(t, "FULL", replicaStatus.BinlogRowImage)
	assert.Equal(t, "ON", replicaStatus.GtidMode)
	assert.True(t, replicaStatus.LogReplicaUpdates)
	assert.True(t, replicaStatus.LogBinEnabled)
	assert.Regexp(t, `[58]\.[07].*`, replicaStatus.Version)
	assert.NotEmpty(t, replicaStatus.VersionComment)
}

// getFullStatus fetches the FullStatus of the given tablet via the vtctld client.
func getFullStatus(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet) *replicationdatapb.FullStatus {
	statusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablet.Alias)
	require.NoError(t, err)
	status := &replicationdatapb.FullStatus{}
	opt := protojson.UnmarshalOptions{DiscardUnknown: true}
	err = opt.Unmarshal([]byte(statusString), status)
	require.NoError(t, err)
	return status
}

// waitForFilePosition waits up to timeout for the replica's FilePosition to match the
// primary's, to avoid flakiness in tests caused by a race where the replica is still catching up.
func waitForFilePosition(t *testing.T, clusterInstance *cluster.LocalProcessCluster, primary *cluster.Vttablet, replica *cluster.Vttablet, timeout time.Duration) {
	start := time.Now()
	for {
		primaryStatus := getFullStatus(t, clusterInstance, primary)
		replicaStatus := getFullStatus(t, clusterInstance, replica)
		if primaryStatus.PrimaryStatus.FilePosition == replicaStatus.ReplicationStatus.FilePosition {
			return
		}
		if d := time.Since(start); d > timeout {
			require.FailNowf(t, "waitForFilePosition timed out",
				"primary %s, replica %s", primaryStatus.PrimaryStatus.FilePosition, replicaStatus.ReplicationStatus.FilePosition)
		}
		time.Sleep(100 * time.Millisecond)
	}
}

// fileNameFromPosition gets the file name from the position by stripping the
// trailing row number, which is assumed to be exactly four characters long.
func fileNameFromPosition(pos string) string {
	return pos[0 : len(pos)-4]
}

// rowNumberFromPosition gets the row number from the position, assuming the
// position string ends in a four-digit row number.
func rowNumberFromPosition(pos string) int {
	rowNumStr := pos[len(pos)-4:]
	rowNum, _ := strconv.Atoi(rowNumStr)
	return rowNum
}