vitess.io/vitess@v0.16.2/go/test/endtoend/reparent/utils/utils.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package utils
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"os"
    24  	"os/exec"
    25  	"path"
    26  	"reflect"
    27  	"strings"
    28  	"testing"
    29  	"time"
    30  
    31  	"github.com/stretchr/testify/assert"
    32  	"github.com/stretchr/testify/require"
    33  
    34  	querypb "vitess.io/vitess/go/vt/proto/query"
    35  	"vitess.io/vitess/go/vt/vttablet/tabletconn"
    36  
    37  	"vitess.io/vitess/go/json2"
    38  	"vitess.io/vitess/go/mysql"
    39  	"vitess.io/vitess/go/sqltypes"
    40  	"vitess.io/vitess/go/test/endtoend/cluster"
    41  	"vitess.io/vitess/go/vt/log"
    42  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    43  )
    44  
    45  var (
    46  	KeyspaceName = "ks"
    47  	dbName       = "vt_" + KeyspaceName
    48  	username     = "vt_dba"
    49  	Hostname     = "localhost"
    50  	insertVal    = 1
    51  	insertSQL    = "insert into vt_insert_test(id, msg) values (%d, 'test %d')"
    52  	sqlSchema    = `
    53  	create table vt_insert_test (
    54  	id bigint,
    55  	msg varchar(64),
    56  	primary key (id)
    57  	) Engine=InnoDB	
    58  `
    59  	cell1                  = "zone1"
    60  	cell2                  = "zone2"
    61  	ShardName              = "0"
    62  	KeyspaceShard          = KeyspaceName + "/" + ShardName
    63  	replicationWaitTimeout = time.Duration(15 * time.Second)
    64  )
    65  
    66  //region cluster setup/teardown
    67  
    68  // SetupReparentCluster is used to setup the reparent cluster
    69  func SetupReparentCluster(t *testing.T, durability string) *cluster.LocalProcessCluster {
    70  	return setupCluster(context.Background(), t, ShardName, []string{cell1, cell2}, []int{3, 1}, durability)
    71  }
    72  
    73  // SetupRangeBasedCluster sets up the range based cluster
    74  func SetupRangeBasedCluster(ctx context.Context, t *testing.T) *cluster.LocalProcessCluster {
    75  	return setupCluster(ctx, t, ShardName, []string{cell1}, []int{2}, "semi_sync")
    76  }
    77  
    78  // TeardownCluster is used to teardown the reparent cluster
    79  func TeardownCluster(clusterInstance *cluster.LocalProcessCluster) {
    80  	clusterInstance.Teardown()
    81  }
    82  
    83  func setupCluster(ctx context.Context, t *testing.T, shardName string, cells []string, numTablets []int, durability string) *cluster.LocalProcessCluster {
    84  	var tablets []*cluster.Vttablet
    85  	clusterInstance := cluster.NewCluster(cells[0], Hostname)
    86  	keyspace := &cluster.Keyspace{Name: KeyspaceName}
    87  
    88  	// Start topo server
    89  	err := clusterInstance.StartTopo()
    90  	require.NoError(t, err, "Error starting topo")
    91  	err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+cells[0])
    92  	require.NoError(t, err, "Error managing topo")
    93  	numCell := 1
    94  	for numCell < len(cells) {
    95  		err = clusterInstance.VtctlProcess.AddCellInfo(cells[numCell])
    96  		require.NoError(t, err, "Error managing topo")
    97  		numCell++
    98  	}
    99  
   100  	// Adding another cell in the same cluster
   101  	numCell = 0
   102  	for numCell < len(cells) {
   103  		i := 0
   104  		for i < numTablets[numCell] {
   105  			i++
   106  			tablet := clusterInstance.NewVttabletInstance("replica", 100*(numCell+1)+i, cells[numCell])
   107  			tablets = append(tablets, tablet)
   108  		}
   109  		numCell++
   110  	}
   111  
   112  	shard := &cluster.Shard{Name: shardName}
   113  	shard.Vttablets = tablets
   114  
   115  	clusterInstance.VtTabletExtraArgs = append(clusterInstance.VtTabletExtraArgs,
   116  		"--lock_tables_timeout", "5s",
   117  		"--track_schema_versions=true",
   118  		// disabling online-ddl for reparent tests. This is done to reduce flakiness.
   119  		// All the tests in this package reparent frequently between different tablets
   120  		// This means that Promoting a tablet to primary is sometimes immediately followed by a DemotePrimary call.
   121  		// In this case, the close method and initSchema method of the onlineDDL executor race.
   122  		// If the initSchema acquires the lock, then it takes about 30 seconds for it to run during which time the
   123  		// DemotePrimary rpc is stalled!
   124  		"--queryserver_enable_online_ddl=false",
   125  		// disabling active reparents on the tablet since we don't want the replication manager
   126  		// to fix replication if it is stopped. Some tests deliberately do that. Also, we don't want
   127  		// the replication manager to silently fix the replication in case ERS or PRS mess up. All the
   128  		// tests in this test suite should work irrespective of this flag. Each run of ERS, PRS should be
   129  		// setting up the replication correctly.
   130  		"--disable-replication-manager")
   131  
   132  	// Initialize Cluster
   133  	err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard})
   134  	require.NoError(t, err, "Cannot launch cluster")
   135  
   136  	//Start MySql
   137  	var mysqlCtlProcessList []*exec.Cmd
   138  	for _, shard := range clusterInstance.Keyspaces[0].Shards {
   139  		for _, tablet := range shard.Vttablets {
   140  			log.Infof("Starting MySql for tablet %v", tablet.Alias)
   141  			proc, err := tablet.MysqlctlProcess.StartProcess()
   142  			require.NoError(t, err, "Error starting start mysql")
   143  			mysqlCtlProcessList = append(mysqlCtlProcessList, proc)
   144  		}
   145  	}
   146  
   147  	// Wait for mysql processes to start
   148  	for _, proc := range mysqlCtlProcessList {
   149  		if err := proc.Wait(); err != nil {
   150  			clusterInstance.PrintMysqlctlLogFiles()
   151  			require.FailNow(t, "Error starting mysql: %s", err.Error())
   152  		}
   153  	}
   154  	if clusterInstance.VtctlMajorVersion >= 14 {
   155  		clusterInstance.VtctldClientProcess = *cluster.VtctldClientProcessInstance("localhost", clusterInstance.VtctldProcess.GrpcPort, clusterInstance.TmpDirectory)
   156  		out, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", KeyspaceName, fmt.Sprintf("--durability-policy=%s", durability))
   157  		require.NoError(t, err, out)
   158  	}
   159  
   160  	setupShard(ctx, t, clusterInstance, shardName, tablets)
   161  	return clusterInstance
   162  }
   163  
   164  func setupShard(ctx context.Context, t *testing.T, clusterInstance *cluster.LocalProcessCluster, shardName string, tablets []*cluster.Vttablet) {
   165  	for _, tablet := range tablets {
   166  		tablet.VttabletProcess.SupportsBackup = false
   167  		// Start the tablet
   168  		err := tablet.VttabletProcess.Setup()
   169  		require.NoError(t, err)
   170  	}
   171  
   172  	for _, tablet := range tablets {
   173  		err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"})
   174  		require.NoError(t, err)
   175  	}
   176  
   177  	// Initialize shard
   178  	err := clusterInstance.VtctlclientProcess.InitializeShard(KeyspaceName, shardName, tablets[0].Cell, tablets[0].TabletUID)
   179  	require.NoError(t, err)
   180  
   181  	ValidateTopology(t, clusterInstance, true)
   182  
   183  	// create Tables
   184  	RunSQL(ctx, t, sqlSchema, tablets[0])
   185  
   186  	CheckPrimaryTablet(t, clusterInstance, tablets[0])
   187  
   188  	ValidateTopology(t, clusterInstance, false)
   189  	WaitForReplicationToStart(t, clusterInstance, KeyspaceName, shardName, len(tablets), true)
   190  }
   191  
   192  // StartNewVTTablet starts a new vttablet instance
   193  func StartNewVTTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, uuid int, supportsBackup bool) *cluster.Vttablet {
   194  	tablet := clusterInstance.NewVttabletInstance("replica", uuid, cell1)
   195  	keyspace := clusterInstance.Keyspaces[0]
   196  	shard := keyspace.Shards[0]
   197  
   198  	// Setup MysqlctlProcess
   199  	tablet.MysqlctlProcess = *cluster.MysqlCtlProcessInstance(tablet.TabletUID, tablet.MySQLPort, clusterInstance.TmpDirectory)
   200  	// Setup VttabletProcess
   201  	tablet.VttabletProcess = cluster.VttabletProcessInstance(
   202  		tablet.HTTPPort,
   203  		tablet.GrpcPort,
   204  		tablet.TabletUID,
   205  		tablet.Cell,
   206  		shard.Name,
   207  		keyspace.Name,
   208  		clusterInstance.VtctldProcess.Port,
   209  		tablet.Type,
   210  		clusterInstance.TopoProcess.Port,
   211  		clusterInstance.Hostname,
   212  		clusterInstance.TmpDirectory,
   213  		[]string{
   214  			"--lock_tables_timeout", "5s",
   215  			"--track_schema_versions=true",
   216  			"--queryserver_enable_online_ddl=false",
   217  		},
   218  		clusterInstance.DefaultCharset)
   219  	tablet.VttabletProcess.SupportsBackup = supportsBackup
   220  
   221  	log.Infof("Starting MySql for tablet %v", tablet.Alias)
   222  	proc, err := tablet.MysqlctlProcess.StartProcess()
   223  	require.NoError(t, err, "Error starting start mysql")
   224  	if err := proc.Wait(); err != nil {
   225  		clusterInstance.PrintMysqlctlLogFiles()
   226  		require.FailNow(t, "Error starting mysql: %s", err.Error())
   227  	}
   228  
   229  	// The tablet should come up as serving since the primary for the shard already exists
   230  	tablet.VttabletProcess.ServingStatus = "SERVING"
   231  	tablet.VttabletProcess.SupportsBackup = false
   232  	err = tablet.VttabletProcess.Setup()
   233  	require.NoError(t, err)
   234  	return tablet
   235  }
   236  
   237  //endregion
   238  
   239  // region database queries
   240  func getMysqlConnParam(tablet *cluster.Vttablet) mysql.ConnParams {
   241  	connParams := mysql.ConnParams{
   242  		Uname:      username,
   243  		DbName:     dbName,
   244  		UnixSocket: path.Join(os.Getenv("VTDATAROOT"), fmt.Sprintf("/vt_%010d/mysql.sock", tablet.TabletUID)),
   245  	}
   246  	return connParams
   247  }
   248  
   249  // RunSQL is used to run a SQL command directly on the MySQL instance of a vttablet
   250  func RunSQL(ctx context.Context, t *testing.T, sql string, tablet *cluster.Vttablet) *sqltypes.Result {
   251  	tabletParams := getMysqlConnParam(tablet)
   252  	conn, err := mysql.Connect(ctx, &tabletParams)
   253  	require.Nil(t, err)
   254  	defer conn.Close()
   255  	return execute(t, conn, sql)
   256  }
   257  
   258  func execute(t *testing.T, conn *mysql.Conn, query string) *sqltypes.Result {
   259  	t.Helper()
   260  	qr, err := conn.ExecuteFetch(query, 1000, true)
   261  	require.Nil(t, err)
   262  	return qr
   263  }
   264  
   265  //endregion
   266  
   267  // region ers, prs
   268  
   269  // Prs runs PRS
   270  func Prs(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) (string, error) {
   271  	return PrsWithTimeout(t, clusterInstance, tab, false, "", "")
   272  }
   273  
   274  // PrsAvoid runs PRS
   275  func PrsAvoid(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) (string, error) {
   276  	return PrsWithTimeout(t, clusterInstance, tab, true, "", "")
   277  }
   278  
   279  // PrsWithTimeout runs PRS
   280  func PrsWithTimeout(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, avoid bool, actionTimeout, waitTimeout string) (string, error) {
   281  	args := []string{
   282  		"PlannedReparentShard", "--",
   283  		"--keyspace_shard", fmt.Sprintf("%s/%s", KeyspaceName, ShardName)}
   284  	if actionTimeout != "" {
   285  		args = append(args, "--action_timeout", actionTimeout)
   286  	}
   287  	if waitTimeout != "" {
   288  		args = append(args, "--wait_replicas_timeout", waitTimeout)
   289  	}
   290  	if avoid {
   291  		args = append(args, "--avoid_tablet")
   292  	} else {
   293  		args = append(args, "--new_primary")
   294  	}
   295  	args = append(args, tab.Alias)
   296  	out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...)
   297  	return out, err
   298  }
   299  
   300  // Ers runs the ERS
   301  func Ers(clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, totalTimeout, waitReplicasTimeout string) (string, error) {
   302  	return ErsIgnoreTablet(clusterInstance, tab, totalTimeout, waitReplicasTimeout, nil, false)
   303  }
   304  
   305  // ErsIgnoreTablet is used to run ERS
   306  func ErsIgnoreTablet(clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, timeout, waitReplicasTimeout string, tabletsToIgnore []*cluster.Vttablet, preventCrossCellPromotion bool) (string, error) {
   307  	var args []string
   308  	if timeout != "" {
   309  		args = append(args, "--action_timeout", timeout)
   310  	}
   311  	args = append(args, "EmergencyReparentShard", "--", "--keyspace_shard", fmt.Sprintf("%s/%s", KeyspaceName, ShardName))
   312  	if tab != nil {
   313  		args = append(args, "--new_primary", tab.Alias)
   314  	}
   315  	if waitReplicasTimeout != "" {
   316  		args = append(args, "--wait_replicas_timeout", waitReplicasTimeout)
   317  	}
   318  	if preventCrossCellPromotion {
   319  		args = append(args, "--prevent_cross_cell_promotion=true")
   320  	}
   321  	if len(tabletsToIgnore) != 0 {
   322  		tabsString := ""
   323  		for _, vttablet := range tabletsToIgnore {
   324  			if tabsString == "" {
   325  				tabsString = vttablet.Alias
   326  			} else {
   327  				tabsString = tabsString + "," + vttablet.Alias
   328  			}
   329  		}
   330  		args = append(args, "--ignore_replicas", tabsString)
   331  	}
   332  	return clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...)
   333  }
   334  
   335  // ErsWithVtctl runs ERS via vtctl binary
   336  func ErsWithVtctl(clusterInstance *cluster.LocalProcessCluster) (string, error) {
   337  	args := []string{"EmergencyReparentShard", "--", "--keyspace_shard", fmt.Sprintf("%s/%s", KeyspaceName, ShardName)}
   338  	return clusterInstance.VtctlProcess.ExecuteCommandWithOutput(args...)
   339  }
   340  
   341  // endregion
   342  
   343  // region validations
   344  
   345  // ValidateTopology is used to validate the topology
   346  func ValidateTopology(t *testing.T, clusterInstance *cluster.LocalProcessCluster, pingTablets bool) {
   347  	args := []string{"Validate"}
   348  
   349  	if pingTablets {
   350  		args = append(args, "--", "--ping-tablets=true")
   351  	}
   352  	out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...)
   353  	require.Empty(t, out)
   354  	require.NoError(t, err)
   355  }
   356  
   357  // ConfirmReplication confirms that the replication is working properly
   358  func ConfirmReplication(t *testing.T, primary *cluster.Vttablet, replicas []*cluster.Vttablet) int {
   359  	ctx := context.Background()
   360  	insertVal++
   361  	n := insertVal // unique value ...
   362  	// insert data into the new primary, check the connected replica work
   363  	insertSQL := fmt.Sprintf(insertSQL, n, n)
   364  	RunSQL(ctx, t, insertSQL, primary)
   365  	for _, tab := range replicas {
   366  		err := CheckInsertedValues(ctx, t, tab, n)
   367  		require.NoError(t, err)
   368  	}
   369  	return n
   370  }
   371  
   372  // ConfirmOldPrimaryIsHangingAround confirms that the old primary is hanging around
   373  func ConfirmOldPrimaryIsHangingAround(t *testing.T, clusterInstance *cluster.LocalProcessCluster) {
   374  	out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate")
   375  	require.Error(t, err)
   376  	require.Contains(t, out, "already has primary")
   377  }
   378  
   379  // CheckPrimaryTablet makes sure the tablet type is primary, and its health check agrees.
   380  func CheckPrimaryTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet) {
   381  	result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias)
   382  	require.NoError(t, err)
   383  	var tabletInfo topodatapb.Tablet
   384  	err = json2.Unmarshal([]byte(result), &tabletInfo)
   385  	require.NoError(t, err)
   386  	assert.Equal(t, topodatapb.TabletType_PRIMARY, tabletInfo.GetType())
   387  
   388  	// make sure the health stream is updated
   389  	shrs, err := clusterInstance.StreamTabletHealth(context.Background(), tablet, 1)
   390  	require.NoError(t, err)
   391  	streamHealthResponse := shrs[0]
   392  
   393  	assert.True(t, streamHealthResponse.GetServing())
   394  	tabletType := streamHealthResponse.GetTarget().GetTabletType()
   395  	assert.Equal(t, topodatapb.TabletType_PRIMARY, tabletType)
   396  }
   397  
   398  // isHealthyPrimaryTablet will return if tablet is primary AND healthy.
   399  func isHealthyPrimaryTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet) bool {
   400  	result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias)
   401  	require.Nil(t, err)
   402  	var tabletInfo topodatapb.Tablet
   403  	err = json2.Unmarshal([]byte(result), &tabletInfo)
   404  	require.Nil(t, err)
   405  	if tabletInfo.GetType() != topodatapb.TabletType_PRIMARY {
   406  		return false
   407  	}
   408  
   409  	// make sure the health stream is updated
   410  	shrs, err := clusterInstance.StreamTabletHealth(context.Background(), tablet, 1)
   411  	require.NoError(t, err)
   412  	streamHealthResponse := shrs[0]
   413  
   414  	assert.True(t, streamHealthResponse.GetServing())
   415  	tabletType := streamHealthResponse.GetTarget().GetTabletType()
   416  	return tabletType == topodatapb.TabletType_PRIMARY
   417  }
   418  
   419  // CheckInsertedValues checks that the given value is present in the given tablet
   420  func CheckInsertedValues(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, index int) error {
   421  	query := fmt.Sprintf("select msg from vt_insert_test where id=%d", index)
   422  	tabletParams := getMysqlConnParam(tablet)
   423  	var conn *mysql.Conn
   424  
   425  	// wait until it gets the data
   426  	timeout := time.Now().Add(replicationWaitTimeout)
   427  	i := 0
   428  	for time.Now().Before(timeout) {
   429  		// We start with no connection to MySQL
   430  		if conn == nil {
   431  			// Try connecting to MySQL
   432  			mysqlConn, err := mysql.Connect(ctx, &tabletParams)
   433  			// This can fail if the database create hasn't been replicated yet.
   434  			// We ignore this failure and try again later
   435  			if err == nil {
   436  				// If we succeed, then we store the connection
   437  				// and reuse it for checking the rows in the table.
   438  				conn = mysqlConn
   439  				defer conn.Close()
   440  			}
   441  		}
   442  		if conn != nil {
   443  			// We'll get a mysql.ERNoSuchTable (1146) error if the CREATE TABLE has not replicated yet and
   444  			// it's possible that we get other ephemeral errors too, so we make the tests more robust by
   445  			// retrying with the timeout.
   446  			qr, err := conn.ExecuteFetch(query, 1, true)
   447  			if err == nil && len(qr.Rows) == 1 {
   448  				return nil
   449  			}
   450  		}
   451  		t := time.Duration(300 * i)
   452  		time.Sleep(t * time.Millisecond)
   453  		i++
   454  	}
   455  	return fmt.Errorf("data did not get replicated on tablet %s within the timeout of %v", tablet.Alias, replicationWaitTimeout)
   456  }
   457  
   458  func CheckSemiSyncSetupCorrectly(t *testing.T, tablet *cluster.Vttablet, semiSyncVal string) {
   459  	dbVar, err := tablet.VttabletProcess.GetDBVar("rpl_semi_sync_slave_enabled", "")
   460  	require.NoError(t, err)
   461  	require.Equal(t, semiSyncVal, dbVar)
   462  }
   463  
   464  // CheckCountOfInsertedValues checks that the number of inserted values matches the given count on the given tablet
   465  func CheckCountOfInsertedValues(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, count int) error {
   466  	selectSQL := "select * from vt_insert_test"
   467  	qr := RunSQL(ctx, t, selectSQL, tablet)
   468  	if len(qr.Rows) == count {
   469  		return nil
   470  	}
   471  	return fmt.Errorf("count does not match on the tablet %s", tablet.Alias)
   472  }
   473  
   474  // endregion
   475  
   476  // region tablet operations
   477  
   478  // StopTablet stops the tablet
   479  func StopTablet(t *testing.T, tab *cluster.Vttablet, stopDatabase bool) {
   480  	err := tab.VttabletProcess.TearDownWithTimeout(30 * time.Second)
   481  	require.NoError(t, err)
   482  	if stopDatabase {
   483  		err = tab.MysqlctlProcess.Stop()
   484  		require.NoError(t, err)
   485  	}
   486  }
   487  
   488  // RestartTablet restarts the tablet
   489  func RestartTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) {
   490  	tab.MysqlctlProcess.InitMysql = false
   491  	err := tab.MysqlctlProcess.Start()
   492  	require.NoError(t, err)
   493  	err = clusterInstance.VtctlclientProcess.InitTablet(tab, tab.Cell, KeyspaceName, Hostname, ShardName)
   494  	require.NoError(t, err)
   495  }
   496  
   497  // ResurrectTablet is used to resurrect the given tablet
   498  func ResurrectTablet(ctx context.Context, t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) {
   499  	tab.MysqlctlProcess.InitMysql = false
   500  	err := tab.MysqlctlProcess.Start()
   501  	require.NoError(t, err)
   502  	err = clusterInstance.VtctlclientProcess.InitTablet(tab, tab.Cell, KeyspaceName, Hostname, ShardName)
   503  	require.NoError(t, err)
   504  
   505  	// As there is already a primary the new replica will come directly in SERVING state
   506  	tab.VttabletProcess.ServingStatus = "SERVING"
   507  	// Start the tablet
   508  	err = tab.VttabletProcess.Setup()
   509  	require.NoError(t, err)
   510  
   511  	err = CheckInsertedValues(ctx, t, tab, insertVal)
   512  	require.NoError(t, err)
   513  }
   514  
   515  // DeleteTablet is used to delete the given tablet
   516  func DeleteTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) {
   517  	err := clusterInstance.VtctlclientProcess.ExecuteCommand(
   518  		"DeleteTablet", "--",
   519  		"--allow_primary",
   520  		tab.Alias)
   521  	require.NoError(t, err)
   522  }
   523  
   524  // endregion
   525  
   526  // region get info
   527  
   528  // GetNewPrimary is used to find the new primary of the cluster.
   529  func GetNewPrimary(t *testing.T, clusterInstance *cluster.LocalProcessCluster) *cluster.Vttablet {
   530  	var newPrimary *cluster.Vttablet
   531  	for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets[1:] {
   532  		if isHealthyPrimaryTablet(t, clusterInstance, tablet) {
   533  			newPrimary = tablet
   534  			break
   535  		}
   536  	}
   537  	require.NotNil(t, newPrimary)
   538  	return newPrimary
   539  }
   540  
   541  // GetShardReplicationPositions gets the shards replication positions.
   542  // This should not generally be called directly, instead use the WaitForReplicationToCatchup method.
   543  func GetShardReplicationPositions(t *testing.T, clusterInstance *cluster.LocalProcessCluster, keyspaceName, shardName string, doPrint bool) []string {
   544  	output, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(
   545  		"ShardReplicationPositions", fmt.Sprintf("%s/%s", keyspaceName, shardName))
   546  	require.NoError(t, err)
   547  	strArray := strings.Split(output, "\n")
   548  	if strArray[len(strArray)-1] == "" {
   549  		strArray = strArray[:len(strArray)-1] // Truncate slice, remove empty line
   550  	}
   551  	if doPrint {
   552  		log.Infof("Positions:")
   553  		for _, pos := range strArray {
   554  			log.Infof("\t%s", pos)
   555  		}
   556  	}
   557  	return strArray
   558  }
   559  
   560  func WaitForReplicationToStart(t *testing.T, clusterInstance *cluster.LocalProcessCluster, keyspaceName, shardName string, tabletCnt int, doPrint bool) {
   561  	tkr := time.NewTicker(500 * time.Millisecond)
   562  	defer tkr.Stop()
   563  	for {
   564  		select {
   565  		case <-tkr.C:
   566  			strArray := GetShardReplicationPositions(t, clusterInstance, KeyspaceName, shardName, true)
   567  			if len(strArray) == tabletCnt && strings.Contains(strArray[0], "primary") { // primary first
   568  				return
   569  			}
   570  		case <-time.After(replicationWaitTimeout):
   571  			require.FailNow(t, fmt.Sprintf("replication did not start everywhere in %s/%s within the timeout of %v",
   572  				keyspaceName, shardName, replicationWaitTimeout))
   573  			return
   574  		}
   575  	}
   576  }
   577  
   578  // endregion
   579  
   580  // CheckReplicaStatus checks the replication status and asserts that the replication is stopped
   581  func CheckReplicaStatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet) {
   582  	qr := RunSQL(ctx, t, "show slave status", tablet)
   583  	IOThreadRunning := fmt.Sprintf("%v", qr.Rows[0][10])
   584  	SQLThreadRunning := fmt.Sprintf("%v", qr.Rows[0][10])
   585  	assert.Equal(t, IOThreadRunning, "VARCHAR(\"No\")")
   586  	assert.Equal(t, SQLThreadRunning, "VARCHAR(\"No\")")
   587  }
   588  
   589  // CheckReparentFromOutside checks that cluster was reparented from outside
   590  func CheckReparentFromOutside(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet, downPrimary bool, baseTime int64) {
   591  	result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetShardReplication", cell1, KeyspaceShard)
   592  	require.Nil(t, err, "error should be Nil")
   593  	if !downPrimary {
   594  		assertNodeCount(t, result, int(3))
   595  	} else {
   596  		assertNodeCount(t, result, int(2))
   597  	}
   598  
   599  	// make sure the primary status page says it's the primary
   600  	status := tablet.VttabletProcess.GetStatus()
   601  	assert.Contains(t, status, "Tablet Type: PRIMARY")
   602  
   603  	// make sure the primary health stream says it's the primary too
   604  	// (health check is disabled on these servers, force it first)
   605  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", tablet.Alias)
   606  	require.NoError(t, err)
   607  
   608  	shrs, err := clusterInstance.StreamTabletHealth(context.Background(), tablet, 1)
   609  	require.NoError(t, err)
   610  	streamHealthResponse := shrs[0]
   611  
   612  	assert.Equal(t, streamHealthResponse.Target.TabletType, topodatapb.TabletType_PRIMARY)
   613  	assert.True(t, streamHealthResponse.TabletExternallyReparentedTimestamp >= baseTime)
   614  }
   615  
   616  // WaitForReplicationPosition waits for tablet B to catch up to the replication position of tablet A.
   617  func WaitForReplicationPosition(t *testing.T, tabletA *cluster.Vttablet, tabletB *cluster.Vttablet) error {
   618  	posA, _ := cluster.GetPrimaryPosition(t, *tabletA, Hostname)
   619  	timeout := time.Now().Add(replicationWaitTimeout)
   620  	for time.Now().Before(timeout) {
   621  		posB, _ := cluster.GetPrimaryPosition(t, *tabletB, Hostname)
   622  		if positionAtLeast(t, tabletB, posA, posB) {
   623  			return nil
   624  		}
   625  		time.Sleep(500 * time.Millisecond)
   626  	}
   627  	return fmt.Errorf("failed to catch up on replication position")
   628  }
   629  
   630  // positionAtLeast executes the command position at_least
   631  func positionAtLeast(t *testing.T, tablet *cluster.Vttablet, a string, b string) bool {
   632  	isAtleast := false
   633  	val, err := tablet.MysqlctlProcess.ExecuteCommandWithOutput("position", "at_least", a, b)
   634  	require.NoError(t, err)
   635  	if strings.Contains(val, "true") {
   636  		isAtleast = true
   637  	}
   638  	return isAtleast
   639  }
   640  
   641  func assertNodeCount(t *testing.T, result string, want int) {
   642  	resultMap := make(map[string]any)
   643  	err := json.Unmarshal([]byte(result), &resultMap)
   644  	require.NoError(t, err)
   645  
   646  	nodes := reflect.ValueOf(resultMap["nodes"])
   647  	got := nodes.Len()
   648  	assert.Equal(t, want, got)
   649  }
   650  
   651  // CheckDBvar checks the db var
   652  func CheckDBvar(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, variable string, status string) {
   653  	tabletParams := getMysqlConnParam(tablet)
   654  	conn, err := mysql.Connect(ctx, &tabletParams)
   655  	require.NoError(t, err)
   656  	defer conn.Close()
   657  
   658  	qr := execute(t, conn, fmt.Sprintf("show variables like '%s'", variable))
   659  	got := fmt.Sprintf("%v", qr.Rows)
   660  	want := fmt.Sprintf("[[VARCHAR(\"%s\") VARCHAR(\"%s\")]]", variable, status)
   661  	assert.Equal(t, want, got)
   662  }
   663  
   664  // CheckDBstatus checks the db status
   665  func CheckDBstatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, variable string, status string) {
   666  	tabletParams := getMysqlConnParam(tablet)
   667  	conn, err := mysql.Connect(ctx, &tabletParams)
   668  	require.NoError(t, err)
   669  	defer conn.Close()
   670  
   671  	qr := execute(t, conn, fmt.Sprintf("show status like '%s'", variable))
   672  	got := fmt.Sprintf("%v", qr.Rows)
   673  	want := fmt.Sprintf("[[VARCHAR(\"%s\") VARCHAR(\"%s\")]]", variable, status)
   674  	assert.Equal(t, want, got)
   675  }
   676  
   677  // SetReplicationSourceFailed returns true if the given output from PRS had failed because the given tablet was
   678  // unable to setReplicationSource. Since some tests are used in upgrade-downgrade testing, we need this function to
   679  // work with different versions of vtctl.
   680  func SetReplicationSourceFailed(tablet *cluster.Vttablet, prsOut string) bool {
   681  	return strings.Contains(prsOut, fmt.Sprintf("tablet %s failed to SetReplicationSource", tablet.Alias))
   682  }
   683  
   684  // CheckReplicationStatus checks that the replication for sql and io threads is setup as expected
   685  func CheckReplicationStatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, sqlThreadRunning bool, ioThreadRunning bool) {
   686  	res := RunSQL(ctx, t, "show slave status;", tablet)
   687  	if ioThreadRunning {
   688  		require.Equal(t, "Yes", res.Rows[0][10].ToString())
   689  	} else {
   690  		require.Equal(t, "No", res.Rows[0][10].ToString())
   691  	}
   692  
   693  	if sqlThreadRunning {
   694  		require.Equal(t, "Yes", res.Rows[0][11].ToString())
   695  	} else {
   696  		require.Equal(t, "No", res.Rows[0][11].ToString())
   697  	}
   698  }
   699  
   700  func WaitForTabletToBeServing(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet, timeout time.Duration) {
   701  	vTablet, err := clusterInstance.VtctlclientGetTablet(tablet)
   702  	require.NoError(t, err)
   703  
   704  	tConn, err := tabletconn.GetDialer()(vTablet, false)
   705  	require.NoError(t, err)
   706  
   707  	newCtx, cancel := context.WithTimeout(context.Background(), timeout)
   708  	err = tConn.StreamHealth(newCtx, func(shr *querypb.StreamHealthResponse) error {
   709  		if shr.Serving {
   710  			cancel()
   711  		}
   712  		return nil
   713  	})
   714  
   715  	// the error should only be because we cancelled the context when the tablet became serving again.
   716  	if err != nil && !strings.Contains(err.Error(), "context canceled") {
   717  		t.Fatal(err.Error())
   718  	}
   719  }