vitess.io/vitess@v0.16.2/go/test/endtoend/vtorc/utils/utils.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package utils
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os"
    23  	"os/exec"
    24  	"path"
    25  	"strings"
    26  	"testing"
    27  	"time"
    28  
    29  	"github.com/stretchr/testify/assert"
    30  	"github.com/stretchr/testify/require"
    31  
    32  	"vitess.io/vitess/go/json2"
    33  	"vitess.io/vitess/go/mysql"
    34  	"vitess.io/vitess/go/sqltypes"
    35  	"vitess.io/vitess/go/test/endtoend/cluster"
    36  	"vitess.io/vitess/go/vt/log"
    37  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    38  	"vitess.io/vitess/go/vt/topo"
    39  	"vitess.io/vitess/go/vt/topo/topoproto"
    40  
    41  	// Register topo implementations.
    42  	_ "vitess.io/vitess/go/vt/topo/consultopo"
    43  	_ "vitess.io/vitess/go/vt/topo/etcd2topo"
    44  	_ "vitess.io/vitess/go/vt/topo/k8stopo"
    45  	_ "vitess.io/vitess/go/vt/topo/zk2topo"
    46  )
    47  
    48  const (
    49  	keyspaceName = "ks"
    50  	shardName    = "0"
    51  	Hostname     = "localhost"
    52  	Cell1        = "zone1"
    53  	Cell2        = "zone2"
    54  )
    55  
    56  // CellInfo stores the information regarding 1 cell including the tablets it contains
    57  type CellInfo struct {
    58  	CellName       string
    59  	ReplicaTablets []*cluster.Vttablet
    60  	RdonlyTablets  []*cluster.Vttablet
    61  	// constants that should be set in TestMain
    62  	NumReplicas int
    63  	NumRdonly   int
    64  	UIDBase     int
    65  }
    66  
    67  // VTOrcClusterInfo stores the information for a cluster. This is supposed to be used only for VTOrc tests.
    68  type VTOrcClusterInfo struct {
    69  	ClusterInstance     *cluster.LocalProcessCluster
    70  	Ts                  *topo.Server
    71  	CellInfos           []*CellInfo
    72  	VtctldClientProcess *cluster.VtctldClientProcess
    73  	lastUsedValue       int
    74  }
    75  
    76  // CreateClusterAndStartTopo starts the cluster and topology service
    77  func CreateClusterAndStartTopo(cellInfos []*CellInfo) (*VTOrcClusterInfo, error) {
    78  	clusterInstance := cluster.NewCluster(Cell1, Hostname)
    79  
    80  	// Start topo server
    81  	err := clusterInstance.StartTopo()
    82  	if err != nil {
    83  		return nil, err
    84  	}
    85  
    86  	// Adding another cell in the same cluster
    87  	err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+Cell2)
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  	err = clusterInstance.VtctlProcess.AddCellInfo(Cell2)
    92  	if err != nil {
    93  		return nil, err
    94  	}
    95  
    96  	// create the vttablets
    97  	err = createVttablets(clusterInstance, cellInfos)
    98  	if err != nil {
    99  		return nil, err
   100  	}
   101  
   102  	// store the vtctldclient process
   103  	vtctldClientProcess := cluster.VtctldClientProcessInstance("localhost", clusterInstance.VtctldProcess.GrpcPort, clusterInstance.TmpDirectory)
   104  
   105  	// create topo server connection
   106  	ts, err := topo.OpenServer(*clusterInstance.TopoFlavorString(), clusterInstance.VtctlProcess.TopoGlobalAddress, clusterInstance.VtctlProcess.TopoGlobalRoot)
   107  	return &VTOrcClusterInfo{
   108  		ClusterInstance:     clusterInstance,
   109  		Ts:                  ts,
   110  		CellInfos:           cellInfos,
   111  		lastUsedValue:       100,
   112  		VtctldClientProcess: vtctldClientProcess,
   113  	}, err
   114  }
   115  
   116  // createVttablets is used to create the vttablets for all the tests
   117  func createVttablets(clusterInstance *cluster.LocalProcessCluster, cellInfos []*CellInfo) error {
   118  	keyspace := &cluster.Keyspace{Name: keyspaceName}
   119  	shard0 := &cluster.Shard{Name: shardName}
   120  
   121  	// creating tablets by hand instead of using StartKeyspace because we don't want to call InitShardPrimary
   122  	var tablets []*cluster.Vttablet
   123  	for _, cellInfo := range cellInfos {
   124  		for i := 0; i < cellInfo.NumReplicas; i++ {
   125  			vttabletInstance := clusterInstance.NewVttabletInstance("replica", cellInfo.UIDBase, cellInfo.CellName)
   126  			cellInfo.UIDBase++
   127  			tablets = append(tablets, vttabletInstance)
   128  			cellInfo.ReplicaTablets = append(cellInfo.ReplicaTablets, vttabletInstance)
   129  		}
   130  		for i := 0; i < cellInfo.NumRdonly; i++ {
   131  			vttabletInstance := clusterInstance.NewVttabletInstance("rdonly", cellInfo.UIDBase, cellInfo.CellName)
   132  			cellInfo.UIDBase++
   133  			tablets = append(tablets, vttabletInstance)
   134  			cellInfo.RdonlyTablets = append(cellInfo.RdonlyTablets, vttabletInstance)
   135  		}
   136  	}
   137  	clusterInstance.VtTabletExtraArgs = []string{
   138  		"--lock_tables_timeout", "5s",
   139  		"--disable_active_reparents",
   140  	}
   141  	// Initialize Cluster
   142  	shard0.Vttablets = tablets
   143  	err := clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard0})
   144  	if err != nil {
   145  		return err
   146  	}
   147  	//Start MySql
   148  	var mysqlCtlProcessList []*exec.Cmd
   149  	for _, tablet := range shard0.Vttablets {
   150  		log.Infof("Starting MySql for tablet %v", tablet.Alias)
   151  		proc, err := tablet.MysqlctlProcess.StartProcess()
   152  		if err != nil {
   153  			return err
   154  		}
   155  		mysqlCtlProcessList = append(mysqlCtlProcessList, proc)
   156  	}
   157  	// Wait for mysql processes to start
   158  	for _, proc := range mysqlCtlProcessList {
   159  		err := proc.Wait()
   160  		if err != nil {
   161  			return err
   162  		}
   163  	}
   164  	for _, tablet := range shard0.Vttablets {
   165  		// Reset status, don't wait for the tablet status. We will check it later
   166  		tablet.VttabletProcess.ServingStatus = ""
   167  		// Start the tablet
   168  		err := tablet.VttabletProcess.Setup()
   169  		if err != nil {
   170  			return err
   171  		}
   172  	}
   173  	for _, tablet := range shard0.Vttablets {
   174  		err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"})
   175  		if err != nil {
   176  			return err
   177  		}
   178  	}
   179  
   180  	// we also need to wait for the tablet type to change from restore to replica, before we delete a tablet from the topology
   181  	// otherwise it will notice that their is no record for the tablet in the topology when it tries to update its state and shutdown itself!
   182  	for _, tablet := range shard0.Vttablets {
   183  		err := tablet.VttabletProcess.WaitForTabletTypes([]string{"replica", "rdonly"})
   184  		if err != nil {
   185  			return err
   186  		}
   187  	}
   188  
   189  	return nil
   190  }
   191  
   192  // shutdownVttablets shuts down all the vttablets and removes them from the topology
   193  func shutdownVttablets(clusterInfo *VTOrcClusterInfo) error {
   194  	// reset the shard primary
   195  	err := resetShardPrimary(clusterInfo.Ts)
   196  	if err != nil {
   197  		return err
   198  	}
   199  
   200  	for _, vttablet := range clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets {
   201  		// we need to stop a vttablet only if it is not shutdown
   202  		if !vttablet.VttabletProcess.IsShutdown() {
   203  			// Stop the vttablets
   204  			err := vttablet.VttabletProcess.TearDown()
   205  			if err != nil {
   206  				return err
   207  			}
   208  			// Remove the tablet record for this tablet
   209  		}
   210  		err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", vttablet.Alias)
   211  		if err != nil {
   212  			return err
   213  		}
   214  	}
   215  	clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets = nil
   216  	return nil
   217  }
   218  
   219  // resetShardPrimary resets the shard's primary
   220  func resetShardPrimary(ts *topo.Server) (err error) {
   221  	// lock the shard
   222  	ctx, unlock, lockErr := ts.LockShard(context.Background(), keyspaceName, shardName, "resetShardPrimary-vtorc-endtoend-test")
   223  	if lockErr != nil {
   224  		return lockErr
   225  	}
   226  	defer unlock(&err)
   227  
   228  	// update the shard record's primary
   229  	if _, err = ts.UpdateShardFields(ctx, keyspaceName, shardName, func(si *topo.ShardInfo) error {
   230  		si.PrimaryAlias = nil
   231  		return nil
   232  	}); err != nil {
   233  		return err
   234  	}
   235  	return
   236  }
   237  
   238  // StartVTOrcs is used to start the vtorcs with the given extra arguments
   239  func StartVTOrcs(t *testing.T, clusterInfo *VTOrcClusterInfo, orcExtraArgs []string, config cluster.VTOrcConfiguration, count int) {
   240  	t.Helper()
   241  	// Start vtorc
   242  	for i := 0; i < count; i++ {
   243  		vtorcProcess := clusterInfo.ClusterInstance.NewVTOrcProcess(config)
   244  		vtorcProcess.ExtraArgs = orcExtraArgs
   245  		err := vtorcProcess.Setup()
   246  		require.NoError(t, err)
   247  		clusterInfo.ClusterInstance.VTOrcProcesses = append(clusterInfo.ClusterInstance.VTOrcProcesses, vtorcProcess)
   248  	}
   249  }
   250  
   251  // StopVTOrcs is used to stop the vtorcs
   252  func StopVTOrcs(t *testing.T, clusterInfo *VTOrcClusterInfo) {
   253  	t.Helper()
   254  	// Stop vtorc
   255  	for _, vtorcProcess := range clusterInfo.ClusterInstance.VTOrcProcesses {
   256  		if err := vtorcProcess.TearDown(); err != nil {
   257  			log.Errorf("Error in vtorc teardown: %v", err)
   258  		}
   259  	}
   260  	clusterInfo.ClusterInstance.VTOrcProcesses = nil
   261  }
   262  
   263  // SetupVttabletsAndVTOrcs is used to setup the vttablets and start the vtorcs
   264  func SetupVttabletsAndVTOrcs(t *testing.T, clusterInfo *VTOrcClusterInfo, numReplicasReqCell1, numRdonlyReqCell1 int, orcExtraArgs []string, config cluster.VTOrcConfiguration, vtorcCount int, durability string) {
   265  	// stop vtorc if it is running
   266  	StopVTOrcs(t, clusterInfo)
   267  
   268  	// remove all the vttablets so that each test can add the amount that they require
   269  	err := shutdownVttablets(clusterInfo)
   270  	require.NoError(t, err)
   271  
   272  	for _, cellInfo := range clusterInfo.CellInfos {
   273  		if cellInfo.CellName == Cell1 {
   274  			for _, tablet := range cellInfo.ReplicaTablets {
   275  				if numReplicasReqCell1 == 0 {
   276  					break
   277  				}
   278  				cleanAndStartVttablet(t, clusterInfo, tablet)
   279  				numReplicasReqCell1--
   280  			}
   281  
   282  			for _, tablet := range cellInfo.RdonlyTablets {
   283  				if numRdonlyReqCell1 == 0 {
   284  					break
   285  				}
   286  				cleanAndStartVttablet(t, clusterInfo, tablet)
   287  				numRdonlyReqCell1--
   288  			}
   289  		}
   290  	}
   291  
   292  	if numRdonlyReqCell1 > 0 || numReplicasReqCell1 > 0 {
   293  		t.Fatalf("more than available tablets requested. Please increase the constants numReplicas or numRdonly")
   294  	}
   295  
   296  	// wait for the tablets to come up properly
   297  	for _, tablet := range clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets {
   298  		err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"})
   299  		require.NoError(t, err)
   300  	}
   301  	for _, tablet := range clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets {
   302  		err := tablet.VttabletProcess.WaitForTabletTypes([]string{"replica", "rdonly"})
   303  		require.NoError(t, err)
   304  	}
   305  
   306  	if durability == "" {
   307  		durability = "none"
   308  	}
   309  	out, err := clusterInfo.VtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspaceName, fmt.Sprintf("--durability-policy=%s", durability))
   310  	require.NoError(t, err, out)
   311  
   312  	// start vtorc
   313  	StartVTOrcs(t, clusterInfo, orcExtraArgs, config, vtorcCount)
   314  }
   315  
   316  // cleanAndStartVttablet cleans the MySQL instance underneath for running a new test. It also starts the vttablet.
   317  func cleanAndStartVttablet(t *testing.T, clusterInfo *VTOrcClusterInfo, vttablet *cluster.Vttablet) {
   318  	t.Helper()
   319  	// set super-read-only to false
   320  	_, err := RunSQL(t, "SET GLOBAL super_read_only = OFF", vttablet, "")
   321  	require.NoError(t, err)
   322  	// remove the databases if they exist
   323  	_, err = RunSQL(t, "DROP DATABASE IF EXISTS vt_ks", vttablet, "")
   324  	require.NoError(t, err)
   325  	_, err = RunSQL(t, "DROP DATABASE IF EXISTS _vt", vttablet, "")
   326  	require.NoError(t, err)
   327  	// stop the replication
   328  	_, err = RunSQL(t, "STOP SLAVE", vttablet, "")
   329  	require.NoError(t, err)
   330  	// reset the binlog
   331  	_, err = RunSQL(t, "RESET MASTER", vttablet, "")
   332  	require.NoError(t, err)
   333  	// set read-only to true
   334  	_, err = RunSQL(t, "SET GLOBAL read_only = ON", vttablet, "")
   335  	require.NoError(t, err)
   336  
   337  	// start the vttablet
   338  	err = vttablet.VttabletProcess.Setup()
   339  	require.NoError(t, err)
   340  
   341  	clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets, vttablet)
   342  }
   343  
   344  // ShardPrimaryTablet waits until a primary tablet has been elected for the given shard and returns it
   345  func ShardPrimaryTablet(t *testing.T, clusterInfo *VTOrcClusterInfo, keyspace *cluster.Keyspace, shard *cluster.Shard) *cluster.Vttablet {
   346  	start := time.Now()
   347  	for {
   348  		now := time.Now()
   349  		if now.Sub(start) > time.Second*60 {
   350  			assert.FailNow(t, "failed to elect primary before timeout")
   351  		}
   352  		result, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", fmt.Sprintf("%s/%s", keyspace.Name, shard.Name))
   353  		assert.Nil(t, err)
   354  
   355  		var shardInfo topodatapb.Shard
   356  		err = json2.Unmarshal([]byte(result), &shardInfo)
   357  		assert.Nil(t, err)
   358  		if shardInfo.PrimaryAlias == nil {
   359  			log.Warningf("Shard %v/%v has no primary yet, sleep for 1 second\n", keyspace.Name, shard.Name)
   360  			time.Sleep(time.Second)
   361  			continue
   362  		}
   363  		for _, tablet := range shard.Vttablets {
   364  			if tablet.Alias == topoproto.TabletAliasString(shardInfo.PrimaryAlias) {
   365  				return tablet
   366  			}
   367  		}
   368  	}
   369  }
   370  
   371  // CheckPrimaryTablet waits until the specified tablet becomes the primary tablet
   372  // Makes sure the tablet type is primary, and its health check agrees.
   373  func CheckPrimaryTablet(t *testing.T, clusterInfo *VTOrcClusterInfo, tablet *cluster.Vttablet, checkServing bool) {
   374  	start := time.Now()
   375  	for {
   376  		now := time.Now()
   377  		if now.Sub(start) > time.Second*60 {
   378  			//log.Exitf("error")
   379  			assert.FailNow(t, "failed to elect primary before timeout")
   380  		}
   381  		result, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias)
   382  		require.NoError(t, err)
   383  		var tabletInfo topodatapb.Tablet
   384  		err = json2.Unmarshal([]byte(result), &tabletInfo)
   385  		require.NoError(t, err)
   386  
   387  		if topodatapb.TabletType_PRIMARY != tabletInfo.GetType() {
   388  			log.Warningf("Tablet %v is not primary yet, sleep for 1 second\n", tablet.Alias)
   389  			time.Sleep(time.Second)
   390  			continue
   391  		}
   392  		// make sure the health stream is updated
   393  		shrs, err := clusterInfo.ClusterInstance.StreamTabletHealth(context.Background(), tablet, 1)
   394  		require.NoError(t, err)
   395  
   396  		streamHealthResponse := shrs[0]
   397  
   398  		if checkServing && !streamHealthResponse.GetServing() {
   399  			log.Warningf("Tablet %v is not serving in health stream yet, sleep for 1 second\n", tablet.Alias)
   400  			time.Sleep(time.Second)
   401  			continue
   402  		}
   403  		tabletType := streamHealthResponse.GetTarget().GetTabletType()
   404  		if tabletType != topodatapb.TabletType_PRIMARY {
   405  			log.Warningf("Tablet %v is not primary in health stream yet, sleep for 1 second\n", tablet.Alias)
   406  			time.Sleep(time.Second)
   407  			continue
   408  		}
   409  		break
   410  	}
   411  }
   412  
   413  // CheckReplication checks that the replication is setup correctly and writes succeed and are replicated on all the replicas
   414  func CheckReplication(t *testing.T, clusterInfo *VTOrcClusterInfo, primary *cluster.Vttablet, replicas []*cluster.Vttablet, timeToWait time.Duration) {
   415  	endTime := time.Now().Add(timeToWait)
   416  	// create tables, insert data and make sure it is replicated correctly
   417  	sqlSchema := `
   418  		create table if not exists vt_ks.vt_insert_test (
   419  		id bigint,
   420  		msg varchar(64),
   421  		primary key (id)
   422  		) Engine=InnoDB
   423  		`
   424  	timeout := time.After(time.Until(endTime))
   425  	for {
   426  		select {
   427  		case <-timeout:
   428  			t.Fatal("timedout waiting for keyspace vt_ks to be created by schema engine")
   429  			return
   430  		default:
   431  			_, err := RunSQL(t, sqlSchema, primary, "")
   432  			if err != nil {
   433  				log.Warningf("create table failed on primary - %v, will retry", err)
   434  				time.Sleep(100 * time.Millisecond)
   435  				break
   436  			}
   437  			confirmReplication(t, primary, replicas, time.Until(endTime), clusterInfo.lastUsedValue)
   438  			clusterInfo.lastUsedValue++
   439  			validateTopology(t, clusterInfo, true, time.Until(endTime))
   440  			return
   441  		}
   442  	}
   443  }
   444  
   445  // VerifyWritesSucceed inserts more data into the table vt_insert_test and checks that it is replicated too
   446  // Call this function only after CheckReplication has been executed once, since that function creates the table that this function uses.
   447  func VerifyWritesSucceed(t *testing.T, clusterInfo *VTOrcClusterInfo, primary *cluster.Vttablet, replicas []*cluster.Vttablet, timeToWait time.Duration) {
   448  	t.Helper()
   449  	confirmReplication(t, primary, replicas, timeToWait, clusterInfo.lastUsedValue)
   450  	clusterInfo.lastUsedValue++
   451  }
   452  
   453  func confirmReplication(t *testing.T, primary *cluster.Vttablet, replicas []*cluster.Vttablet, timeToWait time.Duration, valueToInsert int) {
   454  	t.Helper()
   455  	log.Infof("Insert data into primary and check that it is replicated to replica")
   456  	// insert data into the new primary, check the connected replica work
   457  	insertSQL := fmt.Sprintf("insert into vt_insert_test(id, msg) values (%d, 'test %d')", valueToInsert, valueToInsert)
   458  	_, err := RunSQL(t, insertSQL, primary, "vt_ks")
   459  	require.NoError(t, err)
   460  	time.Sleep(100 * time.Millisecond)
   461  	timeout := time.After(timeToWait)
   462  	for {
   463  		select {
   464  		case <-timeout:
   465  			t.Fatal("timedout waiting for replication, data not yet replicated")
   466  			return
   467  		default:
   468  			err = nil
   469  			for _, tab := range replicas {
   470  				errInReplication := checkInsertedValues(t, tab, valueToInsert)
   471  				if errInReplication != nil {
   472  					err = errInReplication
   473  				}
   474  			}
   475  			if err != nil {
   476  				log.Warningf("waiting for replication - error received - %v, will retry", err)
   477  				time.Sleep(300 * time.Millisecond)
   478  				break
   479  			}
   480  			return
   481  		}
   482  	}
   483  }
   484  
   485  func checkInsertedValues(t *testing.T, tablet *cluster.Vttablet, index int) error {
   486  	selectSQL := fmt.Sprintf("select msg from vt_ks.vt_insert_test where id=%d", index)
   487  	qr, err := RunSQL(t, selectSQL, tablet, "")
   488  	// The error may be not nil, if the replication has not caught upto the point where the table exists.
   489  	// We can safely skip this error and retry reading after wait
   490  	if err == nil && len(qr.Rows) == 1 {
   491  		return nil
   492  	}
   493  	return fmt.Errorf("data is not yet replicated")
   494  }
   495  
   496  // WaitForReplicationToStop waits for replication to stop on the given tablet
   497  func WaitForReplicationToStop(t *testing.T, vttablet *cluster.Vttablet) error {
   498  	timeout := time.After(15 * time.Second)
   499  	for {
   500  		select {
   501  		case <-timeout:
   502  			return fmt.Errorf("timedout: waiting for primary to stop replication")
   503  		default:
   504  			res, err := RunSQL(t, "SHOW SLAVE STATUS", vttablet, "")
   505  			if err != nil {
   506  				return err
   507  			}
   508  			if len(res.Rows) == 0 {
   509  				return nil
   510  			}
   511  			time.Sleep(1 * time.Second)
   512  		}
   513  	}
   514  }
   515  
   516  func validateTopology(t *testing.T, clusterInfo *VTOrcClusterInfo, pingTablets bool, timeToWait time.Duration) {
   517  	ch := make(chan error)
   518  	timeout := time.After(timeToWait)
   519  	go func() {
   520  		for {
   521  			select {
   522  			case <-timeout:
   523  				ch <- fmt.Errorf("time out waiting for validation to pass")
   524  				return
   525  			default:
   526  				var err error
   527  				var output string
   528  				if pingTablets {
   529  					output, err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate", "--", "--ping-tablets=true")
   530  				} else {
   531  					output, err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate")
   532  				}
   533  				if err != nil {
   534  					log.Warningf("Validate failed, retrying, output - %s", output)
   535  					time.Sleep(100 * time.Millisecond)
   536  					break
   537  				}
   538  				ch <- nil
   539  				return
   540  			}
   541  		}
   542  	}()
   543  
   544  	select {
   545  	case err := <-ch:
   546  		require.NoError(t, err)
   547  		return
   548  	case <-timeout:
   549  		t.Fatal("time out waiting for validation to pass")
   550  	}
   551  }
   552  
   553  // KillTablets is used to kill the tablets
   554  func KillTablets(vttablets []*cluster.Vttablet) {
   555  	for _, tablet := range vttablets {
   556  		log.Infof("Shutting down MySQL for %v", tablet.Alias)
   557  		_ = tablet.MysqlctlProcess.Stop()
   558  		log.Infof("Calling TearDown on tablet %v", tablet.Alias)
   559  		_ = tablet.VttabletProcess.TearDown()
   560  	}
   561  }
   562  
   563  func getMysqlConnParam(tablet *cluster.Vttablet, db string) mysql.ConnParams {
   564  	connParams := mysql.ConnParams{
   565  		Uname:      "vt_dba",
   566  		UnixSocket: path.Join(os.Getenv("VTDATAROOT"), fmt.Sprintf("/vt_%010d/mysql.sock", tablet.TabletUID)),
   567  	}
   568  	if db != "" {
   569  		connParams.DbName = db
   570  	}
   571  	return connParams
   572  }
   573  
   574  // RunSQL is used to run a SQL statement on the given tablet
   575  func RunSQL(t *testing.T, sql string, tablet *cluster.Vttablet, db string) (*sqltypes.Result, error) {
   576  	// Get Connection
   577  	tabletParams := getMysqlConnParam(tablet, db)
   578  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   579  	defer cancel()
   580  	conn, err := mysql.Connect(ctx, &tabletParams)
   581  	require.Nil(t, err)
   582  	defer conn.Close()
   583  
   584  	// RunSQL
   585  	return execute(t, conn, sql)
   586  }
   587  
   588  func execute(t *testing.T, conn *mysql.Conn, query string) (*sqltypes.Result, error) {
   589  	t.Helper()
   590  	return conn.ExecuteFetch(query, 1000, true)
   591  }
   592  
   593  // StartVttablet is used to start a vttablet from the given cell and type
   594  func StartVttablet(t *testing.T, clusterInfo *VTOrcClusterInfo, cell string, isRdonly bool) *cluster.Vttablet {
   595  
   596  	var tablet *cluster.Vttablet
   597  	for _, cellInfo := range clusterInfo.CellInfos {
   598  		if cellInfo.CellName == cell {
   599  			tabletsToUse := cellInfo.ReplicaTablets
   600  			if isRdonly {
   601  				tabletsToUse = cellInfo.RdonlyTablets
   602  			}
   603  			for _, vttablet := range tabletsToUse {
   604  				if isVttabletInUse(clusterInfo, vttablet) {
   605  					continue
   606  				}
   607  				tablet = vttablet
   608  				cleanAndStartVttablet(t, clusterInfo, vttablet)
   609  				break
   610  			}
   611  			break
   612  		}
   613  	}
   614  
   615  	require.NotNil(t, tablet, "Could not start requested tablet")
   616  	// wait for the tablets to come up properly
   617  	err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"})
   618  	require.NoError(t, err)
   619  	err = tablet.VttabletProcess.WaitForTabletTypes([]string{"replica", "rdonly"})
   620  	require.NoError(t, err)
   621  	return tablet
   622  }
   623  
   624  func isVttabletInUse(clusterInfo *VTOrcClusterInfo, tablet *cluster.Vttablet) bool {
   625  	for _, vttablet := range clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets {
   626  		if tablet == vttablet {
   627  			return true
   628  		}
   629  	}
   630  	return false
   631  }
   632  
   633  // PermanentlyRemoveVttablet removes the tablet specified from the cluster. It makes it so that
   634  // this vttablet or mysql instance are not reused for any other test.
   635  func PermanentlyRemoveVttablet(clusterInfo *VTOrcClusterInfo, tablet *cluster.Vttablet) {
   636  	// remove the tablet from our global list
   637  	for _, cellInfo := range clusterInfo.CellInfos {
   638  		for i, vttablet := range cellInfo.ReplicaTablets {
   639  			if vttablet == tablet {
   640  				// remove this tablet since its mysql has stopped
   641  				cellInfo.ReplicaTablets = append(cellInfo.ReplicaTablets[:i], cellInfo.ReplicaTablets[i+1:]...)
   642  				KillTablets([]*cluster.Vttablet{tablet})
   643  				return
   644  			}
   645  		}
   646  		for i, vttablet := range cellInfo.RdonlyTablets {
   647  			if vttablet == tablet {
   648  				// remove this tablet since its mysql has stopped
   649  				cellInfo.RdonlyTablets = append(cellInfo.RdonlyTablets[:i], cellInfo.RdonlyTablets[i+1:]...)
   650  				KillTablets([]*cluster.Vttablet{tablet})
   651  				return
   652  			}
   653  		}
   654  	}
   655  }
   656  
   657  // ChangePrivileges is used to change the privileges of the given user. These commands are executed such that they are not replicated
   658  func ChangePrivileges(t *testing.T, sql string, tablet *cluster.Vttablet, user string) {
   659  	_, err := RunSQL(t, "SET sql_log_bin = OFF;"+sql+";SET sql_log_bin = ON;", tablet, "")
   660  	require.NoError(t, err)
   661  
   662  	res, err := RunSQL(t, fmt.Sprintf("SELECT id FROM INFORMATION_SCHEMA.PROCESSLIST WHERE user = '%s'", user), tablet, "")
   663  	require.NoError(t, err)
   664  	for _, row := range res.Rows {
   665  		id, err := row[0].ToInt64()
   666  		require.NoError(t, err)
   667  		_, err = RunSQL(t, fmt.Sprintf("kill %d", id), tablet, "")
   668  		require.NoError(t, err)
   669  	}
   670  }
   671  
   672  // ResetPrimaryLogs is used reset the binary logs
   673  func ResetPrimaryLogs(t *testing.T, curPrimary *cluster.Vttablet) {
   674  	_, err := RunSQL(t, "FLUSH BINARY LOGS", curPrimary, "")
   675  	require.NoError(t, err)
   676  
   677  	binLogsOutput, err := RunSQL(t, "SHOW BINARY LOGS", curPrimary, "")
   678  	require.NoError(t, err)
   679  	require.True(t, len(binLogsOutput.Rows) >= 2, "there should be atlease 2 binlog files")
   680  
   681  	lastLogFile := binLogsOutput.Rows[len(binLogsOutput.Rows)-1][0].ToString()
   682  
   683  	_, err = RunSQL(t, "PURGE BINARY LOGS TO '"+lastLogFile+"'", curPrimary, "")
   684  	require.NoError(t, err)
   685  }
   686  
   687  // CheckSourcePort is used to check that the replica has the given source port set in its MySQL instance
   688  func CheckSourcePort(t *testing.T, replica *cluster.Vttablet, source *cluster.Vttablet, timeToWait time.Duration) {
   689  	timeout := time.After(timeToWait)
   690  	for {
   691  		select {
   692  		case <-timeout:
   693  			t.Fatal("timedout waiting for correct primary to be setup")
   694  			return
   695  		default:
   696  			res, err := RunSQL(t, "SHOW SLAVE STATUS", replica, "")
   697  			require.NoError(t, err)
   698  
   699  			if len(res.Rows) != 1 {
   700  				log.Warningf("no replication status yet, will retry")
   701  				break
   702  			}
   703  
   704  			for idx, field := range res.Fields {
   705  				if strings.EqualFold(field.Name, "MASTER_PORT") || strings.EqualFold(field.Name, "SOURCE_PORT") {
   706  					port, err := res.Rows[0][idx].ToInt64()
   707  					require.NoError(t, err)
   708  					if port == int64(source.MySQLPort) {
   709  						return
   710  					}
   711  				}
   712  			}
   713  			log.Warningf("source port not set correctly yet, will retry")
   714  		}
   715  		time.Sleep(300 * time.Millisecond)
   716  	}
   717  }
   718  
   719  // MakeAPICall is used make an API call given the url. It returns the status and the body of the response received
   720  func MakeAPICall(t *testing.T, vtorc *cluster.VTOrcProcess, url string) (status int, response string) {
   721  	t.Helper()
   722  	var err error
   723  	status, response, err = vtorc.MakeAPICall(url)
   724  	require.NoError(t, err)
   725  	return status, response
   726  }
   727  
   728  // MakeAPICallRetry is used to make an API call and retry on the given condition.
   729  // The function provided takes in the status and response and returns if we should continue to retry or not
   730  func MakeAPICallRetry(t *testing.T, vtorc *cluster.VTOrcProcess, url string, retry func(int, string) bool) (status int, response string) {
   731  	t.Helper()
   732  	timeout := time.After(10 * time.Second)
   733  	for {
   734  		select {
   735  		case <-timeout:
   736  			t.Fatal("timed out waiting for api to work")
   737  			return
   738  		default:
   739  			status, response = MakeAPICall(t, vtorc, url)
   740  			if retry(status, response) {
   741  				time.Sleep(1 * time.Second)
   742  				break
   743  			}
   744  			return status, response
   745  		}
   746  	}
   747  }
   748  
   749  // SetupNewClusterSemiSync is used to setup a new cluster with semi-sync set.
   750  // It creates a cluster with 4 tablets, one of which is a Replica
   751  func SetupNewClusterSemiSync(t *testing.T) *VTOrcClusterInfo {
   752  	var tablets []*cluster.Vttablet
   753  	clusterInstance := cluster.NewCluster(Cell1, Hostname)
   754  	keyspace := &cluster.Keyspace{Name: keyspaceName}
   755  	// Start topo server
   756  	err := clusterInstance.StartTopo()
   757  	require.NoError(t, err, "Error starting topo: %v", err)
   758  
   759  	err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+Cell1)
   760  	require.NoError(t, err, "Error managing topo: %v", err)
   761  
   762  	for i := 0; i < 3; i++ {
   763  		tablet := clusterInstance.NewVttabletInstance("replica", 100+i, Cell1)
   764  		tablets = append(tablets, tablet)
   765  	}
   766  	tablet := clusterInstance.NewVttabletInstance("rdonly", 103, Cell1)
   767  	tablets = append(tablets, tablet)
   768  
   769  	shard := &cluster.Shard{Name: shardName}
   770  	shard.Vttablets = tablets
   771  
   772  	clusterInstance.VtTabletExtraArgs = []string{
   773  		"--lock_tables_timeout", "5s",
   774  		"--disable_active_reparents",
   775  	}
   776  
   777  	// Initialize Cluster
   778  	err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard})
   779  	require.NoError(t, err, "Cannot launch cluster: %v", err)
   780  
   781  	//Start MySql
   782  	var mysqlCtlProcessList []*exec.Cmd
   783  	for _, shard := range clusterInstance.Keyspaces[0].Shards {
   784  		for _, tablet := range shard.Vttablets {
   785  			log.Infof("Starting MySql for tablet %v", tablet.Alias)
   786  			proc, err := tablet.MysqlctlProcess.StartProcess()
   787  			if err != nil {
   788  				require.NoError(t, err, "Error starting start mysql: %v", err)
   789  			}
   790  			mysqlCtlProcessList = append(mysqlCtlProcessList, proc)
   791  		}
   792  	}
   793  
   794  	// Wait for mysql processes to start
   795  	for _, proc := range mysqlCtlProcessList {
   796  		if err := proc.Wait(); err != nil {
   797  			require.NoError(t, err, "Error starting mysql: %v", err)
   798  		}
   799  	}
   800  
   801  	for _, tablet := range tablets {
   802  		require.NoError(t, err)
   803  		// Start the tablet
   804  		err = tablet.VttabletProcess.Setup()
   805  		require.NoError(t, err)
   806  	}
   807  
   808  	for _, tablet := range tablets {
   809  		err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"})
   810  		require.NoError(t, err)
   811  	}
   812  
   813  	vtctldClientProcess := cluster.VtctldClientProcessInstance("localhost", clusterInstance.VtctldProcess.GrpcPort, clusterInstance.TmpDirectory)
   814  
   815  	out, err := vtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspaceName, "--durability-policy=semi_sync")
   816  	require.NoError(t, err, out)
   817  
   818  	// create topo server connection
   819  	ts, err := topo.OpenServer(*clusterInstance.TopoFlavorString(), clusterInstance.VtctlProcess.TopoGlobalAddress, clusterInstance.VtctlProcess.TopoGlobalRoot)
   820  	require.NoError(t, err)
   821  	clusterInfo := &VTOrcClusterInfo{
   822  		ClusterInstance:     clusterInstance,
   823  		Ts:                  ts,
   824  		CellInfos:           nil,
   825  		lastUsedValue:       100,
   826  		VtctldClientProcess: vtctldClientProcess,
   827  	}
   828  	return clusterInfo
   829  }
   830  
   831  // AddSemiSyncKeyspace is used to setup a new keyspace with semi-sync.
   832  // It creates a keyspace with 3 tablets
   833  func AddSemiSyncKeyspace(t *testing.T, clusterInfo *VTOrcClusterInfo) {
   834  	var tablets []*cluster.Vttablet
   835  	keyspaceSemiSyncName := "ks2"
   836  	keyspace := &cluster.Keyspace{Name: keyspaceSemiSyncName}
   837  
   838  	for i := 0; i < 3; i++ {
   839  		tablet := clusterInfo.ClusterInstance.NewVttabletInstance("replica", 300+i, Cell1)
   840  		tablets = append(tablets, tablet)
   841  	}
   842  
   843  	shard := &cluster.Shard{Name: shardName}
   844  	shard.Vttablets = tablets
   845  
   846  	oldVttabletArgs := clusterInfo.ClusterInstance.VtTabletExtraArgs
   847  	defer func() {
   848  		clusterInfo.ClusterInstance.VtTabletExtraArgs = oldVttabletArgs
   849  	}()
   850  	clusterInfo.ClusterInstance.VtTabletExtraArgs = []string{
   851  		"--lock_tables_timeout", "5s",
   852  		"--disable_active_reparents",
   853  	}
   854  
   855  	// Initialize Cluster
   856  	err := clusterInfo.ClusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard})
   857  	require.NoError(t, err, "Cannot launch cluster: %v", err)
   858  
   859  	//Start MySql
   860  	var mysqlCtlProcessList []*exec.Cmd
   861  	for _, shard := range clusterInfo.ClusterInstance.Keyspaces[1].Shards {
   862  		for _, tablet := range shard.Vttablets {
   863  			log.Infof("Starting MySql for tablet %v", tablet.Alias)
   864  			proc, err := tablet.MysqlctlProcess.StartProcess()
   865  			if err != nil {
   866  				require.NoError(t, err, "Error starting start mysql: %v", err)
   867  			}
   868  			mysqlCtlProcessList = append(mysqlCtlProcessList, proc)
   869  		}
   870  	}
   871  
   872  	// Wait for mysql processes to start
   873  	for _, proc := range mysqlCtlProcessList {
   874  		if err := proc.Wait(); err != nil {
   875  			require.NoError(t, err, "Error starting mysql: %v", err)
   876  		}
   877  	}
   878  
   879  	for _, tablet := range tablets {
   880  		require.NoError(t, err)
   881  		// Start the tablet
   882  		err = tablet.VttabletProcess.Setup()
   883  		require.NoError(t, err)
   884  	}
   885  
   886  	for _, tablet := range tablets {
   887  		err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"})
   888  		require.NoError(t, err)
   889  	}
   890  
   891  	vtctldClientProcess := cluster.VtctldClientProcessInstance("localhost", clusterInfo.ClusterInstance.VtctldProcess.GrpcPort, clusterInfo.ClusterInstance.TmpDirectory)
   892  	out, err := vtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspaceSemiSyncName, "--durability-policy=semi_sync")
   893  	require.NoError(t, err, out)
   894  }
   895  
   896  // IsSemiSyncSetupCorrectly checks that the semi-sync is setup correctly on the given vttablet
   897  func IsSemiSyncSetupCorrectly(t *testing.T, tablet *cluster.Vttablet, semiSyncVal string) bool {
   898  	dbVar, err := tablet.VttabletProcess.GetDBVar("rpl_semi_sync_slave_enabled", "")
   899  	require.NoError(t, err)
   900  	return semiSyncVal == dbVar
   901  }
   902  
   903  // IsPrimarySemiSyncSetupCorrectly checks that the priamry side semi-sync is setup correctly on the given vttablet
   904  func IsPrimarySemiSyncSetupCorrectly(t *testing.T, tablet *cluster.Vttablet, semiSyncVal string) bool {
   905  	dbVar, err := tablet.VttabletProcess.GetDBVar("rpl_semi_sync_master_enabled", "")
   906  	require.NoError(t, err)
   907  	return semiSyncVal == dbVar
   908  }
   909  
   910  // WaitForReadOnlyValue waits for the read_only global variable to reach the provided value
   911  func WaitForReadOnlyValue(t *testing.T, curPrimary *cluster.Vttablet, expectValue int64) (match bool) {
   912  	timeout := 15 * time.Second
   913  	startTime := time.Now()
   914  	for time.Since(startTime) < timeout {
   915  		qr, err := RunSQL(t, "select @@global.read_only as read_only", curPrimary, "")
   916  		require.NoError(t, err)
   917  		require.NotNil(t, qr)
   918  		row := qr.Named().Row()
   919  		require.NotNil(t, row)
   920  		readOnly, err := row.ToInt64("read_only")
   921  		require.NoError(t, err)
   922  		if readOnly == expectValue {
   923  			return true
   924  		}
   925  		time.Sleep(time.Second)
   926  	}
   927  	return false
   928  }
   929  
   930  // WaitForSuccessfulRecoveryCount waits until the given recovery name's count of successful runs matches the count expected
   931  func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, recoveryName string, countExpected int) {
   932  	t.Helper()
   933  	timeout := 15 * time.Second
   934  	startTime := time.Now()
   935  	for time.Since(startTime) < timeout {
   936  		vars := vtorcInstance.GetVars()
   937  		successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
   938  		successCount := successfulRecoveriesMap[recoveryName]
   939  		if successCount == countExpected {
   940  			return
   941  		}
   942  		time.Sleep(time.Second)
   943  	}
   944  	vars := vtorcInstance.GetVars()
   945  	successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
   946  	successCount := successfulRecoveriesMap[recoveryName]
   947  	assert.EqualValues(t, countExpected, successCount)
   948  }