vitess.io/vitess@v0.16.2/go/test/endtoend/reparent/utils/utils.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package utils 18 19 import ( 20 "context" 21 "encoding/json" 22 "fmt" 23 "os" 24 "os/exec" 25 "path" 26 "reflect" 27 "strings" 28 "testing" 29 "time" 30 31 "github.com/stretchr/testify/assert" 32 "github.com/stretchr/testify/require" 33 34 querypb "vitess.io/vitess/go/vt/proto/query" 35 "vitess.io/vitess/go/vt/vttablet/tabletconn" 36 37 "vitess.io/vitess/go/json2" 38 "vitess.io/vitess/go/mysql" 39 "vitess.io/vitess/go/sqltypes" 40 "vitess.io/vitess/go/test/endtoend/cluster" 41 "vitess.io/vitess/go/vt/log" 42 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 43 ) 44 45 var ( 46 KeyspaceName = "ks" 47 dbName = "vt_" + KeyspaceName 48 username = "vt_dba" 49 Hostname = "localhost" 50 insertVal = 1 51 insertSQL = "insert into vt_insert_test(id, msg) values (%d, 'test %d')" 52 sqlSchema = ` 53 create table vt_insert_test ( 54 id bigint, 55 msg varchar(64), 56 primary key (id) 57 ) Engine=InnoDB 58 ` 59 cell1 = "zone1" 60 cell2 = "zone2" 61 ShardName = "0" 62 KeyspaceShard = KeyspaceName + "/" + ShardName 63 replicationWaitTimeout = time.Duration(15 * time.Second) 64 ) 65 66 //region cluster setup/teardown 67 68 // SetupReparentCluster is used to setup the reparent cluster 69 func SetupReparentCluster(t *testing.T, durability string) *cluster.LocalProcessCluster { 70 return setupCluster(context.Background(), t, ShardName, []string{cell1, cell2}, []int{3, 1}, durability) 71 } 72 73 // SetupRangeBasedCluster sets up the range based cluster 74 func SetupRangeBasedCluster(ctx context.Context, t *testing.T) *cluster.LocalProcessCluster { 75 return setupCluster(ctx, t, ShardName, []string{cell1}, []int{2}, "semi_sync") 76 } 77 78 // TeardownCluster is used to teardown the reparent cluster 79 func TeardownCluster(clusterInstance *cluster.LocalProcessCluster) { 80 clusterInstance.Teardown() 81 } 82 83 func setupCluster(ctx context.Context, t *testing.T, shardName string, cells []string, numTablets []int, durability string) *cluster.LocalProcessCluster { 84 var tablets []*cluster.Vttablet 85 clusterInstance := cluster.NewCluster(cells[0], Hostname) 86 keyspace := &cluster.Keyspace{Name: KeyspaceName} 87 88 // Start topo server 89 err := clusterInstance.StartTopo() 90 require.NoError(t, err, "Error starting topo") 91 err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+cells[0]) 92 require.NoError(t, err, "Error managing topo") 93 numCell := 1 94 for numCell < len(cells) { 95 err = clusterInstance.VtctlProcess.AddCellInfo(cells[numCell]) 96 require.NoError(t, err, "Error managing topo") 97 numCell++ 98 } 99 100 // Adding another cell in the same cluster 101 numCell = 0 102 for numCell < len(cells) { 103 i := 0 104 for i < numTablets[numCell] { 105 i++ 106 tablet := clusterInstance.NewVttabletInstance("replica", 100*(numCell+1)+i, cells[numCell]) 107 tablets = append(tablets, tablet) 108 } 109 numCell++ 110 } 111 112 shard := &cluster.Shard{Name: shardName} 113 shard.Vttablets = tablets 114 115 clusterInstance.VtTabletExtraArgs = append(clusterInstance.VtTabletExtraArgs, 116 "--lock_tables_timeout", "5s", 117 "--track_schema_versions=true", 118 // disabling online-ddl for reparent tests. This is done to reduce flakiness. 119 // All the tests in this package reparent frequently between different tablets 120 // This means that Promoting a tablet to primary is sometimes immediately followed by a DemotePrimary call. 121 // In this case, the close method and initSchema method of the onlineDDL executor race. 122 // If the initSchema acquires the lock, then it takes about 30 seconds for it to run during which time the 123 // DemotePrimary rpc is stalled! 124 "--queryserver_enable_online_ddl=false", 125 // disabling active reparents on the tablet since we don't want the replication manager 126 // to fix replication if it is stopped. Some tests deliberately do that. Also, we don't want 127 // the replication manager to silently fix the replication in case ERS or PRS mess up. All the 128 // tests in this test suite should work irrespective of this flag. Each run of ERS, PRS should be 129 // setting up the replication correctly. 130 "--disable-replication-manager") 131 132 // Initialize Cluster 133 err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard}) 134 require.NoError(t, err, "Cannot launch cluster") 135 136 //Start MySql 137 var mysqlCtlProcessList []*exec.Cmd 138 for _, shard := range clusterInstance.Keyspaces[0].Shards { 139 for _, tablet := range shard.Vttablets { 140 log.Infof("Starting MySql for tablet %v", tablet.Alias) 141 proc, err := tablet.MysqlctlProcess.StartProcess() 142 require.NoError(t, err, "Error starting start mysql") 143 mysqlCtlProcessList = append(mysqlCtlProcessList, proc) 144 } 145 } 146 147 // Wait for mysql processes to start 148 for _, proc := range mysqlCtlProcessList { 149 if err := proc.Wait(); err != nil { 150 clusterInstance.PrintMysqlctlLogFiles() 151 require.FailNow(t, "Error starting mysql: %s", err.Error()) 152 } 153 } 154 if clusterInstance.VtctlMajorVersion >= 14 { 155 clusterInstance.VtctldClientProcess = *cluster.VtctldClientProcessInstance("localhost", clusterInstance.VtctldProcess.GrpcPort, clusterInstance.TmpDirectory) 156 out, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", KeyspaceName, fmt.Sprintf("--durability-policy=%s", durability)) 157 require.NoError(t, err, out) 158 } 159 160 setupShard(ctx, t, clusterInstance, shardName, tablets) 161 return clusterInstance 162 } 163 164 func setupShard(ctx context.Context, t *testing.T, clusterInstance *cluster.LocalProcessCluster, shardName string, tablets []*cluster.Vttablet) { 165 for _, tablet := range tablets { 166 tablet.VttabletProcess.SupportsBackup = false 167 // Start the tablet 168 err := tablet.VttabletProcess.Setup() 169 require.NoError(t, err) 170 } 171 172 for _, tablet := range tablets { 173 err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"}) 174 require.NoError(t, err) 175 } 176 177 // Initialize shard 178 err := clusterInstance.VtctlclientProcess.InitializeShard(KeyspaceName, shardName, tablets[0].Cell, tablets[0].TabletUID) 179 require.NoError(t, err) 180 181 ValidateTopology(t, clusterInstance, true) 182 183 // create Tables 184 RunSQL(ctx, t, sqlSchema, tablets[0]) 185 186 CheckPrimaryTablet(t, clusterInstance, tablets[0]) 187 188 ValidateTopology(t, clusterInstance, false) 189 WaitForReplicationToStart(t, clusterInstance, KeyspaceName, shardName, len(tablets), true) 190 } 191 192 // StartNewVTTablet starts a new vttablet instance 193 func StartNewVTTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, uuid int, supportsBackup bool) *cluster.Vttablet { 194 tablet := clusterInstance.NewVttabletInstance("replica", uuid, cell1) 195 keyspace := clusterInstance.Keyspaces[0] 196 shard := keyspace.Shards[0] 197 198 // Setup MysqlctlProcess 199 tablet.MysqlctlProcess = *cluster.MysqlCtlProcessInstance(tablet.TabletUID, tablet.MySQLPort, clusterInstance.TmpDirectory) 200 // Setup VttabletProcess 201 tablet.VttabletProcess = cluster.VttabletProcessInstance( 202 tablet.HTTPPort, 203 tablet.GrpcPort, 204 tablet.TabletUID, 205 tablet.Cell, 206 shard.Name, 207 keyspace.Name, 208 clusterInstance.VtctldProcess.Port, 209 tablet.Type, 210 clusterInstance.TopoProcess.Port, 211 clusterInstance.Hostname, 212 clusterInstance.TmpDirectory, 213 []string{ 214 "--lock_tables_timeout", "5s", 215 "--track_schema_versions=true", 216 "--queryserver_enable_online_ddl=false", 217 }, 218 clusterInstance.DefaultCharset) 219 tablet.VttabletProcess.SupportsBackup = supportsBackup 220 221 log.Infof("Starting MySql for tablet %v", tablet.Alias) 222 proc, err := tablet.MysqlctlProcess.StartProcess() 223 require.NoError(t, err, "Error starting start mysql") 224 if err := proc.Wait(); err != nil { 225 clusterInstance.PrintMysqlctlLogFiles() 226 require.FailNow(t, "Error starting mysql: %s", err.Error()) 227 } 228 229 // The tablet should come up as serving since the primary for the shard already exists 230 tablet.VttabletProcess.ServingStatus = "SERVING" 231 tablet.VttabletProcess.SupportsBackup = false 232 err = tablet.VttabletProcess.Setup() 233 require.NoError(t, err) 234 return tablet 235 } 236 237 //endregion 238 239 // region database queries 240 func getMysqlConnParam(tablet *cluster.Vttablet) mysql.ConnParams { 241 connParams := mysql.ConnParams{ 242 Uname: username, 243 DbName: dbName, 244 UnixSocket: path.Join(os.Getenv("VTDATAROOT"), fmt.Sprintf("/vt_%010d/mysql.sock", tablet.TabletUID)), 245 } 246 return connParams 247 } 248 249 // RunSQL is used to run a SQL command directly on the MySQL instance of a vttablet 250 func RunSQL(ctx context.Context, t *testing.T, sql string, tablet *cluster.Vttablet) *sqltypes.Result { 251 tabletParams := getMysqlConnParam(tablet) 252 conn, err := mysql.Connect(ctx, &tabletParams) 253 require.Nil(t, err) 254 defer conn.Close() 255 return execute(t, conn, sql) 256 } 257 258 func execute(t *testing.T, conn *mysql.Conn, query string) *sqltypes.Result { 259 t.Helper() 260 qr, err := conn.ExecuteFetch(query, 1000, true) 261 require.Nil(t, err) 262 return qr 263 } 264 265 //endregion 266 267 // region ers, prs 268 269 // Prs runs PRS 270 func Prs(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) (string, error) { 271 return PrsWithTimeout(t, clusterInstance, tab, false, "", "") 272 } 273 274 // PrsAvoid runs PRS 275 func PrsAvoid(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) (string, error) { 276 return PrsWithTimeout(t, clusterInstance, tab, true, "", "") 277 } 278 279 // PrsWithTimeout runs PRS 280 func PrsWithTimeout(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, avoid bool, actionTimeout, waitTimeout string) (string, error) { 281 args := []string{ 282 "PlannedReparentShard", "--", 283 "--keyspace_shard", fmt.Sprintf("%s/%s", KeyspaceName, ShardName)} 284 if actionTimeout != "" { 285 args = append(args, "--action_timeout", actionTimeout) 286 } 287 if waitTimeout != "" { 288 args = append(args, "--wait_replicas_timeout", waitTimeout) 289 } 290 if avoid { 291 args = append(args, "--avoid_tablet") 292 } else { 293 args = append(args, "--new_primary") 294 } 295 args = append(args, tab.Alias) 296 out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...) 297 return out, err 298 } 299 300 // Ers runs the ERS 301 func Ers(clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, totalTimeout, waitReplicasTimeout string) (string, error) { 302 return ErsIgnoreTablet(clusterInstance, tab, totalTimeout, waitReplicasTimeout, nil, false) 303 } 304 305 // ErsIgnoreTablet is used to run ERS 306 func ErsIgnoreTablet(clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, timeout, waitReplicasTimeout string, tabletsToIgnore []*cluster.Vttablet, preventCrossCellPromotion bool) (string, error) { 307 var args []string 308 if timeout != "" { 309 args = append(args, "--action_timeout", timeout) 310 } 311 args = append(args, "EmergencyReparentShard", "--", "--keyspace_shard", fmt.Sprintf("%s/%s", KeyspaceName, ShardName)) 312 if tab != nil { 313 args = append(args, "--new_primary", tab.Alias) 314 } 315 if waitReplicasTimeout != "" { 316 args = append(args, "--wait_replicas_timeout", waitReplicasTimeout) 317 } 318 if preventCrossCellPromotion { 319 args = append(args, "--prevent_cross_cell_promotion=true") 320 } 321 if len(tabletsToIgnore) != 0 { 322 tabsString := "" 323 for _, vttablet := range tabletsToIgnore { 324 if tabsString == "" { 325 tabsString = vttablet.Alias 326 } else { 327 tabsString = tabsString + "," + vttablet.Alias 328 } 329 } 330 args = append(args, "--ignore_replicas", tabsString) 331 } 332 return clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...) 333 } 334 335 // ErsWithVtctl runs ERS via vtctl binary 336 func ErsWithVtctl(clusterInstance *cluster.LocalProcessCluster) (string, error) { 337 args := []string{"EmergencyReparentShard", "--", "--keyspace_shard", fmt.Sprintf("%s/%s", KeyspaceName, ShardName)} 338 return clusterInstance.VtctlProcess.ExecuteCommandWithOutput(args...) 339 } 340 341 // endregion 342 343 // region validations 344 345 // ValidateTopology is used to validate the topology 346 func ValidateTopology(t *testing.T, clusterInstance *cluster.LocalProcessCluster, pingTablets bool) { 347 args := []string{"Validate"} 348 349 if pingTablets { 350 args = append(args, "--", "--ping-tablets=true") 351 } 352 out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...) 353 require.Empty(t, out) 354 require.NoError(t, err) 355 } 356 357 // ConfirmReplication confirms that the replication is working properly 358 func ConfirmReplication(t *testing.T, primary *cluster.Vttablet, replicas []*cluster.Vttablet) int { 359 ctx := context.Background() 360 insertVal++ 361 n := insertVal // unique value ... 362 // insert data into the new primary, check the connected replica work 363 insertSQL := fmt.Sprintf(insertSQL, n, n) 364 RunSQL(ctx, t, insertSQL, primary) 365 for _, tab := range replicas { 366 err := CheckInsertedValues(ctx, t, tab, n) 367 require.NoError(t, err) 368 } 369 return n 370 } 371 372 // ConfirmOldPrimaryIsHangingAround confirms that the old primary is hanging around 373 func ConfirmOldPrimaryIsHangingAround(t *testing.T, clusterInstance *cluster.LocalProcessCluster) { 374 out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate") 375 require.Error(t, err) 376 require.Contains(t, out, "already has primary") 377 } 378 379 // CheckPrimaryTablet makes sure the tablet type is primary, and its health check agrees. 380 func CheckPrimaryTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet) { 381 result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias) 382 require.NoError(t, err) 383 var tabletInfo topodatapb.Tablet 384 err = json2.Unmarshal([]byte(result), &tabletInfo) 385 require.NoError(t, err) 386 assert.Equal(t, topodatapb.TabletType_PRIMARY, tabletInfo.GetType()) 387 388 // make sure the health stream is updated 389 shrs, err := clusterInstance.StreamTabletHealth(context.Background(), tablet, 1) 390 require.NoError(t, err) 391 streamHealthResponse := shrs[0] 392 393 assert.True(t, streamHealthResponse.GetServing()) 394 tabletType := streamHealthResponse.GetTarget().GetTabletType() 395 assert.Equal(t, topodatapb.TabletType_PRIMARY, tabletType) 396 } 397 398 // isHealthyPrimaryTablet will return if tablet is primary AND healthy. 399 func isHealthyPrimaryTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet) bool { 400 result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias) 401 require.Nil(t, err) 402 var tabletInfo topodatapb.Tablet 403 err = json2.Unmarshal([]byte(result), &tabletInfo) 404 require.Nil(t, err) 405 if tabletInfo.GetType() != topodatapb.TabletType_PRIMARY { 406 return false 407 } 408 409 // make sure the health stream is updated 410 shrs, err := clusterInstance.StreamTabletHealth(context.Background(), tablet, 1) 411 require.NoError(t, err) 412 streamHealthResponse := shrs[0] 413 414 assert.True(t, streamHealthResponse.GetServing()) 415 tabletType := streamHealthResponse.GetTarget().GetTabletType() 416 return tabletType == topodatapb.TabletType_PRIMARY 417 } 418 419 // CheckInsertedValues checks that the given value is present in the given tablet 420 func CheckInsertedValues(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, index int) error { 421 query := fmt.Sprintf("select msg from vt_insert_test where id=%d", index) 422 tabletParams := getMysqlConnParam(tablet) 423 var conn *mysql.Conn 424 425 // wait until it gets the data 426 timeout := time.Now().Add(replicationWaitTimeout) 427 i := 0 428 for time.Now().Before(timeout) { 429 // We start with no connection to MySQL 430 if conn == nil { 431 // Try connecting to MySQL 432 mysqlConn, err := mysql.Connect(ctx, &tabletParams) 433 // This can fail if the database create hasn't been replicated yet. 434 // We ignore this failure and try again later 435 if err == nil { 436 // If we succeed, then we store the connection 437 // and reuse it for checking the rows in the table. 438 conn = mysqlConn 439 defer conn.Close() 440 } 441 } 442 if conn != nil { 443 // We'll get a mysql.ERNoSuchTable (1146) error if the CREATE TABLE has not replicated yet and 444 // it's possible that we get other ephemeral errors too, so we make the tests more robust by 445 // retrying with the timeout. 446 qr, err := conn.ExecuteFetch(query, 1, true) 447 if err == nil && len(qr.Rows) == 1 { 448 return nil 449 } 450 } 451 t := time.Duration(300 * i) 452 time.Sleep(t * time.Millisecond) 453 i++ 454 } 455 return fmt.Errorf("data did not get replicated on tablet %s within the timeout of %v", tablet.Alias, replicationWaitTimeout) 456 } 457 458 func CheckSemiSyncSetupCorrectly(t *testing.T, tablet *cluster.Vttablet, semiSyncVal string) { 459 dbVar, err := tablet.VttabletProcess.GetDBVar("rpl_semi_sync_slave_enabled", "") 460 require.NoError(t, err) 461 require.Equal(t, semiSyncVal, dbVar) 462 } 463 464 // CheckCountOfInsertedValues checks that the number of inserted values matches the given count on the given tablet 465 func CheckCountOfInsertedValues(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, count int) error { 466 selectSQL := "select * from vt_insert_test" 467 qr := RunSQL(ctx, t, selectSQL, tablet) 468 if len(qr.Rows) == count { 469 return nil 470 } 471 return fmt.Errorf("count does not match on the tablet %s", tablet.Alias) 472 } 473 474 // endregion 475 476 // region tablet operations 477 478 // StopTablet stops the tablet 479 func StopTablet(t *testing.T, tab *cluster.Vttablet, stopDatabase bool) { 480 err := tab.VttabletProcess.TearDownWithTimeout(30 * time.Second) 481 require.NoError(t, err) 482 if stopDatabase { 483 err = tab.MysqlctlProcess.Stop() 484 require.NoError(t, err) 485 } 486 } 487 488 // RestartTablet restarts the tablet 489 func RestartTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) { 490 tab.MysqlctlProcess.InitMysql = false 491 err := tab.MysqlctlProcess.Start() 492 require.NoError(t, err) 493 err = clusterInstance.VtctlclientProcess.InitTablet(tab, tab.Cell, KeyspaceName, Hostname, ShardName) 494 require.NoError(t, err) 495 } 496 497 // ResurrectTablet is used to resurrect the given tablet 498 func ResurrectTablet(ctx context.Context, t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) { 499 tab.MysqlctlProcess.InitMysql = false 500 err := tab.MysqlctlProcess.Start() 501 require.NoError(t, err) 502 err = clusterInstance.VtctlclientProcess.InitTablet(tab, tab.Cell, KeyspaceName, Hostname, ShardName) 503 require.NoError(t, err) 504 505 // As there is already a primary the new replica will come directly in SERVING state 506 tab.VttabletProcess.ServingStatus = "SERVING" 507 // Start the tablet 508 err = tab.VttabletProcess.Setup() 509 require.NoError(t, err) 510 511 err = CheckInsertedValues(ctx, t, tab, insertVal) 512 require.NoError(t, err) 513 } 514 515 // DeleteTablet is used to delete the given tablet 516 func DeleteTablet(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) { 517 err := clusterInstance.VtctlclientProcess.ExecuteCommand( 518 "DeleteTablet", "--", 519 "--allow_primary", 520 tab.Alias) 521 require.NoError(t, err) 522 } 523 524 // endregion 525 526 // region get info 527 528 // GetNewPrimary is used to find the new primary of the cluster. 529 func GetNewPrimary(t *testing.T, clusterInstance *cluster.LocalProcessCluster) *cluster.Vttablet { 530 var newPrimary *cluster.Vttablet 531 for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets[1:] { 532 if isHealthyPrimaryTablet(t, clusterInstance, tablet) { 533 newPrimary = tablet 534 break 535 } 536 } 537 require.NotNil(t, newPrimary) 538 return newPrimary 539 } 540 541 // GetShardReplicationPositions gets the shards replication positions. 542 // This should not generally be called directly, instead use the WaitForReplicationToCatchup method. 543 func GetShardReplicationPositions(t *testing.T, clusterInstance *cluster.LocalProcessCluster, keyspaceName, shardName string, doPrint bool) []string { 544 output, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( 545 "ShardReplicationPositions", fmt.Sprintf("%s/%s", keyspaceName, shardName)) 546 require.NoError(t, err) 547 strArray := strings.Split(output, "\n") 548 if strArray[len(strArray)-1] == "" { 549 strArray = strArray[:len(strArray)-1] // Truncate slice, remove empty line 550 } 551 if doPrint { 552 log.Infof("Positions:") 553 for _, pos := range strArray { 554 log.Infof("\t%s", pos) 555 } 556 } 557 return strArray 558 } 559 560 func WaitForReplicationToStart(t *testing.T, clusterInstance *cluster.LocalProcessCluster, keyspaceName, shardName string, tabletCnt int, doPrint bool) { 561 tkr := time.NewTicker(500 * time.Millisecond) 562 defer tkr.Stop() 563 for { 564 select { 565 case <-tkr.C: 566 strArray := GetShardReplicationPositions(t, clusterInstance, KeyspaceName, shardName, true) 567 if len(strArray) == tabletCnt && strings.Contains(strArray[0], "primary") { // primary first 568 return 569 } 570 case <-time.After(replicationWaitTimeout): 571 require.FailNow(t, fmt.Sprintf("replication did not start everywhere in %s/%s within the timeout of %v", 572 keyspaceName, shardName, replicationWaitTimeout)) 573 return 574 } 575 } 576 } 577 578 // endregion 579 580 // CheckReplicaStatus checks the replication status and asserts that the replication is stopped 581 func CheckReplicaStatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet) { 582 qr := RunSQL(ctx, t, "show slave status", tablet) 583 IOThreadRunning := fmt.Sprintf("%v", qr.Rows[0][10]) 584 SQLThreadRunning := fmt.Sprintf("%v", qr.Rows[0][10]) 585 assert.Equal(t, IOThreadRunning, "VARCHAR(\"No\")") 586 assert.Equal(t, SQLThreadRunning, "VARCHAR(\"No\")") 587 } 588 589 // CheckReparentFromOutside checks that cluster was reparented from outside 590 func CheckReparentFromOutside(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet, downPrimary bool, baseTime int64) { 591 result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetShardReplication", cell1, KeyspaceShard) 592 require.Nil(t, err, "error should be Nil") 593 if !downPrimary { 594 assertNodeCount(t, result, int(3)) 595 } else { 596 assertNodeCount(t, result, int(2)) 597 } 598 599 // make sure the primary status page says it's the primary 600 status := tablet.VttabletProcess.GetStatus() 601 assert.Contains(t, status, "Tablet Type: PRIMARY") 602 603 // make sure the primary health stream says it's the primary too 604 // (health check is disabled on these servers, force it first) 605 err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", tablet.Alias) 606 require.NoError(t, err) 607 608 shrs, err := clusterInstance.StreamTabletHealth(context.Background(), tablet, 1) 609 require.NoError(t, err) 610 streamHealthResponse := shrs[0] 611 612 assert.Equal(t, streamHealthResponse.Target.TabletType, topodatapb.TabletType_PRIMARY) 613 assert.True(t, streamHealthResponse.TabletExternallyReparentedTimestamp >= baseTime) 614 } 615 616 // WaitForReplicationPosition waits for tablet B to catch up to the replication position of tablet A. 617 func WaitForReplicationPosition(t *testing.T, tabletA *cluster.Vttablet, tabletB *cluster.Vttablet) error { 618 posA, _ := cluster.GetPrimaryPosition(t, *tabletA, Hostname) 619 timeout := time.Now().Add(replicationWaitTimeout) 620 for time.Now().Before(timeout) { 621 posB, _ := cluster.GetPrimaryPosition(t, *tabletB, Hostname) 622 if positionAtLeast(t, tabletB, posA, posB) { 623 return nil 624 } 625 time.Sleep(500 * time.Millisecond) 626 } 627 return fmt.Errorf("failed to catch up on replication position") 628 } 629 630 // positionAtLeast executes the command position at_least 631 func positionAtLeast(t *testing.T, tablet *cluster.Vttablet, a string, b string) bool { 632 isAtleast := false 633 val, err := tablet.MysqlctlProcess.ExecuteCommandWithOutput("position", "at_least", a, b) 634 require.NoError(t, err) 635 if strings.Contains(val, "true") { 636 isAtleast = true 637 } 638 return isAtleast 639 } 640 641 func assertNodeCount(t *testing.T, result string, want int) { 642 resultMap := make(map[string]any) 643 err := json.Unmarshal([]byte(result), &resultMap) 644 require.NoError(t, err) 645 646 nodes := reflect.ValueOf(resultMap["nodes"]) 647 got := nodes.Len() 648 assert.Equal(t, want, got) 649 } 650 651 // CheckDBvar checks the db var 652 func CheckDBvar(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, variable string, status string) { 653 tabletParams := getMysqlConnParam(tablet) 654 conn, err := mysql.Connect(ctx, &tabletParams) 655 require.NoError(t, err) 656 defer conn.Close() 657 658 qr := execute(t, conn, fmt.Sprintf("show variables like '%s'", variable)) 659 got := fmt.Sprintf("%v", qr.Rows) 660 want := fmt.Sprintf("[[VARCHAR(\"%s\") VARCHAR(\"%s\")]]", variable, status) 661 assert.Equal(t, want, got) 662 } 663 664 // CheckDBstatus checks the db status 665 func CheckDBstatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, variable string, status string) { 666 tabletParams := getMysqlConnParam(tablet) 667 conn, err := mysql.Connect(ctx, &tabletParams) 668 require.NoError(t, err) 669 defer conn.Close() 670 671 qr := execute(t, conn, fmt.Sprintf("show status like '%s'", variable)) 672 got := fmt.Sprintf("%v", qr.Rows) 673 want := fmt.Sprintf("[[VARCHAR(\"%s\") VARCHAR(\"%s\")]]", variable, status) 674 assert.Equal(t, want, got) 675 } 676 677 // SetReplicationSourceFailed returns true if the given output from PRS had failed because the given tablet was 678 // unable to setReplicationSource. Since some tests are used in upgrade-downgrade testing, we need this function to 679 // work with different versions of vtctl. 680 func SetReplicationSourceFailed(tablet *cluster.Vttablet, prsOut string) bool { 681 return strings.Contains(prsOut, fmt.Sprintf("tablet %s failed to SetReplicationSource", tablet.Alias)) 682 } 683 684 // CheckReplicationStatus checks that the replication for sql and io threads is setup as expected 685 func CheckReplicationStatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, sqlThreadRunning bool, ioThreadRunning bool) { 686 res := RunSQL(ctx, t, "show slave status;", tablet) 687 if ioThreadRunning { 688 require.Equal(t, "Yes", res.Rows[0][10].ToString()) 689 } else { 690 require.Equal(t, "No", res.Rows[0][10].ToString()) 691 } 692 693 if sqlThreadRunning { 694 require.Equal(t, "Yes", res.Rows[0][11].ToString()) 695 } else { 696 require.Equal(t, "No", res.Rows[0][11].ToString()) 697 } 698 } 699 700 func WaitForTabletToBeServing(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tablet *cluster.Vttablet, timeout time.Duration) { 701 vTablet, err := clusterInstance.VtctlclientGetTablet(tablet) 702 require.NoError(t, err) 703 704 tConn, err := tabletconn.GetDialer()(vTablet, false) 705 require.NoError(t, err) 706 707 newCtx, cancel := context.WithTimeout(context.Background(), timeout) 708 err = tConn.StreamHealth(newCtx, func(shr *querypb.StreamHealthResponse) error { 709 if shr.Serving { 710 cancel() 711 } 712 return nil 713 }) 714 715 // the error should only be because we cancelled the context when the tablet became serving again. 716 if err != nil && !strings.Contains(err.Error(), "context canceled") { 717 t.Fatal(err.Error()) 718 } 719 }