vitess.io/vitess@v0.16.2/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package primaryfailure 18 19 import ( 20 "testing" 21 "time" 22 23 "github.com/stretchr/testify/assert" 24 "github.com/stretchr/testify/require" 25 26 "vitess.io/vitess/go/test/endtoend/cluster" 27 "vitess.io/vitess/go/test/endtoend/vtorc/utils" 28 "vitess.io/vitess/go/vt/vtorc/logic" 29 ) 30 31 // bring down primary, let orc promote replica 32 // covers the test case master-failover from orchestrator 33 // Also tests that VTOrc can handle multiple failures, if the durability policies allow it 34 func TestDownPrimary(t *testing.T) { 35 defer cluster.PanicHandler(t) 36 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ 37 PreventCrossDataCenterPrimaryFailover: true, 38 }, 1, "semi_sync") 39 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 40 shard0 := &keyspace.Shards[0] 41 // find primary from topo 42 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 43 assert.NotNil(t, curPrimary, "should have elected a primary") 44 vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] 45 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) 46 47 // find the replica and rdonly tablets 48 var replica, rdonly *cluster.Vttablet 49 for _, tablet := range shard0.Vttablets { 50 // we know we have only two replcia tablets, so the one not the primary must be the other replica 51 if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { 52 replica = tablet 53 } 54 if tablet.Type == "rdonly" { 55 rdonly = tablet 56 } 57 } 58 assert.NotNil(t, replica, "could not find replica tablet") 59 assert.NotNil(t, rdonly, "could not find rdonly tablet") 60 61 // Start a cross-cell replica 62 crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) 63 64 // check that the replication is setup correctly before we failover 65 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) 66 67 // Make the rdonly vttablet unavailable 68 err := rdonly.VttabletProcess.TearDown() 69 require.NoError(t, err) 70 err = rdonly.MysqlctlProcess.Stop() 71 require.NoError(t, err) 72 // Make the current primary vttablet unavailable. 73 err = curPrimary.VttabletProcess.TearDown() 74 require.NoError(t, err) 75 err = curPrimary.MysqlctlProcess.Stop() 76 require.NoError(t, err) 77 defer func() { 78 // we remove the tablet from our global list 79 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 80 utils.PermanentlyRemoveVttablet(clusterInfo, rdonly) 81 }() 82 83 // check that the replica gets promoted 84 utils.CheckPrimaryTablet(t, clusterInfo, replica, true) 85 // also check that the replication is working correctly after failover 86 utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) 87 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) 88 } 89 90 // Failover should not be cross data centers, according to the configuration file 91 // covers part of the test case master-failover-lost-replicas from orchestrator 92 func TestCrossDataCenterFailure(t *testing.T) { 93 defer cluster.PanicHandler(t) 94 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ 95 PreventCrossDataCenterPrimaryFailover: true, 96 }, 1, "") 97 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 98 shard0 := &keyspace.Shards[0] 99 // find primary from topo 100 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 101 assert.NotNil(t, curPrimary, "should have elected a primary") 102 103 // find the replica and rdonly tablets 104 var replicaInSameCell, rdonly *cluster.Vttablet 105 for _, tablet := range shard0.Vttablets { 106 // we know we have only two replcia tablets, so the one not the primary must be the other replica 107 if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { 108 replicaInSameCell = tablet 109 } 110 if tablet.Type == "rdonly" { 111 rdonly = tablet 112 } 113 } 114 assert.NotNil(t, replicaInSameCell, "could not find replica tablet") 115 assert.NotNil(t, rdonly, "could not find rdonly tablet") 116 117 crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) 118 // newly started tablet does not replicate from anyone yet, we will allow vtorc to fix this too 119 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica, replicaInSameCell, rdonly}, 25*time.Second) 120 121 // Make the current primary database unavailable. 122 err := curPrimary.MysqlctlProcess.Stop() 123 require.NoError(t, err) 124 defer func() { 125 // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests 126 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 127 }() 128 129 // we have a replica in the same cell, so that is the one which should be promoted and not the one from another cell 130 utils.CheckPrimaryTablet(t, clusterInfo, replicaInSameCell, true) 131 // also check that the replication is working correctly after failover 132 utils.VerifyWritesSucceed(t, clusterInfo, replicaInSameCell, []*cluster.Vttablet{crossCellReplica, rdonly}, 10*time.Second) 133 } 134 135 // Failover should not be cross data centers, according to the configuration file 136 // In case of no viable candidates, we should error out 137 func TestCrossDataCenterFailureError(t *testing.T) { 138 defer cluster.PanicHandler(t) 139 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{ 140 PreventCrossDataCenterPrimaryFailover: true, 141 }, 1, "") 142 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 143 shard0 := &keyspace.Shards[0] 144 // find primary from topo 145 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 146 assert.NotNil(t, curPrimary, "should have elected a primary") 147 148 // find the rdonly tablet 149 var rdonly *cluster.Vttablet 150 for _, tablet := range shard0.Vttablets { 151 if tablet.Type == "rdonly" { 152 rdonly = tablet 153 } 154 } 155 assert.NotNil(t, rdonly, "could not find rdonly tablet") 156 157 crossCellReplica1 := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) 158 crossCellReplica2 := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) 159 // newly started tablet does not replicate from anyone yet, we will allow vtorc to fix this too 160 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica1, crossCellReplica2, rdonly}, 25*time.Second) 161 162 // Make the current primary database unavailable. 163 err := curPrimary.MysqlctlProcess.Stop() 164 require.NoError(t, err) 165 defer func() { 166 // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests 167 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 168 }() 169 170 // wait for 20 seconds 171 time.Sleep(20 * time.Second) 172 173 // the previous primary should still be the primary since recovery of dead primary should fail 174 utils.CheckPrimaryTablet(t, clusterInfo, curPrimary, false) 175 } 176 177 // Failover will sometimes lead to a rdonly which can no longer replicate. 178 // covers part of the test case master-failover-lost-replicas from orchestrator 179 func TestLostRdonlyOnPrimaryFailure(t *testing.T) { 180 // new version of ERS does not check for lost replicas yet 181 // Earlier any replicas that were not able to replicate from the previous primary 182 // were detected by vtorc and could be configured to have their sources detached 183 t.Skip() 184 defer cluster.PanicHandler(t) 185 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{ 186 PreventCrossDataCenterPrimaryFailover: true, 187 }, 1, "") 188 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 189 shard0 := &keyspace.Shards[0] 190 // find primary from topo 191 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 192 assert.NotNil(t, curPrimary, "should have elected a primary") 193 194 // get the tablets 195 var replica, rdonly, aheadRdonly *cluster.Vttablet 196 for _, tablet := range shard0.Vttablets { 197 // find tablets which are not the primary 198 if tablet.Alias != curPrimary.Alias { 199 if tablet.Type == "replica" { 200 replica = tablet 201 } else { 202 if rdonly == nil { 203 rdonly = tablet 204 } else { 205 aheadRdonly = tablet 206 } 207 } 208 } 209 } 210 assert.NotNil(t, replica, "could not find replica tablet") 211 assert.NotNil(t, rdonly, "could not find any rdonly tablet") 212 assert.NotNil(t, aheadRdonly, "could not find both rdonly tablet") 213 214 // check that replication is setup correctly 215 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, aheadRdonly, replica}, 15*time.Second) 216 217 // revoke super privileges from vtorc on replica and rdonly so that it is unable to repair the replication 218 utils.ChangePrivileges(t, `REVOKE SUPER ON *.* FROM 'orc_client_user'@'%'`, replica, "orc_client_user") 219 utils.ChangePrivileges(t, `REVOKE SUPER ON *.* FROM 'orc_client_user'@'%'`, rdonly, "orc_client_user") 220 221 // stop replication on the replica and rdonly. 222 err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", replica.Alias) 223 require.NoError(t, err) 224 err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", rdonly.Alias) 225 require.NoError(t, err) 226 227 // check that aheadRdonly is able to replicate. We also want to add some queries to aheadRdonly which will not be there in replica and rdonly 228 utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{aheadRdonly}, 15*time.Second) 229 230 // assert that the replica and rdonly are indeed lagging and do not have the new insertion by checking the count of rows in the tables 231 out, err := utils.RunSQL(t, "SELECT * FROM vt_insert_test", replica, "vt_ks") 232 require.NoError(t, err) 233 require.Equal(t, 1, len(out.Rows)) 234 out, err = utils.RunSQL(t, "SELECT * FROM vt_insert_test", rdonly, "vt_ks") 235 require.NoError(t, err) 236 require.Equal(t, 1, len(out.Rows)) 237 238 // Make the current primary database unavailable. 239 err = curPrimary.MysqlctlProcess.Stop() 240 require.NoError(t, err) 241 defer func() { 242 // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests 243 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 244 }() 245 246 // grant super privileges back to vtorc on replica and rdonly so that it can repair 247 utils.ChangePrivileges(t, `GRANT SUPER ON *.* TO 'orc_client_user'@'%'`, replica, "orc_client_user") 248 utils.ChangePrivileges(t, `GRANT SUPER ON *.* TO 'orc_client_user'@'%'`, rdonly, "orc_client_user") 249 250 // vtorc must promote the lagging replica and not the rdonly, since it has a MustNotPromoteRule promotion rule 251 utils.CheckPrimaryTablet(t, clusterInfo, replica, true) 252 253 // also check that the replication is setup correctly 254 utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 15*time.Second) 255 256 // check that the rdonly is lost. The lost replica has is detached and its host is prepended with `//` 257 out, err = utils.RunSQL(t, "SELECT HOST FROM performance_schema.replication_connection_configuration", aheadRdonly, "") 258 require.NoError(t, err) 259 require.Equal(t, "//localhost", out.Rows[0][0].ToString()) 260 } 261 262 // This test checks that the promotion of a tablet succeeds if it passes the promotion lag test 263 // covers the test case master-failover-fail-promotion-lag-minutes-success from orchestrator 264 func TestPromotionLagSuccess(t *testing.T) { 265 defer cluster.PanicHandler(t) 266 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ 267 ReplicationLagQuery: "select 59", 268 FailPrimaryPromotionOnLagMinutes: 1, 269 }, 1, "") 270 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 271 shard0 := &keyspace.Shards[0] 272 // find primary from topo 273 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 274 assert.NotNil(t, curPrimary, "should have elected a primary") 275 276 // find the replica and rdonly tablets 277 var replica, rdonly *cluster.Vttablet 278 for _, tablet := range shard0.Vttablets { 279 // we know we have only two replcia tablets, so the one not the primary must be the other replica 280 if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { 281 replica = tablet 282 } 283 if tablet.Type == "rdonly" { 284 rdonly = tablet 285 } 286 } 287 assert.NotNil(t, replica, "could not find replica tablet") 288 assert.NotNil(t, rdonly, "could not find rdonly tablet") 289 290 // check that the replication is setup correctly before we failover 291 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second) 292 293 // Make the current primary database unavailable. 294 err := curPrimary.MysqlctlProcess.Stop() 295 require.NoError(t, err) 296 defer func() { 297 // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests 298 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 299 }() 300 301 // check that the replica gets promoted 302 utils.CheckPrimaryTablet(t, clusterInfo, replica, true) 303 // also check that the replication is working correctly after failover 304 utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) 305 } 306 307 // This test checks that the promotion of a tablet succeeds if it passes the promotion lag test 308 // covers the test case master-failover-fail-promotion-lag-minutes-failure from orchestrator 309 func TestPromotionLagFailure(t *testing.T) { 310 // new version of ERS does not check for promotion lag yet 311 // Earlier vtorc used to check that the promotion lag between the new primary and the old one 312 // was smaller than the configured value, otherwise it would fail the promotion 313 t.Skip() 314 defer cluster.PanicHandler(t) 315 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 1, nil, cluster.VTOrcConfiguration{ 316 ReplicationLagQuery: "select 61", 317 FailPrimaryPromotionOnLagMinutes: 1, 318 }, 1, "") 319 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 320 shard0 := &keyspace.Shards[0] 321 // find primary from topo 322 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 323 assert.NotNil(t, curPrimary, "should have elected a primary") 324 325 // find the replica and rdonly tablets 326 var replica1, replica2, rdonly *cluster.Vttablet 327 for _, tablet := range shard0.Vttablets { 328 // we know we have only two replcia tablets, so the one not the primary must be the other replica 329 if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { 330 if replica1 == nil { 331 replica1 = tablet 332 } else { 333 replica2 = tablet 334 } 335 } 336 if tablet.Type == "rdonly" { 337 rdonly = tablet 338 } 339 } 340 assert.NotNil(t, replica1, "could not find replica tablet") 341 assert.NotNil(t, replica2, "could not find second replica tablet") 342 assert.NotNil(t, rdonly, "could not find rdonly tablet") 343 344 // check that the replication is setup correctly before we failover 345 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica1, replica2}, 10*time.Second) 346 347 // Make the current primary database unavailable. 348 err := curPrimary.MysqlctlProcess.Stop() 349 require.NoError(t, err) 350 defer func() { 351 // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests 352 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 353 }() 354 355 // wait for 20 seconds 356 time.Sleep(20 * time.Second) 357 358 // the previous primary should still be the primary since recovery of dead primary should fail 359 utils.CheckPrimaryTablet(t, clusterInfo, curPrimary, false) 360 } 361 362 // covers the test case master-failover-candidate from orchestrator 363 // We explicitly set one of the replicas to Prefer promotion rule. 364 // That is the replica which should be promoted in case of primary failure 365 func TestDownPrimaryPromotionRule(t *testing.T) { 366 defer cluster.PanicHandler(t) 367 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ 368 LockShardTimeoutSeconds: 5, 369 }, 1, "test") 370 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 371 shard0 := &keyspace.Shards[0] 372 // find primary from topo 373 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 374 assert.NotNil(t, curPrimary, "should have elected a primary") 375 376 // find the replica and rdonly tablets 377 var replica, rdonly *cluster.Vttablet 378 for _, tablet := range shard0.Vttablets { 379 // we know we have only two replcia tablets, so the one not the primary must be the other replica 380 if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { 381 replica = tablet 382 } 383 if tablet.Type == "rdonly" { 384 rdonly = tablet 385 } 386 } 387 assert.NotNil(t, replica, "could not find replica tablet") 388 assert.NotNil(t, rdonly, "could not find rdonly tablet") 389 390 crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) 391 // newly started tablet does not replicate from anyone yet, we will allow vtorc to fix this too 392 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica, rdonly, replica}, 25*time.Second) 393 394 // Make the current primary database unavailable. 395 err := curPrimary.MysqlctlProcess.Stop() 396 require.NoError(t, err) 397 defer func() { 398 // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests 399 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 400 }() 401 402 // we have a replica with a preferred promotion rule, so that is the one which should be promoted 403 utils.CheckPrimaryTablet(t, clusterInfo, crossCellReplica, true) 404 // also check that the replication is working correctly after failover 405 utils.VerifyWritesSucceed(t, clusterInfo, crossCellReplica, []*cluster.Vttablet{rdonly, replica}, 10*time.Second) 406 } 407 408 // covers the test case master-failover-candidate-lag from orchestrator 409 // We explicitly set one of the replicas to Prefer promotion rule and make it lag with respect to other replicas. 410 // That is the replica which should be promoted in case of primary failure 411 // It should also be caught up when it is promoted 412 func TestDownPrimaryPromotionRuleWithLag(t *testing.T) { 413 defer cluster.PanicHandler(t) 414 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ 415 LockShardTimeoutSeconds: 5, 416 }, 1, "test") 417 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 418 shard0 := &keyspace.Shards[0] 419 // find primary from topo 420 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 421 assert.NotNil(t, curPrimary, "should have elected a primary") 422 423 // get the replicas in the same cell 424 var replica, rdonly *cluster.Vttablet 425 for _, tablet := range shard0.Vttablets { 426 // find tablets which are not the primary 427 if tablet.Alias != curPrimary.Alias { 428 if tablet.Type == "replica" { 429 replica = tablet 430 } else { 431 rdonly = tablet 432 } 433 } 434 } 435 assert.NotNil(t, replica, "could not find replica tablet") 436 assert.NotNil(t, rdonly, "could not find rdonly tablet") 437 438 crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) 439 // newly started tablet does not replicate from anyone yet, we will allow vtorc to fix this too 440 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica, replica, rdonly}, 25*time.Second) 441 442 // revoke super privileges from vtorc on crossCellReplica so that it is unable to repair the replication 443 utils.ChangePrivileges(t, `REVOKE SUPER ON *.* FROM 'orc_client_user'@'%'`, crossCellReplica, "orc_client_user") 444 445 // stop replication on the crossCellReplica. 446 err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", crossCellReplica.Alias) 447 require.NoError(t, err) 448 449 // check that rdonly and replica are able to replicate. We also want to add some queries to replica which will not be there in crossCellReplica 450 utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 15*time.Second) 451 452 // reset the primary logs so that crossCellReplica can never catch up 453 utils.ResetPrimaryLogs(t, curPrimary) 454 455 // start replication back on the crossCellReplica. 456 err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", crossCellReplica.Alias) 457 require.NoError(t, err) 458 459 // grant super privileges back to vtorc on crossCellReplica so that it can repair 460 utils.ChangePrivileges(t, `GRANT SUPER ON *.* TO 'orc_client_user'@'%'`, crossCellReplica, "orc_client_user") 461 462 // assert that the crossCellReplica is indeed lagging and does not have the new insertion by checking the count of rows in the table 463 out, err := utils.RunSQL(t, "SELECT * FROM vt_insert_test", crossCellReplica, "vt_ks") 464 require.NoError(t, err) 465 require.Equal(t, 1, len(out.Rows)) 466 467 // Make the current primary database unavailable. 468 err = curPrimary.MysqlctlProcess.Stop() 469 require.NoError(t, err) 470 defer func() { 471 // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests 472 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 473 }() 474 475 // the crossCellReplica is set to be preferred according to the durability requirements. So it must be promoted 476 utils.CheckPrimaryTablet(t, clusterInfo, crossCellReplica, true) 477 478 // assert that the crossCellReplica has indeed caught up 479 out, err = utils.RunSQL(t, "SELECT * FROM vt_insert_test", crossCellReplica, "vt_ks") 480 require.NoError(t, err) 481 require.Equal(t, 2, len(out.Rows)) 482 483 // check that rdonly and replica are able to replicate from the crossCellReplica 484 utils.VerifyWritesSucceed(t, clusterInfo, crossCellReplica, []*cluster.Vttablet{replica, rdonly}, 15*time.Second) 485 } 486 487 // covers the test case master-failover-candidate-lag-cross-datacenter from orchestrator 488 // We explicitly set one of the cross-cell replicas to Prefer promotion rule, but we prevent cross data center promotions. 489 // We let a replica in our own cell lag. That is the replica which should be promoted in case of primary failure 490 // It should also be caught up when it is promoted 491 func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) { 492 defer cluster.PanicHandler(t) 493 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ 494 LockShardTimeoutSeconds: 5, 495 PreventCrossDataCenterPrimaryFailover: true, 496 }, 1, "test") 497 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 498 shard0 := &keyspace.Shards[0] 499 // find primary from topo 500 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 501 assert.NotNil(t, curPrimary, "should have elected a primary") 502 503 // get the replicas in the same cell 504 var replica, rdonly *cluster.Vttablet 505 for _, tablet := range shard0.Vttablets { 506 // find tablets which are not the primary 507 if tablet.Alias != curPrimary.Alias { 508 if tablet.Type == "replica" { 509 replica = tablet 510 } else { 511 rdonly = tablet 512 } 513 } 514 } 515 assert.NotNil(t, replica, "could not find replica tablet") 516 assert.NotNil(t, rdonly, "could not find rdonly tablet") 517 518 crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) 519 // newly started tablet does not replicate from anyone yet, we will allow vtorc to fix this too 520 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{crossCellReplica, replica, rdonly}, 25*time.Second) 521 522 // revoke super privileges from vtorc on replica so that it is unable to repair the replication 523 utils.ChangePrivileges(t, `REVOKE SUPER ON *.* FROM 'orc_client_user'@'%'`, replica, "orc_client_user") 524 525 // stop replication on the replica. 526 err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", replica.Alias) 527 require.NoError(t, err) 528 529 // check that rdonly and crossCellReplica are able to replicate. We also want to add some queries to crossCenterReplica which will not be there in replica 530 utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, crossCellReplica}, 15*time.Second) 531 532 // reset the primary logs so that crossCellReplica can never catch up 533 utils.ResetPrimaryLogs(t, curPrimary) 534 535 // start replication back on the replica. 536 err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", replica.Alias) 537 require.NoError(t, err) 538 539 // grant super privileges back to vtorc on replica so that it can repair 540 utils.ChangePrivileges(t, `GRANT SUPER ON *.* TO 'orc_client_user'@'%'`, replica, "orc_client_user") 541 542 // assert that the replica is indeed lagging and does not have the new insertion by checking the count of rows in the table 543 out, err := utils.RunSQL(t, "SELECT * FROM vt_insert_test", replica, "vt_ks") 544 require.NoError(t, err) 545 require.Equal(t, 1, len(out.Rows)) 546 547 // Make the current primary database unavailable. 548 err = curPrimary.MysqlctlProcess.Stop() 549 require.NoError(t, err) 550 defer func() { 551 // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests 552 utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) 553 }() 554 555 // the replica should be promoted since we have prevented cross cell promotions 556 utils.CheckPrimaryTablet(t, clusterInfo, replica, true) 557 558 // assert that the replica has indeed caught up 559 out, err = utils.RunSQL(t, "SELECT * FROM vt_insert_test", replica, "vt_ks") 560 require.NoError(t, err) 561 require.Equal(t, 2, len(out.Rows)) 562 563 // check that rdonly and crossCellReplica are able to replicate from the replica 564 utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica, rdonly}, 15*time.Second) 565 }