vitess.io/vitess@v0.16.2/go/test/endtoend/vtorc/general/vtorc_test.go (about) 1 /* 2 Copyright 2020 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package general 18 19 import ( 20 "context" 21 "fmt" 22 "testing" 23 "time" 24 25 "github.com/stretchr/testify/assert" 26 "github.com/stretchr/testify/require" 27 28 "vitess.io/vitess/go/test/endtoend/cluster" 29 "vitess.io/vitess/go/test/endtoend/vtorc/utils" 30 "vitess.io/vitess/go/vt/log" 31 "vitess.io/vitess/go/vt/vtorc/logic" 32 ) 33 34 // Cases to test: 35 // 1. create cluster with 2 replicas and 1 rdonly, let orc choose primary 36 // verify rdonly is not elected, only replica 37 // verify replication is setup 38 // verify that with multiple vtorc instances, we still only have 1 PlannedReparentShard call 39 func TestPrimaryElection(t *testing.T) { 40 defer cluster.PanicHandler(t) 41 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ 42 PreventCrossDataCenterPrimaryFailover: true, 43 }, 2, "") 44 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 45 shard0 := &keyspace.Shards[0] 46 47 primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 48 assert.NotNil(t, primary, "should have elected a primary") 49 utils.CheckReplication(t, clusterInfo, primary, shard0.Vttablets, 10*time.Second) 50 51 for _, vttablet := range shard0.Vttablets { 52 if vttablet.Type == "rdonly" && primary.Alias == vttablet.Alias { 53 t.Errorf("Rdonly tablet promoted as primary - %v", primary.Alias) 54 } 55 } 56 57 res, err := utils.RunSQL(t, "select * from reparent_journal", primary, "_vt") 58 require.NoError(t, err) 59 require.Len(t, res.Rows, 1, "There should only be 1 primary tablet which was elected") 60 } 61 62 // Cases to test: 63 // 1. create cluster with 1 replica and 1 rdonly, let orc choose primary 64 // verify rdonly is not elected, only replica 65 // verify replication is setup 66 func TestSingleKeyspace(t *testing.T) { 67 defer cluster.PanicHandler(t) 68 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{ 69 PreventCrossDataCenterPrimaryFailover: true, 70 }, 1, "") 71 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 72 shard0 := &keyspace.Shards[0] 73 74 utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) 75 utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) 76 utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) 77 } 78 79 // Cases to test: 80 // 1. create cluster with 1 replica and 1 rdonly, let orc choose primary 81 // verify rdonly is not elected, only replica 82 // verify replication is setup 83 func TestKeyspaceShard(t *testing.T) { 84 defer cluster.PanicHandler(t) 85 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{ 86 PreventCrossDataCenterPrimaryFailover: true, 87 }, 1, "") 88 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 89 shard0 := &keyspace.Shards[0] 90 91 utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) 92 utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) 93 utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) 94 } 95 96 // Cases to test: 97 // 1. make primary readonly, let vtorc repair 98 // 2. make replica ReadWrite, let vtorc repair 99 // 3. stop replication, let vtorc repair 100 // 4. setup replication from non-primary, let vtorc repair 101 // 5. make instance A replicates from B and B from A, wait for repair 102 func TestVTOrcRepairs(t *testing.T) { 103 defer cluster.PanicHandler(t) 104 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{ 105 PreventCrossDataCenterPrimaryFailover: true, 106 }, 1, "") 107 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 108 shard0 := &keyspace.Shards[0] 109 110 // find primary from topo 111 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 112 assert.NotNil(t, curPrimary, "should have elected a primary") 113 vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] 114 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) 115 116 var replica, otherReplica *cluster.Vttablet 117 for _, tablet := range shard0.Vttablets { 118 // we know we have only two tablets, so the "other" one must be the new primary 119 if tablet.Alias != curPrimary.Alias { 120 if replica == nil { 121 replica = tablet 122 } else { 123 otherReplica = tablet 124 } 125 } 126 } 127 require.NotNil(t, replica, "should be able to find a replica") 128 require.NotNil(t, otherReplica, "should be able to find 2nd replica") 129 130 // check replication is setup correctly 131 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) 132 133 t.Run("PrimaryReadOnly", func(t *testing.T) { 134 // Make the current primary database read-only. 135 _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") 136 require.NoError(t, err) 137 138 // wait for repair 139 match := utils.WaitForReadOnlyValue(t, curPrimary, 0) 140 require.True(t, match) 141 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 1) 142 }) 143 144 t.Run("ReplicaReadWrite", func(t *testing.T) { 145 // Make the replica database read-write. 146 _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") 147 require.NoError(t, err) 148 149 // wait for repair 150 match := utils.WaitForReadOnlyValue(t, replica, 1) 151 require.True(t, match) 152 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 1) 153 }) 154 155 t.Run("StopReplication", func(t *testing.T) { 156 // use vtctlclient to stop replication 157 _, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) 158 require.NoError(t, err) 159 160 // check replication is setup correctly 161 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) 162 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 2) 163 164 // Stop just the IO thread on the replica 165 _, err = utils.RunSQL(t, "STOP SLAVE IO_THREAD", replica, "") 166 require.NoError(t, err) 167 168 // check replication is setup correctly 169 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) 170 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 3) 171 172 // Stop just the SQL thread on the replica 173 _, err = utils.RunSQL(t, "STOP SLAVE SQL_THREAD", replica, "") 174 require.NoError(t, err) 175 176 // check replication is setup correctly 177 utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) 178 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 4) 179 }) 180 181 t.Run("ReplicationFromOtherReplica", func(t *testing.T) { 182 // point replica at otherReplica 183 changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ 184 "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) 185 _, err := utils.RunSQL(t, changeReplicationSourceCommand, replica, "") 186 require.NoError(t, err) 187 188 // wait until the source port is set back correctly by vtorc 189 utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) 190 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 5) 191 192 // check that writes succeed 193 utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) 194 }) 195 196 t.Run("CircularReplication", func(t *testing.T) { 197 // change the replication source on the primary 198 changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ 199 "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ 200 "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) 201 _, err := utils.RunSQL(t, changeReplicationSourceCommands, curPrimary, "") 202 require.NoError(t, err) 203 204 // wait for curPrimary to reach stable state 205 time.Sleep(1 * time.Second) 206 207 // wait for repair 208 err = utils.WaitForReplicationToStop(t, curPrimary) 209 require.NoError(t, err) 210 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 1) 211 // check that the writes still succeed 212 utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second) 213 }) 214 } 215 216 func TestRepairAfterTER(t *testing.T) { 217 // test fails intermittently on CI, skip until it can be fixed. 218 t.SkipNow() 219 defer cluster.PanicHandler(t) 220 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{ 221 PreventCrossDataCenterPrimaryFailover: true, 222 }, 1, "") 223 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 224 shard0 := &keyspace.Shards[0] 225 226 // find primary from topo 227 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 228 assert.NotNil(t, curPrimary, "should have elected a primary") 229 230 // TODO(deepthi): we should not need to do this, the DB should be created automatically 231 _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) 232 require.NoError(t, err) 233 234 var newPrimary *cluster.Vttablet 235 for _, tablet := range shard0.Vttablets { 236 // we know we have only two tablets, so the "other" one must be the new primary 237 if tablet.Alias != curPrimary.Alias { 238 newPrimary = tablet 239 break 240 } 241 } 242 243 // TER to other tablet 244 _, err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("TabletExternallyReparented", newPrimary.Alias) 245 require.NoError(t, err) 246 247 utils.CheckReplication(t, clusterInfo, newPrimary, []*cluster.Vttablet{curPrimary}, 15*time.Second) 248 } 249 250 // TestSemiSync tests that semi-sync is setup correctly by vtorc if it is incorrectly set 251 func TestSemiSync(t *testing.T) { 252 // stop any vtorc instance running due to a previous test. 253 utils.StopVTOrcs(t, clusterInfo) 254 newCluster := utils.SetupNewClusterSemiSync(t) 255 utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{ 256 PreventCrossDataCenterPrimaryFailover: true, 257 }, 1) 258 defer func() { 259 utils.StopVTOrcs(t, newCluster) 260 newCluster.ClusterInstance.Teardown() 261 }() 262 keyspace := &newCluster.ClusterInstance.Keyspaces[0] 263 shard0 := &keyspace.Shards[0] 264 265 // find primary from topo 266 primary := utils.ShardPrimaryTablet(t, newCluster, keyspace, shard0) 267 assert.NotNil(t, primary, "should have elected a primary") 268 269 var replica1, replica2, rdonly *cluster.Vttablet 270 for _, tablet := range shard0.Vttablets { 271 if tablet.Alias == primary.Alias { 272 continue 273 } 274 if tablet.Type == "rdonly" { 275 rdonly = tablet 276 } else { 277 if replica1 == nil { 278 replica1 = tablet 279 } else { 280 replica2 = tablet 281 } 282 } 283 } 284 285 assert.NotNil(t, replica1, "could not find any replica tablet") 286 assert.NotNil(t, replica2, "could not find the second replica tablet") 287 assert.NotNil(t, rdonly, "could not find rdonly tablet") 288 289 // check that the replication is setup correctly 290 utils.CheckReplication(t, newCluster, primary, []*cluster.Vttablet{rdonly, replica1, replica2}, 10*time.Second) 291 292 _, err := utils.RunSQL(t, "SET GLOBAL rpl_semi_sync_slave_enabled = 0", replica1, "") 293 require.NoError(t, err) 294 _, err = utils.RunSQL(t, "SET GLOBAL rpl_semi_sync_slave_enabled = 1", rdonly, "") 295 require.NoError(t, err) 296 _, err = utils.RunSQL(t, "SET GLOBAL rpl_semi_sync_master_enabled = 0", primary, "") 297 require.NoError(t, err) 298 299 timeout := time.After(20 * time.Second) 300 for { 301 select { 302 case <-timeout: 303 require.Fail(t, "timed out waiting for semi sync settings to be fixed") 304 return 305 default: 306 if utils.IsSemiSyncSetupCorrectly(t, replica1, "ON") && 307 utils.IsSemiSyncSetupCorrectly(t, rdonly, "OFF") && 308 utils.IsPrimarySemiSyncSetupCorrectly(t, primary, "ON") { 309 return 310 } 311 log.Warningf("semi sync settings not fixed yet") 312 time.Sleep(1 * time.Second) 313 } 314 } 315 } 316 317 // TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld 318 func TestVTOrcWithPrs(t *testing.T) { 319 defer cluster.PanicHandler(t) 320 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{ 321 PreventCrossDataCenterPrimaryFailover: true, 322 }, 1, "") 323 keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] 324 shard0 := &keyspace.Shards[0] 325 326 // find primary from topo 327 curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) 328 assert.NotNil(t, curPrimary, "should have elected a primary") 329 vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] 330 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) 331 332 // find any replica tablet other than the current primary 333 var replica *cluster.Vttablet 334 for _, tablet := range shard0.Vttablets { 335 if tablet.Alias != curPrimary.Alias { 336 replica = tablet 337 break 338 } 339 } 340 assert.NotNil(t, replica, "could not find any replica tablet") 341 342 // check that the replication is setup correctly before we failover 343 utils.CheckReplication(t, clusterInfo, curPrimary, shard0.Vttablets, 10*time.Second) 344 345 output, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( 346 "PlannedReparentShard", "--", 347 "--keyspace_shard", fmt.Sprintf("%s/%s", keyspace.Name, shard0.Name), 348 "--wait_replicas_timeout", "31s", 349 "--new_primary", replica.Alias) 350 require.NoError(t, err, "error in PlannedReparentShard output - %s", output) 351 352 time.Sleep(40 * time.Second) 353 354 // check that the replica gets promoted 355 utils.CheckPrimaryTablet(t, clusterInfo, replica, true) 356 // Verify that VTOrc didn't run any other recovery 357 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) 358 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) 359 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) 360 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) 361 utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) 362 utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second) 363 } 364 365 // TestMultipleDurabilities tests that VTOrc works with 2 keyspaces having 2 different durability policies 366 func TestMultipleDurabilities(t *testing.T) { 367 defer cluster.PanicHandler(t) 368 // Setup a normal cluster and start vtorc 369 utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{}, 1, "") 370 // Setup a semi-sync cluster 371 utils.AddSemiSyncKeyspace(t, clusterInfo) 372 373 keyspaceNone := &clusterInfo.ClusterInstance.Keyspaces[0] 374 shardNone := &keyspaceNone.Shards[0] 375 utils.CheckPrimaryTablet(t, clusterInfo, shardNone.Vttablets[0], true) 376 utils.CheckReplication(t, clusterInfo, shardNone.Vttablets[0], shardNone.Vttablets[1:], 10*time.Second) 377 378 keyspaceSemiSync := &clusterInfo.ClusterInstance.Keyspaces[1] 379 shardSemiSync := &keyspaceSemiSync.Shards[0] 380 // find primary from topo 381 primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspaceSemiSync, shardSemiSync) 382 assert.NotNil(t, primary, "should have elected a primary") 383 } 384 385 // TestDurabilityPolicySetLater tests that VTOrc works even if the durability policy of the keyspace is 386 // set after VTOrc has been started. 387 func TestDurabilityPolicySetLater(t *testing.T) { 388 // stop any vtorc instance running due to a previous test. 389 utils.StopVTOrcs(t, clusterInfo) 390 newCluster := utils.SetupNewClusterSemiSync(t) 391 keyspace := &newCluster.ClusterInstance.Keyspaces[0] 392 shard0 := &keyspace.Shards[0] 393 // Before starting VTOrc we explicity want to set the durability policy of the keyspace to an empty string 394 func() { 395 ctx, unlock, lockErr := newCluster.Ts.LockKeyspace(context.Background(), keyspace.Name, "TestDurabilityPolicySetLater") 396 require.NoError(t, lockErr) 397 defer unlock(&lockErr) 398 ki, err := newCluster.Ts.GetKeyspace(ctx, keyspace.Name) 399 require.NoError(t, err) 400 ki.DurabilityPolicy = "" 401 err = newCluster.Ts.UpdateKeyspace(ctx, ki) 402 require.NoError(t, err) 403 }() 404 405 // Verify that the durability policy is indeed empty 406 ki, err := newCluster.Ts.GetKeyspace(context.Background(), keyspace.Name) 407 require.NoError(t, err) 408 require.Empty(t, ki.DurabilityPolicy) 409 410 // Now start the vtorc instances 411 utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{ 412 PreventCrossDataCenterPrimaryFailover: true, 413 }, 1) 414 defer func() { 415 utils.StopVTOrcs(t, newCluster) 416 newCluster.ClusterInstance.Teardown() 417 }() 418 419 // Wait for some time to be sure that VTOrc has started. 420 // TODO(GuptaManan100): Once we have a debug page for VTOrc, use that instead 421 time.Sleep(30 * time.Second) 422 423 // Now set the correct durability policy 424 out, err := newCluster.VtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspace.Name, "--durability-policy=semi_sync") 425 require.NoError(t, err, out) 426 427 // VTOrc should promote a new primary after seeing the durability policy change 428 primary := utils.ShardPrimaryTablet(t, newCluster, keyspace, shard0) 429 assert.NotNil(t, primary, "should have elected a primary") 430 utils.CheckReplication(t, newCluster, primary, shard0.Vttablets, 10*time.Second) 431 }