vitess.io/vitess@v0.16.2/go/test/endtoend/vtgr/vtgr_test.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package vtgr 18 19 import ( 20 "fmt" 21 "os" 22 "os/exec" 23 "path" 24 "strconv" 25 "strings" 26 "testing" 27 "time" 28 29 "vitess.io/vitess/go/sqltypes" 30 31 "github.com/stretchr/testify/require" 32 "gotest.tools/assert" 33 34 "vitess.io/vitess/go/json2" 35 "vitess.io/vitess/go/test/endtoend/cluster" 36 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 37 ) 38 39 // To run this test locally on MacOS, set hostname to localhost first: 40 // $ sudo scutil --set HostName localhost 41 42 func createCluster(t *testing.T, numReplicas int) *cluster.LocalProcessCluster { 43 keyspaceName := "ks" 44 shardName := "0" 45 keyspace := &cluster.Keyspace{Name: keyspaceName} 46 shard0 := &cluster.Shard{Name: shardName} 47 hostname := "localhost" 48 cell1 := "zone1" 49 tablets := []*cluster.Vttablet{} 50 clusterInstance := cluster.NewCluster(cell1, hostname) 51 52 os.Setenv("EXTRA_MY_CNF", path.Join(os.Getenv("PWD"), "my.cnf")) 53 54 // Start topo server 55 err := clusterInstance.StartTopo() 56 require.NoError(t, err) 57 58 uidBase := 100 59 for i := 0; i < numReplicas; i++ { 60 tablet := clusterInstance.NewVttabletInstance("replica", uidBase+i, cell1) 61 tablets = append(tablets, tablet) 62 } 63 64 // Initialize Cluster 65 shard0.Vttablets = tablets 66 err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard0}) 67 require.NoError(t, err) 68 69 // Start MySql 70 var mysqlCtlProcessList []*exec.Cmd 71 for _, tablet := range shard0.Vttablets { 72 proc, err := tablet.MysqlctlProcess.StartProcess() 73 require.NoError(t, err) 74 mysqlCtlProcessList = append(mysqlCtlProcessList, proc) 75 } 76 77 // Wait for mysql processes to start 78 for _, proc := range mysqlCtlProcessList { 79 err := proc.Wait() 80 require.NoError(t, err) 81 } 82 for _, tablet := range shard0.Vttablets { 83 // Reset status, don't wait for the tablet status. We will check it later 84 tablet.VttabletProcess.ServingStatus = "" 85 tablet.VttabletProcess.DbFlavor = "MysqlGR" 86 // If we enable backup the GR setup is a bit wacky 87 tablet.VttabletProcess.SupportsBackup = false 88 // Start the tablet 89 err := tablet.VttabletProcess.Setup() 90 require.NoError(t, err) 91 } 92 93 // Start vtgr - we deploy vtgr on the tablet node in the test 94 baseGrPort := 33061 95 for i, tablet := range shard0.Vttablets { 96 tablet.VtgrProcess = clusterInstance.NewVtgrProcess( 97 []string{fmt.Sprintf("%s/%s", keyspaceName, shardName)}, 98 path.Join(os.Getenv("PWD"), "test_config.json"), 99 baseGrPort+i, 100 ) 101 } 102 103 for _, tablet := range shard0.Vttablets { 104 err := tablet.VttabletProcess.WaitForTabletTypes([]string{"NOT_SERVING"}) 105 require.NoError(t, err) 106 } 107 return clusterInstance 108 } 109 110 func killTablets(t *testing.T, shard *cluster.Shard) { 111 for _, tablet := range shard.Vttablets { 112 if tablet.VtgrProcess != nil { 113 err := tablet.VtgrProcess.TearDown() 114 require.NoError(t, err) 115 } 116 err := tablet.VttabletProcess.TearDown() 117 require.NoError(t, err) 118 } 119 } 120 121 func TestBasicSetup(t *testing.T) { 122 defer cluster.PanicHandler(t) 123 clusterInstance := createCluster(t, 2) 124 keyspace := &clusterInstance.Keyspaces[0] 125 shard0 := &keyspace.Shards[0] 126 defer func() { 127 clusterInstance.Teardown() 128 killTablets(t, shard0) 129 }() 130 for _, tablet := range shard0.Vttablets { 131 // Until there is a primary, all tablets are replica and should all be NOT_SERVING status 132 tab := getTablet(t, clusterInstance, tablet.Alias) 133 assert.Equal(t, tab.Type.String(), "REPLICA") 134 assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING") 135 } 136 _, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) 137 assert.ErrorContains(t, err, "timeout looking for primary tablet") 138 139 tablet1 := shard0.Vttablets[0] 140 query := `select count(*) 141 from performance_schema.replication_group_members 142 where MEMBER_STATE='ONLINE'` 143 var count int 144 err = getSQLResult(t, tablet1, query, func(values []sqltypes.Value) bool { 145 cnt, err := values[0].ToInt64() 146 if err != nil { 147 return false 148 } 149 count = int(cnt) 150 return true 151 }) 152 require.NoError(t, err) 153 require.NoError(t, err) 154 // without vtgr, tablet process will not create a mysql group 155 // and all the nodes are replicas type in NOT_SERVING state 156 assert.Equal(t, 0, int(count)) 157 } 158 159 func TestVTGRSetup(t *testing.T) { 160 defer cluster.PanicHandler(t) 161 clusterInstance := createCluster(t, 2) 162 keyspace := &clusterInstance.Keyspaces[0] 163 shard0 := &keyspace.Shards[0] 164 defer func() { 165 clusterInstance.Teardown() 166 killTablets(t, shard0) 167 }() 168 for _, tablet := range shard0.Vttablets { 169 // Until there is a primary, all tablets are replica and should all be NOT_SERVING status 170 tab := getTablet(t, clusterInstance, tablet.Alias) 171 assert.Equal(t, tab.Type.String(), "REPLICA") 172 assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING") 173 } 174 175 // start VTGR processes 176 for _, tablet := range shard0.Vttablets { 177 err := tablet.VtgrProcess.Start(tablet.Alias) 178 require.NoError(t, err) 179 } 180 181 // VTGR will pick one tablet as the primary 182 primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) 183 require.NoError(t, err) 184 require.NotEqual(t, nil, primaryAlias) 185 186 tablet1 := shard0.Vttablets[0] 187 query := `select count(*) 188 from performance_schema.replication_group_members 189 where MEMBER_STATE='ONLINE'` 190 err = getSQLResult(t, tablet1, query, func(values []sqltypes.Value) bool { 191 cnt, err := values[0].ToInt64() 192 if err != nil { 193 return false 194 } 195 // VTGR should bootstrap the group and put the replica into the group 196 return cnt == 2 197 }) 198 require.NoError(t, err) 199 } 200 201 func TestVTGRWrongPrimaryTablet(t *testing.T) { 202 defer cluster.PanicHandler(t) 203 clusterInstance := createCluster(t, 2) 204 keyspace := &clusterInstance.Keyspaces[0] 205 shard0 := &keyspace.Shards[0] 206 defer func() { 207 clusterInstance.Teardown() 208 killTablets(t, shard0) 209 }() 210 for _, tablet := range shard0.Vttablets { 211 // Until there is a primary, all tablets are replica and should all be NOT_SERVING status 212 tab := getTablet(t, clusterInstance, tablet.Alias) 213 assert.Equal(t, tab.Type.String(), "REPLICA") 214 assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING") 215 } 216 // start VTGR processes 217 for _, tablet := range shard0.Vttablets { 218 err := tablet.VtgrProcess.Start(tablet.Alias) 219 require.NoError(t, err) 220 } 221 // VTGR will pick one tablet as the primary 222 primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) 223 require.NoError(t, err) 224 require.NotEqual(t, nil, primaryAlias) 225 tablet := shard0.Vttablets[0] 226 query := `select member_id 227 from performance_schema.replication_group_members 228 where member_role='SECONDARY' and member_state='ONLINE'` 229 var member string 230 err = getSQLResult(t, tablet, query, func(values []sqltypes.Value) bool { 231 member = values[0].ToString() 232 return true 233 }) 234 require.NoError(t, err) 235 query = fmt.Sprintf(`select group_replication_set_as_primary('%s')`, member) 236 _, err = tablet.VttabletProcess.QueryTabletWithDB(query, "") 237 require.NoError(t, err) 238 239 // Verify the mysql primary changed, and also the primary tablet changed as well 240 query = fmt.Sprintf(`select member_role from performance_schema.replication_group_members where member_id='%s'`, member) 241 err = getSQLResult(t, tablet, query, func(values []sqltypes.Value) bool { 242 return values[0].ToString() == "PRIMARY" 243 }) 244 require.NoError(t, err) 245 err = verifyPrimaryChange(t, clusterInstance, keyspace.Name, shard0.Name, primaryAlias) 246 require.NoError(t, err) 247 } 248 249 func TestVTGRFailover(t *testing.T) { 250 defer cluster.PanicHandler(t) 251 clusterInstance := createCluster(t, 3) 252 keyspace := &clusterInstance.Keyspaces[0] 253 shard0 := &keyspace.Shards[0] 254 defer func() { 255 clusterInstance.Teardown() 256 killTablets(t, shard0) 257 }() 258 for _, tablet := range shard0.Vttablets { 259 // Until there is a primary, all tablets are replica and should all be NOT_SERVING status 260 tab := getTablet(t, clusterInstance, tablet.Alias) 261 assert.Equal(t, tab.Type.String(), "REPLICA") 262 assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING") 263 } 264 // start VTGR processes 265 for _, tablet := range shard0.Vttablets { 266 err := tablet.VtgrProcess.Start(tablet.Alias) 267 require.NoError(t, err) 268 } 269 primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) 270 require.NoError(t, err) 271 // VTGR has init the cluster 272 require.NotEqual(t, "", primaryAlias) 273 primaryTablet := findTabletByAlias(shard0.Vttablets, primaryAlias) 274 require.NotNil(t, primaryTablet) 275 // Wait until there are two nodes in the group 276 query := `select count(*) from 277 performance_schema.replication_group_members 278 where MEMBER_STATE='ONLINE'` 279 err = getSQLResult(t, primaryTablet, query, func(values []sqltypes.Value) bool { 280 return values[0].ToString() == "3" 281 }) 282 require.NoError(t, err) 283 284 // Now kill the primary 285 // VTGR should move mysql primary to a different node and change failover primary tablet 286 err = primaryTablet.VttabletProcess.TearDown() 287 require.NoError(t, err) 288 err = verifyPrimaryChange(t, clusterInstance, keyspace.Name, shard0.Name, primaryAlias) 289 require.NoError(t, err) 290 // now the primary has changed 291 primaryAlias, err = getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) 292 require.NoError(t, err) 293 // verify on the _new_ primary node, we are running the mysql primary as well 294 primaryTablet = findTabletByAlias(shard0.Vttablets, primaryAlias) 295 require.NotNil(t, primaryTablet) 296 query = `SELECT count(*) FROM 297 performance_schema.replication_group_members 298 WHERE MEMBER_STATE='ONLINE' AND MEMBER_ROLE='PRIMARY' AND MEMBER_PORT=@@port` 299 err = getSQLResult(t, primaryTablet, query, func(values []sqltypes.Value) bool { 300 return values[0].ToString() == "1" 301 }) 302 require.NoError(t, err) 303 } 304 305 func getTablet(t *testing.T, cluster *cluster.LocalProcessCluster, alias string) *topodatapb.Tablet { 306 result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", alias) 307 require.NoError(t, err) 308 var tabletInfo *topodatapb.Tablet 309 err = json2.Unmarshal([]byte(result), &tabletInfo) 310 require.NoError(t, err) 311 return tabletInfo 312 } 313 314 func findTabletByAlias(tablets []*cluster.Vttablet, alias *topodatapb.TabletAlias) *cluster.Vttablet { 315 for _, tablet := range tablets { 316 if tablet.Cell == alias.Cell && strings.HasSuffix(tablet.Alias, strconv.Itoa(int(alias.Uid))) { 317 return tablet 318 } 319 } 320 return nil 321 } 322 323 func verifyPrimaryChange(t *testing.T, cluster *cluster.LocalProcessCluster, ks, shard string, old *topodatapb.TabletAlias) error { 324 timeToWait := time.Now().Add(180 * time.Second) 325 for time.Now().Before(timeToWait) { 326 time.Sleep(1 * time.Second) 327 result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", fmt.Sprintf("%s/%s", ks, shard)) 328 require.NoError(t, err) 329 var shardInfo topodatapb.Shard 330 err = json2.Unmarshal([]byte(result), &shardInfo) 331 require.NoError(t, err) 332 if shardInfo.PrimaryAlias.String() != old.String() { 333 return nil 334 } 335 } 336 return fmt.Errorf("fail to verify primary change") 337 } 338 339 func getPrimaryTablet(t *testing.T, cluster *cluster.LocalProcessCluster, ks, shard string) (*topodatapb.TabletAlias, error) { 340 timeToWait := time.Now().Add(180 * time.Second) 341 for time.Now().Before(timeToWait) { 342 time.Sleep(1 * time.Second) 343 result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", fmt.Sprintf("%s/%s", ks, shard)) 344 require.NoError(t, err) 345 var shardInfo topodatapb.Shard 346 err = json2.Unmarshal([]byte(result), &shardInfo) 347 require.NoError(t, err) 348 if shardInfo.PrimaryAlias != nil { 349 return shardInfo.PrimaryAlias, nil 350 } 351 } 352 return nil, fmt.Errorf("timeout looking for primary tablet") 353 } 354 355 func getSQLResult(t *testing.T, tablet *cluster.Vttablet, query string, check func([]sqltypes.Value) bool) error { 356 timeToWait := time.Now().Add(180 * time.Second) 357 for time.Now().Before(timeToWait) { 358 time.Sleep(1 * time.Second) 359 qr, err := tablet.VttabletProcess.QueryTabletWithDB(query, "") 360 require.NoError(t, err) 361 if len(qr.Rows) == 1 && check(qr.Rows[0]) { 362 return nil 363 } 364 } 365 return fmt.Errorf("timeout waiting for sql result") 366 }