vitess.io/vitess@v0.16.2/go/test/endtoend/vtgr/vtgr_test.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package vtgr
    18  
    19  import (
    20  	"fmt"
    21  	"os"
    22  	"os/exec"
    23  	"path"
    24  	"strconv"
    25  	"strings"
    26  	"testing"
    27  	"time"
    28  
    29  	"vitess.io/vitess/go/sqltypes"
    30  
    31  	"github.com/stretchr/testify/require"
    32  	"gotest.tools/assert"
    33  
    34  	"vitess.io/vitess/go/json2"
    35  	"vitess.io/vitess/go/test/endtoend/cluster"
    36  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    37  )
    38  
    39  // To run this test locally on MacOS, set hostname to localhost first:
    40  // $ sudo scutil --set HostName localhost
    41  
    42  func createCluster(t *testing.T, numReplicas int) *cluster.LocalProcessCluster {
    43  	keyspaceName := "ks"
    44  	shardName := "0"
    45  	keyspace := &cluster.Keyspace{Name: keyspaceName}
    46  	shard0 := &cluster.Shard{Name: shardName}
    47  	hostname := "localhost"
    48  	cell1 := "zone1"
    49  	tablets := []*cluster.Vttablet{}
    50  	clusterInstance := cluster.NewCluster(cell1, hostname)
    51  
    52  	os.Setenv("EXTRA_MY_CNF", path.Join(os.Getenv("PWD"), "my.cnf"))
    53  
    54  	// Start topo server
    55  	err := clusterInstance.StartTopo()
    56  	require.NoError(t, err)
    57  
    58  	uidBase := 100
    59  	for i := 0; i < numReplicas; i++ {
    60  		tablet := clusterInstance.NewVttabletInstance("replica", uidBase+i, cell1)
    61  		tablets = append(tablets, tablet)
    62  	}
    63  
    64  	// Initialize Cluster
    65  	shard0.Vttablets = tablets
    66  	err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard0})
    67  	require.NoError(t, err)
    68  
    69  	// Start MySql
    70  	var mysqlCtlProcessList []*exec.Cmd
    71  	for _, tablet := range shard0.Vttablets {
    72  		proc, err := tablet.MysqlctlProcess.StartProcess()
    73  		require.NoError(t, err)
    74  		mysqlCtlProcessList = append(mysqlCtlProcessList, proc)
    75  	}
    76  
    77  	// Wait for mysql processes to start
    78  	for _, proc := range mysqlCtlProcessList {
    79  		err := proc.Wait()
    80  		require.NoError(t, err)
    81  	}
    82  	for _, tablet := range shard0.Vttablets {
    83  		// Reset status, don't wait for the tablet status. We will check it later
    84  		tablet.VttabletProcess.ServingStatus = ""
    85  		tablet.VttabletProcess.DbFlavor = "MysqlGR"
    86  		// If we enable backup the GR setup is a bit wacky
    87  		tablet.VttabletProcess.SupportsBackup = false
    88  		// Start the tablet
    89  		err := tablet.VttabletProcess.Setup()
    90  		require.NoError(t, err)
    91  	}
    92  
    93  	// Start vtgr - we deploy vtgr on the tablet node in the test
    94  	baseGrPort := 33061
    95  	for i, tablet := range shard0.Vttablets {
    96  		tablet.VtgrProcess = clusterInstance.NewVtgrProcess(
    97  			[]string{fmt.Sprintf("%s/%s", keyspaceName, shardName)},
    98  			path.Join(os.Getenv("PWD"), "test_config.json"),
    99  			baseGrPort+i,
   100  		)
   101  	}
   102  
   103  	for _, tablet := range shard0.Vttablets {
   104  		err := tablet.VttabletProcess.WaitForTabletTypes([]string{"NOT_SERVING"})
   105  		require.NoError(t, err)
   106  	}
   107  	return clusterInstance
   108  }
   109  
   110  func killTablets(t *testing.T, shard *cluster.Shard) {
   111  	for _, tablet := range shard.Vttablets {
   112  		if tablet.VtgrProcess != nil {
   113  			err := tablet.VtgrProcess.TearDown()
   114  			require.NoError(t, err)
   115  		}
   116  		err := tablet.VttabletProcess.TearDown()
   117  		require.NoError(t, err)
   118  	}
   119  }
   120  
   121  func TestBasicSetup(t *testing.T) {
   122  	defer cluster.PanicHandler(t)
   123  	clusterInstance := createCluster(t, 2)
   124  	keyspace := &clusterInstance.Keyspaces[0]
   125  	shard0 := &keyspace.Shards[0]
   126  	defer func() {
   127  		clusterInstance.Teardown()
   128  		killTablets(t, shard0)
   129  	}()
   130  	for _, tablet := range shard0.Vttablets {
   131  		// Until there is a primary, all tablets are replica and should all be NOT_SERVING status
   132  		tab := getTablet(t, clusterInstance, tablet.Alias)
   133  		assert.Equal(t, tab.Type.String(), "REPLICA")
   134  		assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING")
   135  	}
   136  	_, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name)
   137  	assert.ErrorContains(t, err, "timeout looking for primary tablet")
   138  
   139  	tablet1 := shard0.Vttablets[0]
   140  	query := `select count(*)
   141  		from performance_schema.replication_group_members
   142  		where MEMBER_STATE='ONLINE'`
   143  	var count int
   144  	err = getSQLResult(t, tablet1, query, func(values []sqltypes.Value) bool {
   145  		cnt, err := values[0].ToInt64()
   146  		if err != nil {
   147  			return false
   148  		}
   149  		count = int(cnt)
   150  		return true
   151  	})
   152  	require.NoError(t, err)
   153  	require.NoError(t, err)
   154  	// without vtgr, tablet process will not create a mysql group
   155  	// and all the nodes are replicas type in NOT_SERVING state
   156  	assert.Equal(t, 0, int(count))
   157  }
   158  
   159  func TestVTGRSetup(t *testing.T) {
   160  	defer cluster.PanicHandler(t)
   161  	clusterInstance := createCluster(t, 2)
   162  	keyspace := &clusterInstance.Keyspaces[0]
   163  	shard0 := &keyspace.Shards[0]
   164  	defer func() {
   165  		clusterInstance.Teardown()
   166  		killTablets(t, shard0)
   167  	}()
   168  	for _, tablet := range shard0.Vttablets {
   169  		// Until there is a primary, all tablets are replica and should all be NOT_SERVING status
   170  		tab := getTablet(t, clusterInstance, tablet.Alias)
   171  		assert.Equal(t, tab.Type.String(), "REPLICA")
   172  		assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING")
   173  	}
   174  
   175  	// start VTGR processes
   176  	for _, tablet := range shard0.Vttablets {
   177  		err := tablet.VtgrProcess.Start(tablet.Alias)
   178  		require.NoError(t, err)
   179  	}
   180  
   181  	// VTGR will pick one tablet as the primary
   182  	primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name)
   183  	require.NoError(t, err)
   184  	require.NotEqual(t, nil, primaryAlias)
   185  
   186  	tablet1 := shard0.Vttablets[0]
   187  	query := `select count(*) 
   188  		from performance_schema.replication_group_members 
   189  		where MEMBER_STATE='ONLINE'`
   190  	err = getSQLResult(t, tablet1, query, func(values []sqltypes.Value) bool {
   191  		cnt, err := values[0].ToInt64()
   192  		if err != nil {
   193  			return false
   194  		}
   195  		// VTGR should bootstrap the group and put the replica into the group
   196  		return cnt == 2
   197  	})
   198  	require.NoError(t, err)
   199  }
   200  
   201  func TestVTGRWrongPrimaryTablet(t *testing.T) {
   202  	defer cluster.PanicHandler(t)
   203  	clusterInstance := createCluster(t, 2)
   204  	keyspace := &clusterInstance.Keyspaces[0]
   205  	shard0 := &keyspace.Shards[0]
   206  	defer func() {
   207  		clusterInstance.Teardown()
   208  		killTablets(t, shard0)
   209  	}()
   210  	for _, tablet := range shard0.Vttablets {
   211  		// Until there is a primary, all tablets are replica and should all be NOT_SERVING status
   212  		tab := getTablet(t, clusterInstance, tablet.Alias)
   213  		assert.Equal(t, tab.Type.String(), "REPLICA")
   214  		assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING")
   215  	}
   216  	// start VTGR processes
   217  	for _, tablet := range shard0.Vttablets {
   218  		err := tablet.VtgrProcess.Start(tablet.Alias)
   219  		require.NoError(t, err)
   220  	}
   221  	// VTGR will pick one tablet as the primary
   222  	primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name)
   223  	require.NoError(t, err)
   224  	require.NotEqual(t, nil, primaryAlias)
   225  	tablet := shard0.Vttablets[0]
   226  	query := `select member_id
   227  		from performance_schema.replication_group_members
   228  		where member_role='SECONDARY' and member_state='ONLINE'`
   229  	var member string
   230  	err = getSQLResult(t, tablet, query, func(values []sqltypes.Value) bool {
   231  		member = values[0].ToString()
   232  		return true
   233  	})
   234  	require.NoError(t, err)
   235  	query = fmt.Sprintf(`select group_replication_set_as_primary('%s')`, member)
   236  	_, err = tablet.VttabletProcess.QueryTabletWithDB(query, "")
   237  	require.NoError(t, err)
   238  
   239  	// Verify the mysql primary changed, and also the primary tablet changed as well
   240  	query = fmt.Sprintf(`select member_role from performance_schema.replication_group_members where member_id='%s'`, member)
   241  	err = getSQLResult(t, tablet, query, func(values []sqltypes.Value) bool {
   242  		return values[0].ToString() == "PRIMARY"
   243  	})
   244  	require.NoError(t, err)
   245  	err = verifyPrimaryChange(t, clusterInstance, keyspace.Name, shard0.Name, primaryAlias)
   246  	require.NoError(t, err)
   247  }
   248  
   249  func TestVTGRFailover(t *testing.T) {
   250  	defer cluster.PanicHandler(t)
   251  	clusterInstance := createCluster(t, 3)
   252  	keyspace := &clusterInstance.Keyspaces[0]
   253  	shard0 := &keyspace.Shards[0]
   254  	defer func() {
   255  		clusterInstance.Teardown()
   256  		killTablets(t, shard0)
   257  	}()
   258  	for _, tablet := range shard0.Vttablets {
   259  		// Until there is a primary, all tablets are replica and should all be NOT_SERVING status
   260  		tab := getTablet(t, clusterInstance, tablet.Alias)
   261  		assert.Equal(t, tab.Type.String(), "REPLICA")
   262  		assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING")
   263  	}
   264  	// start VTGR processes
   265  	for _, tablet := range shard0.Vttablets {
   266  		err := tablet.VtgrProcess.Start(tablet.Alias)
   267  		require.NoError(t, err)
   268  	}
   269  	primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name)
   270  	require.NoError(t, err)
   271  	// VTGR has init the cluster
   272  	require.NotEqual(t, "", primaryAlias)
   273  	primaryTablet := findTabletByAlias(shard0.Vttablets, primaryAlias)
   274  	require.NotNil(t, primaryTablet)
   275  	// Wait until there are two nodes in the group
   276  	query := `select count(*) from
   277  		performance_schema.replication_group_members
   278  		where MEMBER_STATE='ONLINE'`
   279  	err = getSQLResult(t, primaryTablet, query, func(values []sqltypes.Value) bool {
   280  		return values[0].ToString() == "3"
   281  	})
   282  	require.NoError(t, err)
   283  
   284  	// Now kill the primary
   285  	// VTGR should move mysql primary to a different node and change failover primary tablet
   286  	err = primaryTablet.VttabletProcess.TearDown()
   287  	require.NoError(t, err)
   288  	err = verifyPrimaryChange(t, clusterInstance, keyspace.Name, shard0.Name, primaryAlias)
   289  	require.NoError(t, err)
   290  	// now the primary has changed
   291  	primaryAlias, err = getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name)
   292  	require.NoError(t, err)
   293  	// verify on the _new_ primary node, we are running the mysql primary as well
   294  	primaryTablet = findTabletByAlias(shard0.Vttablets, primaryAlias)
   295  	require.NotNil(t, primaryTablet)
   296  	query = `SELECT count(*) FROM
   297  		performance_schema.replication_group_members
   298  		WHERE MEMBER_STATE='ONLINE' AND MEMBER_ROLE='PRIMARY' AND MEMBER_PORT=@@port`
   299  	err = getSQLResult(t, primaryTablet, query, func(values []sqltypes.Value) bool {
   300  		return values[0].ToString() == "1"
   301  	})
   302  	require.NoError(t, err)
   303  }
   304  
   305  func getTablet(t *testing.T, cluster *cluster.LocalProcessCluster, alias string) *topodatapb.Tablet {
   306  	result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", alias)
   307  	require.NoError(t, err)
   308  	var tabletInfo *topodatapb.Tablet
   309  	err = json2.Unmarshal([]byte(result), &tabletInfo)
   310  	require.NoError(t, err)
   311  	return tabletInfo
   312  }
   313  
   314  func findTabletByAlias(tablets []*cluster.Vttablet, alias *topodatapb.TabletAlias) *cluster.Vttablet {
   315  	for _, tablet := range tablets {
   316  		if tablet.Cell == alias.Cell && strings.HasSuffix(tablet.Alias, strconv.Itoa(int(alias.Uid))) {
   317  			return tablet
   318  		}
   319  	}
   320  	return nil
   321  }
   322  
   323  func verifyPrimaryChange(t *testing.T, cluster *cluster.LocalProcessCluster, ks, shard string, old *topodatapb.TabletAlias) error {
   324  	timeToWait := time.Now().Add(180 * time.Second)
   325  	for time.Now().Before(timeToWait) {
   326  		time.Sleep(1 * time.Second)
   327  		result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", fmt.Sprintf("%s/%s", ks, shard))
   328  		require.NoError(t, err)
   329  		var shardInfo topodatapb.Shard
   330  		err = json2.Unmarshal([]byte(result), &shardInfo)
   331  		require.NoError(t, err)
   332  		if shardInfo.PrimaryAlias.String() != old.String() {
   333  			return nil
   334  		}
   335  	}
   336  	return fmt.Errorf("fail to verify primary change")
   337  }
   338  
   339  func getPrimaryTablet(t *testing.T, cluster *cluster.LocalProcessCluster, ks, shard string) (*topodatapb.TabletAlias, error) {
   340  	timeToWait := time.Now().Add(180 * time.Second)
   341  	for time.Now().Before(timeToWait) {
   342  		time.Sleep(1 * time.Second)
   343  		result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", fmt.Sprintf("%s/%s", ks, shard))
   344  		require.NoError(t, err)
   345  		var shardInfo topodatapb.Shard
   346  		err = json2.Unmarshal([]byte(result), &shardInfo)
   347  		require.NoError(t, err)
   348  		if shardInfo.PrimaryAlias != nil {
   349  			return shardInfo.PrimaryAlias, nil
   350  		}
   351  	}
   352  	return nil, fmt.Errorf("timeout looking for primary tablet")
   353  }
   354  
   355  func getSQLResult(t *testing.T, tablet *cluster.Vttablet, query string, check func([]sqltypes.Value) bool) error {
   356  	timeToWait := time.Now().Add(180 * time.Second)
   357  	for time.Now().Before(timeToWait) {
   358  		time.Sleep(1 * time.Second)
   359  		qr, err := tablet.VttabletProcess.QueryTabletWithDB(query, "")
   360  		require.NoError(t, err)
   361  		if len(qr.Rows) == 1 && check(qr.Rows[0]) {
   362  			return nil
   363  		}
   364  	}
   365  	return fmt.Errorf("timeout waiting for sql result")
   366  }