vitess.io/vitess@v0.16.2/go/test/endtoend/tabletmanager/tablet_health_test.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package tabletmanager
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"net/http"
    24  	"sync"
    25  	"testing"
    26  	"time"
    27  
    28  	"github.com/stretchr/testify/assert"
    29  	"github.com/stretchr/testify/require"
    30  
    31  	"vitess.io/vitess/go/json2"
    32  	"vitess.io/vitess/go/mysql"
    33  	"vitess.io/vitess/go/test/endtoend/cluster"
    34  	"vitess.io/vitess/go/test/endtoend/utils"
    35  
    36  	querypb "vitess.io/vitess/go/vt/proto/query"
    37  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    38  )
    39  
    40  // TabletReshuffle test if a vttablet can be pointed at an existing mysql
    41  func TestTabletReshuffle(t *testing.T) {
    42  	defer cluster.PanicHandler(t)
    43  	ctx := context.Background()
    44  
    45  	conn, err := mysql.Connect(ctx, &primaryTabletParams)
    46  	require.NoError(t, err)
    47  	defer conn.Close()
    48  
    49  	replicaConn, err := mysql.Connect(ctx, &replicaTabletParams)
    50  	require.NoError(t, err)
    51  	defer replicaConn.Close()
    52  
    53  	// Sanity Check
    54  	utils.Exec(t, conn, "delete from t1")
    55  	utils.Exec(t, conn, "insert into t1(id, value) values(1,'a'), (2,'b')")
    56  	checkDataOnReplica(t, replicaConn, `[[VARCHAR("a")] [VARCHAR("b")]]`)
    57  
    58  	// Create new tablet
    59  	rTablet := clusterInstance.NewVttabletInstance("replica", 0, "")
    60  
    61  	// mycnf_server_id prevents vttablet from reading the mycnf
    62  	// Pointing to primaryTablet's socket file
    63  	// We have to disable active reparenting to prevent the tablet from trying to fix replication.
    64  	// We also have to disable replication reporting because we're pointed at the primary.
    65  	clusterInstance.VtTabletExtraArgs = []string{
    66  		"--lock_tables_timeout", "5s",
    67  		"--mycnf_server_id", fmt.Sprintf("%d", rTablet.TabletUID),
    68  		"--db_socket", fmt.Sprintf("%s/mysql.sock", primaryTablet.VttabletProcess.Directory),
    69  		"--disable_active_reparents",
    70  		"--enable_replication_reporter=false",
    71  	}
    72  	defer func() { clusterInstance.VtTabletExtraArgs = []string{} }()
    73  
    74  	// SupportsBackup=False prevents vttablet from trying to restore
    75  	// Start vttablet process
    76  	err = clusterInstance.StartVttablet(rTablet, "SERVING", false, cell, keyspaceName, hostname, shardName)
    77  	require.NoError(t, err)
    78  
    79  	sql := "select value from t1"
    80  	qr, err := clusterInstance.ExecOnTablet(ctx, rTablet, sql, nil, &querypb.ExecuteOptions{IncludedFields: querypb.ExecuteOptions_TYPE_ONLY})
    81  	require.NoError(t, err)
    82  
    83  	result, err := json.Marshal(qr)
    84  	require.NoError(t, err)
    85  	assertExcludeFields(t, string(result))
    86  
    87  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("Backup", rTablet.Alias)
    88  	assert.Error(t, err, "cannot perform backup without my.cnf")
    89  
    90  	killTablets(t, rTablet)
    91  }
    92  
    93  func TestHealthCheck(t *testing.T) {
    94  	// Add one replica that starts not initialized
    95  	defer cluster.PanicHandler(t)
    96  	ctx := context.Background()
    97  
    98  	rTablet := clusterInstance.NewVttabletInstance("replica", 0, "")
    99  
   100  	// Start Mysql Processes and return connection
   101  	replicaConn, err := cluster.StartMySQLAndGetConnection(ctx, rTablet, username, clusterInstance.TmpDirectory)
   102  	require.NoError(t, err)
   103  
   104  	defer replicaConn.Close()
   105  
   106  	// Create database in mysql
   107  	utils.Exec(t, replicaConn, fmt.Sprintf("create database vt_%s", keyspaceName))
   108  
   109  	// start vttablet process, should be in SERVING state as we already have a primary
   110  	err = clusterInstance.StartVttablet(rTablet, "SERVING", false, cell, keyspaceName, hostname, shardName)
   111  	require.NoError(t, err)
   112  
   113  	conn, err := mysql.Connect(ctx, &primaryTabletParams)
   114  	require.NoError(t, err)
   115  	defer conn.Close()
   116  
   117  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rTablet.Alias)
   118  	require.NoError(t, err)
   119  	checkHealth(t, rTablet.HTTPPort, false)
   120  
   121  	// Make sure the primary is still primary
   122  	checkTabletType(t, primaryTablet.Alias, "PRIMARY")
   123  	utils.Exec(t, conn, "stop slave")
   124  
   125  	// stop replication, make sure we don't go unhealthy.
   126  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", rTablet.Alias)
   127  	require.NoError(t, err)
   128  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rTablet.Alias)
   129  	require.NoError(t, err)
   130  
   131  	// make sure the health stream is updated
   132  	shrs, err := clusterInstance.StreamTabletHealth(ctx, rTablet, 1)
   133  	require.NoError(t, err)
   134  	for _, shr := range shrs {
   135  		verifyStreamHealth(t, shr, true)
   136  	}
   137  
   138  	// then restart replication, make sure we stay healthy
   139  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rTablet.Alias)
   140  	require.NoError(t, err)
   141  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rTablet.Alias)
   142  	require.NoError(t, err)
   143  	checkHealth(t, rTablet.HTTPPort, false)
   144  
   145  	// now test the health stream returns the right thing
   146  	shrs, err = clusterInstance.StreamTabletHealth(ctx, rTablet, 2)
   147  	require.NoError(t, err)
   148  	for _, shr := range shrs {
   149  		verifyStreamHealth(t, shr, true)
   150  	}
   151  
   152  	// stop the replica's source mysqld instance to break replication
   153  	// and test that the replica tablet becomes unhealthy and non-serving after crossing
   154  	// the tablet's --unhealthy_threshold and the gateway's --discovery_low_replication_lag
   155  	err = primaryTablet.MysqlctlProcess.Stop()
   156  	require.NoError(t, err)
   157  
   158  	time.Sleep(tabletUnhealthyThreshold + tabletHealthcheckRefreshInterval)
   159  
   160  	// now the replica's health stream should show it as unhealthy
   161  	shrs, err = clusterInstance.StreamTabletHealth(ctx, rTablet, 1)
   162  	require.NoError(t, err)
   163  	for _, shr := range shrs {
   164  		verifyStreamHealth(t, shr, false)
   165  	}
   166  
   167  	// start the primary tablet's mysqld back up
   168  	primaryTablet.MysqlctlProcess.InitMysql = false
   169  	err = primaryTablet.MysqlctlProcess.Start()
   170  	primaryTablet.MysqlctlProcess.InitMysql = true
   171  	require.NoError(t, err)
   172  
   173  	// On a MySQL restart, it comes up as a read-only tablet (check default.cnf file).
   174  	// We have to explicitly set it to read-write otherwise heartbeat writer is unable
   175  	// to write the heartbeats
   176  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("SetReadWrite", primaryTablet.Alias)
   177  	require.NoError(t, err)
   178  
   179  	// explicitly start replication on all of the replicas to avoid any test flakiness as they were all
   180  	// replicating from the primary instance
   181  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rTablet.Alias)
   182  	require.NoError(t, err)
   183  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", replicaTablet.Alias)
   184  	require.NoError(t, err)
   185  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rdonlyTablet.Alias)
   186  	require.NoError(t, err)
   187  
   188  	time.Sleep(tabletHealthcheckRefreshInterval)
   189  
   190  	// now the replica's health stream should show it as healthy again
   191  	shrs, err = clusterInstance.StreamTabletHealth(ctx, rTablet, 1)
   192  	require.NoError(t, err)
   193  	for _, shr := range shrs {
   194  		verifyStreamHealth(t, shr, true)
   195  	}
   196  
   197  	// Manual cleanup of processes
   198  	killTablets(t, rTablet)
   199  }
   200  
   201  func checkHealth(t *testing.T, port int, shouldError bool) {
   202  	url := fmt.Sprintf("http://localhost:%d/healthz", port)
   203  	resp, err := http.Get(url)
   204  	require.NoError(t, err)
   205  	defer resp.Body.Close()
   206  	if shouldError {
   207  		assert.True(t, resp.StatusCode > 400)
   208  	} else {
   209  		assert.Equal(t, 200, resp.StatusCode)
   210  	}
   211  }
   212  
   213  func checkTabletType(t *testing.T, tabletAlias string, typeWant string) {
   214  	result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tabletAlias)
   215  	require.NoError(t, err)
   216  
   217  	var tablet topodatapb.Tablet
   218  	err = json2.Unmarshal([]byte(result), &tablet)
   219  	require.NoError(t, err)
   220  
   221  	actualType := tablet.GetType()
   222  	got := fmt.Sprintf("%d", actualType)
   223  
   224  	tabletType := topodatapb.TabletType_value[typeWant]
   225  	want := fmt.Sprintf("%d", tabletType)
   226  
   227  	assert.Equal(t, want, got)
   228  }
   229  
   230  func verifyStreamHealth(t *testing.T, streamHealthResponse *querypb.StreamHealthResponse, expectHealthy bool) {
   231  	serving := streamHealthResponse.GetServing()
   232  	UID := streamHealthResponse.GetTabletAlias().GetUid()
   233  	realTimeStats := streamHealthResponse.GetRealtimeStats()
   234  	replicationLagSeconds := realTimeStats.GetReplicationLagSeconds()
   235  	assert.True(t, UID > 0, "Tablet should contain uid")
   236  	if expectHealthy {
   237  		assert.True(t, serving, "Tablet should be in serving state")
   238  		// replicationLagSeconds varies till 7200 so setting safe limit
   239  		assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary")
   240  	} else {
   241  		assert.True(t, (!serving || replicationLagSeconds >= uint32(tabletUnhealthyThreshold.Seconds())), "Tablet should not be in serving and healthy state")
   242  	}
   243  }
   244  
   245  func TestHealthCheckDrainedStateDoesNotShutdownQueryService(t *testing.T) {
   246  	// This test is similar to test_health_check, but has the following differences:
   247  	// - the second tablet is an 'rdonly' and not a 'replica'
   248  	// - the second tablet will be set to 'drained' and we expect that
   249  	// - the query service won't be shutdown
   250  
   251  	//Wait if tablet is not in service state
   252  	defer cluster.PanicHandler(t)
   253  	err := rdonlyTablet.VttabletProcess.WaitForTabletStatus("SERVING")
   254  	require.NoError(t, err)
   255  
   256  	// Check tablet health
   257  	checkHealth(t, rdonlyTablet.HTTPPort, false)
   258  	assert.Equal(t, "SERVING", rdonlyTablet.VttabletProcess.GetTabletStatus())
   259  
   260  	// Change from rdonly to drained and stop replication. The tablet will stay
   261  	// healthy, and the query service is still running.
   262  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonlyTablet.Alias, "drained")
   263  	require.NoError(t, err)
   264  	// Trying to drain the same tablet again, should error
   265  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonlyTablet.Alias, "drained")
   266  	assert.Error(t, err, "already drained")
   267  
   268  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", rdonlyTablet.Alias)
   269  	require.NoError(t, err)
   270  	// Trigger healthcheck explicitly to avoid waiting for the next interval.
   271  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rdonlyTablet.Alias)
   272  	require.NoError(t, err)
   273  
   274  	checkTabletType(t, rdonlyTablet.Alias, "DRAINED")
   275  
   276  	// Query service is still running.
   277  	err = rdonlyTablet.VttabletProcess.WaitForTabletStatus("SERVING")
   278  	require.NoError(t, err)
   279  
   280  	// Restart replication. Tablet will become healthy again.
   281  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonlyTablet.Alias, "rdonly")
   282  	require.NoError(t, err)
   283  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rdonlyTablet.Alias)
   284  	require.NoError(t, err)
   285  	err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rdonlyTablet.Alias)
   286  	require.NoError(t, err)
   287  	checkHealth(t, rdonlyTablet.HTTPPort, false)
   288  }
   289  
   290  func killTablets(t *testing.T, tablets ...*cluster.Vttablet) {
   291  	var wg sync.WaitGroup
   292  	for _, tablet := range tablets {
   293  		wg.Add(1)
   294  		go func(tablet *cluster.Vttablet) {
   295  			defer wg.Done()
   296  			_ = tablet.VttabletProcess.TearDown()
   297  			_ = tablet.MysqlctlProcess.Stop()
   298  		}(tablet)
   299  	}
   300  	wg.Wait()
   301  }