vitess.io/vitess@v0.16.2/go/test/endtoend/tabletmanager/tablet_health_test.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package tabletmanager 18 19 import ( 20 "context" 21 "encoding/json" 22 "fmt" 23 "net/http" 24 "sync" 25 "testing" 26 "time" 27 28 "github.com/stretchr/testify/assert" 29 "github.com/stretchr/testify/require" 30 31 "vitess.io/vitess/go/json2" 32 "vitess.io/vitess/go/mysql" 33 "vitess.io/vitess/go/test/endtoend/cluster" 34 "vitess.io/vitess/go/test/endtoend/utils" 35 36 querypb "vitess.io/vitess/go/vt/proto/query" 37 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 38 ) 39 40 // TabletReshuffle test if a vttablet can be pointed at an existing mysql 41 func TestTabletReshuffle(t *testing.T) { 42 defer cluster.PanicHandler(t) 43 ctx := context.Background() 44 45 conn, err := mysql.Connect(ctx, &primaryTabletParams) 46 require.NoError(t, err) 47 defer conn.Close() 48 49 replicaConn, err := mysql.Connect(ctx, &replicaTabletParams) 50 require.NoError(t, err) 51 defer replicaConn.Close() 52 53 // Sanity Check 54 utils.Exec(t, conn, "delete from t1") 55 utils.Exec(t, conn, "insert into t1(id, value) values(1,'a'), (2,'b')") 56 checkDataOnReplica(t, replicaConn, `[[VARCHAR("a")] [VARCHAR("b")]]`) 57 58 // Create new tablet 59 rTablet := clusterInstance.NewVttabletInstance("replica", 0, "") 60 61 // mycnf_server_id prevents vttablet from reading the mycnf 62 // Pointing to primaryTablet's socket file 63 // We have to disable active reparenting to prevent the tablet from trying to fix replication. 64 // We also have to disable replication reporting because we're pointed at the primary. 65 clusterInstance.VtTabletExtraArgs = []string{ 66 "--lock_tables_timeout", "5s", 67 "--mycnf_server_id", fmt.Sprintf("%d", rTablet.TabletUID), 68 "--db_socket", fmt.Sprintf("%s/mysql.sock", primaryTablet.VttabletProcess.Directory), 69 "--disable_active_reparents", 70 "--enable_replication_reporter=false", 71 } 72 defer func() { clusterInstance.VtTabletExtraArgs = []string{} }() 73 74 // SupportsBackup=False prevents vttablet from trying to restore 75 // Start vttablet process 76 err = clusterInstance.StartVttablet(rTablet, "SERVING", false, cell, keyspaceName, hostname, shardName) 77 require.NoError(t, err) 78 79 sql := "select value from t1" 80 qr, err := clusterInstance.ExecOnTablet(ctx, rTablet, sql, nil, &querypb.ExecuteOptions{IncludedFields: querypb.ExecuteOptions_TYPE_ONLY}) 81 require.NoError(t, err) 82 83 result, err := json.Marshal(qr) 84 require.NoError(t, err) 85 assertExcludeFields(t, string(result)) 86 87 err = clusterInstance.VtctlclientProcess.ExecuteCommand("Backup", rTablet.Alias) 88 assert.Error(t, err, "cannot perform backup without my.cnf") 89 90 killTablets(t, rTablet) 91 } 92 93 func TestHealthCheck(t *testing.T) { 94 // Add one replica that starts not initialized 95 defer cluster.PanicHandler(t) 96 ctx := context.Background() 97 98 rTablet := clusterInstance.NewVttabletInstance("replica", 0, "") 99 100 // Start Mysql Processes and return connection 101 replicaConn, err := cluster.StartMySQLAndGetConnection(ctx, rTablet, username, clusterInstance.TmpDirectory) 102 require.NoError(t, err) 103 104 defer replicaConn.Close() 105 106 // Create database in mysql 107 utils.Exec(t, replicaConn, fmt.Sprintf("create database vt_%s", keyspaceName)) 108 109 // start vttablet process, should be in SERVING state as we already have a primary 110 err = clusterInstance.StartVttablet(rTablet, "SERVING", false, cell, keyspaceName, hostname, shardName) 111 require.NoError(t, err) 112 113 conn, err := mysql.Connect(ctx, &primaryTabletParams) 114 require.NoError(t, err) 115 defer conn.Close() 116 117 err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rTablet.Alias) 118 require.NoError(t, err) 119 checkHealth(t, rTablet.HTTPPort, false) 120 121 // Make sure the primary is still primary 122 checkTabletType(t, primaryTablet.Alias, "PRIMARY") 123 utils.Exec(t, conn, "stop slave") 124 125 // stop replication, make sure we don't go unhealthy. 126 err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", rTablet.Alias) 127 require.NoError(t, err) 128 err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rTablet.Alias) 129 require.NoError(t, err) 130 131 // make sure the health stream is updated 132 shrs, err := clusterInstance.StreamTabletHealth(ctx, rTablet, 1) 133 require.NoError(t, err) 134 for _, shr := range shrs { 135 verifyStreamHealth(t, shr, true) 136 } 137 138 // then restart replication, make sure we stay healthy 139 err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rTablet.Alias) 140 require.NoError(t, err) 141 err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rTablet.Alias) 142 require.NoError(t, err) 143 checkHealth(t, rTablet.HTTPPort, false) 144 145 // now test the health stream returns the right thing 146 shrs, err = clusterInstance.StreamTabletHealth(ctx, rTablet, 2) 147 require.NoError(t, err) 148 for _, shr := range shrs { 149 verifyStreamHealth(t, shr, true) 150 } 151 152 // stop the replica's source mysqld instance to break replication 153 // and test that the replica tablet becomes unhealthy and non-serving after crossing 154 // the tablet's --unhealthy_threshold and the gateway's --discovery_low_replication_lag 155 err = primaryTablet.MysqlctlProcess.Stop() 156 require.NoError(t, err) 157 158 time.Sleep(tabletUnhealthyThreshold + tabletHealthcheckRefreshInterval) 159 160 // now the replica's health stream should show it as unhealthy 161 shrs, err = clusterInstance.StreamTabletHealth(ctx, rTablet, 1) 162 require.NoError(t, err) 163 for _, shr := range shrs { 164 verifyStreamHealth(t, shr, false) 165 } 166 167 // start the primary tablet's mysqld back up 168 primaryTablet.MysqlctlProcess.InitMysql = false 169 err = primaryTablet.MysqlctlProcess.Start() 170 primaryTablet.MysqlctlProcess.InitMysql = true 171 require.NoError(t, err) 172 173 // On a MySQL restart, it comes up as a read-only tablet (check default.cnf file). 174 // We have to explicitly set it to read-write otherwise heartbeat writer is unable 175 // to write the heartbeats 176 err = clusterInstance.VtctlclientProcess.ExecuteCommand("SetReadWrite", primaryTablet.Alias) 177 require.NoError(t, err) 178 179 // explicitly start replication on all of the replicas to avoid any test flakiness as they were all 180 // replicating from the primary instance 181 err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rTablet.Alias) 182 require.NoError(t, err) 183 err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", replicaTablet.Alias) 184 require.NoError(t, err) 185 err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rdonlyTablet.Alias) 186 require.NoError(t, err) 187 188 time.Sleep(tabletHealthcheckRefreshInterval) 189 190 // now the replica's health stream should show it as healthy again 191 shrs, err = clusterInstance.StreamTabletHealth(ctx, rTablet, 1) 192 require.NoError(t, err) 193 for _, shr := range shrs { 194 verifyStreamHealth(t, shr, true) 195 } 196 197 // Manual cleanup of processes 198 killTablets(t, rTablet) 199 } 200 201 func checkHealth(t *testing.T, port int, shouldError bool) { 202 url := fmt.Sprintf("http://localhost:%d/healthz", port) 203 resp, err := http.Get(url) 204 require.NoError(t, err) 205 defer resp.Body.Close() 206 if shouldError { 207 assert.True(t, resp.StatusCode > 400) 208 } else { 209 assert.Equal(t, 200, resp.StatusCode) 210 } 211 } 212 213 func checkTabletType(t *testing.T, tabletAlias string, typeWant string) { 214 result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tabletAlias) 215 require.NoError(t, err) 216 217 var tablet topodatapb.Tablet 218 err = json2.Unmarshal([]byte(result), &tablet) 219 require.NoError(t, err) 220 221 actualType := tablet.GetType() 222 got := fmt.Sprintf("%d", actualType) 223 224 tabletType := topodatapb.TabletType_value[typeWant] 225 want := fmt.Sprintf("%d", tabletType) 226 227 assert.Equal(t, want, got) 228 } 229 230 func verifyStreamHealth(t *testing.T, streamHealthResponse *querypb.StreamHealthResponse, expectHealthy bool) { 231 serving := streamHealthResponse.GetServing() 232 UID := streamHealthResponse.GetTabletAlias().GetUid() 233 realTimeStats := streamHealthResponse.GetRealtimeStats() 234 replicationLagSeconds := realTimeStats.GetReplicationLagSeconds() 235 assert.True(t, UID > 0, "Tablet should contain uid") 236 if expectHealthy { 237 assert.True(t, serving, "Tablet should be in serving state") 238 // replicationLagSeconds varies till 7200 so setting safe limit 239 assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary") 240 } else { 241 assert.True(t, (!serving || replicationLagSeconds >= uint32(tabletUnhealthyThreshold.Seconds())), "Tablet should not be in serving and healthy state") 242 } 243 } 244 245 func TestHealthCheckDrainedStateDoesNotShutdownQueryService(t *testing.T) { 246 // This test is similar to test_health_check, but has the following differences: 247 // - the second tablet is an 'rdonly' and not a 'replica' 248 // - the second tablet will be set to 'drained' and we expect that 249 // - the query service won't be shutdown 250 251 //Wait if tablet is not in service state 252 defer cluster.PanicHandler(t) 253 err := rdonlyTablet.VttabletProcess.WaitForTabletStatus("SERVING") 254 require.NoError(t, err) 255 256 // Check tablet health 257 checkHealth(t, rdonlyTablet.HTTPPort, false) 258 assert.Equal(t, "SERVING", rdonlyTablet.VttabletProcess.GetTabletStatus()) 259 260 // Change from rdonly to drained and stop replication. The tablet will stay 261 // healthy, and the query service is still running. 262 err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonlyTablet.Alias, "drained") 263 require.NoError(t, err) 264 // Trying to drain the same tablet again, should error 265 err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonlyTablet.Alias, "drained") 266 assert.Error(t, err, "already drained") 267 268 err = clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", rdonlyTablet.Alias) 269 require.NoError(t, err) 270 // Trigger healthcheck explicitly to avoid waiting for the next interval. 271 err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rdonlyTablet.Alias) 272 require.NoError(t, err) 273 274 checkTabletType(t, rdonlyTablet.Alias, "DRAINED") 275 276 // Query service is still running. 277 err = rdonlyTablet.VttabletProcess.WaitForTabletStatus("SERVING") 278 require.NoError(t, err) 279 280 // Restart replication. Tablet will become healthy again. 281 err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonlyTablet.Alias, "rdonly") 282 require.NoError(t, err) 283 err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rdonlyTablet.Alias) 284 require.NoError(t, err) 285 err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", rdonlyTablet.Alias) 286 require.NoError(t, err) 287 checkHealth(t, rdonlyTablet.HTTPPort, false) 288 } 289 290 func killTablets(t *testing.T, tablets ...*cluster.Vttablet) { 291 var wg sync.WaitGroup 292 for _, tablet := range tablets { 293 wg.Add(1) 294 go func(tablet *cluster.Vttablet) { 295 defer wg.Done() 296 _ = tablet.VttabletProcess.TearDown() 297 _ = tablet.MysqlctlProcess.Stop() 298 }(tablet) 299 } 300 wg.Wait() 301 }