vitess.io/vitess@v0.16.2/go/test/endtoend/tabletmanager/throttler_topo/throttler_test.go (about) 1 /* 2 Copyright 2022 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 package throttler 17 18 import ( 19 "context" 20 "flag" 21 "fmt" 22 "io" 23 "net/http" 24 "os" 25 "sync" 26 "testing" 27 "time" 28 29 "vitess.io/vitess/go/mysql" 30 "vitess.io/vitess/go/sqltypes" 31 "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base" 32 33 "vitess.io/vitess/go/test/endtoend/cluster" 34 "vitess.io/vitess/go/test/endtoend/throttler" 35 36 "github.com/stretchr/testify/assert" 37 "github.com/stretchr/testify/require" 38 ) 39 40 const ( 41 customQuery = "show global status like 'threads_running'" 42 customThreshold = 5 * time.Second 43 unreasonablyLowThreshold = 1 * time.Millisecond 44 extremelyHighThreshold = 1 * time.Hour 45 onDemandHeartbeatDuration = 5 * time.Second 46 throttlerEnabledTimeout = 60 * time.Second 47 useDefaultQuery = "" 48 ) 49 50 var ( 51 clusterInstance *cluster.LocalProcessCluster 52 primaryTablet *cluster.Vttablet 53 replicaTablet *cluster.Vttablet 54 vtParams mysql.ConnParams 55 hostname = "localhost" 56 keyspaceName = "ks" 57 cell = "zone1" 58 sqlSchema = ` 59 create table t1( 60 id bigint, 61 value varchar(16), 62 primary key(id) 63 ) Engine=InnoDB; 64 ` 65 66 vSchema = ` 67 { 68 "sharded": true, 69 "vindexes": { 70 "hash": { 71 "type": "hash" 72 } 73 }, 74 "tables": { 75 "t1": { 76 "column_vindexes": [ 77 { 78 "column": "id", 79 "name": "hash" 80 } 81 ] 82 } 83 } 84 }` 85 86 httpClient = base.SetupHTTPClient(time.Second) 87 throttledAppsAPIPath = "throttler/throttled-apps" 88 checkAPIPath = "throttler/check" 89 checkSelfAPIPath = "throttler/check-self" 90 getResponseBody = func(resp *http.Response) string { 91 body, _ := io.ReadAll(resp.Body) 92 return string(body) 93 } 94 ) 95 96 func TestMain(m *testing.M) { 97 defer cluster.PanicHandler(nil) 98 flag.Parse() 99 100 exitCode := func() int { 101 clusterInstance = cluster.NewCluster(cell, hostname) 102 defer clusterInstance.Teardown() 103 104 // Start topo server 105 err := clusterInstance.StartTopo() 106 if err != nil { 107 return 1 108 } 109 110 // Set extra tablet args for lock timeout 111 clusterInstance.VtTabletExtraArgs = []string{ 112 "--lock_tables_timeout", "5s", 113 "--watch_replication_stream", 114 "--enable_replication_reporter", 115 "--throttler-config-via-topo", 116 "--heartbeat_enable", 117 "--heartbeat_interval", "250ms", 118 "--heartbeat_on_demand_duration", onDemandHeartbeatDuration.String(), 119 "--disable_active_reparents", 120 } 121 122 // Start keyspace 123 keyspace := &cluster.Keyspace{ 124 Name: keyspaceName, 125 SchemaSQL: sqlSchema, 126 VSchema: vSchema, 127 } 128 129 if err = clusterInstance.StartUnshardedKeyspace(*keyspace, 1, false); err != nil { 130 return 1 131 } 132 133 // Collect table paths and ports 134 tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets 135 for _, tablet := range tablets { 136 if tablet.Type == "primary" { 137 primaryTablet = tablet 138 } else if tablet.Type != "rdonly" { 139 replicaTablet = tablet 140 } 141 } 142 143 vtgateInstance := clusterInstance.NewVtgateInstance() 144 // Start vtgate 145 if err := vtgateInstance.Setup(); err != nil { 146 return 1 147 } 148 // ensure it is torn down during cluster TearDown 149 clusterInstance.VtgateProcess = *vtgateInstance 150 vtParams = mysql.ConnParams{ 151 Host: clusterInstance.Hostname, 152 Port: clusterInstance.VtgateMySQLPort, 153 } 154 clusterInstance.VtctldClientProcess = *cluster.VtctldClientProcessInstance("localhost", clusterInstance.VtctldProcess.GrpcPort, clusterInstance.TmpDirectory) 155 156 return m.Run() 157 }() 158 os.Exit(exitCode) 159 } 160 161 func throttledApps(tablet *cluster.Vttablet) (resp *http.Response, respBody string, err error) { 162 resp, err = httpClient.Get(fmt.Sprintf("http://localhost:%d/%s", tablet.HTTPPort, throttledAppsAPIPath)) 163 if err != nil { 164 return resp, respBody, err 165 } 166 b, err := io.ReadAll(resp.Body) 167 if err != nil { 168 return resp, respBody, err 169 } 170 respBody = string(b) 171 return resp, respBody, err 172 } 173 174 func throttleCheck(tablet *cluster.Vttablet, skipRequestHeartbeats bool) (*http.Response, error) { 175 resp, err := httpClient.Get(fmt.Sprintf("http://localhost:%d/%s?s=%t", tablet.HTTPPort, checkAPIPath, skipRequestHeartbeats)) 176 return resp, err 177 } 178 179 func throttleCheckSelf(tablet *cluster.Vttablet) (*http.Response, error) { 180 return httpClient.Get(fmt.Sprintf("http://localhost:%d/%s", tablet.HTTPPort, checkSelfAPIPath)) 181 } 182 183 func warmUpHeartbeat(t *testing.T) (respStatus int) { 184 // because we run with -heartbeat_on_demand_duration=5s, the heartbeat is "cold" right now. 185 // Let's warm it up. 186 resp, err := throttleCheck(primaryTablet, false) 187 require.NoError(t, err) 188 defer resp.Body.Close() 189 190 time.Sleep(time.Second) 191 return resp.StatusCode 192 } 193 194 // waitForThrottleCheckStatus waits for the tablet to return the provided HTTP code in a throttle check 195 func waitForThrottleCheckStatus(t *testing.T, tablet *cluster.Vttablet, wantCode int) { 196 _ = warmUpHeartbeat(t) 197 ctx, cancel := context.WithTimeout(context.Background(), onDemandHeartbeatDuration*4) 198 defer cancel() 199 200 for { 201 resp, err := throttleCheck(tablet, true) 202 require.NoError(t, err) 203 204 if wantCode == resp.StatusCode { 205 // Wait for any cached check values to be cleared and the new 206 // status value to be in effect everywhere before returning. 207 resp.Body.Close() 208 return 209 } 210 select { 211 case <-ctx.Done(): 212 b, err := io.ReadAll(resp.Body) 213 require.NoError(t, err) 214 resp.Body.Close() 215 216 assert.Equalf(t, wantCode, resp.StatusCode, "body: %s", string(b)) 217 return 218 default: 219 resp.Body.Close() 220 time.Sleep(time.Second) 221 } 222 } 223 } 224 225 func vtgateExec(t *testing.T, query string, expectError string) *sqltypes.Result { 226 t.Helper() 227 228 ctx := context.Background() 229 conn, err := mysql.Connect(ctx, &vtParams) 230 require.Nil(t, err) 231 defer conn.Close() 232 233 qr, err := conn.ExecuteFetch(query, 1000, true) 234 if expectError == "" { 235 require.NoError(t, err) 236 } else { 237 require.Error(t, err, "error should not be nil") 238 assert.Contains(t, err.Error(), expectError, "Unexpected error") 239 } 240 return qr 241 } 242 243 func TestInitialThrottler(t *testing.T) { 244 defer cluster.PanicHandler(t) 245 246 t.Run("validating OK response from disabled throttler", func(t *testing.T) { 247 waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) 248 }) 249 t.Run("enabling throttler with very low threshold", func(t *testing.T) { 250 _, err := throttler.UpdateThrottlerTopoConfig(clusterInstance, true, false, unreasonablyLowThreshold.Seconds(), useDefaultQuery, false) 251 assert.NoError(t, err) 252 253 // Wait for the throttler to be enabled everywhere with the new config. 254 for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { 255 throttler.WaitForThrottlerStatusEnabled(t, tablet, true, &throttler.Config{Query: throttler.DefaultQuery, Threshold: unreasonablyLowThreshold.Seconds()}, throttlerEnabledTimeout) 256 } 257 }) 258 t.Run("validating pushback response from throttler", func(t *testing.T) { 259 waitForThrottleCheckStatus(t, primaryTablet, http.StatusTooManyRequests) 260 }) 261 t.Run("disabling throttler", func(t *testing.T) { 262 _, err := throttler.UpdateThrottlerTopoConfig(clusterInstance, false, true, unreasonablyLowThreshold.Seconds(), useDefaultQuery, false) 263 assert.NoError(t, err) 264 265 // Wait for the throttler to be disabled everywhere. 266 for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { 267 throttler.WaitForThrottlerStatusEnabled(t, tablet, false, nil, throttlerEnabledTimeout) 268 } 269 }) 270 t.Run("validating OK response from disabled throttler, again", func(t *testing.T) { 271 waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) 272 }) 273 t.Run("enabling throttler, again", func(t *testing.T) { 274 // Enable the throttler again with the default query which also moves us back 275 // to the default threshold. 276 _, err := throttler.UpdateThrottlerTopoConfig(clusterInstance, true, false, 0, useDefaultQuery, true) 277 assert.NoError(t, err) 278 279 // Wait for the throttler to be enabled everywhere again with the default config. 280 for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { 281 throttler.WaitForThrottlerStatusEnabled(t, tablet, true, throttler.DefaultConfig, throttlerEnabledTimeout) 282 } 283 }) 284 t.Run("validating pushback response from throttler, again", func(t *testing.T) { 285 waitForThrottleCheckStatus(t, primaryTablet, http.StatusTooManyRequests) 286 }) 287 t.Run("setting high threshold", func(t *testing.T) { 288 _, err := throttler.UpdateThrottlerTopoConfig(clusterInstance, false, false, extremelyHighThreshold.Seconds(), useDefaultQuery, true) 289 assert.NoError(t, err) 290 291 // Wait for the throttler to be enabled everywhere with new config. 292 for _, tablet := range []cluster.Vttablet{*primaryTablet, *replicaTablet} { 293 throttler.WaitForThrottlerStatusEnabled(t, &tablet, true, &throttler.Config{Query: throttler.DefaultQuery, Threshold: extremelyHighThreshold.Seconds()}, throttlerEnabledTimeout) 294 } 295 }) 296 t.Run("validating OK response from throttler with high threshold", func(t *testing.T) { 297 waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) 298 }) 299 t.Run("setting low threshold", func(t *testing.T) { 300 _, err := throttler.UpdateThrottlerTopoConfig(clusterInstance, false, false, throttler.DefaultThreshold.Seconds(), useDefaultQuery, true) 301 assert.NoError(t, err) 302 303 // Wait for the throttler to be enabled everywhere with new config. 304 for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { 305 throttler.WaitForThrottlerStatusEnabled(t, tablet, true, throttler.DefaultConfig, throttlerEnabledTimeout) 306 } 307 }) 308 t.Run("validating pushback response from throttler on low threshold", func(t *testing.T) { 309 waitForThrottleCheckStatus(t, primaryTablet, http.StatusTooManyRequests) 310 }) 311 t.Run("requesting heartbeats", func(t *testing.T) { 312 respStatus := warmUpHeartbeat(t) 313 assert.NotEqual(t, http.StatusOK, respStatus) 314 }) 315 t.Run("validating OK response from throttler with low threshold, heartbeats running", func(t *testing.T) { 316 time.Sleep(1 * time.Second) 317 resp, err := throttleCheck(primaryTablet, false) 318 require.NoError(t, err) 319 defer resp.Body.Close() 320 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 321 }) 322 t.Run("validating OK response from throttler with low threshold, heartbeats running still", func(t *testing.T) { 323 time.Sleep(1 * time.Second) 324 resp, err := throttleCheck(primaryTablet, false) 325 require.NoError(t, err) 326 defer resp.Body.Close() 327 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 328 }) 329 t.Run("validating pushback response from throttler on low threshold once heartbeats go stale", func(t *testing.T) { 330 time.Sleep(2 * onDemandHeartbeatDuration) // just... really wait long enough, make sure on-demand stops 331 waitForThrottleCheckStatus(t, primaryTablet, http.StatusTooManyRequests) 332 }) 333 } 334 335 func TestThrottlerAfterMetricsCollected(t *testing.T) { 336 defer cluster.PanicHandler(t) 337 338 // By this time metrics will have been collected. We expect no lag, and something like: 339 // {"StatusCode":200,"Value":0.282278,"Threshold":1,"Message":""} 340 t.Run("validating throttler OK", func(t *testing.T) { 341 waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) 342 }) 343 t.Run("validating throttled apps", func(t *testing.T) { 344 resp, body, err := throttledApps(primaryTablet) 345 require.NoError(t, err) 346 defer resp.Body.Close() 347 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 348 assert.Contains(t, body, "always-throttled-app") 349 }) 350 t.Run("validating primary check self", func(t *testing.T) { 351 resp, err := throttleCheckSelf(primaryTablet) 352 require.NoError(t, err) 353 defer resp.Body.Close() 354 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 355 }) 356 t.Run("validating replica check self", func(t *testing.T) { 357 resp, err := throttleCheckSelf(replicaTablet) 358 require.NoError(t, err) 359 defer resp.Body.Close() 360 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 361 }) 362 } 363 364 func TestLag(t *testing.T) { 365 defer cluster.PanicHandler(t) 366 // Temporarily disable VTOrc recoveries because we want to 367 // STOP replication specifically in order to increase the 368 // lag and we DO NOT want VTOrc to try and fix this. 369 clusterInstance.DisableVTOrcRecoveries(t) 370 defer clusterInstance.EnableVTOrcRecoveries(t) 371 372 t.Run("stopping replication", func(t *testing.T) { 373 err := clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", replicaTablet.Alias) 374 assert.NoError(t, err) 375 }) 376 t.Run("accumulating lag, expecting throttler push back", func(t *testing.T) { 377 time.Sleep(2 * throttler.DefaultThreshold) 378 379 resp, err := throttleCheck(primaryTablet, false) 380 require.NoError(t, err) 381 defer resp.Body.Close() 382 assert.Equalf(t, http.StatusTooManyRequests, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 383 }) 384 t.Run("primary self-check should still be fine", func(t *testing.T) { 385 resp, err := throttleCheckSelf(primaryTablet) 386 require.NoError(t, err) 387 defer resp.Body.Close() 388 // self (on primary) is unaffected by replication lag 389 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 390 }) 391 t.Run("replica self-check should show error", func(t *testing.T) { 392 resp, err := throttleCheckSelf(replicaTablet) 393 require.NoError(t, err) 394 defer resp.Body.Close() 395 assert.Equalf(t, http.StatusTooManyRequests, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 396 }) 397 t.Run("starting replication", func(t *testing.T) { 398 err := clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", replicaTablet.Alias) 399 assert.NoError(t, err) 400 }) 401 t.Run("expecting replication to catch up and throttler check to return OK", func(t *testing.T) { 402 waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) 403 }) 404 t.Run("primary self-check should be fine", func(t *testing.T) { 405 resp, err := throttleCheckSelf(primaryTablet) 406 require.NoError(t, err) 407 defer resp.Body.Close() 408 // self (on primary) is unaffected by replication lag 409 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 410 }) 411 t.Run("replica self-check should be fine", func(t *testing.T) { 412 resp, err := throttleCheckSelf(replicaTablet) 413 require.NoError(t, err) 414 defer resp.Body.Close() 415 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 416 }) 417 } 418 419 func TestNoReplicas(t *testing.T) { 420 defer cluster.PanicHandler(t) 421 t.Run("changing replica to RDONLY", func(t *testing.T) { 422 err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", replicaTablet.Alias, "RDONLY") 423 assert.NoError(t, err) 424 425 // This makes no REPLICA servers available. We expect something like: 426 // {"StatusCode":200,"Value":0,"Threshold":1,"Message":""} 427 waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) 428 }) 429 t.Run("restoring to REPLICA", func(t *testing.T) { 430 err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", replicaTablet.Alias, "REPLICA") 431 assert.NoError(t, err) 432 433 waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) 434 }) 435 } 436 437 func TestCustomQuery(t *testing.T) { 438 defer cluster.PanicHandler(t) 439 440 t.Run("enabling throttler with custom query and threshold", func(t *testing.T) { 441 _, err := throttler.UpdateThrottlerTopoConfig(clusterInstance, true, false, customThreshold.Seconds(), customQuery, false) 442 assert.NoError(t, err) 443 444 // Wait for the throttler to be enabled everywhere with new custom config. 445 for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { 446 throttler.WaitForThrottlerStatusEnabled(t, tablet, true, &throttler.Config{Query: customQuery, Threshold: customThreshold.Seconds()}, throttlerEnabledTimeout) 447 } 448 }) 449 t.Run("validating OK response from throttler with custom query", func(t *testing.T) { 450 resp, err := throttleCheck(primaryTablet, false) 451 require.NoError(t, err) 452 defer resp.Body.Close() 453 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 454 }) 455 t.Run("test threads running", func(t *testing.T) { 456 sleepDuration := 20 * time.Second 457 var wg sync.WaitGroup 458 for i := 0; i < int(customThreshold.Seconds()); i++ { 459 // Generate different Sleep() calls, all at minimum sleepDuration. 460 wg.Add(1) 461 go func(i int) { 462 defer wg.Done() 463 vtgateExec(t, fmt.Sprintf("select sleep(%d)", int(sleepDuration.Seconds())+i), "") 464 }(i) 465 } 466 t.Run("exceeds threshold", func(t *testing.T) { 467 throttler.WaitForQueryResult(t, primaryTablet, 468 "select if(variable_value > 5, 'true', 'false') as result from performance_schema.global_status where variable_name='threads_running'", 469 "true", sleepDuration/3) 470 throttler.WaitForValidData(t, primaryTablet, sleepDuration-(5*time.Second)) 471 // Now we should be reporting ~ customThreshold*2 threads_running, and we should 472 // hit the threshold. For example: 473 // {"StatusCode":429,"Value":6,"Threshold":5,"Message":"Threshold exceeded"} 474 { 475 resp, err := throttleCheck(primaryTablet, false) 476 require.NoError(t, err) 477 defer resp.Body.Close() 478 assert.Equalf(t, http.StatusTooManyRequests, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 479 } 480 { 481 resp, err := throttleCheckSelf(primaryTablet) 482 require.NoError(t, err) 483 defer resp.Body.Close() 484 assert.Equalf(t, http.StatusTooManyRequests, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 485 } 486 }) 487 t.Run("wait for queries to terminate", func(t *testing.T) { 488 wg.Wait() 489 time.Sleep(1 * time.Second) // graceful time to let throttler read metrics 490 }) 491 t.Run("restored below threshold", func(t *testing.T) { 492 { 493 resp, err := throttleCheck(primaryTablet, false) 494 require.NoError(t, err) 495 defer resp.Body.Close() 496 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 497 } 498 { 499 resp, err := throttleCheckSelf(primaryTablet) 500 require.NoError(t, err) 501 defer resp.Body.Close() 502 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 503 } 504 }) 505 }) 506 } 507 508 func TestRestoreDefaultQuery(t *testing.T) { 509 defer cluster.PanicHandler(t) 510 511 // Validate going back from custom-query to default-query (replication lag) still works. 512 t.Run("enabling throttler with default query and threshold", func(t *testing.T) { 513 _, err := throttler.UpdateThrottlerTopoConfig(clusterInstance, true, false, throttler.DefaultThreshold.Seconds(), useDefaultQuery, false) 514 assert.NoError(t, err) 515 516 // Wait for the throttler to be up and running everywhere again with the default config. 517 for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { 518 throttler.WaitForThrottlerStatusEnabled(t, tablet, true, throttler.DefaultConfig, throttlerEnabledTimeout) 519 } 520 }) 521 t.Run("validating OK response from throttler with default threshold, heartbeats running", func(t *testing.T) { 522 resp, err := throttleCheck(primaryTablet, false) 523 require.NoError(t, err) 524 defer resp.Body.Close() 525 assert.Equalf(t, http.StatusOK, resp.StatusCode, "Unexpected response from throttler: %s", getResponseBody(resp)) 526 }) 527 t.Run("validating pushback response from throttler on default threshold once heartbeats go stale", func(t *testing.T) { 528 time.Sleep(2 * onDemandHeartbeatDuration) // just... really wait long enough, make sure on-demand stops 529 waitForThrottleCheckStatus(t, primaryTablet, http.StatusTooManyRequests) 530 }) 531 }