vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletserver/throttle/throttler.go (about) 1 /* 2 Copyright 2017 GitHub Inc. 3 4 Licensed under MIT License. See https://github.com/github/freno/blob/master/LICENSE 5 */ 6 7 package throttle 8 9 import ( 10 "context" 11 "encoding/json" 12 "errors" 13 "fmt" 14 "io" 15 "math" 16 "math/rand" 17 "net/http" 18 "strconv" 19 "strings" 20 "sync" 21 "sync/atomic" 22 "time" 23 24 "github.com/patrickmn/go-cache" 25 "github.com/spf13/pflag" 26 27 "vitess.io/vitess/go/sync2" 28 "vitess.io/vitess/go/textutil" 29 "vitess.io/vitess/go/timer" 30 "vitess.io/vitess/go/vt/log" 31 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 32 "vitess.io/vitess/go/vt/servenv" 33 "vitess.io/vitess/go/vt/srvtopo" 34 "vitess.io/vitess/go/vt/topo" 35 "vitess.io/vitess/go/vt/vttablet/tabletserver/connpool" 36 "vitess.io/vitess/go/vt/vttablet/tabletserver/heartbeat" 37 "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" 38 "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base" 39 "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/config" 40 "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/mysql" 41 ) 42 43 const ( 44 leaderCheckInterval = 5 * time.Second 45 mysqlCollectInterval = 250 * time.Millisecond 46 mysqlDormantCollectInterval = 5 * time.Second 47 mysqlRefreshInterval = 10 * time.Second 48 mysqlAggregateInterval = 125 * time.Millisecond 49 50 aggregatedMetricsExpiration = 5 * time.Second 51 throttledAppsSnapshotInterval = 5 * time.Second 52 recentAppsExpiration = time.Hour * 24 53 54 nonDeprioritizedAppMapExpiration = time.Second 55 56 dormantPeriod = time.Minute 57 defaultThrottleTTLMinutes = 60 58 defaultThrottleRatio = 1.0 59 60 shardStoreName = "shard" 61 selfStoreName = "self" 62 ) 63 64 var ( 65 // flag vars 66 throttleThreshold = 1 * time.Second 67 throttleTabletTypes = "replica" 68 throttleMetricQuery string 69 throttleMetricThreshold = math.MaxFloat64 70 throttlerCheckAsCheckSelf = false 71 throttlerConfigViaTopo = false 72 ) 73 74 func init() { 75 servenv.OnParseFor("vtcombo", registerThrottlerFlags) 76 servenv.OnParseFor("vttablet", registerThrottlerFlags) 77 } 78 79 func registerThrottlerFlags(fs *pflag.FlagSet) { 80 fs.StringVar(&throttleTabletTypes, "throttle_tablet_types", throttleTabletTypes, "Comma separated VTTablet types to be considered by the throttler. default: 'replica'. example: 'replica,rdonly'. 'replica' aways implicitly included") 81 82 fs.DurationVar(&throttleThreshold, "throttle_threshold", throttleThreshold, "Replication lag threshold for default lag throttling") 83 fs.StringVar(&throttleMetricQuery, "throttle_metrics_query", throttleMetricQuery, "Override default heartbeat/lag metric. Use either `SELECT` (must return single row, single value) or `SHOW GLOBAL ... LIKE ...` queries. Set -throttle_metrics_threshold respectively.") 84 fs.Float64Var(&throttleMetricThreshold, "throttle_metrics_threshold", throttleMetricThreshold, "Override default throttle threshold, respective to --throttle_metrics_query") 85 fs.BoolVar(&throttlerCheckAsCheckSelf, "throttle_check_as_check_self", throttlerCheckAsCheckSelf, "Should throttler/check return a throttler/check-self result (changes throttler behavior for writes)") 86 fs.BoolVar(&throttlerConfigViaTopo, "throttler-config-via-topo", throttlerConfigViaTopo, "When 'true', read config from topo service and ignore throttle_threshold, throttle_metrics_threshold, throttle_metrics_query, throttle_check_as_check_self") 87 } 88 89 var ( 90 replicationLagQuery = `select unix_timestamp(now(6))-max(ts/1000000000) as replication_lag from _vt.heartbeat` 91 92 ErrThrottlerNotReady = errors.New("throttler not enabled/ready") 93 ) 94 95 // ThrottleCheckType allows a client to indicate what type of check it wants to issue. See available types below. 96 type ThrottleCheckType int // nolint:revive 97 98 const ( 99 // ThrottleCheckPrimaryWrite indicates a check before making a write on a primary server 100 ThrottleCheckPrimaryWrite ThrottleCheckType = iota 101 // ThrottleCheckSelf indicates a check on a specific server health 102 ThrottleCheckSelf 103 ) 104 105 func init() { 106 rand.Seed(time.Now().UnixNano()) 107 } 108 109 // Throttler is the main entity in the throttling mechanism. This service runs, probes, collects data, 110 // aggregates, reads inventory, provides information, etc. 111 type Throttler struct { 112 keyspace string 113 shard string 114 cell string 115 116 check *ThrottlerCheck 117 isEnabled int64 118 isLeader int64 119 isOpen int64 120 121 env tabletenv.Env 122 pool *connpool.Pool 123 tabletTypeFunc func() topodatapb.TabletType 124 ts *topo.Server 125 srvTopoServer srvtopo.Server 126 heartbeatWriter heartbeat.HeartbeatWriter 127 128 throttleTabletTypesMap map[topodatapb.TabletType]bool 129 130 mysqlThrottleMetricChan chan *mysql.MySQLThrottleMetric 131 mysqlInventoryChan chan *mysql.Inventory 132 mysqlClusterProbesChan chan *mysql.ClusterProbes 133 throttlerConfigChan chan *topodatapb.ThrottlerConfig 134 135 mysqlInventory *mysql.Inventory 136 137 metricsQuery atomic.Value 138 MetricsThreshold sync2.AtomicFloat64 139 140 mysqlClusterThresholds *cache.Cache 141 aggregatedMetrics *cache.Cache 142 throttledApps *cache.Cache 143 recentApps *cache.Cache 144 metricsHealth *cache.Cache 145 146 lastCheckTimeNano int64 147 148 initMutex sync.Mutex 149 enableMutex sync.Mutex 150 cancelEnableContext context.CancelFunc 151 throttledAppsMutex sync.Mutex 152 153 nonLowPriorityAppRequestsThrottled *cache.Cache 154 httpClient *http.Client 155 } 156 157 // ThrottlerStatus published some status values from the throttler 158 type ThrottlerStatus struct { 159 Keyspace string 160 Shard string 161 162 IsLeader bool 163 IsOpen bool 164 IsEnabled bool 165 IsDormant bool 166 167 Query string 168 Threshold float64 169 170 AggregatedMetrics map[string]base.MetricResult 171 MetricsHealth base.MetricHealthMap 172 } 173 174 // NewThrottler creates a Throttler 175 func NewThrottler(env tabletenv.Env, srvTopoServer srvtopo.Server, ts *topo.Server, cell string, heartbeatWriter heartbeat.HeartbeatWriter, tabletTypeFunc func() topodatapb.TabletType) *Throttler { 176 throttler := &Throttler{ 177 isLeader: 0, 178 isOpen: 0, 179 180 cell: cell, 181 env: env, 182 tabletTypeFunc: tabletTypeFunc, 183 srvTopoServer: srvTopoServer, 184 ts: ts, 185 heartbeatWriter: heartbeatWriter, 186 pool: connpool.NewPool(env, "ThrottlerPool", tabletenv.ConnPoolConfig{ 187 Size: 2, 188 IdleTimeoutSeconds: env.Config().OltpReadPool.IdleTimeoutSeconds, 189 }), 190 } 191 192 throttler.mysqlThrottleMetricChan = make(chan *mysql.MySQLThrottleMetric) 193 throttler.mysqlInventoryChan = make(chan *mysql.Inventory, 1) 194 throttler.mysqlClusterProbesChan = make(chan *mysql.ClusterProbes) 195 throttler.throttlerConfigChan = make(chan *topodatapb.ThrottlerConfig) 196 throttler.mysqlInventory = mysql.NewInventory() 197 198 throttler.throttledApps = cache.New(cache.NoExpiration, 0) 199 throttler.mysqlClusterThresholds = cache.New(cache.NoExpiration, 0) 200 throttler.aggregatedMetrics = cache.New(aggregatedMetricsExpiration, 0) 201 throttler.recentApps = cache.New(recentAppsExpiration, 0) 202 throttler.metricsHealth = cache.New(cache.NoExpiration, 0) 203 throttler.nonLowPriorityAppRequestsThrottled = cache.New(nonDeprioritizedAppMapExpiration, 0) 204 205 throttler.httpClient = base.SetupHTTPClient(2 * mysqlCollectInterval) 206 throttler.initThrottleTabletTypes() 207 throttler.check = NewThrottlerCheck(throttler) 208 209 throttler.metricsQuery.Store(replicationLagQuery) // default 210 if throttleMetricQuery != "" { 211 throttler.metricsQuery.Store(throttleMetricQuery) // override 212 } 213 throttler.MetricsThreshold = sync2.NewAtomicFloat64(throttleThreshold.Seconds()) //default 214 if throttleMetricThreshold != math.MaxFloat64 { 215 throttler.MetricsThreshold.Set(throttleMetricThreshold) // override 216 } 217 218 throttler.initConfig() 219 220 return throttler 221 } 222 223 // CheckIsReady checks if this throttler is ready to serve. If not, it returns an error 224 func (throttler *Throttler) CheckIsReady() error { 225 if throttler.IsEnabled() { 226 // all good 227 return nil 228 } 229 return ErrThrottlerNotReady 230 } 231 232 // initThrottleTabletTypes reads the user supplied throttle_tablet_types and sets these 233 // for the duration of this tablet's lifetime 234 func (throttler *Throttler) initThrottleTabletTypes() { 235 throttler.throttleTabletTypesMap = make(map[topodatapb.TabletType]bool) 236 237 tokens := textutil.SplitDelimitedList(throttleTabletTypes) 238 for _, token := range tokens { 239 token = strings.ToUpper(token) 240 if value, ok := topodatapb.TabletType_value[token]; ok { 241 throttler.throttleTabletTypesMap[topodatapb.TabletType(value)] = true 242 } 243 } 244 // always on: 245 throttler.throttleTabletTypesMap[topodatapb.TabletType_REPLICA] = true 246 } 247 248 // InitDBConfig initializes keyspace and shard 249 func (throttler *Throttler) InitDBConfig(keyspace, shard string) { 250 throttler.keyspace = keyspace 251 throttler.shard = shard 252 253 if throttlerConfigViaTopo { 254 throttler.srvTopoServer.WatchSrvKeyspace(context.Background(), throttler.cell, throttler.keyspace, throttler.WatchSrvKeyspaceCallback) 255 } 256 } 257 258 func (throttler *Throttler) GetMetricsQuery() string { 259 return throttler.metricsQuery.Load().(string) 260 } 261 262 func (throttler *Throttler) GetMetricsThreshold() float64 { 263 return throttler.MetricsThreshold.Get() 264 } 265 266 // initThrottler initializes config 267 func (throttler *Throttler) initConfig() { 268 log.Infof("Throttler: initializing config") 269 270 config.Instance = &config.ConfigurationSettings{ 271 Stores: config.StoresSettings{ 272 MySQL: config.MySQLConfigurationSettings{ 273 IgnoreDialTCPErrors: true, 274 Clusters: map[string](*config.MySQLClusterConfigurationSettings){}, 275 }, 276 }, 277 } 278 config.Instance.Stores.MySQL.Clusters[selfStoreName] = &config.MySQLClusterConfigurationSettings{ 279 MetricQuery: throttler.GetMetricsQuery(), 280 ThrottleThreshold: &throttler.MetricsThreshold, 281 IgnoreHostsCount: 0, 282 } 283 config.Instance.Stores.MySQL.Clusters[shardStoreName] = &config.MySQLClusterConfigurationSettings{ 284 MetricQuery: throttler.GetMetricsQuery(), 285 ThrottleThreshold: &throttler.MetricsThreshold, 286 IgnoreHostsCount: 0, 287 } 288 } 289 290 // readThrottlerConfig proactively reads the throttler's config from SrvKeyspace in local topo 291 func (throttler *Throttler) readThrottlerConfig(ctx context.Context) (*topodatapb.ThrottlerConfig, error) { 292 srvks, err := throttler.ts.GetSrvKeyspace(ctx, throttler.cell, throttler.keyspace) 293 if err != nil { 294 return nil, err 295 } 296 return throttler.normalizeThrottlerConfig(srvks.ThrottlerConfig), nil 297 } 298 299 // normalizeThrottlerConfig noramlizes missing throttler config information, as needed. 300 func (throttler *Throttler) normalizeThrottlerConfig(thottlerConfig *topodatapb.ThrottlerConfig) *topodatapb.ThrottlerConfig { 301 if thottlerConfig == nil { 302 thottlerConfig = &topodatapb.ThrottlerConfig{} 303 } 304 if thottlerConfig.CustomQuery == "" { 305 // no custom query; we check replication lag 306 if thottlerConfig.Threshold == 0 { 307 thottlerConfig.Threshold = throttleThreshold.Seconds() 308 } 309 } 310 return thottlerConfig 311 } 312 313 func (throttler *Throttler) WatchSrvKeyspaceCallback(srvks *topodatapb.SrvKeyspace, err error) bool { 314 if err != nil { 315 log.Errorf("WatchSrvKeyspaceCallback error: %v", err) 316 return false 317 } 318 throttlerConfig := throttler.normalizeThrottlerConfig(srvks.ThrottlerConfig) 319 320 if throttler.IsEnabled() { 321 // Throttler is running and we should apply the config change through Operate() 322 // or else we get into race conditions. 323 go func() { 324 throttler.throttlerConfigChan <- throttlerConfig 325 }() 326 } else { 327 // throttler is not running, we should apply directly 328 throttler.applyThrottlerConfig(context.Background(), throttlerConfig) 329 } 330 331 return true 332 } 333 334 // applyThrottlerConfig receives a Throttlerconfig as read from SrvKeyspace, and applies the configuration. This may cause 335 // the throttler to be enabled/disabled, and of course it affects the throttling query/threshold. 336 func (throttler *Throttler) applyThrottlerConfig(ctx context.Context, throttlerConfig *topodatapb.ThrottlerConfig) { 337 if !throttlerConfigViaTopo { 338 return 339 } 340 if throttlerConfig.CustomQuery == "" { 341 throttler.metricsQuery.Store(replicationLagQuery) 342 } else { 343 throttler.metricsQuery.Store(throttlerConfig.CustomQuery) 344 } 345 throttler.MetricsThreshold.Set(throttlerConfig.Threshold) 346 throttlerCheckAsCheckSelf = throttlerConfig.CheckAsCheckSelf 347 if throttlerConfig.Enabled { 348 go throttler.Enable(ctx) 349 } else { 350 go throttler.Disable(ctx) 351 } 352 } 353 354 func (throttler *Throttler) IsEnabled() bool { 355 return atomic.LoadInt64(&throttler.isEnabled) > 0 356 } 357 358 // Enable activates the throttler probes; when enabled, the throttler responds to check queries based on 359 // the collected metrics. 360 func (throttler *Throttler) Enable(ctx context.Context) bool { 361 throttler.enableMutex.Lock() 362 defer throttler.enableMutex.Unlock() 363 364 if throttler.isEnabled > 0 { 365 return false 366 } 367 atomic.StoreInt64(&throttler.isEnabled, 1) 368 369 ctx, throttler.cancelEnableContext = context.WithCancel(ctx) 370 throttler.check.SelfChecks(ctx) 371 throttler.Operate(ctx) 372 373 // Make a one-time request for a lease of heartbeats 374 go throttler.heartbeatWriter.RequestHeartbeats() 375 376 return true 377 } 378 379 // Disable deactivates the probes and associated operations. When disabled, the throttler reponds to check 380 // queries with "200 OK" irrespective of lag or any other metrics. 381 func (throttler *Throttler) Disable(ctx context.Context) bool { 382 throttler.enableMutex.Lock() 383 defer throttler.enableMutex.Unlock() 384 385 if throttler.isEnabled == 0 { 386 return false 387 } 388 // _ = throttler.updateConfig(ctx, false, throttler.MetricsThreshold.Get()) // TODO(shlomi) 389 atomic.StoreInt64(&throttler.isEnabled, 0) 390 391 throttler.aggregatedMetrics.Flush() 392 throttler.recentApps.Flush() 393 throttler.nonLowPriorityAppRequestsThrottled.Flush() 394 // we do not flush throttler.throttledApps because this is data submitted by the user; the user expects the data to survive a disable+enable 395 396 throttler.cancelEnableContext() 397 return true 398 } 399 400 // Open opens database pool and initializes the schema 401 func (throttler *Throttler) Open() error { 402 throttler.initMutex.Lock() 403 defer throttler.initMutex.Unlock() 404 if atomic.LoadInt64(&throttler.isOpen) > 0 { 405 // already open 406 return nil 407 } 408 ctx := context.Background() 409 throttler.pool.Open(throttler.env.Config().DB.AppWithDB(), throttler.env.Config().DB.DbaWithDB(), throttler.env.Config().DB.AppDebugWithDB()) 410 atomic.StoreInt64(&throttler.isOpen, 1) 411 412 throttler.ThrottleApp("always-throttled-app", time.Now().Add(time.Hour*24*365*10), defaultThrottleRatio) 413 414 if throttlerConfigViaTopo { 415 // We want to read throttler config from topo and apply it. 416 // But also, we're in an Open() function, which blocks state manager's operation, and affects 417 // opening of all other components. We thus read the throttler config in the background. 418 // However, we want to handle a situation where the read errors out. 419 // So we kick a loop that keeps retrying reading the config, for as long as this throttler is open. 420 go func() { 421 retryTicker := time.NewTicker(time.Minute) 422 defer retryTicker.Stop() 423 for { 424 if atomic.LoadInt64(&throttler.isOpen) == 0 { 425 // closed down. No need to keep retrying 426 return 427 } 428 429 throttlerConfig, err := throttler.readThrottlerConfig(ctx) 430 if err == nil { 431 // it's possible that during a retry-sleep, the throttler is closed and opened again, leading 432 // to two (or more) instances of this goroutine. That's not a big problem; it's fine if all 433 // attempt to read the throttler config; but we just want to ensure they don't step on each other 434 // while applying the changes. 435 throttler.initMutex.Lock() 436 defer throttler.initMutex.Unlock() 437 438 throttler.applyThrottlerConfig(ctx, throttlerConfig) // may issue an Enable 439 return 440 } 441 log.Errorf("Throttler.Open(): error reading throttler config. Will retry in 1 minute. Err=%+v", err) 442 <-retryTicker.C 443 } 444 }() 445 } else { 446 // backwards-cmpatible: check for --enable-lag-throttler flag in vttablet 447 // this will be removed in a future version 448 if throttler.env.Config().EnableLagThrottler { 449 go throttler.Enable(ctx) 450 } 451 } 452 return nil 453 } 454 455 // Close frees resources 456 func (throttler *Throttler) Close() { 457 log.Infof("Throttler: started execution of Close. Acquiring initMutex lock") 458 throttler.initMutex.Lock() 459 log.Infof("Throttler: acquired initMutex lock") 460 defer throttler.initMutex.Unlock() 461 if atomic.LoadInt64(&throttler.isOpen) == 0 { 462 log.Infof("Throttler: throttler is not open") 463 return 464 } 465 ctx := context.Background() 466 throttler.Disable(ctx) 467 atomic.StoreInt64(&throttler.isLeader, 0) 468 469 log.Infof("Throttler: closing pool") 470 throttler.pool.Close() 471 atomic.StoreInt64(&throttler.isOpen, 0) 472 log.Infof("Throttler: finished execution of Close") 473 } 474 475 func (throttler *Throttler) generateSelfMySQLThrottleMetricFunc(ctx context.Context, probe *mysql.Probe) func() *mysql.MySQLThrottleMetric { 476 f := func() *mysql.MySQLThrottleMetric { 477 return throttler.readSelfMySQLThrottleMetric(ctx, probe) 478 } 479 return f 480 } 481 482 // readSelfMySQLThrottleMetric reads the mysql metric from thi very tablet's backend mysql. 483 func (throttler *Throttler) readSelfMySQLThrottleMetric(ctx context.Context, probe *mysql.Probe) *mysql.MySQLThrottleMetric { 484 metric := &mysql.MySQLThrottleMetric{ 485 ClusterName: selfStoreName, 486 Key: *mysql.SelfInstanceKey, 487 Value: 0, 488 Err: nil, 489 } 490 conn, err := throttler.pool.Get(ctx, nil) 491 if err != nil { 492 metric.Err = err 493 return metric 494 } 495 defer conn.Recycle() 496 497 tm, err := conn.Exec(ctx, probe.MetricQuery, 1, true) 498 if err != nil { 499 metric.Err = err 500 return metric 501 } 502 row := tm.Named().Row() 503 if row == nil { 504 metric.Err = fmt.Errorf("no results for readSelfMySQLThrottleMetric") 505 return metric 506 } 507 508 metricsQueryType := mysql.GetMetricsQueryType(throttler.GetMetricsQuery()) 509 switch metricsQueryType { 510 case mysql.MetricsQueryTypeSelect: 511 // We expect a single row, single column result. 512 // The "for" iteration below is just a way to get first result without knowning column name 513 for k := range row { 514 metric.Value, metric.Err = row.ToFloat64(k) 515 } 516 case mysql.MetricsQueryTypeShowGlobal: 517 metric.Value, metric.Err = strconv.ParseFloat(row["Value"].ToString(), 64) 518 default: 519 metric.Err = fmt.Errorf("Unsupported metrics query type for query: %s", throttler.GetMetricsQuery()) 520 } 521 522 return metric 523 } 524 525 // throttledAppsSnapshot returns a snapshot (a copy) of current throttled apps 526 func (throttler *Throttler) throttledAppsSnapshot() map[string]cache.Item { 527 return throttler.throttledApps.Items() 528 } 529 530 // ThrottledAppsSnapshot returns a snapshot (a copy) of current throttled apps 531 func (throttler *Throttler) ThrottledApps() (result []base.AppThrottle) { 532 for _, item := range throttler.throttledAppsSnapshot() { 533 appThrottle, _ := item.Object.(*base.AppThrottle) 534 result = append(result, *appThrottle) 535 } 536 return result 537 } 538 539 // isDormant returns true when the last check was more than dormantPeriod ago 540 func (throttler *Throttler) isDormant() bool { 541 lastCheckTime := time.Unix(0, atomic.LoadInt64(&throttler.lastCheckTimeNano)) 542 return time.Since(lastCheckTime) > dormantPeriod 543 } 544 545 // Operate is the main entry point for the throttler operation and logic. It will 546 // run the probes, collect metrics, refresh inventory, etc. 547 func (throttler *Throttler) Operate(ctx context.Context) { 548 549 tickers := [](*timer.SuspendableTicker){} 550 addTicker := func(d time.Duration) *timer.SuspendableTicker { 551 t := timer.NewSuspendableTicker(d, false) 552 tickers = append(tickers, t) 553 return t 554 } 555 leaderCheckTicker := addTicker(leaderCheckInterval) 556 mysqlCollectTicker := addTicker(mysqlCollectInterval) 557 mysqlDormantCollectTicker := addTicker(mysqlDormantCollectInterval) 558 mysqlRefreshTicker := addTicker(mysqlRefreshInterval) 559 mysqlAggregateTicker := addTicker(mysqlAggregateInterval) 560 throttledAppsTicker := addTicker(throttledAppsSnapshotInterval) 561 562 go func() { 563 defer log.Infof("Throttler: Operate terminated, tickers stopped") 564 for _, t := range tickers { 565 defer t.Stop() 566 // since we just started the tickers now, speed up the ticks by forcing an immediate tick 567 go t.TickNow() 568 } 569 570 for { 571 select { 572 case <-ctx.Done(): 573 return 574 case <-leaderCheckTicker.C: 575 { 576 func() { 577 throttler.initMutex.Lock() 578 defer throttler.initMutex.Unlock() 579 580 // sparse 581 shouldBeLeader := int64(0) 582 if atomic.LoadInt64(&throttler.isOpen) > 0 { 583 if throttler.tabletTypeFunc() == topodatapb.TabletType_PRIMARY { 584 shouldBeLeader = 1 585 } 586 } 587 588 transitionedIntoLeader := false 589 if shouldBeLeader > throttler.isLeader { 590 log.Infof("Throttler: transition into leadership") 591 transitionedIntoLeader = true 592 } 593 if shouldBeLeader < throttler.isLeader { 594 log.Infof("Throttler: transition out of leadership") 595 } 596 597 atomic.StoreInt64(&throttler.isLeader, shouldBeLeader) 598 599 if transitionedIntoLeader { 600 // transitioned into leadership, let's speed up the next 'refresh' and 'collect' ticks 601 go mysqlRefreshTicker.TickNow() 602 go throttler.heartbeatWriter.RequestHeartbeats() 603 } 604 }() 605 } 606 case <-mysqlCollectTicker.C: 607 { 608 if atomic.LoadInt64(&throttler.isOpen) > 0 { 609 // frequent 610 if !throttler.isDormant() { 611 throttler.collectMySQLMetrics(ctx) 612 } 613 } 614 } 615 case <-mysqlDormantCollectTicker.C: 616 { 617 if atomic.LoadInt64(&throttler.isOpen) > 0 { 618 // infrequent 619 if throttler.isDormant() { 620 throttler.collectMySQLMetrics(ctx) 621 } 622 } 623 } 624 case metric := <-throttler.mysqlThrottleMetricChan: 625 { 626 // incoming MySQL metric, frequent, as result of collectMySQLMetrics() 627 throttler.mysqlInventory.InstanceKeyMetrics[metric.GetClusterInstanceKey()] = metric 628 } 629 case <-mysqlRefreshTicker.C: 630 { 631 // sparse 632 if atomic.LoadInt64(&throttler.isOpen) > 0 { 633 go throttler.refreshMySQLInventory(ctx) 634 } 635 } 636 case probes := <-throttler.mysqlClusterProbesChan: 637 { 638 // incoming structural update, sparse, as result of refreshMySQLInventory() 639 throttler.updateMySQLClusterProbes(ctx, probes) 640 } 641 case <-mysqlAggregateTicker.C: 642 { 643 if atomic.LoadInt64(&throttler.isOpen) > 0 { 644 throttler.aggregateMySQLMetrics(ctx) 645 } 646 } 647 case <-throttledAppsTicker.C: 648 { 649 if atomic.LoadInt64(&throttler.isOpen) > 0 { 650 go throttler.expireThrottledApps() 651 } 652 } 653 case throttlerConfig := <-throttler.throttlerConfigChan: 654 throttler.applyThrottlerConfig(ctx, throttlerConfig) 655 } 656 } 657 }() 658 } 659 660 func (throttler *Throttler) generateTabletHTTPProbeFunction(ctx context.Context, clusterName string, probe *mysql.Probe) (probeFunc func() *mysql.MySQLThrottleMetric) { 661 return func() *mysql.MySQLThrottleMetric { 662 // Hit a tablet's `check-self` via HTTP, and convert its CheckResult JSON output into a MySQLThrottleMetric 663 mySQLThrottleMetric := mysql.NewMySQLThrottleMetric() 664 mySQLThrottleMetric.ClusterName = clusterName 665 mySQLThrottleMetric.Key = probe.Key 666 667 tabletCheckSelfURL := fmt.Sprintf("http://%s:%d/throttler/check-self?app=vitess", probe.TabletHost, probe.TabletPort) 668 resp, err := throttler.httpClient.Get(tabletCheckSelfURL) 669 if err != nil { 670 mySQLThrottleMetric.Err = err 671 return mySQLThrottleMetric 672 } 673 defer resp.Body.Close() 674 b, err := io.ReadAll(resp.Body) 675 if err != nil { 676 mySQLThrottleMetric.Err = err 677 return mySQLThrottleMetric 678 } 679 checkResult := &CheckResult{} 680 if err := json.Unmarshal(b, checkResult); err != nil { 681 mySQLThrottleMetric.Err = err 682 return mySQLThrottleMetric 683 } 684 mySQLThrottleMetric.Value = checkResult.Value 685 686 if checkResult.StatusCode == http.StatusInternalServerError { 687 mySQLThrottleMetric.Err = fmt.Errorf("Status code: %d", checkResult.StatusCode) 688 } 689 return mySQLThrottleMetric 690 } 691 } 692 693 func (throttler *Throttler) collectMySQLMetrics(ctx context.Context) error { 694 // synchronously, get lists of probes 695 for clusterName, probes := range throttler.mysqlInventory.ClustersProbes { 696 clusterName := clusterName 697 probes := probes 698 go func() { 699 // probes is known not to change. It can be *replaced*, but not changed. 700 // so it's safe to iterate it 701 for _, probe := range *probes { 702 probe := probe 703 go func() { 704 // Avoid querying the same server twice at the same time. If previous read is still there, 705 // we avoid re-reading it. 706 if !atomic.CompareAndSwapInt64(&probe.QueryInProgress, 0, 1) { 707 return 708 } 709 defer atomic.StoreInt64(&probe.QueryInProgress, 0) 710 711 var throttleMetricFunc func() *mysql.MySQLThrottleMetric 712 if clusterName == selfStoreName { 713 throttleMetricFunc = throttler.generateSelfMySQLThrottleMetricFunc(ctx, probe) 714 } else { 715 throttleMetricFunc = throttler.generateTabletHTTPProbeFunction(ctx, clusterName, probe) 716 } 717 throttleMetrics := mysql.ReadThrottleMetric(probe, clusterName, throttleMetricFunc) 718 throttler.mysqlThrottleMetricChan <- throttleMetrics 719 }() 720 } 721 }() 722 } 723 return nil 724 } 725 726 // refreshMySQLInventory will re-structure the inventory based on reading config settings 727 func (throttler *Throttler) refreshMySQLInventory(ctx context.Context) error { 728 729 // distribute the query/threshold from the throttler down to the cluster settings and from there to the probes 730 metricsQuery := throttler.GetMetricsQuery() 731 metricsThreshold := throttler.MetricsThreshold.Get() 732 addInstanceKey := func(tabletHost string, tabletPort int, key *mysql.InstanceKey, clusterName string, clusterSettings *config.MySQLClusterConfigurationSettings, probes *mysql.Probes) { 733 for _, ignore := range clusterSettings.IgnoreHosts { 734 if strings.Contains(key.StringCode(), ignore) { 735 log.Infof("Throttler: instance key ignored: %+v", key) 736 return 737 } 738 } 739 if !key.IsValid() && !key.IsSelf() { 740 log.Infof("Throttler: read invalid instance key: [%+v] for cluster %+v", key, clusterName) 741 return 742 } 743 744 probe := &mysql.Probe{ 745 Key: *key, 746 TabletHost: tabletHost, 747 TabletPort: tabletPort, 748 MetricQuery: clusterSettings.MetricQuery, 749 CacheMillis: clusterSettings.CacheMillis, 750 } 751 (*probes)[*key] = probe 752 } 753 754 for clusterName, clusterSettings := range config.Settings().Stores.MySQL.Clusters { 755 clusterName := clusterName 756 clusterSettings := clusterSettings 757 clusterSettings.MetricQuery = metricsQuery 758 clusterSettings.ThrottleThreshold.Set(metricsThreshold) 759 // config may dynamically change, but internal structure (config.Settings().Stores.MySQL.Clusters in our case) 760 // is immutable and can only be _replaced_. Hence, it's safe to read in a goroutine: 761 go func() { 762 throttler.mysqlClusterThresholds.Set(clusterName, clusterSettings.ThrottleThreshold.Get(), cache.DefaultExpiration) 763 clusterProbes := &mysql.ClusterProbes{ 764 ClusterName: clusterName, 765 IgnoreHostsCount: clusterSettings.IgnoreHostsCount, 766 InstanceProbes: mysql.NewProbes(), 767 } 768 769 if clusterName == selfStoreName { 770 // special case: just looking at this tablet's MySQL server 771 // We will probe this "cluster" (of one server) is a special way. 772 addInstanceKey("", 0, mysql.SelfInstanceKey, clusterName, clusterSettings, clusterProbes.InstanceProbes) 773 throttler.mysqlClusterProbesChan <- clusterProbes 774 return 775 } 776 if atomic.LoadInt64(&throttler.isLeader) == 0 { 777 // not the leader (primary tablet)? Then no more work for us. 778 return 779 } 780 // The primary tablet is also in charge of collecting the shard's metrics 781 err := func() error { 782 tabletAliases, err := throttler.ts.FindAllTabletAliasesInShard(ctx, throttler.keyspace, throttler.shard) 783 if err != nil { 784 return err 785 } 786 for _, tabletAlias := range tabletAliases { 787 tablet, err := throttler.ts.GetTablet(ctx, tabletAlias) 788 if err != nil { 789 return err 790 } 791 if throttler.throttleTabletTypesMap[tablet.Type] { 792 key := mysql.InstanceKey{Hostname: tablet.MysqlHostname, Port: int(tablet.MysqlPort)} 793 addInstanceKey(tablet.Hostname, int(tablet.PortMap["vt"]), &key, clusterName, clusterSettings, clusterProbes.InstanceProbes) 794 } 795 } 796 throttler.mysqlClusterProbesChan <- clusterProbes 797 return nil 798 }() 799 if err != nil { 800 log.Errorf("refreshMySQLInventory: %+v", err) 801 } 802 }() 803 } 804 return nil 805 } 806 807 // synchronous update of inventory 808 func (throttler *Throttler) updateMySQLClusterProbes(ctx context.Context, clusterProbes *mysql.ClusterProbes) error { 809 throttler.mysqlInventory.ClustersProbes[clusterProbes.ClusterName] = clusterProbes.InstanceProbes 810 throttler.mysqlInventory.IgnoreHostsCount[clusterProbes.ClusterName] = clusterProbes.IgnoreHostsCount 811 throttler.mysqlInventory.IgnoreHostsThreshold[clusterProbes.ClusterName] = clusterProbes.IgnoreHostsThreshold 812 return nil 813 } 814 815 // synchronous aggregation of collected data 816 func (throttler *Throttler) aggregateMySQLMetrics(ctx context.Context) error { 817 for clusterName, probes := range throttler.mysqlInventory.ClustersProbes { 818 metricName := fmt.Sprintf("mysql/%s", clusterName) 819 ignoreHostsCount := throttler.mysqlInventory.IgnoreHostsCount[clusterName] 820 ignoreHostsThreshold := throttler.mysqlInventory.IgnoreHostsThreshold[clusterName] 821 aggregatedMetric := aggregateMySQLProbes(ctx, probes, clusterName, throttler.mysqlInventory.InstanceKeyMetrics, ignoreHostsCount, config.Settings().Stores.MySQL.IgnoreDialTCPErrors, ignoreHostsThreshold) 822 throttler.aggregatedMetrics.Set(metricName, aggregatedMetric, cache.DefaultExpiration) 823 } 824 return nil 825 } 826 827 func (throttler *Throttler) getNamedMetric(metricName string) base.MetricResult { 828 if metricResultVal, found := throttler.aggregatedMetrics.Get(metricName); found { 829 return metricResultVal.(base.MetricResult) 830 } 831 return base.NoSuchMetric 832 } 833 834 func (throttler *Throttler) getMySQLClusterMetrics(ctx context.Context, clusterName string) (base.MetricResult, float64) { 835 if thresholdVal, found := throttler.mysqlClusterThresholds.Get(clusterName); found { 836 threshold, _ := thresholdVal.(float64) 837 metricName := fmt.Sprintf("mysql/%s", clusterName) 838 return throttler.getNamedMetric(metricName), threshold 839 } 840 841 return base.NoSuchMetric, 0 842 } 843 844 func (throttler *Throttler) aggregatedMetricsSnapshot() map[string]base.MetricResult { 845 snapshot := make(map[string]base.MetricResult) 846 for key, value := range throttler.aggregatedMetrics.Items() { 847 metricResult, _ := value.Object.(base.MetricResult) 848 snapshot[key] = metricResult 849 } 850 return snapshot 851 } 852 853 func (throttler *Throttler) expireThrottledApps() { 854 now := time.Now() 855 for appName, item := range throttler.throttledApps.Items() { 856 appThrottle := item.Object.(*base.AppThrottle) 857 if appThrottle.ExpireAt.Before(now) { 858 throttler.UnthrottleApp(appName) 859 } 860 } 861 } 862 863 // ThrottleApp instructs the throttler to begin throttling an app, to som eperiod and with some ratio. 864 func (throttler *Throttler) ThrottleApp(appName string, expireAt time.Time, ratio float64) (appThrottle *base.AppThrottle) { 865 throttler.throttledAppsMutex.Lock() 866 defer throttler.throttledAppsMutex.Unlock() 867 868 now := time.Now() 869 if object, found := throttler.throttledApps.Get(appName); found { 870 appThrottle = object.(*base.AppThrottle) 871 if !expireAt.IsZero() { 872 appThrottle.ExpireAt = expireAt 873 } 874 if ratio >= 0 { 875 appThrottle.Ratio = ratio 876 } 877 } else { 878 if expireAt.IsZero() { 879 expireAt = now.Add(defaultThrottleTTLMinutes * time.Minute) 880 } 881 if ratio < 0 { 882 ratio = defaultThrottleRatio 883 } 884 appThrottle = base.NewAppThrottle(appName, expireAt, ratio) 885 } 886 if now.Before(appThrottle.ExpireAt) { 887 throttler.throttledApps.Set(appName, appThrottle, cache.DefaultExpiration) 888 } else { 889 throttler.UnthrottleApp(appName) 890 } 891 return appThrottle 892 } 893 894 // UnthrottleApp cancels any throttling, if any, for a given app 895 func (throttler *Throttler) UnthrottleApp(appName string) (appThrottle *base.AppThrottle) { 896 throttler.throttledApps.Delete(appName) 897 // the app is likely to check 898 go throttler.heartbeatWriter.RequestHeartbeats() 899 return base.NewAppThrottle(appName, time.Now(), 0) 900 } 901 902 // IsAppThrottled tells whether some app should be throttled. 903 // Assuming an app is throttled to some extend, it will randomize the result based 904 // on the throttle ratio 905 func (throttler *Throttler) IsAppThrottled(appName string) bool { 906 isSingleAppNameThrottled := func(singleAppName string) bool { 907 if object, found := throttler.throttledApps.Get(singleAppName); found { 908 appThrottle := object.(*base.AppThrottle) 909 if appThrottle.ExpireAt.Before(time.Now()) { 910 // throttling cleanup hasn't purged yet, but it is expired 911 return false 912 } 913 // handle ratio 914 if rand.Float64() < appThrottle.Ratio { 915 return true 916 } 917 } 918 return false 919 } 920 if isSingleAppNameThrottled(appName) { 921 return true 922 } 923 for _, singleAppName := range strings.Split(appName, ":") { 924 if singleAppName == "" { 925 continue 926 } 927 if isSingleAppNameThrottled(singleAppName) { 928 return true 929 } 930 } 931 return false 932 } 933 934 // ThrottledAppsMap returns a (copy) map of currently throttled apps 935 func (throttler *Throttler) ThrottledAppsMap() (result map[string](*base.AppThrottle)) { 936 result = make(map[string](*base.AppThrottle)) 937 938 for appName, item := range throttler.throttledApps.Items() { 939 appThrottle := item.Object.(*base.AppThrottle) 940 result[appName] = appThrottle 941 } 942 return result 943 } 944 945 // markRecentApp takes note that an app has just asked about throttling, making it "recent" 946 func (throttler *Throttler) markRecentApp(appName string, remoteAddr string) { 947 recentAppKey := fmt.Sprintf("%s/%s", appName, remoteAddr) 948 throttler.recentApps.Set(recentAppKey, time.Now(), cache.DefaultExpiration) 949 } 950 951 // RecentAppsMap returns a (copy) map of apps which checked for throttling recently 952 func (throttler *Throttler) RecentAppsMap() (result map[string](*base.RecentApp)) { 953 result = make(map[string](*base.RecentApp)) 954 955 for recentAppKey, item := range throttler.recentApps.Items() { 956 recentApp := base.NewRecentApp(item.Object.(time.Time)) 957 result[recentAppKey] = recentApp 958 } 959 return result 960 } 961 962 // markMetricHealthy will mark the time "now" as the last time a given metric was checked to be "OK" 963 func (throttler *Throttler) markMetricHealthy(metricName string) { 964 throttler.metricsHealth.Set(metricName, time.Now(), cache.DefaultExpiration) 965 } 966 967 // timeSinceMetricHealthy returns time elapsed since the last time a metric checked "OK" 968 func (throttler *Throttler) timeSinceMetricHealthy(metricName string) (timeSinceHealthy time.Duration, found bool) { 969 if lastOKTime, found := throttler.metricsHealth.Get(metricName); found { 970 return time.Since(lastOKTime.(time.Time)), true 971 } 972 return 0, false 973 } 974 975 func (throttler *Throttler) metricsHealthSnapshot() base.MetricHealthMap { 976 snapshot := make(base.MetricHealthMap) 977 for key, value := range throttler.metricsHealth.Items() { 978 lastHealthyAt, _ := value.Object.(time.Time) 979 snapshot[key] = base.NewMetricHealth(lastHealthyAt) 980 } 981 return snapshot 982 } 983 984 // AppRequestMetricResult gets a metric result in the context of a specific app 985 func (throttler *Throttler) AppRequestMetricResult(ctx context.Context, appName string, metricResultFunc base.MetricResultFunc, denyApp bool) (metricResult base.MetricResult, threshold float64) { 986 if denyApp { 987 return base.AppDeniedMetric, 0 988 } 989 if throttler.IsAppThrottled(appName) { 990 return base.AppDeniedMetric, 0 991 } 992 return metricResultFunc() 993 } 994 995 // checkStore checks the aggregated value of given MySQL store 996 func (throttler *Throttler) checkStore(ctx context.Context, appName string, storeName string, remoteAddr string, flags *CheckFlags) (checkResult *CheckResult) { 997 if !throttler.IsEnabled() { 998 return okMetricCheckResult 999 } 1000 return throttler.check.Check(ctx, appName, "mysql", storeName, remoteAddr, flags) 1001 } 1002 1003 // checkShard checks the health of the shard, and runs on the primary tablet only 1004 func (throttler *Throttler) checkShard(ctx context.Context, appName string, remoteAddr string, flags *CheckFlags) (checkResult *CheckResult) { 1005 return throttler.checkStore(ctx, appName, shardStoreName, remoteAddr, flags) 1006 } 1007 1008 // CheckSelf is checks the mysql/self metric, and is available on each tablet 1009 func (throttler *Throttler) checkSelf(ctx context.Context, appName string, remoteAddr string, flags *CheckFlags) (checkResult *CheckResult) { 1010 return throttler.checkStore(ctx, appName, selfStoreName, remoteAddr, flags) 1011 } 1012 1013 // CheckByType runs a check by requested check type 1014 func (throttler *Throttler) CheckByType(ctx context.Context, appName string, remoteAddr string, flags *CheckFlags, checkType ThrottleCheckType) (checkResult *CheckResult) { 1015 if throttler.IsEnabled() && !flags.SkipRequestHeartbeats { 1016 go throttler.heartbeatWriter.RequestHeartbeats() 1017 } 1018 switch checkType { 1019 case ThrottleCheckSelf: 1020 return throttler.checkSelf(ctx, appName, remoteAddr, flags) 1021 case ThrottleCheckPrimaryWrite: 1022 if throttlerCheckAsCheckSelf { 1023 return throttler.checkSelf(ctx, appName, remoteAddr, flags) 1024 } 1025 return throttler.checkShard(ctx, appName, remoteAddr, flags) 1026 default: 1027 return invalidCheckTypeCheckResult 1028 } 1029 } 1030 1031 // Status exports a status breakdown 1032 func (throttler *Throttler) Status() *ThrottlerStatus { 1033 return &ThrottlerStatus{ 1034 Keyspace: throttler.keyspace, 1035 Shard: throttler.shard, 1036 1037 IsLeader: (atomic.LoadInt64(&throttler.isLeader) > 0), 1038 IsOpen: (atomic.LoadInt64(&throttler.isOpen) > 0), 1039 IsEnabled: (atomic.LoadInt64(&throttler.isEnabled) > 0), 1040 IsDormant: throttler.isDormant(), 1041 1042 Query: throttler.GetMetricsQuery(), 1043 Threshold: throttler.GetMetricsThreshold(), 1044 1045 AggregatedMetrics: throttler.aggregatedMetricsSnapshot(), 1046 MetricsHealth: throttler.metricsHealthSnapshot(), 1047 } 1048 }