github.com/gravitational/teleport/api@v0.0.0-20240507183017-3110591cbafc/breaker/breaker.go (about) 1 // Copyright 2022 Gravitational, Inc 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package breaker 16 17 import ( 18 "fmt" 19 "net/http" 20 "sync" 21 "time" 22 23 "github.com/gravitational/trace" 24 "github.com/jonboulle/clockwork" 25 "google.golang.org/grpc/codes" 26 "google.golang.org/grpc/status" 27 28 "github.com/gravitational/teleport/api/defaults" 29 "github.com/gravitational/teleport/api/utils/retryutils" 30 ) 31 32 // Metrics tallies success and failure counts 33 // for all executions performed by a CircuitBreaker 34 type Metrics struct { 35 // Executions the total number of times the breaker has executed within the interval 36 Executions uint32 37 // Successes the number of successful executions 38 Successes uint32 39 // Failures the total number of failed executions 40 Failures uint32 41 // ConsecutiveSuccesses the number of consecutive successful executions 42 ConsecutiveSuccesses uint32 43 // ConsecutiveFailures the number of consecutive failed executions 44 ConsecutiveFailures uint32 45 } 46 47 func (m *Metrics) String() string { 48 return fmt.Sprintf("Metrics(executions=%d, successes=%d, failures=%d, consecutiveSuccesses=%d, consecutiveFailures=%d)", m.Executions, m.Successes, m.Failures, m.ConsecutiveSuccesses, m.ConsecutiveFailures) 49 } 50 51 // reset restores all counts to zero 52 func (m *Metrics) reset() { 53 *m = Metrics{} 54 } 55 56 // success increments the counters tracking successful executions 57 // and resets the ConsecutiveFailures count 58 func (m *Metrics) success() { 59 m.Successes++ 60 m.ConsecutiveSuccesses++ 61 m.ConsecutiveFailures = 0 62 } 63 64 // failure increments the counters tracking failed executions 65 // and resets the ConsecutiveSuccesses count 66 func (m *Metrics) failure() { 67 m.Failures++ 68 m.ConsecutiveFailures++ 69 m.ConsecutiveSuccesses = 0 70 } 71 72 // execute increments Executions 73 func (m *Metrics) execute() { 74 m.Executions++ 75 } 76 77 // State represents an operating state that a CircuitBreaker may be in. 78 type State int 79 80 const ( 81 // StateStandby indicates the breaker is passing all requests and watching stats 82 StateStandby State = iota 83 // StateTripped indicates too many errors have occurred and requests are actively being rejected 84 StateTripped 85 // StateRecovering indicates the breaker is allowing some requests to go through and rejecting others 86 StateRecovering 87 ) 88 89 // String returns the string representation of a State 90 func (s State) String() string { 91 switch s { 92 case StateStandby: 93 return "standby" 94 case StateTripped: 95 return "tripped" 96 case StateRecovering: 97 return "recovering" 98 default: 99 return fmt.Sprintf("undefined(%v)", int(s)) 100 } 101 } 102 103 // ErrStateTripped will be returned from executions performed while the CircuitBreaker 104 // is in StateTripped 105 var ErrStateTripped = &trace.ConnectionProblemError{Message: "breaker is tripped"} 106 107 // Config contains configuration of the CircuitBreaker 108 type Config struct { 109 // Clock is used to control time - mainly used for testing 110 Clock clockwork.Clock 111 // Interval is the period of time that execution metrics will be collected for within StateStandby before 112 // transitioning to the next generation. 113 Interval time.Duration 114 // TrippedPeriod is the amount of time to remain in StateTripped before transitioning 115 // into StateRecovering 116 TrippedPeriod time.Duration 117 // Recover specifies the TripFn that will be used to determine if the CircuitBreaker should transition from 118 // StateRecovering to StateTripped. This is required to be supplied, failure to do so will result in an error 119 // creating the CircuitBreaker. 120 Recover TripFn 121 // RecoveryLimit is the number on consecutive successful executions required to transition from 122 // StateRecovering to StateStandby 123 RecoveryLimit uint32 124 // Trip specifies the TripFn that will be used to determine if the CircuitBreaker should transition from 125 // StateStandby to StateTripped. This is required to be supplied, failure to do so will result in an error 126 // creating the CircuitBreaker. 127 Trip TripFn 128 // OnTripped will be called when the CircuitBreaker enters the StateTripped 129 // state; this callback is called while holding a lock, so it should return 130 // quickly. 131 OnTripped func() 132 // OnStandby will be called when the CircuitBreaker returns to the 133 // StateStandby state; this callback is called while holding a lock, so it 134 // should return quickly. 135 OnStandBy func() 136 // OnExecute will be called once for each execution, and given the result 137 // and the current state of the breaker state; this callback is called while 138 // holding a lock, so it should return quickly. 139 OnExecute func(success bool, state State) 140 // IsSuccessful is used by the CircuitBreaker to determine if the executed function was successful or not 141 IsSuccessful func(v interface{}, err error) bool 142 // TrippedErrorMessage is an optional message to use as the error message when the CircuitBreaker 143 // is tripped. Defaults to ErrStateTripped if not provided. 144 TrippedErrorMessage string 145 } 146 147 // Clone returns a clone of the Config. 148 func (c *Config) Clone() Config { 149 // the current Config can just be copied without issues 150 return *c 151 } 152 153 // TripFn determines if the CircuitBreaker should be tripped based 154 // on the state of the provided Metrics. A return value of true will 155 // cause the CircuitBreaker to transition into the StateTripped state 156 type TripFn = func(m Metrics) bool 157 158 // StaticTripper is a TripFn that always returns the provided value 159 // regardless of the Metrics. Useful for testing. 160 func StaticTripper(b bool) TripFn { 161 return func(m Metrics) bool { 162 return b 163 } 164 } 165 166 // RatioTripper is a TripFn that returns true it the error ratio 167 // is greater than the provided ratio and there have been at least 168 // minExecutions performed. 169 func RatioTripper(ratio float64, minExecutions uint32) TripFn { 170 return func(m Metrics) bool { 171 if m.Executions < minExecutions { 172 return false 173 } 174 175 r := float64(m.Failures) / float64(m.Executions) 176 return r >= ratio 177 } 178 } 179 180 // ConsecutiveFailureTripper is a TripFn that will return true if 181 // Metrics.ConsecutiveFailures is greater than the provided value. 182 func ConsecutiveFailureTripper(max uint32) TripFn { 183 return func(m Metrics) bool { 184 return m.ConsecutiveFailures > max 185 } 186 } 187 188 // NonNilErrorIsSuccess returns true if the provided error is non nil. This 189 // is the default value for Config.IsSuccessful if not provided. 190 func NonNilErrorIsSuccess(_ interface{}, err error) bool { 191 return err == nil 192 } 193 194 // IsResponseSuccessful determines whether the error provided should be ignored by the circuit breaker. This checks 195 // for http status codes < 500 and a few unsuccessful gRPC status codes. 196 func IsResponseSuccessful(v interface{}, err error) bool { 197 switch t := v.(type) { 198 case nil: 199 break 200 case *http.Response: 201 if t == nil { 202 break 203 } 204 return t.StatusCode < http.StatusInternalServerError 205 } 206 207 code := status.Code(err) 208 switch { 209 case err == nil: 210 return true 211 case code == codes.Canceled || code == codes.Unknown || code == codes.Unavailable || code == codes.DeadlineExceeded: 212 return false 213 default: 214 return true 215 } 216 } 217 218 func DefaultBreakerConfig(clock clockwork.Clock) Config { 219 return Config{ 220 Clock: clock, 221 Interval: defaults.BreakerInterval, 222 Trip: RatioTripper(defaults.BreakerRatio, defaults.BreakerRatioMinExecutions), 223 Recover: RatioTripper(defaults.BreakerRatio/2, defaults.BreakerRatioMinExecutions/3), 224 IsSuccessful: IsResponseSuccessful, 225 } 226 } 227 228 func NoopBreakerConfig() Config { 229 return Config{ 230 Interval: defaults.BreakerInterval, 231 Trip: StaticTripper(false), 232 Recover: StaticTripper(false), 233 IsSuccessful: func(v interface{}, err error) bool { return true }, 234 } 235 } 236 237 // CheckAndSetDefaults checks and sets default config values. 238 func (c *Config) CheckAndSetDefaults() error { 239 if c.Clock == nil { 240 c.Clock = clockwork.NewRealClock() 241 } 242 243 if c.Interval <= 0 { 244 return trace.BadParameter("CircuitBreaker Interval must be set") 245 } 246 247 if c.Trip == nil { 248 return trace.BadParameter("CircuitBreaker Trip must be set") 249 } 250 if c.Recover == nil { 251 return trace.BadParameter("CircuitBreaker Recover must be set") 252 } 253 254 if c.TrippedPeriod <= 0 { 255 c.TrippedPeriod = defaults.TrippedPeriod 256 } 257 258 if c.RecoveryLimit <= 0 { 259 c.RecoveryLimit = defaults.RecoveryLimit 260 } 261 262 if c.OnTripped == nil { 263 c.OnTripped = func() {} 264 } 265 266 if c.OnStandBy == nil { 267 c.OnStandBy = func() {} 268 } 269 270 if c.OnExecute == nil { 271 c.OnExecute = func(bool, State) {} 272 } 273 274 if c.IsSuccessful == nil { 275 c.IsSuccessful = NonNilErrorIsSuccess 276 } 277 278 c.TrippedPeriod = retryutils.NewSeventhJitter()(c.TrippedPeriod) 279 280 return nil 281 } 282 283 // CircuitBreaker implements the circuit breaker pattern 284 type CircuitBreaker struct { 285 cfg Config 286 287 mu sync.Mutex 288 state State 289 generation uint64 290 metrics Metrics 291 expiry time.Time 292 } 293 294 func NewNoop() *CircuitBreaker { 295 return &CircuitBreaker{ 296 cfg: NoopBreakerConfig(), 297 } 298 } 299 300 // New returns a CircuitBreaker configured with the provided Config 301 func New(cfg Config) (*CircuitBreaker, error) { 302 if err := cfg.CheckAndSetDefaults(); err != nil { 303 return nil, err 304 } 305 306 cb := CircuitBreaker{cfg: cfg} 307 cb.nextGeneration(cfg.Clock.Now()) 308 309 return &cb, nil 310 } 311 312 // Execute calls the provided function depending on the CircuitBreaker state. 313 // - StateStandby: all functions are executed. 314 // - StateTripped: no functions are executed and ErrStateTripped is returned. 315 // - StateRecovering: some functions are executed, some functions are not, 316 // when not executed ErrLimitExceeded is returned. 317 // 318 // The CircuitBreaker state is updated according to the outcome of executing the 319 // provided function and the current state. See package docs for a more detailed 320 // explanation of state transitions. 321 func (c *CircuitBreaker) Execute(f func() (interface{}, error)) (interface{}, error) { 322 generation, err := c.beforeExecution() 323 if err != nil { 324 return nil, err 325 } 326 327 v, err := f() 328 329 c.afterExecution(generation, v, err) 330 331 return v, err 332 } 333 334 // beforeExecution checks the current state to determine if a new generation 335 // should be created and whether Execute is allowed to proceed. 336 func (c *CircuitBreaker) beforeExecution() (uint64, error) { 337 c.mu.Lock() 338 defer c.mu.Unlock() 339 340 now := c.cfg.Clock.Now() 341 342 generation, state := c.currentState(now) 343 344 if state == StateTripped { 345 c.cfg.OnExecute(false, StateTripped) 346 347 if c.cfg.TrippedErrorMessage != "" { 348 return generation, trace.ConnectionProblem(nil, c.cfg.TrippedErrorMessage) 349 } 350 351 return generation, trace.Wrap(ErrStateTripped) 352 } 353 354 c.metrics.execute() 355 return generation, nil 356 } 357 358 // afterExecution updates the CircuitBreaker state based on the outcome of 359 // processing the fn in Execute. 360 func (c *CircuitBreaker) afterExecution(prior uint64, v interface{}, err error) { 361 c.mu.Lock() 362 defer c.mu.Unlock() 363 364 now := c.cfg.Clock.Now() 365 366 generation, state := c.currentState(now) 367 if generation != prior { 368 return 369 } 370 371 if c.cfg.IsSuccessful(v, err) { 372 c.successLocked(state, now) 373 } else { 374 c.failureLocked(state, now) 375 } 376 } 377 378 // successLocked tallies a successful execution and migrates to StateStandby 379 // if in another state and criteria has been met to transition 380 func (c *CircuitBreaker) successLocked(state State, t time.Time) { 381 switch state { 382 case StateStandby: 383 c.cfg.OnExecute(true, StateStandby) 384 c.metrics.success() 385 case StateRecovering: 386 c.cfg.OnExecute(true, StateRecovering) 387 c.metrics.success() 388 if c.metrics.ConsecutiveSuccesses >= c.cfg.RecoveryLimit { 389 c.setState(StateStandby, t) 390 go c.cfg.OnStandBy() 391 } 392 } 393 } 394 395 // failureLocked tallies a failed execution and migrate to StateTripped 396 // if in another state and criteria has been met to transition 397 func (c *CircuitBreaker) failureLocked(state State, t time.Time) { 398 c.metrics.failure() 399 400 switch state { 401 case StateRecovering: 402 c.cfg.OnExecute(false, StateRecovering) 403 if c.cfg.Recover(c.metrics) { 404 c.setState(StateTripped, t) 405 } 406 case StateStandby: 407 c.cfg.OnExecute(false, StateStandby) 408 if c.cfg.Trip(c.metrics) { 409 c.setState(StateTripped, t) 410 go c.cfg.OnTripped() 411 } 412 } 413 } 414 415 // setState updates the state and creates a new generation if the 416 // provided state is different from the CircuitBreakers current state 417 func (c *CircuitBreaker) setState(s State, t time.Time) { 418 if c.state == s { 419 return 420 } 421 422 c.state = s 423 c.nextGeneration(t) 424 } 425 426 // currentState returns the state of the CircuitBreaker 427 func (c *CircuitBreaker) currentState(t time.Time) (uint64, State) { 428 switch { 429 case c.state == StateTripped && c.expiry.Before(t): 430 c.setState(StateRecovering, t) 431 case c.state == StateStandby && !c.expiry.IsZero() && c.expiry.Before(t): 432 c.nextGeneration(t) 433 } 434 435 return c.generation, c.state 436 } 437 438 // nextGeneration creates a new generation and adjusts its expiration 439 // based on the current state 440 func (c *CircuitBreaker) nextGeneration(t time.Time) { 441 c.metrics.reset() 442 c.generation++ 443 444 switch c.state { 445 case StateRecovering: 446 c.expiry = time.Time{} 447 case StateTripped: 448 c.expiry = t.Add(c.cfg.TrippedPeriod) 449 case StateStandby: 450 c.expiry = t.Add(c.cfg.Interval) 451 } 452 }