github.com/gravitational/teleport/api@v0.0.0-20240507183017-3110591cbafc/breaker/breaker.go (about)

     1  // Copyright 2022 Gravitational, Inc
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package breaker
    16  
    17  import (
    18  	"fmt"
    19  	"net/http"
    20  	"sync"
    21  	"time"
    22  
    23  	"github.com/gravitational/trace"
    24  	"github.com/jonboulle/clockwork"
    25  	"google.golang.org/grpc/codes"
    26  	"google.golang.org/grpc/status"
    27  
    28  	"github.com/gravitational/teleport/api/defaults"
    29  	"github.com/gravitational/teleport/api/utils/retryutils"
    30  )
    31  
    32  // Metrics tallies success and failure counts
    33  // for all executions performed by a CircuitBreaker
    34  type Metrics struct {
    35  	// Executions the total number of times the breaker has executed within the interval
    36  	Executions uint32
    37  	// Successes the number of successful executions
    38  	Successes uint32
    39  	// Failures the total number of failed executions
    40  	Failures uint32
    41  	// ConsecutiveSuccesses the number of consecutive successful executions
    42  	ConsecutiveSuccesses uint32
    43  	// ConsecutiveFailures the number of consecutive failed executions
    44  	ConsecutiveFailures uint32
    45  }
    46  
    47  func (m *Metrics) String() string {
    48  	return fmt.Sprintf("Metrics(executions=%d, successes=%d, failures=%d, consecutiveSuccesses=%d, consecutiveFailures=%d)", m.Executions, m.Successes, m.Failures, m.ConsecutiveSuccesses, m.ConsecutiveFailures)
    49  }
    50  
    51  // reset restores all counts to zero
    52  func (m *Metrics) reset() {
    53  	*m = Metrics{}
    54  }
    55  
    56  // success increments the counters tracking successful executions
    57  // and resets the ConsecutiveFailures count
    58  func (m *Metrics) success() {
    59  	m.Successes++
    60  	m.ConsecutiveSuccesses++
    61  	m.ConsecutiveFailures = 0
    62  }
    63  
    64  // failure increments the counters tracking failed executions
    65  // and resets the ConsecutiveSuccesses count
    66  func (m *Metrics) failure() {
    67  	m.Failures++
    68  	m.ConsecutiveFailures++
    69  	m.ConsecutiveSuccesses = 0
    70  }
    71  
    72  // execute increments Executions
    73  func (m *Metrics) execute() {
    74  	m.Executions++
    75  }
    76  
    77  // State represents an operating state that a CircuitBreaker may be in.
    78  type State int
    79  
    80  const (
    81  	// StateStandby indicates the breaker is passing all requests and watching stats
    82  	StateStandby State = iota
    83  	// StateTripped indicates too many errors have occurred and requests are actively being rejected
    84  	StateTripped
    85  	// StateRecovering indicates the breaker is allowing some requests to go through and rejecting others
    86  	StateRecovering
    87  )
    88  
    89  // String returns the string representation of a State
    90  func (s State) String() string {
    91  	switch s {
    92  	case StateStandby:
    93  		return "standby"
    94  	case StateTripped:
    95  		return "tripped"
    96  	case StateRecovering:
    97  		return "recovering"
    98  	default:
    99  		return fmt.Sprintf("undefined(%v)", int(s))
   100  	}
   101  }
   102  
// ErrStateTripped is the error reported for executions that are rejected
// because the CircuitBreaker is in StateTripped. When
// Config.TrippedErrorMessage is set, a connection problem error carrying
// that message is returned instead of this value.
var ErrStateTripped = &trace.ConnectionProblemError{Message: "breaker is tripped"}
   106  
// Config contains configuration of the CircuitBreaker
type Config struct {
	// Clock is used to control time - mainly used for testing.
	// CheckAndSetDefaults falls back to a real clock when it is nil.
	Clock clockwork.Clock
	// Interval is the period of time that execution metrics will be collected for within StateStandby before
	// transitioning to the next generation. It must be greater than zero.
	Interval time.Duration
	// TrippedPeriod is the amount of time to remain in StateTripped before transitioning
	// into StateRecovering. CheckAndSetDefaults substitutes defaults.TrippedPeriod for a
	// non-positive value and applies jitter to the result.
	TrippedPeriod time.Duration
	// Recover specifies the TripFn that will be used to determine if the CircuitBreaker should transition from
	// StateRecovering to StateTripped. It is consulted after a failed execution while the breaker's lock is
	// held. This is required to be supplied, failure to do so will result in an error creating the
	// CircuitBreaker.
	Recover TripFn
	// RecoveryLimit is the number of consecutive successful executions required to transition from
	// StateRecovering to StateStandby. Zero is replaced with defaults.RecoveryLimit.
	RecoveryLimit uint32
	// Trip specifies the TripFn that will be used to determine if the CircuitBreaker should transition from
	// StateStandby to StateTripped. It is consulted after a failed execution while the breaker's lock is
	// held. This is required to be supplied, failure to do so will result in an error creating the
	// CircuitBreaker.
	Trip TripFn
	// OnTripped is invoked when the CircuitBreaker moves from StateStandby to
	// StateTripped. It is launched in its own goroutine, so it does not run
	// under the breaker's lock. It is not invoked when a recovering breaker
	// trips again.
	OnTripped func()
	// OnStandBy is invoked when the CircuitBreaker moves from StateRecovering
	// back to StateStandby. It is launched in its own goroutine, so it does
	// not run under the breaker's lock.
	OnStandBy func()
	// OnExecute will be called once for each execution, and given the result
	// and the current state of the breaker state; this callback is called while
	// holding a lock, so it should return quickly.
	OnExecute func(success bool, state State)
	// IsSuccessful is used by the CircuitBreaker to determine if the executed function was successful or not.
	// It is called while holding a lock, so it should return quickly. Defaults to NonNilErrorIsSuccess.
	IsSuccessful func(v interface{}, err error) bool
	// TrippedErrorMessage is an optional message to use as the error message when the CircuitBreaker
	// is tripped. Defaults to ErrStateTripped if not provided.
	TrippedErrorMessage string
}
   146  
   147  // Clone returns a clone of the Config.
   148  func (c *Config) Clone() Config {
   149  	// the current Config can just be copied without issues
   150  	return *c
   151  }
   152  
// TripFn decides, from a snapshot of the breaker's Metrics, whether the
// CircuitBreaker should be tripped. Returning true causes the
// CircuitBreaker to transition into StateTripped.
type TripFn = func(m Metrics) bool
   157  
   158  // StaticTripper is a TripFn that always returns the provided value
   159  // regardless of the Metrics. Useful for testing.
   160  func StaticTripper(b bool) TripFn {
   161  	return func(m Metrics) bool {
   162  		return b
   163  	}
   164  }
   165  
   166  // RatioTripper is a TripFn that returns true it the error ratio
   167  // is greater than the provided ratio and there have been at least
   168  // minExecutions performed.
   169  func RatioTripper(ratio float64, minExecutions uint32) TripFn {
   170  	return func(m Metrics) bool {
   171  		if m.Executions < minExecutions {
   172  			return false
   173  		}
   174  
   175  		r := float64(m.Failures) / float64(m.Executions)
   176  		return r >= ratio
   177  	}
   178  }
   179  
   180  // ConsecutiveFailureTripper is a TripFn that will return true if
   181  // Metrics.ConsecutiveFailures is greater than the provided value.
   182  func ConsecutiveFailureTripper(max uint32) TripFn {
   183  	return func(m Metrics) bool {
   184  		return m.ConsecutiveFailures > max
   185  	}
   186  }
   187  
   188  // NonNilErrorIsSuccess returns true if the provided error is non nil. This
   189  // is the default value for Config.IsSuccessful if not provided.
   190  func NonNilErrorIsSuccess(_ interface{}, err error) bool {
   191  	return err == nil
   192  }
   193  
   194  // IsResponseSuccessful determines whether the error provided should be ignored by the circuit breaker. This checks
   195  // for http status codes < 500 and a few unsuccessful gRPC status codes.
   196  func IsResponseSuccessful(v interface{}, err error) bool {
   197  	switch t := v.(type) {
   198  	case nil:
   199  		break
   200  	case *http.Response:
   201  		if t == nil {
   202  			break
   203  		}
   204  		return t.StatusCode < http.StatusInternalServerError
   205  	}
   206  
   207  	code := status.Code(err)
   208  	switch {
   209  	case err == nil:
   210  		return true
   211  	case code == codes.Canceled || code == codes.Unknown || code == codes.Unavailable || code == codes.DeadlineExceeded:
   212  		return false
   213  	default:
   214  		return true
   215  	}
   216  }
   217  
   218  func DefaultBreakerConfig(clock clockwork.Clock) Config {
   219  	return Config{
   220  		Clock:        clock,
   221  		Interval:     defaults.BreakerInterval,
   222  		Trip:         RatioTripper(defaults.BreakerRatio, defaults.BreakerRatioMinExecutions),
   223  		Recover:      RatioTripper(defaults.BreakerRatio/2, defaults.BreakerRatioMinExecutions/3),
   224  		IsSuccessful: IsResponseSuccessful,
   225  	}
   226  }
   227  
   228  func NoopBreakerConfig() Config {
   229  	return Config{
   230  		Interval:     defaults.BreakerInterval,
   231  		Trip:         StaticTripper(false),
   232  		Recover:      StaticTripper(false),
   233  		IsSuccessful: func(v interface{}, err error) bool { return true },
   234  	}
   235  }
   236  
   237  // CheckAndSetDefaults checks and sets default config values.
   238  func (c *Config) CheckAndSetDefaults() error {
   239  	if c.Clock == nil {
   240  		c.Clock = clockwork.NewRealClock()
   241  	}
   242  
   243  	if c.Interval <= 0 {
   244  		return trace.BadParameter("CircuitBreaker Interval must be set")
   245  	}
   246  
   247  	if c.Trip == nil {
   248  		return trace.BadParameter("CircuitBreaker Trip must be set")
   249  	}
   250  	if c.Recover == nil {
   251  		return trace.BadParameter("CircuitBreaker Recover must be set")
   252  	}
   253  
   254  	if c.TrippedPeriod <= 0 {
   255  		c.TrippedPeriod = defaults.TrippedPeriod
   256  	}
   257  
   258  	if c.RecoveryLimit <= 0 {
   259  		c.RecoveryLimit = defaults.RecoveryLimit
   260  	}
   261  
   262  	if c.OnTripped == nil {
   263  		c.OnTripped = func() {}
   264  	}
   265  
   266  	if c.OnStandBy == nil {
   267  		c.OnStandBy = func() {}
   268  	}
   269  
   270  	if c.OnExecute == nil {
   271  		c.OnExecute = func(bool, State) {}
   272  	}
   273  
   274  	if c.IsSuccessful == nil {
   275  		c.IsSuccessful = NonNilErrorIsSuccess
   276  	}
   277  
   278  	c.TrippedPeriod = retryutils.NewSeventhJitter()(c.TrippedPeriod)
   279  
   280  	return nil
   281  }
   282  
// CircuitBreaker implements the circuit breaker pattern
type CircuitBreaker struct {
	// cfg is the configuration the breaker was created with.
	cfg Config

	// mu is held by beforeExecution and afterExecution while they read and
	// update the fields below.
	mu sync.Mutex
	// state is the current operating state.
	state State
	// generation is incremented every time the metrics are reset; results of
	// executions started in an older generation are discarded.
	generation uint64
	// metrics holds the counters of the current generation.
	metrics Metrics
	// expiry is the time at which the current generation ends; the zero
	// value means the generation does not expire on its own.
	expiry time.Time
}
   293  
   294  func NewNoop() *CircuitBreaker {
   295  	return &CircuitBreaker{
   296  		cfg: NoopBreakerConfig(),
   297  	}
   298  }
   299  
   300  // New returns a CircuitBreaker configured with the provided Config
   301  func New(cfg Config) (*CircuitBreaker, error) {
   302  	if err := cfg.CheckAndSetDefaults(); err != nil {
   303  		return nil, err
   304  	}
   305  
   306  	cb := CircuitBreaker{cfg: cfg}
   307  	cb.nextGeneration(cfg.Clock.Now())
   308  
   309  	return &cb, nil
   310  }
   311  
   312  // Execute calls the provided function depending on the CircuitBreaker state.
   313  //   - StateStandby: all functions are executed.
   314  //   - StateTripped: no functions are executed and ErrStateTripped is returned.
   315  //   - StateRecovering: some functions are executed, some functions are not,
   316  //     when not executed ErrLimitExceeded is returned.
   317  //
   318  // The CircuitBreaker state is updated according to the outcome of executing the
   319  // provided function and the current state. See package docs for a more detailed
   320  // explanation of state transitions.
   321  func (c *CircuitBreaker) Execute(f func() (interface{}, error)) (interface{}, error) {
   322  	generation, err := c.beforeExecution()
   323  	if err != nil {
   324  		return nil, err
   325  	}
   326  
   327  	v, err := f()
   328  
   329  	c.afterExecution(generation, v, err)
   330  
   331  	return v, err
   332  }
   333  
   334  // beforeExecution checks the current state to determine if a new generation
   335  // should be created and whether Execute is allowed to proceed.
   336  func (c *CircuitBreaker) beforeExecution() (uint64, error) {
   337  	c.mu.Lock()
   338  	defer c.mu.Unlock()
   339  
   340  	now := c.cfg.Clock.Now()
   341  
   342  	generation, state := c.currentState(now)
   343  
   344  	if state == StateTripped {
   345  		c.cfg.OnExecute(false, StateTripped)
   346  
   347  		if c.cfg.TrippedErrorMessage != "" {
   348  			return generation, trace.ConnectionProblem(nil, c.cfg.TrippedErrorMessage)
   349  		}
   350  
   351  		return generation, trace.Wrap(ErrStateTripped)
   352  	}
   353  
   354  	c.metrics.execute()
   355  	return generation, nil
   356  }
   357  
   358  // afterExecution updates the CircuitBreaker state based on the outcome of
   359  // processing the fn in Execute.
   360  func (c *CircuitBreaker) afterExecution(prior uint64, v interface{}, err error) {
   361  	c.mu.Lock()
   362  	defer c.mu.Unlock()
   363  
   364  	now := c.cfg.Clock.Now()
   365  
   366  	generation, state := c.currentState(now)
   367  	if generation != prior {
   368  		return
   369  	}
   370  
   371  	if c.cfg.IsSuccessful(v, err) {
   372  		c.successLocked(state, now)
   373  	} else {
   374  		c.failureLocked(state, now)
   375  	}
   376  }
   377  
   378  // successLocked tallies a successful execution and migrates to StateStandby
   379  // if in another state and criteria has been met to transition
   380  func (c *CircuitBreaker) successLocked(state State, t time.Time) {
   381  	switch state {
   382  	case StateStandby:
   383  		c.cfg.OnExecute(true, StateStandby)
   384  		c.metrics.success()
   385  	case StateRecovering:
   386  		c.cfg.OnExecute(true, StateRecovering)
   387  		c.metrics.success()
   388  		if c.metrics.ConsecutiveSuccesses >= c.cfg.RecoveryLimit {
   389  			c.setState(StateStandby, t)
   390  			go c.cfg.OnStandBy()
   391  		}
   392  	}
   393  }
   394  
   395  // failureLocked tallies a failed execution and migrate to StateTripped
   396  // if in another state and criteria has been met to transition
   397  func (c *CircuitBreaker) failureLocked(state State, t time.Time) {
   398  	c.metrics.failure()
   399  
   400  	switch state {
   401  	case StateRecovering:
   402  		c.cfg.OnExecute(false, StateRecovering)
   403  		if c.cfg.Recover(c.metrics) {
   404  			c.setState(StateTripped, t)
   405  		}
   406  	case StateStandby:
   407  		c.cfg.OnExecute(false, StateStandby)
   408  		if c.cfg.Trip(c.metrics) {
   409  			c.setState(StateTripped, t)
   410  			go c.cfg.OnTripped()
   411  		}
   412  	}
   413  }
   414  
   415  // setState updates the state and creates a new generation if the
   416  // provided state is different from the CircuitBreakers current state
   417  func (c *CircuitBreaker) setState(s State, t time.Time) {
   418  	if c.state == s {
   419  		return
   420  	}
   421  
   422  	c.state = s
   423  	c.nextGeneration(t)
   424  }
   425  
   426  // currentState returns the state of the CircuitBreaker
   427  func (c *CircuitBreaker) currentState(t time.Time) (uint64, State) {
   428  	switch {
   429  	case c.state == StateTripped && c.expiry.Before(t):
   430  		c.setState(StateRecovering, t)
   431  	case c.state == StateStandby && !c.expiry.IsZero() && c.expiry.Before(t):
   432  		c.nextGeneration(t)
   433  	}
   434  
   435  	return c.generation, c.state
   436  }
   437  
   438  // nextGeneration creates a new generation and adjusts its expiration
   439  // based on the current state
   440  func (c *CircuitBreaker) nextGeneration(t time.Time) {
   441  	c.metrics.reset()
   442  	c.generation++
   443  
   444  	switch c.state {
   445  	case StateRecovering:
   446  		c.expiry = time.Time{}
   447  	case StateTripped:
   448  		c.expiry = t.Add(c.cfg.TrippedPeriod)
   449  	case StateStandby:
   450  		c.expiry = t.Add(c.cfg.Interval)
   451  	}
   452  }