go.uber.org/cadence@v1.2.9/internal/common/backoff/retry.go (about)

     1  // Copyright (c) 2017 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package backoff
    22  
    23  import (
    24  	"context"
    25  	"errors"
    26  	"sync"
    27  	"time"
    28  
    29  	s "go.uber.org/cadence/.gen/go/shared"
    30  )
    31  
    32  type (
    33  	// Operation to retry
    34  	Operation func() error
    35  
    36  	// IsRetryable handler can be used to exclude certain errors during retry
    37  	IsRetryable func(error) bool
    38  
    39  	// ConcurrentRetrier is used for client-side throttling. It determines whether to
    40  	// throttle outgoing traffic in case downstream backend server rejects
    41  	// requests due to out-of-quota or server busy errors.
    42  	ConcurrentRetrier struct {
    43  		sync.Mutex
    44  		retrier      Retrier // Backoff retrier
    45  		failureCount int64   // Number of consecutive failures seen
    46  	}
    47  )
    48  
    49  // Throttle Sleep if there were failures since the last success call.
    50  func (c *ConcurrentRetrier) Throttle() {
    51  	c.throttleInternal()
    52  }
    53  
    54  func (c *ConcurrentRetrier) throttleInternal() time.Duration {
    55  	next := done
    56  
    57  	// Check if we have failure count.
    58  	c.Lock()
    59  	if c.failureCount > 0 {
    60  		next = c.retrier.NextBackOff()
    61  	}
    62  	c.Unlock()
    63  
    64  	if next != done {
    65  		time.Sleep(next)
    66  	}
    67  
    68  	return next
    69  }
    70  
    71  // Succeeded marks client request succeeded.
    72  func (c *ConcurrentRetrier) Succeeded() {
    73  	defer c.Unlock()
    74  	c.Lock()
    75  	c.failureCount = 0
    76  	c.retrier.Reset()
    77  }
    78  
    79  // Failed marks client request failed because backend is busy.
    80  func (c *ConcurrentRetrier) Failed() {
    81  	defer c.Unlock()
    82  	c.Lock()
    83  	c.failureCount++
    84  }
    85  
    86  // NewConcurrentRetrier returns an instance of concurrent backoff retrier.
    87  func NewConcurrentRetrier(retryPolicy RetryPolicy) *ConcurrentRetrier {
    88  	retrier := NewRetrier(retryPolicy, SystemClock)
    89  	return &ConcurrentRetrier{retrier: retrier}
    90  }
    91  
    92  // Retry function can be used to wrap any call with retry logic using the passed in policy
    93  func Retry(ctx context.Context, operation Operation, policy RetryPolicy, isRetriable IsRetryable) error {
    94  	var err error
    95  	var next time.Duration
    96  
    97  	r := NewRetrier(policy, SystemClock)
    98  Retry_Loop:
    99  	for {
   100  		// operation completed successfully.  No need to retry.
   101  		if err = operation(); err == nil {
   102  			return nil
   103  		}
   104  
   105  		if next = r.NextBackOff(); next == done {
   106  			return err
   107  		}
   108  
   109  		if !isRetriable(err) {
   110  			return err
   111  		}
   112  
   113  		retryAfter := ErrRetryableAfter(err)
   114  		// update the time to wait until the next attempt.
   115  		// as this is a *minimum*, just add it to the current delay time.
   116  		//
   117  		// this could be changed to clamp to retryAfter as a minimum.
   118  		// this is intentionally *not* done here, so repeated service-busy errors are guaranteed
   119  		// to generate *increasing* amount of time between requests, and not just send N in a row
   120  		// with 1 second of delay.  duplicates imply "still overloaded", so this will hopefully
   121  		// help reduce the odds of snowballing.
   122  		// this is a pretty minor thing though, and it should not cause problems if we change it
   123  		// to make behavior more predictable.
   124  		next += retryAfter
   125  
   126  		// check if ctx is done
   127  		if ctx.Err() != nil {
   128  			return err
   129  		}
   130  
   131  		// wait for the next retry period (or context timeout)
   132  		if ctxDone := ctx.Done(); ctxDone != nil {
   133  			// we could check if this is longer than context deadline and immediately fail...
   134  			// ...but wasting time prevents higher-level retries from trying too early.
   135  			// this is particularly useful for service-busy, but seems valid for essentially all retried errors.
   136  			//
   137  			// this could probably be changed if we get requests for it, but for now it better-protects
   138  			// the server by preventing "external" retry storms.
   139  			timer := time.NewTimer(next)
   140  			select {
   141  			case <-ctxDone:
   142  				timer.Stop()
   143  				return err
   144  			case <-timer.C:
   145  				continue Retry_Loop
   146  			}
   147  		}
   148  
   149  		// ctx is not cancellable
   150  		time.Sleep(next)
   151  	}
   152  }
   153  
   154  // ErrRetryableAfter returns a minimum delay until the next attempt.
   155  //
   156  // for most errors this will be 0, and normal backoff logic will determine
   157  // the full retry period, but e.g. service busy errors (or any case where the
   158  // server knows a "time until it is not useful to retry") are safe to assume
   159  // that a literally immediate retry is *not* going to be useful.
   160  //
   161  // note that this is only a minimum, however.  longer delays are assumed to
   162  // be equally valid.
   163  func ErrRetryableAfter(err error) (retryAfter time.Duration) {
   164  	if target := (*s.ServiceBusyError)(nil); errors.As(err, &target) {
   165  		// eventually: return a time-until-retry from the server.
   166  		// for now though, just ensure at least one second before the next attempt.
   167  		return time.Second
   168  	}
   169  	return 0
   170  }