github.com/bazelbuild/remote-apis-sdks@v0.0.0-20240425170053-8a36686a6350/go/pkg/retry/retry.go (about)

     1  // Package retry implements retry logic helpers, which can be used to wrap operations that can
     2  // intermittently fail, but can be retried at a higher level.
     3  //
     4  // Consider replacing with https://github.com/eapache/go-resiliency/tree/master/retrier. This would
     5  // have slightly different values for backoff consts.
     6  package retry
     7  
     8  import (
     9  	"context"
    10  	stderrors "errors"
    11  	"fmt"
    12  	"math/rand"
    13  	"sync"
    14  	"time"
    15  
    16  	log "github.com/golang/glog"
    17  	"github.com/pkg/errors"
    18  	"google.golang.org/grpc/codes"
    19  	"google.golang.org/grpc/status"
    20  )
    21  
// Tuning constants for backoff(): each retry multiplies the previous delay by
// backoffFactor, and the computed delay is then randomized downward by up to
// backoffRange (40%) so concurrent callers don't retry in lockstep.
const (
	backoffFactor = 1.3 // backoff increases by this factor on each retry
	backoffRange  = 0.4 // backoff is randomized downwards by this factor
)
    26  
// BackoffPolicy describes how to back off when retrying, and how many times to retry.
type BackoffPolicy struct {
	// baseDelay is the starting delay; maxDelay caps the exponential growth.
	baseDelay, maxDelay time.Duration
	maxAttempts         Attempts // 0 means unlimited
}
    32  
    33  // ExponentialBackoff returns an exponential backoff implementation.
    34  //
    35  // Starting from baseDelay, it will delay by an additional fixed multiple with each retry, never
    36  // delaying by more than maxDelay.  For example, ExponentialBackoff(time.Second, time.Hour, 5) will
    37  // produce delays of roughly: 1s, 1.5s, 2s, 3s, 4s.
    38  //
    39  // Note that delays are randomized, so the exact values are not guaranteed. attempts=0 means
    40  // unlimited attempts. See UnlimitedAttempts.
    41  func ExponentialBackoff(baseDelay, maxDelay time.Duration, attempts Attempts) BackoffPolicy {
    42  	return BackoffPolicy{baseDelay, maxDelay, attempts}
    43  }
    44  
    45  // Immediately returns a retrier that retries right away.
    46  func Immediately(attempts Attempts) BackoffPolicy {
    47  	return BackoffPolicy{0, 0, attempts}
    48  }
    49  
// Attempts is the number of times to attempt something before giving up. A value of 0 represents
// an effectively unlimited number of attempts, or you can use the equivalent
// retry.UnlimitedAttempts.
type Attempts uint

// UnlimitedAttempts is used to specify no limit to the number of attempts.
const UnlimitedAttempts = Attempts(0)
    57  
// ShouldRetry encapsulates the decision of whether an error is retryable. If an error should not
// be retried, the function must return false.
type ShouldRetry func(error) bool
    61  
    62  // Always always retries, regardless of error.
    63  func Always(error) bool { return true }
    64  
    65  // TransientOnly returns true if the error is transient.
    66  // It implements ShouldRetry type.
    67  func TransientOnly(err error) bool {
    68  	// Retry RPC timeouts. Note that we do *not* retry context cancellations (context.Cancelled);
    69  	// if the user wants to back out of the call we should let them.
    70  	if stderrors.Is(err, context.DeadlineExceeded) {
    71  		return true
    72  	}
    73  	s, ok := status.FromError(err)
    74  	if !ok {
    75  		return false
    76  	}
    77  	switch s.Code() {
    78  	case codes.Canceled, codes.Unknown, codes.DeadlineExceeded, codes.Aborted,
    79  		codes.Internal, codes.Unavailable, codes.ResourceExhausted:
    80  		return true
    81  	default:
    82  		return false
    83  	}
    84  }
    85  
// WithPolicy retries f until either it succeeds, or shouldRetry returns false, or the number of
// retries is capped by the backoff policy. Returns the error returned by the final attempt. It
// annotates the error message in case the retry budget is exhausted.
//
// The context may carry a TimeAfterContextKey value to substitute time.After
// (used to mock the clock in tests); cancelling the context aborts the wait
// between attempts and returns ctx.Err().
func WithPolicy(ctx context.Context, shouldRetry ShouldRetry, bp BackoffPolicy, f func() error) error {
	// Use the mock time.After from the context if one was installed (tests);
	// otherwise fall back to the real clock.
	timeAfter, ok := ctx.Value(TimeAfterContextKey).(func(time.Duration) <-chan time.Time)
	if !ok {
		timeAfter = time.After
	}

	for attempts := 0; ; attempts++ {
		err := f()
		// Success, or an error the caller deems permanent: return it as-is.
		if err == nil || !shouldRetry(err) {
			return err
		}

		if log.V(1) {
			// This log depth is custom-tailored to the SDK usage, which always calls the retrier from within client.CallWithTimeout.
			log.InfoDepth(3, fmt.Sprintf("call failed with err=%v, retrying.", err))
		}

		// maxAttempts == 0 means unlimited, and attempts+1 is always >= 1, so
		// this comparison never fires in the unlimited case.
		if attempts+1 == int(bp.maxAttempts) {
			// Annotates the error message to indicate the retry budget was exhausted.
			//
			// This is a little hacky, but generic status annotation preserving status code doesn't exist
			// in gRPC's status library yet, and it's overkill to implement it here for just this.
			if s, ok := status.FromError(err); ok {
				spb := s.Proto()
				spb.Message = fmt.Sprintf("retry budget exhausted (%d attempts): ", bp.maxAttempts) + spb.Message
				return status.ErrorProto(spb)
			}
			return errors.Wrapf(err, "retry budget exhausted (%d attempts)", bp.maxAttempts)
		}

		// Wait out the backoff delay, unless the context is cancelled first.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-timeAfter(backoff(bp.baseDelay, bp.maxDelay, attempts)):

		}
	}
}
   127  
// timeAfterContextKey is the unexported type of TimeAfterContextKey; using a
// dedicated type prevents collisions with context keys from other packages.
type timeAfterContextKey struct{}

// TimeAfterContextKey is to be used as a key in the context to provide a value that is compatible
// with time.After. The main purpose is to mock out time.After in the tests.
var TimeAfterContextKey = timeAfterContextKey{}
   133  
   134  var (
   135  	mu  sync.Mutex
   136  	rng = rand.New(rand.NewSource(time.Now().UnixNano()))
   137  )
   138  
   139  // randFloat64 is equivalent to calling rng.Float64, but safe for concurrent use.
   140  func randFloat64() float64 {
   141  	mu.Lock()
   142  	f := rng.Float64()
   143  	mu.Unlock()
   144  	return f
   145  }
   146  
   147  // backoff returns a random value in [0, maxDelay] that increases exponentially with value of
   148  // retries, starting from baseDelay. Set retries to 0 for the first call and increment with each
   149  // subsequent call.
   150  func backoff(baseDelay, maxDelay time.Duration, retries int) time.Duration {
   151  	backoff, max := float64(baseDelay), float64(maxDelay)
   152  	for backoff < max && retries > 0 {
   153  		backoff = backoff * backoffFactor
   154  		retries--
   155  	}
   156  	if backoff > max {
   157  		backoff = max
   158  	}
   159  
   160  	// Randomize backoff delays so that if a cluster of requests start at the same time, they won't
   161  	// operate in lockstep. We just subtract up to 40% so that we obey maxDelay.
   162  	backoff -= backoff * backoffRange * randFloat64()
   163  	if backoff < 0 {
   164  		return 0
   165  	}
   166  	return time.Duration(backoff)
   167  }