github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/file/s3file/retry.go (about)

     1  package s3file
     2  
     3  import (
     4  	"context"
     5  	"strings"
     6  	"time"
     7  
     8  	awsrequest "github.com/aws/aws-sdk-go/aws/request"
     9  	"github.com/aws/aws-sdk-go/service/s3"
    10  	"github.com/aws/aws-sdk-go/service/s3/s3iface"
    11  	"github.com/Schaudge/grailbase/file"
    12  	"github.com/Schaudge/grailbase/log"
    13  	"github.com/Schaudge/grailbase/retry"
    14  )
    15  
    16  var (
    17  	// BackoffPolicy defines backoff timing parameters. It's exported for unit tests only.
    18  	// TODO(josh): Rename to `RetryPolicy`.
    19  	// TODO(josh): Create `retry.ThrottlePolicy` and `retry.AIMDPolicy` and use here.
    20  	BackoffPolicy = retry.Jitter(retry.Backoff(500*time.Millisecond, time.Minute, 1.2), 0.2)
    21  
    22  	// WithDeadline allows faking context.WithDeadline. It's exported for unit tests only.
    23  	WithDeadline = context.WithDeadline
    24  
    25  	// MaxRetryDuration defines the max amount of time a request can spend
    26  	// retrying on errors.
    27  	//
    28  	// Requirements:
    29  	//
    30  	// - The value must be >5 minutes. 5 min is the S3 negative-cache TTL.  If
    31  	//   less than 5 minutes, an Open() call w/ RetryWhenNotFound may fail.
    32  	//
    33  	// - It must be long enough to allow CompleteMultiPartUpload to finish after a
    34  	//   retry. The doc says it may take a few minutes even in a successful case.
    35  	MaxRetryDuration = 60 * time.Minute
    36  )
    37  
    38  // TODO: Rename to `retrier`.
    39  type retryPolicy struct {
    40  	clients       []s3iface.S3API
    41  	policy        retry.Policy
    42  	opts          file.Opts // passed to Open() or Stat request.
    43  	startTime     time.Time // the time requested started.
    44  	retryDeadline time.Time // when to give up retrying.
    45  	retries       int
    46  	waitErr       error // error happened during wait, typically deadline or cancellation.
    47  }
    48  
    49  func newBackoffPolicy(clients []s3iface.S3API, opts file.Opts) retryPolicy {
    50  	now := time.Now()
    51  	return retryPolicy{
    52  		clients:       clients,
    53  		policy:        BackoffPolicy,
    54  		opts:          opts,
    55  		startTime:     now,
    56  		retryDeadline: now.Add(MaxRetryDuration),
    57  	}
    58  }
    59  
    60  // client returns the s3 client to be use by the caller.
    61  func (r *retryPolicy) client() s3iface.S3API { return r.clients[0] }
    62  
    63  // shouldRetry determines if the caller should retry after seeing the given
    64  // error.  It will modify r.clients if it thinks the caller should retry with a
    65  // different client.
    66  func (r *retryPolicy) shouldRetry(ctx context.Context, err error, message string) bool {
    67  	wait := func() bool {
    68  		ctx2, cancel := WithDeadline(ctx, r.retryDeadline)
    69  		r.waitErr = retry.Wait(ctx2, r.policy, r.retries)
    70  		cancel()
    71  		if r.waitErr != nil {
    72  			// Context timeout or cancellation
    73  			r.clients = nil
    74  			return false
    75  		}
    76  		r.retries++
    77  		return true
    78  	}
    79  
    80  	if err == nil {
    81  		return false
    82  	}
    83  	if awsrequest.IsErrorRetryable(err) || awsrequest.IsErrorThrottle(err) || otherRetriableError(err) {
    84  		// Transient errors. Retry with the same client.
    85  		log.Printf("retry %s: %v", message, err)
    86  		return wait()
    87  	}
    88  	aerr, ok := getAWSError(err)
    89  	if ok {
    90  		if r.opts.RetryWhenNotFound && aerr.Code() == s3.ErrCodeNoSuchKey {
    91  			log.Printf("retry %s (not found): %v", message, err)
    92  			return wait()
    93  		}
    94  
    95  		switch aerr.Code() {
    96  		case s3.ErrCodeNoSuchBucket, s3.ErrCodeNoSuchKey:
    97  			// No point in trying again.
    98  			r.clients = nil
    99  			return false
   100  		case "NotFound":
   101  			// GetObject seems to return this error rather ErrCodeNoSuchKey
   102  			r.clients = nil
   103  			return false
   104  		default:
   105  			// Possible cases:
   106  			//
   107  			//- permission errors: we retry using a different client.
   108  			//
   109  			//- non-retriable errors: we retry using a different client, and it will
   110  			// fail again, and we eventually give up. The code it at least correct, if
   111  			// suboptimal.
   112  			//
   113  			// - transient errors we don't yet know. We'll abort when we shouldn't,
   114  			// but there's not much we can do. We'll add these errors to the above
   115  			// case as we discover them.
   116  		}
   117  	}
   118  	if len(r.clients) <= 1 {
   119  		// No more alternate clients to try
   120  		r.clients = nil
   121  		return false
   122  	}
   123  	r.clients = r.clients[1:]
   124  	return true
   125  }
   126  
   127  // Retriable errors not listed in aws' retry policy.
   128  func otherRetriableError(err error) bool {
   129  	aerr, ok := getAWSError(err)
   130  	if ok && (aerr.Code() == awsrequest.ErrCodeSerialization ||
   131  		aerr.Code() == awsrequest.ErrCodeRead ||
   132  		// The AWS SDK method IsErrorRetryable doesn't consider certain errors as retryable
   133  		// depending on the underlying cause.  (For a detailed explanation as to why,
   134  		// see https://github.com/aws/aws-sdk-go/issues/3027)
   135  		// In our case, we can safely consider every error of type "RequestError" regardless
   136  		// of the underlying cause as a retryable error.
   137  		aerr.Code() == "RequestError" ||
   138  		aerr.Code() == "SlowDown" ||
   139  		aerr.Code() == "InternalError" ||
   140  		aerr.Code() == "InternalServerError") {
   141  		return true
   142  	}
   143  	if ok && aerr.Code() == "XAmzContentSHA256Mismatch" {
   144  		// Example:
   145  		//
   146  		// XAmzContentSHA256Mismatch: The provided 'x-amz-content-sha256' header
   147  		// does not match what was computed.
   148  		//
   149  		// Happens sporadically for no discernible reason.  Just retry.
   150  		return true
   151  	}
   152  	if ok {
   153  		msg := strings.TrimSpace(aerr.Message())
   154  		if strings.HasSuffix(msg, "amazonaws.com: no such host") {
   155  			// Example:
   156  			//
   157  			// RequestError: send request failed caused by: Get
   158  			// https://grail-patchcnn.s3.us-west-2.amazonaws.com/key: dial tcp: lookup
   159  			// grail-patchcnn.s3.us-west-2.amazonaws.com: no such host
   160  			//
   161  			// This a DNS lookup error on the client side. This may be
   162  			// grail-specific. This error happens after S3 server resolves the bucket
   163  			// successfully, and redirects the client to a backend to fetch data. So
   164  			// accessing a non-existent bucket will not hit this path.
   165  			return true
   166  		}
   167  	}
   168  	msg := err.Error()
   169  	if strings.Contains(msg, "resource unavailable") ||
   170  		strings.Contains(msg, "Service Unavailable") ||
   171  		// As of v1.42.0, the AWS SDK marks these errors as non-retriable [1]. We think we see these
   172  		// errors when an S3 host is throttling us so we actually do want to retry.
   173  		// Note: Empirically, the s3transport package's workaround reduces the occurrence of these
   174  		// errors in our workloads, but we still see them occasionally.
   175  		//
   176  		// [1] https://github.com/aws/aws-sdk-go/blob/e04cf0432b79324cae8af9e8e333404c18268137/aws/request/connection_reset_error.go#L9
   177  		strings.Contains(msg, "read: connection reset") {
   178  		return true
   179  	}
   180  	return false
   181  }