github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/file/s3file/retry.go (about) 1 package s3file 2 3 import ( 4 "context" 5 "strings" 6 "time" 7 8 awsrequest "github.com/aws/aws-sdk-go/aws/request" 9 "github.com/aws/aws-sdk-go/service/s3" 10 "github.com/aws/aws-sdk-go/service/s3/s3iface" 11 "github.com/Schaudge/grailbase/file" 12 "github.com/Schaudge/grailbase/log" 13 "github.com/Schaudge/grailbase/retry" 14 ) 15 16 var ( 17 // BackoffPolicy defines backoff timing parameters. It's exported for unit tests only. 18 // TODO(josh): Rename to `RetryPolicy`. 19 // TODO(josh): Create `retry.ThrottlePolicy` and `retry.AIMDPolicy` and use here. 20 BackoffPolicy = retry.Jitter(retry.Backoff(500*time.Millisecond, time.Minute, 1.2), 0.2) 21 22 // WithDeadline allows faking context.WithDeadline. It's exported for unit tests only. 23 WithDeadline = context.WithDeadline 24 25 // MaxRetryDuration defines the max amount of time a request can spend 26 // retrying on errors. 27 // 28 // Requirements: 29 // 30 // - The value must be >5 minutes. 5 min is the S3 negative-cache TTL. If 31 // less than 5 minutes, an Open() call w/ RetryWhenNotFound may fail. 32 // 33 // - It must be long enough to allow CompleteMultiPartUpload to finish after a 34 // retry. The doc says it may take a few minutes even in a successful case. 35 MaxRetryDuration = 60 * time.Minute 36 ) 37 38 // TODO: Rename to `retrier`. 39 type retryPolicy struct { 40 clients []s3iface.S3API 41 policy retry.Policy 42 opts file.Opts // passed to Open() or Stat request. 43 startTime time.Time // the time requested started. 44 retryDeadline time.Time // when to give up retrying. 45 retries int 46 waitErr error // error happened during wait, typically deadline or cancellation. 47 } 48 49 func newBackoffPolicy(clients []s3iface.S3API, opts file.Opts) retryPolicy { 50 now := time.Now() 51 return retryPolicy{ 52 clients: clients, 53 policy: BackoffPolicy, 54 opts: opts, 55 startTime: now, 56 retryDeadline: now.Add(MaxRetryDuration), 57 } 58 } 59 60 // client returns the s3 client to be use by the caller. 61 func (r *retryPolicy) client() s3iface.S3API { return r.clients[0] } 62 63 // shouldRetry determines if the caller should retry after seeing the given 64 // error. It will modify r.clients if it thinks the caller should retry with a 65 // different client. 66 func (r *retryPolicy) shouldRetry(ctx context.Context, err error, message string) bool { 67 wait := func() bool { 68 ctx2, cancel := WithDeadline(ctx, r.retryDeadline) 69 r.waitErr = retry.Wait(ctx2, r.policy, r.retries) 70 cancel() 71 if r.waitErr != nil { 72 // Context timeout or cancellation 73 r.clients = nil 74 return false 75 } 76 r.retries++ 77 return true 78 } 79 80 if err == nil { 81 return false 82 } 83 if awsrequest.IsErrorRetryable(err) || awsrequest.IsErrorThrottle(err) || otherRetriableError(err) { 84 // Transient errors. Retry with the same client. 85 log.Printf("retry %s: %v", message, err) 86 return wait() 87 } 88 aerr, ok := getAWSError(err) 89 if ok { 90 if r.opts.RetryWhenNotFound && aerr.Code() == s3.ErrCodeNoSuchKey { 91 log.Printf("retry %s (not found): %v", message, err) 92 return wait() 93 } 94 95 switch aerr.Code() { 96 case s3.ErrCodeNoSuchBucket, s3.ErrCodeNoSuchKey: 97 // No point in trying again. 98 r.clients = nil 99 return false 100 case "NotFound": 101 // GetObject seems to return this error rather ErrCodeNoSuchKey 102 r.clients = nil 103 return false 104 default: 105 // Possible cases: 106 // 107 //- permission errors: we retry using a different client. 108 // 109 //- non-retriable errors: we retry using a different client, and it will 110 // fail again, and we eventually give up. The code it at least correct, if 111 // suboptimal. 112 // 113 // - transient errors we don't yet know. We'll abort when we shouldn't, 114 // but there's not much we can do. We'll add these errors to the above 115 // case as we discover them. 116 } 117 } 118 if len(r.clients) <= 1 { 119 // No more alternate clients to try 120 r.clients = nil 121 return false 122 } 123 r.clients = r.clients[1:] 124 return true 125 } 126 127 // Retriable errors not listed in aws' retry policy. 128 func otherRetriableError(err error) bool { 129 aerr, ok := getAWSError(err) 130 if ok && (aerr.Code() == awsrequest.ErrCodeSerialization || 131 aerr.Code() == awsrequest.ErrCodeRead || 132 // The AWS SDK method IsErrorRetryable doesn't consider certain errors as retryable 133 // depending on the underlying cause. (For a detailed explanation as to why, 134 // see https://github.com/aws/aws-sdk-go/issues/3027) 135 // In our case, we can safely consider every error of type "RequestError" regardless 136 // of the underlying cause as a retryable error. 137 aerr.Code() == "RequestError" || 138 aerr.Code() == "SlowDown" || 139 aerr.Code() == "InternalError" || 140 aerr.Code() == "InternalServerError") { 141 return true 142 } 143 if ok && aerr.Code() == "XAmzContentSHA256Mismatch" { 144 // Example: 145 // 146 // XAmzContentSHA256Mismatch: The provided 'x-amz-content-sha256' header 147 // does not match what was computed. 148 // 149 // Happens sporadically for no discernible reason. Just retry. 150 return true 151 } 152 if ok { 153 msg := strings.TrimSpace(aerr.Message()) 154 if strings.HasSuffix(msg, "amazonaws.com: no such host") { 155 // Example: 156 // 157 // RequestError: send request failed caused by: Get 158 // https://grail-patchcnn.s3.us-west-2.amazonaws.com/key: dial tcp: lookup 159 // grail-patchcnn.s3.us-west-2.amazonaws.com: no such host 160 // 161 // This a DNS lookup error on the client side. This may be 162 // grail-specific. This error happens after S3 server resolves the bucket 163 // successfully, and redirects the client to a backend to fetch data. So 164 // accessing a non-existent bucket will not hit this path. 165 return true 166 } 167 } 168 msg := err.Error() 169 if strings.Contains(msg, "resource unavailable") || 170 strings.Contains(msg, "Service Unavailable") || 171 // As of v1.42.0, the AWS SDK marks these errors as non-retriable [1]. We think we see these 172 // errors when an S3 host is throttling us so we actually do want to retry. 173 // Note: Empirically, the s3transport package's workaround reduces the occurrence of these 174 // errors in our workloads, but we still see them occasionally. 175 // 176 // [1] https://github.com/aws/aws-sdk-go/blob/e04cf0432b79324cae8af9e8e333404c18268137/aws/request/connection_reset_error.go#L9 177 strings.Contains(msg, "read: connection reset") { 178 return true 179 } 180 return false 181 }