github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/file/s3file/file_chunk_read.go (about) 1 package s3file 2 3 import ( 4 "context" 5 "fmt" 6 "io" 7 "path/filepath" 8 "sync" 9 "sync/atomic" 10 11 "github.com/aws/aws-sdk-go/aws" 12 "github.com/aws/aws-sdk-go/service/s3" 13 "github.com/aws/aws-sdk-go/service/s3/s3iface" 14 "github.com/Schaudge/grailbase/errors" 15 "github.com/Schaudge/grailbase/file/internal/s3bufpool" 16 "github.com/Schaudge/grailbase/file/s3file/internal/autolog" 17 "github.com/Schaudge/grailbase/log" 18 "github.com/Schaudge/grailbase/traverse" 19 ) 20 21 type ( 22 // chunkReaderAt is similar to ioctx.ReaderAt except it is not concurrency-safe. 23 // It's currently used to implement S3-recommended read parallelism for large reads, though 24 // clients of s3file still only see the non-parallel io.Reader API. 25 // TODO: Expose concurrency-safe ReaderAt API to clients. 26 chunkReaderAt struct { 27 // name is redundant with (bucket, key). 28 name, bucket, key, versionID string 29 // newRetryPolicy creates retry policies. It must be concurrency- and goroutine-safe. 30 newRetryPolicy func() retryPolicy 31 32 // previousR is a body reader open from a previous ReadAt. It's an optimization for 33 // clients that do many small reads. It may be nil (before first read, after errors, etc.). 34 previousR *posReader 35 // chunks is used locally within ReadAt. It's stored here only to reduce allocations. 36 chunks []readChunk 37 } 38 readChunk struct { 39 // s3Offset is the position of this *chunk* in the coordinates of the S3 object. 40 // That is, dst[0] will eventually contain s3Object[s3Offset]. 41 s3Offset int64 42 // dst contains the chunk's data after read. After read, dstN < len(dst) iff there was an 43 // error or EOF. 44 dst []byte 45 // dstN tracks how much of dst is already filled. 46 dstN int 47 // r is the current reader for this chunk. It may be nil or at the wrong position for 48 // this chunk's state; then we'd need a new reader. 49 r *posReader 50 } 51 52 // posReader wraps the S3 SDK's reader with retries and remembers its offset in the S3 object. 53 posReader struct { 54 rc io.ReadCloser 55 offset int64 56 // ids is set when posReader is opened. 57 ids s3RequestIDs 58 // info is set when posReader is opened, unless there's an error or EOF. 59 info s3Info 60 } 61 ) 62 63 // ReadChunkBytes is the size for individual S3 API read operations, guided by S3 docs: 64 // As a general rule, when you download large objects within a Region from Amazon S3 to 65 // Amazon EC2, we suggest making concurrent requests for byte ranges of an object at the 66 // granularity of 8–16 MB. 67 // https://web.archive.org/web/20220325121400/https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance-design-patterns.html 68 func ReadChunkBytes() int { return s3bufpool.BufBytes } 69 70 // ReadAt is not concurrency-safe. 71 // s3Info may be empty if no object metadata is fetched (zero-sized request, error). 72 func (r *chunkReaderAt) ReadAt(ctx context.Context, dst []byte, offset int64) (int, s3Info, error) { 73 if len(dst) == 0 { 74 return 0, s3Info{}, nil 75 } 76 r.chunks = r.chunks[:0] 77 for buf, bufOff := dst, offset; len(buf) > 0; { 78 size := len(buf) 79 if size > s3bufpool.BufBytes { 80 size = s3bufpool.BufBytes 81 } 82 r.chunks = append(r.chunks, readChunk{ 83 s3Offset: bufOff, 84 dst: buf[:size:size], 85 }) 86 bufOff += int64(size) 87 buf = buf[size:] 88 } 89 90 // The first chunk gets to try to use a previously-opened reader (best-effort). 91 // Note: If len(r.chunks) == 1 we're both reusing a saved reader and saving it again. 92 r.chunks[0].r, r.previousR = r.previousR, nil 93 defer func() { 94 r.previousR = r.chunks[len(r.chunks)-1].r 95 }() 96 97 var ( 98 infoMu sync.Mutex 99 info s3Info 100 ) 101 // TODO: traverse (or other common lib) support for exiting on first error to reduce latency. 102 err := traverse.Each(len(r.chunks), func(chunkIdx int) (err error) { 103 chunk := &r.chunks[chunkIdx] 104 policy := r.newRetryPolicy() 105 106 defer func() { 107 if err != nil { 108 err = annotate(err, chunk.r.maybeIDs(), &policy) 109 } 110 }() 111 // Leave the last chunk's reader open for future reuse. 112 if chunkIdx < len(r.chunks)-1 { 113 defer func() { chunk.r.Close(); chunk.r = nil }() 114 } 115 116 metric := metrics.Op("read").Start() 117 defer metric.Done() 118 119 attemptLoop: 120 for attempt := 0; ; attempt++ { 121 switch err { 122 case nil: // Initial attempt. 123 case io.EOF, io.ErrUnexpectedEOF: 124 // In rare cases the S3 SDK returns EOF for chunks that are not actually at EOF. 125 // To work around this, we ignore EOF errors, and keep reading as long as the 126 // object metadata size field says we're not done. See BXDS-2220 for details. 127 // See also: https://github.com/aws/aws-sdk-go/issues/4510 128 default: 129 if !policy.shouldRetry(ctx, err, r.name) { 130 break attemptLoop 131 } 132 } 133 err = nil 134 remainingBuf := chunk.dst[chunk.dstN:] 135 if len(remainingBuf) == 0 { 136 break 137 } 138 139 if attempt > 0 { 140 metric.Retry() 141 } 142 143 rangeStart := chunk.s3Offset + int64(chunk.dstN) 144 switch { 145 case chunk.r != nil && chunk.r.offset == rangeStart: 146 // We're ready to read. 147 case chunk.r != nil: 148 chunk.r.Close() 149 fallthrough 150 default: 151 chunk.r, err = newPosReader(ctx, policy.client(), r.name, r.bucket, r.key, r.versionID, rangeStart) 152 if err == io.EOF { 153 // rangeStart is at or past EOF, so this chunk is done. 154 err = nil 155 break attemptLoop 156 } 157 if err != nil { 158 continue 159 } 160 } 161 162 var size int64 163 infoMu.Lock() 164 if info == (s3Info{}) { 165 info = chunk.r.info 166 } else if info.etag != chunk.r.info.etag { 167 err = eTagChangedError(r.name, info.etag, chunk.r.info.etag) 168 } 169 size = info.size 170 infoMu.Unlock() 171 if err != nil { 172 continue 173 } 174 175 bytesUntilEOF := size - chunk.s3Offset - int64(chunk.dstN) 176 if bytesUntilEOF <= 0 { 177 break 178 } 179 if bytesUntilEOF < int64(len(remainingBuf)) { 180 remainingBuf = remainingBuf[:bytesUntilEOF] 181 } 182 var n int 183 n, err = io.ReadFull(chunk.r, remainingBuf) 184 chunk.dstN += n 185 if err == nil { 186 break 187 } 188 // Discard our reader after an error. This error is often due to throttling 189 // (especially connection reset), so we want to retry with a new HTTP request which 190 // may go to a new host. 191 chunk.r.Close() 192 chunk.r = nil 193 } 194 metric.Bytes(chunk.dstN) 195 return err 196 }) 197 198 var nBytes int 199 for _, chunk := range r.chunks { 200 nBytes += chunk.dstN 201 if chunk.dstN < len(chunk.dst) { 202 if err == nil { 203 err = io.EOF 204 } 205 break 206 } 207 } 208 return nBytes, info, err 209 } 210 211 func eTagChangedError(name, oldETag, newETag string) error { 212 return errors.E(errors.Precondition, fmt.Sprintf( 213 "read %v: ETag changed from %v to %v", name, oldETag, newETag)) 214 } 215 216 func (r *chunkReaderAt) Close() { r.previousR.Close() } 217 218 var ( 219 nOpenPos int32 220 nOpenPosOnce sync.Once 221 ) 222 223 func newPosReader( 224 ctx context.Context, 225 client s3iface.S3API, 226 name, bucket, key, versionID string, 227 offset int64, 228 ) (*posReader, error) { 229 nOpenPosOnce.Do(func() { 230 autolog.Register(func() { 231 log.Printf("s3file open posReader: %d", atomic.LoadInt32(&nOpenPos)) 232 }) 233 }) 234 r := posReader{offset: offset} 235 input := s3.GetObjectInput{ 236 Bucket: aws.String(bucket), 237 Key: aws.String(key), 238 Range: aws.String(fmt.Sprintf("bytes=%d-", r.offset)), 239 } 240 if versionID != "" { 241 input.VersionId = aws.String(versionID) 242 } 243 output, err := client.GetObjectWithContext(ctx, &input, r.ids.captureOption()) 244 if err != nil { 245 if output.Body != nil { 246 if errClose := output.Body.Close(); errClose != nil { 247 log.Printf("s3file.newPosReader: ignoring body close error: %v", err) 248 } 249 } 250 if awsErr, ok := getAWSError(err); ok && awsErr.Code() == "InvalidRange" { 251 // Since we're reading many chunks in parallel, some can be past the end of 252 // the object, resulting in range errors. Treat these as EOF. 253 err = io.EOF 254 } 255 return nil, err 256 } 257 _ = atomic.AddInt32(&nOpenPos, 1) 258 if output.ContentLength == nil || output.ETag == nil || output.LastModified == nil { 259 return nil, errors.E("s3file.newPosReader: object missing metadata (ContentLength, ETag, LastModified)") 260 } 261 if *output.ContentLength < 0 { 262 // We do not expect AWS to return negative ContentLength, but we are 263 // defensive, as things may otherwise break very confusingly for 264 // callers. 265 return nil, io.EOF 266 } 267 r.info = s3Info{ 268 name: filepath.Base(name), 269 size: offset + *output.ContentLength, 270 modTime: *output.LastModified, 271 etag: *output.ETag, 272 } 273 r.rc = output.Body 274 return &r, nil 275 } 276 277 // Read usually delegates to the underlying reader, except: (&posReader{}).Read is valid and 278 // always at EOF; nil.Read panics. 279 func (p *posReader) Read(dst []byte) (int, error) { 280 if p.rc == nil { 281 return 0, io.EOF 282 } 283 n, err := p.rc.Read(dst) 284 p.offset += int64(n) 285 return n, err 286 } 287 288 // Close usually delegates to the underlying reader, except: (&posReader{}).Close 289 // and nil.Close do nothing. 290 func (p *posReader) Close() { 291 if p == nil || p.rc == nil { 292 return 293 } 294 _ = atomic.AddInt32(&nOpenPos, -1) 295 if err := p.rc.Close(); err != nil { 296 // Note: Since the caller is already done reading from p.rc, we don't expect this error to 297 // indicate a problem with the correctness of past Reads, instead signaling some resource 298 // leakage (network connection, buffers, etc.). We can't retry the resource release: 299 // * io.Closer does not define behavior for multiple Close calls and 300 // s3.GetObjectOutput.Body doesn't say anything implementation-specific. 301 // * Body may be a net/http.Response.Body [1] but the standard library doesn't say 302 // anything about multiple Close either (and even if it did, we shouldn't rely on the 303 // AWS SDK's implementation details in all cases or in the future). 304 // Without a retry opportunity, it seems like callers could either ignore the potential 305 // leak, or exit the OS process. We assume, for now, that callers won't want to do the 306 // latter, so we hide the error. (This could eventually lead to OS process exit due to 307 // resource exhaustion, so arguably this hiding doesn't add much harm, though of course it 308 // may be confusing.) We could consider changing this in the future, especially if we notice 309 // such resource leaks in real programs. 310 // 311 // [1] https://github.com/aws/aws-sdk-go/blob/e842504a6323096540dc3defdc7cb357d8749893/private/protocol/rest/unmarshal.go#L89-L90 312 log.Printf("s3file.posReader.Close: ignoring body close error: %v", err) 313 } 314 } 315 316 // maybeIDs returns ids if available, otherwise zero. p == nil is allowed. 317 func (p *posReader) maybeIDs() s3RequestIDs { 318 if p == nil { 319 return s3RequestIDs{} 320 } 321 return p.ids 322 }