github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/file/s3file/file_chunk_read.go

package s3file

import (
	"context"
	"fmt"
	"io"
	"path/filepath"
	"sync"
	"sync/atomic"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/s3"
	"github.com/aws/aws-sdk-go/service/s3/s3iface"
	"github.com/Schaudge/grailbase/errors"
	"github.com/Schaudge/grailbase/file/internal/s3bufpool"
	"github.com/Schaudge/grailbase/file/s3file/internal/autolog"
	"github.com/Schaudge/grailbase/log"
	"github.com/Schaudge/grailbase/traverse"
)

type (
	// chunkReaderAt is similar to ioctx.ReaderAt except it is not concurrency-safe.
	// It's currently used to implement S3-recommended read parallelism for large reads, though
	// clients of s3file still only see the non-parallel io.Reader API.
	// TODO: Expose concurrency-safe ReaderAt API to clients.
	chunkReaderAt struct {
		// name is redundant with (bucket, key).
		name, bucket, key, versionID string
		// newRetryPolicy creates retry policies. It must be concurrency- and goroutine-safe.
		newRetryPolicy func() retryPolicy

		// previousR is a body reader open from a previous ReadAt. It's an optimization for
		// clients that do many small reads. It may be nil (before first read, after errors, etc.).
		previousR *posReader
		// chunks is used locally within ReadAt. It's stored here only to reduce allocations.
		chunks []readChunk
	}
	readChunk struct {
		// s3Offset is the position of this *chunk* in the coordinates of the S3 object.
		// That is, dst[0] will eventually contain s3Object[s3Offset].
		s3Offset int64
		// dst contains the chunk's data after read. After read, dstN < len(dst) iff there was an
		// error or EOF.
		dst []byte
		// dstN tracks how much of dst is already filled.
		dstN int
		// r is the current reader for this chunk. It may be nil or at the wrong position for
		// this chunk's state; then we'd need a new reader.
		r *posReader
	}

	// posReader wraps the S3 SDK's reader with retries and remembers its offset in the S3 object.
	posReader struct {
		rc     io.ReadCloser
		offset int64
		// ids is set when posReader is opened.
		ids s3RequestIDs
		// info is set when posReader is opened, unless there's an error or EOF.
		info s3Info
	}
)

// ReadChunkBytes returns the size used for individual S3 API read operations, guided by the S3 docs:
//   As a general rule, when you download large objects within a Region from Amazon S3 to
//   Amazon EC2, we suggest making concurrent requests for byte ranges of an object at the
//   granularity of 8–16 MB.
//   https://web.archive.org/web/20220325121400/https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance-design-patterns.html
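//
// For example, if the chunk size is 16 MiB, a single 64 MiB ReadAt is issued as four concurrent
// 16 MiB ranged GETs.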
func ReadChunkBytes() int { return s3bufpool.BufBytes }

// ReadAt is not concurrency-safe.
// The returned s3Info may be empty if no object metadata was fetched (zero-sized request, error).
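//
// A rough in-package usage sketch (the values below are placeholders, not the real construction
// path used by this package):
//
//	r := &chunkReaderAt{
//		name:           "s3://some-bucket/some/key",
//		bucket:         "some-bucket",
//		key:            "some/key",
//		newRetryPolicy: somePolicyFactory, // placeholder func() retryPolicy
//	}
//	buf := make([]byte, 64<<20)
//	n, info, err := r.ReadAt(ctx, buf, 0) // fills buf via concurrent ranged GETs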
func (r *chunkReaderAt) ReadAt(ctx context.Context, dst []byte, offset int64) (int, s3Info, error) {
	if len(dst) == 0 {
		return 0, s3Info{}, nil
	}
	r.chunks = r.chunks[:0]
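	// Split dst into s3bufpool.BufBytes-sized chunks, each tagged with its absolute offset in the
	// S3 object; the chunks are then fetched concurrently below.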
	for buf, bufOff := dst, offset; len(buf) > 0; {
		size := len(buf)
		if size > s3bufpool.BufBytes {
			size = s3bufpool.BufBytes
		}
		r.chunks = append(r.chunks, readChunk{
			s3Offset: bufOff,
			dst:      buf[:size:size],
		})
		bufOff += int64(size)
		buf = buf[size:]
	}

	// The first chunk gets to try to use a previously-opened reader (best-effort).
	// Note: If len(r.chunks) == 1 we're both reusing a saved reader and saving it again.
	r.chunks[0].r, r.previousR = r.previousR, nil
	defer func() {
		r.previousR = r.chunks[len(r.chunks)-1].r
	}()

	var (
		infoMu sync.Mutex
		info   s3Info
	)
	// TODO: traverse (or other common lib) support for exiting on first error to reduce latency.
	err := traverse.Each(len(r.chunks), func(chunkIdx int) (err error) {
		chunk := &r.chunks[chunkIdx]
		policy := r.newRetryPolicy()

		defer func() {
			if err != nil {
				err = annotate(err, chunk.r.maybeIDs(), &policy)
			}
		}()
		// Leave the last chunk's reader open for future reuse.
		if chunkIdx < len(r.chunks)-1 {
			defer func() { chunk.r.Close(); chunk.r = nil }()
		}

		metric := metrics.Op("read").Start()
		defer metric.Done()

	attemptLoop:
		for attempt := 0; ; attempt++ {
			switch err {
			case nil: // Initial attempt.
			case io.EOF, io.ErrUnexpectedEOF:
				// In rare cases the S3 SDK returns EOF for chunks that are not actually at EOF.
				// To work around this, we ignore EOF errors, and keep reading as long as the
				// object metadata size field says we're not done. See BXDS-2220 for details.
				// See also: https://github.com/aws/aws-sdk-go/issues/4510
			default:
				if !policy.shouldRetry(ctx, err, r.name) {
					break attemptLoop
				}
			}
			err = nil
			remainingBuf := chunk.dst[chunk.dstN:]
			if len(remainingBuf) == 0 {
				break
			}

			if attempt > 0 {
				metric.Retry()
			}

			rangeStart := chunk.s3Offset + int64(chunk.dstN)
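			// Reuse the existing reader only if it's positioned exactly where this attempt needs to
			// resume; otherwise close it (if any) and open a new ranged GET at rangeStart.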
			switch {
			case chunk.r != nil && chunk.r.offset == rangeStart:
				// We're ready to read.
			case chunk.r != nil:
				chunk.r.Close()
				fallthrough
			default:
				chunk.r, err = newPosReader(ctx, policy.client(), r.name, r.bucket, r.key, r.versionID, rangeStart)
				if err == io.EOF {
					// rangeStart is at or past EOF, so this chunk is done.
					err = nil
					break attemptLoop
				}
				if err != nil {
					continue
				}
			}

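			// Record object metadata from the first reader that provides it, and require every other
			// chunk to observe the same ETag; a mismatch means the object changed mid-read.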
			var size int64
			infoMu.Lock()
			if info == (s3Info{}) {
				info = chunk.r.info
			} else if info.etag != chunk.r.info.etag {
				err = eTagChangedError(r.name, info.etag, chunk.r.info.etag)
			}
			size = info.size
			infoMu.Unlock()
			if err != nil {
				continue
			}

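			// info.size is the full object size, so bytesUntilEOF is how much remains between this
			// chunk's current position and EOF. Clamping remainingBuf to it lets io.ReadFull complete
			// without ErrUnexpectedEOF at the true end of the object.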
			bytesUntilEOF := size - chunk.s3Offset - int64(chunk.dstN)
			if bytesUntilEOF <= 0 {
				break
			}
			if bytesUntilEOF < int64(len(remainingBuf)) {
				remainingBuf = remainingBuf[:bytesUntilEOF]
			}
			var n int
			n, err = io.ReadFull(chunk.r, remainingBuf)
			chunk.dstN += n
			if err == nil {
				break
			}
			// Discard our reader after an error. This error is often due to throttling
			// (especially connection reset), so we want to retry with a new HTTP request which
			// may go to a new host.
			chunk.r.Close()
			chunk.r = nil
		}
		metric.Bytes(chunk.dstN)
		return err
	})

	var nBytes int
	for _, chunk := range r.chunks {
		nBytes += chunk.dstN
		if chunk.dstN < len(chunk.dst) {
			if err == nil {
				err = io.EOF
			}
			break
		}
	}
	return nBytes, info, err
}

func eTagChangedError(name, oldETag, newETag string) error {
	return errors.E(errors.Precondition, fmt.Sprintf(
		"read %v: ETag changed from %v to %v", name, oldETag, newETag))
}

func (r *chunkReaderAt) Close() { r.previousR.Close() }

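// nOpenPos counts currently-open posReaders. It's reported via autolog (registered on first use
// in newPosReader) to help spot reader leaks.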
var (
	nOpenPos     int32
	nOpenPosOnce sync.Once
)

func newPosReader(
	ctx context.Context,
	client s3iface.S3API,
	name, bucket, key, versionID string,
	offset int64,
) (*posReader, error) {
	nOpenPosOnce.Do(func() {
		autolog.Register(func() {
			log.Printf("s3file open posReader: %d", atomic.LoadInt32(&nOpenPos))
		})
	})
	r := posReader{offset: offset}
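	// Request an open-ended range ("bytes=<offset>-"), i.e. everything from offset to the end of
	// the object; ReadAt typically reads at most one chunk's worth before closing or reusing the body.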
	input := s3.GetObjectInput{
		Bucket: aws.String(bucket),
		Key:    aws.String(key),
		Range:  aws.String(fmt.Sprintf("bytes=%d-", r.offset)),
	}
	if versionID != "" {
		input.VersionId = aws.String(versionID)
	}
	output, err := client.GetObjectWithContext(ctx, &input, r.ids.captureOption())
	if err != nil {
		if output.Body != nil {
			if errClose := output.Body.Close(); errClose != nil {
				log.Printf("s3file.newPosReader: ignoring body close error: %v", errClose)
			}
		}
		if awsErr, ok := getAWSError(err); ok && awsErr.Code() == "InvalidRange" {
			// Since we're reading many chunks in parallel, some can be past the end of
			// the object, resulting in range errors. Treat these as EOF.
			err = io.EOF
		}
		return nil, err
	}
	_ = atomic.AddInt32(&nOpenPos, 1)
	if output.ContentLength == nil || output.ETag == nil || output.LastModified == nil {
		return nil, errors.E("s3file.newPosReader: object missing metadata (ContentLength, ETag, LastModified)")
	}
	if *output.ContentLength < 0 {
		// We do not expect AWS to return negative ContentLength, but we are
		// defensive, as things may otherwise break very confusingly for
		// callers.
		return nil, io.EOF
	}
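	// For a "bytes=<offset>-" range request, ContentLength is the number of bytes from offset to
	// the end of the object, so offset + ContentLength reconstructs the full object size.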
	r.info = s3Info{
		name:    filepath.Base(name),
		size:    offset + *output.ContentLength,
		modTime: *output.LastModified,
		etag:    *output.ETag,
	}
	r.rc = output.Body
	return &r, nil
}

// Read usually delegates to the underlying reader, except: (&posReader{}).Read is valid and
// always at EOF; nil.Read panics.
func (p *posReader) Read(dst []byte) (int, error) {
	if p.rc == nil {
		return 0, io.EOF
	}
	n, err := p.rc.Read(dst)
	p.offset += int64(n)
	return n, err
}

// Close usually delegates to the underlying reader, except: (&posReader{}).Close
// and nil.Close do nothing.
func (p *posReader) Close() {
	if p == nil || p.rc == nil {
		return
	}
	_ = atomic.AddInt32(&nOpenPos, -1)
	if err := p.rc.Close(); err != nil {
		// Note: Since the caller is already done reading from p.rc, we don't expect this error to
		// indicate a problem with the correctness of past Reads, instead signaling some resource
		// leakage (network connection, buffers, etc.). We can't retry the resource release:
		//   * io.Closer does not define behavior for multiple Close calls and
		//     s3.GetObjectOutput.Body doesn't say anything implementation-specific.
		//   * Body may be a net/http.Response.Body [1] but the standard library doesn't say
		//     anything about multiple Close either (and even if it did, we shouldn't rely on the
		//     AWS SDK's implementation details in all cases or in the future).
		// Without a retry opportunity, it seems like callers could either ignore the potential
		// leak, or exit the OS process. We assume, for now, that callers won't want to do the
		// latter, so we hide the error. (This could eventually lead to OS process exit due to
		// resource exhaustion, so arguably this hiding doesn't add much harm, though of course it
		// may be confusing.) We could consider changing this in the future, especially if we notice
		// such resource leaks in real programs.
		//
		// [1] https://github.com/aws/aws-sdk-go/blob/e842504a6323096540dc3defdc7cb357d8749893/private/protocol/rest/unmarshal.go#L89-L90
		log.Printf("s3file.posReader.Close: ignoring body close error: %v", err)
	}
}

// maybeIDs returns ids if available, otherwise zero. p == nil is allowed.
func (p *posReader) maybeIDs() s3RequestIDs {
	if p == nil {
		return s3RequestIDs{}
	}
	return p.ids
}