github.com/grailbio/base@v0.0.11/file/s3file/file.go (about)

     1  package s3file
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"time"
     8  
     9  	"github.com/aws/aws-sdk-go/service/s3/s3iface"
    10  	"github.com/grailbio/base/errors"
    11  	"github.com/grailbio/base/file"
    12  	"github.com/grailbio/base/ioctx"
    13  )
    14  
// s3File implements file.File interface.
//
// Operations on a file are internally implemented by a goroutine running handleRequests,
// which reads requests from s3file.reqCh and sends responses to request.ch.
//
// s3File's API methods (Read, Seek, etc.) are implemented by:
// - Create a chan response.
// - Construct a request{} object describing the operation and send it to reqCh.
// - Wait for a message from either the response channel or context.Done(),
// whichever comes first.
type s3File struct {
	name             string               // "s3://bucket/key/.."
	clientsForAction clientsForActionFunc // yields the S3 clients to try for a given API action.
	mode             accessMode           // readonly or writeonly; fixed when the file is opened.
	opts             file.Opts

	bucket string // bucket part of "name".
	key    string // key part "name".

	// info is file metadata. Set at construction if mode == readonly, otherwise nil.
	info *s3Info

	// reqCh transports user operations (like Read) to the worker goroutine (handleRequests).
	// This allows respecting context cancellation (regardless of what underlying AWS SDK operations
	// do). It also guards subsequent fields; they are only accessed by the handleRequests
	// goroutine.
	reqCh chan request

	// readerState is used for Reader(), which shares state across multiple callers.
	readerState

	// uploader buffers and uploads written data.
	// Used by files opened for writing; nil otherwise.
	uploader *s3Uploader
}
    49  
    50  // Name returns the name of the file.
    51  func (f *s3File) Name() string {
    52  	return f.name
    53  }
    54  
    55  func (f *s3File) String() string {
    56  	return f.name
    57  }
    58  
    59  // s3Info implements file.Info interface.
    60  type s3Info struct {
    61  	name    string
    62  	size    int64
    63  	modTime time.Time
    64  	etag    string // = GetObjectOutput.ETag
    65  }
    66  
    67  func (i *s3Info) Name() string       { return i.name }
    68  func (i *s3Info) Size() int64        { return i.size }
    69  func (i *s3Info) ModTime() time.Time { return i.modTime }
    70  func (i *s3Info) ETag() string       { return i.etag }
    71  
    72  func (f *s3File) Stat(ctx context.Context) (file.Info, error) {
    73  	if f.mode != readonly {
    74  		return nil, errors.E(errors.NotSupported, f.name, "stat for writeonly file not supported")
    75  	}
    76  	if f.info == nil {
    77  		panic(f)
    78  	}
    79  	return f.info, nil
    80  }
    81  
type (
	// reader is a stateless handle combining an s3File with a (possibly shared)
	// readerState. It is passed by value; mutations to position persist because
	// readerState is held by pointer.
	reader struct {
		f *s3File
		*readerState
	}
	// readerState tracks the current read offset and caches the underlying
	// chunk reader so sequential reads can reuse it.
	readerState struct {
		position   int64
		bodyReader chunkReaderCache
	}
	// defaultReader adapts the request/response worker mechanism to the plain
	// io.ReadSeeker interface returned by Reader(); all its operations go
	// through s3File.runRequest.
	defaultReader struct {
		ctx context.Context
		f   *s3File
	}
)
    96  
    97  func (r reader) Read(ctx context.Context, p []byte) (int, error) {
    98  	// TODO: Defensively guard against the underlying http body reader not respecting context
    99  	// cancellation. Note that the handleRequests mechanism guards against this for its
   100  	// operations (in addition to synchronizing), but that's not true here.
   101  	// Such defense may be appropriate here, or deeper in the stack.
   102  	n, err := r.f.readAt(ctx, &r.bodyReader, p, r.position)
   103  	r.position += int64(n)
   104  	return n, err
   105  }
   106  
   107  func (r *readerState) Close(ctx context.Context) error {
   108  	r.bodyReader.close()
   109  	return nil
   110  }
   111  
   112  func (f *s3File) OffsetReader(offset int64) ioctx.ReadCloser {
   113  	return reader{f, &readerState{position: offset}}
   114  }
   115  
   116  func (r defaultReader) Read(p []byte) (int, error) {
   117  	res := r.f.runRequest(r.ctx, request{
   118  		reqType: readRequest,
   119  		buf:     p,
   120  	})
   121  	return res.n, res.err
   122  }
   123  
   124  func (r defaultReader) Seek(offset int64, whence int) (int64, error) {
   125  	res := r.f.runRequest(r.ctx, request{
   126  		reqType: seekRequest,
   127  		off:     offset,
   128  		whence:  whence,
   129  	})
   130  	return res.off, res.err
   131  }
   132  
   133  // Reader returns the default reader. There is only one default reader state for the entire file,
   134  // and all objects returned by Reader share it.
   135  // TODO: Consider deprecating this in favor of NewReader.
   136  func (f *s3File) Reader(ctx context.Context) io.ReadSeeker {
   137  	if f.mode != readonly {
   138  		return file.NewError(fmt.Errorf("reader %v: file is not opened in read mode", f.name))
   139  	}
   140  	return defaultReader{ctx, f}
   141  }
   142  
// s3Writer implements a placeholder io.Writer for S3. It simply forwards each
// Write through the s3File's request channel, carrying the context captured
// when the writer was created.
type s3Writer struct {
	ctx context.Context
	f   *s3File
}
   148  
   149  func (w *s3Writer) Write(p []byte) (n int, err error) {
   150  	if len(p) == 0 {
   151  		return 0, nil
   152  	}
   153  	res := w.f.runRequest(w.ctx, request{
   154  		reqType: writeRequest,
   155  		buf:     p,
   156  	})
   157  	return res.n, res.err
   158  }
   159  
   160  func (f *s3File) Writer(ctx context.Context) io.Writer {
   161  	if f.mode != writeonly {
   162  		return file.NewError(fmt.Errorf("writer %v: file is not opened in write mode", f.name))
   163  	}
   164  	return &s3Writer{ctx: ctx, f: f}
   165  }
   166  
   167  func (f *s3File) Close(ctx context.Context) error {
   168  	err := f.runRequest(ctx, request{reqType: closeRequest}).err
   169  	close(f.reqCh)
   170  	return err
   171  }
   172  
   173  func (f *s3File) Discard(ctx context.Context) {
   174  	if f.mode != writeonly {
   175  		return
   176  	}
   177  	_ = f.runRequest(ctx, request{reqType: abortRequest})
   178  	close(f.reqCh)
   179  }
   180  
// requestType discriminates the operation carried by a request to the
// handleRequests worker goroutine.
type requestType int

const (
	seekRequest  requestType = iota // reposition the default reader (handleSeek).
	readRequest                     // read into request.buf (handleRead).
	statRequest                     // refresh f.info from S3 (handleStat).
	writeRequest                    // append request.buf to the upload (handleWrite).
	closeRequest                    // finish the upload / release state (handleClose).
	abortRequest                    // abort the upload (handleAbort).
)
   191  
// request describes one operation sent to the handleRequests worker
// goroutine. Only the fields relevant to the given reqType are set.
type request struct {
	ctx     context.Context // context passed to Read, Seek, Close, etc.
	reqType requestType

	// For Read and Write
	buf []byte

	// For Seek
	off    int64
	whence int

	// For sending the response; created (buffered) and closed by runRequest's
	// machinery so the worker's send never blocks.
	ch chan response
}
   206  
// response is the worker goroutine's reply to a single request. As with
// request, only the fields relevant to the operation are populated.
type response struct {
	n         int     // # of bytes read. Set only by Read.
	off       int64   // Seek location. Set only by Seek.
	info      *s3Info // Set only by Stat.
	signedURL string  // Set only by Presign.
	err       error   // Any error
	uploader  *s3Uploader
}
   215  
   216  func (f *s3File) handleRequests() {
   217  	for req := range f.reqCh {
   218  		switch req.reqType {
   219  		case statRequest:
   220  			f.handleStat(req)
   221  		case seekRequest:
   222  			f.handleSeek(req)
   223  		case readRequest:
   224  			f.handleRead(req)
   225  		case writeRequest:
   226  			f.handleWrite(req)
   227  		case closeRequest:
   228  			f.handleClose(req)
   229  		case abortRequest:
   230  			f.handleAbort(req)
   231  		default:
   232  			panic(fmt.Sprintf("Illegal request: %+v", req))
   233  		}
   234  		close(req.ch)
   235  	}
   236  }
   237  
   238  // Send a request to the handleRequests goroutine and wait for a response. The
   239  // caller must set all the necessary fields in req, except ctx and ch, which are
   240  // filled by this method. On ctx timeout or cancellation, returns a response
   241  // with non-nil err field.
   242  func (f *s3File) runRequest(ctx context.Context, req request) response {
   243  	resCh := make(chan response, 1)
   244  	req.ctx = ctx
   245  	req.ch = resCh
   246  	f.reqCh <- req
   247  	select {
   248  	case res := <-resCh:
   249  		return res
   250  	case <-ctx.Done():
   251  		return response{err: errors.E(errors.Canceled)}
   252  	}
   253  }
   254  
   255  func (f *s3File) handleStat(req request) {
   256  	ctx := req.ctx
   257  	clients, err := f.clientsForAction(ctx, "GetObject", f.bucket, f.key)
   258  	if err != nil {
   259  		req.ch <- response{err: errors.E(err, fmt.Sprintf("s3file.stat %v", f.name))}
   260  		return
   261  	}
   262  	policy := newBackoffPolicy(clients, f.opts)
   263  	info, err := stat(ctx, clients, policy, f.name, f.bucket, f.key)
   264  	if err != nil {
   265  		req.ch <- response{err: err}
   266  		return
   267  	}
   268  	f.info = info
   269  	req.ch <- response{err: nil}
   270  }
   271  
   272  // Seek implements io.Seeker
   273  func (f *s3File) handleSeek(req request) {
   274  	if f.info == nil {
   275  		panic("stat not filled")
   276  	}
   277  	var newPosition int64
   278  	switch req.whence {
   279  	case io.SeekStart:
   280  		newPosition = req.off
   281  	case io.SeekCurrent:
   282  		newPosition = f.position + req.off
   283  	case io.SeekEnd:
   284  		newPosition = f.info.size + req.off
   285  	default:
   286  		req.ch <- response{off: f.position, err: fmt.Errorf("s3file.seek(%s,%d,%d): illegal whence", f.name, req.off, req.whence)}
   287  		return
   288  	}
   289  	if newPosition < 0 {
   290  		req.ch <- response{off: f.position, err: fmt.Errorf("s3file.seek(%s,%d,%d): out-of-bounds seek", f.name, req.off, req.whence)}
   291  		return
   292  	}
   293  	if newPosition == f.position {
   294  		req.ch <- response{off: f.position}
   295  		return
   296  	}
   297  	f.position = newPosition
   298  	req.ch <- response{off: f.position}
   299  }
   300  
// readAt reads up to len(buf) bytes from the object at absolute offset off,
// using (and lazily populating) readerCache so sequential callers can reuse
// the underlying chunk reader. Reads at or past the recorded object size
// return io.EOF. If the object's ETag observed during the read differs from
// the one recorded at open time, the read fails with eTagChangedError so
// callers never silently mix bytes from different object versions.
func (f *s3File) readAt(
	ctx context.Context,
	readerCache *chunkReaderCache,
	buf []byte,
	off int64,
) (int, error) {
	if f.mode != readonly {
		return 0, errors.E(errors.NotAllowed, "not opened for read")
	}
	// info is set at construction for readonly files; nil here is a bug.
	if f.info == nil {
		panic("stat not filled")
	}

	reader, cleanUp, err := readerCache.getOrCreate(ctx, func() (*chunkReaderAt, error) {
		clients, err := f.clientsForAction(ctx, "GetObject", f.bucket, f.key)
		if err != nil {
			return nil, errors.E(err, "getting clients")
		}
		return &chunkReaderAt{
			name:   f.name,
			bucket: f.bucket,
			key:    f.key,
			newRetryPolicy: func() retryPolicy {
				// Copy the clients slice so each policy gets its own backing
				// array to mutate.
				return newBackoffPolicy(append([]s3iface.S3API{}, clients...), f.opts)
			},
		}, nil
	})
	if err != nil {
		return 0, err
	}
	defer cleanUp()

	var n int
	// Note: We allow seeking past EOF, consistent with io.Seeker.Seek's documentation. We simply
	// return EOF in this situation.
	if bytesUntilEOF := f.info.size - off; bytesUntilEOF <= 0 {
		err = io.EOF
	} else {
		// Because we know the size of the object, pass a smaller buffer to the
		// chunk reader to save it the effort of trying to fill it (with
		// parallel reads). This is an optimization that does not affect
		// correctness.
		// TODO: Consider how to move this optimization into the chunk reader
		// itself, possibly by optionally passing in the size/metadata.
		if len(buf) > int(bytesUntilEOF) {
			buf = buf[:bytesUntilEOF]
		}
		var info s3Info
		n, info, err = reader.ReadAt(ctx, buf, off)
		if err != nil && err != io.EOF {
			err = errors.E(err, fmt.Sprintf("s3file.read %v", f.name))
		} else if info == (s3Info{}) {
			// Maybe EOF or len(req.buf) == 0.
		} else if f.info.etag != info.etag {
			// Note: If err was io.EOF, we intentionally drop that in favor of flagging ETag mismatch.
			err = eTagChangedError(f.name, f.info.etag, info.etag)
		}
	}
	return n, err
}
   361  
   362  func (f *s3File) handleRead(req request) {
   363  	n, err := reader{f, &f.readerState}.Read(req.ctx, req.buf)
   364  	req.ch <- response{n: n, err: err}
   365  }
   366  
   367  func (f *s3File) handleWrite(req request) {
   368  	f.uploader.write(req.buf)
   369  	req.ch <- response{n: len(req.buf), err: nil}
   370  }
   371  
   372  func (f *s3File) handleClose(req request) {
   373  	var err error
   374  	if f.uploader != nil {
   375  		err = f.uploader.finish()
   376  	}
   377  	errors.CleanUpCtx(req.ctx, f.readerState.Close, &err)
   378  	if err != nil {
   379  		err = errors.E(err, "s3file.close", f.name)
   380  	}
   381  	f.clientsForAction = nil
   382  	req.ch <- response{err: err}
   383  }
   384  
   385  func (f *s3File) handleAbort(req request) {
   386  	err := f.uploader.abort()
   387  	if err != nil {
   388  		err = errors.E(err, "s3file.abort", f.name)
   389  	}
   390  	f.clientsForAction = nil
   391  	req.ch <- response{err: err}
   392  }