package s3file

import (
	"context"
	"fmt"
	"io"
	"time"

	"github.com/aws/aws-sdk-go/service/s3/s3iface"
	"github.com/grailbio/base/errors"
	"github.com/grailbio/base/file"
	"github.com/grailbio/base/ioctx"
)

// s3File implements file.File interface.
//
// Operations on a file are internally implemented by a goroutine running handleRequests,
// which reads requests from s3file.reqCh and sends responses to request.ch.
//
// s3File's API methods (Read, Seek, etc.) are implemented by:
// - Create a chan response.
// - Construct a request{} object describing the operation and send it to reqCh.
// - Wait for a message from either the response channel or context.Done(),
//   whichever comes first.
type s3File struct {
	name             string // "s3://bucket/key/.."
	clientsForAction clientsForActionFunc
	mode             accessMode
	opts             file.Opts

	bucket string // bucket part of "name".
	key    string // key part "name".

	// info is file metadata. Set at construction if mode == readonly, otherwise nil.
	info *s3Info

	// reqCh transports user operations (like Read) to the worker goroutine (handleRequests).
	// This allows respecting context cancellation (regardless of what underlying AWS SDK operations
	// do). It also guards subsequent fields; they are only accessed by the handleRequests
	// goroutine.
	reqCh chan request

	// readerState is used for Reader(), which shares state across multiple callers.
	readerState

	// Used by files opened for writing.
	uploader *s3Uploader
}

// Name returns the name of the file.
func (f *s3File) Name() string {
	return f.name
}

// String returns the file's full "s3://bucket/key" name, for logging and
// fmt.Stringer use.
func (f *s3File) String() string {
	return f.name
}

// s3Info implements file.Info interface.
type s3Info struct {
	name    string
	size    int64
	modTime time.Time
	etag    string // = GetObjectOutput.ETag
}

func (i *s3Info) Name() string       { return i.name }
func (i *s3Info) Size() int64        { return i.size }
func (i *s3Info) ModTime() time.Time { return i.modTime }
func (i *s3Info) ETag() string       { return i.etag }

// Stat returns the file metadata captured when the file was opened. It is
// supported only in read mode; no S3 round trip happens here since info is
// already populated for readonly files (see the s3File.info field comment).
func (f *s3File) Stat(ctx context.Context) (file.Info, error) {
	if f.mode != readonly {
		return nil, errors.E(errors.NotSupported, f.name, "stat for writeonly file not supported")
	}
	if f.info == nil {
		// Invariant: readonly files always carry metadata (set at construction).
		panic(f)
	}
	return f.info, nil
}

type (
	// reader pairs a file with a (possibly shared) readerState, performing
	// positioned reads at the state's current offset.
	reader struct {
		f *s3File
		*readerState
	}
	// readerState holds the current read offset and caches the underlying
	// chunked body reader across Read calls.
	readerState struct {
		position   int64
		bodyReader chunkReaderCache
	}
	// defaultReader adapts the request/response channel protocol to the plain
	// io.ReadSeeker returned by Reader(); all instances share f.readerState.
	defaultReader struct {
		ctx context.Context
		f   *s3File
	}
)

// Read reads up to len(p) bytes at the state's current position and advances
// the position by the number of bytes read.
func (r reader) Read(ctx context.Context, p []byte) (int, error) {
	// TODO: Defensively guard against the underlying http body reader not respecting context
	// cancellation. Note that the handleRequests mechanism guards against this for its
	// operations (in addition to synchronizing), but that's not true here.
	// Such defense may be appropriate here, or deeper in the stack.
	n, err := r.f.readAt(ctx, &r.bodyReader, p, r.position)
	r.position += int64(n)
	return n, err
}

// Close releases the cached body reader, if any. It never fails.
func (r *readerState) Close(ctx context.Context) error {
	r.bodyReader.close()
	return nil
}

// OffsetReader returns a reader that starts at offset with its own private
// position and body-reader cache, independent of the file's default reader.
func (f *s3File) OffsetReader(offset int64) ioctx.ReadCloser {
	return reader{f, &readerState{position: offset}}
}

// Read implements io.Reader by forwarding the operation to the
// handleRequests goroutine, which serializes access to the shared state.
func (r defaultReader) Read(p []byte) (int, error) {
	res := r.f.runRequest(r.ctx, request{
		reqType: readRequest,
		buf:     p,
	})
	return res.n, res.err
}

// Seek implements io.Seeker by forwarding the operation to the
// handleRequests goroutine.
func (r defaultReader) Seek(offset int64, whence int) (int64, error) {
	res := r.f.runRequest(r.ctx, request{
		reqType: seekRequest,
		off:     offset,
		whence:  whence,
	})
	return res.off, res.err
}

// Reader returns the default reader. There is only one default reader state for the entire file,
// and all objects returned by Reader share it.
// TODO: Consider deprecating this in favor of NewReader.
func (f *s3File) Reader(ctx context.Context) io.ReadSeeker {
	if f.mode != readonly {
		return file.NewError(fmt.Errorf("reader %v: file is not opened in read mode", f.name))
	}
	return defaultReader{ctx, f}
}

// s3Writer implements a placeholder io.Writer for S3.
type s3Writer struct {
	ctx context.Context
	f   *s3File
}

// Write buffers p for upload via the handleRequests goroutine. It always
// reports len(p) bytes written (see handleWrite); upload errors surface later,
// when Close calls uploader.finish().
func (w *s3Writer) Write(p []byte) (n int, err error) {
	if len(p) == 0 {
		return 0, nil
	}
	res := w.f.runRequest(w.ctx, request{
		reqType: writeRequest,
		buf:     p,
	})
	return res.n, res.err
}

// Writer returns an io.Writer for a file opened in write mode; for any other
// mode it returns an error value implementing io.Writer.
func (f *s3File) Writer(ctx context.Context) io.Writer {
	if f.mode != writeonly {
		return file.NewError(fmt.Errorf("writer %v: file is not opened in write mode", f.name))
	}
	return &s3Writer{ctx: ctx, f: f}
}

// Close finishes any pending upload and reader state, then shuts down the
// handleRequests goroutine by closing reqCh. The file must not be used (and
// Close/Discard must not be called again) afterwards.
func (f *s3File) Close(ctx context.Context) error {
	err := f.runRequest(ctx, request{reqType: closeRequest}).err
	close(f.reqCh) // terminates the handleRequests loop.
	return err
}

// Discard aborts a pending upload and shuts down the handleRequests
// goroutine. It is a no-op unless the file is opened in write mode.
func (f *s3File) Discard(ctx context.Context) {
	if f.mode != writeonly {
		return
	}
	// Best-effort abort: the error is deliberately dropped.
	_ = f.runRequest(ctx, request{reqType: abortRequest})
	close(f.reqCh)
}

// requestType identifies the operation carried by a request.
type requestType int

const (
	seekRequest requestType = iota
	readRequest
	statRequest
	writeRequest
	closeRequest
	abortRequest
)

// request is one user-level operation sent to the handleRequests goroutine.
type request struct {
	ctx     context.Context // context passed to Read, Seek, Close, etc.
	reqType requestType

	// For Read and Write
	buf []byte

	// For Seek
	off    int64
	whence int

	// For sending the response
	ch chan response
}

// response is the reply to a single request, delivered on request.ch.
type response struct {
	n         int     // # of bytes read. Set only by Read.
	off       int64   // Seek location. Set only by Seek.
	info      *s3Info // Set only by Stat.
	signedURL string  // Set only by Presign.
	err       error   // Any error
	uploader  *s3Uploader
}

// handleRequests runs in a dedicated goroutine and serializes all operations
// on the file; it owns the fields documented on s3File.reqCh. It exits when
// reqCh is closed (by Close or Discard). Each handler sends exactly one
// response, after which the per-request channel is closed.
func (f *s3File) handleRequests() {
	for req := range f.reqCh {
		switch req.reqType {
		case statRequest:
			f.handleStat(req)
		case seekRequest:
			f.handleSeek(req)
		case readRequest:
			f.handleRead(req)
		case writeRequest:
			f.handleWrite(req)
		case closeRequest:
			f.handleClose(req)
		case abortRequest:
			f.handleAbort(req)
		default:
			panic(fmt.Sprintf("Illegal request: %+v", req))
		}
		close(req.ch)
	}
}

// Send a request to the handleRequests goroutine and wait for a response. The
// caller must set all the necessary fields in req, except ctx and ch, which are
// filled by this method. On ctx timeout or cancellation, returns a response
// with non-nil err field.
func (f *s3File) runRequest(ctx context.Context, req request) response {
	// Buffered (cap 1) so the handler's send never blocks even if we abandon
	// the wait below on ctx cancellation.
	resCh := make(chan response, 1)
	req.ctx = ctx
	req.ch = resCh
	f.reqCh <- req
	select {
	case res := <-resCh:
		return res
	case <-ctx.Done():
		return response{err: errors.E(errors.Canceled)}
	}
}

// handleStat fetches the object's metadata from S3 (with retry/backoff) and
// caches it in f.info. Called only from the handleRequests goroutine.
func (f *s3File) handleStat(req request) {
	ctx := req.ctx
	clients, err := f.clientsForAction(ctx, "GetObject", f.bucket, f.key)
	if err != nil {
		req.ch <- response{err: errors.E(err, fmt.Sprintf("s3file.stat %v", f.name))}
		return
	}
	policy := newBackoffPolicy(clients, f.opts)
	info, err := stat(ctx, clients, policy, f.name, f.bucket, f.key)
	if err != nil {
		req.ch <- response{err: err}
		return
	}
	f.info = info
	req.ch <- response{err: nil}
}

// handleSeek updates the default reader's position with io.Seeker semantics
// (Seek implements io.Seeker). Called only from the handleRequests goroutine.
// Seeking past EOF is allowed; readAt reports io.EOF on the subsequent read.
func (f *s3File) handleSeek(req request) {
	if f.info == nil {
		panic("stat not filled")
	}
	var newPosition int64
	switch req.whence {
	case io.SeekStart:
		newPosition = req.off
	case io.SeekCurrent:
		newPosition = f.position + req.off
	case io.SeekEnd:
		newPosition = f.info.size + req.off
	default:
		req.ch <- response{off: f.position, err: fmt.Errorf("s3file.seek(%s,%d,%d): illegal whence", f.name, req.off, req.whence)}
		return
	}
	if newPosition < 0 {
		req.ch <- response{off: f.position, err: fmt.Errorf("s3file.seek(%s,%d,%d): out-of-bounds seek", f.name, req.off, req.whence)}
		return
	}
	if newPosition == f.position {
		req.ch <- response{off: f.position}
		return
	}
	f.position = newPosition
	req.ch <- response{off: f.position}
}

// readAt reads up to len(buf) bytes from the object at absolute offset off,
// creating (or reusing) a cached chunk reader via readerCache. It returns the
// byte count and an error; reads at or past EOF return (0, io.EOF), and a
// change in the object's ETag since open is reported as an error.
func (f *s3File) readAt(
	ctx context.Context,
	readerCache *chunkReaderCache,
	buf []byte,
	off int64,
) (int, error) {
	if f.mode != readonly {
		return 0, errors.E(errors.NotAllowed, "not opened for read")
	}
	if f.info == nil {
		panic("stat not filled")
	}

	reader, cleanUp, err := readerCache.getOrCreate(ctx, func() (*chunkReaderAt, error) {
		clients, err := f.clientsForAction(ctx, "GetObject", f.bucket, f.key)
		if err != nil {
			return nil, errors.E(err, "getting clients")
		}
		return &chunkReaderAt{
			name:   f.name,
			bucket: f.bucket,
			key:    f.key,
			newRetryPolicy: func() retryPolicy {
				// Copy the clients slice so each policy gets its own backing
				// array to work with.
				return newBackoffPolicy(append([]s3iface.S3API{}, clients...), f.opts)
			},
		}, nil
	})
	if err != nil {
		return 0, err
	}
	defer cleanUp()

	var n int
	// Note: We allow seeking past EOF, consistent with io.Seeker.Seek's documentation. We simply
	// return EOF in this situation.
	if bytesUntilEOF := f.info.size - off; bytesUntilEOF <= 0 {
		err = io.EOF
	} else {
		// Because we know the size of the object, pass a smaller buffer to the
		// chunk reader to save it the effort of trying to fill it (with
		// parallel reads). This is an optimization that does not affect
		// correctness.
		// TODO: Consider how to move this optimization into the chunk reader
		// itself, possibly by optionally passing in the size/metadata.
		if len(buf) > int(bytesUntilEOF) {
			buf = buf[:bytesUntilEOF]
		}
		var info s3Info
		n, info, err = reader.ReadAt(ctx, buf, off)
		if err != nil && err != io.EOF {
			err = errors.E(err, fmt.Sprintf("s3file.read %v", f.name))
		} else if info == (s3Info{}) {
			// Maybe EOF or len(req.buf) == 0.
		} else if f.info.etag != info.etag {
			// Note: If err was io.EOF, we intentionally drop that in favor of flagging ETag mismatch.
			err = eTagChangedError(f.name, f.info.etag, info.etag)
		}
	}
	return n, err
}

// handleRead serves a readRequest against the file's shared default reader
// state. Called only from the handleRequests goroutine.
func (f *s3File) handleRead(req request) {
	n, err := reader{f, &f.readerState}.Read(req.ctx, req.buf)
	req.ch <- response{n: n, err: err}
}

// handleWrite hands the request's buffer to the uploader and unconditionally
// reports the full length as written; upload errors are deferred until
// handleClose. Called only from the handleRequests goroutine.
func (f *s3File) handleWrite(req request) {
	f.uploader.write(req.buf)
	req.ch <- response{n: len(req.buf), err: nil}
}

// handleClose completes any pending upload and releases reader state,
// combining errors from both. Called only from the handleRequests goroutine.
func (f *s3File) handleClose(req request) {
	var err error
	if f.uploader != nil {
		err = f.uploader.finish()
	}
	errors.CleanUpCtx(req.ctx, f.readerState.Close, &err)
	if err != nil {
		err = errors.E(err, "s3file.close", f.name)
	}
	// NOTE(review): clearing clientsForAction presumably marks the file as
	// closed; any later use would nil-panic rather than silently succeed —
	// confirm this is the intent.
	f.clientsForAction = nil
	req.ch <- response{err: err}
}

// handleAbort cancels the pending upload via uploader.abort(). Called only
// from the handleRequests goroutine, and only for writeonly files (see
// Discard).
func (f *s3File) handleAbort(req request) {
	err := f.uploader.abort()
	if err != nil {
		err = errors.E(err, "s3file.abort", f.name)
	}
	f.clientsForAction = nil
	req.ch <- response{err: err}
}