github.com/sagansystems/goofys-app@v0.19.1-0.20180410053237-b2302fdf5af9/internal/file.go

// Copyright 2015 - 2017 Ka-Hing Cheung
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package internal

import (
	"fmt"
	"io"
	"net/url"
	"strconv"
	"sync"
	"syscall"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/s3"

	"github.com/jacobsa/fuse"
)

type FileHandle struct {
	inode *Inode

	mpuKey    *string
	dirty     bool
	writeInit sync.Once
	mpuWG     sync.WaitGroup
	etags     []*string

	mu              sync.Mutex
	mpuId           *string
	nextWriteOffset int64
	lastPartId      int

	poolHandle *BufferPool
	buf        *MBuf

	lastWriteError error

	// read
	reader        io.ReadCloser
	readBufOffset int64

	// parallel read
	buffers           []*S3ReadBuffer
	existingReadahead int
	seqReadAmount     uint64
	numOOORead        uint64 // number of out-of-order reads
}

const MAX_READAHEAD = uint32(100 * 1024 * 1024)
const READAHEAD_CHUNK = uint32(20 * 1024 * 1024)

func NewFileHandle(in *Inode) *FileHandle {
	fh := &FileHandle{inode: in}
	return fh
}

// initWrite starts CreateMultipartUpload exactly once per flush cycle.
func (fh *FileHandle) initWrite() {
	fh.writeInit.Do(func() {
		fh.mpuWG.Add(1)
		go fh.initMPU()
	})
}

func (fh *FileHandle) initMPU() {
	defer func() {
		fh.mpuWG.Done()
	}()

	fh.mpuKey = fh.inode.FullName()
	fs := fh.inode.fs

	params := &s3.CreateMultipartUploadInput{
		Bucket:       &fs.bucket,
		Key:          fs.key(*fh.mpuKey),
		StorageClass: &fs.flags.StorageClass,
		ContentType:  fs.getMimeType(*fh.inode.FullName()),
	}

	if fs.flags.UseSSE {
		params.ServerSideEncryption = &fs.sseType
		if fs.flags.UseKMS && fs.flags.KMSKeyID != "" {
			params.SSEKMSKeyId = &fs.flags.KMSKeyID
		}
	}

	if fs.flags.ACL != "" {
		params.ACL = &fs.flags.ACL
	}

	if !fs.gcs {
		resp, err := fs.s3.CreateMultipartUpload(params)

		fh.mu.Lock()
		defer fh.mu.Unlock()

		if err != nil {
			fh.lastWriteError = mapAwsError(err)
			s3Log.Errorf("CreateMultipartUpload %v = %v", *fh.mpuKey, err)
			// resp is not valid on error; leave mpuId nil so
			// callers see lastWriteError instead
			return
		}

		s3Log.Debug(resp)

		fh.mpuId = resp.UploadId
	} else {
		// GCS has no CreateMultipartUpload; start a resumable
		// upload instead
		req, _ := fs.s3.CreateMultipartUploadRequest(params)
		// get rid of ?uploads=
		req.HTTPRequest.URL.RawQuery = ""
		req.HTTPRequest.Header.Set("x-goog-resumable", "start")

		err := req.Send()
		if err != nil {
			fh.lastWriteError = mapAwsError(err)
			s3Log.Errorf("CreateMultipartUpload %v = %v", *fh.mpuKey, err)
			return
		}

		location := req.HTTPResponse.Header.Get("Location")
		_, err = url.Parse(location)
		if err != nil {
			fh.lastWriteError = mapAwsError(err)
			s3Log.Errorf("CreateMultipartUpload %v = %v", *fh.mpuKey, err)
			return
		}

		fh.mpuId = &location
	}

	fh.etags = make([]*string, 10000) // at most 10K parts
}
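
// mpuPartNoSpawn synchronously uploads one part and frees buf when done.
// For S3 it calls UploadPart and records the returned ETag; for GCS it
// PUTs the bytes to the resumable-upload URL saved in mpuId, using
// Content-Range to mark whether this is the final chunk.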
func (fh *FileHandle) mpuPartNoSpawn(buf *MBuf, part int, total int64, last bool) (err error) {
	fs := fh.inode.fs

	fs.replicators.Take(1, true)
	defer fs.replicators.Return(1)

	defer buf.Free()

	if part <= 0 || part > 10000 {
		return fmt.Errorf("invalid part number: %v", part)
	}

	en := &fh.etags[part-1]

	if !fs.gcs {
		params := &s3.UploadPartInput{
			Bucket:     &fs.bucket,
			Key:        fs.key(*fh.inode.FullName()),
			PartNumber: aws.Int64(int64(part)),
			UploadId:   fh.mpuId,
			Body:       buf,
		}

		s3Log.Debug(params)

		resp, err := fs.s3.UploadPart(params)
		if err != nil {
			return mapAwsError(err)
		}

		if *en != nil {
			panic(fmt.Sprintf("etag for part %v already set: %v", part, **en))
		}
		*en = resp.ETag
	} else {
		// the mpuId serves as the authentication token, so
		// technically we don't need to sign this anymore and
		// could use a plain HTTP request, but we go through
		// aws-sdk-go anyway to get its retry handling
		params := &s3.PutObjectInput{
			Bucket: &fs.bucket,
			Key:    fs.key(*fh.inode.FullName()),
			Body:   buf,
		}

		s3Log.Debug(params)

		req, _ := fs.s3.PutObjectRequest(params)
		req.HTTPRequest.URL, _ = url.Parse(*fh.mpuId)

		bufSize := buf.Len()
		start := total - int64(bufSize)
		end := total - 1
		var size string
		if last {
			size = strconv.FormatInt(total, 10)
		} else {
			size = "*"
		}

		contentRange := fmt.Sprintf("bytes %v-%v/%v", start, end, size)

		req.HTTPRequest.Header.Set("Content-Length", strconv.Itoa(bufSize))
		req.HTTPRequest.Header.Set("Content-Range", contentRange)

		err = req.Send()
		if err != nil {
			// 308 is how GCS acknowledges a non-final chunk
			// of a resumable upload
			if req.HTTPResponse.StatusCode == 308 {
				err = nil
			} else {
				return mapAwsError(err)
			}
		}
	}

	return
}

func (fh *FileHandle) mpuPart(buf *MBuf, part int, total int64) {
	defer func() {
		fh.mpuWG.Done()
	}()

	// maybe wait for CreateMultipartUpload
	if fh.mpuId == nil {
		fh.mpuWG.Wait()
		// initMPU might have errored
		if fh.mpuId == nil {
			return
		}
	}

	err := fh.mpuPartNoSpawn(buf, part, total, false)
	if err != nil {
		if fh.lastWriteError == nil {
			// err was already mapped by mpuPartNoSpawn
			fh.lastWriteError = err
		}
	}
}

// waitForCreateMPU lazily starts the multipart upload and blocks until
// initMPU has finished. Called with fh.mu held; the lock is dropped
// while waiting.
func (fh *FileHandle) waitForCreateMPU() (err error) {
	if fh.mpuId == nil {
		fh.mu.Unlock()
		fh.initWrite()
		fh.mpuWG.Wait() // wait for initMPU
		fh.mu.Lock()

		if fh.lastWriteError != nil {
			return fh.lastWriteError
		}
	}

	return
}

// partSize grows with the part number so that a 10000-part upload can
// reach ~1TB (1000*5MB + 1000*25MB + 8000*125MB) while small files
// still use 5MB parts.
func (fh *FileHandle) partSize() uint64 {
	if fh.lastPartId < 1000 {
		return 5 * 1024 * 1024
	} else if fh.lastPartId < 2000 {
		return 25 * 1024 * 1024
	} else {
		return 125 * 1024 * 1024
	}
}

func (fh *FileHandle) uploadCurrentBuf(gcs bool) (err error) {
	err = fh.waitForCreateMPU()
	if err != nil {
		return
	}

	fh.lastPartId++
	part := fh.lastPartId
	buf := fh.buf
	fh.buf = nil

	if !gcs {
		fh.mpuWG.Add(1)
		go fh.mpuPart(buf, part, fh.nextWriteOffset)
	} else {
		// GCS doesn't support concurrent uploads
		err = fh.mpuPartNoSpawn(buf, part, fh.nextWriteOffset, false)
	}

	return
}
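
// WriteFile buffers data into fixed-size parts and starts the multipart
// upload lazily once the first part fills up. Only strictly sequential
// writes are supported; an out-of-order offset fails with ENOTSUP and
// poisons the handle.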
func (fh *FileHandle) WriteFile(offset int64, data []byte) (err error) {
	fh.inode.logFuse("WriteFile", offset, len(data))

	fh.mu.Lock()
	defer fh.mu.Unlock()

	if fh.lastWriteError != nil {
		return fh.lastWriteError
	}

	if offset != fh.nextWriteOffset {
		fh.inode.errFuse("WriteFile: only sequential writes supported", fh.nextWriteOffset, offset)
		fh.lastWriteError = syscall.ENOTSUP
		return fh.lastWriteError
	}

	fs := fh.inode.fs

	if offset == 0 {
		fh.poolHandle = fs.bufferPool
		fh.dirty = true
	}

	for {
		if fh.buf == nil {
			fh.buf = MBuf{}.Init(fh.poolHandle, fh.partSize(), true)
		}

		if fh.buf.Full() {
			// a full buffer carried over from a previous call
			// (GCS mode): flush it before writing more
			err = fh.uploadCurrentBuf(fs.gcs)
			if err != nil {
				return
			}
			fh.buf = MBuf{}.Init(fh.poolHandle, fh.partSize(), true)
		}

		nCopied, _ := fh.buf.Write(data)
		fh.nextWriteOffset += int64(nCopied)

		if !fs.gcs {
			// don't upload a buffer right after writing for
			// GCS because we want to keep a buffer around
			// until flush so that we can mark the last part
			// specially
			if fh.buf.Full() {
				err = fh.uploadCurrentBuf(fs.gcs)
				if err != nil {
					return
				}
			}
		}

		if nCopied == len(data) {
			break
		}

		data = data[nCopied:]
	}

	fh.inode.Attributes.Size = uint64(fh.nextWriteOffset)

	return
}

type S3ReadBuffer struct {
	s3     *s3.S3
	offset uint64
	size   uint32
	buf    *Buffer
}

// Init starts a ranged GET for [offset, offset+size) and wraps the
// response body in a Buffer backed by the handle's buffer pool. It
// returns nil if no buffer could be allocated.
func (b S3ReadBuffer) Init(fh *FileHandle, offset uint64, size uint32) *S3ReadBuffer {
	fs := fh.inode.fs
	b.s3 = fs.s3
	b.offset = offset
	b.size = size

	mbuf := MBuf{}.Init(fh.poolHandle, uint64(size), false)
	if mbuf == nil {
		return nil
	}

	b.buf = Buffer{}.Init(mbuf, func() (io.ReadCloser, error) {
		params := &s3.GetObjectInput{
			Bucket: &fs.bucket,
			Key:    fs.key(*fh.inode.FullName()),
		}

		bytes := fmt.Sprintf("bytes=%v-%v", offset, offset+uint64(size)-1)
		params.Range = &bytes

		req, resp := fs.s3.GetObjectRequest(params)

		err := req.Send()
		if err != nil {
			return nil, mapAwsError(err)
		}

		return resp.Body, nil
	})

	return &b
}

func (b *S3ReadBuffer) Read(offset uint64, p []byte) (n int, err error) {
	if b.offset != offset {
		panic(fmt.Sprintf("not the right buffer, expecting %v got %v, %v left", b.offset, offset, b.size))
	}

	n, err = io.ReadFull(b.buf, p)
	if n != 0 && err == io.ErrUnexpectedEOF {
		err = nil
	}
	if n > 0 {
		if uint32(n) > b.size {
			panic(fmt.Sprintf("read more than available %v %v", n, b.size))
		}

		b.offset += uint64(n)
		b.size -= uint32(n)
	}

	return
}

func (fh *FileHandle) readFromReadAhead(offset uint64, buf []byte) (bytesRead int, err error) {
	var nread int
	for len(fh.buffers) != 0 {
		nread, err = fh.buffers[0].Read(offset+uint64(bytesRead), buf)
		bytesRead += nread
		if err != nil {
			return
		}

		if fh.buffers[0].size == 0 {
			// we've exhausted the first buffer
			fh.buffers[0].buf.Close()
			fh.buffers = fh.buffers[1:]
		}

		buf = buf[nread:]

		if len(buf) == 0 {
			// we've filled the user buffer
			return
		}
	}

	return
}
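
// readAhead tops the pipeline up to MAX_READAHEAD bytes ahead of offset,
// issuing ranged GETs of at most READAHEAD_CHUNK each. Running out of
// buffers is only an error (ENOMEM) when nothing at all is buffered.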
func (fh *FileHandle) readAhead(offset uint64, needAtLeast int) (err error) {
	existingReadahead := uint32(0)
	for _, b := range fh.buffers {
		existingReadahead += b.size
	}

	readAheadAmount := MAX_READAHEAD

	for readAheadAmount-existingReadahead >= READAHEAD_CHUNK {
		off := offset + uint64(existingReadahead)
		remaining := fh.inode.Attributes.Size - off

		// only read up to one readahead chunk each time
		size := MinUInt32(readAheadAmount-existingReadahead, READAHEAD_CHUNK)
		// but don't read past the end of the file
		size = uint32(MinUInt64(uint64(size), remaining))

		if size != 0 {
			fh.inode.logFuse("readahead", off, size, existingReadahead)

			readAheadBuf := S3ReadBuffer{}.Init(fh, off, size)
			if readAheadBuf != nil {
				fh.buffers = append(fh.buffers, readAheadBuf)
				existingReadahead += size
			} else {
				if existingReadahead != 0 {
					// don't do more readahead now, but don't fail, cross our
					// fingers that we will be able to allocate the buffers
					// later
					return nil
				} else {
					return syscall.ENOMEM
				}
			}
		}

		if size != READAHEAD_CHUNK {
			// that was the last remaining chunk to read ahead
			break
		}
	}

	return nil
}

func (fh *FileHandle) ReadFile(offset int64, buf []byte) (bytesRead int, err error) {
	fh.inode.logFuse("ReadFile", offset, len(buf))
	defer func() {
		fh.inode.logFuse("< ReadFile", bytesRead, err)

		if err == io.EOF {
			err = nil
		}
	}()

	fh.mu.Lock()
	defer fh.mu.Unlock()

	nwant := len(buf)
	var nread int

	for bytesRead < nwant && err == nil {
		nread, err = fh.readFile(offset+int64(bytesRead), buf[bytesRead:])
		if nread > 0 {
			bytesRead += nread
		}
	}

	return
}

func (fh *FileHandle) readFile(offset int64, buf []byte) (bytesRead int, err error) {
	defer func() {
		if bytesRead > 0 {
			fh.readBufOffset += int64(bytesRead)
			fh.seqReadAmount += uint64(bytesRead)
		}

		fh.inode.logFuse("< readFile", bytesRead, err)
	}()

	if uint64(offset) >= fh.inode.Attributes.Size {
		// nothing to read
		if fh.inode.Invalid {
			err = fuse.ENOENT
		} else {
			err = io.EOF
		}
		return
	}

	fs := fh.inode.fs

	if fh.poolHandle == nil {
		fh.poolHandle = fs.bufferPool
	}

	if fh.readBufOffset != offset {
		// XXX out of order read, maybe disable prefetching
		fh.inode.logFuse("out of order read", offset, fh.readBufOffset)

		fh.readBufOffset = offset
		fh.seqReadAmount = 0
		if fh.reader != nil {
			fh.reader.Close()
			fh.reader = nil
		}

		if fh.buffers != nil {
			// readahead was in flight, so we misdetected
			// sequential access
			fh.numOOORead++
		}

		for _, b := range fh.buffers {
			b.buf.Close()
		}
		fh.buffers = nil
	}

	if !fs.flags.Cheap && fh.seqReadAmount >= uint64(READAHEAD_CHUNK) && fh.numOOORead < 3 {
		if fh.reader != nil {
			fh.inode.logFuse("cutover to the parallel algorithm")
			fh.reader.Close()
			fh.reader = nil
		}

		err = fh.readAhead(uint64(offset), len(buf))
		if err == nil {
			bytesRead, err = fh.readFromReadAhead(uint64(offset), buf)
			return
		} else {
			// fall back to reading serially
			fh.inode.logFuse("not enough memory, fallback to serial read")
			fh.seqReadAmount = 0
			for _, b := range fh.buffers {
				b.buf.Close()
			}
			fh.buffers = nil
		}
	}

	bytesRead, err = fh.readFromStream(offset, buf)

	return
}
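
// Release drops all buffers held by this handle and decrements the
// inode's open handle count. A write buffer that still owns memory here
// implies an earlier write error, since a successful FlushFile would
// have consumed it.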
func (fh *FileHandle) Release() {
	// read buffers
	for _, b := range fh.buffers {
		b.buf.Close()
	}
	fh.buffers = nil

	if fh.reader != nil {
		fh.reader.Close()
	}

	// write buffers
	if fh.poolHandle != nil {
		if fh.buf != nil && fh.buf.buffers != nil {
			if fh.lastWriteError == nil {
				panic("buf not freed but error is nil")
			}

			fh.buf.Free()
			// the other in-flight multipart PUT buffers will be
			// freed when they finish/error out
		}
	}

	fh.inode.mu.Lock()
	defer fh.inode.mu.Unlock()

	if fh.inode.fileHandles == 0 {
		panic(fh.inode.fileHandles)
	}

	fh.inode.fileHandles -= 1
}

func (fh *FileHandle) readFromStream(offset int64, buf []byte) (bytesRead int, err error) {
	defer func() {
		if fh.inode.fs.flags.DebugFuse {
			fh.inode.logFuse("< readFromStream", bytesRead)
		}
	}()

	if uint64(offset) >= fh.inode.Attributes.Size {
		// nothing to read
		return
	}

	fs := fh.inode.fs

	if fh.reader == nil {
		params := &s3.GetObjectInput{
			Bucket: &fs.bucket,
			Key:    fs.key(*fh.inode.FullName()),
		}

		if offset != 0 {
			bytes := fmt.Sprintf("bytes=%v-", offset)
			params.Range = &bytes
		}

		req, resp := fs.s3.GetObjectRequest(params)

		err = req.Send()
		if err != nil {
			return bytesRead, mapAwsError(err)
		}

		fh.reader = resp.Body
	}

	bytesRead, err = fh.reader.Read(buf)
	if err != nil {
		if err != io.EOF {
			fh.inode.logFuse("< readFromStream error", bytesRead, err)
		}
		// always retry errors on read
		fh.reader.Close()
		fh.reader = nil
		err = nil
	}

	return
}

func (fh *FileHandle) flushSmallFile() (err error) {
	buf := fh.buf
	fh.buf = nil

	if buf == nil {
		buf = MBuf{}.Init(fh.poolHandle, 0, true)
	}

	defer buf.Free()

	fs := fh.inode.fs

	storageClass := fs.flags.StorageClass
	// STANDARD_IA bills a 128KB minimum per object, so store
	// smaller objects as STANDARD
	if fh.nextWriteOffset < 128*1024 && storageClass == "STANDARD_IA" {
		storageClass = "STANDARD"
	}

	params := &s3.PutObjectInput{
		Bucket:       &fs.bucket,
		Key:          fs.key(*fh.inode.FullName()),
		Body:         buf,
		StorageClass: &storageClass,
		ContentType:  fs.getMimeType(*fh.inode.FullName()),
	}

	if fs.flags.UseSSE {
		params.ServerSideEncryption = &fs.sseType
		if fs.flags.UseKMS && fs.flags.KMSKeyID != "" {
			params.SSEKMSKeyId = &fs.flags.KMSKeyID
		}
	}

	if fs.flags.ACL != "" {
		params.ACL = &fs.flags.ACL
	}

	fs.replicators.Take(1, true)
	defer fs.replicators.Return(1)

	_, err = fs.s3.PutObject(params)
	if err != nil {
		err = mapAwsError(err)
		fh.lastWriteError = err
	}
	return
}

// resetToKnownSize rolls the inode's size back to the last size known
// to be on the server, marking the inode invalid if we never knew one.
func (fh *FileHandle) resetToKnownSize() {
	if fh.inode.KnownSize != nil {
		fh.inode.Attributes.Size = *fh.inode.KnownSize
	} else {
		fh.inode.Attributes.Size = 0
		fh.inode.Invalid = true
	}
}
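
// FlushFile commits pending writes: if no parts were started the file
// goes out as a single PutObject, otherwise the final part is uploaded
// and the multipart upload is completed (for S3; GCS finalizes via the
// last part's Content-Range). On error the MPU is aborted and the inode
// size is rolled back.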
func (fh *FileHandle) FlushFile() (err error) {
	fh.mu.Lock()
	defer fh.mu.Unlock()

	fh.inode.logFuse("FlushFile")

	if !fh.dirty || fh.lastWriteError != nil {
		if fh.lastWriteError != nil {
			err = fh.lastWriteError
			fh.resetToKnownSize()
		}
		return
	}

	fs := fh.inode.fs

	// abort mpu on error
	defer func() {
		if err != nil {
			if fh.mpuId != nil {
				go func() {
					params := &s3.AbortMultipartUploadInput{
						Bucket:   &fs.bucket,
						Key:      fs.key(*fh.inode.FullName()),
						UploadId: fh.mpuId,
					}

					fh.mpuId = nil
					resp, _ := fs.s3.AbortMultipartUpload(params)
					s3Log.Debug(resp)
				}()
			}

			fh.resetToKnownSize()
		} else {
			if fh.dirty {
				// don't unset this if we never actually flushed
				size := fh.inode.Attributes.Size
				fh.inode.KnownSize = &size
				fh.inode.Invalid = false
			}
			fh.dirty = false
		}

		fh.writeInit = sync.Once{}
		fh.nextWriteOffset = 0
		fh.lastPartId = 0
	}()

	if fh.lastPartId == 0 {
		// no parts were started: the whole file fits in one PUT
		return fh.flushSmallFile()
	}

	fh.mpuWG.Wait()

	if fh.lastWriteError != nil {
		return fh.lastWriteError
	}

	if fh.mpuId == nil {
		return
	}

	nParts := fh.lastPartId
	if fh.buf != nil {
		// upload the last part
		nParts++
		err = fh.mpuPartNoSpawn(fh.buf, nParts, fh.nextWriteOffset, true)
		if err != nil {
			return
		}
	}

	if !fs.gcs {
		parts := make([]*s3.CompletedPart, nParts)
		for i := 0; i < nParts; i++ {
			parts[i] = &s3.CompletedPart{
				ETag:       fh.etags[i],
				PartNumber: aws.Int64(int64(i + 1)),
			}
		}

		params := &s3.CompleteMultipartUploadInput{
			Bucket:   &fs.bucket,
			Key:      fs.key(*fh.mpuKey),
			UploadId: fh.mpuId,
			MultipartUpload: &s3.CompletedMultipartUpload{
				Parts: parts,
			},
		}

		s3Log.Debug(params)

		resp, err := fs.s3.CompleteMultipartUpload(params)
		if err != nil {
			return mapAwsError(err)
		}

		s3Log.Debug(resp)
	} else {
		// nothing to do here: for GCS the last part already
		// carried the final Content-Range
	}

	fh.mpuId = nil

	if *fh.mpuKey != *fh.inode.FullName() {
		// the file was renamed while the upload was in flight
		err = renameObject(fs, fh.nextWriteOffset, *fh.mpuKey, *fh.inode.FullName())
	}

	return
}