github.com/maobaolong/goofys@v0.24.1-0.20200717030821-b50ef2d29ddf/internal/file.go (about) 1 // Copyright 2015 - 2017 Ka-Hing Cheung 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package internal 16 17 import ( 18 "errors" 19 "fmt" 20 "io" 21 "sync" 22 "sync/atomic" 23 "syscall" 24 "time" 25 26 "github.com/jacobsa/fuse" 27 "github.com/jacobsa/fuse/fuseops" 28 ) 29 30 type FileHandle struct { 31 inode *Inode 32 cloud StorageBackend 33 key string 34 35 mpuName *string 36 dirty bool 37 writeInit sync.Once 38 mpuWG sync.WaitGroup 39 40 mu sync.Mutex 41 mpuId *MultipartBlobCommitInput 42 nextWriteOffset int64 43 lastPartId uint32 44 45 poolHandle *BufferPool 46 buf *MBuf 47 48 lastWriteError error 49 50 // read 51 reader io.ReadCloser 52 readBufOffset int64 53 54 // parallel read 55 buffers []*S3ReadBuffer 56 existingReadahead int 57 seqReadAmount uint64 58 numOOORead uint64 // number of out of order read 59 // User space PID. All threads created by a process will have the same TGID, 60 // but different PIDs[1]. 61 // This value can be nil if we fail to get TGID from PID[2]. 62 // [1] : https://godoc.org/github.com/shirou/gopsutil/process#Process.Tgid 63 // [2] : https://github.com/shirou/gopsutil#process-class 64 Tgid *int32 65 66 keepPageCache bool // the same value we returned to OpenFile 67 } 68 69 // NewFileHandle returns a new file handle for the given `inode` triggered by fuse 70 // operation with the given `opMetadata` 71 func NewFileHandle(inode *Inode, opMetadata fuseops.OpMetadata) *FileHandle { 72 tgid, err := GetTgid(opMetadata.Pid) 73 if err != nil { 74 log.Debugf( 75 "Failed to retrieve tgid for the given pid. pid: %v err: %v inode id: %v err: %v", 76 opMetadata.Pid, err, inode.Id, err) 77 } 78 fh := &FileHandle{inode: inode, Tgid: tgid} 79 fh.cloud, fh.key = inode.cloud() 80 return fh 81 } 82 83 func (fh *FileHandle) initWrite() { 84 fh.writeInit.Do(func() { 85 fh.mpuWG.Add(1) 86 go fh.initMPU() 87 }) 88 } 89 90 func (fh *FileHandle) initMPU() { 91 defer func() { 92 fh.mpuWG.Done() 93 }() 94 95 fs := fh.inode.fs 96 fh.mpuName = &fh.key 97 98 resp, err := fh.cloud.MultipartBlobBegin(&MultipartBlobBeginInput{ 99 Key: *fh.mpuName, 100 ContentType: fs.flags.GetMimeType(*fh.mpuName), 101 }) 102 103 fh.mu.Lock() 104 defer fh.mu.Unlock() 105 106 if err != nil { 107 fh.lastWriteError = mapAwsError(err) 108 } else { 109 fh.mpuId = resp 110 } 111 112 return 113 } 114 115 func (fh *FileHandle) mpuPartNoSpawn(buf *MBuf, part uint32, total int64, last bool) (err error) { 116 fs := fh.inode.fs 117 118 fs.replicators.Take(1, true) 119 defer fs.replicators.Return(1) 120 121 if part == 0 || part > 10000 { 122 return errors.New(fmt.Sprintf("invalid part number: %v", part)) 123 } 124 125 mpu := MultipartBlobAddInput{ 126 Commit: fh.mpuId, 127 PartNumber: part, 128 Body: buf, 129 Size: uint64(buf.Len()), 130 Last: last, 131 Offset: uint64(total - int64(buf.Len())), 132 } 133 134 defer func() { 135 if mpu.Body != nil { 136 bufferLog.Debugf("Free %T", buf) 137 buf.Free() 138 } 139 }() 140 141 _, err = fh.cloud.MultipartBlobAdd(&mpu) 142 143 return 144 } 145 146 func (fh *FileHandle) mpuPart(buf *MBuf, part uint32, total int64) { 147 defer func() { 148 fh.mpuWG.Done() 149 }() 150 151 // maybe wait for CreateMultipartUpload 152 if fh.mpuId == nil { 153 fh.mpuWG.Wait() 154 // initMPU might have errored 155 if fh.mpuId == nil { 156 return 157 } 158 } 159 160 err := fh.mpuPartNoSpawn(buf, part, total, false) 161 if err != nil { 162 if fh.lastWriteError == nil { 163 fh.lastWriteError = err 164 } 165 } 166 } 167 168 func (fh *FileHandle) waitForCreateMPU() (err error) { 169 if fh.mpuId == nil { 170 fh.mu.Unlock() 171 fh.initWrite() 172 fh.mpuWG.Wait() // wait for initMPU 173 fh.mu.Lock() 174 175 if fh.lastWriteError != nil { 176 return fh.lastWriteError 177 } 178 } 179 180 return 181 } 182 183 func (fh *FileHandle) partSize() uint64 { 184 var size uint64 185 186 if fh.lastPartId < 1000 { 187 size = 5 * 1024 * 1024 188 } else if fh.lastPartId < 2000 { 189 size = 25 * 1024 * 1024 190 } else { 191 size = 125 * 1024 * 1024 192 } 193 194 maxPartSize := fh.cloud.Capabilities().MaxMultipartSize 195 if maxPartSize != 0 { 196 size = MinUInt64(maxPartSize, size) 197 } 198 return size 199 } 200 201 func (fh *FileHandle) uploadCurrentBuf(parallel bool) (err error) { 202 err = fh.waitForCreateMPU() 203 if err != nil { 204 return 205 } 206 207 fh.lastPartId++ 208 part := fh.lastPartId 209 buf := fh.buf 210 fh.buf = nil 211 212 if parallel { 213 fh.mpuWG.Add(1) 214 go fh.mpuPart(buf, part, fh.nextWriteOffset) 215 } else { 216 err = fh.mpuPartNoSpawn(buf, part, fh.nextWriteOffset, false) 217 if fh.lastWriteError == nil { 218 fh.lastWriteError = err 219 } 220 } 221 222 return 223 } 224 225 func (fh *FileHandle) WriteFile(offset int64, data []byte) (err error) { 226 fh.inode.logFuse("WriteFile", offset, len(data)) 227 228 fh.mu.Lock() 229 defer fh.mu.Unlock() 230 231 if fh.lastWriteError != nil { 232 fh.inode.mu.Lock() 233 // our write failed, next time we open we should not 234 // use page cache so we will read from cloud again 235 fh.inode.invalidateCache = true 236 fh.inode.mu.Unlock() 237 return fh.lastWriteError 238 } 239 240 if offset != fh.nextWriteOffset { 241 fh.inode.errFuse("WriteFile: only sequential writes supported", fh.nextWriteOffset, offset) 242 fh.lastWriteError = syscall.ENOTSUP 243 return fh.lastWriteError 244 } 245 246 if offset == 0 { 247 fh.poolHandle = fh.inode.fs.bufferPool 248 fh.dirty = true 249 fh.inode.mu.Lock() 250 // we are updating this file, set knownETag to nil so 251 // on next lookup we won't think it's changed, to 252 // always prefer to read back our own write. We set 253 // this back to the ETag at flush time 254 // 255 // XXX this doesn't actually work, see the notes in 256 // Goofys.OpenFile about KeepPageCache 257 fh.inode.knownETag = nil 258 fh.inode.invalidateCache = false 259 fh.inode.mu.Unlock() 260 } 261 262 for { 263 if fh.buf == nil { 264 fh.buf = MBuf{}.Init(fh.poolHandle, fh.partSize(), true) 265 } 266 267 nCopied, _ := fh.buf.Write(data) 268 fh.nextWriteOffset += int64(nCopied) 269 270 if fh.buf.Full() { 271 err = fh.uploadCurrentBuf(!fh.cloud.Capabilities().NoParallelMultipart) 272 if err != nil { 273 return 274 } 275 } 276 277 if nCopied == len(data) { 278 break 279 } 280 281 data = data[nCopied:] 282 } 283 284 fh.inode.Attributes.Size = uint64(fh.nextWriteOffset) 285 fh.inode.Attributes.Mtime = time.Now() 286 287 return 288 } 289 290 type S3ReadBuffer struct { 291 s3 StorageBackend 292 startOffset uint64 293 nRetries uint8 294 mbuf *MBuf 295 296 offset uint64 297 size uint32 298 buf *Buffer 299 } 300 301 func (b S3ReadBuffer) Init(fh *FileHandle, offset uint64, size uint32) *S3ReadBuffer { 302 b.s3 = fh.cloud 303 b.offset = offset 304 b.startOffset = offset 305 b.size = size 306 b.nRetries = 3 307 308 b.mbuf = MBuf{}.Init(fh.poolHandle, uint64(size), false) 309 if b.mbuf == nil { 310 return nil 311 } 312 313 b.initBuffer(fh, offset, size) 314 return &b 315 } 316 317 func (b *S3ReadBuffer) initBuffer(fh *FileHandle, offset uint64, size uint32) { 318 getFunc := func() (io.ReadCloser, error) { 319 resp, err := b.s3.GetBlob(&GetBlobInput{ 320 Key: fh.key, 321 Start: offset, 322 Count: uint64(size), 323 }) 324 if err != nil { 325 return nil, err 326 } 327 328 return resp.Body, nil 329 } 330 331 if b.buf == nil { 332 b.buf = Buffer{}.Init(b.mbuf, getFunc) 333 } else { 334 b.buf.ReInit(getFunc) 335 } 336 } 337 338 func (b *S3ReadBuffer) Read(offset uint64, p []byte) (n int, err error) { 339 if b.offset == offset { 340 n, err = io.ReadFull(b.buf, p) 341 if n != 0 && err == io.ErrUnexpectedEOF { 342 err = nil 343 } 344 if n > 0 { 345 if uint32(n) > b.size { 346 panic(fmt.Sprintf("read more than available %v %v", n, b.size)) 347 } 348 349 b.offset += uint64(n) 350 b.size -= uint32(n) 351 } 352 if b.size == 0 && err != nil { 353 // we've read everything, sometimes we may 354 // request for more bytes then there's left in 355 // this chunk so we could get an error back, 356 // ex: http2: response body closed this 357 // doesn't tend to happen because our chunks 358 // are aligned to 4K and also 128K (except for 359 // the last chunk, but seems kernel requests 360 // for a smaller buffer for the last chunk) 361 err = nil 362 } 363 364 return 365 } else { 366 panic(fmt.Sprintf("not the right buffer, expecting %v got %v, %v left", b.offset, offset, b.size)) 367 err = errors.New(fmt.Sprintf("not the right buffer, expecting %v got %v", b.offset, offset)) 368 return 369 } 370 } 371 372 func (fh *FileHandle) readFromReadAhead(offset uint64, buf []byte) (bytesRead int, err error) { 373 var nread int 374 for len(fh.buffers) != 0 { 375 readAheadBuf := fh.buffers[0] 376 377 nread, err = readAheadBuf.Read(offset+uint64(bytesRead), buf) 378 bytesRead += nread 379 if err != nil { 380 if err == io.EOF && readAheadBuf.size != 0 { 381 // in case we hit 382 // https://github.com/kahing/goofys/issues/464 383 // again, this will convert that into 384 // an error 385 fuseLog.Errorf("got EOF when data remains: %v", *fh.inode.FullName()) 386 err = io.ErrUnexpectedEOF 387 } else if err != io.EOF && readAheadBuf.size > 0 { 388 // we hit some other errors when 389 // reading from this part. If we can 390 // retry, do that 391 if readAheadBuf.nRetries > 0 { 392 readAheadBuf.nRetries -= 1 393 readAheadBuf.initBuffer(fh, readAheadBuf.offset, readAheadBuf.size) 394 // we unset error and return, 395 // so upper layer will retry 396 // this read 397 err = nil 398 } 399 } 400 return 401 } 402 403 if readAheadBuf.size == 0 { 404 // we've exhausted the first buffer 405 readAheadBuf.buf.Close() 406 fh.buffers = fh.buffers[1:] 407 } 408 409 buf = buf[nread:] 410 411 if len(buf) == 0 { 412 // we've filled the user buffer 413 return 414 } 415 } 416 417 return 418 } 419 420 func (fh *FileHandle) readAhead(offset uint64, needAtLeast int) (err error) { 421 existingReadahead := uint32(0) 422 for _, b := range fh.buffers { 423 existingReadahead += b.size 424 } 425 426 readAheadAmount := fh.inode.fs.flags.MaxReadAhead 427 428 for readAheadAmount-existingReadahead >= fh.inode.fs.flags.ReadAheadChunk { 429 off := offset + uint64(existingReadahead) 430 remaining := fh.inode.Attributes.Size - off 431 432 // only read up to readahead chunk each time 433 size := MinUInt32(readAheadAmount-existingReadahead, fh.inode.fs.flags.ReadAheadChunk) 434 // but don't read past the file 435 size = uint32(MinUInt64(uint64(size), remaining)) 436 437 if size != 0 { 438 fh.inode.logFuse("readahead", off, size, existingReadahead) 439 440 readAheadBuf := S3ReadBuffer{}.Init(fh, off, size) 441 if readAheadBuf != nil { 442 fh.buffers = append(fh.buffers, readAheadBuf) 443 existingReadahead += size 444 } else { 445 if existingReadahead != 0 { 446 // don't do more readahead now, but don't fail, cross our 447 // fingers that we will be able to allocate the buffers 448 // later 449 return nil 450 } else { 451 return syscall.ENOMEM 452 } 453 } 454 } 455 456 if size != fh.inode.fs.flags.ReadAheadChunk { 457 // that was the last remaining chunk to readahead 458 break 459 } 460 } 461 462 return nil 463 } 464 465 func (fh *FileHandle) ReadFile(offset int64, buf []byte) (bytesRead int, err error) { 466 fh.inode.logFuse("ReadFile", offset, len(buf)) 467 defer func() { 468 fh.inode.logFuse("< ReadFile", bytesRead, err) 469 470 if err != nil { 471 if err == io.EOF { 472 err = nil 473 } 474 } 475 }() 476 477 fh.mu.Lock() 478 defer fh.mu.Unlock() 479 480 nwant := len(buf) 481 var nread int 482 483 for bytesRead < nwant && err == nil { 484 nread, err = fh.readFile(offset+int64(bytesRead), buf[bytesRead:]) 485 if nread > 0 { 486 bytesRead += nread 487 } 488 } 489 490 return 491 } 492 493 func (fh *FileHandle) readFile(offset int64, buf []byte) (bytesRead int, err error) { 494 defer func() { 495 if bytesRead > 0 { 496 fh.readBufOffset += int64(bytesRead) 497 fh.seqReadAmount += uint64(bytesRead) 498 } 499 500 fh.inode.logFuse("< readFile", bytesRead, err) 501 }() 502 503 if uint64(offset) >= fh.inode.Attributes.Size { 504 // nothing to read 505 if fh.inode.Invalid { 506 err = fuse.ENOENT 507 } else if fh.inode.KnownSize == nil { 508 err = io.EOF 509 } else { 510 err = io.EOF 511 } 512 return 513 } 514 515 fs := fh.inode.fs 516 517 if fh.poolHandle == nil { 518 fh.poolHandle = fs.bufferPool 519 } 520 521 if fh.readBufOffset != offset { 522 // XXX out of order read, maybe disable prefetching 523 fh.inode.logFuse("out of order read", offset, fh.readBufOffset) 524 525 fh.readBufOffset = offset 526 fh.seqReadAmount = 0 527 if fh.reader != nil { 528 fh.reader.Close() 529 fh.reader = nil 530 } 531 532 if fh.buffers != nil { 533 // we misdetected 534 fh.numOOORead++ 535 } 536 537 for _, b := range fh.buffers { 538 b.buf.Close() 539 } 540 fh.buffers = nil 541 } 542 543 if !fs.flags.Cheap && fh.seqReadAmount >= uint64(fh.inode.fs.flags.ReadAheadChunk) && fh.numOOORead < 3 { 544 if fh.reader != nil { 545 fh.inode.logFuse("cutover to the parallel algorithm") 546 fh.reader.Close() 547 fh.reader = nil 548 } 549 550 err = fh.readAhead(uint64(offset), len(buf)) 551 if err == nil { 552 bytesRead, err = fh.readFromReadAhead(uint64(offset), buf) 553 return 554 } else { 555 // fall back to read serially 556 fh.inode.logFuse("not enough memory, fallback to serial read") 557 fh.seqReadAmount = 0 558 for _, b := range fh.buffers { 559 b.buf.Close() 560 } 561 fh.buffers = nil 562 } 563 } 564 565 bytesRead, err = fh.readFromStream(offset, buf) 566 567 return 568 } 569 570 func (fh *FileHandle) Release() { 571 // read buffers 572 for _, b := range fh.buffers { 573 b.buf.Close() 574 } 575 fh.buffers = nil 576 577 if fh.reader != nil { 578 fh.reader.Close() 579 } 580 581 // write buffers 582 if fh.poolHandle != nil { 583 if fh.buf != nil && fh.buf.buffers != nil { 584 if fh.lastWriteError == nil { 585 panic("buf not freed but error is nil") 586 } 587 588 fh.buf.Free() 589 // the other in-flight multipart PUT buffers will be 590 // freed when they finish/error out 591 } 592 } 593 594 fh.inode.mu.Lock() 595 defer fh.inode.mu.Unlock() 596 597 if atomic.AddInt32(&fh.inode.fileHandles, -1) == -1 { 598 panic(fh.inode.fileHandles) 599 } 600 } 601 602 func (fh *FileHandle) readFromStream(offset int64, buf []byte) (bytesRead int, err error) { 603 defer func() { 604 if fh.inode.fs.flags.DebugFuse { 605 fh.inode.logFuse("< readFromStream", bytesRead) 606 } 607 }() 608 609 if uint64(offset) >= fh.inode.Attributes.Size { 610 // nothing to read 611 return 612 } 613 614 if fh.reader == nil { 615 resp, err := fh.cloud.GetBlob(&GetBlobInput{ 616 Key: fh.key, 617 Start: uint64(offset), 618 }) 619 if err != nil { 620 return bytesRead, err 621 } 622 623 fh.reader = resp.Body 624 } 625 626 bytesRead, err = fh.reader.Read(buf) 627 if err != nil { 628 if err != io.EOF { 629 fh.inode.logFuse("< readFromStream error", bytesRead, err) 630 } 631 // always retry error on read 632 fh.reader.Close() 633 fh.reader = nil 634 err = nil 635 } 636 637 return 638 } 639 640 func (fh *FileHandle) flushSmallFile() (err error) { 641 buf := fh.buf 642 fh.buf = nil 643 644 if buf == nil { 645 buf = MBuf{}.Init(fh.poolHandle, 0, true) 646 } 647 648 defer buf.Free() 649 650 fs := fh.inode.fs 651 652 fs.replicators.Take(1, true) 653 defer fs.replicators.Return(1) 654 655 // we want to get key from inode because the file could have been renamed 656 _, key := fh.inode.cloud() 657 resp, err := fh.cloud.PutBlob(&PutBlobInput{ 658 Key: key, 659 Body: buf, 660 Size: PUInt64(uint64(buf.Len())), 661 ContentType: fs.flags.GetMimeType(*fh.inode.FullName()), 662 }) 663 if err != nil { 664 fh.lastWriteError = err 665 } else { 666 fh.updateFromFlush(resp.ETag, resp.LastModified, resp.StorageClass) 667 } 668 return 669 } 670 671 // LOCKS_EXCLUDED(fh.inode.mu) 672 func (fh *FileHandle) updateFromFlush(etag *string, lastModified *time.Time, storageClass *string) { 673 inode := fh.inode 674 inode.mu.Lock() 675 defer inode.mu.Unlock() 676 677 if etag != nil { 678 inode.s3Metadata["etag"] = []byte(*etag) 679 } 680 if storageClass != nil { 681 inode.s3Metadata["storage-class"] = []byte(*storageClass) 682 } 683 if fh.keepPageCache { 684 // if this write didn't update page cache, don't try 685 // to update these values so on next lookup, we would 686 // invalidate the cache. We want to do that because 687 // our cache could have been populated by subsequent 688 // reads 689 if lastModified != nil { 690 inode.Attributes.Mtime = *lastModified 691 } 692 inode.knownETag = etag 693 } 694 } 695 696 func (fh *FileHandle) resetToKnownSize() { 697 if fh.inode.KnownSize != nil { 698 fh.inode.Attributes.Size = *fh.inode.KnownSize 699 } else { 700 fh.inode.Attributes.Size = 0 701 fh.inode.Invalid = true 702 } 703 } 704 705 func (fh *FileHandle) FlushFile() (err error) { 706 fh.mu.Lock() 707 defer fh.mu.Unlock() 708 709 fh.inode.logFuse("FlushFile") 710 711 if !fh.dirty || fh.lastWriteError != nil { 712 if fh.lastWriteError != nil { 713 err = fh.lastWriteError 714 fh.resetToKnownSize() 715 } 716 return 717 } 718 719 if fh.inode.Parent == nil { 720 // the file is deleted 721 if fh.mpuId != nil { 722 go func() { 723 _, _ = fh.cloud.MultipartBlobAbort(fh.mpuId) 724 fh.mpuId = nil 725 }() 726 } 727 return 728 } 729 730 fs := fh.inode.fs 731 732 // abort mpu on error 733 defer func() { 734 if err != nil { 735 if fh.mpuId != nil { 736 go func() { 737 _, _ = fh.cloud.MultipartBlobAbort(fh.mpuId) 738 fh.mpuId = nil 739 }() 740 } 741 742 fh.resetToKnownSize() 743 } else { 744 if fh.dirty { 745 // don't unset this if we never actually flushed 746 size := fh.inode.Attributes.Size 747 fh.inode.KnownSize = &size 748 fh.inode.Invalid = false 749 } 750 fh.dirty = false 751 } 752 753 fh.writeInit = sync.Once{} 754 fh.nextWriteOffset = 0 755 fh.lastPartId = 0 756 }() 757 758 if fh.lastPartId == 0 { 759 return fh.flushSmallFile() 760 } 761 762 fh.mpuWG.Wait() 763 764 if fh.lastWriteError != nil { 765 return fh.lastWriteError 766 } 767 768 if fh.mpuId == nil { 769 return 770 } 771 772 nParts := fh.lastPartId 773 if fh.buf != nil { 774 // upload last part 775 nParts++ 776 err = fh.mpuPartNoSpawn(fh.buf, nParts, fh.nextWriteOffset, true) 777 if err != nil { 778 return 779 } 780 fh.buf = nil 781 } 782 783 resp, err := fh.cloud.MultipartBlobCommit(fh.mpuId) 784 if err != nil { 785 return 786 } 787 788 fh.updateFromFlush(resp.ETag, resp.LastModified, resp.StorageClass) 789 790 fh.mpuId = nil 791 792 // we want to get key from inode because the file could have been renamed 793 _, key := fh.inode.cloud() 794 if *fh.mpuName != key { 795 // the file was renamed 796 err = fh.inode.renameObject(fs, PUInt64(uint64(fh.nextWriteOffset)), *fh.mpuName, *fh.inode.FullName()) 797 } 798 799 return 800 }