github.com/gaukas/goofys100m@v0.24.0/internal/file.go

// Copyright 2015 - 2017 Ka-Hing Cheung
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package internal

import (
	"fmt"
	"io"
	"sync"
	"sync/atomic"
	"syscall"
	"time"

	"github.com/jacobsa/fuse"
	"github.com/jacobsa/fuse/fuseops"
)

type FileHandle struct {
	inode *Inode
	cloud StorageBackend
	key   string

	mpuName   *string
	dirty     bool
	writeInit sync.Once
	mpuWG     sync.WaitGroup

	mu              sync.Mutex
	mpuId           *MultipartBlobCommitInput
	nextWriteOffset int64
	lastPartId      uint32

	poolHandle *BufferPool
	buf        *MBuf

	lastWriteError error

	// read
	reader        io.ReadCloser
	readBufOffset int64

	// parallel read
	buffers           []*S3ReadBuffer
	existingReadahead int
	seqReadAmount     uint64
	numOOORead        uint64 // number of out-of-order reads
	// User space PID. All threads created by a process will have the same TGID,
	// but different PIDs[1].
	// This value can be nil if we fail to get the TGID from the PID[2].
	// [1] : https://godoc.org/github.com/shirou/gopsutil/process#Process.Tgid
	// [2] : https://github.com/shirou/gopsutil#process-class
	Tgid *int32

	keepPageCache bool // the same value we returned to OpenFile
}

const MAX_READAHEAD = uint32(400 * 1024 * 1024)
const READAHEAD_CHUNK = uint32(20 * 1024 * 1024)

// NewFileHandle returns a new file handle for the given `inode` triggered by a fuse
// operation with the given `opMetadata`
func NewFileHandle(inode *Inode, opMetadata fuseops.OpMetadata) *FileHandle {
	tgid, err := GetTgid(opMetadata.Pid)
	if err != nil {
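		// failing to resolve the TGID is not fatal: Tgid is
		// documented above as possibly nil, so we only log the
		// failure at debug level and carry on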
		log.Debugf(
			"Failed to retrieve tgid for the given pid. pid: %v err: %v inode id: %v",
			opMetadata.Pid, err, inode.Id)
	}
	fh := &FileHandle{inode: inode, Tgid: tgid}
	fh.cloud, fh.key = inode.cloud()
	return fh
}

// initWrite starts the multipart upload in the background, at most once
// per flush cycle
func (fh *FileHandle) initWrite() {
	fh.writeInit.Do(func() {
		fh.mpuWG.Add(1)
		go fh.initMPU()
	})
}

// initMPU begins the multipart upload and records either the upload id or
// the first error under fh.mu
func (fh *FileHandle) initMPU() {
	defer func() {
		fh.mpuWG.Done()
	}()

	fs := fh.inode.fs
	fh.mpuName = &fh.key

	resp, err := fh.cloud.MultipartBlobBegin(&MultipartBlobBeginInput{
		Key:         *fh.mpuName,
		ContentType: fs.flags.GetMimeType(*fh.mpuName),
	})

	fh.mu.Lock()
	defer fh.mu.Unlock()

	if err != nil {
		fh.lastWriteError = mapAwsError(err)
	} else {
		fh.mpuId = resp
	}
}

func (fh *FileHandle) mpuPartNoSpawn(buf *MBuf, part uint32, total int64, last bool) (err error) {
	fs := fh.inode.fs

	fs.replicators.Take(1, true)
	defer fs.replicators.Return(1)

	if part == 0 || part > 10000 {
		return fmt.Errorf("invalid part number: %v", part)
	}

	mpu := MultipartBlobAddInput{
		Commit:     fh.mpuId,
		PartNumber: part,
		Body:       buf,
		Size:       uint64(buf.Len()),
		Last:       last,
		Offset:     uint64(total - int64(buf.Len())),
	}

	defer func() {
		if mpu.Body != nil {
			bufferLog.Debugf("Free %T", buf)
			buf.Free()
		}
	}()

	_, err = fh.cloud.MultipartBlobAdd(&mpu)

	return
}

func (fh *FileHandle) mpuPart(buf *MBuf, part uint32, total int64) {
	defer func() {
		fh.mpuWG.Done()
	}()

	// maybe wait for CreateMultipartUpload
	if fh.mpuId == nil {
		fh.mpuWG.Wait()
		// initMPU might have errored
		if fh.mpuId == nil {
			return
		}
	}

	err := fh.mpuPartNoSpawn(buf, part, total, false)
	if err != nil {
		if fh.lastWriteError == nil {
			fh.lastWriteError = err
		}
	}
}

// waitForCreateMPU ensures the multipart upload has been started, dropping
// fh.mu while it waits so initMPU can record its result
func (fh *FileHandle) waitForCreateMPU() (err error) {
	if fh.mpuId == nil {
		fh.mu.Unlock()
		fh.initWrite()
		fh.mpuWG.Wait() // wait for initMPU
		fh.mu.Lock()

		if fh.lastWriteError != nil {
			return fh.lastWriteError
		}
	}

	return
}

// partSize returns the size of the next upload part. Parts grow from 5 MB
// to 25 MB to 125 MB as more of them are written, which keeps small files
// cheap while still fitting roughly 1 TB under the 10000-part limit
// enforced by mpuPartNoSpawn
func (fh *FileHandle) partSize() uint64 {
	var size uint64

	if fh.lastPartId < 1000 {
		size = 5 * 1024 * 1024
	} else if fh.lastPartId < 2000 {
		size = 25 * 1024 * 1024
	} else {
		size = 125 * 1024 * 1024
	}

	maxPartSize := fh.cloud.Capabilities().MaxMultipartSize
	if maxPartSize != 0 {
		size = MinUInt64(maxPartSize, size)
	}
	return size
}

func (fh *FileHandle) uploadCurrentBuf(parallel bool) (err error) {
	err = fh.waitForCreateMPU()
	if err != nil {
		return
	}

	fh.lastPartId++
	part := fh.lastPartId
	buf := fh.buf
	fh.buf = nil

	if parallel {
		fh.mpuWG.Add(1)
		go fh.mpuPart(buf, part, fh.nextWriteOffset)
	} else {
		err = fh.mpuPartNoSpawn(buf, part, fh.nextWriteOffset, false)
		if fh.lastWriteError == nil {
			fh.lastWriteError = err
		}
	}

	return
}

func (fh *FileHandle) WriteFile(offset int64, data []byte) (err error) {
	fh.inode.logFuse("WriteFile", offset, len(data))

	fh.mu.Lock()
	defer fh.mu.Unlock()

	if fh.lastWriteError != nil {
		fh.inode.mu.Lock()
		// our write failed, so next time we open we should not
		// use the page cache and will read from the cloud again
		fh.inode.invalidateCache = true
		fh.inode.mu.Unlock()
		return fh.lastWriteError
	}
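
	// goofys only supports sequential writes: each write must begin at
	// nextWriteOffset, i.e. exactly where the previous one ended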
	if offset != fh.nextWriteOffset {
		fh.inode.errFuse("WriteFile: only sequential writes supported", fh.nextWriteOffset, offset)
		fh.lastWriteError = syscall.ENOTSUP
		return fh.lastWriteError
	}

	if offset == 0 {
		fh.poolHandle = fh.inode.fs.bufferPool
		fh.dirty = true
		fh.inode.mu.Lock()
		// we are updating this file, set knownETag to nil so
		// on next lookup we won't think it's changed, to
		// always prefer to read back our own write. We set
		// this back to the ETag at flush time
		//
		// XXX this doesn't actually work, see the notes in
		// Goofys.OpenFile about KeepPageCache
		fh.inode.knownETag = nil
		fh.inode.invalidateCache = false
		fh.inode.mu.Unlock()
	}

	for {
		if fh.buf == nil {
			fh.buf = MBuf{}.Init(fh.poolHandle, fh.partSize(), true)
		}

		nCopied, _ := fh.buf.Write(data)
		fh.nextWriteOffset += int64(nCopied)

		if fh.buf.Full() {
			err = fh.uploadCurrentBuf(!fh.cloud.Capabilities().NoParallelMultipart)
			if err != nil {
				return
			}
		}

		if nCopied == len(data) {
			break
		}

		data = data[nCopied:]
	}

	fh.inode.Attributes.Size = uint64(fh.nextWriteOffset)
	fh.inode.Attributes.Mtime = time.Now()

	return
}

type S3ReadBuffer struct {
	s3          StorageBackend
	startOffset uint64
	nRetries    uint8
	mbuf        *MBuf

	offset uint64
	size   uint32
	buf    *Buffer
}

func (b S3ReadBuffer) Init(fh *FileHandle, offset uint64, size uint32) *S3ReadBuffer {
	b.s3 = fh.cloud
	b.offset = offset
	b.startOffset = offset
	b.size = size
	b.nRetries = 3

	b.mbuf = MBuf{}.Init(fh.poolHandle, uint64(size), false)
	if b.mbuf == nil {
		return nil
	}

	b.initBuffer(fh, offset, size)
	return &b
}

func (b *S3ReadBuffer) initBuffer(fh *FileHandle, offset uint64, size uint32) {
	getFunc := func() (io.ReadCloser, error) {
		resp, err := b.s3.GetBlob(&GetBlobInput{
			Key:   fh.key,
			Start: offset,
			Count: uint64(size),
		})
		if err != nil {
			return nil, err
		}

		return resp.Body, nil
	}

	if b.buf == nil {
		b.buf = Buffer{}.Init(b.mbuf, getFunc)
	} else {
		b.buf.ReInit(getFunc)
	}
}

func (b *S3ReadBuffer) Read(offset uint64, p []byte) (n int, err error) {
	if b.offset != offset {
		panic(fmt.Sprintf("not the right buffer, expecting %v got %v, %v left",
			b.offset, offset, b.size))
	}

	n, err = io.ReadFull(b.buf, p)
	if n != 0 && err == io.ErrUnexpectedEOF {
		err = nil
	}
	if n > 0 {
		if uint32(n) > b.size {
			panic(fmt.Sprintf("read more than available %v %v", n, b.size))
		}

		b.offset += uint64(n)
		b.size -= uint32(n)
	}
	if b.size == 0 && err != nil {
		// we've read everything; sometimes we may request more
		// bytes than are left in this chunk, so we could get an
		// error back, ex: http2: response body closed. this
		// doesn't tend to happen because our chunks are aligned
		// to 4K and also 128K (except for the last chunk, but
		// the kernel seems to request a smaller buffer for the
		// last chunk)
		err = nil
	}

	return
}

func (fh *FileHandle) readFromReadAhead(offset uint64, buf []byte) (bytesRead int, err error) {
	var nread int
	for len(fh.buffers) != 0 {
		readAheadBuf := fh.buffers[0]

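		// fh.buffers is kept in file order, so the head buffer always
		// covers the next byte range we expect to serve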
		nread, err = readAheadBuf.Read(offset+uint64(bytesRead), buf)
		bytesRead += nread
		if err != nil {
			if err == io.EOF && readAheadBuf.size != 0 {
				// in case we hit
				// https://github.com/kahing/goofys/issues/464
				// again, this will convert that into
				// an error
				fuseLog.Errorf("got EOF when data remains: %v", *fh.inode.FullName())
				err = io.ErrUnexpectedEOF
			} else if err != io.EOF && readAheadBuf.size > 0 {
				// we hit some other error while reading
				// from this part. If we can retry, do that
				if readAheadBuf.nRetries > 0 {
					readAheadBuf.nRetries -= 1
					readAheadBuf.initBuffer(fh, readAheadBuf.offset, readAheadBuf.size)
					// we unset the error and return,
					// so the upper layer will retry
					// this read
					err = nil
				}
			}
			return
		}

		if readAheadBuf.size == 0 {
			// we've exhausted the first buffer
			readAheadBuf.buf.Close()
			fh.buffers = fh.buffers[1:]
		}

		buf = buf[nread:]

		if len(buf) == 0 {
			// we've filled the user buffer
			return
		}
	}

	return
}

func (fh *FileHandle) readAhead(offset uint64, needAtLeast int) (err error) {
	existingReadahead := uint32(0)
	for _, b := range fh.buffers {
		existingReadahead += b.size
	}

	readAheadAmount := MAX_READAHEAD

	for readAheadAmount-existingReadahead >= READAHEAD_CHUNK {
		off := offset + uint64(existingReadahead)
		remaining := fh.inode.Attributes.Size - off

		// only read up to one readahead chunk each time
		size := MinUInt32(readAheadAmount-existingReadahead, READAHEAD_CHUNK)
		// but don't read past the end of the file
		size = uint32(MinUInt64(uint64(size), remaining))

		if size != 0 {
			fh.inode.logFuse("readahead", off, size, existingReadahead)

			readAheadBuf := S3ReadBuffer{}.Init(fh, off, size)
			if readAheadBuf != nil {
				fh.buffers = append(fh.buffers, readAheadBuf)
				existingReadahead += size
			} else {
				if existingReadahead != 0 {
					// don't do more readahead now, but don't fail;
					// cross our fingers that we will be able to
					// allocate the buffers later
					return nil
				} else {
					return syscall.ENOMEM
				}
			}
		}

		if size != READAHEAD_CHUNK {
			// that was the last remaining chunk to read ahead
			break
		}
	}

	return nil
}

func (fh *FileHandle) ReadFile(offset int64, buf []byte) (bytesRead int, err error) {
	fh.inode.logFuse("ReadFile", offset, len(buf))
	defer func() {
		fh.inode.logFuse("< ReadFile", bytesRead, err)

		if err == io.EOF {
			err = nil
		}
	}()

	fh.mu.Lock()
	defer fh.mu.Unlock()

	nwant := len(buf)
	var nread int

	for bytesRead < nwant && err == nil {
		nread, err = fh.readFile(offset+int64(bytesRead), buf[bytesRead:])
		if nread > 0 {
			bytesRead += nread
		}
	}

	return
}

func (fh *FileHandle) readFile(offset int64, buf []byte) (bytesRead int, err error) {
	defer func() {
		if bytesRead > 0 {
			fh.readBufOffset += int64(bytesRead)
			fh.seqReadAmount += uint64(bytesRead)
		}

		fh.inode.logFuse("< readFile", bytesRead, err)
	}()

	if uint64(offset) >= fh.inode.Attributes.Size {
		// nothing to read
		if fh.inode.Invalid {
			err = fuse.ENOENT
		} else {
			err = io.EOF
		}
		return
	}

	fs := fh.inode.fs

	if fh.poolHandle == nil {
		fh.poolHandle = fs.bufferPool
	}

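	// a read that does not resume where the previous one ended resets
	// the sequential-read bookkeeping and tears down any readahead;
	// after enough of these misses the parallel path below is skipped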
	if fh.readBufOffset != offset {
		// XXX out of order read, maybe disable prefetching
		fh.inode.logFuse("out of order read", offset, fh.readBufOffset)

		fh.readBufOffset = offset
		fh.seqReadAmount = 0
		if fh.reader != nil {
			fh.reader.Close()
			fh.reader = nil
		}

		if fh.buffers != nil {
			// we had readahead set up, so this access pattern
			// was misdetected as sequential
			fh.numOOORead++
		}

		for _, b := range fh.buffers {
			b.buf.Close()
		}
		fh.buffers = nil
	}

	if !fs.flags.Cheap && fh.seqReadAmount >= uint64(READAHEAD_CHUNK) && fh.numOOORead < 3 {
		if fh.reader != nil {
			fh.inode.logFuse("cutover to the parallel algorithm")
			fh.reader.Close()
			fh.reader = nil
		}

		err = fh.readAhead(uint64(offset), len(buf))
		if err == nil {
			bytesRead, err = fh.readFromReadAhead(uint64(offset), buf)
			return
		} else {
			// fall back to reading serially
			fh.inode.logFuse("not enough memory, fallback to serial read")
			fh.seqReadAmount = 0
			for _, b := range fh.buffers {
				b.buf.Close()
			}
			fh.buffers = nil
		}
	}

	bytesRead, err = fh.readFromStream(offset, buf)

	return
}

// Release closes any read buffers and stream held by this handle and
// decrements the inode's open handle count
func (fh *FileHandle) Release() {
	// read buffers
	for _, b := range fh.buffers {
		b.buf.Close()
	}
	fh.buffers = nil

	if fh.reader != nil {
		fh.reader.Close()
	}

	// write buffers
	if fh.poolHandle != nil {
		if fh.buf != nil && fh.buf.buffers != nil {
			if fh.lastWriteError == nil {
				panic("buf not freed but error is nil")
			}

			fh.buf.Free()
			// the other in-flight multipart PUT buffers will be
			// freed when they finish/error out
		}
	}

	fh.inode.mu.Lock()
	defer fh.inode.mu.Unlock()

	if atomic.AddInt32(&fh.inode.fileHandles, -1) == -1 {
		panic(fh.inode.fileHandles)
	}
}

func (fh *FileHandle) readFromStream(offset int64, buf []byte) (bytesRead int, err error) {
	defer func() {
		if fh.inode.fs.flags.DebugFuse {
			fh.inode.logFuse("< readFromStream", bytesRead)
		}
	}()

	if uint64(offset) >= fh.inode.Attributes.Size {
		// nothing to read
		return
	}

	if fh.reader == nil {
		resp, err := fh.cloud.GetBlob(&GetBlobInput{
			Key:   fh.key,
			Start: uint64(offset),
		})
		if err != nil {
			return bytesRead, err
		}

		fh.reader = resp.Body
	}

	bytesRead, err = fh.reader.Read(buf)
	if err != nil {
		if err != io.EOF {
			fh.inode.logFuse("< readFromStream error", bytesRead, err)
		}
		// always retry errors on read
		fh.reader.Close()
		fh.reader = nil
		err = nil
	}

	return
}

func (fh *FileHandle) flushSmallFile() (err error) {
	buf := fh.buf
	fh.buf = nil

	if buf == nil {
		buf = MBuf{}.Init(fh.poolHandle, 0, true)
	}

	defer buf.Free()

	fs := fh.inode.fs

	fs.replicators.Take(1, true)
	defer fs.replicators.Return(1)

	// we want to get the key from the inode because the file could have been renamed
	_, key := fh.inode.cloud()
	resp, err := fh.cloud.PutBlob(&PutBlobInput{
		Key:         key,
		Body:        buf,
		Size:        PUInt64(uint64(buf.Len())),
		ContentType: fs.flags.GetMimeType(*fh.inode.FullName()),
	})
	if err != nil {
		fh.lastWriteError = err
	} else {
		fh.updateFromFlush(resp.ETag, resp.LastModified, resp.StorageClass)
	}
	return
}

// LOCKS_EXCLUDED(fh.inode.mu)
func (fh *FileHandle) updateFromFlush(etag *string, lastModified *time.Time, storageClass *string) {
	inode := fh.inode
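	// take inode.mu so the metadata from the flush response (ETag,
	// storage class, mtime) is updated atomically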
	inode.mu.Lock()
	defer inode.mu.Unlock()

	if etag != nil {
		inode.s3Metadata["etag"] = []byte(*etag)
	}
	if storageClass != nil {
		inode.s3Metadata["storage-class"] = []byte(*storageClass)
	}
	if fh.keepPageCache {
		// if this write didn't update the page cache, don't
		// update these values; that way the next lookup will
		// invalidate the cache. We want that because our cache
		// could have been populated by subsequent reads
		if lastModified != nil {
			inode.Attributes.Mtime = *lastModified
		}
		inode.knownETag = etag
	}
}

func (fh *FileHandle) resetToKnownSize() {
	if fh.inode.KnownSize != nil {
		fh.inode.Attributes.Size = *fh.inode.KnownSize
	} else {
		fh.inode.Attributes.Size = 0
		fh.inode.Invalid = true
	}
}

// FlushFile uploads any buffered data: small files go through a single
// PutBlob, larger ones by finishing the multipart upload
func (fh *FileHandle) FlushFile() (err error) {
	fh.mu.Lock()
	defer fh.mu.Unlock()

	fh.inode.logFuse("FlushFile")

	if !fh.dirty || fh.lastWriteError != nil {
		if fh.lastWriteError != nil {
			err = fh.lastWriteError
			fh.resetToKnownSize()
		}
		return
	}

	if fh.inode.Parent == nil {
		// the file is deleted
		if fh.mpuId != nil {
			go func() {
				_, _ = fh.cloud.MultipartBlobAbort(fh.mpuId)
				fh.mpuId = nil
			}()
		}
		return
	}

	fs := fh.inode.fs

	// abort the mpu on error
	defer func() {
		if err != nil {
			if fh.mpuId != nil {
				go func() {
					_, _ = fh.cloud.MultipartBlobAbort(fh.mpuId)
					fh.mpuId = nil
				}()
			}

			fh.resetToKnownSize()
		} else {
			if fh.dirty {
				// don't unset this if we never actually flushed
				size := fh.inode.Attributes.Size
				fh.inode.KnownSize = &size
				fh.inode.Invalid = false
			}
			fh.dirty = false
		}

		fh.writeInit = sync.Once{}
		fh.nextWriteOffset = 0
		fh.lastPartId = 0
	}()

	if fh.lastPartId == 0 {
		return fh.flushSmallFile()
	}

	fh.mpuWG.Wait()

	if fh.lastWriteError != nil {
		return fh.lastWriteError
	}

	if fh.mpuId == nil {
		return
	}

	nParts := fh.lastPartId
	if fh.buf != nil {
		// upload the last part
		nParts++
		err = fh.mpuPartNoSpawn(fh.buf, nParts, fh.nextWriteOffset, true)
		if err != nil {
			return
		}
		fh.buf = nil
	}

	resp, err := fh.cloud.MultipartBlobCommit(fh.mpuId)
	if err != nil {
		return
	}

	fh.updateFromFlush(resp.ETag, resp.LastModified, resp.StorageClass)

	fh.mpuId = nil

	// we want to get the key from the inode because the file could have been renamed
	_, key := fh.inode.cloud()
	if *fh.mpuName != key {
		// the file was renamed
		err = fh.inode.renameObject(fs, PUInt64(uint64(fh.nextWriteOffset)), *fh.mpuName, *fh.inode.FullName())
	}

	return
}