github.com/adrianjagielak/goofys@v0.24.1-0.20230810095418-94919a5d2254/internal/file.go (about) 1 // Copyright 2015 - 2017 Ka-Hing Cheung 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package internal 16 17 import ( 18 "errors" 19 "fmt" 20 "io" 21 "sync" 22 "sync/atomic" 23 "syscall" 24 "time" 25 26 "github.com/jacobsa/fuse" 27 "github.com/jacobsa/fuse/fuseops" 28 ) 29 30 type FileHandle struct { 31 inode *Inode 32 cloud StorageBackend 33 key string 34 35 mpuName *string 36 dirty bool 37 writeInit sync.Once 38 mpuWG sync.WaitGroup 39 40 mu sync.Mutex 41 mpuId *MultipartBlobCommitInput 42 nextWriteOffset int64 43 lastPartId uint32 44 45 poolHandle *BufferPool 46 buf *MBuf 47 48 lastWriteError error 49 50 // read 51 reader io.ReadCloser 52 readBufOffset int64 53 54 // parallel read 55 buffers []*S3ReadBuffer 56 existingReadahead int 57 seqReadAmount uint64 58 numOOORead uint64 // number of out of order read 59 // User space PID. All threads created by a process will have the same TGID, 60 // but different PIDs[1]. 61 // This value can be nil if we fail to get TGID from PID[2]. 62 // [1] : https://godoc.org/github.com/shirou/gopsutil/process#Process.Tgid 63 // [2] : https://github.com/shirou/gopsutil#process-class 64 Tgid *int32 65 66 keepPageCache bool // the same value we returned to OpenFile 67 } 68 69 const MAX_READAHEAD = uint32(400 * 1024 * 1024) 70 const READAHEAD_CHUNK = uint32(20 * 1024 * 1024) 71 72 // NewFileHandle returns a new file handle for the given `inode` triggered by fuse 73 // operation with the given `opMetadata` 74 func NewFileHandle(inode *Inode, opMetadata fuseops.OpContext) *FileHandle { 75 tgid, err := GetTgid(opMetadata.Pid) 76 if err != nil { 77 log.Debugf( 78 "Failed to retrieve tgid for the given pid. pid: %v err: %v inode id: %v err: %v", 79 opMetadata.Pid, err, inode.Id, err) 80 } 81 fh := &FileHandle{inode: inode, Tgid: tgid} 82 fh.cloud, fh.key = inode.cloud() 83 return fh 84 } 85 86 func (fh *FileHandle) initWrite() { 87 fh.writeInit.Do(func() { 88 fh.mpuWG.Add(1) 89 go fh.initMPU() 90 }) 91 } 92 93 func (fh *FileHandle) initMPU() { 94 defer func() { 95 fh.mpuWG.Done() 96 }() 97 98 fs := fh.inode.fs 99 fh.mpuName = &fh.key 100 101 resp, err := fh.cloud.MultipartBlobBegin(&MultipartBlobBeginInput{ 102 Key: *fh.mpuName, 103 ContentType: fs.flags.GetMimeType(*fh.mpuName), 104 }) 105 106 fh.mu.Lock() 107 defer fh.mu.Unlock() 108 109 if err != nil { 110 fh.lastWriteError = mapAwsError(err) 111 } else { 112 fh.mpuId = resp 113 } 114 115 return 116 } 117 118 func (fh *FileHandle) mpuPartNoSpawn(buf *MBuf, part uint32, total int64, last bool) (err error) { 119 fs := fh.inode.fs 120 121 fs.replicators.Take(1, true) 122 defer fs.replicators.Return(1) 123 124 if part == 0 || part > 10000 { 125 return errors.New(fmt.Sprintf("invalid part number: %v", part)) 126 } 127 128 mpu := MultipartBlobAddInput{ 129 Commit: fh.mpuId, 130 PartNumber: part, 131 Body: buf, 132 Size: uint64(buf.Len()), 133 Last: last, 134 Offset: uint64(total - int64(buf.Len())), 135 } 136 137 defer func() { 138 if mpu.Body != nil { 139 bufferLog.Debugf("Free %T", buf) 140 buf.Free() 141 } 142 }() 143 144 _, err = fh.cloud.MultipartBlobAdd(&mpu) 145 146 return 147 } 148 149 func (fh *FileHandle) mpuPart(buf *MBuf, part uint32, total int64) { 150 defer func() { 151 fh.mpuWG.Done() 152 }() 153 154 // maybe wait for CreateMultipartUpload 155 if fh.mpuId == nil { 156 fh.mpuWG.Wait() 157 // initMPU might have errored 158 if fh.mpuId == nil { 159 return 160 } 161 } 162 163 err := fh.mpuPartNoSpawn(buf, part, total, false) 164 if err != nil { 165 if fh.lastWriteError == nil { 166 fh.lastWriteError = err 167 } 168 } 169 } 170 171 func (fh *FileHandle) waitForCreateMPU() (err error) { 172 if fh.mpuId == nil { 173 fh.mu.Unlock() 174 fh.initWrite() 175 fh.mpuWG.Wait() // wait for initMPU 176 fh.mu.Lock() 177 178 if fh.lastWriteError != nil { 179 return fh.lastWriteError 180 } 181 } 182 183 return 184 } 185 186 func (fh *FileHandle) partSize() uint64 { 187 var size uint64 188 189 if fh.lastPartId < 500 { 190 size = 5 * 1024 * 1024 191 } else if fh.lastPartId < 1000 { 192 size = 25 * 1024 * 1024 193 } else if fh.lastPartId < 2000 { 194 size = 125 * 1024 * 1024 195 } else { 196 size = 625 * 1024 * 1024 197 } 198 199 maxPartSize := fh.cloud.Capabilities().MaxMultipartSize 200 if maxPartSize != 0 { 201 size = MinUInt64(maxPartSize, size) 202 } 203 return size 204 } 205 206 func (fh *FileHandle) uploadCurrentBuf(parallel bool) (err error) { 207 err = fh.waitForCreateMPU() 208 if err != nil { 209 return 210 } 211 212 fh.lastPartId++ 213 part := fh.lastPartId 214 buf := fh.buf 215 fh.buf = nil 216 217 if parallel { 218 fh.mpuWG.Add(1) 219 go fh.mpuPart(buf, part, fh.nextWriteOffset) 220 } else { 221 err = fh.mpuPartNoSpawn(buf, part, fh.nextWriteOffset, false) 222 if fh.lastWriteError == nil { 223 fh.lastWriteError = err 224 } 225 } 226 227 return 228 } 229 230 func (fh *FileHandle) WriteFile(offset int64, data []byte) (err error) { 231 fh.inode.logFuse("WriteFile", offset, len(data)) 232 233 fh.mu.Lock() 234 defer fh.mu.Unlock() 235 236 if fh.lastWriteError != nil { 237 fh.inode.mu.Lock() 238 // our write failed, next time we open we should not 239 // use page cache so we will read from cloud again 240 fh.inode.invalidateCache = true 241 fh.inode.mu.Unlock() 242 return fh.lastWriteError 243 } 244 245 if offset != fh.nextWriteOffset { 246 fh.inode.errFuse("WriteFile: only sequential writes supported", fh.nextWriteOffset, offset) 247 fh.lastWriteError = syscall.ENOTSUP 248 return fh.lastWriteError 249 } 250 251 if offset == 0 { 252 fh.poolHandle = fh.inode.fs.bufferPool 253 fh.dirty = true 254 fh.inode.mu.Lock() 255 // we are updating this file, set knownETag to nil so 256 // on next lookup we won't think it's changed, to 257 // always prefer to read back our own write. We set 258 // this back to the ETag at flush time 259 // 260 // XXX this doesn't actually work, see the notes in 261 // Goofys.OpenFile about KeepPageCache 262 fh.inode.knownETag = nil 263 fh.inode.invalidateCache = false 264 fh.inode.mu.Unlock() 265 } 266 267 for { 268 if fh.buf == nil { 269 fh.buf = MBuf{}.Init(fh.poolHandle, fh.partSize(), true) 270 } 271 272 nCopied, _ := fh.buf.Write(data) 273 fh.nextWriteOffset += int64(nCopied) 274 275 if fh.buf.Full() { 276 err = fh.uploadCurrentBuf(!fh.cloud.Capabilities().NoParallelMultipart) 277 if err != nil { 278 return 279 } 280 } 281 282 if nCopied == len(data) { 283 break 284 } 285 286 data = data[nCopied:] 287 } 288 289 fh.inode.Attributes.Size = uint64(fh.nextWriteOffset) 290 fh.inode.Attributes.Mtime = time.Now() 291 292 return 293 } 294 295 type S3ReadBuffer struct { 296 s3 StorageBackend 297 startOffset uint64 298 nRetries uint8 299 mbuf *MBuf 300 301 offset uint64 302 size uint32 303 buf *Buffer 304 } 305 306 func (b S3ReadBuffer) Init(fh *FileHandle, offset uint64, size uint32) *S3ReadBuffer { 307 b.s3 = fh.cloud 308 b.offset = offset 309 b.startOffset = offset 310 b.size = size 311 b.nRetries = 3 312 313 b.mbuf = MBuf{}.Init(fh.poolHandle, uint64(size), false) 314 if b.mbuf == nil { 315 return nil 316 } 317 318 b.initBuffer(fh, offset, size) 319 return &b 320 } 321 322 func (b *S3ReadBuffer) initBuffer(fh *FileHandle, offset uint64, size uint32) { 323 getFunc := func() (io.ReadCloser, error) { 324 resp, err := b.s3.GetBlob(&GetBlobInput{ 325 Key: fh.key, 326 Start: offset, 327 Count: uint64(size), 328 }) 329 if err != nil { 330 return nil, err 331 } 332 333 return resp.Body, nil 334 } 335 336 if b.buf == nil { 337 b.buf = Buffer{}.Init(b.mbuf, getFunc) 338 } else { 339 b.buf.ReInit(getFunc) 340 } 341 } 342 343 func (b *S3ReadBuffer) Read(offset uint64, p []byte) (n int, err error) { 344 if b.offset == offset { 345 n, err = io.ReadFull(b.buf, p) 346 if n != 0 && err == io.ErrUnexpectedEOF { 347 err = nil 348 } 349 if n > 0 { 350 if uint32(n) > b.size { 351 panic(fmt.Sprintf("read more than available %v %v", n, b.size)) 352 } 353 354 b.offset += uint64(n) 355 b.size -= uint32(n) 356 } 357 if b.size == 0 && err != nil { 358 // we've read everything, sometimes we may 359 // request for more bytes then there's left in 360 // this chunk so we could get an error back, 361 // ex: http2: response body closed this 362 // doesn't tend to happen because our chunks 363 // are aligned to 4K and also 128K (except for 364 // the last chunk, but seems kernel requests 365 // for a smaller buffer for the last chunk) 366 err = nil 367 } 368 369 return 370 } else { 371 panic(fmt.Sprintf("not the right buffer, expecting %v got %v, %v left", b.offset, offset, b.size)) 372 err = errors.New(fmt.Sprintf("not the right buffer, expecting %v got %v", b.offset, offset)) 373 return 374 } 375 } 376 377 func (fh *FileHandle) readFromReadAhead(offset uint64, buf []byte) (bytesRead int, err error) { 378 var nread int 379 for len(fh.buffers) != 0 { 380 readAheadBuf := fh.buffers[0] 381 382 nread, err = readAheadBuf.Read(offset+uint64(bytesRead), buf) 383 bytesRead += nread 384 if err != nil { 385 if err == io.EOF && readAheadBuf.size != 0 { 386 // in case we hit 387 // https://github.com/kahing/goofys/issues/464 388 // again, this will convert that into 389 // an error 390 fuseLog.Errorf("got EOF when data remains: %v", *fh.inode.FullName()) 391 err = io.ErrUnexpectedEOF 392 } else if err != io.EOF && readAheadBuf.size > 0 { 393 // we hit some other errors when 394 // reading from this part. If we can 395 // retry, do that 396 if readAheadBuf.nRetries > 0 { 397 readAheadBuf.nRetries -= 1 398 readAheadBuf.initBuffer(fh, readAheadBuf.offset, readAheadBuf.size) 399 // we unset error and return, 400 // so upper layer will retry 401 // this read 402 err = nil 403 } 404 } 405 return 406 } 407 408 if readAheadBuf.size == 0 { 409 // we've exhausted the first buffer 410 readAheadBuf.buf.Close() 411 fh.buffers = fh.buffers[1:] 412 } 413 414 buf = buf[nread:] 415 416 if len(buf) == 0 { 417 // we've filled the user buffer 418 return 419 } 420 } 421 422 return 423 } 424 425 func (fh *FileHandle) readAhead(offset uint64, needAtLeast int) (err error) { 426 existingReadahead := uint32(0) 427 for _, b := range fh.buffers { 428 existingReadahead += b.size 429 } 430 431 readAheadAmount := MAX_READAHEAD 432 433 for readAheadAmount-existingReadahead >= READAHEAD_CHUNK { 434 off := offset + uint64(existingReadahead) 435 remaining := fh.inode.Attributes.Size - off 436 437 // only read up to readahead chunk each time 438 size := MinUInt32(readAheadAmount-existingReadahead, READAHEAD_CHUNK) 439 // but don't read past the file 440 size = uint32(MinUInt64(uint64(size), remaining)) 441 442 if size != 0 { 443 fh.inode.logFuse("readahead", off, size, existingReadahead) 444 445 readAheadBuf := S3ReadBuffer{}.Init(fh, off, size) 446 if readAheadBuf != nil { 447 fh.buffers = append(fh.buffers, readAheadBuf) 448 existingReadahead += size 449 } else { 450 if existingReadahead != 0 { 451 // don't do more readahead now, but don't fail, cross our 452 // fingers that we will be able to allocate the buffers 453 // later 454 return nil 455 } else { 456 return syscall.ENOMEM 457 } 458 } 459 } 460 461 if size != READAHEAD_CHUNK { 462 // that was the last remaining chunk to readahead 463 break 464 } 465 } 466 467 return nil 468 } 469 470 func (fh *FileHandle) ReadFile(offset int64, buf []byte) (bytesRead int, err error) { 471 fh.inode.logFuse("ReadFile", offset, len(buf)) 472 defer func() { 473 fh.inode.logFuse("< ReadFile", bytesRead, err) 474 475 if err != nil { 476 if err == io.EOF { 477 err = nil 478 } 479 } 480 }() 481 482 fh.mu.Lock() 483 defer fh.mu.Unlock() 484 485 nwant := len(buf) 486 var nread int 487 488 for bytesRead < nwant && err == nil { 489 nread, err = fh.readFile(offset+int64(bytesRead), buf[bytesRead:]) 490 if nread > 0 { 491 bytesRead += nread 492 } 493 } 494 495 return 496 } 497 498 func (fh *FileHandle) readFile(offset int64, buf []byte) (bytesRead int, err error) { 499 defer func() { 500 if bytesRead > 0 { 501 fh.readBufOffset += int64(bytesRead) 502 fh.seqReadAmount += uint64(bytesRead) 503 } 504 505 fh.inode.logFuse("< readFile", bytesRead, err) 506 }() 507 508 if uint64(offset) >= fh.inode.Attributes.Size { 509 // nothing to read 510 if fh.inode.Invalid { 511 err = fuse.ENOENT 512 } else if fh.inode.KnownSize == nil { 513 err = io.EOF 514 } else { 515 err = io.EOF 516 } 517 return 518 } 519 520 fs := fh.inode.fs 521 522 if fh.poolHandle == nil { 523 fh.poolHandle = fs.bufferPool 524 } 525 526 if fh.readBufOffset != offset { 527 // XXX out of order read, maybe disable prefetching 528 fh.inode.logFuse("out of order read", offset, fh.readBufOffset) 529 530 fh.readBufOffset = offset 531 fh.seqReadAmount = 0 532 if fh.reader != nil { 533 fh.reader.Close() 534 fh.reader = nil 535 } 536 537 if fh.buffers != nil { 538 // we misdetected 539 fh.numOOORead++ 540 } 541 542 for _, b := range fh.buffers { 543 b.buf.Close() 544 } 545 fh.buffers = nil 546 } 547 548 if !fs.flags.Cheap && fh.seqReadAmount >= uint64(READAHEAD_CHUNK) && fh.numOOORead < 3 { 549 if fh.reader != nil { 550 fh.inode.logFuse("cutover to the parallel algorithm") 551 fh.reader.Close() 552 fh.reader = nil 553 } 554 555 err = fh.readAhead(uint64(offset), len(buf)) 556 if err == nil { 557 bytesRead, err = fh.readFromReadAhead(uint64(offset), buf) 558 return 559 } else { 560 // fall back to read serially 561 fh.inode.logFuse("not enough memory, fallback to serial read") 562 fh.seqReadAmount = 0 563 for _, b := range fh.buffers { 564 b.buf.Close() 565 } 566 fh.buffers = nil 567 } 568 } 569 570 bytesRead, err = fh.readFromStream(offset, buf) 571 572 return 573 } 574 575 func (fh *FileHandle) Release() { 576 // read buffers 577 for _, b := range fh.buffers { 578 b.buf.Close() 579 } 580 fh.buffers = nil 581 582 if fh.reader != nil { 583 fh.reader.Close() 584 } 585 586 // write buffers 587 if fh.poolHandle != nil { 588 if fh.buf != nil && fh.buf.buffers != nil { 589 if fh.lastWriteError == nil { 590 panic("buf not freed but error is nil") 591 } 592 593 fh.buf.Free() 594 // the other in-flight multipart PUT buffers will be 595 // freed when they finish/error out 596 } 597 } 598 599 fh.inode.mu.Lock() 600 defer fh.inode.mu.Unlock() 601 602 if atomic.AddInt32(&fh.inode.fileHandles, -1) == -1 { 603 panic(fh.inode.fileHandles) 604 } 605 } 606 607 func (fh *FileHandle) readFromStream(offset int64, buf []byte) (bytesRead int, err error) { 608 defer func() { 609 if fh.inode.fs.flags.DebugFuse { 610 fh.inode.logFuse("< readFromStream", bytesRead) 611 } 612 }() 613 614 if uint64(offset) >= fh.inode.Attributes.Size { 615 // nothing to read 616 return 617 } 618 619 if fh.reader == nil { 620 resp, err := fh.cloud.GetBlob(&GetBlobInput{ 621 Key: fh.key, 622 Start: uint64(offset), 623 }) 624 if err != nil { 625 return bytesRead, err 626 } 627 628 fh.reader = resp.Body 629 } 630 631 bytesRead, err = fh.reader.Read(buf) 632 if err != nil { 633 if err != io.EOF { 634 fh.inode.logFuse("< readFromStream error", bytesRead, err) 635 } 636 // always retry error on read 637 fh.reader.Close() 638 fh.reader = nil 639 err = nil 640 } 641 642 return 643 } 644 645 func (fh *FileHandle) flushSmallFile() (err error) { 646 buf := fh.buf 647 fh.buf = nil 648 649 if buf == nil { 650 buf = MBuf{}.Init(fh.poolHandle, 0, true) 651 } 652 653 defer buf.Free() 654 655 fs := fh.inode.fs 656 657 fs.replicators.Take(1, true) 658 defer fs.replicators.Return(1) 659 660 // we want to get key from inode because the file could have been renamed 661 _, key := fh.inode.cloud() 662 resp, err := fh.cloud.PutBlob(&PutBlobInput{ 663 Key: key, 664 Body: buf, 665 Size: PUInt64(uint64(buf.Len())), 666 ContentType: fs.flags.GetMimeType(*fh.inode.FullName()), 667 }) 668 if err != nil { 669 fh.lastWriteError = err 670 } else { 671 fh.updateFromFlush(resp.ETag, resp.LastModified, resp.StorageClass) 672 } 673 return 674 } 675 676 // LOCKS_EXCLUDED(fh.inode.mu) 677 func (fh *FileHandle) updateFromFlush(etag *string, lastModified *time.Time, storageClass *string) { 678 inode := fh.inode 679 inode.mu.Lock() 680 defer inode.mu.Unlock() 681 682 if etag != nil { 683 inode.s3Metadata["etag"] = []byte(*etag) 684 } 685 if storageClass != nil { 686 inode.s3Metadata["storage-class"] = []byte(*storageClass) 687 } 688 if fh.keepPageCache { 689 // if this write didn't update page cache, don't try 690 // to update these values so on next lookup, we would 691 // invalidate the cache. We want to do that because 692 // our cache could have been populated by subsequent 693 // reads 694 if lastModified != nil { 695 inode.Attributes.Mtime = *lastModified 696 } 697 inode.knownETag = etag 698 } 699 } 700 701 func (fh *FileHandle) resetToKnownSize() { 702 if fh.inode.KnownSize != nil { 703 fh.inode.Attributes.Size = *fh.inode.KnownSize 704 } else { 705 fh.inode.Attributes.Size = 0 706 fh.inode.Invalid = true 707 } 708 } 709 710 func (fh *FileHandle) FlushFile() (err error) { 711 fh.mu.Lock() 712 defer fh.mu.Unlock() 713 714 fh.inode.logFuse("FlushFile") 715 716 if !fh.dirty || fh.lastWriteError != nil { 717 if fh.lastWriteError != nil { 718 err = fh.lastWriteError 719 fh.resetToKnownSize() 720 } 721 return 722 } 723 724 if fh.inode.Parent == nil { 725 // the file is deleted 726 if fh.mpuId != nil { 727 go func() { 728 _, _ = fh.cloud.MultipartBlobAbort(fh.mpuId) 729 fh.mpuId = nil 730 }() 731 } 732 return 733 } 734 735 fs := fh.inode.fs 736 737 // abort mpu on error 738 defer func() { 739 if err != nil { 740 if fh.mpuId != nil { 741 go func() { 742 _, _ = fh.cloud.MultipartBlobAbort(fh.mpuId) 743 fh.mpuId = nil 744 }() 745 } 746 747 fh.resetToKnownSize() 748 } else { 749 if fh.dirty { 750 // don't unset this if we never actually flushed 751 size := fh.inode.Attributes.Size 752 fh.inode.KnownSize = &size 753 fh.inode.Invalid = false 754 } 755 fh.dirty = false 756 } 757 758 fh.writeInit = sync.Once{} 759 fh.nextWriteOffset = 0 760 fh.lastPartId = 0 761 }() 762 763 if fh.lastPartId == 0 { 764 return fh.flushSmallFile() 765 } 766 767 fh.mpuWG.Wait() 768 769 if fh.lastWriteError != nil { 770 return fh.lastWriteError 771 } 772 773 if fh.mpuId == nil { 774 return 775 } 776 777 nParts := fh.lastPartId 778 if fh.buf != nil { 779 // upload last part 780 nParts++ 781 err = fh.mpuPartNoSpawn(fh.buf, nParts, fh.nextWriteOffset, true) 782 if err != nil { 783 return 784 } 785 fh.buf = nil 786 } 787 788 resp, err := fh.cloud.MultipartBlobCommit(fh.mpuId) 789 if err != nil { 790 return 791 } 792 793 fh.updateFromFlush(resp.ETag, resp.LastModified, resp.StorageClass) 794 795 fh.mpuId = nil 796 797 // we want to get key from inode because the file could have been renamed 798 _, key := fh.inode.cloud() 799 if *fh.mpuName != key { 800 // the file was renamed 801 err = fh.inode.renameObject(fs, PUInt64(uint64(fh.nextWriteOffset)), *fh.mpuName, *fh.inode.FullName()) 802 } 803 804 return 805 }