github.com/StarfishStorage/goofys@v0.23.2-0.20200415030923-535558486b34/internal/file.go (about) 1 // Copyright 2015 - 2017 Ka-Hing Cheung 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package internal 16 17 import ( 18 "errors" 19 "fmt" 20 "io" 21 "sync" 22 "sync/atomic" 23 "syscall" 24 25 "github.com/jacobsa/fuse" 26 "github.com/jacobsa/fuse/fuseops" 27 ) 28 29 type FileHandle struct { 30 inode *Inode 31 cloud StorageBackend 32 key string 33 34 mpuName *string 35 dirty bool 36 writeInit sync.Once 37 mpuWG sync.WaitGroup 38 39 mu sync.Mutex 40 mpuId *MultipartBlobCommitInput 41 nextWriteOffset int64 42 lastPartId uint32 43 44 poolHandle *BufferPool 45 buf *MBuf 46 47 lastWriteError error 48 49 // read 50 reader io.ReadCloser 51 readBufOffset int64 52 53 // parallel read 54 buffers []*S3ReadBuffer 55 existingReadahead int 56 seqReadAmount uint64 57 numOOORead uint64 // number of out of order read 58 // User space PID. All threads created by a process will have the same TGID, 59 // but different PIDs[1]. 60 // This value can be nil if we fail to get TGID from PID[2]. 61 // [1] : https://godoc.org/github.com/shirou/gopsutil/process#Process.Tgid 62 // [2] : https://github.com/shirou/gopsutil#process-class 63 Tgid *int32 64 } 65 66 const MAX_READAHEAD = uint32(400 * 1024 * 1024) 67 const READAHEAD_CHUNK = uint32(20 * 1024 * 1024) 68 69 // NewFileHandle returns a new file handle for the given `inode` triggered by fuse 70 // operation with the given `opMetadata` 71 func NewFileHandle(inode *Inode, opMetadata fuseops.OpMetadata) *FileHandle { 72 tgid, err := GetTgid(opMetadata.Pid) 73 if err != nil { 74 log.Debugf( 75 "Failed to retrieve tgid for the given pid. pid: %v err: %v inode id: %v err: %v", 76 opMetadata.Pid, err, inode.Id, err) 77 } 78 fh := &FileHandle{inode: inode, Tgid: tgid} 79 fh.cloud, fh.key = inode.cloud() 80 return fh 81 } 82 83 func (fh *FileHandle) initWrite() { 84 fh.writeInit.Do(func() { 85 fh.mpuWG.Add(1) 86 go fh.initMPU() 87 }) 88 } 89 90 func (fh *FileHandle) initMPU() { 91 defer func() { 92 fh.mpuWG.Done() 93 }() 94 95 fs := fh.inode.fs 96 fh.mpuName = &fh.key 97 98 resp, err := fh.cloud.MultipartBlobBegin(&MultipartBlobBeginInput{ 99 Key: *fh.mpuName, 100 ContentType: fs.flags.GetMimeType(*fh.mpuName), 101 }) 102 103 fh.mu.Lock() 104 defer fh.mu.Unlock() 105 106 if err != nil { 107 fh.lastWriteError = mapAwsError(err) 108 } else { 109 fh.mpuId = resp 110 } 111 112 return 113 } 114 115 func (fh *FileHandle) mpuPartNoSpawn(buf *MBuf, part uint32, total int64, last bool) (err error) { 116 fs := fh.inode.fs 117 118 fs.replicators.Take(1, true) 119 defer fs.replicators.Return(1) 120 121 if part == 0 || part > 10000 { 122 return errors.New(fmt.Sprintf("invalid part number: %v", part)) 123 } 124 125 mpu := MultipartBlobAddInput{ 126 Commit: fh.mpuId, 127 PartNumber: part, 128 Body: buf, 129 Size: uint64(buf.Len()), 130 Last: last, 131 Offset: uint64(total - int64(buf.Len())), 132 } 133 134 defer func() { 135 if mpu.Body != nil { 136 bufferLog.Debugf("Free %T", buf) 137 buf.Free() 138 } 139 }() 140 141 _, err = fh.cloud.MultipartBlobAdd(&mpu) 142 143 return 144 } 145 146 func (fh *FileHandle) mpuPart(buf *MBuf, part uint32, total int64) { 147 defer func() { 148 fh.mpuWG.Done() 149 }() 150 151 // maybe wait for CreateMultipartUpload 152 if fh.mpuId == nil { 153 fh.mpuWG.Wait() 154 // initMPU might have errored 155 if fh.mpuId == nil { 156 return 157 } 158 } 159 160 err := fh.mpuPartNoSpawn(buf, part, total, false) 161 if err != nil { 162 if fh.lastWriteError == nil { 163 fh.lastWriteError = err 164 } 165 } 166 } 167 168 func (fh *FileHandle) waitForCreateMPU() (err error) { 169 if fh.mpuId == nil { 170 fh.mu.Unlock() 171 fh.initWrite() 172 fh.mpuWG.Wait() // wait for initMPU 173 fh.mu.Lock() 174 175 if fh.lastWriteError != nil { 176 return fh.lastWriteError 177 } 178 } 179 180 return 181 } 182 183 func (fh *FileHandle) partSize() uint64 { 184 var size uint64 185 186 if fh.lastPartId < 1000 { 187 size = 5 * 1024 * 1024 188 } else if fh.lastPartId < 2000 { 189 size = 25 * 1024 * 1024 190 } else { 191 size = 125 * 1024 * 1024 192 } 193 194 maxPartSize := fh.cloud.Capabilities().MaxMultipartSize 195 if maxPartSize != 0 { 196 size = MinUInt64(maxPartSize, size) 197 } 198 return size 199 } 200 201 func (fh *FileHandle) uploadCurrentBuf(parallel bool) (err error) { 202 err = fh.waitForCreateMPU() 203 if err != nil { 204 return 205 } 206 207 fh.lastPartId++ 208 part := fh.lastPartId 209 buf := fh.buf 210 fh.buf = nil 211 212 if parallel { 213 fh.mpuWG.Add(1) 214 go fh.mpuPart(buf, part, fh.nextWriteOffset) 215 } else { 216 err = fh.mpuPartNoSpawn(buf, part, fh.nextWriteOffset, false) 217 if fh.lastWriteError == nil { 218 fh.lastWriteError = err 219 } 220 } 221 222 return 223 } 224 225 func (fh *FileHandle) WriteFile(offset int64, data []byte) (err error) { 226 fh.inode.logFuse("WriteFile", offset, len(data)) 227 228 fh.mu.Lock() 229 defer fh.mu.Unlock() 230 231 if fh.lastWriteError != nil { 232 fh.inode.mu.Lock() 233 // our write failed, next time we open we should not 234 // use page cache so we will read from cloud again 235 fh.inode.invalidateCache = true 236 fh.inode.mu.Unlock() 237 return fh.lastWriteError 238 } 239 240 if offset != fh.nextWriteOffset { 241 fh.inode.errFuse("WriteFile: only sequential writes supported", fh.nextWriteOffset, offset) 242 fh.lastWriteError = syscall.ENOTSUP 243 return fh.lastWriteError 244 } 245 246 if offset == 0 { 247 fh.poolHandle = fh.inode.fs.bufferPool 248 fh.dirty = true 249 fh.inode.mu.Lock() 250 // we are updating this file, always prefer to read 251 // back our own write. XXX this doesn't actually work, 252 // see the notes in Goofys.OpenFile about 253 // KeepPageCache 254 fh.inode.knownETag = nil 255 fh.inode.mu.Unlock() 256 } 257 258 for { 259 if fh.buf == nil { 260 fh.buf = MBuf{}.Init(fh.poolHandle, fh.partSize(), true) 261 } 262 263 nCopied, _ := fh.buf.Write(data) 264 fh.nextWriteOffset += int64(nCopied) 265 266 if fh.buf.Full() { 267 err = fh.uploadCurrentBuf(!fh.cloud.Capabilities().NoParallelMultipart) 268 if err != nil { 269 return 270 } 271 } 272 273 if nCopied == len(data) { 274 break 275 } 276 277 data = data[nCopied:] 278 } 279 280 fh.inode.Attributes.Size = uint64(fh.nextWriteOffset) 281 282 return 283 } 284 285 type S3ReadBuffer struct { 286 s3 StorageBackend 287 startOffset uint64 288 nRetries uint8 289 mbuf *MBuf 290 291 offset uint64 292 size uint32 293 buf *Buffer 294 } 295 296 func (b S3ReadBuffer) Init(fh *FileHandle, offset uint64, size uint32) *S3ReadBuffer { 297 b.s3 = fh.cloud 298 b.offset = offset 299 b.startOffset = offset 300 b.size = size 301 b.nRetries = 3 302 303 b.mbuf = MBuf{}.Init(fh.poolHandle, uint64(size), false) 304 if b.mbuf == nil { 305 return nil 306 } 307 308 b.initBuffer(fh, offset, size) 309 return &b 310 } 311 312 func (b *S3ReadBuffer) initBuffer(fh *FileHandle, offset uint64, size uint32) { 313 getFunc := func() (io.ReadCloser, error) { 314 resp, err := b.s3.GetBlob(&GetBlobInput{ 315 Key: fh.key, 316 Start: offset, 317 Count: uint64(size), 318 }) 319 if err != nil { 320 return nil, err 321 } 322 323 return resp.Body, nil 324 } 325 326 if b.buf == nil { 327 b.buf = Buffer{}.Init(b.mbuf, getFunc) 328 } else { 329 b.buf.ReInit(getFunc) 330 } 331 } 332 333 func (b *S3ReadBuffer) Read(offset uint64, p []byte) (n int, err error) { 334 if b.offset == offset { 335 n, err = io.ReadFull(b.buf, p) 336 if n != 0 && err == io.ErrUnexpectedEOF { 337 err = nil 338 } 339 if n > 0 { 340 if uint32(n) > b.size { 341 panic(fmt.Sprintf("read more than available %v %v", n, b.size)) 342 } 343 344 b.offset += uint64(n) 345 b.size -= uint32(n) 346 } 347 if b.size == 0 && err != nil { 348 // we've read everything, sometimes we may 349 // request for more bytes then there's left in 350 // this chunk so we could get an error back, 351 // ex: http2: response body closed this 352 // doesn't tend to happen because our chunks 353 // are aligned to 4K and also 128K (except for 354 // the last chunk, but seems kernel requests 355 // for a smaller buffer for the last chunk) 356 err = nil 357 } 358 359 return 360 } else { 361 panic(fmt.Sprintf("not the right buffer, expecting %v got %v, %v left", b.offset, offset, b.size)) 362 err = errors.New(fmt.Sprintf("not the right buffer, expecting %v got %v", b.offset, offset)) 363 return 364 } 365 } 366 367 func (fh *FileHandle) readFromReadAhead(offset uint64, buf []byte) (bytesRead int, err error) { 368 var nread int 369 for len(fh.buffers) != 0 { 370 readAheadBuf := fh.buffers[0] 371 372 nread, err = readAheadBuf.Read(offset+uint64(bytesRead), buf) 373 bytesRead += nread 374 if err != nil { 375 if err == io.EOF && readAheadBuf.size != 0 { 376 // in case we hit 377 // https://github.com/kahing/goofys/issues/464 378 // again, this will convert that into 379 // an error 380 fuseLog.Errorf("got EOF when data remains: %v", *fh.inode.FullName()) 381 err = io.ErrUnexpectedEOF 382 } else if err != io.EOF && readAheadBuf.size > 0 { 383 // we hit some other errors when 384 // reading from this part. If we can 385 // retry, do that 386 if readAheadBuf.nRetries > 0 { 387 readAheadBuf.nRetries -= 1 388 readAheadBuf.initBuffer(fh, readAheadBuf.offset, readAheadBuf.size) 389 // we unset error and return, 390 // so upper layer will retry 391 // this read 392 err = nil 393 } 394 } 395 return 396 } 397 398 if readAheadBuf.size == 0 { 399 // we've exhausted the first buffer 400 readAheadBuf.buf.Close() 401 fh.buffers = fh.buffers[1:] 402 } 403 404 buf = buf[nread:] 405 406 if len(buf) == 0 { 407 // we've filled the user buffer 408 return 409 } 410 } 411 412 return 413 } 414 415 func (fh *FileHandle) readAhead(offset uint64, needAtLeast int) (err error) { 416 existingReadahead := uint32(0) 417 for _, b := range fh.buffers { 418 existingReadahead += b.size 419 } 420 421 readAheadAmount := MAX_READAHEAD 422 423 for readAheadAmount-existingReadahead >= READAHEAD_CHUNK { 424 off := offset + uint64(existingReadahead) 425 remaining := fh.inode.Attributes.Size - off 426 427 // only read up to readahead chunk each time 428 size := MinUInt32(readAheadAmount-existingReadahead, READAHEAD_CHUNK) 429 // but don't read past the file 430 size = uint32(MinUInt64(uint64(size), remaining)) 431 432 if size != 0 { 433 fh.inode.logFuse("readahead", off, size, existingReadahead) 434 435 readAheadBuf := S3ReadBuffer{}.Init(fh, off, size) 436 if readAheadBuf != nil { 437 fh.buffers = append(fh.buffers, readAheadBuf) 438 existingReadahead += size 439 } else { 440 if existingReadahead != 0 { 441 // don't do more readahead now, but don't fail, cross our 442 // fingers that we will be able to allocate the buffers 443 // later 444 return nil 445 } else { 446 return syscall.ENOMEM 447 } 448 } 449 } 450 451 if size != READAHEAD_CHUNK { 452 // that was the last remaining chunk to readahead 453 break 454 } 455 } 456 457 return nil 458 } 459 460 func (fh *FileHandle) ReadFile(offset int64, buf []byte) (bytesRead int, err error) { 461 fh.inode.logFuse("ReadFile", offset, len(buf)) 462 defer func() { 463 fh.inode.logFuse("< ReadFile", bytesRead, err) 464 465 if err != nil { 466 if err == io.EOF { 467 err = nil 468 } 469 } 470 }() 471 472 fh.mu.Lock() 473 defer fh.mu.Unlock() 474 475 nwant := len(buf) 476 var nread int 477 478 for bytesRead < nwant && err == nil { 479 nread, err = fh.readFile(offset+int64(bytesRead), buf[bytesRead:]) 480 if nread > 0 { 481 bytesRead += nread 482 } 483 } 484 485 return 486 } 487 488 func (fh *FileHandle) readFile(offset int64, buf []byte) (bytesRead int, err error) { 489 defer func() { 490 if bytesRead > 0 { 491 fh.readBufOffset += int64(bytesRead) 492 fh.seqReadAmount += uint64(bytesRead) 493 } 494 495 fh.inode.logFuse("< readFile", bytesRead, err) 496 }() 497 498 if uint64(offset) >= fh.inode.Attributes.Size { 499 // nothing to read 500 if fh.inode.Invalid { 501 err = fuse.ENOENT 502 } else if fh.inode.KnownSize == nil { 503 err = io.EOF 504 } else { 505 err = io.EOF 506 } 507 return 508 } 509 510 fs := fh.inode.fs 511 512 if fh.poolHandle == nil { 513 fh.poolHandle = fs.bufferPool 514 } 515 516 if fh.readBufOffset != offset { 517 // XXX out of order read, maybe disable prefetching 518 fh.inode.logFuse("out of order read", offset, fh.readBufOffset) 519 520 fh.readBufOffset = offset 521 fh.seqReadAmount = 0 522 if fh.reader != nil { 523 fh.reader.Close() 524 fh.reader = nil 525 } 526 527 if fh.buffers != nil { 528 // we misdetected 529 fh.numOOORead++ 530 } 531 532 for _, b := range fh.buffers { 533 b.buf.Close() 534 } 535 fh.buffers = nil 536 } 537 538 if !fs.flags.Cheap && fh.seqReadAmount >= uint64(READAHEAD_CHUNK) && fh.numOOORead < 3 { 539 if fh.reader != nil { 540 fh.inode.logFuse("cutover to the parallel algorithm") 541 fh.reader.Close() 542 fh.reader = nil 543 } 544 545 err = fh.readAhead(uint64(offset), len(buf)) 546 if err == nil { 547 bytesRead, err = fh.readFromReadAhead(uint64(offset), buf) 548 return 549 } else { 550 // fall back to read serially 551 fh.inode.logFuse("not enough memory, fallback to serial read") 552 fh.seqReadAmount = 0 553 for _, b := range fh.buffers { 554 b.buf.Close() 555 } 556 fh.buffers = nil 557 } 558 } 559 560 bytesRead, err = fh.readFromStream(offset, buf) 561 562 return 563 } 564 565 func (fh *FileHandle) Release() { 566 // read buffers 567 for _, b := range fh.buffers { 568 b.buf.Close() 569 } 570 fh.buffers = nil 571 572 if fh.reader != nil { 573 fh.reader.Close() 574 } 575 576 // write buffers 577 if fh.poolHandle != nil { 578 if fh.buf != nil && fh.buf.buffers != nil { 579 if fh.lastWriteError == nil { 580 panic("buf not freed but error is nil") 581 } 582 583 fh.buf.Free() 584 // the other in-flight multipart PUT buffers will be 585 // freed when they finish/error out 586 } 587 } 588 589 fh.inode.mu.Lock() 590 defer fh.inode.mu.Unlock() 591 592 if atomic.AddInt32(&fh.inode.fileHandles, -1) == -1 { 593 panic(fh.inode.fileHandles) 594 } 595 } 596 597 func (fh *FileHandle) readFromStream(offset int64, buf []byte) (bytesRead int, err error) { 598 defer func() { 599 if fh.inode.fs.flags.DebugFuse { 600 fh.inode.logFuse("< readFromStream", bytesRead) 601 } 602 }() 603 604 if uint64(offset) >= fh.inode.Attributes.Size { 605 // nothing to read 606 return 607 } 608 609 if fh.reader == nil { 610 resp, err := fh.cloud.GetBlob(&GetBlobInput{ 611 Key: fh.key, 612 Start: uint64(offset), 613 }) 614 if err != nil { 615 return bytesRead, err 616 } 617 618 fh.reader = resp.Body 619 } 620 621 bytesRead, err = fh.reader.Read(buf) 622 if err != nil { 623 if err != io.EOF { 624 fh.inode.logFuse("< readFromStream error", bytesRead, err) 625 } 626 // always retry error on read 627 fh.reader.Close() 628 fh.reader = nil 629 err = nil 630 } 631 632 return 633 } 634 635 func (fh *FileHandle) flushSmallFile() (err error) { 636 buf := fh.buf 637 fh.buf = nil 638 639 if buf == nil { 640 buf = MBuf{}.Init(fh.poolHandle, 0, true) 641 } 642 643 defer buf.Free() 644 645 fs := fh.inode.fs 646 647 fs.replicators.Take(1, true) 648 defer fs.replicators.Return(1) 649 650 // we want to get key from inode because the file could have been renamed 651 _, key := fh.inode.cloud() 652 resp, err := fh.cloud.PutBlob(&PutBlobInput{ 653 Key: key, 654 Body: buf, 655 Size: PUInt64(uint64(buf.Len())), 656 ContentType: fs.flags.GetMimeType(*fh.inode.FullName()), 657 }) 658 if err != nil { 659 fh.lastWriteError = err 660 } else { 661 inode := fh.inode 662 inode.mu.Lock() 663 defer inode.mu.Unlock() 664 if resp.ETag != nil { 665 inode.s3Metadata["etag"] = []byte(*resp.ETag) 666 } 667 if resp.StorageClass != nil { 668 inode.s3Metadata["storage-class"] = []byte(*resp.StorageClass) 669 } 670 } 671 return 672 } 673 674 func (fh *FileHandle) resetToKnownSize() { 675 if fh.inode.KnownSize != nil { 676 fh.inode.Attributes.Size = *fh.inode.KnownSize 677 } else { 678 fh.inode.Attributes.Size = 0 679 fh.inode.Invalid = true 680 } 681 } 682 683 func (fh *FileHandle) FlushFile() (err error) { 684 fh.mu.Lock() 685 defer fh.mu.Unlock() 686 687 fh.inode.logFuse("FlushFile") 688 689 if !fh.dirty || fh.lastWriteError != nil { 690 if fh.lastWriteError != nil { 691 err = fh.lastWriteError 692 fh.resetToKnownSize() 693 } 694 return 695 } 696 697 if fh.inode.Parent == nil { 698 // the file is deleted 699 if fh.mpuId != nil { 700 go func() { 701 _, _ = fh.cloud.MultipartBlobAbort(fh.mpuId) 702 fh.mpuId = nil 703 }() 704 } 705 return 706 } 707 708 fs := fh.inode.fs 709 710 // abort mpu on error 711 defer func() { 712 if err != nil { 713 if fh.mpuId != nil { 714 go func() { 715 _, _ = fh.cloud.MultipartBlobAbort(fh.mpuId) 716 fh.mpuId = nil 717 }() 718 } 719 720 fh.resetToKnownSize() 721 } else { 722 if fh.dirty { 723 // don't unset this if we never actually flushed 724 size := fh.inode.Attributes.Size 725 fh.inode.KnownSize = &size 726 fh.inode.Invalid = false 727 } 728 fh.dirty = false 729 } 730 731 fh.writeInit = sync.Once{} 732 fh.nextWriteOffset = 0 733 fh.lastPartId = 0 734 }() 735 736 if fh.lastPartId == 0 { 737 return fh.flushSmallFile() 738 } 739 740 fh.mpuWG.Wait() 741 742 if fh.lastWriteError != nil { 743 return fh.lastWriteError 744 } 745 746 if fh.mpuId == nil { 747 return 748 } 749 750 nParts := fh.lastPartId 751 if fh.buf != nil { 752 // upload last part 753 nParts++ 754 err = fh.mpuPartNoSpawn(fh.buf, nParts, fh.nextWriteOffset, true) 755 if err != nil { 756 return 757 } 758 fh.buf = nil 759 } 760 761 _, err = fh.cloud.MultipartBlobCommit(fh.mpuId) 762 if err != nil { 763 return 764 } 765 766 fh.mpuId = nil 767 768 // we want to get key from inode because the file could have been renamed 769 _, key := fh.inode.cloud() 770 if *fh.mpuName != key { 771 // the file was renamed 772 err = fh.inode.renameObject(fs, PUInt64(uint64(fh.nextWriteOffset)), *fh.mpuName, *fh.inode.FullName()) 773 } 774 775 return 776 }