github.com/nefixestrada/goofys@v0.23.1/internal/file.go (about) 1 // Copyright 2015 - 2017 Ka-Hing Cheung 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package internal 16 17 import ( 18 "errors" 19 "fmt" 20 "io" 21 "sync" 22 "syscall" 23 24 "github.com/jacobsa/fuse" 25 "github.com/jacobsa/fuse/fuseops" 26 ) 27 28 type FileHandle struct { 29 inode *Inode 30 cloud StorageBackend 31 key string 32 33 mpuName *string 34 dirty bool 35 writeInit sync.Once 36 mpuWG sync.WaitGroup 37 38 mu sync.Mutex 39 mpuId *MultipartBlobCommitInput 40 nextWriteOffset int64 41 lastPartId uint32 42 43 poolHandle *BufferPool 44 buf *MBuf 45 46 lastWriteError error 47 48 // read 49 reader io.ReadCloser 50 readBufOffset int64 51 52 // parallel read 53 buffers []*S3ReadBuffer 54 existingReadahead int 55 seqReadAmount uint64 56 numOOORead uint64 // number of out of order read 57 // User space PID. All threads created by a process will have the same TGID, 58 // but different PIDs[1]. 59 // This value can be nil if we fail to get TGID from PID[2]. 60 // [1] : https://godoc.org/github.com/shirou/gopsutil/process#Process.Tgid 61 // [2] : https://github.com/shirou/gopsutil#process-class 62 Tgid *int32 63 } 64 65 const MAX_READAHEAD = uint32(400 * 1024 * 1024) 66 const READAHEAD_CHUNK = uint32(20 * 1024 * 1024) 67 68 // NewFileHandle returns a new file handle for the given `inode` triggered by fuse 69 // operation with the given `opMetadata` 70 func NewFileHandle(inode *Inode, opMetadata fuseops.OpMetadata) *FileHandle { 71 tgid, err := GetTgid(opMetadata.Pid) 72 if err != nil { 73 log.Debugf( 74 "Failed to retrieve tgid for the given pid. pid: %v err: %v inode id: %v err: %v", 75 opMetadata.Pid, err, inode.Id, err) 76 } 77 fh := &FileHandle{inode: inode, Tgid: tgid} 78 fh.cloud, fh.key = inode.cloud() 79 return fh 80 } 81 82 func (fh *FileHandle) initWrite() { 83 fh.writeInit.Do(func() { 84 fh.mpuWG.Add(1) 85 go fh.initMPU() 86 }) 87 } 88 89 func (fh *FileHandle) initMPU() { 90 defer func() { 91 fh.mpuWG.Done() 92 }() 93 94 fs := fh.inode.fs 95 fh.mpuName = &fh.key 96 97 resp, err := fh.cloud.MultipartBlobBegin(&MultipartBlobBeginInput{ 98 Key: *fh.mpuName, 99 ContentType: fs.flags.GetMimeType(*fh.mpuName), 100 }) 101 102 fh.mu.Lock() 103 defer fh.mu.Unlock() 104 105 if err != nil { 106 fh.lastWriteError = mapAwsError(err) 107 } else { 108 fh.mpuId = resp 109 } 110 111 return 112 } 113 114 func (fh *FileHandle) mpuPartNoSpawn(buf *MBuf, part uint32, total int64, last bool) (err error) { 115 fs := fh.inode.fs 116 117 fs.replicators.Take(1, true) 118 defer fs.replicators.Return(1) 119 120 if part == 0 || part > 10000 { 121 return errors.New(fmt.Sprintf("invalid part number: %v", part)) 122 } 123 124 mpu := MultipartBlobAddInput{ 125 Commit: fh.mpuId, 126 PartNumber: part, 127 Body: buf, 128 Size: uint64(buf.Len()), 129 Last: last, 130 Offset: uint64(total - int64(buf.Len())), 131 } 132 133 defer func() { 134 if mpu.Body != nil { 135 bufferLog.Debugf("Free %T", buf) 136 buf.Free() 137 } 138 }() 139 140 _, err = fh.cloud.MultipartBlobAdd(&mpu) 141 142 return 143 } 144 145 func (fh *FileHandle) mpuPart(buf *MBuf, part uint32, total int64) { 146 defer func() { 147 fh.mpuWG.Done() 148 }() 149 150 // maybe wait for CreateMultipartUpload 151 if fh.mpuId == nil { 152 fh.mpuWG.Wait() 153 // initMPU might have errored 154 if fh.mpuId == nil { 155 return 156 } 157 } 158 159 err := fh.mpuPartNoSpawn(buf, part, total, false) 160 if err != nil { 161 if fh.lastWriteError == nil { 162 fh.lastWriteError = err 163 } 164 } 165 } 166 167 func (fh *FileHandle) waitForCreateMPU() (err error) { 168 if fh.mpuId == nil { 169 fh.mu.Unlock() 170 fh.initWrite() 171 fh.mpuWG.Wait() // wait for initMPU 172 fh.mu.Lock() 173 174 if fh.lastWriteError != nil { 175 return fh.lastWriteError 176 } 177 } 178 179 return 180 } 181 182 func (fh *FileHandle) partSize() uint64 { 183 var size uint64 184 185 if fh.lastPartId < 1000 { 186 size = 5 * 1024 * 1024 187 } else if fh.lastPartId < 2000 { 188 size = 25 * 1024 * 1024 189 } else { 190 size = 125 * 1024 * 1024 191 } 192 193 maxPartSize := fh.cloud.Capabilities().MaxMultipartSize 194 if maxPartSize != 0 { 195 size = MinUInt64(maxPartSize, size) 196 } 197 return size 198 } 199 200 func (fh *FileHandle) uploadCurrentBuf(parallel bool) (err error) { 201 err = fh.waitForCreateMPU() 202 if err != nil { 203 return 204 } 205 206 fh.lastPartId++ 207 part := fh.lastPartId 208 buf := fh.buf 209 fh.buf = nil 210 211 if parallel { 212 fh.mpuWG.Add(1) 213 go fh.mpuPart(buf, part, fh.nextWriteOffset) 214 } else { 215 err = fh.mpuPartNoSpawn(buf, part, fh.nextWriteOffset, false) 216 if fh.lastWriteError == nil { 217 fh.lastWriteError = err 218 } 219 } 220 221 return 222 } 223 224 func (fh *FileHandle) WriteFile(offset int64, data []byte) (err error) { 225 fh.inode.logFuse("WriteFile", offset, len(data)) 226 227 fh.mu.Lock() 228 defer fh.mu.Unlock() 229 230 if fh.lastWriteError != nil { 231 return fh.lastWriteError 232 } 233 234 if offset != fh.nextWriteOffset { 235 fh.inode.errFuse("WriteFile: only sequential writes supported", fh.nextWriteOffset, offset) 236 fh.lastWriteError = syscall.ENOTSUP 237 return fh.lastWriteError 238 } 239 240 if offset == 0 { 241 fh.poolHandle = fh.inode.fs.bufferPool 242 fh.dirty = true 243 } 244 245 for { 246 if fh.buf == nil { 247 fh.buf = MBuf{}.Init(fh.poolHandle, fh.partSize(), true) 248 } 249 250 nCopied, _ := fh.buf.Write(data) 251 fh.nextWriteOffset += int64(nCopied) 252 253 if fh.buf.Full() { 254 err = fh.uploadCurrentBuf(!fh.cloud.Capabilities().NoParallelMultipart) 255 if err != nil { 256 return 257 } 258 } 259 260 if nCopied == len(data) { 261 break 262 } 263 264 data = data[nCopied:] 265 } 266 267 fh.inode.Attributes.Size = uint64(fh.nextWriteOffset) 268 269 return 270 } 271 272 type S3ReadBuffer struct { 273 s3 StorageBackend 274 startOffset uint64 275 nRetries uint8 276 mbuf *MBuf 277 278 offset uint64 279 size uint32 280 buf *Buffer 281 } 282 283 func (b S3ReadBuffer) Init(fh *FileHandle, offset uint64, size uint32) *S3ReadBuffer { 284 b.s3 = fh.cloud 285 b.offset = offset 286 b.startOffset = offset 287 b.size = size 288 b.nRetries = 3 289 290 b.mbuf = MBuf{}.Init(fh.poolHandle, uint64(size), false) 291 if b.mbuf == nil { 292 return nil 293 } 294 295 b.initBuffer(fh, offset, size) 296 return &b 297 } 298 299 func (b *S3ReadBuffer) initBuffer(fh *FileHandle, offset uint64, size uint32) { 300 getFunc := func() (io.ReadCloser, error) { 301 resp, err := b.s3.GetBlob(&GetBlobInput{ 302 Key: fh.key, 303 Start: offset, 304 Count: uint64(size), 305 }) 306 if err != nil { 307 return nil, err 308 } 309 310 return resp.Body, nil 311 } 312 313 if b.buf == nil { 314 b.buf = Buffer{}.Init(b.mbuf, getFunc) 315 } else { 316 b.buf.ReInit(getFunc) 317 } 318 } 319 320 func (b *S3ReadBuffer) Read(offset uint64, p []byte) (n int, err error) { 321 if b.offset == offset { 322 n, err = io.ReadFull(b.buf, p) 323 if n != 0 && err == io.ErrUnexpectedEOF { 324 err = nil 325 } 326 if n > 0 { 327 if uint32(n) > b.size { 328 panic(fmt.Sprintf("read more than available %v %v", n, b.size)) 329 } 330 331 b.offset += uint64(n) 332 b.size -= uint32(n) 333 } 334 if b.size == 0 && err != nil { 335 // we've read everything, sometimes we may 336 // request for more bytes then there's left in 337 // this chunk so we could get an error back, 338 // ex: http2: response body closed this 339 // doesn't tend to happen because our chunks 340 // are aligned to 4K and also 128K (except for 341 // the last chunk, but seems kernel requests 342 // for a smaller buffer for the last chunk) 343 err = nil 344 } 345 346 return 347 } else { 348 panic(fmt.Sprintf("not the right buffer, expecting %v got %v, %v left", b.offset, offset, b.size)) 349 err = errors.New(fmt.Sprintf("not the right buffer, expecting %v got %v", b.offset, offset)) 350 return 351 } 352 } 353 354 func (fh *FileHandle) readFromReadAhead(offset uint64, buf []byte) (bytesRead int, err error) { 355 var nread int 356 for len(fh.buffers) != 0 { 357 readAheadBuf := fh.buffers[0] 358 359 nread, err = readAheadBuf.Read(offset+uint64(bytesRead), buf) 360 bytesRead += nread 361 if err != nil { 362 if err == io.EOF && readAheadBuf.size != 0 { 363 // in case we hit 364 // https://github.com/kahing/goofys/issues/464 365 // again, this will convert that into 366 // an error 367 fuseLog.Errorf("got EOF when data remains: %v", *fh.inode.FullName()) 368 err = io.ErrUnexpectedEOF 369 } else if err != io.EOF && readAheadBuf.size > 0 { 370 // we hit some other errors when 371 // reading from this part. If we can 372 // retry, do that 373 if readAheadBuf.nRetries > 0 { 374 readAheadBuf.nRetries -= 1 375 readAheadBuf.initBuffer(fh, readAheadBuf.offset, readAheadBuf.size) 376 // we unset error and return, 377 // so upper layer will retry 378 // this read 379 err = nil 380 } 381 } 382 return 383 } 384 385 if readAheadBuf.size == 0 { 386 // we've exhausted the first buffer 387 readAheadBuf.buf.Close() 388 fh.buffers = fh.buffers[1:] 389 } 390 391 buf = buf[nread:] 392 393 if len(buf) == 0 { 394 // we've filled the user buffer 395 return 396 } 397 } 398 399 return 400 } 401 402 func (fh *FileHandle) readAhead(offset uint64, needAtLeast int) (err error) { 403 existingReadahead := uint32(0) 404 for _, b := range fh.buffers { 405 existingReadahead += b.size 406 } 407 408 readAheadAmount := MAX_READAHEAD 409 410 for readAheadAmount-existingReadahead >= READAHEAD_CHUNK { 411 off := offset + uint64(existingReadahead) 412 remaining := fh.inode.Attributes.Size - off 413 414 // only read up to readahead chunk each time 415 size := MinUInt32(readAheadAmount-existingReadahead, READAHEAD_CHUNK) 416 // but don't read past the file 417 size = uint32(MinUInt64(uint64(size), remaining)) 418 419 if size != 0 { 420 fh.inode.logFuse("readahead", off, size, existingReadahead) 421 422 readAheadBuf := S3ReadBuffer{}.Init(fh, off, size) 423 if readAheadBuf != nil { 424 fh.buffers = append(fh.buffers, readAheadBuf) 425 existingReadahead += size 426 } else { 427 if existingReadahead != 0 { 428 // don't do more readahead now, but don't fail, cross our 429 // fingers that we will be able to allocate the buffers 430 // later 431 return nil 432 } else { 433 return syscall.ENOMEM 434 } 435 } 436 } 437 438 if size != READAHEAD_CHUNK { 439 // that was the last remaining chunk to readahead 440 break 441 } 442 } 443 444 return nil 445 } 446 447 func (fh *FileHandle) ReadFile(offset int64, buf []byte) (bytesRead int, err error) { 448 fh.inode.logFuse("ReadFile", offset, len(buf)) 449 defer func() { 450 fh.inode.logFuse("< ReadFile", bytesRead, err) 451 452 if err != nil { 453 if err == io.EOF { 454 err = nil 455 } 456 } 457 }() 458 459 fh.mu.Lock() 460 defer fh.mu.Unlock() 461 462 nwant := len(buf) 463 var nread int 464 465 for bytesRead < nwant && err == nil { 466 nread, err = fh.readFile(offset+int64(bytesRead), buf[bytesRead:]) 467 if nread > 0 { 468 bytesRead += nread 469 } 470 } 471 472 return 473 } 474 475 func (fh *FileHandle) readFile(offset int64, buf []byte) (bytesRead int, err error) { 476 defer func() { 477 if bytesRead > 0 { 478 fh.readBufOffset += int64(bytesRead) 479 fh.seqReadAmount += uint64(bytesRead) 480 } 481 482 fh.inode.logFuse("< readFile", bytesRead, err) 483 }() 484 485 if uint64(offset) >= fh.inode.Attributes.Size { 486 // nothing to read 487 if fh.inode.Invalid { 488 err = fuse.ENOENT 489 } else if fh.inode.KnownSize == nil { 490 err = io.EOF 491 } else { 492 err = io.EOF 493 } 494 return 495 } 496 497 fs := fh.inode.fs 498 499 if fh.poolHandle == nil { 500 fh.poolHandle = fs.bufferPool 501 } 502 503 if fh.readBufOffset != offset { 504 // XXX out of order read, maybe disable prefetching 505 fh.inode.logFuse("out of order read", offset, fh.readBufOffset) 506 507 fh.readBufOffset = offset 508 fh.seqReadAmount = 0 509 if fh.reader != nil { 510 fh.reader.Close() 511 fh.reader = nil 512 } 513 514 if fh.buffers != nil { 515 // we misdetected 516 fh.numOOORead++ 517 } 518 519 for _, b := range fh.buffers { 520 b.buf.Close() 521 } 522 fh.buffers = nil 523 } 524 525 if !fs.flags.Cheap && fh.seqReadAmount >= uint64(READAHEAD_CHUNK) && fh.numOOORead < 3 { 526 if fh.reader != nil { 527 fh.inode.logFuse("cutover to the parallel algorithm") 528 fh.reader.Close() 529 fh.reader = nil 530 } 531 532 err = fh.readAhead(uint64(offset), len(buf)) 533 if err == nil { 534 bytesRead, err = fh.readFromReadAhead(uint64(offset), buf) 535 return 536 } else { 537 // fall back to read serially 538 fh.inode.logFuse("not enough memory, fallback to serial read") 539 fh.seqReadAmount = 0 540 for _, b := range fh.buffers { 541 b.buf.Close() 542 } 543 fh.buffers = nil 544 } 545 } 546 547 bytesRead, err = fh.readFromStream(offset, buf) 548 549 return 550 } 551 552 func (fh *FileHandle) Release() { 553 // read buffers 554 for _, b := range fh.buffers { 555 b.buf.Close() 556 } 557 fh.buffers = nil 558 559 if fh.reader != nil { 560 fh.reader.Close() 561 } 562 563 // write buffers 564 if fh.poolHandle != nil { 565 if fh.buf != nil && fh.buf.buffers != nil { 566 if fh.lastWriteError == nil { 567 panic("buf not freed but error is nil") 568 } 569 570 fh.buf.Free() 571 // the other in-flight multipart PUT buffers will be 572 // freed when they finish/error out 573 } 574 } 575 576 fh.inode.mu.Lock() 577 defer fh.inode.mu.Unlock() 578 579 if fh.inode.fileHandles == 0 { 580 panic(fh.inode.fileHandles) 581 } 582 583 fh.inode.fileHandles -= 1 584 } 585 586 func (fh *FileHandle) readFromStream(offset int64, buf []byte) (bytesRead int, err error) { 587 defer func() { 588 if fh.inode.fs.flags.DebugFuse { 589 fh.inode.logFuse("< readFromStream", bytesRead) 590 } 591 }() 592 593 if uint64(offset) >= fh.inode.Attributes.Size { 594 // nothing to read 595 return 596 } 597 598 if fh.reader == nil { 599 resp, err := fh.cloud.GetBlob(&GetBlobInput{ 600 Key: fh.key, 601 Start: uint64(offset), 602 }) 603 if err != nil { 604 return bytesRead, err 605 } 606 607 fh.reader = resp.Body 608 } 609 610 bytesRead, err = fh.reader.Read(buf) 611 if err != nil { 612 if err != io.EOF { 613 fh.inode.logFuse("< readFromStream error", bytesRead, err) 614 } 615 // always retry error on read 616 fh.reader.Close() 617 fh.reader = nil 618 err = nil 619 } 620 621 return 622 } 623 624 func (fh *FileHandle) flushSmallFile() (err error) { 625 buf := fh.buf 626 fh.buf = nil 627 628 if buf == nil { 629 buf = MBuf{}.Init(fh.poolHandle, 0, true) 630 } 631 632 defer buf.Free() 633 634 fs := fh.inode.fs 635 636 fs.replicators.Take(1, true) 637 defer fs.replicators.Return(1) 638 639 // we want to get key from inode because the file could have been renamed 640 _, key := fh.inode.cloud() 641 resp, err := fh.cloud.PutBlob(&PutBlobInput{ 642 Key: key, 643 Body: buf, 644 Size: PUInt64(uint64(buf.Len())), 645 ContentType: fs.flags.GetMimeType(*fh.inode.FullName()), 646 }) 647 if err != nil { 648 fh.lastWriteError = err 649 } else { 650 inode := fh.inode 651 inode.mu.Lock() 652 defer inode.mu.Unlock() 653 if resp.ETag != nil { 654 inode.s3Metadata["etag"] = []byte(*resp.ETag) 655 } 656 if resp.StorageClass != nil { 657 inode.s3Metadata["storage-class"] = []byte(*resp.StorageClass) 658 } 659 } 660 return 661 } 662 663 func (fh *FileHandle) resetToKnownSize() { 664 if fh.inode.KnownSize != nil { 665 fh.inode.Attributes.Size = *fh.inode.KnownSize 666 } else { 667 fh.inode.Attributes.Size = 0 668 fh.inode.Invalid = true 669 } 670 } 671 672 func (fh *FileHandle) FlushFile() (err error) { 673 fh.mu.Lock() 674 defer fh.mu.Unlock() 675 676 fh.inode.logFuse("FlushFile") 677 678 if !fh.dirty || fh.lastWriteError != nil { 679 if fh.lastWriteError != nil { 680 err = fh.lastWriteError 681 fh.resetToKnownSize() 682 } 683 return 684 } 685 686 fs := fh.inode.fs 687 688 // abort mpu on error 689 defer func() { 690 if err != nil { 691 if fh.mpuId != nil { 692 go func() { 693 _, _ = fh.cloud.MultipartBlobAbort(fh.mpuId) 694 fh.mpuId = nil 695 }() 696 } 697 698 fh.resetToKnownSize() 699 } else { 700 if fh.dirty { 701 // don't unset this if we never actually flushed 702 size := fh.inode.Attributes.Size 703 fh.inode.KnownSize = &size 704 fh.inode.Invalid = false 705 } 706 fh.dirty = false 707 } 708 709 fh.writeInit = sync.Once{} 710 fh.nextWriteOffset = 0 711 fh.lastPartId = 0 712 }() 713 714 if fh.lastPartId == 0 { 715 return fh.flushSmallFile() 716 } 717 718 fh.mpuWG.Wait() 719 720 if fh.lastWriteError != nil { 721 return fh.lastWriteError 722 } 723 724 if fh.mpuId == nil { 725 return 726 } 727 728 nParts := fh.lastPartId 729 if fh.buf != nil { 730 // upload last part 731 nParts++ 732 err = fh.mpuPartNoSpawn(fh.buf, nParts, fh.nextWriteOffset, true) 733 if err != nil { 734 return 735 } 736 fh.buf = nil 737 } 738 739 _, err = fh.cloud.MultipartBlobCommit(fh.mpuId) 740 if err != nil { 741 return 742 } 743 744 fh.mpuId = nil 745 746 // we want to get key from inode because the file could have been renamed 747 _, key := fh.inode.cloud() 748 if *fh.mpuName != key { 749 // the file was renamed 750 err = fh.inode.renameObject(fs, PUInt64(uint64(fh.nextWriteOffset)), *fh.mpuName, *fh.inode.FullName()) 751 } 752 753 return 754 }