github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/cmd/grail-fuse/gfs/gfs.go (about)

     1  // Package gfs implements FUSE on top of grailfile.  Function Main is the entry
     2  // point.
     3  package gfs
     4  
     5  import (
     6  	"context"
     7  	"crypto/sha512"
     8  	"encoding/binary"
     9  	"fmt"
    10  	"io"
    11  	"os"
    12  	"runtime/debug"
    13  	"sync"
    14  	"sync/atomic"
    15  	"syscall"
    16  	"time"
    17  	"unsafe"
    18  
    19  	"github.com/Schaudge/grailbase/errors"
    20  	"github.com/Schaudge/grailbase/file"
    21  	"github.com/Schaudge/grailbase/log"
    22  	gunsafe "github.com/Schaudge/grailbase/unsafe"
    23  	"github.com/hanwen/go-fuse/v2/fs"
    24  	"github.com/hanwen/go-fuse/v2/fuse"
    25  )
    26  
// inode represents a file or a directory.
//
// fs.Inode must stay the first field: downCast reinterprets an *fs.Inode as an
// *inode through unsafe.Pointer.
type inode struct {
	fs.Inode
	// full pathname, such as "s3://bucket/key0/key1"
	path string
	// dir entry as stored in the parent directory.
	ent fuse.DirEntry

	mu   sync.Mutex // guards the following fields.
	stat cachedStat // TODO: Remove this since we're now using kernel caching.

	// nDirStreamRef tracks the usage of this inode in DirStreams. It is used
	// to decide whether an inode can be reused to service LOOKUP
	// operations. To handle READDIRPLUS, go-fuse interleaves LOOKUP calls for
	// each directory entry. We allow the inode associated with the previous
	// directory entry to be used in LOOKUP to avoid costly API calls.
	//
	// Because an inode can be the previous entry in multiple DirStreams, we
	// maintain a reference count.
	//
	// It is possible for the inode to be forgotten, e.g. when the kernel is
	// low on memory, before the LOOKUP call. If this happens, LOOKUP will not
	// be able to reuse it. This seems to happen rarely, if at all, in
	// practice.
	//
	// Unlike stat, this counter is read and written with sync/atomic (see
	// addDirStreamRef, dropDirStreamRef and previousOfAnyDirStream), not
	// under mu.
	nDirStreamRef int32
}
    53  
// cacheExpiration is the amount of time to cache directory entries and file
// stats (size, mtime).
const cacheExpiration = 5 * time.Minute
    56  
// rootInode is a singleton inode created for the root mount point. Other
// inodes reach it through inode.root().
type rootInode struct {
	inode
	// The context to be used for all file operations. It's vcontext.Background()
	// in Grail environments.
	// TODO(josh): Consider removing and using operation-specific contexts instead (like readdir).
	ctx context.Context
	// Directory for storing tmp files. maybeInitIO creates the local copy of
	// a file opened in tmp-IO mode here, named after the inode number.
	tmpDir string
}
    67  
// handle represents an open file handle.
type handle struct {
	// The file that the handle belongs to
	inode *inode
	// Open mode bits. O_WRONLY, etc.
	openMode uint32
	// Size passed to Setattr, if any. -1 if not set.
	requestedSize int64
	// Remembers the result of the first Flush. If Flush is called multiple times
	// they will return this code.
	closeErrno syscall.Errno

	// At most one of the following three will be set. dr is set by
	// inode.Open for handles opened without any write flag; dw and tmp are
	// initialized lazily by maybeInitIO.
	dw  *directWrite // O_WRONLY|O_TRUNC, or O_WRONLY for a new file.
	dr  *directRead  // O_RDONLY.
	tmp *tmpIO       // everything else, e.g., O_RDWR or O_APPEND.
}
    86  
    87  // openMode is a bitmap of O_RDONLY, O_APPEND, etc.
    88  func newHandle(inode *inode, openMode uint32) *handle {
    89  	return &handle{inode: inode, openMode: openMode, requestedSize: -1}
    90  }
    91  
// directWrite is part of open file handle. It uploads data directly to the remote
// file. Used when creating a new file, or overwriting an existing file with
// O_WRONLY|O_TRUNC.
//
// Flush closes fp and then sets both fp and w to nil; nil values therefore
// mean "already flushed".
type directWrite struct {
	fp file.File
	w  io.Writer
	// The next expected write offset. Calling Write on a wrong offset results in
	// error (w doesn't implement a seeker). Write advances it by the number
	// of bytes written.
	off int64
}
   102  
// directRead is part of open file handle. It is used when reading a file
// readonly. It is created by inode.Open and fp is closed by handle.Release.
type directRead struct {
	fp file.File
	r  io.ReadSeeker // reader obtained from fp; Read seeks it to the requested offset before every read.
}
   109  
// tmpIO is part of open file handle. It writes data to a file in the local file
// system. On Flush (i.e., close), the file contents are copied to the remote
// file. It is used w/ O_RDWR, O_APPEND, etc.
type tmpIO struct {
	fp *os.File // refers to a file in -tmp-dir. Set to nil by Flush once the file has been closed.
}
   116  
// cachedStat is stored in inode and a directory entry to provide quick access
// to basic stats.
type cachedStat struct {
	// expiration is the time after which getCachedStat stops trusting size
	// and modTime and stats the file again.
	expiration time.Time
	size       int64
	modTime    time.Time
}
   124  
   125  func downCast(n *fs.Inode) *inode {
   126  	nn := (*inode)(unsafe.Pointer(n))
   127  	if nn.path == "" {
   128  		log.Panicf("not an inode: %+v", n)
   129  	}
   130  	return nn
   131  }
   132  
// Compile-time checks that inode and handle implement the go-fuse interfaces
// this package relies on.
var (
	_ fs.InodeEmbedder = (*inode)(nil)

	// Node-level operations.
	_ fs.NodeAccesser  = (*inode)(nil)
	_ fs.NodeCreater   = (*inode)(nil)
	_ fs.NodeGetattrer = (*inode)(nil)
	_ fs.NodeLookuper  = (*inode)(nil)
	_ fs.NodeMkdirer   = (*inode)(nil)
	_ fs.NodeOpener    = (*inode)(nil)
	_ fs.NodeReaddirer = (*inode)(nil)
	_ fs.NodeRmdirer   = (*inode)(nil)
	_ fs.NodeSetattrer = (*inode)(nil)
	_ fs.NodeUnlinker  = (*inode)(nil)

	// Open-file operations.
	_ fs.FileFlusher  = (*handle)(nil)
	_ fs.FileFsyncer  = (*handle)(nil)
	_ fs.FileLseeker  = (*handle)(nil)
	_ fs.FileReader   = (*handle)(nil)
	_ fs.FileReleaser = (*handle)(nil)
	_ fs.FileWriter   = (*handle)(nil)
)
   154  
   155  func newAttr(ino uint64, mode uint32, size uint64, optionalMtime time.Time) (attr fuse.Attr) {
   156  	const blockSize = 1 << 20
   157  	attr.Ino = ino
   158  	attr.Mode = mode
   159  	attr.Nlink = 1
   160  	attr.Size = size
   161  	attr.Blocks = (attr.Size-1)/blockSize + 1
   162  	if !optionalMtime.IsZero() {
   163  		attr.SetTimes(nil, &optionalMtime, nil)
   164  	}
   165  	return
   166  }
   167  
   168  // GetModeBits produces the persistent mode bits so that the kernel can
   169  // distinguish regular files from directories.
   170  func getModeBits(isDir bool) uint32 {
   171  	mode := uint32(0)
   172  	if isDir {
   173  		mode |= syscall.S_IFDIR | 0755
   174  	} else {
   175  		mode |= syscall.S_IFREG | 0644
   176  	}
   177  	return mode
   178  }
   179  
   180  // GetIno produces a fake inode number by hashing the path.
   181  func getIno(path string) uint64 {
   182  	h := sha512.Sum512_256(gunsafe.StringToBytes(path))
   183  	return binary.LittleEndian.Uint64(h[:8])
   184  }
   185  
   186  // GetFileName extracts the filename part of the path. "dir" is the directory
   187  // that the file belongs in.
   188  func getFileName(dir *inode, path string) string {
   189  	if dir.IsRoot() {
   190  		return path[len(dir.path):]
   191  	}
   192  	return path[len(dir.path)+1:] // +1 to remove '/'.
   193  }
   194  
   195  func errToErrno(err error) syscall.Errno {
   196  	if err == nil {
   197  		return 0
   198  	}
   199  	log.Debug.Printf("error %v: stack=%s", err, string(debug.Stack()))
   200  	switch {
   201  	case err == nil:
   202  		return 0
   203  	case errors.Is(errors.Timeout, err):
   204  		return syscall.ETIMEDOUT
   205  	case errors.Is(errors.Canceled, err):
   206  		return syscall.EINTR
   207  	case errors.Is(errors.NotExist, err):
   208  		return syscall.ENOENT
   209  	case errors.Is(errors.Exists, err):
   210  		return syscall.EEXIST
   211  	case errors.Is(errors.NotAllowed, err):
   212  		return syscall.EACCES
   213  	case errors.Is(errors.Integrity, err):
   214  		return syscall.EIO
   215  	case errors.Is(errors.Invalid, err):
   216  		return syscall.EINVAL
   217  	case errors.Is(errors.Precondition, err), errors.Is(errors.Unavailable, err):
   218  		return syscall.EAGAIN
   219  	case errors.Is(errors.Net, err):
   220  		return syscall.ENETUNREACH
   221  	case errors.Is(errors.TooManyTries, err):
   222  		log.Error.Print(err)
   223  		return syscall.EINVAL
   224  	}
   225  	return fs.ToErrno(err)
   226  }
   227  
   228  // Root reports the inode of the root mountpoint.
   229  func (n *inode) root() *rootInode { return n.Root().Operations().(*rootInode) }
   230  
   231  // Ctx reports the context passed from the application when mounting the
   232  // filesystem.
   233  func (n *inode) ctx() context.Context { return n.root().ctx }
   234  
   235  // addDirStreamRef adds a single reference to this inode. It must be eventually
   236  // followed by a dropRef.
   237  func (n *inode) addDirStreamRef() {
   238  	_ = atomic.AddInt32(&n.nDirStreamRef, 1)
   239  }
   240  
   241  // dropDirStreamRef drops a single reference to this inode.
   242  func (n *inode) dropDirStreamRef() {
   243  	if x := atomic.AddInt32(&n.nDirStreamRef, -1); x < 0 {
   244  		panic("negative reference count; unmatched drop")
   245  	}
   246  }
   247  
   248  // previousOfAnyDirStream returns true iff the inode is the previous entry
   249  // returned by any outstanding DirStream.
   250  func (n *inode) previousOfAnyDirStream() bool {
   251  	return atomic.LoadInt32(&n.nDirStreamRef) > 0
   252  }
   253  
   254  // Access is called to implement access(2).
   255  func (n *inode) Access(_ context.Context, mask uint32) syscall.Errno {
   256  	// TODO(saito) I'm not sure returning 0 blindly is ok here.
   257  	log.Debug.Printf("setattr %s: mask=%x", n.path, mask)
   258  	return 0
   259  }
   260  
   261  // Setattr is called to change file attributes. This function only supports
   262  // changing the size.
   263  func (n *inode) Setattr(_ context.Context, fhi fs.FileHandle, in *fuse.SetAttrIn, out *fuse.AttrOut) syscall.Errno {
   264  	n.mu.Lock()
   265  	defer n.mu.Unlock()
   266  
   267  	usize, ok := in.GetSize()
   268  	if !ok {
   269  		// We don't support setting other attributes now.
   270  		return 0
   271  	}
   272  	size := int64(usize)
   273  
   274  	if fhi != nil {
   275  		fh := fhi.(*handle)
   276  		switch {
   277  		case fh.dw != nil:
   278  			if size == fh.dw.off {
   279  				return 0
   280  			}
   281  			log.Error.Printf("setattr %s: setting size to %d in directio mode not supported (request: %+v)", n.path, size, in)
   282  			return syscall.ENOSYS
   283  		case fh.dr != nil:
   284  			log.Error.Printf("setattr %s: readonly", n.path)
   285  			return syscall.EPERM
   286  		case fh.tmp != nil:
   287  			return errToErrno(fh.tmp.fp.Truncate(size))
   288  		default:
   289  			fh.requestedSize = size
   290  			return 0
   291  		}
   292  	}
   293  
   294  	if size != 0 {
   295  		log.Error.Printf("setattr %s: setting size to nonzero value (%d) not supported", n.path, size)
   296  		return syscall.ENOSYS
   297  	}
   298  	ctx := n.ctx()
   299  	fp, err := file.Create(ctx, n.path)
   300  	if err != nil {
   301  		log.Error.Printf("setattr %s: %v", n.path, err)
   302  		return errToErrno(err)
   303  	}
   304  	if err := fp.Close(ctx); err != nil {
   305  		log.Error.Printf("setattr %s: %v", n.path, err)
   306  		return errToErrno(err)
   307  	}
   308  	return 0
   309  }
   310  
   311  func (n *inode) Getattr(_ context.Context, fhi fs.FileHandle, out *fuse.AttrOut) syscall.Errno {
   312  	ctx := n.ctx()
   313  	if n.ent.Ino == 0 || n.ent.Mode == 0 {
   314  		log.Panicf("node %s: ino or mode unset: %+v", n.path, n)
   315  	}
   316  	if n.IsDir() {
   317  		log.Debug.Printf("getattr %s: directory", n.path)
   318  		out.Attr = newAttr(n.ent.Ino, n.ent.Mode, 0, time.Time{})
   319  		return 0
   320  	}
   321  
   322  	var fh *handle
   323  	if fhi != nil {
   324  		fh = fhi.(*handle)
   325  	}
   326  
   327  	n.mu.Lock()
   328  	defer n.mu.Unlock()
   329  	if fh != nil {
   330  		if err := fh.maybeInitIO(); err != nil {
   331  			return errToErrno(err)
   332  		}
   333  		if t := fh.tmp; t != nil {
   334  			log.Debug.Printf("getattr %s: tmp", n.path)
   335  			stat, err := t.fp.Stat()
   336  			if err != nil {
   337  				log.Printf("getattr %s (%s): %v", n.path, t.fp.Name(), err)
   338  				return errToErrno(err)
   339  			}
   340  			out.Attr = newAttr(n.ent.Ino, n.ent.Mode, uint64(stat.Size()), stat.ModTime())
   341  			return 0
   342  		}
   343  		if fh.dw != nil {
   344  			out.Attr = newAttr(n.ent.Ino, n.ent.Mode, uint64(n.stat.size), n.stat.modTime)
   345  			return 0
   346  		}
   347  		// fall through
   348  	}
   349  	stat, err := n.getCachedStat(ctx)
   350  	if err != nil {
   351  		log.Printf("getattr %s: err %v", n.path, err)
   352  		return errToErrno(err)
   353  	}
   354  	out.Attr = newAttr(n.ent.Ino, n.ent.Mode, uint64(stat.size), stat.modTime)
   355  	log.Debug.Printf("getattr %s: out %+v", n.path, out)
   356  	return 0
   357  }
   358  
   359  func (n *inode) getCachedStat(ctx context.Context) (cachedStat, error) {
   360  	now := time.Now()
   361  	if now.After(n.stat.expiration) {
   362  		log.Debug.Printf("getcachedstat %s: cache miss", n.path)
   363  		info, err := file.Stat(ctx, n.path)
   364  		if err != nil {
   365  			log.Printf("getcachedstat %s: err %v", n.path, err)
   366  			return cachedStat{}, err
   367  		}
   368  		n.stat = cachedStat{
   369  			expiration: now.Add(cacheExpiration),
   370  			size:       info.Size(),
   371  			modTime:    info.ModTime(),
   372  		}
   373  	} else {
   374  		log.Debug.Printf("getcachedstat %s: cache hit %+v now %v", n.path, n.stat, now)
   375  	}
   376  	return n.stat, nil
   377  }
   378  
   379  // MaybeInitIO is called on the first call to Read or Write after open.  It
   380  // initializes either the directio uploader or a tempfile.
   381  //
   382  // REQUIRES: fh.inode.mu is locked
   383  func (fh *handle) maybeInitIO() error {
   384  	n := fh.inode
   385  	if fh.dw != nil || fh.dr != nil || fh.tmp != nil {
   386  		return nil
   387  	}
   388  	if (fh.openMode & fuse.O_ANYWRITE) == 0 {
   389  		// Readonly handle should have fh.direct set at the time of Open.
   390  		log.Panicf("open %s: uninitialized readonly handle", n.path)
   391  	}
   392  	if fh.inode == nil {
   393  		log.Panicf("open %s: nil inode: %+v", n.path, fh)
   394  	}
   395  	ctx := n.ctx()
   396  	if (fh.openMode&syscall.O_RDWR) != syscall.O_RDWR &&
   397  		(fh.requestedSize == 0 || (fh.openMode&syscall.O_TRUNC == syscall.O_TRUNC)) {
   398  		// We are fully overwriting the file. Do that w/o a local tmpfile.
   399  		log.Debug.Printf("open %s: direct IO", n.path)
   400  		fp, err := file.Create(ctx, n.path)
   401  		if err != nil {
   402  			return err
   403  		}
   404  		fh.dw = &directWrite{fp: fp, w: fp.Writer(ctx)}
   405  		return nil
   406  	}
   407  	// Do all reads/writes on a local tmp file, and copy it to the remote file on
   408  	// close.
   409  	log.Debug.Printf("open %s: tmp IO", n.path)
   410  	in, err := file.Open(ctx, n.path)
   411  	if err != nil {
   412  		log.Error.Printf("open %s: %v", n.path, err)
   413  		return err
   414  	}
   415  	tmpPath := file.Join(n.root().tmpDir, fmt.Sprintf("%08x", n.ent.Ino))
   416  	tmp, err := os.OpenFile(tmpPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600)
   417  	if err != nil {
   418  		log.Error.Printf("create %s (open %s): %v", tmpPath, n.path, err)
   419  		_ = in.Close(ctx)
   420  		return errToErrno(err)
   421  	}
   422  	inSize, err := io.Copy(tmp, in.Reader(ctx))
   423  	log.Debug.Printf("copy %s->%s: n+%d, %v", n.path, tmp.Name(), inSize, err)
   424  	if err != nil {
   425  		_ = in.Close(ctx)
   426  		_ = tmp.Close()
   427  		return errToErrno(err)
   428  	}
   429  	if err := in.Close(ctx); err != nil {
   430  		_ = tmp.Close()
   431  		return errToErrno(err)
   432  	}
   433  	now := time.Now()
   434  	n.stat.expiration = now.Add(cacheExpiration)
   435  	n.stat.size = inSize
   436  	n.stat.modTime = now
   437  	fh.tmp = &tmpIO{
   438  		fp: tmp,
   439  	}
   440  	return nil
   441  }
   442  
   443  func (fh *handle) Read(_ context.Context, dest []byte, off int64) (fuse.ReadResult, syscall.Errno) {
   444  	n := fh.inode
   445  	readDirect := func() (fuse.ReadResult, syscall.Errno) {
   446  		d := fh.dr
   447  		if d == nil {
   448  			return nil, syscall.EINVAL
   449  		}
   450  		log.Debug.Printf("read %s(fh=%p): off=%d seek start", n.path, fh, off)
   451  		newOff, err := d.r.Seek(off, io.SeekStart)
   452  		log.Debug.Printf("read %s(fh=%p): off=%d seek end", n.path, fh, off)
   453  		if err != nil {
   454  			return nil, errToErrno(err)
   455  		}
   456  		if newOff != off {
   457  			log.Panicf("%d <-> %d", newOff, off)
   458  		}
   459  
   460  		nByte, err := d.r.Read(dest)
   461  		log.Debug.Printf("read %s(fh=%p): off=%d, nbyte=%d, err=%v", n.path, fh, off, nByte, err)
   462  		if err != nil {
   463  			if err != io.EOF {
   464  				return nil, errToErrno(err)
   465  			}
   466  		}
   467  		return fuse.ReadResultData(dest[:nByte]), 0
   468  	}
   469  
   470  	readTmp := func() (fuse.ReadResult, syscall.Errno) {
   471  		t := fh.tmp
   472  		nByte, err := t.fp.ReadAt(dest, off)
   473  		if err != nil {
   474  			if err != io.EOF {
   475  				return nil, errToErrno(err)
   476  			}
   477  		}
   478  		return fuse.ReadResultData(dest[:nByte]), 0
   479  	}
   480  
   481  	n.mu.Lock()
   482  	defer n.mu.Unlock()
   483  	if err := fh.maybeInitIO(); err != nil {
   484  		//return fuse.ReadResult{}, errToErrno(err)
   485  		return nil, errToErrno(err)
   486  	}
   487  	switch {
   488  	case fh.dr != nil:
   489  		return readDirect()
   490  	case fh.tmp != nil:
   491  		return readTmp()
   492  	default:
   493  		log.Error.Printf("read %s: reading unopened or writeonly file", n.path)
   494  		return nil, syscall.EBADF
   495  	}
   496  }
   497  
   498  func (fh *handle) Lseek(ctx context.Context, off uint64, whence uint32) (uint64, syscall.Errno) {
   499  	const (
   500  		// Copied from https://github.com/torvalds/linux/blob/a050a6d2b7e80ca52b2f4141eaf3420d201b72b3/tools/include/uapi/linux/fs.h#L43-L47.
   501  		SEEK_DATA = 3
   502  		SEEK_HOLE = 4
   503  	)
   504  	switch whence {
   505  	case SEEK_DATA:
   506  		return off, 0 // We don't support holes so current offset is correct.
   507  	case SEEK_HOLE:
   508  		stat, err := fh.inode.getCachedStat(ctx)
   509  		if err != nil {
   510  			log.Error.Printf("lseek %s: stat: %v", fh.inode.path, err)
   511  			return 0, errToErrno(err)
   512  		}
   513  		return uint64(stat.size), 0
   514  	}
   515  	log.Error.Printf("lseek %s: unimplemented whence: %d", fh.inode.path, whence)
   516  	return 0, syscall.ENOSYS
   517  }
   518  
   519  func (fh *handle) Write(_ context.Context, dest []byte, off int64) (uint32, syscall.Errno) {
   520  	n := fh.inode
   521  	tmpWrite := func() (uint32, syscall.Errno) {
   522  		nByte, err := fh.tmp.fp.WriteAt(dest, off)
   523  		if err != nil {
   524  			log.Error.Printf("write %s: size=%d, off=%d: %v", n.path, len(dest), off, err)
   525  			return 0, errToErrno(err)
   526  		}
   527  		return uint32(nByte), 0
   528  	}
   529  
   530  	directWrite := func() (uint32, syscall.Errno) {
   531  		d := fh.dw
   532  		if d.off != off {
   533  			log.Error.Printf("write %s: offset mismatch (expect %d, got %d)", n.path, d.off, off)
   534  			return 0, syscall.EINVAL
   535  		}
   536  		if d.w == nil {
   537  			// closed already
   538  			log.Printf("write %s: already closed", n.path)
   539  			return 0, syscall.EBADF
   540  		}
   541  		nByte, err := d.w.Write(dest)
   542  		if err != nil {
   543  			if nByte > 0 {
   544  				panic(n)
   545  			}
   546  			return 0, errToErrno(err)
   547  		}
   548  		d.off += int64(nByte)
   549  		log.Debug.Printf("write %s: done %d bytes", n.path, nByte)
   550  		return uint32(nByte), 0
   551  	}
   552  
   553  	n.mu.Lock()
   554  	defer n.mu.Unlock()
   555  	log.Debug.Printf("write %s: %d bytes, off=%d", n.path, len(dest), off)
   556  	if err := fh.maybeInitIO(); err != nil {
   557  		return 0, errToErrno(err)
   558  	}
   559  	switch {
   560  	case fh.dw != nil:
   561  		return directWrite()
   562  	case fh.tmp != nil:
   563  		return tmpWrite()
   564  	default:
   565  		// file descriptor already closed
   566  		log.Error.Printf("write %s: writing after close", n.path)
   567  		return 0, syscall.EBADF
   568  	}
   569  }
   570  
   571  func (fh *handle) Fsync(_ context.Context, _ uint32) syscall.Errno {
   572  	n := fh.inode
   573  	n.mu.Lock()
   574  	defer n.mu.Unlock()
   575  	if d := fh.dw; d != nil {
   576  		n := fh.inode
   577  		// There's not much we can do, but returning ENOSYS breaks too many apps.
   578  		now := time.Now()
   579  		n.stat.expiration = now.Add(cacheExpiration)
   580  		n.stat.size = d.off
   581  		n.stat.modTime = now
   582  		log.Debug.Printf("fsync %s: update stats: stat=%v", n.path, n.stat)
   583  	}
   584  	return 0
   585  }
   586  
   587  // Release is called just before the inode is dropped from the kernel memory.
   588  // Return value is unused.
   589  func (fh *handle) Release(_ context.Context) syscall.Errno {
   590  	n := fh.inode
   591  	n.mu.Lock()
   592  	defer n.mu.Unlock()
   593  	switch {
   594  	case fh.tmp != nil:
   595  		if fh.tmp.fp != nil {
   596  			log.Panicf("%s: release called w/o flush", n.path)
   597  		}
   598  	case fh.dw != nil:
   599  		if fh.dw.fp != nil || fh.dw.w != nil {
   600  			log.Panicf("%s: release called w/o flush", n.path)
   601  		}
   602  	default:
   603  		if fh.dr != nil {
   604  			// Readonly handles are closed on the last release.
   605  			_ = fh.dr.fp.Close(n.ctx())
   606  		}
   607  	}
   608  	return 0
   609  }
   610  
   611  // Flush is called on close(2). It may be called multiple times when the file
   612  // descriptor is duped.
   613  //
   614  // TODO(saito) We don't support dups now. We close the underlying filestream on
   615  // the first close and subsequent flush calls will do nothing.
   616  func (fh *handle) Flush(_ context.Context) syscall.Errno {
   617  	n := fh.inode
   618  	ctx := n.ctx()
   619  
   620  	flushTmpAndUnlock := func() syscall.Errno {
   621  		t := fh.tmp
   622  		mu := &n.mu
   623  		defer func() {
   624  			if mu != nil {
   625  				mu.Unlock()
   626  			}
   627  		}()
   628  		if t.fp == nil {
   629  			mu.Unlock()
   630  			return fh.closeErrno
   631  		}
   632  		out, err := file.Create(ctx, n.path)
   633  		if err != nil {
   634  			log.Error.Printf("flush %s (create): err=%v", n.path, err)
   635  			fh.closeErrno = errToErrno(err)
   636  			_ = t.fp.Close()
   637  			mu.Unlock()
   638  			return fh.closeErrno
   639  		}
   640  		defer func() {
   641  			if out != nil {
   642  				_ = out.Close(ctx)
   643  			}
   644  			if t.fp != nil {
   645  				_ = t.fp.Close()
   646  				t.fp = nil
   647  			}
   648  		}()
   649  
   650  		newOff, err := t.fp.Seek(0, io.SeekStart)
   651  		if err != nil {
   652  			log.Error.Printf("flush %s (seek): err=%v", n.path, err)
   653  			fh.closeErrno = errToErrno(err)
   654  			return fh.closeErrno
   655  		}
   656  		if newOff != 0 {
   657  			log.Panicf("newoff %d", newOff)
   658  		}
   659  
   660  		nByte, err := io.Copy(out.Writer(ctx), t.fp)
   661  		if err != nil {
   662  			log.Error.Printf("flush %s (copy): err=%v", n.path, err)
   663  			fh.closeErrno = errToErrno(err)
   664  			return fh.closeErrno
   665  		}
   666  		errp := errors.Once{}
   667  		errp.Set(t.fp.Close())
   668  		errp.Set(out.Close(ctx))
   669  		out = nil
   670  		t.fp = nil
   671  		if err := errp.Err(); err != nil {
   672  			fh.closeErrno = errToErrno(err)
   673  			log.Error.Printf("flush %s (close): err=%v", n.path, err)
   674  			return fh.closeErrno
   675  		}
   676  
   677  		now := time.Now()
   678  		n.stat.expiration = now.Add(cacheExpiration)
   679  		n.stat.size = nByte
   680  		n.stat.modTime = now
   681  
   682  		closeErrno := fh.closeErrno
   683  		mu.Unlock()
   684  		mu = nil
   685  		return closeErrno
   686  	}
   687  
   688  	flushDirectAndUnlock := func() syscall.Errno {
   689  		mu := &n.mu
   690  		defer func() {
   691  			if mu != nil {
   692  				mu.Unlock()
   693  			}
   694  		}()
   695  		d := fh.dw
   696  		if d.fp == nil {
   697  			return fh.closeErrno
   698  		}
   699  
   700  		err := d.fp.Close(ctx)
   701  		fh.closeErrno = errToErrno(err)
   702  		log.Debug.Printf("flush %s fh=%p, err=%v", n.path, fh, err)
   703  		if d.w != nil {
   704  			now := time.Now()
   705  			n.stat.expiration = now.Add(cacheExpiration)
   706  			n.stat.size = d.off
   707  			n.stat.modTime = now
   708  		}
   709  		d.fp = nil
   710  		d.w = nil
   711  		closeErrno := fh.closeErrno
   712  		mu.Unlock()
   713  		mu = nil
   714  		return closeErrno
   715  	}
   716  	n.mu.Lock()
   717  	switch {
   718  	case fh.tmp != nil:
   719  		return flushTmpAndUnlock()
   720  	case fh.dw != nil:
   721  		return flushDirectAndUnlock()
   722  	}
   723  	n.mu.Unlock()
   724  	return 0
   725  }
   726  
   727  // Create is called to create a new file.
   728  func (n *inode) Create(ctx context.Context, name string, flags uint32, mode uint32,
   729  	out *fuse.EntryOut) (*fs.Inode, fs.FileHandle, uint32, syscall.Errno) {
   730  	newPath := file.Join(n.path, name)
   731  	childNode := &inode{
   732  		path: newPath,
   733  		ent: fuse.DirEntry{
   734  			Name: name,
   735  			Ino:  getIno(newPath),
   736  			Mode: getModeBits(false)}}
   737  	childInode := n.NewInode(ctx, childNode, fs.StableAttr{
   738  		Mode: childNode.ent.Mode,
   739  		Ino:  childNode.ent.Ino,
   740  	})
   741  	fh := newHandle(childNode, syscall.O_WRONLY|syscall.O_CREAT|syscall.O_TRUNC)
   742  	fh.requestedSize = 0
   743  	log.Debug.Printf("create %s: (mode %x)", n.path, mode)
   744  	out.Attr = newAttr(n.ent.Ino, n.ent.Mode, 0, time.Time{})
   745  	return childInode, fh, 0, 0
   746  }
   747  
   748  // Open opens an existing file.
   749  func (n *inode) Open(_ context.Context, mode uint32) (fs.FileHandle, uint32, syscall.Errno) {
   750  	n.mu.Lock()
   751  	defer n.mu.Unlock()
   752  	ctx := n.ctx()
   753  	if n.IsRoot() {
   754  		// The entries under the root must be buckets, so we can't open it directly.
   755  		log.Error.Printf("open %s: cannot open a file under root", n.path)
   756  		return nil, 0, syscall.EINVAL
   757  	}
   758  	_, dirInode := n.Parent()
   759  	if dirInode == nil {
   760  		log.Panicf("open %s: parent dir does't exist", n.path)
   761  	}
   762  	if (mode & fuse.O_ANYWRITE) == 0 {
   763  		fp, err := file.Open(n.ctx(), n.path)
   764  		if err != nil {
   765  			log.Error.Printf("open %s (mode %x): %v", n.path, mode, err)
   766  			return nil, 0, errToErrno(err)
   767  		}
   768  		fh := newHandle(n, mode)
   769  		fh.dr = &directRead{fp: fp, r: fp.Reader(ctx)}
   770  		log.Debug.Printf("open %s: mode %x, fh %p", n.path, mode, fh)
   771  		return fh, 0, 0
   772  	}
   773  
   774  	fh := newHandle(n, mode)
   775  	return fh, 0, 0
   776  }
   777  
// fsDirStream implements readdir. It yields ".." (unless the directory has no
// parent), then ".", then the entries produced by lister. Its methods lock
// dir.mu.
type fsDirStream struct {
	ctx    context.Context
	dir    *inode
	lister file.Lister
	err    error // First error seen by Next; once set, every later Next reports it.

	seenParent  bool // Whether Next has already returned '..'.
	seenSelf    bool // Whether Next has already returned '.'.
	peekedChild bool // Whether HasNext has Scan()-ed a child that Next hasn't returned yet.

	// previousInode is the inode of the previous entry, i.e. the most recent
	// entry returned by Next.  We hold a reference to service LOOKUP
	// operations that go-fuse issues when servicing READDIRPLUS.  See
	// inode.nDirStreamRef.
	previousInode *fs.Inode
}
   795  
   796  // HasNext implements fs.DirStream
   797  func (s *fsDirStream) HasNext() bool {
   798  	s.dir.mu.Lock() // TODO: Remove?
   799  	defer s.dir.mu.Unlock()
   800  
   801  	if s.err != nil || s.lister == nil {
   802  		return false
   803  	}
   804  	if !s.seenParent || !s.seenSelf || s.peekedChild {
   805  		return true
   806  	}
   807  	for s.lister.Scan() {
   808  		if getFileName(s.dir, s.lister.Path()) != "" {
   809  			s.peekedChild = true
   810  			return true
   811  		}
   812  		// Assume this is a directory marker:
   813  		// https://web.archive.org/web/20190424231712/https://docs.aws.amazon.com/AmazonS3/latest/user-guide/using-folders.html
   814  		// s3file's List returns these, but empty filenames seem to cause problems for FUSE.
   815  		// TODO: Filtering these in s3file, if it's ok for other users.
   816  	}
   817  	return false
   818  }
   819  
   820  // Next implements fs.DirStream
   821  func (s *fsDirStream) Next() (fuse.DirEntry, syscall.Errno) {
   822  	s.dir.mu.Lock()
   823  	defer s.dir.mu.Unlock()
   824  
   825  	if s.err != nil {
   826  		return fuse.DirEntry{}, errToErrno(s.err)
   827  	}
   828  	if err := s.lister.Err(); err != nil {
   829  		if _, canceled := <-s.ctx.Done(); canceled {
   830  			s.err = errors.E(errors.Canceled, "list canceled", err)
   831  		} else {
   832  			s.err = err
   833  		}
   834  		return fuse.DirEntry{}, errToErrno(s.err)
   835  	}
   836  
   837  	ent := fuse.DirEntry{}
   838  	stat := cachedStat{expiration: time.Now().Add(cacheExpiration)}
   839  
   840  	if !s.seenParent {
   841  		s.seenParent = true
   842  		_, parent := s.dir.Parent()
   843  		if parent != nil {
   844  			// Not root.
   845  			parentDir := downCast(parent)
   846  			ent = parentDir.ent
   847  			ent.Name = ".."
   848  			stat = parentDir.stat
   849  			return ent, 0
   850  		}
   851  	}
   852  	if !s.seenSelf {
   853  		s.seenSelf = true
   854  		ent = s.dir.ent
   855  		ent.Name = "."
   856  		stat = s.dir.stat
   857  		return ent, 0
   858  	}
   859  	s.peekedChild = false
   860  
   861  	ent = fuse.DirEntry{
   862  		Name: getFileName(s.dir, s.lister.Path()),
   863  		Mode: getModeBits(s.lister.IsDir()),
   864  		Ino:  getIno(s.lister.Path()),
   865  	}
   866  	if info := s.lister.Info(); info != nil {
   867  		stat.size, stat.modTime = info.Size(), info.ModTime()
   868  	}
   869  	inode := s.dir.NewInode(
   870  		s.ctx,
   871  		&inode{path: file.Join(s.dir.path, ent.Name), ent: ent, stat: stat},
   872  		fs.StableAttr{Mode: ent.Mode, Ino: ent.Ino},
   873  	)
   874  	_ = s.dir.AddChild(ent.Name, inode, true)
   875  	s.lockedSetPreviousInode(inode)
   876  	return ent, 0
   877  }
   878  
   879  // Close implements fs.DirStream
   880  func (s *fsDirStream) Close() {
   881  	s.dir.mu.Lock()
   882  	s.lockedClearPreviousInode()
   883  	s.dir.mu.Unlock()
   884  }
   885  
   886  func (s *fsDirStream) lockedSetPreviousInode(n *fs.Inode) {
   887  	s.lockedClearPreviousInode()
   888  	s.previousInode = n
   889  	s.previousInode.Operations().(*inode).addDirStreamRef()
   890  }
   891  
   892  func (s *fsDirStream) lockedClearPreviousInode() {
   893  	if s.previousInode == nil {
   894  		return
   895  	}
   896  	s.previousInode.Operations().(*inode).dropDirStreamRef()
   897  	s.previousInode = nil
   898  }
   899  
   900  func (n *inode) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fs.Inode, syscall.Errno) {
   901  	log.Debug.Printf("lookup %s: name=%s start", n.path, name)
   902  
   903  	childInode := n.GetChild(name)
   904  	if childInode != nil && childInode.Operations().(*inode).previousOfAnyDirStream() {
   905  		log.Debug.Printf("lookup %s: name=%s using existing child inode", n.path, name)
   906  	} else {
   907  		var (
   908  			childPath = file.Join(n.path, name)
   909  			foundDir  bool
   910  			foundFile cachedStat
   911  			lister    = file.List(ctx, childPath, true /* recursive */)
   912  		)
   913  		// Look for either a file or a directory at this path.
   914  		// If both exist, assume file is a directory marker.
   915  		for lister.Scan() {
   916  			if lister.IsDir() || // We've found an exact match, and it's a directory.
   917  				lister.Path() != childPath { // We're seeing children, so childPath must be a directory.
   918  				foundDir = true
   919  				break
   920  			}
   921  			info := lister.Info()
   922  			foundFile = cachedStat{time.Now().Add(cacheExpiration), info.Size(), info.ModTime()}
   923  		}
   924  		if err := lister.Err(); err != nil {
   925  			if errors.Is(errors.NotExist, err) || errors.Is(errors.NotAllowed, err) {
   926  				// Ignore.
   927  			} else {
   928  				return nil, errToErrno(err)
   929  			}
   930  		}
   931  
   932  		if !foundDir && foundFile == (cachedStat{}) {
   933  			log.Debug.Printf("lookup: %s name='%s' not found", n.path, name)
   934  			return nil, syscall.ENOENT
   935  		}
   936  
   937  		ent := fuse.DirEntry{
   938  			Name: childPath,
   939  			Mode: getModeBits(foundDir),
   940  			Ino:  getIno(childPath),
   941  		}
   942  		childInode = n.NewInode(
   943  			ctx,
   944  			&inode{path: childPath, ent: ent, stat: foundFile},
   945  			fs.StableAttr{
   946  				Mode: ent.Mode,
   947  				Ino:  ent.Ino,
   948  			})
   949  	}
   950  	ops := childInode.Operations().(*inode)
   951  	out.Attr = newAttr(ops.ent.Ino, ops.ent.Mode, uint64(ops.stat.size), ops.stat.modTime)
   952  	out.SetEntryTimeout(cacheExpiration)
   953  	out.SetAttrTimeout(cacheExpiration)
   954  	log.Debug.Printf("lookup %s name='%s' done: mode=%o ino=%d stat=%+v", n.path, name, ops.ent.Mode, ops.ent.Ino, ops.stat)
   955  	return childInode, 0
   956  }
   957  
   958  func (n *inode) Readdir(ctx context.Context) (fs.DirStream, syscall.Errno) {
   959  	log.Debug.Printf("readdir %s: start", n.path)
   960  	// TODO(josh): Newer Linux kernels (4.20+) can cache the entries from readdir. Make sure this works
   961  	// and invalidates reasonably.
   962  	// References:
   963  	//   Linux patch series: https://github.com/torvalds/linux/commit/69e345511
   964  	//   go-fuse support: https://github.com/hanwen/go-fuse/commit/fa1304749db6eafd8fe64338f10c9750cf693274
   965  	//   libfuse's documentation (describing some kernel behavior): http://web.archive.org/web/20210118113434/https://libfuse.github.io/doxygen/structfuse__lowlevel__ops.html#afa15612c68f7971cadfe3d3ec0a8b70e
   966  	return &fsDirStream{
   967  		ctx:    ctx,
   968  		dir:    n,
   969  		lister: file.List(ctx, n.path, false /*nonrecursive*/),
   970  	}, 0
   971  }
   972  
   973  func (n *inode) Unlink(_ context.Context, name string) syscall.Errno {
   974  	childPath := file.Join(n.path, name)
   975  	err := file.Remove(n.ctx(), childPath)
   976  	log.Debug.Printf("unlink %s: err %v", childPath, err)
   977  	return errToErrno(err)
   978  }
   979  
   980  func (n *inode) Rmdir(_ context.Context, name string) syscall.Errno {
   981  	// Nothing to do.
   982  	return 0
   983  }
   984  
   985  func (n *inode) Mkdir(ctx context.Context, name string, _ uint32, out *fuse.EntryOut) (*fs.Inode, syscall.Errno) {
   986  	n.mu.Lock()
   987  	defer n.mu.Unlock()
   988  	// TODO: Consider creating an S3 "directory" object so this new directory persists for new listings.
   989  	// https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html
   990  	newPath := file.Join(n.path, name)
   991  	childNode := &inode{
   992  		path: newPath,
   993  		ent: fuse.DirEntry{
   994  			Name: name,
   995  			Ino:  getIno(newPath),
   996  			Mode: getModeBits(true)}}
   997  	childInode := n.NewInode(ctx, childNode, fs.StableAttr{
   998  		Mode: childNode.ent.Mode,
   999  		Ino:  childNode.ent.Ino,
  1000  	})
  1001  	out.Attr = newAttr(n.ent.Ino, n.ent.Mode, 0, time.Time{})
  1002  	out.SetEntryTimeout(cacheExpiration)
  1003  	out.SetAttrTimeout(cacheExpiration)
  1004  	return childInode, 0
  1005  }