github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/vfs/file_description.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs
    16  
    17  import (
    18  	"io"
    19  
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/lock"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsmetric"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    31  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    32  )
    33  
// A FileDescription represents an open file description, which is the entity
// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File
// Description").
//
// FileDescriptions are reference-counted. Unless otherwise specified, all
// FileDescription methods require that a reference is held.
//
// A FileDescription must be initialized with Init before first use.
//
// FileDescription is analogous to Linux's struct file.
//
// +stateify savable
type FileDescription struct {
	FileDescriptionRefs

	// flagsMu protects `statusFlags` and `asyncHandler` below.
	flagsMu sync.Mutex `state:"nosave"`

	// statusFlags contains status flags, "initialized by open(2) and possibly
	// modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
	// memory operations when it does not need to be synchronized with an
	// access to asyncHandler.
	statusFlags atomicbitops.Uint32

	// asyncHandler handles O_ASYNC signal generation. It is set with the
	// F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must
	// also be set by fcntl(2).
	asyncHandler FileAsync

	// epolls is the set of epollInterests registered for this FileDescription.
	// epolls is protected by epollMu.
	epollMu epollMutex `state:"nosave"`
	epolls  map[*epollInterest]struct{}

	// vd is the filesystem location at which this FileDescription was opened.
	// A reference is held on vd. vd is immutable.
	vd VirtualDentry

	// opts contains options passed to FileDescription.Init(). opts is
	// immutable.
	opts FileDescriptionOptions

	// readable is MayReadFileWithOpenFlags(statusFlags). readable is
	// immutable.
	//
	// readable is analogous to Linux's FMODE_READ.
	readable bool

	// writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true,
	// the FileDescription holds a write count on vd.mount. writable is
	// immutable.
	//
	// writable is analogous to Linux's FMODE_WRITE.
	writable bool

	// usedLockBSD is read atomically by DecRef: if it is non-zero when the
	// last reference is dropped, impl.UnlockBSD is called to release any
	// BSD-style lock that may have been acquired through this
	// FileDescription.
	//
	// NOTE(review): the code that sets usedLockBSD is not in this part of the
	// file; presumably it is set when a BSD lock is requested - confirm.
	usedLockBSD atomicbitops.Uint32

	// impl is the FileDescriptionImpl associated with this FileDescription.
	// impl is immutable. This should be the last field in FileDescription.
	impl FileDescriptionImpl
}
    93  
// FileDescriptionOptions contains options to FileDescription.Init(). Init
// copies the options into the FileDescription, where they are immutable.
//
// +stateify savable
type FileDescriptionOptions struct {
	// If AllowDirectIO is true, allow O_DIRECT to be set on the file. If it
	// is false, FileDescription.SetStatusFlags() rejects O_DIRECT with
	// EINVAL.
	AllowDirectIO bool

	// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
	DenyPRead bool

	// If DenyPWrite is true, calls to FileDescription.PWrite() return
	// ESPIPE.
	DenyPWrite bool

	// If UseDentryMetadata is true, calls to FileDescription methods that
	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
	// ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling
	// the corresponding FilesystemImpl methods instead of the corresponding
	// FileDescriptionImpl methods.
	//
	// UseDentryMetadata is intended for file descriptions that are implemented
	// outside of individual filesystems, such as pipes, sockets, and device
	// special files. FileDescriptions for which UseDentryMetadata is true may
	// embed DentryMetadataFileDescriptionImpl to obtain appropriate
	// implementations of FileDescriptionImpl methods that should not be
	// called.
	UseDentryMetadata bool

	// If DenySpliceIn is true, splicing data into the file description is
	// not allowed.
	DenySpliceIn bool
}
   125  
// FileCreationFlags are the set of flags passed to FileDescription.Init() but
// omitted from FileDescription.StatusFlags(). Init masks these bits out of
// the flags it is given before storing them as the status flags.
const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC
   129  
   130  // Init must be called before first use of fd. If it succeeds, it takes
   131  // references on mnt and d. flags is the initial file description flags, which
   132  // is usually the full set of flags passed to open(2).
   133  func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
   134  	writable := MayWriteFileWithOpenFlags(flags)
   135  	if writable {
   136  		if err := mnt.CheckBeginWrite(); err != nil {
   137  			return err
   138  		}
   139  	}
   140  
   141  	fd.InitRefs()
   142  
   143  	// Remove "file creation flags" to mirror the behavior from file.f_flags in
   144  	// fs/open.c:do_dentry_open.
   145  	fd.statusFlags = atomicbitops.FromUint32(flags &^ FileCreationFlags)
   146  	fd.vd = VirtualDentry{
   147  		mount:  mnt,
   148  		dentry: d,
   149  	}
   150  	mnt.IncRef()
   151  	d.IncRef()
   152  	fd.opts = *opts
   153  	fd.readable = MayReadFileWithOpenFlags(flags)
   154  	fd.writable = writable
   155  	fd.impl = impl
   156  	return nil
   157  }
   158  
// DecRef decrements fd's reference count. The function passed to
// FileDescriptionRefs.DecRef performs the teardown of fd: it generates an
// inotify close event, removes fd from every epoll instance it is registered
// with, releases file locks, releases the implementation, gives back the
// mount write count and the reference on fd.vd, and finally unregisters the
// async handler.
func (fd *FileDescription) DecRef(ctx context.Context) {
	fd.FileDescriptionRefs.DecRef(func() {
		// Generate inotify events. The event depends on whether fd was opened
		// for writing.
		ev := uint32(linux.IN_CLOSE_NOWRITE)
		if fd.IsWritable() {
			ev = linux.IN_CLOSE_WRITE
		}
		fd.Dentry().InotifyWithParent(ctx, ev, 0, PathEvent)

		// Unregister fd from all epoll instances. The interest set is
		// detached while holding epollMu, and the detached set is then walked
		// without epollMu held, taking each epoll instance's interestMu in
		// turn.
		fd.epollMu.Lock()
		epolls := fd.epolls
		fd.epolls = nil
		fd.epollMu.Unlock()
		for epi := range epolls {
			ep := epi.epoll
			ep.interestMu.Lock()
			// Check that epi has not been concurrently unregistered by
			// EpollInstance.DeleteInterest() or EpollInstance.Release().
			if _, ok := ep.interest[epi.key]; ok {
				fd.EventUnregister(&epi.waiter)
				ep.removeLocked(epi)
			}
			ep.interestMu.Unlock()
		}

		// If BSD locks were used, release any lock that it may have acquired.
		// The error returned by UnlockBSD is ignored.
		// NOTE(review): this call passes context.Background() while the rest
		// of the teardown passes ctx - confirm that this is intentional.
		if fd.usedLockBSD.Load() != 0 {
			fd.impl.UnlockBSD(context.Background(), fd)
		}

		// Unlock any OFD locks. The whole file range is unlocked and the
		// error returned by UnlockPOSIX is ignored.
		if fd.impl.SupportsLocks() {
			fd.impl.UnlockPOSIX(ctx, fd, lock.LockRange{0, lock.LockEOF})
		}

		// Release implementation resources.
		fd.impl.Release(ctx)
		// Give back the write count that a writable fd holds on its mount.
		if fd.writable {
			fd.vd.mount.EndWrite()
		}
		// Drop the reference held on the location at which fd was opened.
		fd.vd.DecRef(ctx)
		// asyncHandler and the O_ASYNC bit are protected by flagsMu. The
		// handler is only unregistered if O_ASYNC is set and a handler
		// exists, which mirrors the condition under which SetStatusFlags
		// registers it.
		fd.flagsMu.Lock()
		if fd.statusFlags.RacyLoad()&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
			fd.impl.UnregisterFileAsyncHandler(fd)
		}
		fd.asyncHandler = nil
		fd.flagsMu.Unlock()
	})
}
   210  
// Mount returns the mount on which fd was opened, i.e. the mount component of
// the location recorded by Init. It does not take a reference on the returned
// Mount.
func (fd *FileDescription) Mount() *Mount {
	return fd.vd.mount
}
   216  
// Dentry returns the dentry at which fd was opened, i.e. the dentry component
// of the location recorded by Init. It does not take a reference on the
// returned Dentry.
func (fd *FileDescription) Dentry() *Dentry {
	return fd.vd.dentry
}
   222  
// VirtualDentry returns the location (mount and dentry) at which fd was
// opened. The VirtualDentry is returned by value. It does not take a
// reference on the returned VirtualDentry.
func (fd *FileDescription) VirtualDentry() VirtualDentry {
	return fd.vd
}
   228  
// Options returns a copy of the options passed to fd.Init().
func (fd *FileDescription) Options() FileDescriptionOptions {
	return fd.opts
}
   233  
// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
// The flags are loaded atomically without taking fd.flagsMu.
func (fd *FileDescription) StatusFlags() uint32 {
	return fd.statusFlags.Load()
}
   238  
   239  // SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
   240  func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
   241  	// Compare Linux's fs/fcntl.c:setfl().
   242  	oldFlags := fd.StatusFlags()
   243  	// Linux documents this check as "O_APPEND cannot be cleared if the file is
   244  	// marked as append-only and the file is open for write", which would make
   245  	// sense. However, the check as actually implemented seems to be "O_APPEND
   246  	// cannot be changed if the file is marked as append-only".
   247  	if (flags^oldFlags)&linux.O_APPEND != 0 {
   248  		stat, err := fd.Stat(ctx, StatOptions{
   249  			// There is no mask bit for stx_attributes.
   250  			Mask: 0,
   251  			// Linux just reads inode::i_flags directly.
   252  			Sync: linux.AT_STATX_DONT_SYNC,
   253  		})
   254  		if err != nil {
   255  			return err
   256  		}
   257  		if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
   258  			return linuxerr.EPERM
   259  		}
   260  	}
   261  	if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
   262  		stat, err := fd.Stat(ctx, StatOptions{
   263  			Mask: linux.STATX_UID,
   264  			// Linux's inode_owner_or_capable() just reads inode::i_uid
   265  			// directly.
   266  			Sync: linux.AT_STATX_DONT_SYNC,
   267  		})
   268  		if err != nil {
   269  			return err
   270  		}
   271  		if stat.Mask&linux.STATX_UID == 0 {
   272  			return linuxerr.EPERM
   273  		}
   274  		if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
   275  			return linuxerr.EPERM
   276  		}
   277  	}
   278  	if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
   279  		return linuxerr.EINVAL
   280  	}
   281  	// TODO(gvisor.dev/issue/1035): FileDescriptionImpl.SetOAsync()?
   282  	const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
   283  	fd.flagsMu.Lock()
   284  	defer fd.flagsMu.Unlock()
   285  	if fd.asyncHandler != nil {
   286  		// Use fd.statusFlags instead of oldFlags, which may have become outdated,
   287  		// to avoid double registering/unregistering.
   288  		if fd.statusFlags.RacyLoad()&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 {
   289  			if err := fd.impl.RegisterFileAsyncHandler(fd); err != nil {
   290  				return err
   291  			}
   292  		} else if fd.statusFlags.RacyLoad()&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 {
   293  			fd.impl.UnregisterFileAsyncHandler(fd)
   294  		}
   295  	}
   296  	fd.statusFlags.Store((oldFlags &^ settableFlags) | (flags & settableFlags))
   297  	return nil
   298  }
   299  
// IsReadable returns true if fd was opened for reading. The result is
// computed once by Init from the open flags and does not change afterwards.
func (fd *FileDescription) IsReadable() bool {
	return fd.readable
}
   304  
// IsWritable returns true if fd was opened for writing. The result is
// computed once by Init from the open flags and does not change afterwards.
func (fd *FileDescription) IsWritable() bool {
	return fd.writable
}
   309  
// Impl returns the FileDescriptionImpl associated with fd, i.e. the impl that
// was passed to fd.Init().
func (fd *FileDescription) Impl() FileDescriptionImpl {
	return fd.impl
}
   314  
// FileDescriptionImpl contains implementation details for a FileDescription.
// Implementations of FileDescriptionImpl should contain their associated
// FileDescription by value as their first field.
//
// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will
// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
// auth.KGID respectively).
//
// All methods may return errors not specified.
//
// FileDescriptionImpl is analogous to Linux's struct file_operations.
type FileDescriptionImpl interface {
	// Release is called when the associated FileDescription reaches zero
	// references.
	Release(ctx context.Context)

	// OnClose is called when a file descriptor representing the
	// FileDescription is closed. Note that returning a non-nil error does not
	// prevent the file descriptor from being closed.
	OnClose(ctx context.Context) error

	// Stat returns metadata for the file represented by the FileDescription.
	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)

	// SetStat updates metadata for the file represented by the
	// FileDescription. Implementations are responsible for checking if the
	// operation can be performed (see vfs.CheckSetStat() for common checks).
	SetStat(ctx context.Context, opts SetStatOptions) error

	// StatFS returns metadata for the filesystem containing the file
	// represented by the FileDescription.
	StatFS(ctx context.Context) (linux.Statfs, error)

	// Allocate grows the file to offset + length bytes.
	// Only mode == 0 is supported currently.
	//
	// Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on
	// other files where it is not supported.
	//
	// Preconditions: The FileDescription was opened for writing.
	Allocate(ctx context.Context, mode, offset, length uint64) error

	// waiter.Waitable methods may be used to poll for I/O events.
	waiter.Waitable

	// Epollable indicates whether this file can be used with epoll_ctl(2).
	Epollable() bool

	// PRead reads from the file into dst, starting at the given offset, and
	// returns the number of bytes read. PRead is permitted to return partial
	// reads with a nil error.
	//
	// Errors:
	//
	//	- If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
	//
	// Preconditions:
	//	* The FileDescription was opened for reading.
	//	* FileDescriptionOptions.DenyPRead == false.
	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)

	// Read is similar to PRead, but does not specify an offset.
	//
	// For files with an implicit FileDescription offset (e.g. regular files),
	// Read begins at the FileDescription offset, and advances the offset by
	// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
	// with Regular File Operations" requires that all operations that may
	// mutate the FileDescription offset are serialized.
	//
	// Errors:
	//
	//	- If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
	//
	// Preconditions: The FileDescription was opened for reading.
	Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)

	// PWrite writes src to the file, starting at the given offset, and returns
	// the number of bytes written. PWrite is permitted to return partial
	// writes with a nil error.
	//
	// As in Linux (but not POSIX), if O_APPEND is in effect for the
	// FileDescription, PWrite should ignore the offset and append data to the
	// end of the file.
	//
	// Errors:
	//
	//	- If opts.Flags specifies unsupported options, PWrite returns
	// EOPNOTSUPP.
	//
	// Preconditions:
	//	* The FileDescription was opened for writing.
	//	* FileDescriptionOptions.DenyPWrite == false.
	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)

	// Write is similar to PWrite, but does not specify an offset, which is
	// implied as for Read.
	//
	// Write is a FileDescriptionImpl method, instead of a wrapper around
	// PWrite that uses a FileDescription offset, to make it possible for
	// remote filesystems to implement O_APPEND correctly (i.e. atomically with
	// respect to writers outside the scope of VFS).
	//
	// Errors:
	//
	//	- If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
	//
	// Preconditions: The FileDescription was opened for writing.
	Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)

	// IterDirents invokes cb on each entry in the directory represented by the
	// FileDescription. If IterDirents has been called since the last call to
	// Seek, it continues iteration from the end of the last call.
	IterDirents(ctx context.Context, cb IterDirentsCallback) error

	// Seek changes the FileDescription offset (assuming one exists) and
	// returns its new value.
	//
	// For directories, if whence == SEEK_SET and offset == 0, the caller is
	// rewinddir(), such that Seek "shall also cause the directory stream to
	// refer to the current state of the corresponding directory" -
	// POSIX.1-2017.
	Seek(ctx context.Context, offset int64, whence int32) (int64, error)

	// Sync requests that cached state associated with the file represented by
	// the FileDescription is synchronized with persistent storage, and blocks
	// until this is complete.
	Sync(ctx context.Context) error

	// ConfigureMMap mutates opts to implement mmap(2) for the file. Most
	// implementations that support memory mapping can call
	// GenericConfigureMMap with the appropriate memmap.Mappable.
	ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error

	// Ioctl implements the ioctl(2) syscall.
	Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error)

	// ListXattr returns all extended attribute names for the file.
	ListXattr(ctx context.Context, size uint64) ([]string, error)

	// GetXattr returns the value associated with the given extended attribute
	// for the file.
	GetXattr(ctx context.Context, opts GetXattrOptions) (string, error)

	// SetXattr changes the value associated with the given extended attribute
	// for the file.
	SetXattr(ctx context.Context, opts SetXattrOptions) error

	// RemoveXattr removes the given extended attribute from the file.
	RemoveXattr(ctx context.Context, name string) error

	// SupportsLocks indicates whether file locks are supported.
	SupportsLocks() bool

	// LockBSD tries to acquire a BSD-style advisory file lock.
	LockBSD(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, block bool) error

	// UnlockBSD releases a BSD-style advisory file lock.
	UnlockBSD(ctx context.Context, uid lock.UniqueID) error

	// LockPOSIX tries to acquire a POSIX-style advisory file lock.
	LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block bool) error

	// UnlockPOSIX releases a POSIX-style advisory file lock.
	UnlockPOSIX(ctx context.Context, uid lock.UniqueID, ComputeLockRange lock.LockRange) error

	// TestPOSIX returns information about whether the specified lock can be
	// held, in the style of the F_GETLK fcntl.
	TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error)

	// RegisterFileAsyncHandler is called by FileDescription.SetStatusFlags
	// when O_ASYNC becomes set on an fd that has an async handler. If it
	// returns an error, SetStatusFlags fails with that error and leaves the
	// status flags unchanged. SetStatusFlags calls it with fd.flagsMu held.
	RegisterFileAsyncHandler(fd *FileDescription) error

	// UnregisterFileAsyncHandler is called by FileDescription.SetStatusFlags
	// when O_ASYNC is cleared on an fd that has an async handler, and by
	// FileDescription.DecRef during teardown if O_ASYNC is set and fd has an
	// async handler. Both callers hold fd.flagsMu.
	UnregisterFileAsyncHandler(fd *FileDescription)
}
   486  
// Dirent holds the information contained in struct linux_dirent64. Dirents
// are passed by value to IterDirentsCallback.Handle.
//
// +stateify savable
type Dirent struct {
	// Name is the filename.
	Name string

	// Type is the file type, a linux.DT_* constant.
	Type uint8

	// Ino is the inode number.
	Ino uint64

	// NextOff is the offset of the *next* Dirent in the directory; that is,
	// FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will
	// cause the next call to FileDescription.IterDirents() to yield the next
	// Dirent. (The offset of the first Dirent in a directory is always 0.)
	NextOff int64
}
   506  
// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
// IterDirentsCallbackFunc adapts a plain function to this interface.
type IterDirentsCallback interface {
	// Handle handles the given iterated Dirent. If Handle returns a non-nil
	// error, FileDescriptionImpl.IterDirents must stop iteration and return
	// the error; the next call to FileDescriptionImpl.IterDirents should
	// restart with the same Dirent.
	Handle(dirent Dirent) error
}
   515  
// IterDirentsCallbackFunc implements IterDirentsCallback for a function with
// the semantics of IterDirentsCallback.Handle, so that a function value or
// closure can be passed wherever an IterDirentsCallback is expected.
type IterDirentsCallbackFunc func(dirent Dirent) error
   519  
// Handle implements IterDirentsCallback.Handle. It calls f with dirent and
// returns f's result unchanged.
func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error {
	return f(dirent)
}
   524  
// OnClose is called when a file descriptor representing the FileDescription is
// closed. Returning a non-nil error should not prevent the file descriptor
// from being closed.
//
// OnClose delegates to the FileDescriptionImpl and returns its error
// unchanged.
func (fd *FileDescription) OnClose(ctx context.Context) error {
	return fd.impl.OnClose(ctx)
}
   531  
   532  // Stat returns metadata for the file represented by fd.
   533  func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
   534  	if fd.opts.UseDentryMetadata {
   535  		vfsObj := fd.vd.mount.vfs
   536  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   537  			Root:  fd.vd,
   538  			Start: fd.vd,
   539  		})
   540  		stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
   541  		rp.Release(ctx)
   542  		return stat, err
   543  	}
   544  	return fd.impl.Stat(ctx, opts)
   545  }
   546  
   547  // SetStat updates metadata for the file represented by fd.
   548  func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
   549  	if fd.opts.UseDentryMetadata {
   550  		vfsObj := fd.vd.mount.vfs
   551  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   552  			Root:  fd.vd,
   553  			Start: fd.vd,
   554  		})
   555  		err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
   556  		rp.Release(ctx)
   557  		return err
   558  	}
   559  	if err := fd.impl.SetStat(ctx, opts); err != nil {
   560  		return err
   561  	}
   562  	if ev := InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
   563  		fd.Dentry().InotifyWithParent(ctx, ev, 0, InodeEvent)
   564  	}
   565  	return nil
   566  }
   567  
   568  // StatFS returns metadata for the filesystem containing the file represented
   569  // by fd.
   570  func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
   571  	if fd.opts.UseDentryMetadata {
   572  		vfsObj := fd.vd.mount.vfs
   573  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   574  			Root:  fd.vd,
   575  			Start: fd.vd,
   576  		})
   577  		statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
   578  		rp.Release(ctx)
   579  		return statfs, err
   580  	}
   581  	return fd.impl.StatFS(ctx)
   582  }
   583  
   584  // Allocate grows file represented by FileDescription to offset + length bytes.
   585  func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
   586  	if !fd.IsWritable() {
   587  		return linuxerr.EBADF
   588  	}
   589  	if err := fd.impl.Allocate(ctx, mode, offset, length); err != nil {
   590  		return err
   591  	}
   592  	fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent)
   593  	return nil
   594  }
   595  
// Readiness implements waiter.Waitable.Readiness.
//
// It returns fd's I/O readiness for the events in mask, as reported by fd's
// FileDescriptionImpl.
func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
	return fd.impl.Readiness(mask)
}
   602  
// EventRegister implements waiter.Waitable.EventRegister.
//
// It registers e with fd's FileDescriptionImpl for I/O readiness events and
// returns the implementation's error unchanged.
func (fd *FileDescription) EventRegister(e *waiter.Entry) error {
	return fd.impl.EventRegister(e)
}
   609  
// EventUnregister implements waiter.Waitable.EventUnregister.
//
// It unregisters e for I/O readiness events by delegating to fd's
// FileDescriptionImpl.
func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
	fd.impl.EventUnregister(e)
}
   616  
// Epollable returns whether this file can be used with epoll_ctl(2), as
// reported by fd's FileDescriptionImpl.
func (fd *FileDescription) Epollable() bool {
	return fd.impl.Epollable()
}
   621  
   622  // PRead reads from the file represented by fd into dst, starting at the given
   623  // offset, and returns the number of bytes read. PRead is permitted to return
   624  // partial reads with a nil error.
   625  func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
   626  	if fd.opts.DenyPRead {
   627  		return 0, linuxerr.ESPIPE
   628  	}
   629  	if !fd.readable {
   630  		return 0, linuxerr.EBADF
   631  	}
   632  	start := fsmetric.StartReadWait()
   633  	n, err := fd.impl.PRead(ctx, dst, offset, opts)
   634  	if n > 0 {
   635  		fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent)
   636  	}
   637  	fsmetric.Reads.Increment()
   638  	fsmetric.FinishReadWait(fsmetric.ReadWait, start)
   639  	return n, err
   640  }
   641  
   642  // Read is similar to PRead, but does not specify an offset.
   643  func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
   644  	if !fd.readable {
   645  		return 0, linuxerr.EBADF
   646  	}
   647  	start := fsmetric.StartReadWait()
   648  	n, err := fd.impl.Read(ctx, dst, opts)
   649  	if n > 0 {
   650  		fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent)
   651  	}
   652  	fsmetric.Reads.Increment()
   653  	fsmetric.FinishReadWait(fsmetric.ReadWait, start)
   654  	return n, err
   655  }
   656  
   657  // PWrite writes src to the file represented by fd, starting at the given
   658  // offset, and returns the number of bytes written. PWrite is permitted to
   659  // return partial writes with a nil error.
   660  func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
   661  	if fd.opts.DenyPWrite {
   662  		return 0, linuxerr.ESPIPE
   663  	}
   664  	if !fd.writable {
   665  		return 0, linuxerr.EBADF
   666  	}
   667  	n, err := fd.impl.PWrite(ctx, src, offset, opts)
   668  	if n > 0 {
   669  		fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent)
   670  	}
   671  	return n, err
   672  }
   673  
   674  // Write is similar to PWrite, but does not specify an offset.
   675  func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
   676  	if !fd.writable {
   677  		return 0, linuxerr.EBADF
   678  	}
   679  	n, err := fd.impl.Write(ctx, src, opts)
   680  	if n > 0 {
   681  		fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent)
   682  	}
   683  	return n, err
   684  }
   685  
// IterDirents invokes cb on each entry in the directory represented by fd. If
// IterDirents has been called since the last call to Seek, it continues
// iteration from the end of the last call.
//
// An IN_ACCESS inotify event is generated after the implementation returns,
// whether or not iteration succeeded.
func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
	// The deferred call runs after impl.IterDirents has returned; its
	// receiver and arguments are evaluated here, at the defer statement.
	defer fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent)
	return fd.impl.IterDirents(ctx, cb)
}
   693  
// Seek changes fd's offset (assuming one exists) and returns its new value.
// It delegates to fd's FileDescriptionImpl and returns its result unchanged.
func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	return fd.impl.Seek(ctx, offset, whence)
}
   698  
// Sync has the semantics of fsync(2). It delegates to fd's
// FileDescriptionImpl and returns its error unchanged.
func (fd *FileDescription) Sync(ctx context.Context) error {
	return fd.impl.Sync(ctx)
}
   703  
// ConfigureMMap mutates opts to implement mmap(2) for the file represented by
// fd. It delegates to fd's FileDescriptionImpl and returns its error
// unchanged.
func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	return fd.impl.ConfigureMMap(ctx, opts)
}
   709  
// Ioctl implements the ioctl(2) syscall. It delegates to fd's
// FileDescriptionImpl and returns its result unchanged.
func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
	return fd.impl.Ioctl(ctx, uio, sysno, args)
}
   714  
   715  // ListXattr returns all extended attribute names for the file represented by
   716  // fd.
   717  //
   718  // If the size of the list (including a NUL terminating byte after every entry)
   719  // would exceed size, ERANGE may be returned. Note that implementations
   720  // are free to ignore size entirely and return without error). In all cases,
   721  // if size is 0, the list should be returned without error, regardless of size.
   722  func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
   723  	if fd.opts.UseDentryMetadata {
   724  		vfsObj := fd.vd.mount.vfs
   725  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   726  			Root:  fd.vd,
   727  			Start: fd.vd,
   728  		})
   729  		names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size)
   730  		rp.Release(ctx)
   731  		return names, err
   732  	}
   733  	names, err := fd.impl.ListXattr(ctx, size)
   734  	if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) {
   735  		// Linux doesn't actually return EOPNOTSUPP in this case; instead,
   736  		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
   737  		// subsystem to return security extended attributes, which by default
   738  		// don't exist.
   739  		return nil, nil
   740  	}
   741  	return names, err
   742  }
   743  
   744  // GetXattr returns the value associated with the given extended attribute for
   745  // the file represented by fd.
   746  //
   747  // If the size of the return value exceeds opts.Size, ERANGE may be returned
   748  // (note that implementations are free to ignore opts.Size entirely and return
   749  // without error). In all cases, if opts.Size is 0, the value should be
   750  // returned without error, regardless of size.
   751  func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) {
   752  	if fd.opts.UseDentryMetadata {
   753  		vfsObj := fd.vd.mount.vfs
   754  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   755  			Root:  fd.vd,
   756  			Start: fd.vd,
   757  		})
   758  		val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
   759  		rp.Release(ctx)
   760  		return val, err
   761  	}
   762  	return fd.impl.GetXattr(ctx, *opts)
   763  }
   764  
   765  // SetXattr changes the value associated with the given extended attribute for
   766  // the file represented by fd.
   767  func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error {
   768  	if fd.opts.UseDentryMetadata {
   769  		vfsObj := fd.vd.mount.vfs
   770  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   771  			Root:  fd.vd,
   772  			Start: fd.vd,
   773  		})
   774  		err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
   775  		rp.Release(ctx)
   776  		return err
   777  	}
   778  	if err := fd.impl.SetXattr(ctx, *opts); err != nil {
   779  		return err
   780  	}
   781  	fd.Dentry().InotifyWithParent(ctx, linux.IN_ATTRIB, 0, InodeEvent)
   782  	return nil
   783  }
   784  
   785  // RemoveXattr removes the given extended attribute from the file represented
   786  // by fd.
   787  func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error {
   788  	if fd.opts.UseDentryMetadata {
   789  		vfsObj := fd.vd.mount.vfs
   790  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   791  			Root:  fd.vd,
   792  			Start: fd.vd,
   793  		})
   794  		err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
   795  		rp.Release(ctx)
   796  		return err
   797  	}
   798  	if err := fd.impl.RemoveXattr(ctx, name); err != nil {
   799  		return err
   800  	}
   801  	fd.Dentry().InotifyWithParent(ctx, linux.IN_ATTRIB, 0, InodeEvent)
   802  	return nil
   803  }
   804  
   805  // SyncFS instructs the filesystem containing fd to execute the semantics of
   806  // syncfs(2).
   807  func (fd *FileDescription) SyncFS(ctx context.Context) error {
   808  	return fd.vd.mount.fs.impl.Sync(ctx)
   809  }
   810  
   811  // MappedName implements memmap.MappingIdentity.MappedName.
   812  func (fd *FileDescription) MappedName(ctx context.Context) string {
   813  	vfsroot := RootFromContext(ctx)
   814  	s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
   815  	if vfsroot.Ok() {
   816  		vfsroot.DecRef(ctx)
   817  	}
   818  	return s
   819  }
   820  
   821  // DeviceID implements memmap.MappingIdentity.DeviceID.
   822  func (fd *FileDescription) DeviceID() uint64 {
   823  	stat, err := fd.Stat(context.Background(), StatOptions{
   824  		// There is no STATX_DEV; we assume that Stat will return it if it's
   825  		// available regardless of mask.
   826  		Mask: 0,
   827  		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev
   828  		// directly.
   829  		Sync: linux.AT_STATX_DONT_SYNC,
   830  	})
   831  	if err != nil {
   832  		return 0
   833  	}
   834  	return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor))
   835  }
   836  
   837  // InodeID implements memmap.MappingIdentity.InodeID.
   838  func (fd *FileDescription) InodeID() uint64 {
   839  	stat, err := fd.Stat(context.Background(), StatOptions{
   840  		Mask: linux.STATX_INO,
   841  		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
   842  		Sync: linux.AT_STATX_DONT_SYNC,
   843  	})
   844  	if err != nil || stat.Mask&linux.STATX_INO == 0 {
   845  		return 0
   846  	}
   847  	return stat.Ino
   848  }
   849  
   850  // Msync implements memmap.MappingIdentity.Msync.
   851  func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
   852  	return fd.Sync(ctx)
   853  }
   854  
   855  // SupportsLocks indicates whether file locks are supported.
   856  func (fd *FileDescription) SupportsLocks() bool {
   857  	return fd.impl.SupportsLocks()
   858  }
   859  
   860  // LockBSD tries to acquire a BSD-style advisory file lock.
   861  func (fd *FileDescription) LockBSD(ctx context.Context, ownerPID int32, lockType lock.LockType, block bool) error {
   862  	fd.usedLockBSD.Store(1)
   863  	return fd.impl.LockBSD(ctx, fd, ownerPID, lockType, block)
   864  }
   865  
   866  // UnlockBSD releases a BSD-style advisory file lock.
   867  func (fd *FileDescription) UnlockBSD(ctx context.Context) error {
   868  	return fd.impl.UnlockBSD(ctx, fd)
   869  }
   870  
   871  // LockPOSIX locks a POSIX-style file range lock.
   872  func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block bool) error {
   873  	return fd.impl.LockPOSIX(ctx, uid, ownerPID, t, r, block)
   874  }
   875  
   876  // UnlockPOSIX unlocks a POSIX-style file range lock.
   877  func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, r lock.LockRange) error {
   878  	return fd.impl.UnlockPOSIX(ctx, uid, r)
   879  }
   880  
   881  // TestPOSIX returns information about whether the specified lock can be held.
   882  func (fd *FileDescription) TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) {
   883  	return fd.impl.TestPOSIX(ctx, uid, t, r)
   884  }
   885  
   886  // ComputeLockRange computes the range of a file lock based on the given values.
   887  func (fd *FileDescription) ComputeLockRange(ctx context.Context, start uint64, length uint64, whence int16) (lock.LockRange, error) {
   888  	var off int64
   889  	switch whence {
   890  	case linux.SEEK_SET:
   891  		off = 0
   892  	case linux.SEEK_CUR:
   893  		// Note that Linux does not hold any mutexes while retrieving the file
   894  		// offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
   895  		curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR)
   896  		if err != nil {
   897  			return lock.LockRange{}, err
   898  		}
   899  		off = curOff
   900  	case linux.SEEK_END:
   901  		stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE})
   902  		if err != nil {
   903  			return lock.LockRange{}, err
   904  		}
   905  		off = int64(stat.Size)
   906  	default:
   907  		return lock.LockRange{}, linuxerr.EINVAL
   908  	}
   909  
   910  	return lock.ComputeRange(int64(start), int64(length), off)
   911  }
   912  
   913  // ReadFull read all contents from the file.
   914  func (fd *FileDescription) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
   915  	var total int64
   916  	for dst.NumBytes() > 0 {
   917  		n, err := fd.PRead(ctx, dst, offset+total, ReadOptions{})
   918  		total += n
   919  		if err == io.EOF && total != 0 {
   920  			return total, io.ErrUnexpectedEOF
   921  		} else if err != nil {
   922  			return total, err
   923  		}
   924  		dst = dst.DropFirst64(n)
   925  	}
   926  	return total, nil
   927  }
   928  
   929  // A FileAsync sends signals to its owner when w is ready for IO. This is only
   930  // implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this
   931  // interface to avoid circular dependencies.
   932  type FileAsync interface {
   933  	Register(w waiter.Waitable) error
   934  	Unregister(w waiter.Waitable)
   935  }
   936  
   937  // AsyncHandler returns the FileAsync for fd.
   938  func (fd *FileDescription) AsyncHandler() FileAsync {
   939  	fd.flagsMu.Lock()
   940  	defer fd.flagsMu.Unlock()
   941  	return fd.asyncHandler
   942  }
   943  
   944  // SetAsyncHandler sets fd.asyncHandler if it has not been set before and
   945  // returns it.
   946  func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) (FileAsync, error) {
   947  	fd.flagsMu.Lock()
   948  	defer fd.flagsMu.Unlock()
   949  	if fd.asyncHandler == nil {
   950  		fd.asyncHandler = newHandler()
   951  		if fd.statusFlags.RacyLoad()&linux.O_ASYNC != 0 {
   952  			if err := fd.impl.RegisterFileAsyncHandler(fd); err != nil {
   953  				return nil, err
   954  			}
   955  		}
   956  	}
   957  	return fd.asyncHandler, nil
   958  }
   959  
   960  // CopyRegularFileData copies data from srcFD to dstFD until reading from srcFD
   961  // returns EOF or an error. It returns the number of bytes copied.
   962  func CopyRegularFileData(ctx context.Context, dstFD, srcFD *FileDescription) (int64, error) {
   963  	done := int64(0)
   964  	buf := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
   965  	for {
   966  		readN, readErr := srcFD.Read(ctx, buf, ReadOptions{})
   967  		if readErr != nil && readErr != io.EOF {
   968  			return done, readErr
   969  		}
   970  		src := buf.TakeFirst64(readN)
   971  		for src.NumBytes() != 0 {
   972  			writeN, writeErr := dstFD.Write(ctx, src, WriteOptions{})
   973  			done += writeN
   974  			src = src.DropFirst64(writeN)
   975  			if writeErr != nil {
   976  				return done, writeErr
   977  			}
   978  		}
   979  		if readErr == io.EOF {
   980  			return done, nil
   981  		}
   982  	}
   983  }