github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/vfs/file_description.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs
    16  
    17  import (
    18  	"io"
    19  	"sync/atomic"
    20  
    21  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    22  	"github.com/SagerNet/gvisor/pkg/context"
    23  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/fs/lock"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/fsmetric"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    29  	"github.com/SagerNet/gvisor/pkg/sync"
    30  	"github.com/SagerNet/gvisor/pkg/usermem"
    31  	"github.com/SagerNet/gvisor/pkg/waiter"
    32  )
    33  
    34  // A FileDescription represents an open file description, which is the entity
    35  // referred to by a file descriptor (POSIX.1-2017 3.258 "Open File
    36  // Description").
    37  //
    38  // FileDescriptions are reference-counted. Unless otherwise specified, all
    39  // FileDescription methods require that a reference is held.
    40  //
    41  // FileDescription is analogous to Linux's struct file.
    42  //
    43  // +stateify savable
    44  type FileDescription struct {
    45  	FileDescriptionRefs
    46  
    47  	// flagsMu protects `statusFlags`, `saved`, and `asyncHandler` below.
    48  	flagsMu sync.Mutex `state:"nosave"`
    49  
    50  	// statusFlags contains status flags, "initialized by open(2) and possibly
    51  	// modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
    52  	// memory operations when it does not need to be synchronized with an
    53  	// access to asyncHandler.
    54  	statusFlags uint32
    55  
    56  	// saved is true after beforeSave is called. This is used to prevent
    57  	// double-unregistration of asyncHandler. This does not work properly for
    58  	// save-resume, which is not currently supported in gVisor (see b/26588733).
    59  	saved bool `state:"nosave"`
    60  
    61  	// asyncHandler handles O_ASYNC signal generation. It is set with the
    62  	// F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must
    63  	// also be set by fcntl(2).
    64  	asyncHandler FileAsync
    65  
    66  	// epolls is the set of epollInterests registered for this FileDescription.
    67  	// epolls is protected by epollMu.
    68  	epollMu sync.Mutex `state:"nosave"`
    69  	epolls  map[*epollInterest]struct{}
    70  
    71  	// vd is the filesystem location at which this FileDescription was opened.
    72  	// A reference is held on vd. vd is immutable.
    73  	vd VirtualDentry
    74  
    75  	// opts contains options passed to FileDescription.Init(). opts is
    76  	// immutable.
    77  	opts FileDescriptionOptions
    78  
    79  	// readable is MayReadFileWithOpenFlags(statusFlags). readable is
    80  	// immutable.
    81  	//
    82  	// readable is analogous to Linux's FMODE_READ.
    83  	readable bool
    84  
    85  	// writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true,
    86  	// the FileDescription holds a write count on vd.mount. writable is
    87  	// immutable.
    88  	//
    89  	// writable is analogous to Linux's FMODE_WRITE.
    90  	writable bool
    91  
        	// usedLockBSD is read using atomic memory operations. When it is
        	// nonzero at teardown, DecRef calls impl.UnlockBSD to release any BSD
        	// lock this FileDescription may hold.
        	//
        	// NOTE(review): presumably set when a BSD-style lock is taken through
        	// this FileDescription; the writer is not in this part of the file.
    92  	usedLockBSD uint32
    93  
    94  	// impl is the FileDescriptionImpl associated with this FileDescription.
    95  	// impl is immutable. This should be the last field in FileDescription.
    96  	impl FileDescriptionImpl
    97  }
    98  
    99  // FileDescriptionOptions contains options to FileDescription.Init(). Init
        // stores a copy of the options in the FileDescription.
   100  //
   101  // +stateify savable
   102  type FileDescriptionOptions struct {
   103  	// If AllowDirectIO is true, allow O_DIRECT to be set on the file.
        	// Otherwise FileDescription.SetStatusFlags() returns EINVAL when asked
        	// to set O_DIRECT.
   104  	AllowDirectIO bool
   105  
   106  	// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
        	// This is checked before the check that the file is readable.
   107  	DenyPRead bool
   108  
   109  	// If DenyPWrite is true, calls to FileDescription.PWrite() return
   110  	// ESPIPE. This is checked before the check that the file is writable.
   111  	DenyPWrite bool
   112  
   113  	// If UseDentryMetadata is true, calls to FileDescription methods that
   114  	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
   115  	// ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling
   116  	// the corresponding FilesystemImpl methods instead of the corresponding
   117  	// FileDescriptionImpl methods.
   118  	//
   119  	// UseDentryMetadata is intended for file descriptions that are implemented
   120  	// outside of individual filesystems, such as pipes, sockets, and device
   121  	// special files. FileDescriptions for which UseDentryMetadata is true may
   122  	// embed DentryMetadataFileDescriptionImpl to obtain appropriate
   123  	// implementations of FileDescriptionImpl methods that should not be
   124  	// called.
   125  	UseDentryMetadata bool
   126  }
   127  
   128  // FileCreationFlags are the set of flags passed to FileDescription.Init() but
   129  // omitted from FileDescription.StatusFlags(). Init clears these bits before
        // storing the remaining flags as the status flags.
   130  const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC
   131  
   132  // Init must be called before first use of fd. If it succeeds, it takes
   133  // references on mnt and d. flags is the initial file description flags, which
   134  // is usually the full set of flags passed to open(2).
   135  func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
   136  	writable := MayWriteFileWithOpenFlags(flags)
   137  	if writable {
   138  		if err := mnt.CheckBeginWrite(); err != nil {
   139  			return err
   140  		}
   141  	}
   142  
   143  	fd.InitRefs()
   144  
   145  	// Remove "file creation flags" to mirror the behavior from file.f_flags in
   146  	// fs/open.c:do_dentry_open.
   147  	fd.statusFlags = flags &^ FileCreationFlags
   148  	fd.vd = VirtualDentry{
   149  		mount:  mnt,
   150  		dentry: d,
   151  	}
   152  	mnt.IncRef()
   153  	d.IncRef()
   154  	fd.opts = *opts
   155  	fd.readable = MayReadFileWithOpenFlags(flags)
   156  	fd.writable = writable
   157  	fd.impl = impl
   158  	return nil
   159  }
   160  
   161  // DecRef decrements fd's reference count.
   162  func (fd *FileDescription) DecRef(ctx context.Context) {
   163  	fd.FileDescriptionRefs.DecRef(func() {
   164  		// Generate inotify events.
   165  		ev := uint32(linux.IN_CLOSE_NOWRITE)
   166  		if fd.IsWritable() {
   167  			ev = linux.IN_CLOSE_WRITE
   168  		}
   169  		fd.Dentry().InotifyWithParent(ctx, ev, 0, PathEvent)
   170  
   171  		// Unregister fd from all epoll instances.
   172  		fd.epollMu.Lock()
   173  		epolls := fd.epolls
   174  		fd.epolls = nil
   175  		fd.epollMu.Unlock()
   176  		for epi := range epolls {
   177  			ep := epi.epoll
   178  			ep.interestMu.Lock()
   179  			// Check that epi has not been concurrently unregistered by
   180  			// EpollInstance.DeleteInterest() or EpollInstance.Release().
   181  			if _, ok := ep.interest[epi.key]; ok {
   182  				fd.EventUnregister(&epi.waiter)
   183  				ep.removeLocked(epi)
   184  			}
   185  			ep.interestMu.Unlock()
   186  		}
   187  
   188  		// If BSD locks were used, release any lock that it may have acquired.
   189  		if atomic.LoadUint32(&fd.usedLockBSD) != 0 {
   190  			fd.impl.UnlockBSD(context.Background(), fd)
   191  		}
   192  
   193  		// Release implementation resources.
   194  		fd.impl.Release(ctx)
   195  		if fd.writable {
   196  			fd.vd.mount.EndWrite()
   197  		}
   198  		fd.vd.DecRef(ctx)
   199  		fd.flagsMu.Lock()
   200  		if !fd.saved && fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
   201  			fd.asyncHandler.Unregister(fd)
   202  		}
   203  		fd.asyncHandler = nil
   204  		fd.flagsMu.Unlock()
   205  	})
   206  }
   207  
   208  // Mount returns the mount on which fd was opened. It does not take a reference
   209  // on the returned Mount.
   210  func (fd *FileDescription) Mount() *Mount {
   211  	return fd.vd.mount
   212  }
   213  
   214  // Dentry returns the dentry at which fd was opened. It does not take a
   215  // reference on the returned Dentry.
   216  func (fd *FileDescription) Dentry() *Dentry {
   217  	return fd.vd.dentry
   218  }
   219  
   220  // VirtualDentry returns the location at which fd was opened. It does not take
   221  // a reference on the returned VirtualDentry.
   222  func (fd *FileDescription) VirtualDentry() VirtualDentry {
   223  	return fd.vd
   224  }
   225  
   226  // Options returns the options passed to fd.Init().
   227  func (fd *FileDescription) Options() FileDescriptionOptions {
   228  	return fd.opts
   229  }
   230  
   231  // StatusFlags returns file description status flags, as for fcntl(F_GETFL).
   232  func (fd *FileDescription) StatusFlags() uint32 {
   233  	return atomic.LoadUint32(&fd.statusFlags)
   234  }
   235  
   236  // SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
   237  func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
   238  	// Compare Linux's fs/fcntl.c:setfl().
   239  	oldFlags := fd.StatusFlags()
   240  	// Linux documents this check as "O_APPEND cannot be cleared if the file is
   241  	// marked as append-only and the file is open for write", which would make
   242  	// sense. However, the check as actually implemented seems to be "O_APPEND
   243  	// cannot be changed if the file is marked as append-only".
   244  	if (flags^oldFlags)&linux.O_APPEND != 0 {
   245  		stat, err := fd.Stat(ctx, StatOptions{
   246  			// There is no mask bit for stx_attributes.
   247  			Mask: 0,
   248  			// Linux just reads inode::i_flags directly.
   249  			Sync: linux.AT_STATX_DONT_SYNC,
   250  		})
   251  		if err != nil {
   252  			return err
   253  		}
   254  		if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
   255  			return linuxerr.EPERM
   256  		}
   257  	}
   258  	if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
   259  		stat, err := fd.Stat(ctx, StatOptions{
   260  			Mask: linux.STATX_UID,
   261  			// Linux's inode_owner_or_capable() just reads inode::i_uid
   262  			// directly.
   263  			Sync: linux.AT_STATX_DONT_SYNC,
   264  		})
   265  		if err != nil {
   266  			return err
   267  		}
   268  		if stat.Mask&linux.STATX_UID == 0 {
   269  			return linuxerr.EPERM
   270  		}
   271  		if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
   272  			return linuxerr.EPERM
   273  		}
   274  	}
   275  	if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
   276  		return linuxerr.EINVAL
   277  	}
   278  	// TODO(github.com/SagerNet/issue/1035): FileDescriptionImpl.SetOAsync()?
   279  	const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
   280  	fd.flagsMu.Lock()
   281  	if fd.asyncHandler != nil {
   282  		// Use fd.statusFlags instead of oldFlags, which may have become outdated,
   283  		// to avoid double registering/unregistering.
   284  		if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 {
   285  			fd.asyncHandler.Register(fd)
   286  		} else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 {
   287  			fd.asyncHandler.Unregister(fd)
   288  		}
   289  	}
   290  	atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
   291  	fd.flagsMu.Unlock()
   292  	return nil
   293  }
   294  
   295  // IsReadable returns true if fd was opened for reading.
   296  func (fd *FileDescription) IsReadable() bool {
   297  	return fd.readable
   298  }
   299  
   300  // IsWritable returns true if fd was opened for writing.
   301  func (fd *FileDescription) IsWritable() bool {
   302  	return fd.writable
   303  }
   304  
   305  // Impl returns the FileDescriptionImpl associated with fd.
   306  func (fd *FileDescription) Impl() FileDescriptionImpl {
   307  	return fd.impl
   308  }
   309  
   310  // FileDescriptionImpl contains implementation details for a FileDescription.
   311  // Implementations of FileDescriptionImpl should contain their associated
   312  // FileDescription by value as their first field.
   313  //
   314  // For all functions that return linux.Statx, Statx.Uid and Statx.Gid will
   315  // be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
   316  // auth.KGID respectively).
   317  //
   318  // All methods may return errors not specified.
   319  //
   320  // FileDescriptionImpl is analogous to Linux's struct file_operations.
   321  type FileDescriptionImpl interface {
   322  	// Release is called when the associated FileDescription reaches zero
   323  	// references.
   324  	Release(ctx context.Context)
   325  
   326  	// OnClose is called when a file descriptor representing the
   327  	// FileDescription is closed. Note that returning a non-nil error does not
   328  	// prevent the file descriptor from being closed.
   329  	OnClose(ctx context.Context) error
   330  
   331  	// Stat returns metadata for the file represented by the FileDescription.
        	//
        	// FileDescription.Stat does not call this method when
        	// FileDescriptionOptions.UseDentryMetadata is true.
   332  	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
   333  
   334  	// SetStat updates metadata for the file represented by the
   335  	// FileDescription. Implementations are responsible for checking if the
   336  	// operation can be performed (see vfs.CheckSetStat() for common checks).
   337  	SetStat(ctx context.Context, opts SetStatOptions) error
   338  
   339  	// StatFS returns metadata for the filesystem containing the file
   340  	// represented by the FileDescription.
   341  	StatFS(ctx context.Context) (linux.Statfs, error)
   342  
   343  	// Allocate grows the file to offset + length bytes.
   344  	// Only mode == 0 is supported currently.
   345  	//
   346  	// Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on
   347  	// other files where it is not supported.
   348  	//
   349  	// Preconditions: The FileDescription was opened for writing.
        	// FileDescription.Allocate enforces this by returning EBADF without
        	// calling the implementation.
   350  	Allocate(ctx context.Context, mode, offset, length uint64) error
   351  
   352  	// waiter.Waitable methods may be used to poll for I/O events.
   353  	waiter.Waitable
   354  
   355  	// PRead reads from the file into dst, starting at the given offset, and
   356  	// returns the number of bytes read. PRead is permitted to return partial
   357  	// reads with a nil error.
   358  	//
   359  	// Errors:
   360  	//
   361  	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
   362  	//
   363  	// Preconditions:
   364  	// * The FileDescription was opened for reading.
   365  	// * FileDescriptionOptions.DenyPRead == false.
   366  	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
   367  
   368  	// Read is similar to PRead, but does not specify an offset.
   369  	//
   370  	// For files with an implicit FileDescription offset (e.g. regular files),
   371  	// Read begins at the FileDescription offset, and advances the offset by
   372  	// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
   373  	// with Regular File Operations" requires that all operations that may
   374  	// mutate the FileDescription offset are serialized.
   375  	//
   376  	// Errors:
   377  	//
   378  	// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
   379  	//
   380  	// Preconditions: The FileDescription was opened for reading.
   381  	Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
   382  
   383  	// PWrite writes src to the file, starting at the given offset, and returns
   384  	// the number of bytes written. PWrite is permitted to return partial
   385  	// writes with a nil error.
   386  	//
   387  	// As in Linux (but not POSIX), if O_APPEND is in effect for the
   388  	// FileDescription, PWrite should ignore the offset and append data to the
   389  	// end of the file.
   390  	//
   391  	// Errors:
   392  	//
   393  	// - If opts.Flags specifies unsupported options, PWrite returns
   394  	// EOPNOTSUPP.
   395  	//
   396  	// Preconditions:
   397  	// * The FileDescription was opened for writing.
   398  	// * FileDescriptionOptions.DenyPWrite == false.
   399  	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
   400  
   401  	// Write is similar to PWrite, but does not specify an offset, which is
   402  	// implied as for Read.
   403  	//
   404  	// Write is a FileDescriptionImpl method, instead of a wrapper around
   405  	// PWrite that uses a FileDescription offset, to make it possible for
   406  	// remote filesystems to implement O_APPEND correctly (i.e. atomically with
   407  	// respect to writers outside the scope of VFS).
   408  	//
   409  	// Errors:
   410  	//
   411  	// - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
   412  	//
   413  	// Preconditions: The FileDescription was opened for writing.
   414  	Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
   415  
   416  	// IterDirents invokes cb on each entry in the directory represented by the
   417  	// FileDescription. If IterDirents has been called since the last call to
   418  	// Seek, it continues iteration from the end of the last call.
   419  	IterDirents(ctx context.Context, cb IterDirentsCallback) error
   420  
   421  	// Seek changes the FileDescription offset (assuming one exists) and
   422  	// returns its new value.
   423  	//
   424  	// For directories, if whence == SEEK_SET and offset == 0, the caller is
   425  	// rewinddir(), such that Seek "shall also cause the directory stream to
   426  	// refer to the current state of the corresponding directory" -
   427  	// POSIX.1-2017.
   428  	Seek(ctx context.Context, offset int64, whence int32) (int64, error)
   429  
   430  	// Sync requests that cached state associated with the file represented by
   431  	// the FileDescription is synchronized with persistent storage, and blocks
   432  	// until this is complete.
   433  	Sync(ctx context.Context) error
   434  
   435  	// ConfigureMMap mutates opts to implement mmap(2) for the file. Most
   436  	// implementations that support memory mapping can call
   437  	// GenericConfigureMMap with the appropriate memmap.Mappable.
   438  	ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error
   439  
   440  	// Ioctl implements the ioctl(2) syscall.
   441  	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
   442  
   443  	// ListXattr returns all extended attribute names for the file.
        	//
        	// FileDescription.ListXattr turns an EOPNOTSUPP error from this method
        	// into an empty result with a nil error.
   444  	ListXattr(ctx context.Context, size uint64) ([]string, error)
   445  
   446  	// GetXattr returns the value associated with the given extended attribute
   447  	// for the file.
   448  	GetXattr(ctx context.Context, opts GetXattrOptions) (string, error)
   449  
   450  	// SetXattr changes the value associated with the given extended attribute
   451  	// for the file.
   452  	SetXattr(ctx context.Context, opts SetXattrOptions) error
   453  
   454  	// RemoveXattr removes the given extended attribute from the file.
   455  	RemoveXattr(ctx context.Context, name string) error
   456  
   457  	// SupportsLocks indicates whether file locks are supported.
   458  	SupportsLocks() bool
   459  
   460  	// LockBSD tries to acquire a BSD-style advisory file lock.
   461  	LockBSD(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, block lock.Blocker) error
   462  
   463  	// UnlockBSD releases a BSD-style advisory file lock.
        	//
        	// FileDescription.DecRef calls UnlockBSD during teardown, passing the
        	// FileDescription itself as uid, when BSD locks were used.
   464  	UnlockBSD(ctx context.Context, uid lock.UniqueID) error
   465  
   466  	// LockPOSIX tries to acquire a POSIX-style advisory file lock.
   467  	LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block lock.Blocker) error
   468  
   469  	// UnlockPOSIX releases a POSIX-style advisory file lock.
        	//
        	// NOTE(review): the range parameter is named ComputeLockRange here
        	// but r in LockPOSIX and TestPOSIX; this looks like a naming slip,
        	// confirm before relying on it.
   470  	UnlockPOSIX(ctx context.Context, uid lock.UniqueID, ComputeLockRange lock.LockRange) error
   471  
   472  	// TestPOSIX returns information about whether the specified lock can be held, in the style of the F_GETLK fcntl.
   473  	TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error)
   474  }
   475  
   476  // Dirent holds the information contained in struct linux_dirent64. It is the
        // value handed to IterDirentsCallback.Handle for each directory entry.
   477  //
   478  // +stateify savable
   479  type Dirent struct {
   480  	// Name is the filename.
   481  	Name string
   482  
   483  	// Type is the file type, a linux.DT_* constant.
   484  	Type uint8
   485  
   486  	// Ino is the inode number.
   487  	Ino uint64
   488  
   489  	// NextOff is the offset of the *next* Dirent in the directory; that is,
   490  	// FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will
   491  	// cause the next call to FileDescription.IterDirents() to yield the next
   492  	// Dirent. (The offset of the first Dirent in a directory is always 0.)
   493  	NextOff int64
   494  }
   495  
   496  // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
        // A plain function can be used as an IterDirentsCallback by converting it
        // to IterDirentsCallbackFunc.
   497  type IterDirentsCallback interface {
   498  	// Handle handles the given iterated Dirent. If Handle returns a non-nil
   499  	// error, FileDescriptionImpl.IterDirents must stop iteration and return
   500  	// the error; the next call to FileDescriptionImpl.IterDirents should
   501  	// restart with the same Dirent.
   502  	Handle(dirent Dirent) error
   503  }
   504  
   505  // IterDirentsCallbackFunc implements IterDirentsCallback for a function with
   506  // the semantics of IterDirentsCallback.Handle.
   507  type IterDirentsCallbackFunc func(dirent Dirent) error
   508  
   509  // Handle implements IterDirentsCallback.Handle.
   510  func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error {
   511  	return f(dirent)
   512  }
   513  
   514  // OnClose is called when a file descriptor representing the FileDescription is
   515  // closed. Returning a non-nil error should not prevent the file descriptor
   516  // from being closed.
   517  func (fd *FileDescription) OnClose(ctx context.Context) error {
   518  	return fd.impl.OnClose(ctx)
   519  }
   520  
   521  // Stat returns metadata for the file represented by fd.
   522  func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
   523  	if fd.opts.UseDentryMetadata {
   524  		vfsObj := fd.vd.mount.vfs
   525  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   526  			Root:  fd.vd,
   527  			Start: fd.vd,
   528  		})
   529  		stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
   530  		rp.Release(ctx)
   531  		return stat, err
   532  	}
   533  	return fd.impl.Stat(ctx, opts)
   534  }
   535  
   536  // SetStat updates metadata for the file represented by fd.
   537  func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
   538  	if fd.opts.UseDentryMetadata {
   539  		vfsObj := fd.vd.mount.vfs
   540  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   541  			Root:  fd.vd,
   542  			Start: fd.vd,
   543  		})
   544  		err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
   545  		rp.Release(ctx)
   546  		return err
   547  	}
   548  	return fd.impl.SetStat(ctx, opts)
   549  }
   550  
   551  // StatFS returns metadata for the filesystem containing the file represented
   552  // by fd.
   553  func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
   554  	if fd.opts.UseDentryMetadata {
   555  		vfsObj := fd.vd.mount.vfs
   556  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   557  			Root:  fd.vd,
   558  			Start: fd.vd,
   559  		})
   560  		statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
   561  		rp.Release(ctx)
   562  		return statfs, err
   563  	}
   564  	return fd.impl.StatFS(ctx)
   565  }
   566  
   567  // Allocate grows file represented by FileDescription to offset + length bytes.
   568  func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
   569  	if !fd.IsWritable() {
   570  		return linuxerr.EBADF
   571  	}
   572  	if err := fd.impl.Allocate(ctx, mode, offset, length); err != nil {
   573  		return err
   574  	}
   575  	fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent)
   576  	return nil
   577  }
   578  
   579  // Readiness implements waiter.Waitable.Readiness.
   580  //
   581  // It returns fd's I/O readiness.
   582  func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
   583  	return fd.impl.Readiness(mask)
   584  }
   585  
   586  // EventRegister implements waiter.Waitable.EventRegister.
   587  //
   588  // It registers e for I/O readiness events in mask.
   589  func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
   590  	fd.impl.EventRegister(e, mask)
   591  }
   592  
   593  // EventUnregister implements waiter.Waitable.EventUnregister.
   594  //
   595  // It unregisters e for I/O readiness events.
   596  func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
   597  	fd.impl.EventUnregister(e)
   598  }
   599  
   600  // PRead reads from the file represented by fd into dst, starting at the given
   601  // offset, and returns the number of bytes read. PRead is permitted to return
   602  // partial reads with a nil error.
   603  func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
   604  	if fd.opts.DenyPRead {
   605  		return 0, linuxerr.ESPIPE
   606  	}
   607  	if !fd.readable {
   608  		return 0, linuxerr.EBADF
   609  	}
   610  	start := fsmetric.StartReadWait()
   611  	n, err := fd.impl.PRead(ctx, dst, offset, opts)
   612  	if n > 0 {
   613  		fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent)
   614  	}
   615  	fsmetric.Reads.Increment()
   616  	fsmetric.FinishReadWait(fsmetric.ReadWait, start)
   617  	return n, err
   618  }
   619  
   620  // Read is similar to PRead, but does not specify an offset.
   621  func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
   622  	if !fd.readable {
   623  		return 0, linuxerr.EBADF
   624  	}
   625  	start := fsmetric.StartReadWait()
   626  	n, err := fd.impl.Read(ctx, dst, opts)
   627  	if n > 0 {
   628  		fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent)
   629  	}
   630  	fsmetric.Reads.Increment()
   631  	fsmetric.FinishReadWait(fsmetric.ReadWait, start)
   632  	return n, err
   633  }
   634  
   635  // PWrite writes src to the file represented by fd, starting at the given
   636  // offset, and returns the number of bytes written. PWrite is permitted to
   637  // return partial writes with a nil error.
   638  func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
   639  	if fd.opts.DenyPWrite {
   640  		return 0, linuxerr.ESPIPE
   641  	}
   642  	if !fd.writable {
   643  		return 0, linuxerr.EBADF
   644  	}
   645  	n, err := fd.impl.PWrite(ctx, src, offset, opts)
   646  	if n > 0 {
   647  		fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent)
   648  	}
   649  	return n, err
   650  }
   651  
   652  // Write is similar to PWrite, but does not specify an offset.
   653  func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
   654  	if !fd.writable {
   655  		return 0, linuxerr.EBADF
   656  	}
   657  	n, err := fd.impl.Write(ctx, src, opts)
   658  	if n > 0 {
   659  		fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent)
   660  	}
   661  	return n, err
   662  }
   663  
   664  // IterDirents invokes cb on each entry in the directory represented by fd. If
   665  // IterDirents has been called since the last call to Seek, it continues
   666  // iteration from the end of the last call.
   667  func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
   668  	return fd.impl.IterDirents(ctx, cb)
   669  }
   670  
   671  // Seek changes fd's offset (assuming one exists) and returns its new value.
   672  func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   673  	return fd.impl.Seek(ctx, offset, whence)
   674  }
   675  
   676  // Sync has the semantics of fsync(2).
   677  func (fd *FileDescription) Sync(ctx context.Context) error {
   678  	return fd.impl.Sync(ctx)
   679  }
   680  
   681  // ConfigureMMap mutates opts to implement mmap(2) for the file represented by
   682  // fd.
   683  func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
   684  	return fd.impl.ConfigureMMap(ctx, opts)
   685  }
   686  
   687  // Ioctl implements the ioctl(2) syscall.
   688  func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
   689  	return fd.impl.Ioctl(ctx, uio, args)
   690  }
   691  
   692  // ListXattr returns all extended attribute names for the file represented by
   693  // fd.
   694  //
   695  // If the size of the list (including a NUL terminating byte after every entry)
   696  // would exceed size, ERANGE may be returned. Note that implementations
   697  // are free to ignore size entirely and return without error). In all cases,
   698  // if size is 0, the list should be returned without error, regardless of size.
   699  func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
   700  	if fd.opts.UseDentryMetadata {
   701  		vfsObj := fd.vd.mount.vfs
   702  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   703  			Root:  fd.vd,
   704  			Start: fd.vd,
   705  		})
   706  		names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size)
   707  		rp.Release(ctx)
   708  		return names, err
   709  	}
   710  	names, err := fd.impl.ListXattr(ctx, size)
   711  	if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) {
   712  		// Linux doesn't actually return EOPNOTSUPP in this case; instead,
   713  		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
   714  		// subsystem to return security extended attributes, which by default
   715  		// don't exist.
   716  		return nil, nil
   717  	}
   718  	return names, err
   719  }
   720  
   721  // GetXattr returns the value associated with the given extended attribute for
   722  // the file represented by fd.
   723  //
   724  // If the size of the return value exceeds opts.Size, ERANGE may be returned
   725  // (note that implementations are free to ignore opts.Size entirely and return
   726  // without error). In all cases, if opts.Size is 0, the value should be
   727  // returned without error, regardless of size.
   728  func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) {
   729  	if fd.opts.UseDentryMetadata {
   730  		vfsObj := fd.vd.mount.vfs
   731  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   732  			Root:  fd.vd,
   733  			Start: fd.vd,
   734  		})
   735  		val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
   736  		rp.Release(ctx)
   737  		return val, err
   738  	}
   739  	return fd.impl.GetXattr(ctx, *opts)
   740  }
   741  
   742  // SetXattr changes the value associated with the given extended attribute for
   743  // the file represented by fd.
   744  func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error {
   745  	if fd.opts.UseDentryMetadata {
   746  		vfsObj := fd.vd.mount.vfs
   747  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   748  			Root:  fd.vd,
   749  			Start: fd.vd,
   750  		})
   751  		err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
   752  		rp.Release(ctx)
   753  		return err
   754  	}
   755  	return fd.impl.SetXattr(ctx, *opts)
   756  }
   757  
   758  // RemoveXattr removes the given extended attribute from the file represented
   759  // by fd.
   760  func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error {
   761  	if fd.opts.UseDentryMetadata {
   762  		vfsObj := fd.vd.mount.vfs
   763  		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
   764  			Root:  fd.vd,
   765  			Start: fd.vd,
   766  		})
   767  		err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
   768  		rp.Release(ctx)
   769  		return err
   770  	}
   771  	return fd.impl.RemoveXattr(ctx, name)
   772  }
   773  
   774  // SyncFS instructs the filesystem containing fd to execute the semantics of
   775  // syncfs(2).
   776  func (fd *FileDescription) SyncFS(ctx context.Context) error {
   777  	return fd.vd.mount.fs.impl.Sync(ctx)
   778  }
   779  
   780  // MappedName implements memmap.MappingIdentity.MappedName.
   781  func (fd *FileDescription) MappedName(ctx context.Context) string {
   782  	vfsroot := RootFromContext(ctx)
   783  	s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
   784  	if vfsroot.Ok() {
   785  		vfsroot.DecRef(ctx)
   786  	}
   787  	return s
   788  }
   789  
   790  // DeviceID implements memmap.MappingIdentity.DeviceID.
   791  func (fd *FileDescription) DeviceID() uint64 {
   792  	stat, err := fd.Stat(context.Background(), StatOptions{
   793  		// There is no STATX_DEV; we assume that Stat will return it if it's
   794  		// available regardless of mask.
   795  		Mask: 0,
   796  		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev
   797  		// directly.
   798  		Sync: linux.AT_STATX_DONT_SYNC,
   799  	})
   800  	if err != nil {
   801  		return 0
   802  	}
   803  	return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor))
   804  }
   805  
   806  // InodeID implements memmap.MappingIdentity.InodeID.
   807  func (fd *FileDescription) InodeID() uint64 {
   808  	stat, err := fd.Stat(context.Background(), StatOptions{
   809  		Mask: linux.STATX_INO,
   810  		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
   811  		Sync: linux.AT_STATX_DONT_SYNC,
   812  	})
   813  	if err != nil || stat.Mask&linux.STATX_INO == 0 {
   814  		return 0
   815  	}
   816  	return stat.Ino
   817  }
   818  
   819  // Msync implements memmap.MappingIdentity.Msync.
   820  func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
   821  	return fd.Sync(ctx)
   822  }
   823  
   824  // SupportsLocks indicates whether file locks are supported.
   825  func (fd *FileDescription) SupportsLocks() bool {
   826  	return fd.impl.SupportsLocks()
   827  }
   828  
   829  // LockBSD tries to acquire a BSD-style advisory file lock.
   830  func (fd *FileDescription) LockBSD(ctx context.Context, ownerPID int32, lockType lock.LockType, blocker lock.Blocker) error {
   831  	atomic.StoreUint32(&fd.usedLockBSD, 1)
   832  	return fd.impl.LockBSD(ctx, fd, ownerPID, lockType, blocker)
   833  }
   834  
   835  // UnlockBSD releases a BSD-style advisory file lock.
   836  func (fd *FileDescription) UnlockBSD(ctx context.Context) error {
   837  	return fd.impl.UnlockBSD(ctx, fd)
   838  }
   839  
   840  // LockPOSIX locks a POSIX-style file range lock.
   841  func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block lock.Blocker) error {
   842  	return fd.impl.LockPOSIX(ctx, uid, ownerPID, t, r, block)
   843  }
   844  
   845  // UnlockPOSIX unlocks a POSIX-style file range lock.
   846  func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, r lock.LockRange) error {
   847  	return fd.impl.UnlockPOSIX(ctx, uid, r)
   848  }
   849  
   850  // TestPOSIX returns information about whether the specified lock can be held.
   851  func (fd *FileDescription) TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) {
   852  	return fd.impl.TestPOSIX(ctx, uid, t, r)
   853  }
   854  
   855  // ComputeLockRange computes the range of a file lock based on the given values.
   856  func (fd *FileDescription) ComputeLockRange(ctx context.Context, start uint64, length uint64, whence int16) (lock.LockRange, error) {
   857  	var off int64
   858  	switch whence {
   859  	case linux.SEEK_SET:
   860  		off = 0
   861  	case linux.SEEK_CUR:
   862  		// Note that Linux does not hold any mutexes while retrieving the file
   863  		// offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
   864  		curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR)
   865  		if err != nil {
   866  			return lock.LockRange{}, err
   867  		}
   868  		off = curOff
   869  	case linux.SEEK_END:
   870  		stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE})
   871  		if err != nil {
   872  			return lock.LockRange{}, err
   873  		}
   874  		off = int64(stat.Size)
   875  	default:
   876  		return lock.LockRange{}, linuxerr.EINVAL
   877  	}
   878  
   879  	return lock.ComputeRange(int64(start), int64(length), off)
   880  }
   881  
// A FileAsync sends signals to its owner when w is ready for IO. This is only
// implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this
// interface to avoid circular dependencies.
type FileAsync interface {
	// Register attaches the handler to w. Within this file it is called with
	// a FileDescription whose status flags include O_ASYNC (see
	// SetAsyncHandler).
	Register(w waiter.Waitable)
	// Unregister is the counterpart of Register for w.
	// NOTE(review): presumably stops signal delivery for w; confirm against
	// the fasync implementation.
	Unregister(w waiter.Waitable)
}
   889  
   890  // AsyncHandler returns the FileAsync for fd.
   891  func (fd *FileDescription) AsyncHandler() FileAsync {
   892  	fd.flagsMu.Lock()
   893  	defer fd.flagsMu.Unlock()
   894  	return fd.asyncHandler
   895  }
   896  
   897  // SetAsyncHandler sets fd.asyncHandler if it has not been set before and
   898  // returns it.
   899  func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync {
   900  	fd.flagsMu.Lock()
   901  	defer fd.flagsMu.Unlock()
   902  	if fd.asyncHandler == nil {
   903  		fd.asyncHandler = newHandler()
   904  		if fd.statusFlags&linux.O_ASYNC != 0 {
   905  			fd.asyncHandler.Register(fd)
   906  		}
   907  	}
   908  	return fd.asyncHandler
   909  }
   910  
   911  // CopyRegularFileData copies data from srcFD to dstFD until reading from srcFD
   912  // returns EOF or an error. It returns the number of bytes copied.
   913  func CopyRegularFileData(ctx context.Context, dstFD, srcFD *FileDescription) (int64, error) {
   914  	done := int64(0)
   915  	buf := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
   916  	for {
   917  		readN, readErr := srcFD.Read(ctx, buf, ReadOptions{})
   918  		if readErr != nil && readErr != io.EOF {
   919  			return done, readErr
   920  		}
   921  		src := buf.TakeFirst64(readN)
   922  		for src.NumBytes() != 0 {
   923  			writeN, writeErr := dstFD.Write(ctx, src, WriteOptions{})
   924  			done += writeN
   925  			src = src.DropFirst64(writeN)
   926  			if writeErr != nil {
   927  				return done, writeErr
   928  			}
   929  		}
   930  		if readErr == io.EOF {
   931  			return done, nil
   932  		}
   933  	}
   934  }