github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/host/host.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package host provides a filesystem implementation for host files imported as
    16  // file descriptors.
    17  package host
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/metacubex/gvisor/pkg/abi/linux"
    25  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    26  	"github.com/metacubex/gvisor/pkg/context"
    27  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    28  	"github.com/metacubex/gvisor/pkg/fdnotifier"
    29  	"github.com/metacubex/gvisor/pkg/fspath"
    30  	"github.com/metacubex/gvisor/pkg/hostarch"
    31  	"github.com/metacubex/gvisor/pkg/log"
    32  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    33  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
    34  	"github.com/metacubex/gvisor/pkg/sentry/hostfd"
    35  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    36  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    37  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    38  	unixsocket "github.com/metacubex/gvisor/pkg/sentry/socket/unix"
    39  	"github.com/metacubex/gvisor/pkg/sentry/socket/unix/transport"
    40  	"github.com/metacubex/gvisor/pkg/sentry/uniqueid"
    41  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    42  	"github.com/metacubex/gvisor/pkg/sync"
    43  	"github.com/metacubex/gvisor/pkg/usermem"
    44  	"github.com/metacubex/gvisor/pkg/waiter"
    45  )
    46  
// virtualOwnerModes is the set of statx mask bits (mode, UID and GID) whose
// values are stored in virtualOwner. inode.SetStat accepts these bits only
// when the virtual owner is enabled.
const virtualOwnerModes = linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
    49  
// virtualOwner holds the owner and mode values that are reported for an inode
// in place of the host file's own values when enabled is true.
//
// +stateify savable
type virtualOwner struct {
	// enabled reports whether uid, gid and mode override the host values.
	//
	// This field is initialized at creation time and is immutable.
	enabled bool

	// mu is taken by inode.SetStat while it updates the fields below. Readers
	// do not take mu; they read the fields with atomic loads instead.
	mu  sync.Mutex `state:"nosave"`
	uid atomicbitops.Uint32
	gid atomicbitops.Uint32
	// mode is also stored, otherwise setting the host file to `0000` could remove
	// access to the file.
	mode atomicbitops.Uint32
}
    64  
    65  func (v *virtualOwner) atomicUID() uint32 {
    66  	return v.uid.Load()
    67  }
    68  
    69  func (v *virtualOwner) atomicGID() uint32 {
    70  	return v.gid.Load()
    71  }
    72  
    73  func (v *virtualOwner) atomicMode() uint32 {
    74  	return v.mode.Load()
    75  }
    76  
    77  func isEpollable(fd int) bool {
    78  	epollfd, err := unix.EpollCreate1(0)
    79  	if err != nil {
    80  		// This shouldn't happen. If it does, just say file doesn't support epoll.
    81  		return false
    82  	}
    83  	defer unix.Close(epollfd)
    84  
    85  	event := unix.EpollEvent{
    86  		Fd:     int32(fd),
    87  		Events: unix.EPOLLIN,
    88  	}
    89  	err = unix.EpollCtl(epollfd, unix.EPOLL_CTL_ADD, fd, &event)
    90  	return err == nil
    91  }
    92  
// inode implements kernfs.Inode. Each inode wraps a single host file
// descriptor (hostFD), which is closed when the inode's reference count drops
// to zero.
//
// +stateify savable
type inode struct {
	kernfs.CachedMappable
	kernfs.InodeNoStatFS
	kernfs.InodeAnonymous // inode is effectively anonymous because it represents a donated FD.
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink
	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
	kernfs.InodeWatches

	// locks is the file lock state shared by every file description opened
	// on this inode.
	locks vfs.FileLocks

	// When the reference count reaches zero, the host fd is closed.
	inodeRefs

	// hostFD contains the host fd that this file was originally created from.
	// Upon restore, it must be remapped using restoreKey and vfs.CtxRestoreFilesystemFDMap
	// from the restore context.
	//
	// This field is initialized at creation time and is immutable.
	hostFD int `state:"nosave"`

	// restoreKey is used to identify the `hostFD` after a restore is performed.
	restoreKey vfs.RestoreID

	// ino is an inode number unique within this filesystem.
	//
	// This field is initialized at creation time and is immutable.
	ino uint64

	// ftype is the file's type (a linux.S_IFMT mask).
	//
	// This field is initialized at creation time and is immutable.
	ftype uint16

	// epollable indicates whether the hostFD can be used with epoll_ctl(2). This
	// also indicates that hostFD has been set to non-blocking.
	//
	// This field is initialized at creation time and is immutable.
	epollable bool

	// seekable is false if lseek(hostFD) returns ESPIPE. We assume that file
	// offsets are meaningful iff seekable is true.
	//
	// This field is initialized at creation time and is immutable.
	seekable bool

	// isTTY is true if this file represents a TTY.
	//
	// This field is initialized at creation time and is immutable.
	isTTY bool

	// savable is true if hostFD may be saved/restored by its numeric value.
	//
	// This field is initialized at creation time and is immutable.
	savable bool

	// readonly is true if operations that can potentially change the host file
	// are blocked.
	//
	// This field is initialized at creation time and is immutable.
	readonly bool

	// queue is the event queue for blocking operations. When the inode is
	// epollable, it is registered with fdnotifier for hostFD.
	queue waiter.Queue

	// virtualOwner caches ownership and permission information to override the
	// underlying file owner and permission. This is used to allow the untrusted
	// application to change these fields without affecting the host.
	virtualOwner virtualOwner

	// If haveBuf is non-zero, hostFD represents a pipe, and buf contains data
	// read from the pipe from previous calls to inode.beforeSave(). haveBuf
	// and buf are protected by bufMu.
	bufMu   sync.Mutex `state:"nosave"`
	haveBuf atomicbitops.Uint32
	buf     []byte
}
   173  
   174  func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, restoreKey vfs.RestoreID, fileType linux.FileMode, isTTY bool, readonly bool) (*inode, error) {
   175  	// Determine if hostFD is seekable.
   176  	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
   177  	seekable := !linuxerr.Equals(linuxerr.ESPIPE, err)
   178  	// We expect regular files to be seekable, as this is required for them to
   179  	// be memory-mappable.
   180  	if !seekable && fileType == unix.S_IFREG {
   181  		ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD)
   182  		return nil, linuxerr.ESPIPE
   183  	}
   184  
   185  	i := &inode{
   186  		hostFD:     hostFD,
   187  		ino:        fs.NextIno(),
   188  		ftype:      uint16(fileType),
   189  		epollable:  isEpollable(hostFD),
   190  		seekable:   seekable,
   191  		isTTY:      isTTY,
   192  		savable:    savable,
   193  		restoreKey: restoreKey,
   194  		readonly:   readonly,
   195  	}
   196  	i.InitRefs()
   197  	i.CachedMappable.Init(hostFD)
   198  
   199  	// If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and
   200  	// handle blocking behavior in the sentry.
   201  	if i.epollable {
   202  		if err := unix.SetNonblock(i.hostFD, true); err != nil {
   203  			return nil, err
   204  		}
   205  		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
   206  			return nil, err
   207  		}
   208  	}
   209  	return i, nil
   210  }
   211  
// NewFDOptions contains options to NewFD.
type NewFDOptions struct {
	// If Savable is true, the host file descriptor may be saved/restored by
	// numeric value. RestoreKey is used to map the FD after restore.
	Savable bool

	// RestoreKey is only used when Savable==true. It uniquely identifies the
	// host FD so that a mapping to the corresponding FD can be provided during
	// restore.
	RestoreKey vfs.RestoreID

	// If IsTTY is true, the file descriptor is a TTY.
	IsTTY bool

	// If HaveFlags is true, use Flags for the new file description. Otherwise,
	// the new file description will inherit flags from hostFD.
	HaveFlags bool
	Flags     uint32

	// VirtualOwner allows the host file to have owner and permissions different
	// than the underlying host file. When it is true, UID and GID are the
	// initial virtual owner; the initial virtual mode is taken from the host
	// file.
	VirtualOwner bool
	UID          auth.KUID
	GID          auth.KGID

	// If Readonly is true, we disallow operations that can potentially change
	// the host file associated with the file descriptor.
	Readonly bool
}
   241  
   242  // NewFD returns a vfs.FileDescription representing the given host file
   243  // descriptor. mnt must be Kernel.HostMount().
   244  func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) {
   245  	fs, ok := mnt.Filesystem().Impl().(*filesystem)
   246  	if !ok {
   247  		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
   248  	}
   249  
   250  	if opts.Readonly {
   251  		if opts.IsTTY {
   252  			// This is not a technical limitation, but access checks for TTYs
   253  			// have not been implemented yet.
   254  			return nil, fmt.Errorf("readonly file descriptor may currently not be a TTY")
   255  		}
   256  
   257  		flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0)
   258  		if err != nil {
   259  			return nil, err
   260  		}
   261  		accessMode := uint32(flagsInt) & unix.O_ACCMODE
   262  		if accessMode != unix.O_RDONLY {
   263  			return nil, fmt.Errorf("readonly file descriptor may only be opened as O_RDONLY on the host")
   264  		}
   265  	}
   266  
   267  	// Retrieve metadata.
   268  	var stat unix.Stat_t
   269  	if err := unix.Fstat(hostFD, &stat); err != nil {
   270  		return nil, err
   271  	}
   272  
   273  	flags := opts.Flags
   274  	if !opts.HaveFlags {
   275  		// Get flags for the imported FD.
   276  		flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0)
   277  		if err != nil {
   278  			return nil, err
   279  		}
   280  		flags = uint32(flagsInt)
   281  	}
   282  
   283  	fileType := linux.FileMode(stat.Mode).FileType()
   284  	i, err := newInode(ctx, fs, hostFD, opts.Savable, opts.RestoreKey, fileType, opts.IsTTY, opts.Readonly)
   285  	if err != nil {
   286  		return nil, err
   287  	}
   288  	if opts.VirtualOwner {
   289  		i.virtualOwner.enabled = true
   290  		i.virtualOwner.uid = atomicbitops.FromUint32(uint32(opts.UID))
   291  		i.virtualOwner.gid = atomicbitops.FromUint32(uint32(opts.GID))
   292  		i.virtualOwner.mode = atomicbitops.FromUint32(stat.Mode)
   293  	}
   294  
   295  	d := &kernfs.Dentry{}
   296  	d.Init(&fs.Filesystem, i)
   297  
   298  	// i.open will take a reference on d.
   299  	defer d.DecRef(ctx)
   300  
   301  	// For simplicity, fileDescription.offset is set to 0. Technically, we
   302  	// should only set to 0 on files that are not seekable (sockets, pipes,
   303  	// etc.), and use the offset from the host fd otherwise when importing.
   304  	return i.open(ctx, d, mnt, fileType, flags)
   305  }
   306  
   307  // filesystemType implements vfs.FilesystemType.
   308  //
   309  // +stateify savable
   310  type filesystemType struct{}
   311  
   312  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   313  func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   314  	panic("host.filesystemType.GetFilesystem should never be called")
   315  }
   316  
   317  // Name implements vfs.FilesystemType.Name.
   318  func (filesystemType) Name() string {
   319  	return "none"
   320  }
   321  
   322  // Release implements vfs.FilesystemType.Release.
   323  func (filesystemType) Release(ctx context.Context) {}
   324  
   325  // NewFilesystem sets up and returns a new hostfs filesystem.
   326  //
   327  // Note that there should only ever be one instance of host.filesystem,
   328  // a global mount for host fds.
   329  func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
   330  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   331  	if err != nil {
   332  		return nil, err
   333  	}
   334  	fs := &filesystem{
   335  		devMinor: devMinor,
   336  	}
   337  	fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
   338  	return fs.VFSFilesystem(), nil
   339  }
   340  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	kernfs.Filesystem

	// devMinor is the anonymous block device minor number obtained in
	// NewFilesystem. It is reported as the device minor in Stat results and is
	// returned to the VirtualFilesystem in Release.
	devMinor uint32
}
   349  
   350  func (fs *filesystem) Release(ctx context.Context) {
   351  	fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   352  	fs.Filesystem.Release(ctx)
   353  }
   354  
   355  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
   356  	d := vd.Dentry().Impl().(*kernfs.Dentry)
   357  	inode := d.Inode().(*inode)
   358  	b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino))
   359  	return vfs.PrependPathSyntheticError{}
   360  }
   361  
   362  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   363  func (fs *filesystem) MountOptions() string {
   364  	return ""
   365  }
   366  
   367  // CheckPermissions implements kernfs.Inode.CheckPermissions.
   368  func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
   369  	var s unix.Stat_t
   370  	if err := i.stat(&s); err != nil {
   371  		return err
   372  	}
   373  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
   374  }
   375  
   376  // Mode implements kernfs.Inode.Mode.
   377  func (i *inode) Mode() linux.FileMode {
   378  	var s unix.Stat_t
   379  	if err := i.stat(&s); err != nil {
   380  		// Retrieving the mode from the host fd using fstat(2) should not fail.
   381  		// If the syscall does not succeed, something is fundamentally wrong.
   382  		panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err))
   383  	}
   384  	return linux.FileMode(s.Mode)
   385  }
   386  
   387  // Mode implements kernfs.Inode.UID
   388  func (i *inode) UID() auth.KUID {
   389  	return auth.KUID(i.virtualOwner.uid.Load())
   390  }
   391  
   392  // Mode implements kernfs.Inode.GID
   393  func (i *inode) GID() auth.KGID {
   394  	return auth.KGID(i.virtualOwner.gid.Load())
   395  }
   396  
   397  // Stat implements kernfs.Inode.Stat.
   398  func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
   399  	if opts.Mask&linux.STATX__RESERVED != 0 {
   400  		return linux.Statx{}, linuxerr.EINVAL
   401  	}
   402  	if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
   403  		return linux.Statx{}, linuxerr.EINVAL
   404  	}
   405  
   406  	fs := vfsfs.Impl().(*filesystem)
   407  
   408  	// Limit our host call only to known flags.
   409  	mask := opts.Mask & linux.STATX_ALL
   410  	var s unix.Statx_t
   411  	err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s)
   412  	if linuxerr.Equals(linuxerr.ENOSYS, err) {
   413  		// Fallback to fstat(2), if statx(2) is not supported on the host.
   414  		//
   415  		// TODO(b/151263641): Remove fallback.
   416  		return i.statxFromStat(fs)
   417  	}
   418  	if err != nil {
   419  		return linux.Statx{}, err
   420  	}
   421  
   422  	// Unconditionally fill blksize, attributes, and device numbers, as
   423  	// indicated by /include/uapi/linux/stat.h. Inode number is always
   424  	// available, since we use our own rather than the host's.
   425  	ls := linux.Statx{
   426  		Mask:           linux.STATX_INO,
   427  		Blksize:        s.Blksize,
   428  		Attributes:     s.Attributes,
   429  		Ino:            i.ino,
   430  		AttributesMask: s.Attributes_mask,
   431  		DevMajor:       linux.UNNAMED_MAJOR,
   432  		DevMinor:       fs.devMinor,
   433  	}
   434  
   435  	// Copy other fields that were returned by the host. RdevMajor/RdevMinor
   436  	// are never copied (and therefore left as zero), so as not to expose host
   437  	// device numbers.
   438  	ls.Mask |= s.Mask & linux.STATX_ALL
   439  	if s.Mask&linux.STATX_TYPE != 0 {
   440  		if i.virtualOwner.enabled {
   441  			ls.Mode |= uint16(i.virtualOwner.atomicMode()) & linux.S_IFMT
   442  		} else {
   443  			ls.Mode |= s.Mode & linux.S_IFMT
   444  		}
   445  	}
   446  	if s.Mask&linux.STATX_MODE != 0 {
   447  		if i.virtualOwner.enabled {
   448  			ls.Mode |= uint16(i.virtualOwner.atomicMode()) &^ linux.S_IFMT
   449  		} else {
   450  			ls.Mode |= s.Mode &^ linux.S_IFMT
   451  		}
   452  	}
   453  	if s.Mask&linux.STATX_NLINK != 0 {
   454  		ls.Nlink = s.Nlink
   455  	}
   456  	if s.Mask&linux.STATX_UID != 0 {
   457  		if i.virtualOwner.enabled {
   458  			ls.UID = i.virtualOwner.atomicUID()
   459  		} else {
   460  			ls.UID = s.Uid
   461  		}
   462  	}
   463  	if s.Mask&linux.STATX_GID != 0 {
   464  		if i.virtualOwner.enabled {
   465  			ls.GID = i.virtualOwner.atomicGID()
   466  		} else {
   467  			ls.GID = s.Gid
   468  		}
   469  	}
   470  	if s.Mask&linux.STATX_ATIME != 0 {
   471  		ls.Atime = unixToLinuxStatxTimestamp(s.Atime)
   472  	}
   473  	if s.Mask&linux.STATX_BTIME != 0 {
   474  		ls.Btime = unixToLinuxStatxTimestamp(s.Btime)
   475  	}
   476  	if s.Mask&linux.STATX_CTIME != 0 {
   477  		ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime)
   478  	}
   479  	if s.Mask&linux.STATX_MTIME != 0 {
   480  		ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime)
   481  	}
   482  	if s.Mask&linux.STATX_SIZE != 0 {
   483  		ls.Size = s.Size
   484  	}
   485  	if s.Mask&linux.STATX_BLOCKS != 0 {
   486  		ls.Blocks = s.Blocks
   487  	}
   488  
   489  	return ls, nil
   490  }
   491  
   492  // statxFromStat is a best-effort fallback for inode.Stat() if the host does not
   493  // support statx(2).
   494  //
   495  // We ignore the mask and sync flags in opts and simply supply
   496  // STATX_BASIC_STATS, as fstat(2) itself does not allow the specification
   497  // of a mask or sync flags. fstat(2) does not provide any metadata
   498  // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so
   499  // those fields remain empty.
   500  func (i *inode) statxFromStat(fs *filesystem) (linux.Statx, error) {
   501  	var s unix.Stat_t
   502  	if err := i.stat(&s); err != nil {
   503  		return linux.Statx{}, err
   504  	}
   505  
   506  	// As with inode.Stat(), we always use internal device and inode numbers,
   507  	// and never expose the host's represented device numbers.
   508  	return linux.Statx{
   509  		Mask:     linux.STATX_BASIC_STATS,
   510  		Blksize:  uint32(s.Blksize),
   511  		Nlink:    uint32(s.Nlink),
   512  		UID:      s.Uid,
   513  		GID:      s.Gid,
   514  		Mode:     uint16(s.Mode),
   515  		Ino:      i.ino,
   516  		Size:     uint64(s.Size),
   517  		Blocks:   uint64(s.Blocks),
   518  		Atime:    timespecToStatxTimestamp(s.Atim),
   519  		Ctime:    timespecToStatxTimestamp(s.Ctim),
   520  		Mtime:    timespecToStatxTimestamp(s.Mtim),
   521  		DevMajor: linux.UNNAMED_MAJOR,
   522  		DevMinor: fs.devMinor,
   523  	}, nil
   524  }
   525  
   526  func (i *inode) stat(stat *unix.Stat_t) error {
   527  	if err := unix.Fstat(i.hostFD, stat); err != nil {
   528  		return err
   529  	}
   530  	if i.virtualOwner.enabled {
   531  		stat.Uid = i.virtualOwner.atomicUID()
   532  		stat.Gid = i.virtualOwner.atomicGID()
   533  		stat.Mode = i.virtualOwner.atomicMode()
   534  	}
   535  	return nil
   536  }
   537  
   538  // SetStat implements kernfs.Inode.SetStat.
   539  //
   540  // +checklocksignore
   541  func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
   542  	if i.readonly {
   543  		return linuxerr.EPERM
   544  	}
   545  
   546  	s := &opts.Stat
   547  
   548  	m := s.Mask
   549  	if m == 0 {
   550  		return nil
   551  	}
   552  	supportedModes := uint32(linux.STATX_MODE | linux.STATX_SIZE | linux.STATX_ATIME | linux.STATX_MTIME)
   553  	if i.virtualOwner.enabled {
   554  		if m&virtualOwnerModes != 0 {
   555  			// Take lock if any of the virtual owner fields will be updated.
   556  			i.virtualOwner.mu.Lock()
   557  			defer i.virtualOwner.mu.Unlock()
   558  		}
   559  
   560  		supportedModes |= virtualOwnerModes
   561  	}
   562  	if m&^supportedModes != 0 {
   563  		return linuxerr.EPERM
   564  	}
   565  
   566  	var hostStat unix.Stat_t
   567  	if err := i.stat(&hostStat); err != nil {
   568  		return err
   569  	}
   570  	if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
   571  		return err
   572  	}
   573  
   574  	if m&linux.STATX_MODE != 0 {
   575  		if i.virtualOwner.enabled {
   576  			// We hold i.virtualOwner.mu.
   577  			i.virtualOwner.mode = atomicbitops.FromUint32(uint32(opts.Stat.Mode))
   578  		} else {
   579  			log.Warningf("sentry seccomp filters don't allow making fchmod(2) syscall")
   580  			return unix.EPERM
   581  		}
   582  	}
   583  	if m&linux.STATX_SIZE != 0 {
   584  		if hostStat.Mode&linux.S_IFMT != linux.S_IFREG {
   585  			return linuxerr.EINVAL
   586  		}
   587  		if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
   588  			return err
   589  		}
   590  		oldSize := uint64(hostStat.Size)
   591  		if s.Size < oldSize {
   592  			oldpgend, _ := hostarch.PageRoundUp(oldSize)
   593  			newpgend, _ := hostarch.PageRoundUp(s.Size)
   594  			if oldpgend != newpgend {
   595  				i.CachedMappable.InvalidateRange(memmap.MappableRange{newpgend, oldpgend})
   596  			}
   597  		}
   598  	}
   599  	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
   600  		ts := [2]unix.Timespec{
   601  			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
   602  			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
   603  		}
   604  		if err := setTimestamps(i.hostFD, &ts); err != nil {
   605  			return err
   606  		}
   607  	}
   608  	if i.virtualOwner.enabled {
   609  		if m&linux.STATX_UID != 0 {
   610  			// We hold i.virtualOwner.mu.
   611  			i.virtualOwner.uid = atomicbitops.FromUint32(opts.Stat.UID)
   612  		}
   613  		if m&linux.STATX_GID != 0 {
   614  			// We hold i.virtualOwner.mu.
   615  			i.virtualOwner.gid = atomicbitops.FromUint32(opts.Stat.GID)
   616  		}
   617  	}
   618  	return nil
   619  }
   620  
   621  // DecRef implements kernfs.Inode.DecRef.
   622  func (i *inode) DecRef(ctx context.Context) {
   623  	i.inodeRefs.DecRef(func() {
   624  		if i.epollable {
   625  			fdnotifier.RemoveFD(int32(i.hostFD))
   626  		}
   627  		if err := unix.Close(i.hostFD); err != nil {
   628  			log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
   629  		}
   630  		// We can't rely on fdnotifier when closing the fd, because the event may race
   631  		// with fdnotifier.RemoveFD. Instead, notify the queue explicitly.
   632  		i.queue.Notify(waiter.EventHUp | waiter.ReadableEvents | waiter.WritableEvents)
   633  	})
   634  }
   635  
   636  // Open implements kernfs.Inode.Open.
   637  func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   638  	// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
   639  	if i.Mode().FileType() == linux.S_IFSOCK {
   640  		return nil, linuxerr.ENXIO
   641  	}
   642  	var stat unix.Stat_t
   643  	if err := i.stat(&stat); err != nil {
   644  		return nil, err
   645  	}
   646  	fileType := linux.FileMode(stat.Mode).FileType()
   647  	return i.open(ctx, d, rp.Mount(), fileType, opts.Flags)
   648  }
   649  
   650  func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, fileType linux.FileMode, flags uint32) (*vfs.FileDescription, error) {
   651  	// Constrain flags to a subset we can handle.
   652  	//
   653  	// TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls.
   654  	flags &= unix.O_ACCMODE | unix.O_NONBLOCK | unix.O_DSYNC | unix.O_SYNC | unix.O_APPEND
   655  
   656  	switch fileType {
   657  	case unix.S_IFSOCK:
   658  		if i.isTTY {
   659  			log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
   660  			return nil, linuxerr.ENOTTY
   661  		}
   662  
   663  		ep, err := newEndpoint(ctx, i.hostFD, &i.queue)
   664  		if err != nil {
   665  			return nil, err
   666  		}
   667  		// Currently, we only allow Unix sockets to be imported.
   668  		return unixsocket.NewFileDescription(ep, ep.Type(), flags, nil, mnt, d.VFSDentry(), &i.locks)
   669  
   670  	case unix.S_IFREG, unix.S_IFIFO, unix.S_IFCHR:
   671  		if i.isTTY {
   672  			fd := &TTYFileDescription{
   673  				fileDescription: fileDescription{inode: i},
   674  				termios:         linux.DefaultReplicaTermios,
   675  			}
   676  			if task := kernel.TaskFromContext(ctx); task != nil {
   677  				fd.fgProcessGroup = task.ThreadGroup().ProcessGroup()
   678  				fd.session = fd.fgProcessGroup.Session()
   679  			}
   680  			fd.LockFD.Init(&i.locks)
   681  			vfsfd := &fd.vfsfd
   682  			if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   683  				return nil, err
   684  			}
   685  			return vfsfd, nil
   686  		}
   687  
   688  		fd := &fileDescription{inode: i}
   689  		fd.LockFD.Init(&i.locks)
   690  		vfsfd := &fd.vfsfd
   691  		if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   692  			return nil, err
   693  		}
   694  		return vfsfd, nil
   695  
   696  	default:
   697  		log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType)
   698  		return nil, linuxerr.EPERM
   699  	}
   700  }
   701  
   702  // Create a new host-backed endpoint from the given fd and its corresponding
   703  // notification queue.
   704  func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) {
   705  	// Set up an external transport.Endpoint using the host fd.
   706  	addr := fmt.Sprintf("hostfd:[%d]", hostFD)
   707  	e, err := transport.NewHostConnectedEndpoint(hostFD, addr)
   708  	if err != nil {
   709  		return nil, err.ToError()
   710  	}
   711  	ep := transport.NewExternal(e.SockType(), uniqueid.GlobalProviderFromContext(ctx), queue, e, e)
   712  	return ep, nil
   713  }
   714  
// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the vfs.FileDescription that inode.open initializes with this
	// fileDescription as its implementation.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
	// cached to reduce indirections and casting. fileDescription does not hold
	// a reference on the inode through the inode field (since one is already
	// held via the Dentry).
	//
	// inode is immutable after fileDescription creation.
	inode *inode

	// offsetMu protects offset.
	offsetMu sync.Mutex `state:"nosave"`

	// offset specifies the current file offset. It is only meaningful when
	// inode.seekable is true.
	offset int64
}
   738  
   739  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   740  func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   741  	creds := auth.CredentialsFromContext(ctx)
   742  	return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
   743  }
   744  
   745  // Stat implements vfs.FileDescriptionImpl.Stat.
   746  func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   747  	return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
   748  }
   749  
   750  // Release implements vfs.FileDescriptionImpl.Release.
   751  func (f *fileDescription) Release(context.Context) {
   752  	// noop
   753  }
   754  
   755  // Allocate implements vfs.FileDescriptionImpl.Allocate.
   756  func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
   757  	if f.inode.readonly {
   758  		return linuxerr.EPERM
   759  	}
   760  	return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
   761  }
   762  
   763  // PRead implements vfs.FileDescriptionImpl.PRead.
   764  func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   765  	// Check that flags are supported.
   766  	//
   767  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   768  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   769  		return 0, linuxerr.EOPNOTSUPP
   770  	}
   771  
   772  	i := f.inode
   773  	if !i.seekable {
   774  		return 0, linuxerr.ESPIPE
   775  	}
   776  
   777  	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
   778  }
   779  
   780  // Read implements vfs.FileDescriptionImpl.Read.
   781  func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   782  	// Check that flags are supported.
   783  	//
   784  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   785  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   786  		return 0, linuxerr.EOPNOTSUPP
   787  	}
   788  
   789  	i := f.inode
   790  	if !i.seekable {
   791  		bufN, err := i.readFromBuf(ctx, &dst)
   792  		if err != nil {
   793  			return bufN, err
   794  		}
   795  		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
   796  		total := bufN + n
   797  		if isBlockError(err) {
   798  			// If we got any data at all, return it as a "completed" partial read
   799  			// rather than retrying until complete.
   800  			if total != 0 {
   801  				err = nil
   802  			} else {
   803  				err = linuxerr.ErrWouldBlock
   804  			}
   805  		}
   806  		return total, err
   807  	}
   808  
   809  	f.offsetMu.Lock()
   810  	n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
   811  	f.offset += n
   812  	f.offsetMu.Unlock()
   813  	return n, err
   814  }
   815  
   816  func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) {
   817  	if i.haveBuf.Load() == 0 {
   818  		return 0, nil
   819  	}
   820  	i.bufMu.Lock()
   821  	defer i.bufMu.Unlock()
   822  	if len(i.buf) == 0 {
   823  		return 0, nil
   824  	}
   825  	n, err := dst.CopyOut(ctx, i.buf)
   826  	*dst = dst.DropFirst(n)
   827  	i.buf = i.buf[n:]
   828  	if len(i.buf) == 0 {
   829  		i.haveBuf.Store(0)
   830  		i.buf = nil
   831  	}
   832  	return int64(n), err
   833  }
   834  
   835  func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
   836  	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
   837  	n, err := dst.CopyOutFrom(ctx, reader)
   838  	hostfd.PutReadWriterAt(reader)
   839  	return int64(n), err
   840  }
   841  
   842  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   843  func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   844  	if !f.inode.seekable {
   845  		return 0, linuxerr.ESPIPE
   846  	}
   847  
   848  	return f.writeToHostFD(ctx, src, offset, opts.Flags)
   849  }
   850  
   851  // Write implements vfs.FileDescriptionImpl.Write.
   852  func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   853  	i := f.inode
   854  	if !i.seekable {
   855  		n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
   856  		if isBlockError(err) {
   857  			err = linuxerr.ErrWouldBlock
   858  		}
   859  		return n, err
   860  	}
   861  
   862  	f.offsetMu.Lock()
   863  	// NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if
   864  	// another process modifies the host file between retrieving the file size
   865  	// and writing to the host fd. This is an unavoidable race condition because
   866  	// we cannot enforce synchronization on the host.
   867  	if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
   868  		var s unix.Stat_t
   869  		if err := unix.Fstat(i.hostFD, &s); err != nil {
   870  			f.offsetMu.Unlock()
   871  			return 0, err
   872  		}
   873  		f.offset = s.Size
   874  	}
   875  	n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags)
   876  	f.offset += n
   877  	f.offsetMu.Unlock()
   878  	return n, err
   879  }
   880  
   881  func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
   882  	if f.inode.readonly {
   883  		return 0, linuxerr.EPERM
   884  	}
   885  	hostFD := f.inode.hostFD
   886  	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
   887  	if flags != 0 {
   888  		return 0, linuxerr.EOPNOTSUPP
   889  	}
   890  	writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
   891  	n, err := src.CopyInTo(ctx, writer)
   892  	hostfd.PutReadWriterAt(writer)
   893  	// NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC.
   894  	if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
   895  		if syncErr := unix.Fsync(hostFD); syncErr != nil {
   896  			return int64(n), syncErr
   897  		}
   898  	}
   899  	return int64(n), err
   900  }
   901  
   902  // Seek implements vfs.FileDescriptionImpl.Seek.
   903  //
   904  // Note that we do not support seeking on directories, since we do not even
   905  // allow directory fds to be imported at all.
   906  func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
   907  	i := f.inode
   908  	if !i.seekable {
   909  		return 0, linuxerr.ESPIPE
   910  	}
   911  
   912  	f.offsetMu.Lock()
   913  	defer f.offsetMu.Unlock()
   914  
   915  	switch whence {
   916  	case linux.SEEK_SET:
   917  		if offset < 0 {
   918  			return f.offset, linuxerr.EINVAL
   919  		}
   920  		f.offset = offset
   921  
   922  	case linux.SEEK_CUR:
   923  		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
   924  		if offset > math.MaxInt64-f.offset {
   925  			return f.offset, linuxerr.EOVERFLOW
   926  		}
   927  		if f.offset+offset < 0 {
   928  			return f.offset, linuxerr.EINVAL
   929  		}
   930  		f.offset += offset
   931  
   932  	case linux.SEEK_END:
   933  		var s unix.Stat_t
   934  		if err := unix.Fstat(i.hostFD, &s); err != nil {
   935  			return f.offset, err
   936  		}
   937  		size := s.Size
   938  
   939  		// Check for overflow. Note that underflow cannot occur, since size >= 0.
   940  		if offset > math.MaxInt64-size {
   941  			return f.offset, linuxerr.EOVERFLOW
   942  		}
   943  		if size+offset < 0 {
   944  			return f.offset, linuxerr.EINVAL
   945  		}
   946  		f.offset = size + offset
   947  
   948  	case linux.SEEK_DATA, linux.SEEK_HOLE:
   949  		// Modifying the offset in the host file table should not matter, since
   950  		// this is the only place where we use it.
   951  		//
   952  		// For reading and writing, we always rely on our internal offset.
   953  		n, err := unix.Seek(i.hostFD, offset, int(whence))
   954  		if err != nil {
   955  			return f.offset, err
   956  		}
   957  		f.offset = n
   958  
   959  	default:
   960  		// Invalid whence.
   961  		return f.offset, linuxerr.EINVAL
   962  	}
   963  
   964  	return f.offset, nil
   965  }
   966  
   967  // Sync implements vfs.FileDescriptionImpl.Sync.
   968  func (f *fileDescription) Sync(ctx context.Context) error {
   969  	if f.inode.readonly {
   970  		return linuxerr.EPERM
   971  	}
   972  	// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
   973  	return unix.Fsync(f.inode.hostFD)
   974  }
   975  
   976  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
   977  func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
   978  	// NOTE(b/38213152): Technically, some obscure char devices can be memory
   979  	// mapped, but we only allow regular files.
   980  	if f.inode.ftype != unix.S_IFREG {
   981  		return linuxerr.ENODEV
   982  	}
   983  	i := f.inode
   984  	i.CachedMappable.InitFileMapperOnce()
   985  	return vfs.GenericConfigureMMap(&f.vfsfd, i, opts)
   986  }
   987  
   988  // EventRegister implements waiter.Waitable.EventRegister.
   989  func (f *fileDescription) EventRegister(e *waiter.Entry) error {
   990  	f.inode.queue.EventRegister(e)
   991  	if f.inode.epollable {
   992  		if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil {
   993  			f.inode.queue.EventUnregister(e)
   994  			return err
   995  		}
   996  	}
   997  	return nil
   998  }
   999  
  1000  // EventUnregister implements waiter.Waitable.EventUnregister.
  1001  func (f *fileDescription) EventUnregister(e *waiter.Entry) {
  1002  	f.inode.queue.EventUnregister(e)
  1003  	if f.inode.epollable {
  1004  		if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil {
  1005  			panic(fmt.Sprint("UpdateFD:", err))
  1006  		}
  1007  	}
  1008  }
  1009  
  1010  // Readiness uses the poll() syscall to check the status of the underlying FD.
  1011  func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
  1012  	return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
  1013  }
  1014  
  1015  // Epollable implements FileDescriptionImpl.Epollable.
  1016  func (f *fileDescription) Epollable() bool {
  1017  	return f.inode.epollable
  1018  }
  1019  
  1020  // Ioctl queries the underlying FD for allowed ioctl commands.
  1021  func (f *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  1022  	switch cmd := args[1].Int(); cmd {
  1023  	case linux.FIONREAD:
  1024  		v, err := ioctlFionread(f.inode.hostFD)
  1025  		if err != nil {
  1026  			return 0, err
  1027  		}
  1028  
  1029  		var buf [4]byte
  1030  		hostarch.ByteOrder.PutUint32(buf[:], v)
  1031  		_, err = uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
  1032  		return 0, err
  1033  	}
  1034  
  1035  	return f.FileDescriptionDefaultImpl.Ioctl(ctx, uio, sysno, args)
  1036  }