github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/host/host.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package host provides a filesystem implementation for host files imported as
    16  // file descriptors.
    17  package host
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/fdnotifier"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    31  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    32  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    33  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs"
    34  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/hostfd"
    35  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    36  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    37  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
    38  	unixsocket "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix"
    39  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix/transport"
    40  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/uniqueid"
    41  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    42  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    43  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    44  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    45  )
    46  
    47  // These are the modes that are stored with virtualOwner.
    48  const virtualOwnerModes = linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
    49  
    50  // +stateify savable
    51  type virtualOwner struct {
    52  	// This field is initialized at creation time and is immutable.
    53  	enabled bool
    54  
    55  	// mu protects the fields below and they can be accessed using atomic memory
    56  	// operations.
    57  	mu  sync.Mutex `state:"nosave"`
    58  	uid atomicbitops.Uint32
    59  	gid atomicbitops.Uint32
    60  	// mode is also stored, otherwise setting the host file to `0000` could remove
    61  	// access to the file.
    62  	mode atomicbitops.Uint32
    63  }
    64  
    65  func (v *virtualOwner) atomicUID() uint32 {
    66  	return v.uid.Load()
    67  }
    68  
    69  func (v *virtualOwner) atomicGID() uint32 {
    70  	return v.gid.Load()
    71  }
    72  
    73  func (v *virtualOwner) atomicMode() uint32 {
    74  	return v.mode.Load()
    75  }
    76  
    77  func isEpollable(fd int) bool {
    78  	epollfd, err := unix.EpollCreate1(0)
    79  	if err != nil {
    80  		// This shouldn't happen. If it does, just say file doesn't support epoll.
    81  		return false
    82  	}
    83  	defer unix.Close(epollfd)
    84  
    85  	event := unix.EpollEvent{
    86  		Fd:     int32(fd),
    87  		Events: unix.EPOLLIN,
    88  	}
    89  	err = unix.EpollCtl(epollfd, unix.EPOLL_CTL_ADD, fd, &event)
    90  	return err == nil
    91  }
    92  
    93  // inode implements kernfs.Inode.
    94  //
    95  // +stateify savable
    96  type inode struct {
    97  	kernfs.CachedMappable
    98  	kernfs.InodeNoStatFS
    99  	kernfs.InodeAnonymous // inode is effectively anonymous because it represents a donated FD.
   100  	kernfs.InodeNotDirectory
   101  	kernfs.InodeNotSymlink
   102  	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
   103  	kernfs.InodeWatches
   104  
   105  	locks vfs.FileLocks
   106  
   107  	// When the reference count reaches zero, the host fd is closed.
   108  	inodeRefs
   109  
   110  	// hostFD contains the host fd that this file was originally created from,
   111  	// which must be available at time of restore.
   112  	//
   113  	// This field is initialized at creation time and is immutable.
   114  	hostFD int
   115  
   116  	// ino is an inode number unique within this filesystem.
   117  	//
   118  	// This field is initialized at creation time and is immutable.
   119  	ino uint64
   120  
   121  	// ftype is the file's type (a linux.S_IFMT mask).
   122  	//
   123  	// This field is initialized at creation time and is immutable.
   124  	ftype uint16
   125  
   126  	// epollable indicates whether the hostFD can be used with epoll_ctl(2). This
   127  	// also indicates that hostFD has been set to non-blocking.
   128  	//
   129  	// This field is initialized at creation time and is immutable.
   130  	epollable bool
   131  
   132  	// seekable is false if lseek(hostFD) returns ESPIPE. We assume that file
   133  	// offsets are meaningful iff seekable is true.
   134  	//
   135  	// This field is initialized at creation time and is immutable.
   136  	seekable bool
   137  
   138  	// isTTY is true if this file represents a TTY.
   139  	//
   140  	// This field is initialized at creation time and is immutable.
   141  	isTTY bool
   142  
   143  	// savable is true if hostFD may be saved/restored by its numeric value.
   144  	//
   145  	// This field is initialized at creation time and is immutable.
   146  	savable bool
   147  
   148  	// readonly is true if operations that can potentially change the host file
   149  	// are blocked.
   150  	//
   151  	// This field is initialized at creation time and is immutable.
   152  	readonly bool
   153  
   154  	// Event queue for blocking operations.
   155  	queue waiter.Queue
   156  
   157  	// virtualOwner caches ownership and permission information to override the
   158  	// underlying file owner and permission. This is used to allow the unstrusted
   159  	// application to change these fields without affecting the host.
   160  	virtualOwner virtualOwner
   161  
   162  	// If haveBuf is non-zero, hostFD represents a pipe, and buf contains data
   163  	// read from the pipe from previous calls to inode.beforeSave(). haveBuf
   164  	// and buf are protected by bufMu.
   165  	bufMu   sync.Mutex `state:"nosave"`
   166  	haveBuf atomicbitops.Uint32
   167  	buf     []byte
   168  }
   169  
   170  func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool, readonly bool) (*inode, error) {
   171  	// Determine if hostFD is seekable.
   172  	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
   173  	seekable := !linuxerr.Equals(linuxerr.ESPIPE, err)
   174  	// We expect regular files to be seekable, as this is required for them to
   175  	// be memory-mappable.
   176  	if !seekable && fileType == unix.S_IFREG {
   177  		ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD)
   178  		return nil, linuxerr.ESPIPE
   179  	}
   180  
   181  	i := &inode{
   182  		hostFD:    hostFD,
   183  		ino:       fs.NextIno(),
   184  		ftype:     uint16(fileType),
   185  		epollable: isEpollable(hostFD),
   186  		seekable:  seekable,
   187  		isTTY:     isTTY,
   188  		savable:   savable,
   189  		readonly:  readonly,
   190  	}
   191  	i.InitRefs()
   192  	i.CachedMappable.Init(hostFD)
   193  
   194  	// If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and
   195  	// handle blocking behavior in the sentry.
   196  	if i.epollable {
   197  		if err := unix.SetNonblock(i.hostFD, true); err != nil {
   198  			return nil, err
   199  		}
   200  		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
   201  			return nil, err
   202  		}
   203  	}
   204  	return i, nil
   205  }
   206  
   207  // NewFDOptions contains options to NewFD.
   208  type NewFDOptions struct {
   209  	// If Savable is true, the host file descriptor may be saved/restored by
   210  	// numeric value; the sandbox API requires a corresponding host FD with the
   211  	// same numeric value to be provided at time of restore.
   212  	Savable bool
   213  
   214  	// If IsTTY is true, the file descriptor is a TTY.
   215  	IsTTY bool
   216  
   217  	// If HaveFlags is true, use Flags for the new file description. Otherwise,
   218  	// the new file description will inherit flags from hostFD.
   219  	HaveFlags bool
   220  	Flags     uint32
   221  
   222  	// VirtualOwner allow the host file to have owner and permissions different
   223  	// than the underlying host file.
   224  	VirtualOwner bool
   225  	UID          auth.KUID
   226  	GID          auth.KGID
   227  
   228  	// If Readonly is true, we disallow operations that can potentially change
   229  	// the host file associated with the file descriptor.
   230  	Readonly bool
   231  }
   232  
   233  // NewFD returns a vfs.FileDescription representing the given host file
   234  // descriptor. mnt must be Kernel.HostMount().
   235  func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) {
   236  	fs, ok := mnt.Filesystem().Impl().(*filesystem)
   237  	if !ok {
   238  		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
   239  	}
   240  
   241  	if opts.Readonly {
   242  		if opts.IsTTY {
   243  			// This is not a technical limitation, but access checks for TTYs
   244  			// have not been implemented yet.
   245  			return nil, fmt.Errorf("readonly file descriptor may currently not be a TTY")
   246  		}
   247  
   248  		flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0)
   249  		if err != nil {
   250  			return nil, err
   251  		}
   252  		accessMode := uint32(flagsInt) & unix.O_ACCMODE
   253  		if accessMode != unix.O_RDONLY {
   254  			return nil, fmt.Errorf("readonly file descriptor may only be opened as O_RDONLY on the host")
   255  		}
   256  	}
   257  
   258  	// Retrieve metadata.
   259  	var stat unix.Stat_t
   260  	if err := unix.Fstat(hostFD, &stat); err != nil {
   261  		return nil, err
   262  	}
   263  
   264  	flags := opts.Flags
   265  	if !opts.HaveFlags {
   266  		// Get flags for the imported FD.
   267  		flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0)
   268  		if err != nil {
   269  			return nil, err
   270  		}
   271  		flags = uint32(flagsInt)
   272  	}
   273  
   274  	fileType := linux.FileMode(stat.Mode).FileType()
   275  	i, err := newInode(ctx, fs, hostFD, opts.Savable, fileType, opts.IsTTY, opts.Readonly)
   276  	if err != nil {
   277  		return nil, err
   278  	}
   279  	if opts.VirtualOwner {
   280  		i.virtualOwner.enabled = true
   281  		i.virtualOwner.uid = atomicbitops.FromUint32(uint32(opts.UID))
   282  		i.virtualOwner.gid = atomicbitops.FromUint32(uint32(opts.GID))
   283  		i.virtualOwner.mode = atomicbitops.FromUint32(stat.Mode)
   284  	}
   285  
   286  	d := &kernfs.Dentry{}
   287  	d.Init(&fs.Filesystem, i)
   288  
   289  	// i.open will take a reference on d.
   290  	defer d.DecRef(ctx)
   291  
   292  	// For simplicity, fileDescription.offset is set to 0. Technically, we
   293  	// should only set to 0 on files that are not seekable (sockets, pipes,
   294  	// etc.), and use the offset from the host fd otherwise when importing.
   295  	return i.open(ctx, d, mnt, fileType, flags)
   296  }
   297  
   298  // filesystemType implements vfs.FilesystemType.
   299  //
   300  // +stateify savable
   301  type filesystemType struct{}
   302  
   303  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   304  func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   305  	panic("host.filesystemType.GetFilesystem should never be called")
   306  }
   307  
   308  // Name implements vfs.FilesystemType.Name.
   309  func (filesystemType) Name() string {
   310  	return "none"
   311  }
   312  
   313  // Release implements vfs.FilesystemType.Release.
   314  func (filesystemType) Release(ctx context.Context) {}
   315  
   316  // NewFilesystem sets up and returns a new hostfs filesystem.
   317  //
   318  // Note that there should only ever be one instance of host.filesystem,
   319  // a global mount for host fds.
   320  func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
   321  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   322  	if err != nil {
   323  		return nil, err
   324  	}
   325  	fs := &filesystem{
   326  		devMinor: devMinor,
   327  	}
   328  	fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
   329  	return fs.VFSFilesystem(), nil
   330  }
   331  
   332  // filesystem implements vfs.FilesystemImpl.
   333  //
   334  // +stateify savable
   335  type filesystem struct {
   336  	kernfs.Filesystem
   337  
   338  	devMinor uint32
   339  }
   340  
   341  func (fs *filesystem) Release(ctx context.Context) {
   342  	fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   343  	fs.Filesystem.Release(ctx)
   344  }
   345  
   346  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
   347  	d := vd.Dentry().Impl().(*kernfs.Dentry)
   348  	inode := d.Inode().(*inode)
   349  	b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino))
   350  	return vfs.PrependPathSyntheticError{}
   351  }
   352  
   353  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   354  func (fs *filesystem) MountOptions() string {
   355  	return ""
   356  }
   357  
   358  // CheckPermissions implements kernfs.Inode.CheckPermissions.
   359  func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
   360  	var s unix.Stat_t
   361  	if err := i.stat(&s); err != nil {
   362  		return err
   363  	}
   364  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
   365  }
   366  
   367  // Mode implements kernfs.Inode.Mode.
   368  func (i *inode) Mode() linux.FileMode {
   369  	var s unix.Stat_t
   370  	if err := i.stat(&s); err != nil {
   371  		// Retrieving the mode from the host fd using fstat(2) should not fail.
   372  		// If the syscall does not succeed, something is fundamentally wrong.
   373  		panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err))
   374  	}
   375  	return linux.FileMode(s.Mode)
   376  }
   377  
   378  // Mode implements kernfs.Inode.UID
   379  func (i *inode) UID() auth.KUID {
   380  	return auth.KUID(i.virtualOwner.uid.Load())
   381  }
   382  
   383  // Mode implements kernfs.Inode.GID
   384  func (i *inode) GID() auth.KGID {
   385  	return auth.KGID(i.virtualOwner.gid.Load())
   386  }
   387  
   388  // Stat implements kernfs.Inode.Stat.
   389  func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
   390  	if opts.Mask&linux.STATX__RESERVED != 0 {
   391  		return linux.Statx{}, linuxerr.EINVAL
   392  	}
   393  	if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
   394  		return linux.Statx{}, linuxerr.EINVAL
   395  	}
   396  
   397  	fs := vfsfs.Impl().(*filesystem)
   398  
   399  	// Limit our host call only to known flags.
   400  	mask := opts.Mask & linux.STATX_ALL
   401  	var s unix.Statx_t
   402  	err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s)
   403  	if linuxerr.Equals(linuxerr.ENOSYS, err) {
   404  		// Fallback to fstat(2), if statx(2) is not supported on the host.
   405  		//
   406  		// TODO(b/151263641): Remove fallback.
   407  		return i.statxFromStat(fs)
   408  	}
   409  	if err != nil {
   410  		return linux.Statx{}, err
   411  	}
   412  
   413  	// Unconditionally fill blksize, attributes, and device numbers, as
   414  	// indicated by /include/uapi/linux/stat.h. Inode number is always
   415  	// available, since we use our own rather than the host's.
   416  	ls := linux.Statx{
   417  		Mask:           linux.STATX_INO,
   418  		Blksize:        s.Blksize,
   419  		Attributes:     s.Attributes,
   420  		Ino:            i.ino,
   421  		AttributesMask: s.Attributes_mask,
   422  		DevMajor:       linux.UNNAMED_MAJOR,
   423  		DevMinor:       fs.devMinor,
   424  	}
   425  
   426  	// Copy other fields that were returned by the host. RdevMajor/RdevMinor
   427  	// are never copied (and therefore left as zero), so as not to expose host
   428  	// device numbers.
   429  	ls.Mask |= s.Mask & linux.STATX_ALL
   430  	if s.Mask&linux.STATX_TYPE != 0 {
   431  		if i.virtualOwner.enabled {
   432  			ls.Mode |= uint16(i.virtualOwner.atomicMode()) & linux.S_IFMT
   433  		} else {
   434  			ls.Mode |= s.Mode & linux.S_IFMT
   435  		}
   436  	}
   437  	if s.Mask&linux.STATX_MODE != 0 {
   438  		if i.virtualOwner.enabled {
   439  			ls.Mode |= uint16(i.virtualOwner.atomicMode()) &^ linux.S_IFMT
   440  		} else {
   441  			ls.Mode |= s.Mode &^ linux.S_IFMT
   442  		}
   443  	}
   444  	if s.Mask&linux.STATX_NLINK != 0 {
   445  		ls.Nlink = s.Nlink
   446  	}
   447  	if s.Mask&linux.STATX_UID != 0 {
   448  		if i.virtualOwner.enabled {
   449  			ls.UID = i.virtualOwner.atomicUID()
   450  		} else {
   451  			ls.UID = s.Uid
   452  		}
   453  	}
   454  	if s.Mask&linux.STATX_GID != 0 {
   455  		if i.virtualOwner.enabled {
   456  			ls.GID = i.virtualOwner.atomicGID()
   457  		} else {
   458  			ls.GID = s.Gid
   459  		}
   460  	}
   461  	if s.Mask&linux.STATX_ATIME != 0 {
   462  		ls.Atime = unixToLinuxStatxTimestamp(s.Atime)
   463  	}
   464  	if s.Mask&linux.STATX_BTIME != 0 {
   465  		ls.Btime = unixToLinuxStatxTimestamp(s.Btime)
   466  	}
   467  	if s.Mask&linux.STATX_CTIME != 0 {
   468  		ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime)
   469  	}
   470  	if s.Mask&linux.STATX_MTIME != 0 {
   471  		ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime)
   472  	}
   473  	if s.Mask&linux.STATX_SIZE != 0 {
   474  		ls.Size = s.Size
   475  	}
   476  	if s.Mask&linux.STATX_BLOCKS != 0 {
   477  		ls.Blocks = s.Blocks
   478  	}
   479  
   480  	return ls, nil
   481  }
   482  
   483  // statxFromStat is a best-effort fallback for inode.Stat() if the host does not
   484  // support statx(2).
   485  //
   486  // We ignore the mask and sync flags in opts and simply supply
   487  // STATX_BASIC_STATS, as fstat(2) itself does not allow the specification
   488  // of a mask or sync flags. fstat(2) does not provide any metadata
   489  // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so
   490  // those fields remain empty.
   491  func (i *inode) statxFromStat(fs *filesystem) (linux.Statx, error) {
   492  	var s unix.Stat_t
   493  	if err := i.stat(&s); err != nil {
   494  		return linux.Statx{}, err
   495  	}
   496  
   497  	// As with inode.Stat(), we always use internal device and inode numbers,
   498  	// and never expose the host's represented device numbers.
   499  	return linux.Statx{
   500  		Mask:     linux.STATX_BASIC_STATS,
   501  		Blksize:  uint32(s.Blksize),
   502  		Nlink:    uint32(s.Nlink),
   503  		UID:      s.Uid,
   504  		GID:      s.Gid,
   505  		Mode:     uint16(s.Mode),
   506  		Ino:      i.ino,
   507  		Size:     uint64(s.Size),
   508  		Blocks:   uint64(s.Blocks),
   509  		Atime:    timespecToStatxTimestamp(s.Atim),
   510  		Ctime:    timespecToStatxTimestamp(s.Ctim),
   511  		Mtime:    timespecToStatxTimestamp(s.Mtim),
   512  		DevMajor: linux.UNNAMED_MAJOR,
   513  		DevMinor: fs.devMinor,
   514  	}, nil
   515  }
   516  
   517  func (i *inode) stat(stat *unix.Stat_t) error {
   518  	if err := unix.Fstat(i.hostFD, stat); err != nil {
   519  		return err
   520  	}
   521  	if i.virtualOwner.enabled {
   522  		stat.Uid = i.virtualOwner.atomicUID()
   523  		stat.Gid = i.virtualOwner.atomicGID()
   524  		stat.Mode = i.virtualOwner.atomicMode()
   525  	}
   526  	return nil
   527  }
   528  
   529  // SetStat implements kernfs.Inode.SetStat.
   530  //
   531  // +checklocksignore
   532  func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
   533  	if i.readonly {
   534  		return linuxerr.EPERM
   535  	}
   536  
   537  	s := &opts.Stat
   538  
   539  	m := s.Mask
   540  	if m == 0 {
   541  		return nil
   542  	}
   543  	supportedModes := uint32(linux.STATX_MODE | linux.STATX_SIZE | linux.STATX_ATIME | linux.STATX_MTIME)
   544  	if i.virtualOwner.enabled {
   545  		if m&virtualOwnerModes != 0 {
   546  			// Take lock if any of the virtual owner fields will be updated.
   547  			i.virtualOwner.mu.Lock()
   548  			defer i.virtualOwner.mu.Unlock()
   549  		}
   550  
   551  		supportedModes |= virtualOwnerModes
   552  	}
   553  	if m&^supportedModes != 0 {
   554  		return linuxerr.EPERM
   555  	}
   556  
   557  	var hostStat unix.Stat_t
   558  	if err := i.stat(&hostStat); err != nil {
   559  		return err
   560  	}
   561  	if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
   562  		return err
   563  	}
   564  
   565  	if m&linux.STATX_MODE != 0 {
   566  		if i.virtualOwner.enabled {
   567  			// We hold i.virtualOwner.mu.
   568  			i.virtualOwner.mode = atomicbitops.FromUint32(uint32(opts.Stat.Mode))
   569  		} else {
   570  			log.Warningf("sentry seccomp filters don't allow making fchmod(2) syscall")
   571  			return unix.EPERM
   572  		}
   573  	}
   574  	if m&linux.STATX_SIZE != 0 {
   575  		if hostStat.Mode&linux.S_IFMT != linux.S_IFREG {
   576  			return linuxerr.EINVAL
   577  		}
   578  		if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
   579  			return err
   580  		}
   581  		oldSize := uint64(hostStat.Size)
   582  		if s.Size < oldSize {
   583  			oldpgend, _ := hostarch.PageRoundUp(oldSize)
   584  			newpgend, _ := hostarch.PageRoundUp(s.Size)
   585  			if oldpgend != newpgend {
   586  				i.CachedMappable.InvalidateRange(memmap.MappableRange{newpgend, oldpgend})
   587  			}
   588  		}
   589  	}
   590  	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
   591  		ts := [2]unix.Timespec{
   592  			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
   593  			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
   594  		}
   595  		if err := setTimestamps(i.hostFD, &ts); err != nil {
   596  			return err
   597  		}
   598  	}
   599  	if i.virtualOwner.enabled {
   600  		if m&linux.STATX_UID != 0 {
   601  			// We hold i.virtualOwner.mu.
   602  			i.virtualOwner.uid = atomicbitops.FromUint32(opts.Stat.UID)
   603  		}
   604  		if m&linux.STATX_GID != 0 {
   605  			// We hold i.virtualOwner.mu.
   606  			i.virtualOwner.gid = atomicbitops.FromUint32(opts.Stat.GID)
   607  		}
   608  	}
   609  	return nil
   610  }
   611  
   612  // DecRef implements kernfs.Inode.DecRef.
   613  func (i *inode) DecRef(ctx context.Context) {
   614  	i.inodeRefs.DecRef(func() {
   615  		if i.epollable {
   616  			fdnotifier.RemoveFD(int32(i.hostFD))
   617  		}
   618  		if err := unix.Close(i.hostFD); err != nil {
   619  			log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
   620  		}
   621  		// We can't rely on fdnotifier when closing the fd, because the event may race
   622  		// with fdnotifier.RemoveFD. Instead, notify the queue explicitly.
   623  		i.queue.Notify(waiter.EventHUp | waiter.ReadableEvents | waiter.WritableEvents)
   624  	})
   625  }
   626  
   627  // Open implements kernfs.Inode.Open.
   628  func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   629  	// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
   630  	if i.Mode().FileType() == linux.S_IFSOCK {
   631  		return nil, linuxerr.ENXIO
   632  	}
   633  	var stat unix.Stat_t
   634  	if err := i.stat(&stat); err != nil {
   635  		return nil, err
   636  	}
   637  	fileType := linux.FileMode(stat.Mode).FileType()
   638  	return i.open(ctx, d, rp.Mount(), fileType, opts.Flags)
   639  }
   640  
   641  func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, fileType linux.FileMode, flags uint32) (*vfs.FileDescription, error) {
   642  	// Constrain flags to a subset we can handle.
   643  	//
   644  	// TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls.
   645  	flags &= unix.O_ACCMODE | unix.O_NONBLOCK | unix.O_DSYNC | unix.O_SYNC | unix.O_APPEND
   646  
   647  	switch fileType {
   648  	case unix.S_IFSOCK:
   649  		if i.isTTY {
   650  			log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
   651  			return nil, linuxerr.ENOTTY
   652  		}
   653  
   654  		ep, err := newEndpoint(ctx, i.hostFD, &i.queue)
   655  		if err != nil {
   656  			return nil, err
   657  		}
   658  		// Currently, we only allow Unix sockets to be imported.
   659  		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks)
   660  
   661  	case unix.S_IFREG, unix.S_IFIFO, unix.S_IFCHR:
   662  		if i.isTTY {
   663  			fd := &TTYFileDescription{
   664  				fileDescription: fileDescription{inode: i},
   665  				termios:         linux.DefaultReplicaTermios,
   666  			}
   667  			if task := kernel.TaskFromContext(ctx); task != nil {
   668  				fd.fgProcessGroup = task.ThreadGroup().ProcessGroup()
   669  				fd.session = fd.fgProcessGroup.Session()
   670  			}
   671  			fd.LockFD.Init(&i.locks)
   672  			vfsfd := &fd.vfsfd
   673  			if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   674  				return nil, err
   675  			}
   676  			return vfsfd, nil
   677  		}
   678  
   679  		fd := &fileDescription{inode: i}
   680  		fd.LockFD.Init(&i.locks)
   681  		vfsfd := &fd.vfsfd
   682  		if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   683  			return nil, err
   684  		}
   685  		return vfsfd, nil
   686  
   687  	default:
   688  		log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType)
   689  		return nil, linuxerr.EPERM
   690  	}
   691  }
   692  
   693  // Create a new host-backed endpoint from the given fd and its corresponding
   694  // notification queue.
   695  func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) {
   696  	// Set up an external transport.Endpoint using the host fd.
   697  	addr := fmt.Sprintf("hostfd:[%d]", hostFD)
   698  	e, err := transport.NewHostConnectedEndpoint(hostFD, addr)
   699  	if err != nil {
   700  		return nil, err.ToError()
   701  	}
   702  	ep := transport.NewExternal(e.SockType(), uniqueid.GlobalProviderFromContext(ctx), queue, e, e)
   703  	return ep, nil
   704  }
   705  
   706  // fileDescription is embedded by host fd implementations of FileDescriptionImpl.
   707  //
   708  // +stateify savable
   709  type fileDescription struct {
   710  	vfsfd vfs.FileDescription
   711  	vfs.FileDescriptionDefaultImpl
   712  	vfs.LockFD
   713  
   714  	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
   715  	// cached to reduce indirections and casting. fileDescription does not hold
   716  	// a reference on the inode through the inode field (since one is already
   717  	// held via the Dentry).
   718  	//
   719  	// inode is immutable after fileDescription creation.
   720  	inode *inode
   721  
   722  	// offsetMu protects offset.
   723  	offsetMu sync.Mutex `state:"nosave"`
   724  
   725  	// offset specifies the current file offset. It is only meaningful when
   726  	// inode.seekable is true.
   727  	offset int64
   728  }
   729  
   730  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   731  func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   732  	creds := auth.CredentialsFromContext(ctx)
   733  	return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
   734  }
   735  
   736  // Stat implements vfs.FileDescriptionImpl.Stat.
   737  func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   738  	return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
   739  }
   740  
   741  // Release implements vfs.FileDescriptionImpl.Release.
   742  func (f *fileDescription) Release(context.Context) {
   743  	// noop
   744  }
   745  
   746  // Allocate implements vfs.FileDescriptionImpl.Allocate.
   747  func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
   748  	if f.inode.readonly {
   749  		return linuxerr.EPERM
   750  	}
   751  	return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
   752  }
   753  
   754  // PRead implements vfs.FileDescriptionImpl.PRead.
   755  func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   756  	// Check that flags are supported.
   757  	//
   758  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   759  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   760  		return 0, linuxerr.EOPNOTSUPP
   761  	}
   762  
   763  	i := f.inode
   764  	if !i.seekable {
   765  		return 0, linuxerr.ESPIPE
   766  	}
   767  
   768  	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
   769  }
   770  
   771  // Read implements vfs.FileDescriptionImpl.Read.
   772  func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   773  	// Check that flags are supported.
   774  	//
   775  	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
   776  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   777  		return 0, linuxerr.EOPNOTSUPP
   778  	}
   779  
   780  	i := f.inode
   781  	if !i.seekable {
   782  		bufN, err := i.readFromBuf(ctx, &dst)
   783  		if err != nil {
   784  			return bufN, err
   785  		}
   786  		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
   787  		total := bufN + n
   788  		if isBlockError(err) {
   789  			// If we got any data at all, return it as a "completed" partial read
   790  			// rather than retrying until complete.
   791  			if total != 0 {
   792  				err = nil
   793  			} else {
   794  				err = linuxerr.ErrWouldBlock
   795  			}
   796  		}
   797  		return total, err
   798  	}
   799  
   800  	f.offsetMu.Lock()
   801  	n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
   802  	f.offset += n
   803  	f.offsetMu.Unlock()
   804  	return n, err
   805  }
   806  
   807  func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) {
   808  	if i.haveBuf.Load() == 0 {
   809  		return 0, nil
   810  	}
   811  	i.bufMu.Lock()
   812  	defer i.bufMu.Unlock()
   813  	if len(i.buf) == 0 {
   814  		return 0, nil
   815  	}
   816  	n, err := dst.CopyOut(ctx, i.buf)
   817  	*dst = dst.DropFirst(n)
   818  	i.buf = i.buf[n:]
   819  	if len(i.buf) == 0 {
   820  		i.haveBuf.Store(0)
   821  		i.buf = nil
   822  	}
   823  	return int64(n), err
   824  }
   825  
   826  func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
   827  	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
   828  	n, err := dst.CopyOutFrom(ctx, reader)
   829  	hostfd.PutReadWriterAt(reader)
   830  	return int64(n), err
   831  }
   832  
   833  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   834  func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   835  	if !f.inode.seekable {
   836  		return 0, linuxerr.ESPIPE
   837  	}
   838  
   839  	return f.writeToHostFD(ctx, src, offset, opts.Flags)
   840  }
   841  
   842  // Write implements vfs.FileDescriptionImpl.Write.
   843  func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   844  	i := f.inode
   845  	if !i.seekable {
   846  		n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
   847  		if isBlockError(err) {
   848  			err = linuxerr.ErrWouldBlock
   849  		}
   850  		return n, err
   851  	}
   852  
   853  	f.offsetMu.Lock()
   854  	// NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if
   855  	// another process modifies the host file between retrieving the file size
   856  	// and writing to the host fd. This is an unavoidable race condition because
   857  	// we cannot enforce synchronization on the host.
   858  	if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
   859  		var s unix.Stat_t
   860  		if err := unix.Fstat(i.hostFD, &s); err != nil {
   861  			f.offsetMu.Unlock()
   862  			return 0, err
   863  		}
   864  		f.offset = s.Size
   865  	}
   866  	n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags)
   867  	f.offset += n
   868  	f.offsetMu.Unlock()
   869  	return n, err
   870  }
   871  
   872  func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
   873  	if f.inode.readonly {
   874  		return 0, linuxerr.EPERM
   875  	}
   876  	hostFD := f.inode.hostFD
   877  	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
   878  	if flags != 0 {
   879  		return 0, linuxerr.EOPNOTSUPP
   880  	}
   881  	writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
   882  	n, err := src.CopyInTo(ctx, writer)
   883  	hostfd.PutReadWriterAt(writer)
   884  	// NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC.
   885  	if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
   886  		if syncErr := unix.Fsync(hostFD); syncErr != nil {
   887  			return int64(n), syncErr
   888  		}
   889  	}
   890  	return int64(n), err
   891  }
   892  
   893  // Seek implements vfs.FileDescriptionImpl.Seek.
   894  //
   895  // Note that we do not support seeking on directories, since we do not even
   896  // allow directory fds to be imported at all.
   897  func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
   898  	i := f.inode
   899  	if !i.seekable {
   900  		return 0, linuxerr.ESPIPE
   901  	}
   902  
   903  	f.offsetMu.Lock()
   904  	defer f.offsetMu.Unlock()
   905  
   906  	switch whence {
   907  	case linux.SEEK_SET:
   908  		if offset < 0 {
   909  			return f.offset, linuxerr.EINVAL
   910  		}
   911  		f.offset = offset
   912  
   913  	case linux.SEEK_CUR:
   914  		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
   915  		if offset > math.MaxInt64-f.offset {
   916  			return f.offset, linuxerr.EOVERFLOW
   917  		}
   918  		if f.offset+offset < 0 {
   919  			return f.offset, linuxerr.EINVAL
   920  		}
   921  		f.offset += offset
   922  
   923  	case linux.SEEK_END:
   924  		var s unix.Stat_t
   925  		if err := unix.Fstat(i.hostFD, &s); err != nil {
   926  			return f.offset, err
   927  		}
   928  		size := s.Size
   929  
   930  		// Check for overflow. Note that underflow cannot occur, since size >= 0.
   931  		if offset > math.MaxInt64-size {
   932  			return f.offset, linuxerr.EOVERFLOW
   933  		}
   934  		if size+offset < 0 {
   935  			return f.offset, linuxerr.EINVAL
   936  		}
   937  		f.offset = size + offset
   938  
   939  	case linux.SEEK_DATA, linux.SEEK_HOLE:
   940  		// Modifying the offset in the host file table should not matter, since
   941  		// this is the only place where we use it.
   942  		//
   943  		// For reading and writing, we always rely on our internal offset.
   944  		n, err := unix.Seek(i.hostFD, offset, int(whence))
   945  		if err != nil {
   946  			return f.offset, err
   947  		}
   948  		f.offset = n
   949  
   950  	default:
   951  		// Invalid whence.
   952  		return f.offset, linuxerr.EINVAL
   953  	}
   954  
   955  	return f.offset, nil
   956  }
   957  
   958  // Sync implements vfs.FileDescriptionImpl.Sync.
   959  func (f *fileDescription) Sync(ctx context.Context) error {
   960  	if f.inode.readonly {
   961  		return linuxerr.EPERM
   962  	}
   963  	// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
   964  	return unix.Fsync(f.inode.hostFD)
   965  }
   966  
   967  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
   968  func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
   969  	// NOTE(b/38213152): Technically, some obscure char devices can be memory
   970  	// mapped, but we only allow regular files.
   971  	if f.inode.ftype != unix.S_IFREG {
   972  		return linuxerr.ENODEV
   973  	}
   974  	i := f.inode
   975  	i.CachedMappable.InitFileMapperOnce()
   976  	return vfs.GenericConfigureMMap(&f.vfsfd, i, opts)
   977  }
   978  
   979  // EventRegister implements waiter.Waitable.EventRegister.
   980  func (f *fileDescription) EventRegister(e *waiter.Entry) error {
   981  	f.inode.queue.EventRegister(e)
   982  	if f.inode.epollable {
   983  		if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil {
   984  			f.inode.queue.EventUnregister(e)
   985  			return err
   986  		}
   987  	}
   988  	return nil
   989  }
   990  
   991  // EventUnregister implements waiter.Waitable.EventUnregister.
   992  func (f *fileDescription) EventUnregister(e *waiter.Entry) {
   993  	f.inode.queue.EventUnregister(e)
   994  	if f.inode.epollable {
   995  		if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil {
   996  			panic(fmt.Sprint("UpdateFD:", err))
   997  		}
   998  	}
   999  }
  1000  
  1001  // Readiness uses the poll() syscall to check the status of the underlying FD.
  1002  func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
  1003  	return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
  1004  }
  1005  
  1006  // Epollable implements FileDescriptionImpl.Epollable.
  1007  func (f *fileDescription) Epollable() bool {
  1008  	return f.inode.epollable
  1009  }
  1010  
  1011  // Ioctl queries the underlying FD for allowed ioctl commands.
  1012  func (f *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  1013  	switch cmd := args[1].Int(); cmd {
  1014  	case linux.FIONREAD:
  1015  		v, err := ioctlFionread(f.inode.hostFD)
  1016  		if err != nil {
  1017  			return 0, err
  1018  		}
  1019  
  1020  		var buf [4]byte
  1021  		hostarch.ByteOrder.PutUint32(buf[:], v)
  1022  		_, err = uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
  1023  		return 0, err
  1024  	}
  1025  
  1026  	return f.FileDescriptionDefaultImpl.Ioctl(ctx, uio, sysno, args)
  1027  }