github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/host/host.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package host provides a filesystem implementation for host files imported as
    16  // file descriptors.
    17  package host
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"sync/atomic"
    23  
    24  	"golang.org/x/sys/unix"
    25  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    26  	"github.com/SagerNet/gvisor/pkg/context"
    27  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    28  	"github.com/SagerNet/gvisor/pkg/fdnotifier"
    29  	"github.com/SagerNet/gvisor/pkg/fspath"
    30  	"github.com/SagerNet/gvisor/pkg/hostarch"
    31  	"github.com/SagerNet/gvisor/pkg/log"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/kernfs"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/hostfd"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    36  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    37  	unixsocket "github.com/SagerNet/gvisor/pkg/sentry/socket/unix"
    38  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    39  	"github.com/SagerNet/gvisor/pkg/sync"
    40  	"github.com/SagerNet/gvisor/pkg/syserror"
    41  	"github.com/SagerNet/gvisor/pkg/usermem"
    42  	"github.com/SagerNet/gvisor/pkg/waiter"
    43  )
    44  
    45  // inode implements kernfs.Inode.
    46  //
    47  // +stateify savable
    48  type inode struct {
    49  	kernfs.InodeNoStatFS
    50  	kernfs.InodeNotDirectory
    51  	kernfs.InodeNotSymlink
    52  	kernfs.CachedMappable
    53  	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
    54  
    55  	locks vfs.FileLocks
    56  
    57  	// When the reference count reaches zero, the host fd is closed.
    58  	inodeRefs
    59  
    60  	// hostFD contains the host fd that this file was originally created from,
    61  	// which must be available at time of restore.
    62  	//
    63  	// This field is initialized at creation time and is immutable.
    64  	hostFD int
    65  
    66  	// ino is an inode number unique within this filesystem.
    67  	//
    68  	// This field is initialized at creation time and is immutable.
    69  	ino uint64
    70  
    71  	// ftype is the file's type (a linux.S_IFMT mask).
    72  	//
    73  	// This field is initialized at creation time and is immutable.
    74  	ftype uint16
    75  
    76  	// mayBlock is true if hostFD is non-blocking, and operations on it may
    77  	// return EAGAIN or EWOULDBLOCK instead of blocking.
    78  	//
    79  	// This field is initialized at creation time and is immutable.
    80  	mayBlock bool
    81  
    82  	// seekable is false if lseek(hostFD) returns ESPIPE. We assume that file
    83  	// offsets are meaningful iff seekable is true.
    84  	//
    85  	// This field is initialized at creation time and is immutable.
    86  	seekable bool
    87  
    88  	// isTTY is true if this file represents a TTY.
    89  	//
    90  	// This field is initialized at creation time and is immutable.
    91  	isTTY bool
    92  
    93  	// savable is true if hostFD may be saved/restored by its numeric value.
    94  	//
    95  	// This field is initialized at creation time and is immutable.
    96  	savable bool
    97  
    98  	// Event queue for blocking operations.
    99  	queue waiter.Queue
   100  
   101  	// If haveBuf is non-zero, hostFD represents a pipe, and buf contains data
   102  	// read from the pipe from previous calls to inode.beforeSave(). haveBuf
   103  	// and buf are protected by bufMu. haveBuf is accessed using atomic memory
   104  	// operations.
   105  	bufMu   sync.Mutex `state:"nosave"`
   106  	haveBuf uint32
   107  	buf     []byte
   108  }
   109  
   110  func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool) (*inode, error) {
   111  	// Determine if hostFD is seekable.
   112  	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
   113  	seekable := !linuxerr.Equals(linuxerr.ESPIPE, err)
   114  	// We expect regular files to be seekable, as this is required for them to
   115  	// be memory-mappable.
   116  	if !seekable && fileType == unix.S_IFREG {
   117  		ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD)
   118  		return nil, linuxerr.ESPIPE
   119  	}
   120  
   121  	i := &inode{
   122  		hostFD:   hostFD,
   123  		ino:      fs.NextIno(),
   124  		ftype:    uint16(fileType),
   125  		mayBlock: fileType != unix.S_IFREG && fileType != unix.S_IFDIR,
   126  		seekable: seekable,
   127  		isTTY:    isTTY,
   128  		savable:  savable,
   129  	}
   130  	i.InitRefs()
   131  	i.CachedMappable.Init(hostFD)
   132  
   133  	// If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and
   134  	// handle blocking behavior in the sentry.
   135  	if i.mayBlock {
   136  		if err := unix.SetNonblock(i.hostFD, true); err != nil {
   137  			return nil, err
   138  		}
   139  		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
   140  			return nil, err
   141  		}
   142  	}
   143  	return i, nil
   144  }
   145  
   146  // NewFDOptions contains options to NewFD.
   147  type NewFDOptions struct {
   148  	// If Savable is true, the host file descriptor may be saved/restored by
   149  	// numeric value; the sandbox API requires a corresponding host FD with the
   150  	// same numeric value to be provieded at time of restore.
   151  	Savable bool
   152  
   153  	// If IsTTY is true, the file descriptor is a TTY.
   154  	IsTTY bool
   155  
   156  	// If HaveFlags is true, use Flags for the new file description. Otherwise,
   157  	// the new file description will inherit flags from hostFD.
   158  	HaveFlags bool
   159  	Flags     uint32
   160  }
   161  
   162  // NewFD returns a vfs.FileDescription representing the given host file
   163  // descriptor. mnt must be Kernel.HostMount().
   164  func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) {
   165  	fs, ok := mnt.Filesystem().Impl().(*filesystem)
   166  	if !ok {
   167  		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
   168  	}
   169  
   170  	// Retrieve metadata.
   171  	var s unix.Stat_t
   172  	if err := unix.Fstat(hostFD, &s); err != nil {
   173  		return nil, err
   174  	}
   175  
   176  	flags := opts.Flags
   177  	if !opts.HaveFlags {
   178  		// Get flags for the imported FD.
   179  		flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0)
   180  		if err != nil {
   181  			return nil, err
   182  		}
   183  		flags = uint32(flagsInt)
   184  	}
   185  
   186  	d := &kernfs.Dentry{}
   187  	i, err := newInode(ctx, fs, hostFD, opts.Savable, linux.FileMode(s.Mode).FileType(), opts.IsTTY)
   188  	if err != nil {
   189  		return nil, err
   190  	}
   191  	d.Init(&fs.Filesystem, i)
   192  
   193  	// i.open will take a reference on d.
   194  	defer d.DecRef(ctx)
   195  
   196  	// For simplicity, fileDescription.offset is set to 0. Technically, we
   197  	// should only set to 0 on files that are not seekable (sockets, pipes,
   198  	// etc.), and use the offset from the host fd otherwise when importing.
   199  	return i.open(ctx, d, mnt, flags)
   200  }
   201  
   202  // ImportFD sets up and returns a vfs.FileDescription from a donated fd.
   203  func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
   204  	return NewFD(ctx, mnt, hostFD, &NewFDOptions{
   205  		Savable: true,
   206  		IsTTY:   isTTY,
   207  	})
   208  }
   209  
   210  // filesystemType implements vfs.FilesystemType.
   211  //
   212  // +stateify savable
   213  type filesystemType struct{}
   214  
   215  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   216  func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   217  	panic("host.filesystemType.GetFilesystem should never be called")
   218  }
   219  
   220  // Name implements vfs.FilesystemType.Name.
   221  func (filesystemType) Name() string {
   222  	return "none"
   223  }
   224  
   225  // Release implements vfs.FilesystemType.Release.
   226  func (filesystemType) Release(ctx context.Context) {}
   227  
   228  // NewFilesystem sets up and returns a new hostfs filesystem.
   229  //
   230  // Note that there should only ever be one instance of host.filesystem,
   231  // a global mount for host fds.
   232  func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
   233  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   234  	if err != nil {
   235  		return nil, err
   236  	}
   237  	fs := &filesystem{
   238  		devMinor: devMinor,
   239  	}
   240  	fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
   241  	return fs.VFSFilesystem(), nil
   242  }
   243  
   244  // filesystem implements vfs.FilesystemImpl.
   245  //
   246  // +stateify savable
   247  type filesystem struct {
   248  	kernfs.Filesystem
   249  
   250  	devMinor uint32
   251  }
   252  
   253  func (fs *filesystem) Release(ctx context.Context) {
   254  	fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   255  	fs.Filesystem.Release(ctx)
   256  }
   257  
   258  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
   259  	d := vd.Dentry().Impl().(*kernfs.Dentry)
   260  	inode := d.Inode().(*inode)
   261  	b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino))
   262  	return vfs.PrependPathSyntheticError{}
   263  }
   264  
   265  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   266  func (fs *filesystem) MountOptions() string {
   267  	return ""
   268  }
   269  
   270  // CheckPermissions implements kernfs.Inode.CheckPermissions.
   271  func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
   272  	var s unix.Stat_t
   273  	if err := unix.Fstat(i.hostFD, &s); err != nil {
   274  		return err
   275  	}
   276  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
   277  }
   278  
   279  // Mode implements kernfs.Inode.Mode.
   280  func (i *inode) Mode() linux.FileMode {
   281  	var s unix.Stat_t
   282  	if err := unix.Fstat(i.hostFD, &s); err != nil {
   283  		// Retrieving the mode from the host fd using fstat(2) should not fail.
   284  		// If the syscall does not succeed, something is fundamentally wrong.
   285  		panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err))
   286  	}
   287  	return linux.FileMode(s.Mode)
   288  }
   289  
   290  // Stat implements kernfs.Inode.Stat.
   291  func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
   292  	if opts.Mask&linux.STATX__RESERVED != 0 {
   293  		return linux.Statx{}, linuxerr.EINVAL
   294  	}
   295  	if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
   296  		return linux.Statx{}, linuxerr.EINVAL
   297  	}
   298  
   299  	fs := vfsfs.Impl().(*filesystem)
   300  
   301  	// Limit our host call only to known flags.
   302  	mask := opts.Mask & linux.STATX_ALL
   303  	var s unix.Statx_t
   304  	err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s)
   305  	if linuxerr.Equals(linuxerr.ENOSYS, err) {
   306  		// Fallback to fstat(2), if statx(2) is not supported on the host.
   307  		//
   308  		// TODO(b/151263641): Remove fallback.
   309  		return i.fstat(fs)
   310  	}
   311  	if err != nil {
   312  		return linux.Statx{}, err
   313  	}
   314  
   315  	// Unconditionally fill blksize, attributes, and device numbers, as
   316  	// indicated by /include/uapi/linux/stat.h. Inode number is always
   317  	// available, since we use our own rather than the host's.
   318  	ls := linux.Statx{
   319  		Mask:           linux.STATX_INO,
   320  		Blksize:        s.Blksize,
   321  		Attributes:     s.Attributes,
   322  		Ino:            i.ino,
   323  		AttributesMask: s.Attributes_mask,
   324  		DevMajor:       linux.UNNAMED_MAJOR,
   325  		DevMinor:       fs.devMinor,
   326  	}
   327  
   328  	// Copy other fields that were returned by the host. RdevMajor/RdevMinor
   329  	// are never copied (and therefore left as zero), so as not to expose host
   330  	// device numbers.
   331  	ls.Mask |= s.Mask & linux.STATX_ALL
   332  	if s.Mask&linux.STATX_TYPE != 0 {
   333  		ls.Mode |= s.Mode & linux.S_IFMT
   334  	}
   335  	if s.Mask&linux.STATX_MODE != 0 {
   336  		ls.Mode |= s.Mode &^ linux.S_IFMT
   337  	}
   338  	if s.Mask&linux.STATX_NLINK != 0 {
   339  		ls.Nlink = s.Nlink
   340  	}
   341  	if s.Mask&linux.STATX_UID != 0 {
   342  		ls.UID = s.Uid
   343  	}
   344  	if s.Mask&linux.STATX_GID != 0 {
   345  		ls.GID = s.Gid
   346  	}
   347  	if s.Mask&linux.STATX_ATIME != 0 {
   348  		ls.Atime = unixToLinuxStatxTimestamp(s.Atime)
   349  	}
   350  	if s.Mask&linux.STATX_BTIME != 0 {
   351  		ls.Btime = unixToLinuxStatxTimestamp(s.Btime)
   352  	}
   353  	if s.Mask&linux.STATX_CTIME != 0 {
   354  		ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime)
   355  	}
   356  	if s.Mask&linux.STATX_MTIME != 0 {
   357  		ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime)
   358  	}
   359  	if s.Mask&linux.STATX_SIZE != 0 {
   360  		ls.Size = s.Size
   361  	}
   362  	if s.Mask&linux.STATX_BLOCKS != 0 {
   363  		ls.Blocks = s.Blocks
   364  	}
   365  
   366  	return ls, nil
   367  }
   368  
   369  // fstat is a best-effort fallback for inode.Stat() if the host does not
   370  // support statx(2).
   371  //
   372  // We ignore the mask and sync flags in opts and simply supply
   373  // STATX_BASIC_STATS, as fstat(2) itself does not allow the specification
   374  // of a mask or sync flags. fstat(2) does not provide any metadata
   375  // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so
   376  // those fields remain empty.
   377  func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
   378  	var s unix.Stat_t
   379  	if err := unix.Fstat(i.hostFD, &s); err != nil {
   380  		return linux.Statx{}, err
   381  	}
   382  
   383  	// As with inode.Stat(), we always use internal device and inode numbers,
   384  	// and never expose the host's represented device numbers.
   385  	return linux.Statx{
   386  		Mask:     linux.STATX_BASIC_STATS,
   387  		Blksize:  uint32(s.Blksize),
   388  		Nlink:    uint32(s.Nlink),
   389  		UID:      s.Uid,
   390  		GID:      s.Gid,
   391  		Mode:     uint16(s.Mode),
   392  		Ino:      i.ino,
   393  		Size:     uint64(s.Size),
   394  		Blocks:   uint64(s.Blocks),
   395  		Atime:    timespecToStatxTimestamp(s.Atim),
   396  		Ctime:    timespecToStatxTimestamp(s.Ctim),
   397  		Mtime:    timespecToStatxTimestamp(s.Mtim),
   398  		DevMajor: linux.UNNAMED_MAJOR,
   399  		DevMinor: fs.devMinor,
   400  	}, nil
   401  }
   402  
   403  // SetStat implements kernfs.Inode.SetStat.
   404  func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
   405  	s := &opts.Stat
   406  
   407  	m := s.Mask
   408  	if m == 0 {
   409  		return nil
   410  	}
   411  	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
   412  		return linuxerr.EPERM
   413  	}
   414  	var hostStat unix.Stat_t
   415  	if err := unix.Fstat(i.hostFD, &hostStat); err != nil {
   416  		return err
   417  	}
   418  	if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
   419  		return err
   420  	}
   421  
   422  	if m&linux.STATX_MODE != 0 {
   423  		if err := unix.Fchmod(i.hostFD, uint32(s.Mode)); err != nil {
   424  			return err
   425  		}
   426  	}
   427  	if m&linux.STATX_SIZE != 0 {
   428  		if hostStat.Mode&linux.S_IFMT != linux.S_IFREG {
   429  			return linuxerr.EINVAL
   430  		}
   431  		if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
   432  			return err
   433  		}
   434  		oldSize := uint64(hostStat.Size)
   435  		if s.Size < oldSize {
   436  			oldpgend, _ := hostarch.PageRoundUp(oldSize)
   437  			newpgend, _ := hostarch.PageRoundUp(s.Size)
   438  			if oldpgend != newpgend {
   439  				i.CachedMappable.InvalidateRange(memmap.MappableRange{newpgend, oldpgend})
   440  			}
   441  		}
   442  	}
   443  	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
   444  		ts := [2]unix.Timespec{
   445  			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
   446  			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
   447  		}
   448  		if err := setTimestamps(i.hostFD, &ts); err != nil {
   449  			return err
   450  		}
   451  	}
   452  	return nil
   453  }
   454  
   455  // DecRef implements kernfs.Inode.DecRef.
   456  func (i *inode) DecRef(ctx context.Context) {
   457  	i.inodeRefs.DecRef(func() {
   458  		if i.mayBlock {
   459  			fdnotifier.RemoveFD(int32(i.hostFD))
   460  		}
   461  		if err := unix.Close(i.hostFD); err != nil {
   462  			log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
   463  		}
   464  		// We can't rely on fdnotifier when closing the fd, because the event may race
   465  		// with fdnotifier.RemoveFD. Instead, notify the queue explicitly.
   466  		i.queue.Notify(waiter.EventHUp | waiter.ReadableEvents | waiter.WritableEvents)
   467  	})
   468  }
   469  
   470  // Open implements kernfs.Inode.Open.
   471  func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   472  	// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
   473  	if i.Mode().FileType() == linux.S_IFSOCK {
   474  		return nil, linuxerr.ENXIO
   475  	}
   476  	return i.open(ctx, d, rp.Mount(), opts.Flags)
   477  }
   478  
   479  func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
   480  	var s unix.Stat_t
   481  	if err := unix.Fstat(i.hostFD, &s); err != nil {
   482  		return nil, err
   483  	}
   484  	fileType := s.Mode & linux.FileTypeMask
   485  
   486  	// Constrain flags to a subset we can handle.
   487  	//
   488  	// TODO(github.com/SagerNet/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls.
   489  	flags &= unix.O_ACCMODE | unix.O_NONBLOCK | unix.O_DSYNC | unix.O_SYNC | unix.O_APPEND
   490  
   491  	switch fileType {
   492  	case unix.S_IFSOCK:
   493  		if i.isTTY {
   494  			log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
   495  			return nil, syserror.ENOTTY
   496  		}
   497  
   498  		ep, err := newEndpoint(ctx, i.hostFD, &i.queue)
   499  		if err != nil {
   500  			return nil, err
   501  		}
   502  		// Currently, we only allow Unix sockets to be imported.
   503  		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks)
   504  
   505  	case unix.S_IFREG, unix.S_IFIFO, unix.S_IFCHR:
   506  		if i.isTTY {
   507  			fd := &TTYFileDescription{
   508  				fileDescription: fileDescription{inode: i},
   509  				termios:         linux.DefaultReplicaTermios,
   510  			}
   511  			if task := kernel.TaskFromContext(ctx); task != nil {
   512  				fd.fgProcessGroup = task.ThreadGroup().ProcessGroup()
   513  				fd.session = fd.fgProcessGroup.Session()
   514  			}
   515  			fd.LockFD.Init(&i.locks)
   516  			vfsfd := &fd.vfsfd
   517  			if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   518  				return nil, err
   519  			}
   520  			return vfsfd, nil
   521  		}
   522  
   523  		fd := &fileDescription{inode: i}
   524  		fd.LockFD.Init(&i.locks)
   525  		vfsfd := &fd.vfsfd
   526  		if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   527  			return nil, err
   528  		}
   529  		return vfsfd, nil
   530  
   531  	default:
   532  		log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType)
   533  		return nil, linuxerr.EPERM
   534  	}
   535  }
   536  
   537  // fileDescription is embedded by host fd implementations of FileDescriptionImpl.
   538  //
   539  // +stateify savable
   540  type fileDescription struct {
   541  	vfsfd vfs.FileDescription
   542  	vfs.FileDescriptionDefaultImpl
   543  	vfs.LockFD
   544  
   545  	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
   546  	// cached to reduce indirections and casting. fileDescription does not hold
   547  	// a reference on the inode through the inode field (since one is already
   548  	// held via the Dentry).
   549  	//
   550  	// inode is immutable after fileDescription creation.
   551  	inode *inode
   552  
   553  	// offsetMu protects offset.
   554  	offsetMu sync.Mutex `state:"nosave"`
   555  
   556  	// offset specifies the current file offset. It is only meaningful when
   557  	// inode.seekable is true.
   558  	offset int64
   559  }
   560  
   561  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   562  func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   563  	creds := auth.CredentialsFromContext(ctx)
   564  	return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
   565  }
   566  
   567  // Stat implements vfs.FileDescriptionImpl.Stat.
   568  func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   569  	return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
   570  }
   571  
   572  // Release implements vfs.FileDescriptionImpl.Release.
   573  func (f *fileDescription) Release(context.Context) {
   574  	// noop
   575  }
   576  
   577  // Allocate implements vfs.FileDescriptionImpl.Allocate.
   578  func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
   579  	return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
   580  }
   581  
   582  // PRead implements vfs.FileDescriptionImpl.PRead.
   583  func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   584  	// Check that flags are supported.
   585  	//
   586  	// TODO(github.com/SagerNet/issue/2601): Support select preadv2 flags.
   587  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   588  		return 0, syserror.EOPNOTSUPP
   589  	}
   590  
   591  	i := f.inode
   592  	if !i.seekable {
   593  		return 0, linuxerr.ESPIPE
   594  	}
   595  
   596  	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
   597  }
   598  
   599  // Read implements vfs.FileDescriptionImpl.Read.
   600  func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   601  	// Check that flags are supported.
   602  	//
   603  	// TODO(github.com/SagerNet/issue/2601): Support select preadv2 flags.
   604  	if opts.Flags&^linux.RWF_HIPRI != 0 {
   605  		return 0, syserror.EOPNOTSUPP
   606  	}
   607  
   608  	i := f.inode
   609  	if !i.seekable {
   610  		bufN, err := i.readFromBuf(ctx, &dst)
   611  		if err != nil {
   612  			return bufN, err
   613  		}
   614  		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
   615  		total := bufN + n
   616  		if isBlockError(err) {
   617  			// If we got any data at all, return it as a "completed" partial read
   618  			// rather than retrying until complete.
   619  			if total != 0 {
   620  				err = nil
   621  			} else {
   622  				err = syserror.ErrWouldBlock
   623  			}
   624  		}
   625  		return total, err
   626  	}
   627  
   628  	f.offsetMu.Lock()
   629  	n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
   630  	f.offset += n
   631  	f.offsetMu.Unlock()
   632  	return n, err
   633  }
   634  
   635  func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) {
   636  	if atomic.LoadUint32(&i.haveBuf) == 0 {
   637  		return 0, nil
   638  	}
   639  	i.bufMu.Lock()
   640  	defer i.bufMu.Unlock()
   641  	if len(i.buf) == 0 {
   642  		return 0, nil
   643  	}
   644  	n, err := dst.CopyOut(ctx, i.buf)
   645  	*dst = dst.DropFirst(n)
   646  	i.buf = i.buf[n:]
   647  	if len(i.buf) == 0 {
   648  		atomic.StoreUint32(&i.haveBuf, 0)
   649  		i.buf = nil
   650  	}
   651  	return int64(n), err
   652  }
   653  
   654  func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
   655  	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
   656  	n, err := dst.CopyOutFrom(ctx, reader)
   657  	hostfd.PutReadWriterAt(reader)
   658  	return int64(n), err
   659  }
   660  
   661  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   662  func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   663  	if !f.inode.seekable {
   664  		return 0, linuxerr.ESPIPE
   665  	}
   666  
   667  	return f.writeToHostFD(ctx, src, offset, opts.Flags)
   668  }
   669  
   670  // Write implements vfs.FileDescriptionImpl.Write.
   671  func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   672  	i := f.inode
   673  	if !i.seekable {
   674  		n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
   675  		if isBlockError(err) {
   676  			err = syserror.ErrWouldBlock
   677  		}
   678  		return n, err
   679  	}
   680  
   681  	f.offsetMu.Lock()
   682  	// NOTE(github.com/SagerNet/issue/2983): O_APPEND may cause memory corruption if
   683  	// another process modifies the host file between retrieving the file size
   684  	// and writing to the host fd. This is an unavoidable race condition because
   685  	// we cannot enforce synchronization on the host.
   686  	if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
   687  		var s unix.Stat_t
   688  		if err := unix.Fstat(i.hostFD, &s); err != nil {
   689  			f.offsetMu.Unlock()
   690  			return 0, err
   691  		}
   692  		f.offset = s.Size
   693  	}
   694  	n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags)
   695  	f.offset += n
   696  	f.offsetMu.Unlock()
   697  	return n, err
   698  }
   699  
   700  func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
   701  	hostFD := f.inode.hostFD
   702  	// TODO(github.com/SagerNet/issue/2601): Support select pwritev2 flags.
   703  	if flags != 0 {
   704  		return 0, syserror.EOPNOTSUPP
   705  	}
   706  	writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
   707  	n, err := src.CopyInTo(ctx, writer)
   708  	hostfd.PutReadWriterAt(writer)
   709  	// NOTE(github.com/SagerNet/issue/2979): We always sync everything, even for O_DSYNC.
   710  	if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
   711  		if syncErr := unix.Fsync(hostFD); syncErr != nil {
   712  			return int64(n), syncErr
   713  		}
   714  	}
   715  	return int64(n), err
   716  }
   717  
   718  // Seek implements vfs.FileDescriptionImpl.Seek.
   719  //
   720  // Note that we do not support seeking on directories, since we do not even
   721  // allow directory fds to be imported at all.
   722  func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
   723  	i := f.inode
   724  	if !i.seekable {
   725  		return 0, linuxerr.ESPIPE
   726  	}
   727  
   728  	f.offsetMu.Lock()
   729  	defer f.offsetMu.Unlock()
   730  
   731  	switch whence {
   732  	case linux.SEEK_SET:
   733  		if offset < 0 {
   734  			return f.offset, linuxerr.EINVAL
   735  		}
   736  		f.offset = offset
   737  
   738  	case linux.SEEK_CUR:
   739  		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
   740  		if offset > math.MaxInt64-f.offset {
   741  			return f.offset, linuxerr.EOVERFLOW
   742  		}
   743  		if f.offset+offset < 0 {
   744  			return f.offset, linuxerr.EINVAL
   745  		}
   746  		f.offset += offset
   747  
   748  	case linux.SEEK_END:
   749  		var s unix.Stat_t
   750  		if err := unix.Fstat(i.hostFD, &s); err != nil {
   751  			return f.offset, err
   752  		}
   753  		size := s.Size
   754  
   755  		// Check for overflow. Note that underflow cannot occur, since size >= 0.
   756  		if offset > math.MaxInt64-size {
   757  			return f.offset, linuxerr.EOVERFLOW
   758  		}
   759  		if size+offset < 0 {
   760  			return f.offset, linuxerr.EINVAL
   761  		}
   762  		f.offset = size + offset
   763  
   764  	case linux.SEEK_DATA, linux.SEEK_HOLE:
   765  		// Modifying the offset in the host file table should not matter, since
   766  		// this is the only place where we use it.
   767  		//
   768  		// For reading and writing, we always rely on our internal offset.
   769  		n, err := unix.Seek(i.hostFD, offset, int(whence))
   770  		if err != nil {
   771  			return f.offset, err
   772  		}
   773  		f.offset = n
   774  
   775  	default:
   776  		// Invalid whence.
   777  		return f.offset, linuxerr.EINVAL
   778  	}
   779  
   780  	return f.offset, nil
   781  }
   782  
   783  // Sync implements vfs.FileDescriptionImpl.Sync.
   784  func (f *fileDescription) Sync(ctx context.Context) error {
   785  	// TODO(github.com/SagerNet/issue/1897): Currently, we always sync everything.
   786  	return unix.Fsync(f.inode.hostFD)
   787  }
   788  
   789  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
   790  func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
   791  	// NOTE(b/38213152): Technically, some obscure char devices can be memory
   792  	// mapped, but we only allow regular files.
   793  	if f.inode.ftype != unix.S_IFREG {
   794  		return linuxerr.ENODEV
   795  	}
   796  	i := f.inode
   797  	i.CachedMappable.InitFileMapperOnce()
   798  	return vfs.GenericConfigureMMap(&f.vfsfd, i, opts)
   799  }
   800  
   801  // EventRegister implements waiter.Waitable.EventRegister.
   802  func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
   803  	f.inode.queue.EventRegister(e, mask)
   804  	if f.inode.mayBlock {
   805  		fdnotifier.UpdateFD(int32(f.inode.hostFD))
   806  	}
   807  }
   808  
   809  // EventUnregister implements waiter.Waitable.EventUnregister.
   810  func (f *fileDescription) EventUnregister(e *waiter.Entry) {
   811  	f.inode.queue.EventUnregister(e)
   812  	if f.inode.mayBlock {
   813  		fdnotifier.UpdateFD(int32(f.inode.hostFD))
   814  	}
   815  }
   816  
   817  // Readiness uses the poll() syscall to check the status of the underlying FD.
   818  func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
   819  	return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
   820  }