github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/vfs/vfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package vfs implements a virtual filesystem layer.
    16  //
    17  // Lock order:
    18  //
    19  // EpollInstance.interestMu
    20  //   FileDescription.epollMu
    21  //     FilesystemImpl/FileDescriptionImpl locks
    22  //       VirtualFilesystem.mountMu
    23  //         Dentry.mu
    24  //           Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
    25  //         VirtualFilesystem.filesystemsMu
    26  //       fdnotifier.notifier.mu
    27  //         EpollInstance.mu
    28  //           Locks acquired by FileDescriptionImpl.Readiness
    29  //       Inotify.mu
    30  //         Watches.mu
    31  //           Inotify.evMu
    32  // VirtualFilesystem.fsTypesMu
    33  //
    34  // Locking Dentry.mu in multiple Dentries requires holding
    35  // VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple
    36  // EpollInstances requires holding epollCycleMu.
    37  package vfs
    38  
    39  import (
    40  	"fmt"
    41  	"path"
    42  
    43  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    44  	"github.com/SagerNet/gvisor/pkg/context"
    45  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    46  	"github.com/SagerNet/gvisor/pkg/fspath"
    47  	"github.com/SagerNet/gvisor/pkg/sentry/fsmetric"
    48  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    49  	"github.com/SagerNet/gvisor/pkg/sentry/socket/unix/transport"
    50  	"github.com/SagerNet/gvisor/pkg/sync"
    51  	"github.com/SagerNet/gvisor/pkg/syserror"
    52  )
    53  
// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
//
// There is no analogue to the VirtualFilesystem type in Linux, as the
// equivalent state in Linux is global.
//
// A VirtualFilesystem must be initialized with Init before use; Init panics
// if it is called a second time on the same value.
//
// +stateify savable
type VirtualFilesystem struct {
	// mountMu serializes mount mutations.
	//
	// mountMu is analogous to Linux's namespace_sem.
	mountMu sync.Mutex `state:"nosave"`

	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
	// are uniquely namespaced, including mount parent in the key correctly
	// handles both bind mounts and mount namespaces; Linux does the same.)
	// Synchronization between mutators and readers is provided by mounts.seq;
	// synchronization between mutators is provided by mountMu.
	//
	// mounts is used to follow mount points during path traversal. We use a
	// single table rather than per-Dentry tables to reduce size (and therefore
	// cache footprint) for the vast majority of Dentries that are not mount
	// points.
	//
	// mounts is analogous to Linux's mount_hashtable.
	mounts mountTable `state:".([]*Mount)"`

	// mountpoints maps mount points to mounts at those points in all
	// namespaces. mountpoints is protected by mountMu.
	//
	// mountpoints is used to find mounts that must be umounted due to
	// removal of a mount point Dentry from another mount namespace. ("A file
	// or directory that is a mount point in one namespace that is not a mount
	// point in another namespace, may be renamed, unlinked, or removed
	// (rmdir(2)) in the mount namespace in which it is not a mount point
	// (subject to the usual permission checks)." - mount_namespaces(7))
	//
	// mountpoints is analogous to Linux's mountpoint_hashtable.
	//
	// Init also uses a non-nil mountpoints as the marker that the
	// VirtualFilesystem has already been initialized.
	mountpoints map[*Dentry]map[*Mount]struct{}

	// lastMountID is the last allocated mount ID. lastMountID is accessed
	// using atomic memory operations.
	lastMountID uint64

	// anonMount is a Mount, not included in mounts or mountpoints,
	// representing an anonFilesystem. anonMount is used to back
	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
	// anonMount is immutable. It is created by Init and its reference is
	// dropped by Release.
	//
	// anonMount is analogous to Linux's anon_inode_mnt.
	anonMount *Mount

	// devices contains all registered Devices. devices is protected by
	// devicesMu.
	devicesMu sync.RWMutex `state:"nosave"`
	devices   map[devTuple]*registeredDevice

	// anonBlockDevMinor contains all allocated anonymous block device minor
	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
	// unallocated anonymous block device number. anonBlockDevMinorNext and
	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
	anonBlockDevMinorNext uint32
	anonBlockDevMinor     map[uint32]struct{}

	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
	// fsTypesMu.
	fsTypesMu sync.RWMutex `state:"nosave"`
	fsTypes   map[string]*registeredFilesystemType

	// filesystems contains all Filesystems. filesystems is protected by
	// filesystemsMu.
	filesystemsMu sync.Mutex `state:"nosave"`
	filesystems   map[*Filesystem]struct{}
}
   128  
   129  // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
   130  func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
   131  	if vfs.mountpoints != nil {
   132  		panic("VFS already initialized")
   133  	}
   134  	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
   135  	vfs.devices = make(map[devTuple]*registeredDevice)
   136  	vfs.anonBlockDevMinorNext = 1
   137  	vfs.anonBlockDevMinor = make(map[uint32]struct{})
   138  	vfs.fsTypes = make(map[string]*registeredFilesystemType)
   139  	vfs.filesystems = make(map[*Filesystem]struct{})
   140  	vfs.mounts.Init()
   141  
   142  	// Construct vfs.anonMount.
   143  	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
   144  	if err != nil {
   145  		// This shouldn't be possible since anonBlockDevMinorNext was
   146  		// initialized to 1 above (no device numbers have been allocated yet).
   147  		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
   148  	}
   149  	anonfs := anonFilesystem{
   150  		devMinor: anonfsDevMinor,
   151  	}
   152  	anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
   153  	defer anonfs.vfsfs.DecRef(ctx)
   154  	anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
   155  	if err != nil {
   156  		// We should not be passing any MountOptions that would cause
   157  		// construction of this mount to fail.
   158  		panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err))
   159  	}
   160  	vfs.anonMount = anonMount
   161  
   162  	return nil
   163  }
   164  
   165  // Release drops references on filesystem objects held by vfs.
   166  //
   167  // Precondition: This must be called after VFS.Init() has succeeded.
   168  func (vfs *VirtualFilesystem) Release(ctx context.Context) {
   169  	vfs.anonMount.DecRef(ctx)
   170  	for _, fst := range vfs.fsTypes {
   171  		fst.fsType.Release(ctx)
   172  	}
   173  }
   174  
// PathOperation specifies the path operated on by a VFS method.
//
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
// it's somewhat large and should never escape. (Options structs are passed by
// pointer to VFS and FileDescription methods for the same reason.)
//
// Note that some VFS methods modify the PathOperation they are given; for
// example, OpenAt clears FollowFinalSymlink when O_NOFOLLOW is set.
//
// +stateify savable
type PathOperation struct {
	// Root is the VFS root. References on Root are borrowed from the provider
	// of the PathOperation.
	//
	// Invariants: Root.Ok().
	Root VirtualDentry

	// Start is the starting point for the path traversal. References on Start
	// are borrowed from the provider of the PathOperation (i.e. the caller of
	// the VFS method to which the PathOperation was passed).
	//
	// Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
	Start VirtualDentry

	// Path is the pathname traversed by this operation.
	Path fspath.Path

	// If FollowFinalSymlink is true, and the Dentry traversed by the final
	// path component represents a symbolic link, the symbolic link should be
	// followed.
	//
	// The file creation, deletion and rename methods of VirtualFilesystem
	// (e.g. MkdirAt, UnlinkAt, RenameAt) reject PathOperations with
	// FollowFinalSymlink set and return EINVAL.
	FollowFinalSymlink bool
}
   204  
   205  // AccessAt checks whether a user with creds has access to the file at
   206  // the given path.
   207  func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error {
   208  	rp := vfs.getResolvingPath(creds, pop)
   209  	for {
   210  		err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
   211  		if err == nil {
   212  			rp.Release(ctx)
   213  			return nil
   214  		}
   215  		if !rp.handleError(ctx, err) {
   216  			rp.Release(ctx)
   217  			return err
   218  		}
   219  	}
   220  }
   221  
   222  // GetDentryAt returns a VirtualDentry representing the given path, at which a
   223  // file must exist. A reference is taken on the returned VirtualDentry.
   224  func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
   225  	rp := vfs.getResolvingPath(creds, pop)
   226  	for {
   227  		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
   228  		if err == nil {
   229  			vd := VirtualDentry{
   230  				mount:  rp.mount,
   231  				dentry: d,
   232  			}
   233  			rp.mount.IncRef()
   234  			rp.Release(ctx)
   235  			return vd, nil
   236  		}
   237  		if !rp.handleError(ctx, err) {
   238  			rp.Release(ctx)
   239  			return VirtualDentry{}, err
   240  		}
   241  	}
   242  }
   243  
   244  // Preconditions: pop.Path.Begin.Ok().
   245  func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) {
   246  	rp := vfs.getResolvingPath(creds, pop)
   247  	for {
   248  		parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp)
   249  		if err == nil {
   250  			parentVD := VirtualDentry{
   251  				mount:  rp.mount,
   252  				dentry: parent,
   253  			}
   254  			rp.mount.IncRef()
   255  			name := rp.Component()
   256  			rp.Release(ctx)
   257  			return parentVD, name, nil
   258  		}
   259  		if checkInvariants {
   260  			if rp.canHandleError(err) && rp.Done() {
   261  				panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   262  			}
   263  		}
   264  		if !rp.handleError(ctx, err) {
   265  			rp.Release(ctx)
   266  			return VirtualDentry{}, "", err
   267  		}
   268  	}
   269  }
   270  
   271  // LinkAt creates a hard link at newpop representing the existing file at
   272  // oldpop.
   273  func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
   274  	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
   275  	if err != nil {
   276  		return err
   277  	}
   278  
   279  	if !newpop.Path.Begin.Ok() {
   280  		oldVD.DecRef(ctx)
   281  		if newpop.Path.Absolute {
   282  			return syserror.EEXIST
   283  		}
   284  		return syserror.ENOENT
   285  	}
   286  	if newpop.FollowFinalSymlink {
   287  		oldVD.DecRef(ctx)
   288  		ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
   289  		return linuxerr.EINVAL
   290  	}
   291  
   292  	rp := vfs.getResolvingPath(creds, newpop)
   293  	for {
   294  		err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
   295  		if err == nil {
   296  			rp.Release(ctx)
   297  			oldVD.DecRef(ctx)
   298  			return nil
   299  		}
   300  		if checkInvariants {
   301  			if rp.canHandleError(err) && rp.Done() {
   302  				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   303  			}
   304  		}
   305  		if !rp.handleError(ctx, err) {
   306  			rp.Release(ctx)
   307  			oldVD.DecRef(ctx)
   308  			return err
   309  		}
   310  	}
   311  }
   312  
   313  // MkdirAt creates a directory at the given path.
   314  func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
   315  	if !pop.Path.Begin.Ok() {
   316  		// pop.Path should not be empty in operations that create/delete files.
   317  		// This is consistent with mkdirat(dirfd, "", mode).
   318  		if pop.Path.Absolute {
   319  			return syserror.EEXIST
   320  		}
   321  		return syserror.ENOENT
   322  	}
   323  	if pop.FollowFinalSymlink {
   324  		ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink")
   325  		return linuxerr.EINVAL
   326  	}
   327  	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
   328  	// also honored." - mkdir(2)
   329  	opts.Mode &= 0777 | linux.S_ISVTX
   330  
   331  	rp := vfs.getResolvingPath(creds, pop)
   332  	for {
   333  		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
   334  		if err == nil {
   335  			rp.Release(ctx)
   336  			return nil
   337  		}
   338  		if checkInvariants {
   339  			if rp.canHandleError(err) && rp.Done() {
   340  				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   341  			}
   342  		}
   343  		if !rp.handleError(ctx, err) {
   344  			rp.Release(ctx)
   345  			return err
   346  		}
   347  	}
   348  }
   349  
   350  // MknodAt creates a file of the given mode at the given path. It returns an
   351  // error from the syserror package.
   352  func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
   353  	if !pop.Path.Begin.Ok() {
   354  		// pop.Path should not be empty in operations that create/delete files.
   355  		// This is consistent with mknodat(dirfd, "", mode, dev).
   356  		if pop.Path.Absolute {
   357  			return syserror.EEXIST
   358  		}
   359  		return syserror.ENOENT
   360  	}
   361  	if pop.FollowFinalSymlink {
   362  		ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink")
   363  		return linuxerr.EINVAL
   364  	}
   365  
   366  	rp := vfs.getResolvingPath(creds, pop)
   367  	for {
   368  		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
   369  		if err == nil {
   370  			rp.Release(ctx)
   371  			return nil
   372  		}
   373  		if checkInvariants {
   374  			if rp.canHandleError(err) && rp.Done() {
   375  				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   376  			}
   377  		}
   378  		if !rp.handleError(ctx, err) {
   379  			rp.Release(ctx)
   380  			return err
   381  		}
   382  	}
   383  }
   384  
   385  // OpenAt returns a FileDescription providing access to the file at the given
   386  // path. A reference is taken on the returned FileDescription.
   387  func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
   388  	fsmetric.Opens.Increment()
   389  
   390  	// Remove:
   391  	//
   392  	// - O_CLOEXEC, which affects file descriptors and therefore must be
   393  	// handled outside of VFS.
   394  	//
   395  	// - Unknown flags.
   396  	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
   397  	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
   398  	if opts.Flags&linux.O_SYNC != 0 {
   399  		opts.Flags |= linux.O_DSYNC
   400  	}
   401  	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
   402  	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
   403  	// filesystem implementations that do not support it).
   404  	if opts.Flags&linux.O_TMPFILE != 0 {
   405  		if opts.Flags&linux.O_DIRECTORY == 0 {
   406  			return nil, linuxerr.EINVAL
   407  		}
   408  		if opts.Flags&linux.O_CREAT != 0 {
   409  			return nil, linuxerr.EINVAL
   410  		}
   411  		if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
   412  			return nil, linuxerr.EINVAL
   413  		}
   414  	}
   415  	// O_PATH causes most other flags to be ignored.
   416  	if opts.Flags&linux.O_PATH != 0 {
   417  		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
   418  	}
   419  	// "On Linux, the following bits are also honored in mode: [S_ISUID,
   420  	// S_ISGID, S_ISVTX]" - open(2)
   421  	opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
   422  
   423  	if opts.Flags&linux.O_NOFOLLOW != 0 {
   424  		pop.FollowFinalSymlink = false
   425  	}
   426  	rp := vfs.getResolvingPath(creds, pop)
   427  	if opts.Flags&linux.O_DIRECTORY != 0 {
   428  		rp.mustBeDir = true
   429  	}
   430  	// Ignore O_PATH for verity, as verity performs extra operations on the fd for verification.
   431  	// The underlying filesystem that verity wraps opens the fd with O_PATH.
   432  	if opts.Flags&linux.O_PATH != 0 && rp.mount.fs.FilesystemType().Name() != "verity" {
   433  		vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
   434  		if err != nil {
   435  			return nil, err
   436  		}
   437  		fd := &opathFD{}
   438  		if err := fd.vfsfd.Init(fd, opts.Flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}); err != nil {
   439  			return nil, err
   440  		}
   441  		vd.DecRef(ctx)
   442  		return &fd.vfsfd, err
   443  	}
   444  	for {
   445  		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
   446  		if err == nil {
   447  			rp.Release(ctx)
   448  
   449  			if opts.FileExec {
   450  				if fd.Mount().Flags.NoExec {
   451  					fd.DecRef(ctx)
   452  					return nil, linuxerr.EACCES
   453  				}
   454  
   455  				// Only a regular file can be executed.
   456  				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
   457  				if err != nil {
   458  					fd.DecRef(ctx)
   459  					return nil, err
   460  				}
   461  				if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
   462  					fd.DecRef(ctx)
   463  					return nil, linuxerr.EACCES
   464  				}
   465  			}
   466  
   467  			fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent)
   468  			return fd, nil
   469  		}
   470  		if !rp.handleError(ctx, err) {
   471  			rp.Release(ctx)
   472  			return nil, err
   473  		}
   474  	}
   475  }
   476  
   477  // ReadlinkAt returns the target of the symbolic link at the given path.
   478  func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
   479  	rp := vfs.getResolvingPath(creds, pop)
   480  	for {
   481  		target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
   482  		if err == nil {
   483  			rp.Release(ctx)
   484  			return target, nil
   485  		}
   486  		if !rp.handleError(ctx, err) {
   487  			rp.Release(ctx)
   488  			return "", err
   489  		}
   490  	}
   491  }
   492  
   493  // RenameAt renames the file at oldpop to newpop.
   494  func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
   495  	if !oldpop.Path.Begin.Ok() {
   496  		if oldpop.Path.Absolute {
   497  			return linuxerr.EBUSY
   498  		}
   499  		return syserror.ENOENT
   500  	}
   501  	if oldpop.FollowFinalSymlink {
   502  		ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink")
   503  		return linuxerr.EINVAL
   504  	}
   505  
   506  	oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop)
   507  	if err != nil {
   508  		return err
   509  	}
   510  	if oldName == "." || oldName == ".." {
   511  		oldParentVD.DecRef(ctx)
   512  		return linuxerr.EBUSY
   513  	}
   514  
   515  	if !newpop.Path.Begin.Ok() {
   516  		oldParentVD.DecRef(ctx)
   517  		if newpop.Path.Absolute {
   518  			return linuxerr.EBUSY
   519  		}
   520  		return syserror.ENOENT
   521  	}
   522  	if newpop.FollowFinalSymlink {
   523  		oldParentVD.DecRef(ctx)
   524  		ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
   525  		return linuxerr.EINVAL
   526  	}
   527  
   528  	rp := vfs.getResolvingPath(creds, newpop)
   529  	renameOpts := *opts
   530  	if oldpop.Path.Dir {
   531  		renameOpts.MustBeDir = true
   532  	}
   533  	for {
   534  		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
   535  		if err == nil {
   536  			rp.Release(ctx)
   537  			oldParentVD.DecRef(ctx)
   538  			return nil
   539  		}
   540  		if checkInvariants {
   541  			if rp.canHandleError(err) && rp.Done() {
   542  				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   543  			}
   544  		}
   545  		if !rp.handleError(ctx, err) {
   546  			rp.Release(ctx)
   547  			oldParentVD.DecRef(ctx)
   548  			return err
   549  		}
   550  	}
   551  }
   552  
   553  // RmdirAt removes the directory at the given path.
   554  func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
   555  	if !pop.Path.Begin.Ok() {
   556  		// pop.Path should not be empty in operations that create/delete files.
   557  		// This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR).
   558  		if pop.Path.Absolute {
   559  			return linuxerr.EBUSY
   560  		}
   561  		return syserror.ENOENT
   562  	}
   563  	if pop.FollowFinalSymlink {
   564  		ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink")
   565  		return linuxerr.EINVAL
   566  	}
   567  
   568  	rp := vfs.getResolvingPath(creds, pop)
   569  	for {
   570  		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
   571  		if err == nil {
   572  			rp.Release(ctx)
   573  			return nil
   574  		}
   575  		if checkInvariants {
   576  			if rp.canHandleError(err) && rp.Done() {
   577  				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   578  			}
   579  		}
   580  		if !rp.handleError(ctx, err) {
   581  			rp.Release(ctx)
   582  			return err
   583  		}
   584  	}
   585  }
   586  
   587  // SetStatAt changes metadata for the file at the given path.
   588  func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
   589  	rp := vfs.getResolvingPath(creds, pop)
   590  	for {
   591  		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
   592  		if err == nil {
   593  			rp.Release(ctx)
   594  			return nil
   595  		}
   596  		if !rp.handleError(ctx, err) {
   597  			rp.Release(ctx)
   598  			return err
   599  		}
   600  	}
   601  }
   602  
   603  // StatAt returns metadata for the file at the given path.
   604  func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
   605  	rp := vfs.getResolvingPath(creds, pop)
   606  	for {
   607  		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
   608  		if err == nil {
   609  			rp.Release(ctx)
   610  			return stat, nil
   611  		}
   612  		if !rp.handleError(ctx, err) {
   613  			rp.Release(ctx)
   614  			return linux.Statx{}, err
   615  		}
   616  	}
   617  }
   618  
   619  // StatFSAt returns metadata for the filesystem containing the file at the
   620  // given path.
   621  func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
   622  	rp := vfs.getResolvingPath(creds, pop)
   623  	for {
   624  		statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
   625  		if err == nil {
   626  			rp.Release(ctx)
   627  			return statfs, nil
   628  		}
   629  		if !rp.handleError(ctx, err) {
   630  			rp.Release(ctx)
   631  			return linux.Statfs{}, err
   632  		}
   633  	}
   634  }
   635  
   636  // SymlinkAt creates a symbolic link at the given path with the given target.
   637  func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
   638  	if !pop.Path.Begin.Ok() {
   639  		// pop.Path should not be empty in operations that create/delete files.
   640  		// This is consistent with symlinkat(oldpath, newdirfd, "").
   641  		if pop.Path.Absolute {
   642  			return syserror.EEXIST
   643  		}
   644  		return syserror.ENOENT
   645  	}
   646  	if pop.FollowFinalSymlink {
   647  		ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink")
   648  		return linuxerr.EINVAL
   649  	}
   650  
   651  	rp := vfs.getResolvingPath(creds, pop)
   652  	for {
   653  		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
   654  		if err == nil {
   655  			rp.Release(ctx)
   656  			return nil
   657  		}
   658  		if checkInvariants {
   659  			if rp.canHandleError(err) && rp.Done() {
   660  				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   661  			}
   662  		}
   663  		if !rp.handleError(ctx, err) {
   664  			rp.Release(ctx)
   665  			return err
   666  		}
   667  	}
   668  }
   669  
   670  // UnlinkAt deletes the non-directory file at the given path.
   671  func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
   672  	if !pop.Path.Begin.Ok() {
   673  		// pop.Path should not be empty in operations that create/delete files.
   674  		// This is consistent with unlinkat(dirfd, "", 0).
   675  		if pop.Path.Absolute {
   676  			return linuxerr.EBUSY
   677  		}
   678  		return syserror.ENOENT
   679  	}
   680  	if pop.FollowFinalSymlink {
   681  		ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink")
   682  		return linuxerr.EINVAL
   683  	}
   684  
   685  	rp := vfs.getResolvingPath(creds, pop)
   686  	for {
   687  		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
   688  		if err == nil {
   689  			rp.Release(ctx)
   690  			return nil
   691  		}
   692  		if checkInvariants {
   693  			if rp.canHandleError(err) && rp.Done() {
   694  				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   695  			}
   696  		}
   697  		if !rp.handleError(ctx, err) {
   698  			rp.Release(ctx)
   699  			return err
   700  		}
   701  	}
   702  }
   703  
   704  // BoundEndpointAt gets the bound endpoint at the given path, if one exists.
   705  func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
   706  	rp := vfs.getResolvingPath(creds, pop)
   707  	for {
   708  		bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
   709  		if err == nil {
   710  			rp.Release(ctx)
   711  			return bep, nil
   712  		}
   713  		if checkInvariants {
   714  			if rp.canHandleError(err) && rp.Done() {
   715  				panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   716  			}
   717  		}
   718  		if !rp.handleError(ctx, err) {
   719  			rp.Release(ctx)
   720  			return nil, err
   721  		}
   722  	}
   723  }
   724  
   725  // ListXattrAt returns all extended attribute names for the file at the given
   726  // path.
   727  func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
   728  	rp := vfs.getResolvingPath(creds, pop)
   729  	for {
   730  		names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size)
   731  		if err == nil {
   732  			rp.Release(ctx)
   733  			return names, nil
   734  		}
   735  		if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) {
   736  			// Linux doesn't actually return EOPNOTSUPP in this case; instead,
   737  			// fs/xattr.c:vfs_listxattr() falls back to allowing the security
   738  			// subsystem to return security extended attributes, which by
   739  			// default don't exist.
   740  			rp.Release(ctx)
   741  			return nil, nil
   742  		}
   743  		if !rp.handleError(ctx, err) {
   744  			rp.Release(ctx)
   745  			return nil, err
   746  		}
   747  	}
   748  }
   749  
   750  // GetXattrAt returns the value associated with the given extended attribute
   751  // for the file at the given path.
   752  func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) {
   753  	rp := vfs.getResolvingPath(creds, pop)
   754  	for {
   755  		val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
   756  		if err == nil {
   757  			rp.Release(ctx)
   758  			return val, nil
   759  		}
   760  		if !rp.handleError(ctx, err) {
   761  			rp.Release(ctx)
   762  			return "", err
   763  		}
   764  	}
   765  }
   766  
   767  // SetXattrAt changes the value associated with the given extended attribute
   768  // for the file at the given path.
   769  func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error {
   770  	rp := vfs.getResolvingPath(creds, pop)
   771  	for {
   772  		err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
   773  		if err == nil {
   774  			rp.Release(ctx)
   775  			return nil
   776  		}
   777  		if !rp.handleError(ctx, err) {
   778  			rp.Release(ctx)
   779  			return err
   780  		}
   781  	}
   782  }
   783  
   784  // RemoveXattrAt removes the given extended attribute from the file at rp.
   785  func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
   786  	rp := vfs.getResolvingPath(creds, pop)
   787  	for {
   788  		err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
   789  		if err == nil {
   790  			rp.Release(ctx)
   791  			return nil
   792  		}
   793  		if !rp.handleError(ctx, err) {
   794  			rp.Release(ctx)
   795  			return err
   796  		}
   797  	}
   798  }
   799  
   800  // SyncAllFilesystems has the semantics of Linux's sync(2).
   801  func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
   802  	var retErr error
   803  	for fs := range vfs.getFilesystems() {
   804  		if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
   805  			retErr = err
   806  		}
   807  		fs.DecRef(ctx)
   808  	}
   809  	return retErr
   810  }
   811  
   812  func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} {
   813  	fss := make(map[*Filesystem]struct{})
   814  	vfs.filesystemsMu.Lock()
   815  	defer vfs.filesystemsMu.Unlock()
   816  	for fs := range vfs.filesystems {
   817  		if !fs.TryIncRef() {
   818  			continue
   819  		}
   820  		fss[fs] = struct{}{}
   821  	}
   822  	return fss
   823  }
   824  
   825  // MkdirAllAt recursively creates non-existent directories on the given path
   826  // (including the last component).
   827  func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions) error {
   828  	pop := &PathOperation{
   829  		Root:  root,
   830  		Start: root,
   831  		Path:  fspath.Parse(currentPath),
   832  	}
   833  	stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE})
   834  	switch {
   835  	case err == nil:
   836  		if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory {
   837  			return syserror.ENOTDIR
   838  		}
   839  		// Directory already exists.
   840  		return nil
   841  	case linuxerr.Equals(linuxerr.ENOENT, err):
   842  		// Expected, we will create the dir.
   843  	default:
   844  		return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err)
   845  	}
   846  
   847  	// Recurse to ensure parent is created and then create the final directory.
   848  	if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts); err != nil {
   849  		return err
   850  	}
   851  	if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil {
   852  		return fmt.Errorf("failed to create directory %q: %w", currentPath, err)
   853  	}
   854  	return nil
   855  }
   856  
   857  // MakeSyntheticMountpoint creates parent directories of target if they do not
   858  // exist and attempts to create a directory for the mountpoint. If a
   859  // non-directory file already exists there then we allow it.
   860  func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
   861  	mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
   862  
   863  	// Make sure the parent directory of target exists.
   864  	if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil {
   865  		return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
   866  	}
   867  
   868  	// Attempt to mkdir the final component. If a file (of any type) exists
   869  	// then we let allow mounting on top of that because we do not require the
   870  	// target to be an existing directory, unlike Linux mount(2).
   871  	if err := vfs.MkdirAt(ctx, creds, &PathOperation{
   872  		Root:  root,
   873  		Start: root,
   874  		Path:  fspath.Parse(target),
   875  	}, mkdirOpts); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) {
   876  		return fmt.Errorf("failed to create mountpoint %q: %w", target, err)
   877  	}
   878  	return nil
   879  }
   880  
// A VirtualDentry represents a node in a VFS tree, by combining a Dentry
// (which represents a node in a Filesystem's tree) and a Mount (which
// represents the Filesystem's position in a VFS mount tree).
//
// VirtualDentry's semantics are similar to that of a Go interface object
// representing a pointer: it is a copyable value type that represents
// references to another entity. The zero value of VirtualDentry is an "empty
// VirtualDentry", directly analogous to a nil interface object.
// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless
// otherwise specified, all other VirtualDentry methods require
// VirtualDentry.Ok() == true.
//
// Mounts and Dentries are reference-counted, requiring that users call
// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to
// references on the Mount and Dentry referred to by a VirtualDentry as
// references on the VirtualDentry itself. Unless otherwise specified, all
// VirtualDentry methods require that a reference is held on the VirtualDentry.
//
// VirtualDentry is analogous to Linux's struct path.
//
// +stateify savable
type VirtualDentry struct {
	// mount is the Mount half of the pair. It is the field Ok inspects: a
	// nil mount means the VirtualDentry is empty.
	mount *Mount

	// dentry is the Dentry half of the pair.
	dentry *Dentry
}
   906  
   907  // MakeVirtualDentry creates a VirtualDentry.
   908  func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry {
   909  	return VirtualDentry{
   910  		mount:  mount,
   911  		dentry: dentry,
   912  	}
   913  }
   914  
   915  // Ok returns true if vd is not empty. It does not require that a reference is
   916  // held.
   917  func (vd VirtualDentry) Ok() bool {
   918  	return vd.mount != nil
   919  }
   920  
// IncRef increments the reference counts on the Mount and Dentry represented
// by vd.
//
// The Mount reference is taken first, then the Dentry reference; DecRef
// releases them in the opposite order.
func (vd VirtualDentry) IncRef() {
	vd.mount.IncRef()
	vd.dentry.IncRef()
}
   927  
// DecRef decrements the reference counts on the Mount and Dentry represented
// by vd.
//
// The Dentry reference is dropped before the Mount reference, i.e. in the
// reverse of the order in which IncRef acquires them.
func (vd VirtualDentry) DecRef(ctx context.Context) {
	vd.dentry.DecRef(ctx)
	vd.mount.DecRef(ctx)
}
   934  
// Mount returns the Mount associated with vd. It does not take a reference on
// the returned Mount, so the reference count is left unchanged by this call.
func (vd VirtualDentry) Mount() *Mount {
	return vd.mount
}
   940  
// Dentry returns the Dentry associated with vd. It does not take a reference
// on the returned Dentry, so the reference count is left unchanged by this
// call.
func (vd VirtualDentry) Dentry() *Dentry {
	return vd.dentry
}