github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/vfs/vfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package vfs implements a virtual filesystem layer.
    16  //
    17  // Lock order:
    18  //
    19  //	EpollInstance.interestMu
    20  //		FileDescription.epollMu
    21  //		  Locks acquired by FilesystemImpl/FileDescriptionImpl methods
    22  //		    VirtualFilesystem.mountMu
    23  //		      Dentry.mu
    24  //		        Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
    25  //		      VirtualFilesystem.filesystemsMu
    26  //		    fdnotifier.notifier.mu
    27  //		      EpollInstance.readyMu
    28  //		    Inotify.mu
    29  //		      Watches.mu
    30  //		        Inotify.evMu
    31  //	VirtualFilesystem.fsTypesMu
    32  //
    33  // Locking Dentry.mu in multiple Dentries requires holding
    34  // VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple
    35  // EpollInstances requires holding epollCycleMu.
    36  package vfs
    37  
    38  import (
    39  	"fmt"
    40  	"path"
    41  	"time"
    42  
    43  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    44  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    45  	"github.com/nicocha30/gvisor-ligolo/pkg/bitmap"
    46  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    47  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    48  	"github.com/nicocha30/gvisor-ligolo/pkg/eventchannel"
    49  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    50  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    51  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsmetric"
    52  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    53  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix/transport"
    54  	epb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs/events_go_proto"
    55  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    56  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    57  )
    58  
// mountPromiseTimeout is how long to wait for a mount promise before
// proceeding with the VFS operation. This should be configurable by the user
// eventually.
const mountPromiseTimeout = 10 * time.Second
    62  
// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
//
// There is no analogue to the VirtualFilesystem type in Linux, as the
// equivalent state in Linux is global.
//
// +stateify savable
type VirtualFilesystem struct {
	// mountMu serializes mount mutations.
	//
	// mountMu is analogous to Linux's namespace_sem.
	mountMu virtualFilesystemMutex `state:"nosave"`

	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
	// are uniquely namespaced, including mount parent in the key correctly
	// handles both bind mounts and mount namespaces; Linux does the same.)
	// Synchronization between mutators and readers is provided by mounts.seq;
	// synchronization between mutators is provided by mountMu.
	//
	// mounts is used to follow mount points during path traversal. We use a
	// single table rather than per-Dentry tables to reduce size (and therefore
	// cache footprint) for the vast majority of Dentries that are not mount
	// points.
	//
	// mounts is analogous to Linux's mount_hashtable.
	mounts mountTable `state:".([]*Mount)"`

	// mountpoints maps mount points to mounts at those points in all
	// namespaces. mountpoints is protected by mountMu.
	//
	// mountpoints is used to find mounts that must be umounted due to
	// removal of a mount point Dentry from another mount namespace. ("A file
	// or directory that is a mount point in one namespace that is not a mount
	// point in another namespace, may be renamed, unlinked, or removed
	// (rmdir(2)) in the mount namespace in which it is not a mount point
	// (subject to the usual permission checks)." - mount_namespaces(7))
	//
	// mountpoints is analogous to Linux's mountpoint_hashtable.
	mountpoints map[*Dentry]map[*Mount]struct{}

	// lastMountID is the last allocated mount ID. lastMountID is accessed
	// using atomic memory operations.
	lastMountID atomicbitops.Uint64

	// anonMount is a Mount, not included in mounts or mountpoints,
	// representing an anonFilesystem. anonMount is used to back
	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
	// anonMount is immutable.
	//
	// anonMount is analogous to Linux's anon_inode_mnt.
	anonMount *Mount

	// devices contains all registered Devices. devices is protected by
	// devicesMu.
	devicesMu sync.RWMutex `state:"nosave"`
	devices   map[devTuple]*registeredDevice

	// dynCharDevMajorUsed contains all allocated dynamic character device
	// major numbers. dynCharDevMajorUsed is protected by dynCharDevMajorMu.
	dynCharDevMajorMu   sync.Mutex `state:"nosave"`
	dynCharDevMajorUsed map[uint32]struct{}

	// anonBlockDevMinor contains all allocated anonymous block device minor
	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
	// unallocated anonymous block device number. anonBlockDevMinorNext and
	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
	anonBlockDevMinorNext uint32
	anonBlockDevMinor     map[uint32]struct{}

	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
	// fsTypesMu.
	fsTypesMu sync.RWMutex `state:"nosave"`
	fsTypes   map[string]*registeredFilesystemType

	// filesystems contains all Filesystems. filesystems is protected by
	// filesystemsMu.
	filesystemsMu sync.Mutex `state:"nosave"`
	filesystems   map[*Filesystem]struct{}

	// groupIDBitmap tracks which mount group IDs are available for allocation.
	groupIDBitmap bitmap.Bitmap

	// mountPromises contains all unresolved mount promises, keyed by the
	// VirtualDentry they refer to.
	// NOTE(review): presumably protected by mountPromisesMu; confirm against
	// the code that reads and writes this map.
	mountPromisesMu sync.RWMutex `state:"nosave"`
	mountPromises   map[VirtualDentry]*waiter.Queue
}
   149  
   150  // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
   151  func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
   152  	if vfs.mountpoints != nil {
   153  		panic("VFS already initialized")
   154  	}
   155  	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
   156  	vfs.devices = make(map[devTuple]*registeredDevice)
   157  	vfs.dynCharDevMajorUsed = make(map[uint32]struct{})
   158  	vfs.anonBlockDevMinorNext = 1
   159  	vfs.anonBlockDevMinor = make(map[uint32]struct{})
   160  	vfs.fsTypes = make(map[string]*registeredFilesystemType)
   161  	vfs.filesystems = make(map[*Filesystem]struct{})
   162  	vfs.mounts.Init()
   163  	vfs.groupIDBitmap = bitmap.New(1024)
   164  	vfs.mountPromises = make(map[VirtualDentry]*waiter.Queue)
   165  
   166  	// Construct vfs.anonMount.
   167  	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
   168  	if err != nil {
   169  		// This shouldn't be possible since anonBlockDevMinorNext was
   170  		// initialized to 1 above (no device numbers have been allocated yet).
   171  		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
   172  	}
   173  	anonfs := anonFilesystem{
   174  		devMinor: anonfsDevMinor,
   175  	}
   176  	anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
   177  	defer anonfs.vfsfs.DecRef(ctx)
   178  	anonMount := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
   179  	vfs.anonMount = anonMount
   180  
   181  	return nil
   182  }
   183  
   184  // Release drops references on filesystem objects held by vfs.
   185  //
   186  // Precondition: This must be called after VFS.Init() has succeeded.
   187  func (vfs *VirtualFilesystem) Release(ctx context.Context) {
   188  	vfs.anonMount.DecRef(ctx)
   189  	for _, fst := range vfs.fsTypes {
   190  		fst.fsType.Release(ctx)
   191  	}
   192  }
   193  
// PathOperation specifies the path operated on by a VFS method.
//
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
// it's somewhat large and should never escape. (Options structs are passed by
// pointer to VFS and FileDescription methods for the same reason.)
//
// +stateify savable
type PathOperation struct {
	// Root is the VFS root. References on Root are borrowed from the provider
	// of the PathOperation.
	//
	// Invariants: Root.Ok().
	Root VirtualDentry

	// Start is the starting point for the path traversal. References on Start
	// are borrowed from the provider of the PathOperation (i.e. the caller of
	// the VFS method to which the PathOperation was passed).
	//
	// Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
	Start VirtualDentry

	// Path is the pathname traversed by this operation.
	Path fspath.Path

	// If FollowFinalSymlink is true, and the Dentry traversed by the final
	// path component represents a symbolic link, the symbolic link should be
	// followed.
	//
	// The VFS methods that create, delete or rename files (LinkAt, MkdirAt,
	// MknodAt, RenameAt, RmdirAt, SymlinkAt, UnlinkAt) reject a PathOperation
	// with FollowFinalSymlink set by returning EINVAL. OpenAt clears this
	// field on the caller's PathOperation when O_NOFOLLOW is requested.
	FollowFinalSymlink bool
}
   223  
   224  // AccessAt checks whether a user with creds has access to the file at
   225  // the given path.
   226  func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error {
   227  	rp := vfs.getResolvingPath(creds, pop)
   228  	for {
   229  		vfs.maybeBlockOnMountPromise(ctx, rp)
   230  		err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
   231  		if err == nil {
   232  			rp.Release(ctx)
   233  			return nil
   234  		}
   235  		if !rp.handleError(ctx, err) {
   236  			rp.Release(ctx)
   237  			return err
   238  		}
   239  	}
   240  }
   241  
   242  // GetDentryAt returns a VirtualDentry representing the given path, at which a
   243  // file must exist. A reference is taken on the returned VirtualDentry.
   244  func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
   245  	rp := vfs.getResolvingPath(creds, pop)
   246  	for {
   247  		vfs.maybeBlockOnMountPromise(ctx, rp)
   248  		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
   249  		if err == nil {
   250  			vd := VirtualDentry{
   251  				mount:  rp.mount,
   252  				dentry: d,
   253  			}
   254  			rp.mount.IncRef()
   255  			rp.Release(ctx)
   256  			return vd, nil
   257  		}
   258  		if !rp.handleError(ctx, err) {
   259  			rp.Release(ctx)
   260  			return VirtualDentry{}, err
   261  		}
   262  	}
   263  }
   264  
   265  // Preconditions: pop.Path.Begin.Ok().
   266  func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) {
   267  	rp := vfs.getResolvingPath(creds, pop)
   268  	for {
   269  		vfs.maybeBlockOnMountPromise(ctx, rp)
   270  		parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp)
   271  		if err == nil {
   272  			parentVD := VirtualDentry{
   273  				mount:  rp.mount,
   274  				dentry: parent,
   275  			}
   276  			rp.mount.IncRef()
   277  			name := rp.Component()
   278  			rp.Release(ctx)
   279  			return parentVD, name, nil
   280  		}
   281  		if checkInvariants {
   282  			if rp.canHandleError(err) && rp.Done() {
   283  				panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   284  			}
   285  		}
   286  		if !rp.handleError(ctx, err) {
   287  			rp.Release(ctx)
   288  			return VirtualDentry{}, "", err
   289  		}
   290  	}
   291  }
   292  
   293  // LinkAt creates a hard link at newpop representing the existing file at
   294  // oldpop.
   295  func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
   296  	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
   297  	if err != nil {
   298  		return err
   299  	}
   300  
   301  	if !newpop.Path.Begin.Ok() {
   302  		oldVD.DecRef(ctx)
   303  		if newpop.Path.Absolute {
   304  			return linuxerr.EEXIST
   305  		}
   306  		return linuxerr.ENOENT
   307  	}
   308  	if newpop.FollowFinalSymlink {
   309  		oldVD.DecRef(ctx)
   310  		ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
   311  		return linuxerr.EINVAL
   312  	}
   313  
   314  	rp := vfs.getResolvingPath(creds, newpop)
   315  	for {
   316  		vfs.maybeBlockOnMountPromise(ctx, rp)
   317  		err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
   318  		if err == nil {
   319  			rp.Release(ctx)
   320  			oldVD.DecRef(ctx)
   321  			return nil
   322  		}
   323  		if checkInvariants {
   324  			if rp.canHandleError(err) && rp.Done() {
   325  				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   326  			}
   327  		}
   328  		if !rp.handleError(ctx, err) {
   329  			rp.Release(ctx)
   330  			oldVD.DecRef(ctx)
   331  			return err
   332  		}
   333  	}
   334  }
   335  
   336  // MkdirAt creates a directory at the given path.
   337  func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
   338  	if !pop.Path.Begin.Ok() {
   339  		// pop.Path should not be empty in operations that create/delete files.
   340  		// This is consistent with mkdirat(dirfd, "", mode).
   341  		if pop.Path.Absolute {
   342  			return linuxerr.EEXIST
   343  		}
   344  		return linuxerr.ENOENT
   345  	}
   346  	if pop.FollowFinalSymlink {
   347  		ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink")
   348  		return linuxerr.EINVAL
   349  	}
   350  	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
   351  	// also honored." - mkdir(2)
   352  	opts.Mode &= 0777 | linux.S_ISVTX
   353  
   354  	rp := vfs.getResolvingPath(creds, pop)
   355  	for {
   356  		vfs.maybeBlockOnMountPromise(ctx, rp)
   357  		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
   358  		if err == nil {
   359  			rp.Release(ctx)
   360  			return nil
   361  		}
   362  		if checkInvariants {
   363  			if rp.canHandleError(err) && rp.Done() {
   364  				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   365  			}
   366  		}
   367  		if !rp.handleError(ctx, err) {
   368  			rp.Release(ctx)
   369  			return err
   370  		}
   371  	}
   372  }
   373  
   374  // MknodAt creates a file of the given mode at the given path. It returns an
   375  // error from the linuxerr package.
   376  func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
   377  	if !pop.Path.Begin.Ok() {
   378  		// pop.Path should not be empty in operations that create/delete files.
   379  		// This is consistent with mknodat(dirfd, "", mode, dev).
   380  		if pop.Path.Absolute {
   381  			return linuxerr.EEXIST
   382  		}
   383  		return linuxerr.ENOENT
   384  	}
   385  	if pop.FollowFinalSymlink {
   386  		ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink")
   387  		return linuxerr.EINVAL
   388  	}
   389  
   390  	rp := vfs.getResolvingPath(creds, pop)
   391  	for {
   392  		vfs.maybeBlockOnMountPromise(ctx, rp)
   393  		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
   394  		if err == nil {
   395  			rp.Release(ctx)
   396  			return nil
   397  		}
   398  		if checkInvariants {
   399  			if rp.canHandleError(err) && rp.Done() {
   400  				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   401  			}
   402  		}
   403  		if !rp.handleError(ctx, err) {
   404  			rp.Release(ctx)
   405  			return err
   406  		}
   407  	}
   408  }
   409  
   410  // OpenAt returns a FileDescription providing access to the file at the given
   411  // path. A reference is taken on the returned FileDescription.
   412  func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
   413  	fsmetric.Opens.Increment()
   414  
   415  	// Remove:
   416  	//
   417  	//	- O_CLOEXEC, which affects file descriptors and therefore must be
   418  	//		handled outside of VFS.
   419  	//
   420  	//	- Unknown flags.
   421  	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
   422  	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
   423  	if opts.Flags&linux.O_SYNC != 0 {
   424  		opts.Flags |= linux.O_DSYNC
   425  	}
   426  	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
   427  	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
   428  	// filesystem implementations that do not support it).
   429  	if opts.Flags&linux.O_TMPFILE != 0 {
   430  		if opts.Flags&linux.O_DIRECTORY == 0 {
   431  			return nil, linuxerr.EINVAL
   432  		}
   433  		if opts.Flags&linux.O_CREAT != 0 {
   434  			return nil, linuxerr.EINVAL
   435  		}
   436  		if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
   437  			return nil, linuxerr.EINVAL
   438  		}
   439  	}
   440  	// O_PATH causes most other flags to be ignored.
   441  	if opts.Flags&linux.O_PATH != 0 {
   442  		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
   443  	}
   444  	// "On Linux, the following bits are also honored in mode: [S_ISUID,
   445  	// S_ISGID, S_ISVTX]" - open(2)
   446  	opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
   447  
   448  	if opts.Flags&linux.O_NOFOLLOW != 0 {
   449  		pop.FollowFinalSymlink = false
   450  	}
   451  	if opts.Flags&linux.O_PATH != 0 {
   452  		return vfs.openOPathFD(ctx, creds, pop, opts.Flags)
   453  	}
   454  	rp := vfs.getResolvingPath(creds, pop)
   455  	if opts.Flags&linux.O_DIRECTORY != 0 {
   456  		rp.mustBeDir = true
   457  	}
   458  	for {
   459  		vfs.maybeBlockOnMountPromise(ctx, rp)
   460  		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
   461  		if err == nil {
   462  			rp.Release(ctx)
   463  
   464  			if opts.FileExec {
   465  				if fd.Mount().Flags.NoExec {
   466  					fd.DecRef(ctx)
   467  					return nil, linuxerr.EACCES
   468  				}
   469  
   470  				// Only a regular file can be executed.
   471  				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
   472  				if err != nil {
   473  					fd.DecRef(ctx)
   474  					return nil, err
   475  				}
   476  				if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
   477  					fd.DecRef(ctx)
   478  					return nil, linuxerr.EACCES
   479  				}
   480  			}
   481  
   482  			fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent)
   483  			return fd, nil
   484  		}
   485  		if !rp.handleError(ctx, err) {
   486  			rp.Release(ctx)
   487  			return nil, err
   488  		}
   489  	}
   490  }
   491  
   492  // ReadlinkAt returns the target of the symbolic link at the given path.
   493  func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
   494  	rp := vfs.getResolvingPath(creds, pop)
   495  	for {
   496  		vfs.maybeBlockOnMountPromise(ctx, rp)
   497  		target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
   498  		if err == nil {
   499  			rp.Release(ctx)
   500  			return target, nil
   501  		}
   502  		if !rp.handleError(ctx, err) {
   503  			rp.Release(ctx)
   504  			return "", err
   505  		}
   506  	}
   507  }
   508  
   509  // RenameAt renames the file at oldpop to newpop.
   510  func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
   511  	if !oldpop.Path.Begin.Ok() {
   512  		if oldpop.Path.Absolute {
   513  			return linuxerr.EBUSY
   514  		}
   515  		return linuxerr.ENOENT
   516  	}
   517  	if oldpop.FollowFinalSymlink {
   518  		ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink")
   519  		return linuxerr.EINVAL
   520  	}
   521  
   522  	oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop)
   523  	if err != nil {
   524  		return err
   525  	}
   526  	if oldName == "." || oldName == ".." {
   527  		oldParentVD.DecRef(ctx)
   528  		return linuxerr.EBUSY
   529  	}
   530  	if len(oldName) > linux.NAME_MAX {
   531  		oldParentVD.DecRef(ctx)
   532  		return linuxerr.ENAMETOOLONG
   533  	}
   534  
   535  	if !newpop.Path.Begin.Ok() {
   536  		oldParentVD.DecRef(ctx)
   537  		if newpop.Path.Absolute {
   538  			return linuxerr.EBUSY
   539  		}
   540  		return linuxerr.ENOENT
   541  	}
   542  	if newpop.FollowFinalSymlink {
   543  		oldParentVD.DecRef(ctx)
   544  		ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
   545  		return linuxerr.EINVAL
   546  	}
   547  
   548  	rp := vfs.getResolvingPath(creds, newpop)
   549  	renameOpts := *opts
   550  	if oldpop.Path.Dir {
   551  		renameOpts.MustBeDir = true
   552  	}
   553  	for {
   554  		vfs.maybeBlockOnMountPromise(ctx, rp)
   555  		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
   556  		if err == nil {
   557  			rp.Release(ctx)
   558  			oldParentVD.DecRef(ctx)
   559  			return nil
   560  		}
   561  		if checkInvariants {
   562  			if rp.canHandleError(err) && rp.Done() {
   563  				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   564  			}
   565  		}
   566  		if !rp.handleError(ctx, err) {
   567  			rp.Release(ctx)
   568  			oldParentVD.DecRef(ctx)
   569  			return err
   570  		}
   571  	}
   572  }
   573  
   574  // RmdirAt removes the directory at the given path.
   575  func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
   576  	if !pop.Path.Begin.Ok() {
   577  		// pop.Path should not be empty in operations that create/delete files.
   578  		// This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR).
   579  		if pop.Path.Absolute {
   580  			return linuxerr.EBUSY
   581  		}
   582  		return linuxerr.ENOENT
   583  	}
   584  	if pop.FollowFinalSymlink {
   585  		ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink")
   586  		return linuxerr.EINVAL
   587  	}
   588  
   589  	rp := vfs.getResolvingPath(creds, pop)
   590  	for {
   591  		vfs.maybeBlockOnMountPromise(ctx, rp)
   592  		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
   593  		if err == nil {
   594  			rp.Release(ctx)
   595  			return nil
   596  		}
   597  		if checkInvariants {
   598  			if rp.canHandleError(err) && rp.Done() {
   599  				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   600  			}
   601  		}
   602  		if !rp.handleError(ctx, err) {
   603  			rp.Release(ctx)
   604  			return err
   605  		}
   606  	}
   607  }
   608  
   609  // SetStatAt changes metadata for the file at the given path.
   610  func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
   611  	rp := vfs.getResolvingPath(creds, pop)
   612  	for {
   613  		vfs.maybeBlockOnMountPromise(ctx, rp)
   614  		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
   615  		if err == nil {
   616  			rp.Release(ctx)
   617  			return nil
   618  		}
   619  		if !rp.handleError(ctx, err) {
   620  			rp.Release(ctx)
   621  			return err
   622  		}
   623  	}
   624  }
   625  
   626  // StatAt returns metadata for the file at the given path.
   627  func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
   628  	rp := vfs.getResolvingPath(creds, pop)
   629  	for {
   630  		vfs.maybeBlockOnMountPromise(ctx, rp)
   631  		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
   632  		if err == nil {
   633  			rp.Release(ctx)
   634  			return stat, nil
   635  		}
   636  		if !rp.handleError(ctx, err) {
   637  			rp.Release(ctx)
   638  			return linux.Statx{}, err
   639  		}
   640  	}
   641  }
   642  
   643  // StatFSAt returns metadata for the filesystem containing the file at the
   644  // given path.
   645  func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
   646  	rp := vfs.getResolvingPath(creds, pop)
   647  	for {
   648  		vfs.maybeBlockOnMountPromise(ctx, rp)
   649  		statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
   650  		if err == nil {
   651  			rp.Release(ctx)
   652  			return statfs, nil
   653  		}
   654  		if !rp.handleError(ctx, err) {
   655  			rp.Release(ctx)
   656  			return linux.Statfs{}, err
   657  		}
   658  	}
   659  }
   660  
   661  // SymlinkAt creates a symbolic link at the given path with the given target.
   662  func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
   663  	if !pop.Path.Begin.Ok() {
   664  		// pop.Path should not be empty in operations that create/delete files.
   665  		// This is consistent with symlinkat(oldpath, newdirfd, "").
   666  		if pop.Path.Absolute {
   667  			return linuxerr.EEXIST
   668  		}
   669  		return linuxerr.ENOENT
   670  	}
   671  	if pop.FollowFinalSymlink {
   672  		ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink")
   673  		return linuxerr.EINVAL
   674  	}
   675  
   676  	rp := vfs.getResolvingPath(creds, pop)
   677  	for {
   678  		vfs.maybeBlockOnMountPromise(ctx, rp)
   679  		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
   680  		if err == nil {
   681  			rp.Release(ctx)
   682  			return nil
   683  		}
   684  		if checkInvariants {
   685  			if rp.canHandleError(err) && rp.Done() {
   686  				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   687  			}
   688  		}
   689  		if !rp.handleError(ctx, err) {
   690  			rp.Release(ctx)
   691  			return err
   692  		}
   693  	}
   694  }
   695  
   696  // UnlinkAt deletes the non-directory file at the given path.
   697  func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
   698  	if !pop.Path.Begin.Ok() {
   699  		// pop.Path should not be empty in operations that create/delete files.
   700  		// This is consistent with unlinkat(dirfd, "", 0).
   701  		if pop.Path.Absolute {
   702  			return linuxerr.EBUSY
   703  		}
   704  		return linuxerr.ENOENT
   705  	}
   706  	if pop.FollowFinalSymlink {
   707  		ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink")
   708  		return linuxerr.EINVAL
   709  	}
   710  
   711  	rp := vfs.getResolvingPath(creds, pop)
   712  	for {
   713  		vfs.maybeBlockOnMountPromise(ctx, rp)
   714  		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
   715  		if err == nil {
   716  			rp.Release(ctx)
   717  			return nil
   718  		}
   719  		if checkInvariants {
   720  			if rp.canHandleError(err) && rp.Done() {
   721  				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   722  			}
   723  		}
   724  		if !rp.handleError(ctx, err) {
   725  			rp.Release(ctx)
   726  			return err
   727  		}
   728  	}
   729  }
   730  
   731  // BoundEndpointAt gets the bound endpoint at the given path, if one exists.
   732  func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
   733  	rp := vfs.getResolvingPath(creds, pop)
   734  	for {
   735  		vfs.maybeBlockOnMountPromise(ctx, rp)
   736  		bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
   737  		if err == nil {
   738  			rp.Release(ctx)
   739  			return bep, nil
   740  		}
   741  		if checkInvariants {
   742  			if rp.canHandleError(err) && rp.Done() {
   743  				panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
   744  			}
   745  		}
   746  		if !rp.handleError(ctx, err) {
   747  			rp.Release(ctx)
   748  			return nil, err
   749  		}
   750  	}
   751  }
   752  
   753  // ListXattrAt returns all extended attribute names for the file at the given
   754  // path.
   755  func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
   756  	rp := vfs.getResolvingPath(creds, pop)
   757  	for {
   758  		vfs.maybeBlockOnMountPromise(ctx, rp)
   759  		names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size)
   760  		if err == nil {
   761  			rp.Release(ctx)
   762  			return names, nil
   763  		}
   764  		if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) {
   765  			// Linux doesn't actually return EOPNOTSUPP in this case; instead,
   766  			// fs/xattr.c:vfs_listxattr() falls back to allowing the security
   767  			// subsystem to return security extended attributes, which by
   768  			// default don't exist.
   769  			rp.Release(ctx)
   770  			return nil, nil
   771  		}
   772  		if !rp.handleError(ctx, err) {
   773  			rp.Release(ctx)
   774  			return nil, err
   775  		}
   776  	}
   777  }
   778  
   779  // GetXattrAt returns the value associated with the given extended attribute
   780  // for the file at the given path.
   781  func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) {
   782  	rp := vfs.getResolvingPath(creds, pop)
   783  	for {
   784  		vfs.maybeBlockOnMountPromise(ctx, rp)
   785  		val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
   786  		if err == nil {
   787  			rp.Release(ctx)
   788  			return val, nil
   789  		}
   790  		if !rp.handleError(ctx, err) {
   791  			rp.Release(ctx)
   792  			return "", err
   793  		}
   794  	}
   795  }
   796  
   797  // SetXattrAt changes the value associated with the given extended attribute
   798  // for the file at the given path.
   799  func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error {
   800  	rp := vfs.getResolvingPath(creds, pop)
   801  	for {
   802  		vfs.maybeBlockOnMountPromise(ctx, rp)
   803  		err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
   804  		if err == nil {
   805  			rp.Release(ctx)
   806  			return nil
   807  		}
   808  		if !rp.handleError(ctx, err) {
   809  			rp.Release(ctx)
   810  			return err
   811  		}
   812  	}
   813  }
   814  
   815  // RemoveXattrAt removes the given extended attribute from the file at rp.
   816  func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
   817  	rp := vfs.getResolvingPath(creds, pop)
   818  	for {
   819  		vfs.maybeBlockOnMountPromise(ctx, rp)
   820  		err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
   821  		if err == nil {
   822  			rp.Release(ctx)
   823  			return nil
   824  		}
   825  		if !rp.handleError(ctx, err) {
   826  			rp.Release(ctx)
   827  			return err
   828  		}
   829  	}
   830  }
   831  
   832  // SyncAllFilesystems has the semantics of Linux's sync(2).
   833  func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
   834  	var retErr error
   835  	for fs := range vfs.getFilesystems() {
   836  		if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
   837  			retErr = err
   838  		}
   839  		fs.DecRef(ctx)
   840  	}
   841  	return retErr
   842  }
   843  
   844  func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} {
   845  	fss := make(map[*Filesystem]struct{})
   846  	vfs.filesystemsMu.Lock()
   847  	defer vfs.filesystemsMu.Unlock()
   848  	for fs := range vfs.filesystems {
   849  		if !fs.TryIncRef() {
   850  			continue
   851  		}
   852  		fss[fs] = struct{}{}
   853  	}
   854  	return fss
   855  }
   856  
   857  // MkdirAllAt recursively creates non-existent directories on the given path
   858  // (including the last component).
   859  func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions, mustBeDir bool) error {
   860  	pop := &PathOperation{
   861  		Root:  root,
   862  		Start: root,
   863  		Path:  fspath.Parse(currentPath),
   864  	}
   865  	stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE})
   866  	switch {
   867  	case err == nil:
   868  		if mustBeDir && (stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory) {
   869  			return linuxerr.ENOTDIR
   870  		}
   871  		// Directory already exists.
   872  		return nil
   873  	case linuxerr.Equals(linuxerr.ENOENT, err):
   874  		// Expected, we will create the dir.
   875  	default:
   876  		return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err)
   877  	}
   878  
   879  	// Recurse to ensure parent is created and then create the final directory.
   880  	if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts, true /* mustBeDir */); err != nil {
   881  		return err
   882  	}
   883  	if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil {
   884  		return fmt.Errorf("failed to create directory %q: %w", currentPath, err)
   885  	}
   886  	return nil
   887  }
   888  
   889  // MakeSyntheticMountpoint creates parent directories of target if they do not
   890  // exist and attempts to create a directory for the mountpoint. If a
   891  // non-directory file already exists there then we allow it.
   892  func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
   893  	mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
   894  
   895  	// Make sure the parent directory of target exists.
   896  	if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts, true /* mustBeDir */); err != nil {
   897  		return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
   898  	}
   899  
   900  	// Attempt to mkdir the final component. If a file (of any type) exists
   901  	// then we let allow mounting on top of that because we do not require the
   902  	// target to be an existing directory, unlike Linux mount(2).
   903  	if err := vfs.MkdirAllAt(ctx, target, root, creds, mkdirOpts, false /* mustBeDir */); err != nil {
   904  		return fmt.Errorf("failed to create mountpoint %q: %w", target, err)
   905  	}
   906  	return nil
   907  }
   908  
   909  // RegisterMountPromise marks vd as a mount promise. This means any VFS
   910  // operation on vd will be blocked until another process mounts over it or the
   911  // mount promise times out.
   912  func (vfs *VirtualFilesystem) RegisterMountPromise(vd VirtualDentry) error {
   913  	vfs.mountPromisesMu.Lock()
   914  	defer vfs.mountPromisesMu.Unlock()
   915  	if _, ok := vfs.mountPromises[vd]; ok {
   916  		return fmt.Errorf("mount promise for %v already exists", vd)
   917  	}
   918  	wq := &waiter.Queue{}
   919  	vfs.mountPromises[vd] = wq
   920  	return nil
   921  }
   922  
   923  // Emit a SentryMountPromiseBlockEvent and wait for the mount promise to be
   924  // resolved or time out.
   925  func (vfs *VirtualFilesystem) maybeBlockOnMountPromise(ctx context.Context, rp *ResolvingPath) {
   926  	vd := VirtualDentry{rp.mount, rp.start}
   927  	vfs.mountPromisesMu.RLock()
   928  	wq, ok := vfs.mountPromises[vd]
   929  	vfs.mountPromisesMu.RUnlock()
   930  	if !ok {
   931  		return
   932  	}
   933  
   934  	path, err := vfs.PathnameReachable(ctx, rp.root, vd)
   935  	if err != nil {
   936  		panic(fmt.Sprintf("could not reach %v from root", rp.Component()))
   937  	}
   938  	e, ch := waiter.NewChannelEntry(waiter.EventOut)
   939  	wq.EventRegister(&e)
   940  	eventchannel.Emit(&epb.SentryMountPromiseBlockEvent{Path: path})
   941  
   942  	select {
   943  	case <-ch:
   944  		// Update rp to point to the promised mount.
   945  		newMnt := vfs.getMountAt(ctx, rp.mount, rp.start)
   946  		rp.mount = newMnt
   947  		rp.start = newMnt.root
   948  		rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef
   949  	case <-time.After(mountPromiseTimeout):
   950  		log.Warningf("mount promise for %s timed out, proceeding with VFS operation", path)
   951  	}
   952  }
   953  
   954  func (vfs *VirtualFilesystem) maybeResolveMountPromise(vd VirtualDentry) {
   955  	vfs.mountPromisesMu.Lock()
   956  	defer vfs.mountPromisesMu.Unlock()
   957  	wq, ok := vfs.mountPromises[vd]
   958  	if !ok {
   959  		return
   960  	}
   961  	wq.Notify(waiter.EventOut)
   962  	delete(vfs.mountPromises, vd)
   963  }
   964  
   965  // A VirtualDentry represents a node in a VFS tree, by combining a Dentry
   966  // (which represents a node in a Filesystem's tree) and a Mount (which
   967  // represents the Filesystem's position in a VFS mount tree).
   968  //
   969  // VirtualDentry's semantics are similar to that of a Go interface object
   970  // representing a pointer: it is a copyable value type that represents
   971  // references to another entity. The zero value of VirtualDentry is an "empty
   972  // VirtualDentry", directly analogous to a nil interface object.
   973  // VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless
   974  // otherwise specified, all other VirtualDentry methods require
   975  // VirtualDentry.Ok() == true.
   976  //
   977  // Mounts and Dentries are reference-counted, requiring that users call
   978  // VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to
   979  // references on the Mount and Dentry referred to by a VirtualDentry as
   980  // references on the VirtualDentry itself. Unless otherwise specified, all
   981  // VirtualDentry methods require that a reference is held on the VirtualDentry.
   982  //
   983  // VirtualDentry is analogous to Linux's struct path.
   984  //
   985  // +stateify savable
   986  type VirtualDentry struct {
   987  	mount  *Mount
   988  	dentry *Dentry
   989  }
   990  
   991  // MakeVirtualDentry creates a VirtualDentry.
   992  func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry {
   993  	return VirtualDentry{
   994  		mount:  mount,
   995  		dentry: dentry,
   996  	}
   997  }
   998  
   999  // Ok returns true if vd is not empty. It does not require that a reference is
  1000  // held.
  1001  func (vd VirtualDentry) Ok() bool {
  1002  	return vd.mount != nil
  1003  }
  1004  
  1005  // IncRef increments the reference counts on the Mount and Dentry represented
  1006  // by vd.
  1007  func (vd VirtualDentry) IncRef() {
  1008  	vd.mount.IncRef()
  1009  	vd.dentry.IncRef()
  1010  }
  1011  
  1012  // DecRef decrements the reference counts on the Mount and Dentry represented
  1013  // by vd.
  1014  func (vd VirtualDentry) DecRef(ctx context.Context) {
  1015  	vd.dentry.DecRef(ctx)
  1016  	vd.mount.DecRef(ctx)
  1017  }
  1018  
  1019  // Mount returns the Mount associated with vd. It does not take a reference on
  1020  // the returned Mount.
  1021  func (vd VirtualDentry) Mount() *Mount {
  1022  	return vd.mount
  1023  }
  1024  
  1025  // Dentry returns the Dentry associated with vd. It does not take a reference
  1026  // on the returned Dentry.
  1027  func (vd VirtualDentry) Dentry() *Dentry {
  1028  	return vd.dentry
  1029  }