gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/vfs/mount.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"math"
    21  	"sort"
    22  	"strings"
    23  
    24  	"gvisor.dev/gvisor/pkg/abi/linux"
    25  	"gvisor.dev/gvisor/pkg/atomicbitops"
    26  	"gvisor.dev/gvisor/pkg/cleanup"
    27  	"gvisor.dev/gvisor/pkg/context"
    28  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    29  	"gvisor.dev/gvisor/pkg/refs"
    30  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    31  )
    32  
// MountMax is the maximum number of mounts allowed. In Linux this can be
// configured by the user at /proc/sys/fs/mount-max, but the default is
// 100,000. We set the gVisor limit to 10,000.
const (
	MountMax     = 10000
	// nsfsName is the filesystem type name of namespace FD mounts; such
	// mounts are exempt from the same-mount-namespace check in BindAt.
	nsfsName     = "nsfs"
	// cgroupFsName is the cgroup filesystem type name, likewise exempt from
	// the same-mount-namespace check in BindAt.
	cgroupFsName = "cgroup"
)
    41  
// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
// (Mount.fs), which applies to path resolution in the context of a particular
// Mount (Mount.key.parent).
//
// Mounts are reference-counted. Unless otherwise specified, all Mount methods
// require that a reference is held.
//
// Mount and Filesystem are distinct types because it's possible for a single
// Filesystem to be mounted at multiple locations and/or in multiple mount
// namespaces.
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
//
// +stateify savable
type Mount struct {
	// vfs, fs, root are immutable. References are held on fs and root.
	// Note that for a disconnected mount, root may be nil.
	//
	// Invariant: if not nil, root belongs to fs.
	vfs  *VirtualFilesystem
	fs   *Filesystem
	root *Dentry

	// ID is the immutable mount ID.
	ID uint64

	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
	// for MS_RDONLY which is tracked in "writers". flags is protected by
	// VirtualFilesystem.mountMu.
	flags MountFlags

	// key is protected by VirtualFilesystem.mountMu and
	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
	// key.parent and key.point if they are not nil.
	//
	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
	// key.parent.fs.
	key mountKey `state:".(VirtualDentry)"`

	// ns is the namespace in which this Mount was mounted. ns is protected by
	// VirtualFilesystem.mountMu.
	ns *MountNamespace

	// The lower 63 bits of refs are a reference count. The MSB of refs is set
	// if the Mount has been eagerly umounted, as by umount(2) without the
	// MNT_DETACH flag. refs is accessed using atomic memory operations.
	refs atomicbitops.Int64

	// children is the set of all Mounts for which Mount.key.parent is this
	// Mount. children is protected by VirtualFilesystem.mountMu.
	children map[*Mount]struct{}

	// isShared indicates this mount has the MS_SHARED propagation type.
	// isShared is written only with VirtualFilesystem.mountMu held (see
	// cloneMount, attachTreeLocked).
	isShared bool

	// sharedEntry is an entry in a circular list (ring) of mounts in a shared
	// peer group.
	sharedEntry mountEntry

	// followerList is a list of mounts which has this mount as its leader.
	followerList followerList

	// followerEntry is an entry in a followerList.
	followerEntry

	// leader is the mount that this mount receives propagation events from.
	// leader is written only with VirtualFilesystem.mountMu held (see
	// cloneMount).
	leader *Mount

	// groupID is the ID for this mount's shared peer group. If the mount is not
	// in a peer group, this is 0.
	groupID uint32

	// umounted is true if VFS.umountRecursiveLocked() has been called on this
	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
	umounted bool

	// locked is true if the mount cannot be unmounted in the current mount
	// namespace. It is analogous to MNT_LOCKED in Linux.
	locked bool

	// The lower 63 bits of writers is the number of calls to
	// Mount.CheckBeginWrite() that have not yet been paired with a call to
	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
	// writers is accessed using atomic memory operations.
	writers atomicbitops.Int64
}
   131  
   132  func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
   133  	mnt := &Mount{
   134  		ID:       vfs.lastMountID.Add(1),
   135  		flags:    opts.Flags,
   136  		vfs:      vfs,
   137  		fs:       fs,
   138  		root:     root,
   139  		ns:       mntns,
   140  		locked:   opts.Locked,
   141  		isShared: false,
   142  		refs:     atomicbitops.FromInt64(1),
   143  	}
   144  	if opts.ReadOnly {
   145  		mnt.setReadOnlyLocked(true)
   146  	}
   147  	mnt.sharedEntry.Init(mnt)
   148  	refs.Register(mnt)
   149  	return mnt
   150  }
   151  
   152  // Options returns a copy of the MountOptions currently applicable to mnt.
   153  func (mnt *Mount) Options() MountOptions {
   154  	mnt.vfs.lockMounts()
   155  	defer mnt.vfs.unlockMounts(context.Background())
   156  	return MountOptions{
   157  		Flags:    mnt.flags,
   158  		ReadOnly: mnt.ReadOnlyLocked(),
   159  	}
   160  }
   161  
   162  // setMountOptions sets mnt's options to the given opts.
   163  //
   164  // Preconditions:
   165  //   - vfs.mountMu must be locked.
   166  func (mnt *Mount) setMountOptions(opts *MountOptions) error {
   167  	if opts == nil {
   168  		return linuxerr.EINVAL
   169  	}
   170  	if err := mnt.setReadOnlyLocked(opts.ReadOnly); err != nil {
   171  		return err
   172  	}
   173  	mnt.flags = opts.Flags
   174  	return nil
   175  }
   176  
   177  // MountFlags returns a bit mask that indicates mount options.
   178  func (mnt *Mount) MountFlags() uint64 {
   179  	mnt.vfs.lockMounts()
   180  	defer mnt.vfs.unlockMounts(context.Background())
   181  	var flags uint64
   182  	if mnt.flags.NoExec {
   183  		flags |= linux.ST_NOEXEC
   184  	}
   185  	if mnt.flags.NoATime {
   186  		flags |= linux.ST_NOATIME
   187  	}
   188  	if mnt.flags.NoDev {
   189  		flags |= linux.ST_NODEV
   190  	}
   191  	if mnt.flags.NoSUID {
   192  		flags |= linux.ST_NOSUID
   193  	}
   194  	if mnt.ReadOnlyLocked() {
   195  		flags |= linux.ST_RDONLY
   196  	}
   197  	return flags
   198  }
   199  
// isFollower reports whether mnt receives propagation events from a leader
// mount (the MS_SLAVE propagation type, in Linux terms).
func (mnt *Mount) isFollower() bool {
	return mnt.leader != nil
}
   203  
// neverConnected reports whether mnt has never been attached to a
// MountNamespace; ns is assigned when the mount is connected (see
// connectLocked).
func (mnt *Mount) neverConnected() bool {
	return mnt.ns == nil
}
   207  
   208  // coveringMount returns a mount that completely covers mnt if it exists and nil
   209  // otherwise. A mount that covers another is one that is the only child of its
   210  // parent and whose mountpoint is its parent's root.
   211  func (mnt *Mount) coveringMount() *Mount {
   212  	if len(mnt.children) != 1 {
   213  		return nil
   214  	}
   215  	// Get the child from the children map.
   216  	var child *Mount
   217  	for child = range mnt.children {
   218  		break
   219  	}
   220  	if child.point() != mnt.root {
   221  		return nil
   222  	}
   223  	return child
   224  }
   225  
   226  // validInMountNS checks if the mount is valid in the current mount namespace. This includes
   227  // checking if has previously been unmounted. It is analogous to fs/namespace.c:check_mnt() in
   228  // Linux.
   229  //
   230  // +checklocks:vfs.mountMu
   231  func (vfs *VirtualFilesystem) validInMountNS(ctx context.Context, mnt *Mount) bool {
   232  	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
   233  		vfs.delayDecRef(mntns)
   234  		return mnt.ns == mntns && !mnt.umounted
   235  	}
   236  	return false
   237  }
   238  
   239  // NewFilesystem creates a new filesystem object not yet associated with any
   240  // mounts. It can be installed into the filesystem tree with ConnectMountAt.
   241  // Note that only the filesystem-specific mount options from opts are used by
   242  // this function, mount flags are ignored. To set mount flags, pass them to a
   243  // corresponding ConnectMountAt.
   244  func (vfs *VirtualFilesystem) NewFilesystem(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*Filesystem, *Dentry, error) {
   245  	rft := vfs.getFilesystemType(fsTypeName)
   246  	if rft == nil {
   247  		return nil, nil, linuxerr.ENODEV
   248  	}
   249  	if !opts.GetFilesystemOptions.InternalMount && !rft.opts.AllowUserMount {
   250  		return nil, nil, linuxerr.ENODEV
   251  	}
   252  	return rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
   253  }
   254  
   255  // NewDisconnectedMount returns a Mount representing fs with the given root
   256  // (which may be nil). The new Mount is not associated with any MountNamespace
   257  // and is not connected to any other Mounts. References are taken on fs and
   258  // root.
   259  func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) *Mount {
   260  	fs.IncRef()
   261  	if root != nil {
   262  		root.IncRef()
   263  	}
   264  	return newMount(vfs, fs, root, nil /* mntns */, opts)
   265  }
   266  
   267  // MountDisconnected creates a Filesystem configured by the given arguments,
   268  // then returns a Mount representing it. The new Mount is not associated with
   269  // any MountNamespace and is not connected to any other Mounts.
   270  func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
   271  	fs, root, err := vfs.NewFilesystem(ctx, creds, source, fsTypeName, opts)
   272  	if err != nil {
   273  		return nil, err
   274  	}
   275  	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
   276  }
   277  
// attachTreeLocked attaches the mount tree at mnt to mp and propagates the mount to mp.mount's
// peers and followers. This method consumes the reference on mp. It is analogous to
// fs/namespace.c:attach_recursive_mnt() in Linux. The mount point mp must have its dentry locked
// before calling attachTreeLocked.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) attachTreeLocked(ctx context.Context, mnt *Mount, mp VirtualDentry) error {
	// On any failure before the tree is connected: release any group IDs
	// allocated for the tree, unlock the mount point's dentry, and drop the
	// reference this method consumes on mp.
	cleanup := cleanup.Make(func() {
		vfs.cleanupGroupIDs(mnt.submountsLocked()) // +checklocksforce
		mp.dentry.mu.Unlock()
		vfs.delayDecRef(mp)
	})
	defer cleanup.Clean()
	// This is equivalent to checking for SB_NOUSER in Linux, which is set on all
	// anon mounts and sentry-internal filesystems like pipefs.
	if mp.mount.neverConnected() {
		return linuxerr.EINVAL
	}
	// NOTE(review): checkMountCount appears to reserve capacity in ns.pending;
	// it is zeroed here once the attach has either committed or failed —
	// confirm against MountNamespace.checkMountCount.
	defer func() { mp.mount.ns.pending = 0 }()
	if err := mp.mount.ns.checkMountCount(ctx, mnt); err != nil {
		return err
	}

	var (
		propMnts map[*Mount]struct{}
		err      error
	)
	if mp.mount.isShared {
		// The new tree needs peer-group IDs before it can be propagated to
		// mp's peers and followers.
		if err := vfs.allocMountGroupIDs(mnt, true); err != nil {
			return err
		}
		propMnts, err = vfs.doPropagation(ctx, mnt, mp)
		if err != nil {
			// Roll back partially propagated clones: un-charge each target
			// namespace's pending count and discard the uncommitted trees.
			for pmnt := range propMnts {
				if !pmnt.parent().neverConnected() {
					pmnt.parent().ns.pending -= pmnt.countSubmountsLocked()
				}
				vfs.abortUncommitedMount(ctx, pmnt)
			}
			return err
		}
	}
	cleanup.Release()

	// A tree attached under a shared mount becomes shared itself.
	if mp.mount.isShared {
		for _, m := range mnt.submountsLocked() {
			m.isShared = true
		}
	}
	vfs.mounts.seq.BeginWrite()
	vfs.connectLocked(mnt, mp, mp.mount.ns)
	vfs.mounts.seq.EndWrite()
	mp.dentry.mu.Unlock()
	vfs.commitChildren(ctx, mnt)

	var owner *auth.UserNamespace
	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
		owner = mntns.Owner
		mntns.DecRef(ctx)
	}
	for pmnt := range propMnts {
		vfs.commitMount(ctx, pmnt)
		// Trees propagated into a mount namespace owned by a different user
		// namespace get locked there (MNT_LOCKED), so that namespace cannot
		// unmount parts of them.
		if pmnt.parent().ns.Owner != owner {
			vfs.lockMountTree(pmnt)
		}
		// The propagated tree's root itself remains unmountable by its
		// namespace; only submounts stay locked.
		pmnt.locked = false
	}
	return nil
}
   347  
   348  // +checklocks:vfs.mountMu
   349  func (vfs *VirtualFilesystem) lockMountTree(mnt *Mount) {
   350  	for _, m := range mnt.submountsLocked() {
   351  		// TODO(b/315839347): Add equivalents for MNT_LOCK_ATIME,
   352  		// MNT_LOCK_READONLY, etc.
   353  		m.locked = true
   354  	}
   355  }
   356  
   357  // +checklocks:vfs.mountMu
   358  func (vfs *VirtualFilesystem) mountHasLockedChildren(mnt *Mount, vd VirtualDentry) bool {
   359  	for child := range mnt.children {
   360  		mp := child.getKey()
   361  		if !mp.mount.fs.Impl().IsDescendant(vd, mp) {
   362  			continue
   363  		}
   364  		if child.locked {
   365  			return true
   366  		}
   367  	}
   368  	return false
   369  }
   370  
   371  // ConnectMountAt connects mnt at the path represented by target.
   372  //
   373  // Preconditions: mnt must be disconnected.
   374  func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
   375  	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
   376  	// lock ordering.
   377  	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
   378  	if err != nil {
   379  		return err
   380  	}
   381  	vfs.lockMounts()
   382  	defer vfs.unlockMounts(ctx)
   383  	mp, err := vfs.lockMountpoint(vd)
   384  	if err != nil {
   385  		return err
   386  	}
   387  	if mp.mount.neverConnected() || mp.mount.umounted {
   388  		mp.dentry.mu.Unlock()
   389  		vfs.delayDecRef(mp)
   390  		return linuxerr.EINVAL
   391  	}
   392  	return vfs.attachTreeLocked(ctx, mnt, mp)
   393  }
   394  
// lockMountpoint returns VirtualDentry with a locked Dentry. If vd is a
// mountpoint, the method returns a VirtualDentry with a locked Dentry that is
// the top most mount stacked on that Dentry. This method consumes a reference
// on vd and returns a VirtualDentry with an extra reference. It is analogous to
// fs/namespace.c:do_lock_mount() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) lockMountpoint(vd VirtualDentry) (VirtualDentry, error) {
	vd.dentry.mu.Lock()
	for {
		// The candidate may have been unmounted or its dentry deleted while
		// we were acquiring locks; give up in that case.
		if vd.mount.umounted || vd.dentry.dead {
			vd.dentry.mu.Unlock()
			vfs.delayDecRef(vd)
			return VirtualDentry{}, linuxerr.ENOENT
		}
		// vd might have been mounted over between vfs.GetDentryAt() and
		// vfs.mountMu.Lock().
		if !vd.dentry.isMounted() {
			break
		}
		nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
		if nextmnt == nil {
			break
		}
		// It's possible that nextmnt has been umounted but not disconnected,
		// in which case vfs no longer holds a reference on it, and the last
		// reference may be concurrently dropped even though we're holding
		// vfs.mountMu.
		if !nextmnt.tryIncMountedRef() {
			break
		}
		// This can't fail since we're holding vfs.mountMu.
		nextmnt.root.IncRef()
		// Climb onto the next mount in the stack: swap the lock and the
		// reference from the old vd to the new one.
		vd.dentry.mu.Unlock()
		vfs.delayDecRef(vd)
		vd = VirtualDentry{
			mount:  nextmnt,
			dentry: nextmnt.root,
		}
		vd.dentry.mu.Lock()
	}
	// Returned with vd.dentry.mu held and a reference on vd transferred to
	// the caller.
	return vd, nil
}
   438  
   439  // CloneMountAt returns a new mount with the same fs, specified root and
   440  // mount options.  If mount options are nil, mnt's options are copied. The clone
   441  // is added to mnt's peer group if mnt is shared. If not the clone is in a
   442  // shared peer group by itself.
   443  func (vfs *VirtualFilesystem) CloneMountAt(mnt *Mount, root *Dentry, mopts *MountOptions) (*Mount, error) {
   444  	vfs.lockMounts()
   445  	defer vfs.unlockMounts(context.Background())
   446  	return vfs.cloneMount(mnt, root, mopts, makeSharedClone)
   447  }
   448  
// cloneMount returns a new mount with mnt.fs as the filesystem and root as the
// root, with a propagation type specified by cloneType. The returned mount has
// an extra reference. If mopts is nil, use the options found in mnt.
// This method is analogous to fs/namespace.c:clone_mnt() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) cloneMount(mnt *Mount, root *Dentry, mopts *MountOptions, cloneType int) (*Mount, error) {
	opts := mopts
	if opts == nil {
		// Inherit mount flags and read-only state from the original.
		opts = &MountOptions{
			Flags:    mnt.flags,
			ReadOnly: mnt.ReadOnlyLocked(),
		}
	}
	clone := vfs.NewDisconnectedMount(mnt.fs, root, opts)
	// Clones that become followers or private leave mnt's peer group;
	// otherwise the clone inherits mnt's group ID.
	if cloneType&(makeFollowerClone|makePrivateClone|sharedToFollowerClone) != 0 {
		clone.groupID = 0
	} else {
		clone.groupID = mnt.groupID
	}
	// A clone that must be shared needs a peer group of its own if it did not
	// inherit one above.
	if cloneType&makeSharedClone != 0 && clone.groupID == 0 {
		if err := vfs.allocateGroupID(clone); err != nil {
			vfs.delayDecRef(clone)
			return nil, err
		}
	}
	clone.isShared = mnt.isShared
	clone.locked = mnt.locked
	if cloneType&makeFollowerClone != 0 || (cloneType&sharedToFollowerClone != 0 && mnt.isShared) {
		// The clone becomes a follower of mnt itself.
		mnt.followerList.PushFront(clone)
		clone.leader = mnt
		clone.isShared = false
	} else if cloneType&makePrivateClone == 0 {
		// The clone joins mnt's peer group (if shared) and shares mnt's
		// leader (if mnt is a follower).
		if cloneType&makeSharedClone != 0 || mnt.isShared {
			mnt.sharedEntry.Add(&clone.sharedEntry)
		}
		if mnt.isFollower() {
			mnt.leader.followerList.InsertAfter(mnt, clone)
		}
		clone.leader = mnt.leader
	} else {
		// Fully private clone: no peers, no leader.
		clone.isShared = false
	}
	if cloneType&makeSharedClone != 0 {
		clone.isShared = true
	}
	return clone, nil
}
   497  
// cloneTreeNode is a work-queue entry used by cloneMountTree.
type cloneTreeNode struct {
	// prevMount is a mount in the original tree whose children still need to
	// be cloned.
	prevMount   *Mount
	// parentMount is prevMount's already-created counterpart in the cloned
	// tree, under which the cloned children will be placed.
	parentMount *Mount
}
   502  
// cloneMountTree creates a copy of mnt's tree with the specified root
// dentry at root. The new descendants are added to mnt's children list but are
// not connected with call to connectLocked.
// `cloneFunc` is a callback that is executed for each cloned mount.
// This method is analogous to fs/namespace.c:copy_tree() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) cloneMountTree(ctx context.Context, mnt *Mount, root *Dentry, cloneType int, cloneFunc func(ctx context.Context, oldmnt, newMnt *Mount)) (*Mount, error) {
	// Clone the root of the tree first.
	clone, err := vfs.cloneMount(mnt, root, nil, cloneType)
	if err != nil {
		return nil, err
	}
	if cloneFunc != nil {
		cloneFunc(ctx, mnt, clone)
	}
	// Walk the original tree (LIFO work queue), cloning each child under its
	// parent's clone.
	queue := []cloneTreeNode{{mnt, clone}}
	for len(queue) != 0 {
		p := queue[len(queue)-1]
		queue = queue[:len(queue)-1]
		for c := range p.prevMount.children {
			// Children of the tree root that are not reachable from the
			// (possibly non-root) dentry being cloned are skipped.
			if mp := c.getKey(); p.prevMount == mnt && !mp.mount.fs.Impl().IsDescendant(VirtualDentry{mnt, root}, mp) {
				continue
			}
			m, err := vfs.cloneMount(c, c.root, nil, cloneType)
			if err != nil {
				// Discard everything cloned so far.
				vfs.abortUncommitedMount(ctx, clone)
				return nil, err
			}
			// Record the clone's prospective mount point (parent clone + the
			// original child's mount point dentry) without connecting it yet.
			mp := VirtualDentry{
				mount:  p.parentMount,
				dentry: c.point(),
			}
			mp.IncRef()
			m.setKey(mp)
			if p.parentMount.children == nil {
				p.parentMount.children = make(map[*Mount]struct{})
			}
			p.parentMount.children[m] = struct{}{}
			if len(c.children) != 0 {
				queue = append(queue, cloneTreeNode{c, m})
			}
			if cloneFunc != nil {
				cloneFunc(ctx, c, m)
			}
		}
	}
	return clone, nil
}
   551  
// BindAt creates a clone of the source path's parent mount and mounts it at
// the target path. The new mount's root dentry is one pointed to by the source
// path.
func (vfs *VirtualFilesystem) BindAt(ctx context.Context, creds *auth.Credentials, source, target *PathOperation, recursive bool) error {
	// Resolve both endpoints before taking mountMu; FilesystemImpl methods
	// cannot be called with mountMu held due to lock ordering.
	sourceVd, err := vfs.GetDentryAt(ctx, creds, source, &GetDentryOptions{})
	if err != nil {
		return err
	}
	defer sourceVd.DecRef(ctx)
	targetVd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
	if err != nil {
		return err
	}

	vfs.lockMounts()
	defer vfs.unlockMounts(ctx)
	mp, err := vfs.lockMountpoint(targetVd)
	if err != nil {
		return err
	}
	// Until the clone has been created, every error path must unlock the
	// mount point's dentry and drop the reference lockMountpoint returned.
	cleanup := cleanup.Make(func() {
		mp.dentry.mu.Unlock()
		vfs.delayDecRef(mp) // +checklocksforce
	})
	defer cleanup.Clean()
	// Namespace (nsfs) and cgroup mounts may be bind-mounted even though they
	// do not belong to the caller's mount namespace.
	fsName := sourceVd.mount.Filesystem().FilesystemType().Name()
	if !vfs.validInMountNS(ctx, sourceVd.mount) && fsName != nsfsName && fsName != cgroupFsName {
		return linuxerr.EINVAL
	}
	if !vfs.validInMountNS(ctx, mp.mount) {
		return linuxerr.EINVAL
	}

	var clone *Mount
	if recursive {
		clone, err = vfs.cloneMountTree(ctx, sourceVd.mount, sourceVd.dentry, 0, nil)
	} else {
		// A non-recursive bind of a mount with locked children would let the
		// caller see beneath those locked mounts, so refuse it.
		if vfs.mountHasLockedChildren(sourceVd.mount, sourceVd) {
			return linuxerr.EINVAL
		}
		clone, err = vfs.cloneMount(sourceVd.mount, sourceVd.dentry, nil, 0)
	}
	if err != nil {
		return err
	}
	// From here attachTreeLocked takes over ownership of mp (it consumes the
	// reference and unlocks mp.dentry on all paths).
	cleanup.Release()

	vfs.delayDecRef(clone)
	// The caller explicitly created this bind mount, so it is not locked in
	// its own namespace.
	clone.locked = false
	if err := vfs.attachTreeLocked(ctx, clone, mp); err != nil {
		vfs.abortUncomittedChildren(ctx, clone)
		return err
	}
	return nil
}
   608  
   609  // RemountAt changes the mountflags and data of an existing mount without having to unmount and remount the filesystem.
   610  func (vfs *VirtualFilesystem) RemountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MountOptions) error {
   611  	vd, err := vfs.getMountpoint(ctx, creds, pop)
   612  	if err != nil {
   613  		return err
   614  	}
   615  	defer vd.DecRef(ctx)
   616  	vfs.lockMounts()
   617  	defer vfs.unlockMounts(ctx)
   618  	mnt := vd.Mount()
   619  	if !vfs.validInMountNS(ctx, mnt) {
   620  		return linuxerr.EINVAL
   621  	}
   622  	return mnt.setMountOptions(opts)
   623  }
   624  
   625  // MountAt creates and mounts a Filesystem configured by the given arguments.
   626  // The VirtualFilesystem will hold a reference to the Mount until it is
   627  // unmounted.
   628  //
   629  // This method returns the mounted Mount without a reference, for convenience
   630  // during VFS setup when there is no chance of racing with unmount.
   631  func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) {
   632  	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
   633  	if err != nil {
   634  		return nil, err
   635  	}
   636  	defer mnt.DecRef(ctx)
   637  	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
   638  		return nil, err
   639  	}
   640  	return mnt, nil
   641  }
   642  
   643  // UmountAt removes the Mount at the given path.
   644  func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
   645  	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
   646  		return linuxerr.EINVAL
   647  	}
   648  
   649  	// MNT_FORCE is currently unimplemented except for the permission check.
   650  	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
   651  	// namespace, and not in the owner user namespace for the target mount. See
   652  	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
   653  	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
   654  		return linuxerr.EPERM
   655  	}
   656  	vd, err := vfs.getMountpoint(ctx, creds, pop)
   657  	if err != nil {
   658  		return err
   659  	}
   660  	defer vd.DecRef(ctx)
   661  
   662  	vfs.lockMounts()
   663  	defer vfs.unlockMounts(ctx)
   664  	if vd.mount.locked {
   665  		return linuxerr.EINVAL
   666  	}
   667  	if !vfs.validInMountNS(ctx, vd.mount) {
   668  		return linuxerr.EINVAL
   669  	}
   670  	if vd.mount == vd.mount.ns.root {
   671  		return linuxerr.EINVAL
   672  	}
   673  
   674  	if opts.Flags&linux.MNT_DETACH == 0 && vfs.arePropMountsBusy(vd.mount) {
   675  		return linuxerr.EBUSY
   676  	}
   677  
   678  	// TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
   679  	// root, which we don't implement yet (we'll just fail it since the caller
   680  	// holds a reference on it).
   681  	vfs.umountTreeLocked(vd.mount, &umountRecursiveOptions{
   682  		eager:               opts.Flags&linux.MNT_DETACH == 0,
   683  		disconnectHierarchy: true,
   684  		propagate:           true,
   685  	})
   686  	return nil
   687  }
   688  
   689  // mountHasExpectedRefs checks that mnt has the correct number of references
   690  // before a umount. It is analogous to fs/pnode.c:do_refcount_check().
   691  //
   692  // +checklocks:vfs.mountMu
   693  func (vfs *VirtualFilesystem) mountHasExpectedRefs(mnt *Mount) bool {
   694  	expectedRefs := int64(1)
   695  	if !mnt.umounted {
   696  		expectedRefs++
   697  	}
   698  	if mnt.coveringMount() != nil {
   699  		expectedRefs++
   700  	}
   701  	return mnt.refs.Load()&^math.MinInt64 == expectedRefs // mask out MSB
   702  }
   703  
// umountRecursiveOptions controls the behavior of a recursive unmount (see
// umountTreeLocked).
//
// +stateify savable
type umountRecursiveOptions struct {
	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
	// on umounted mounts fail.
	//
	// eager is analogous to Linux's UMOUNT_SYNC.
	eager bool

	// If disconnectHierarchy is true, Mounts that are umounted hierarchically
	// should be disconnected from their parents. (Mounts whose parents are not
	// umounted, which in most cases means the Mount passed to the initial call
	// to umountRecursiveLocked, are unconditionally disconnected for
	// consistency with Linux.)
	//
	// disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
	disconnectHierarchy bool

	// If propagate is true, mounts located at the same point on the mount's
	// parent's peers and followers will also be umounted if they do not have
	// any children.
	//
	// propagate is analogous to Linux's UMOUNT_PROPAGATE.
	propagate bool
}
   728  
   729  // shouldUmount returns if this mount should be disconnected from its parent.
   730  // It is analogous to fs/namespace.c:disconnect_mount() in Linux.
   731  //
   732  // +checklocks:vfs.mountMu
   733  func (vfs *VirtualFilesystem) shouldUmount(mnt *Mount, opts *umountRecursiveOptions) bool {
   734  	// Always disconnect when it's not a lazy unmount.
   735  	if opts.eager {
   736  		return true
   737  	}
   738  	// If a mount does not have a parent, it won't be disconnected but will be
   739  	// DecRef-ed.
   740  	if mnt.parent() == nil {
   741  		return true
   742  	}
   743  	// Always unmount if the parent is not marked as unmounted.
   744  	if !mnt.parent().umounted {
   745  		return true
   746  	}
   747  	// If the parent is marked as unmounted, we can only unmount is
   748  	// UMOUNT_CONNECTED is false.
   749  	if !opts.disconnectHierarchy {
   750  		return false
   751  	}
   752  	if mnt.locked {
   753  		return false
   754  	}
   755  	return true
   756  }
   757  
// umountTreeLocked marks mnt and its descendants as umounted.
//
// umountTreeLocked is analogous to Linux's fs/namespace.c:umount_tree().
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) umountTreeLocked(mnt *Mount, opts *umountRecursiveOptions) {
	if opts.propagate {
		vfs.unlockPropagationMounts(mnt)
	}
	// Mark the whole subtree as unmounted and detach each mount from its
	// parent's children set.
	umountMnts := mnt.submountsLocked()
	for _, mnt := range umountMnts {
		vfs.umount(mnt)
	}
	// Extend the set with mounts unmounted by propagation to peers/followers.
	if opts.propagate {
		umountMnts = append(umountMnts, vfs.propagateUmount(umountMnts)...)
	}

	vfs.mounts.seq.BeginWrite()
	for _, mnt := range umountMnts {
		if opts.eager {
			// Set the MSB of refs (CAS loop) so tryIncMountedRef fails from
			// now on; skip if it is already set (refs < 0).
			for {
				refs := mnt.refs.Load()
				if refs < 0 {
					break
				}
				if mnt.refs.CompareAndSwap(refs, refs|math.MinInt64) {
					break
				}
			}
		}
		if mnt.parent() != nil {
			// Drop the reference held on the mount point dentry/parent.
			vfs.delayDecRef(mnt.getKey())
			if vfs.shouldUmount(mnt, opts) {
				vfs.disconnectLocked(mnt)
			} else {
				// Restore mnt in its parent's children list with a reference, but
				// leave it marked as unmounted. These partly unmounted mounts are
				// cleaned up in vfs.forgetDeadMountpoints and Mount.destroy. We keep
				// the extra reference on the mount but remove a reference on the
				// mount point so that mount.Destroy is called when there are no
				// other references on the parent.
				mnt.IncRef()
				mnt.parent().children[mnt] = struct{}{}
			}
		}
		// Unmounted mounts no longer participate in propagation.
		vfs.setPropagation(mnt, linux.MS_PRIVATE)
	}
	vfs.mounts.seq.EndWrite()
}
   806  
   807  // +checklocks:vfs.mountMu
   808  func (vfs *VirtualFilesystem) umount(mnt *Mount) {
   809  	if !mnt.umounted {
   810  		mnt.umounted = true
   811  		vfs.delayDecRef(mnt)
   812  	}
   813  	if parent := mnt.parent(); parent != nil {
   814  		delete(parent.children, mnt)
   815  	}
   816  }
   817  
// changeMountpoint disconnects mnt from its current mount point and connects
// it to mp. It must be called from a vfs.mounts.seq writer critical section.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) changeMountpoint(mnt *Mount, mp VirtualDentry) {
	mp.dentry.mu.Lock()
	// Drop the old mount point (returned by disconnectLocked with a
	// reference) and the reference on mnt that connectLocked will re-take.
	vfs.delayDecRef(vfs.disconnectLocked(mnt))
	vfs.delayDecRef(mnt)
	// connectLocked consumes the reference taken here on mp.
	mp.IncRef()
	vfs.connectLocked(mnt, mp, mp.mount.ns)
	mp.dentry.mu.Unlock()
}
   830  
// connectLocked makes vd the mount parent/point for mnt. It consumes
// references held by vd.
//
// Preconditions:
//   - vfs.mountMu must be locked.
//   - vfs.mounts.seq must be in a writer critical section.
//   - d.mu must be locked.
//   - mnt.parent() == nil or mnt.parent().children doesn't contain mnt.
//     i.e. mnt must not already be connected.
func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
	if checkInvariants {
		if mnt.parent() != nil && mnt.parent().children != nil {
			if _, ok := mnt.parent().children[mnt]; ok {
				panic("VFS.connectLocked called on connected mount")
			}
		}
	}
	mnt.IncRef() // dropped by vfs.umount().
	// mnt.key records both the parent mount and the mount point dentry.
	mnt.setKey(vd)
	if vd.mount.children == nil {
		vd.mount.children = make(map[*Mount]struct{})
	}
	vd.mount.children[mnt] = struct{}{}
	// Mark the mount point dentry as mounted-on.
	vd.dentry.mounts.Add(1)
	mnt.ns = mntns
	// Per-namespace bookkeeping: mounts stacked on this dentry, and the total
	// number of mounts in the namespace.
	mntns.mountpoints[vd.dentry]++
	mntns.mounts++
	vfs.mounts.insertSeqed(mnt)
	// Global (cross-namespace) index of mounts keyed by mount point dentry.
	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
	if !ok {
		vfsmpmounts = make(map[*Mount]struct{})
		vfs.mountpoints[vd.dentry] = vfsmpmounts
	}
	vfsmpmounts[mnt] = struct{}{}
	// A mount has appeared at vd; resolve any mount promise waiting on it.
	vfs.maybeResolveMountPromise(vd)
}
   867  
// disconnectLocked makes vd have no mount parent/point and returns its old
// mount parent/point with a reference held.
//
// Preconditions:
//   - vfs.mountMu must be locked.
//   - vfs.mounts.seq must be in a writer critical section.
//   - mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
	vd := mnt.getKey()
	if checkInvariants {
		if vd.mount == nil {
			panic("VFS.disconnectLocked called on disconnected mount")
		}
		if mnt.ns.mountpoints[vd.dentry] == 0 {
			panic("VFS.disconnectLocked called on dentry with zero mountpoints.")
		}
		if mnt.ns.mounts == 0 {
			panic("VFS.disconnectLocked called on namespace with zero mounts.")
		}
	}
	delete(vd.mount.children, mnt)
	vd.dentry.mounts.Add(math.MaxUint32) // -1 via unsigned wraparound.
	// Undo the per-namespace bookkeeping done by connectLocked.
	mnt.ns.mountpoints[vd.dentry]--
	mnt.ns.mounts--
	if mnt.ns.mountpoints[vd.dentry] == 0 {
		delete(mnt.ns.mountpoints, vd.dentry)
	}
	vfs.mounts.removeSeqed(mnt)
	mnt.setKey(VirtualDentry{}) // Clear mnt.key.
	// Undo the global mountpoint-index bookkeeping as well.
	vfsmpmounts := vfs.mountpoints[vd.dentry]
	delete(vfsmpmounts, mnt)
	if len(vfsmpmounts) == 0 {
		delete(vfs.mountpoints, vd.dentry)
	}
	return vd
}
   904  
   905  // tryIncMountedRef increments mnt's reference count and returns true. If mnt's
   906  // reference count is already zero, or has been eagerly umounted,
   907  // tryIncMountedRef does nothing and returns false.
   908  //
   909  // tryIncMountedRef does not require that a reference is held on mnt.
   910  func (mnt *Mount) tryIncMountedRef() bool {
   911  	for {
   912  		r := mnt.refs.Load()
   913  		if r <= 0 { // r < 0 => MSB set => eagerly unmounted
   914  			return false
   915  		}
   916  		if mnt.refs.CompareAndSwap(r, r+1) {
   917  			if mnt.LogRefs() {
   918  				refs.LogTryIncRef(mnt, r+1)
   919  			}
   920  			return true
   921  		}
   922  	}
   923  }
   924  
   925  // IncRef increments mnt's reference count.
   926  func (mnt *Mount) IncRef() {
   927  	// In general, negative values for mnt.refs are valid because the MSB is
   928  	// the eager-unmount bit.
   929  	r := mnt.refs.Add(1)
   930  	if mnt.LogRefs() {
   931  		refs.LogIncRef(mnt, r)
   932  	}
   933  }
   934  
   935  // DecRef decrements mnt's reference count.
   936  func (mnt *Mount) DecRef(ctx context.Context) {
   937  	r := mnt.refs.Add(-1)
   938  	if mnt.LogRefs() {
   939  		refs.LogDecRef(mnt, r)
   940  	}
   941  	if r&^math.MinInt64 == 0 { // mask out MSB
   942  		refs.Unregister(mnt)
   943  		mnt.destroy(ctx)
   944  	}
   945  }
   946  
// destroy releases the resources held by mnt: it disconnects mnt and any
// remaining children, and drops the references on mnt's root dentry and
// filesystem. It is called by DecRef when mnt's reference count (ignoring
// the eager-unmount flag) reaches zero.
func (mnt *Mount) destroy(ctx context.Context) {
	mnt.vfs.lockMounts()
	defer mnt.vfs.unlockMounts(ctx)
	// If mnt is still connected, detach it and drop the reference on its old
	// mount point.
	if mnt.parent() != nil {
		mnt.vfs.mounts.seq.BeginWrite()
		vd := mnt.vfs.disconnectLocked(mnt)
		if vd.Ok() {
			mnt.vfs.delayDecRef(vd)
		}
		mnt.vfs.mounts.seq.EndWrite()
	}

	// Cleanup any leftover children. The mount point has already been decref'd in
	// umount so we just need to clean up the actual mounts.
	if len(mnt.children) != 0 {
		mnt.vfs.mounts.seq.BeginWrite()
		for child := range mnt.children {
			if checkInvariants {
				if !child.umounted {
					panic("children of a mount that has no references should already be marked as unmounted.")
				}
			}
			mnt.vfs.disconnectLocked(child)
			mnt.vfs.delayDecRef(child)
		}
		mnt.vfs.mounts.seq.EndWrite()
	}

	// Drop the references on mnt's root dentry and its filesystem.
	if mnt.root != nil {
		mnt.vfs.delayDecRef(mnt.root)
	}
	mnt.vfs.delayDecRef(mnt.fs)
}
   980  
   981  // RefType implements refs.CheckedObject.Type.
   982  func (mnt *Mount) RefType() string {
   983  	return "vfs.Mount"
   984  }
   985  
   986  // LeakMessage implements refs.CheckedObject.LeakMessage.
   987  func (mnt *Mount) LeakMessage() string {
   988  	return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, mnt.refs.Load())
   989  }
   990  
// LogRefs implements refs.CheckedObject.LogRefs.
//
// This should only be set to true for debugging purposes, as it can generate an
// extremely large amount of output and drastically degrade performance.
// Returning false here disables per-operation reference logging for Mounts.
func (mnt *Mount) LogRefs() bool {
	return false
}
   998  
// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
// a reference on the returned Mount. If (mnt, d) is not a mount point,
// getMountAt returns nil.
//
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
	// The first mount is special-cased:
	//
	//	- The caller is assumed to have checked d.isMounted() already. (This
	//		isn't a precondition because it doesn't matter for correctness.)
	//
	//	- We return nil, instead of mnt, if there is no mount at (mnt, d).
	//
	//	- We don't drop the caller's references on mnt and d.
retryFirst:
	next := vfs.mounts.Lookup(mnt, d)
	if next == nil {
		return nil
	}
	if !next.tryIncMountedRef() {
		// Raced with umount. Re-do the lookup; a different mount (or none)
		// may now be at (mnt, d).
		goto retryFirst
	}
	mnt = next
	d = next.root
	// We don't need to take Dentry refs anywhere in this function because
	// Mounts hold references on Mount.root, which is immutable.
	//
	// Walk up the stack of mounts: each iteration moves to the mount mounted
	// on top of the current mount's root, if any.
	for d.isMounted() {
		next := vfs.mounts.Lookup(mnt, d)
		if next == nil {
			break
		}
		if !next.tryIncMountedRef() {
			// Raced with umount. Retry the lookup at the current (mnt, d).
			continue
		}
		// Drop the reference on the mount we are stepping off of; the first
		// iteration's reference was taken above, not borrowed from the caller.
		mnt.DecRef(ctx)
		mnt = next
		d = next.root
	}
	return mnt
}
  1043  
// getMountpoint returns the top mount for the given path.
// If the path is not a mountpoint, it returns an error.
//
// The returned VirtualDentry has an extra reference.
func (vfs *VirtualFilesystem) getMountpoint(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, error) {
	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
	if err != nil {
		return VirtualDentry{}, err
	}
	// Linux passes the LOOKUP_MOUNTPOINT flag to user_path_at in ksys_umount to
	// resolve to the topmost mount in the stack located at the specified path.
	// vfs.getMountAt() imitates this behavior. See fs/namei.c:user_path_at(...)
	// and fs/namespace.c:ksys_umount(...).
	if vd.dentry.isMounted() {
		if mnt := vfs.getMountAt(ctx, vd.mount, vd.dentry); mnt != nil {
			// Swap the mount reference for the topmost mount's reference
			// (getMountAt already took one on mnt).
			vd.mount.DecRef(ctx)
			vd.mount = mnt
		}
	} else if vd.dentry != vd.mount.root {
		// Nothing is mounted here and this isn't a mount root, so the path
		// is not a mountpoint.
		vd.DecRef(ctx)
		return VirtualDentry{}, linuxerr.EINVAL
	}
	return vd, nil
}
  1068  
// getMountpointAt returns the mount point for the stack of Mounts including
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
//
// Preconditions:
//   - References are held on mnt and root.
//   - vfsroot is not (mnt, mnt.root).
func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
	// The first mount is special-cased:
	//
	//	- The caller must have already checked mnt against vfsroot.
	//
	//	- We return nil, instead of mnt, if there is no mount point for mnt.
	//
	//	- We don't drop the caller's reference on mnt.
	//
	// Each (parent, point) read is guarded by a vfs.mounts.seq reader
	// critical section so that a racing writer (e.g. a concurrent mount or
	// unmount) forces a retry instead of yielding a torn read.
retryFirst:
	epoch := vfs.mounts.seq.BeginRead()
	parent, point := mnt.parent(), mnt.point()
	if !vfs.mounts.seq.ReadOk(epoch) {
		goto retryFirst
	}
	if parent == nil {
		// mnt is a root mount; there is no mount point.
		return VirtualDentry{}
	}
	if !parent.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	if !point.TryIncRef() {
		// Since Mount holds a reference on Mount.key.point, this can only
		// happen due to a racing change to Mount.key.
		parent.DecRef(ctx)
		goto retryFirst
	}
	if !vfs.mounts.seq.ReadOk(epoch) {
		// Mount.key changed while we were taking references; start over.
		point.DecRef(ctx)
		parent.DecRef(ctx)
		goto retryFirst
	}
	mnt = parent
	d := point
	// Walk down the mount stack: while (mnt, d) is itself the root of a
	// mount (and not the VFS root), step to that mount's own mount point.
	for {
		if mnt == vfsroot.mount && d == vfsroot.dentry {
			break
		}
		if d != mnt.root {
			break
		}
	retryNotFirst:
		epoch := vfs.mounts.seq.BeginRead()
		parent, point := mnt.parent(), mnt.point()
		if !vfs.mounts.seq.ReadOk(epoch) {
			goto retryNotFirst
		}
		if parent == nil {
			break
		}
		if !parent.tryIncMountedRef() {
			// Raced with umount.
			goto retryNotFirst
		}
		if !point.TryIncRef() {
			// Since Mount holds a reference on Mount.key.point, this can
			// only happen due to a racing change to Mount.key.
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		if !vfs.mounts.seq.ReadOk(epoch) {
			point.DecRef(ctx)
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		// Trade the references on the previous (mnt, d) for the new ones.
		d.DecRef(ctx)
		mnt.DecRef(ctx)
		mnt = parent
		d = point
	}
	return VirtualDentry{mnt, d}
}
  1148  
// PivotRoot makes location pointed to by newRootPop the root of the current
// namespace, and moves the current root to the location pointed to by
// putOldPop. If the operation is successful, it returns virtual dentries for
// the new root and the old root with an extra reference taken.
func (vfs *VirtualFilesystem) PivotRoot(ctx context.Context, creds *auth.Credentials, newRootPop *PathOperation, putOldPop *PathOperation) (newRoot, oldRoot VirtualDentry, err error) {
	newRoot, err = vfs.GetDentryAt(ctx, creds, newRootPop, &GetDentryOptions{CheckSearchable: true})
	if err != nil {
		return
	}
	defer newRoot.DecRef(ctx)

	oldRoot = RootFromContext(ctx)
	defer oldRoot.DecRef(ctx)

	putOldVd, err := vfs.GetDentryAt(ctx, creds, putOldPop, &GetDentryOptions{CheckSearchable: true})
	if err != nil {
		return
	}
	vfs.lockMounts()
	defer vfs.unlockMounts(ctx)
	// lockMountpoint leaves putOld.dentry.mu locked on success; the cleanup
	// below unlocks it on any of the early-error returns.
	putOld, err := vfs.lockMountpoint(putOldVd)
	if err != nil {
		return
	}
	vfs.delayDecRef(putOld)

	cleanup := cleanup.Make(func() { putOld.dentry.mu.Unlock() })
	defer cleanup.Clean()
	// Neither new_root nor put_old can be on the same mount as the current
	// root mount.
	if newRoot.mount == oldRoot.mount || putOld.mount == oldRoot.mount {
		return newRoot, oldRoot, linuxerr.EBUSY
	}
	// new_root must be a mountpoint.
	if newRoot.mount.root != newRoot.dentry {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// new_root must not be locked.
	if newRoot.mount.locked {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// put_old must be at or underneath new_root.
	if !vfs.isPathReachable(ctx, newRoot, putOld) {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// the new root must be at or underneath the current root.
	if !vfs.isPathReachable(ctx, oldRoot, newRoot) {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// The current root directory must be a mountpoint
	// (in the case it has been chrooted).
	if oldRoot.mount.root != oldRoot.dentry {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// The current root and the new root must be in the context's mount namespace.
	if !vfs.validInMountNS(ctx, oldRoot.mount) || !vfs.validInMountNS(ctx, newRoot.mount) {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// The current root and the new root cannot be on the rootfs mount.
	if oldRoot.mount.parent() == nil || newRoot.mount.parent() == nil {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// Either the mount point at new_root, or the parent mount of that mount
	// point, has propagation type MS_SHARED.
	if newRootParent := newRoot.mount.parent(); newRoot.mount.isShared || newRootParent.isShared {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// put_old is a mount point and has the propagation type MS_SHARED.
	if putOld.mount.root == putOld.dentry && putOld.mount.isShared {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// All checks passed; from here on we unlock putOld.dentry.mu manually at
	// the appropriate points.
	cleanup.Release()

	// Swap the two mounts: detach both from their current mount points, then
	// reattach the old root under put_old and the new root at the old root's
	// former mount point.
	vfs.mounts.seq.BeginWrite()
	mp := vfs.disconnectLocked(newRoot.mount)
	vfs.delayDecRef(mp)
	rootMp := vfs.disconnectLocked(oldRoot.mount)
	// The "locked" attribute moves from the old root mount to the new one.
	if oldRoot.mount.locked {
		newRoot.mount.locked = true
		oldRoot.mount.locked = false
	}

	putOld.IncRef()
	vfs.connectLocked(oldRoot.mount, putOld, putOld.mount.ns)
	putOld.dentry.mu.Unlock()

	rootMp.dentry.mu.Lock()
	vfs.connectLocked(newRoot.mount, rootMp, rootMp.mount.ns)
	rootMp.dentry.mu.Unlock()
	vfs.mounts.seq.EndWrite()

	// Each connectLocked took a new reference; drop the ones held for the old
	// connections.
	vfs.delayDecRef(newRoot.mount)
	vfs.delayDecRef(oldRoot.mount)

	// Return both VirtualDentries with an extra reference for the caller.
	newRoot.IncRef()
	oldRoot.IncRef()
	return
}
  1247  
// SetMountReadOnly sets the mount as ReadOnly.
//
// Returns EBUSY (from setReadOnlyLocked) if writes are in progress.
// NOTE(review): context.Background() is passed to unlockMounts for any
// deferred DecRefs it performs — confirm no task context is needed here.
func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
	vfs.lockMounts()
	defer vfs.unlockMounts(context.Background())
	return mnt.setReadOnlyLocked(ro)
}
  1254  
  1255  // CheckBeginWrite increments the counter of in-progress write operations on
  1256  // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
  1257  // EROFS.
  1258  //
  1259  // If CheckBeginWrite succeeds, EndWrite must be called when the write
  1260  // operation is finished.
  1261  func (mnt *Mount) CheckBeginWrite() error {
  1262  	if mnt.writers.Add(1) < 0 {
  1263  		mnt.writers.Add(-1)
  1264  		return linuxerr.EROFS
  1265  	}
  1266  	return nil
  1267  }
  1268  
// EndWrite indicates that a write operation signaled by a previous successful
// call to CheckBeginWrite has finished.
func (mnt *Mount) EndWrite() {
	// Drop the in-progress-writer count taken by CheckBeginWrite.
	mnt.writers.Add(-1)
}
  1274  
// setReadOnlyLocked sets or clears mnt's read-only state, which is stored as
// the MSB of mnt.writers. Setting read-only fails with EBUSY while any writes
// are in progress.
//
// Preconditions: VirtualFilesystem.mountMu must be locked.
func (mnt *Mount) setReadOnlyLocked(ro bool) error {
	// The mount is read-only iff mnt.writers is negative (MSB set).
	if oldRO := mnt.writers.Load() < 0; oldRO == ro {
		return nil
	}
	if ro {
		// rw -> ro is only allowed when no writes are in progress, i.e. when
		// mnt.writers is exactly 0.
		if !mnt.writers.CompareAndSwap(0, math.MinInt64) {
			return linuxerr.EBUSY
		}
		return nil
	}
	// Unset MSB without dropping any temporary increments from failed calls to
	// mnt.CheckBeginWrite().
	mnt.writers.Add(math.MinInt64)
	return nil
}
  1291  
// ReadOnly returns true if mount is readonly.
//
// Holding mountMu serializes this check against concurrent
// setReadOnlyLocked transitions.
func (mnt *Mount) ReadOnly() bool {
	mnt.vfs.lockMounts()
	defer mnt.vfs.unlockMounts(context.Background())
	// The read-only state is the MSB (sign bit) of mnt.writers.
	return mnt.writers.Load() < 0
}
  1298  
// ReadOnlyLocked returns true if mount is readonly.
//
// Preconditions: VirtualFilesystem.mountMu must be locked.
func (mnt *Mount) ReadOnlyLocked() bool {
	// The read-only state is the MSB (sign bit) of mnt.writers.
	return mnt.writers.Load() < 0
}
  1305  
// Filesystem returns the mounted Filesystem. It does not take a reference on
// the returned Filesystem.
func (mnt *Mount) Filesystem() *Filesystem {
	return mnt.fs
}
  1311  
  1312  // submountsLocked returns this Mount and all Mounts that are descendents of
  1313  // it.
  1314  //
  1315  // Precondition: mnt.vfs.mountMu must be held.
  1316  func (mnt *Mount) submountsLocked() []*Mount {
  1317  	mounts := []*Mount{mnt}
  1318  	for m := range mnt.children {
  1319  		mounts = append(mounts, m.submountsLocked()...)
  1320  	}
  1321  	return mounts
  1322  }
  1323  
  1324  // countSubmountsLocked returns mnt's total number of descendants including
  1325  // uncommitted descendants.
  1326  //
  1327  // Precondition: mnt.vfs.mountMu must be held.
  1328  func (mnt *Mount) countSubmountsLocked() uint32 {
  1329  	mounts := uint32(1)
  1330  	for m := range mnt.children {
  1331  		mounts += m.countSubmountsLocked()
  1332  	}
  1333  	return mounts
  1334  }
  1335  
// Root returns the mount's root. It does not take a reference on the returned
// Dentry. Mount.root is immutable, so no synchronization is needed.
func (mnt *Mount) Root() *Dentry {
	return mnt.root
}
  1341  
  1342  // GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
  1343  //
  1344  // Preconditions: taskRootDir.Ok().
  1345  func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
  1346  	rootMnt := taskRootDir.mount
  1347  
  1348  	vfs.lockMounts()
  1349  	mounts := rootMnt.submountsLocked()
  1350  	// Take a reference on mounts since we need to drop vfs.mountMu before
  1351  	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()).
  1352  	for _, mnt := range mounts {
  1353  		mnt.IncRef()
  1354  	}
  1355  	vfs.unlockMounts(ctx)
  1356  	defer func() {
  1357  		for _, mnt := range mounts {
  1358  			mnt.DecRef(ctx)
  1359  		}
  1360  	}()
  1361  	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
  1362  
  1363  	for _, mnt := range mounts {
  1364  		// Get the path to this mount relative to task root.
  1365  		mntRootVD := VirtualDentry{
  1366  			mount:  mnt,
  1367  			dentry: mnt.root,
  1368  		}
  1369  		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
  1370  		if err != nil {
  1371  			// For some reason we didn't get a path. Log a warning
  1372  			// and run with empty path.
  1373  			ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err)
  1374  			path = ""
  1375  		}
  1376  		if path == "" {
  1377  			// Either an error occurred, or path is not reachable
  1378  			// from root.
  1379  			break
  1380  		}
  1381  
  1382  		mntOpts := mnt.Options()
  1383  		opts := "rw"
  1384  		if mntOpts.ReadOnly {
  1385  			opts = "ro"
  1386  		}
  1387  		if mntOpts.Flags.NoATime {
  1388  			opts = ",noatime"
  1389  		}
  1390  		if mntOpts.Flags.NoExec {
  1391  			opts += ",noexec"
  1392  		}
  1393  		if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
  1394  			opts += "," + mopts
  1395  		}
  1396  
  1397  		// Format:
  1398  		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
  1399  		//
  1400  		// The "needs dump" and "fsck order" flags are always 0, which
  1401  		// is allowed.
  1402  		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
  1403  	}
  1404  }
  1405  
  1406  // GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
  1407  // buf.
  1408  //
  1409  // Preconditions: taskRootDir.Ok().
  1410  func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
  1411  	rootMnt := taskRootDir.mount
  1412  
  1413  	vfs.lockMounts()
  1414  	mounts := rootMnt.submountsLocked()
  1415  	// Take a reference on mounts since we need to drop vfs.mountMu before
  1416  	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or
  1417  	// vfs.StatAt() (=> FilesystemImpl.StatAt()).
  1418  	for _, mnt := range mounts {
  1419  		mnt.IncRef()
  1420  	}
  1421  	vfs.unlockMounts(ctx)
  1422  	defer func() {
  1423  		for _, mnt := range mounts {
  1424  			mnt.DecRef(ctx)
  1425  		}
  1426  	}()
  1427  	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
  1428  
  1429  	creds := auth.CredentialsFromContext(ctx)
  1430  	for _, mnt := range mounts {
  1431  		// Get the path to this mount relative to task root.
  1432  		mntRootVD := VirtualDentry{
  1433  			mount:  mnt,
  1434  			dentry: mnt.root,
  1435  		}
  1436  		pathFromRoot, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
  1437  		if err != nil {
  1438  			// For some reason we didn't get a path. Log a warning
  1439  			// and run with empty path.
  1440  			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
  1441  			continue
  1442  		}
  1443  		if pathFromRoot == "" {
  1444  			// The path is not reachable from root.
  1445  			continue
  1446  		}
  1447  		var pathFromFS string
  1448  		pathFromFS, err = vfs.PathnameInFilesystem(ctx, mntRootVD)
  1449  		if err != nil {
  1450  			// For some reason we didn't get a path. Log a warning
  1451  			// and run with empty path.
  1452  			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
  1453  			continue
  1454  		}
  1455  		if pathFromFS == "" {
  1456  			// The path is not reachable from root.
  1457  			continue
  1458  		}
  1459  		// Stat the mount root to get the major/minor device numbers.
  1460  		pop := &PathOperation{
  1461  			Root:  mntRootVD,
  1462  			Start: mntRootVD,
  1463  		}
  1464  		statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{})
  1465  		if err != nil {
  1466  			// Well that's not good. Ignore this mount.
  1467  			ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err)
  1468  			continue
  1469  		}
  1470  
  1471  		// Format:
  1472  		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
  1473  		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
  1474  
  1475  		// (1) Mount ID.
  1476  		fmt.Fprintf(buf, "%d ", mnt.ID)
  1477  
  1478  		// (2)  Parent ID (or this ID if there is no parent).
  1479  		// Note that even if the call to mnt.parent() races with Mount
  1480  		// destruction (which is possible since we're not holding vfs.mountMu),
  1481  		// its Mount.ID will still be valid.
  1482  		pID := mnt.ID
  1483  		if p := mnt.parent(); p != nil {
  1484  			pID = p.ID
  1485  		}
  1486  		fmt.Fprintf(buf, "%d ", pID)
  1487  
  1488  		// (3) Major:Minor device ID. We don't have a superblock, so we
  1489  		// just use the root inode device number.
  1490  		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
  1491  
  1492  		// (4) Root: the pathname of the directory in the filesystem
  1493  		// which forms the root of this mount.
  1494  		fmt.Fprintf(buf, "%s ", manglePath(pathFromFS))
  1495  
  1496  		// (5) Mount point (relative to process root).
  1497  		fmt.Fprintf(buf, "%s ", manglePath(pathFromRoot))
  1498  
  1499  		// (6) Mount options.
  1500  		opts := "rw"
  1501  		if mnt.ReadOnly() {
  1502  			opts = "ro"
  1503  		}
  1504  		if mnt.flags.NoATime {
  1505  			opts = ",noatime"
  1506  		}
  1507  		if mnt.flags.NoExec {
  1508  			opts += ",noexec"
  1509  		}
  1510  		fmt.Fprintf(buf, "%s ", opts)
  1511  
  1512  		// (7) Optional fields: zero or more fields of the form "tag[:value]".
  1513  		fmt.Fprintf(buf, "%s", vfs.generateOptionalTags(ctx, mnt, taskRootDir))
  1514  		// (8) Separator: the end of the optional fields is marked by a single hyphen.
  1515  		fmt.Fprintf(buf, "- ")
  1516  
  1517  		// (9) Filesystem type.
  1518  		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
  1519  
  1520  		// (10) Mount source: filesystem-specific information or "none".
  1521  		fmt.Fprintf(buf, "none ")
  1522  
  1523  		// (11) Superblock options, and final newline.
  1524  		fmt.Fprintf(buf, "%s\n", superBlockOpts(pathFromRoot, mnt))
  1525  	}
  1526  }
  1527  
  1528  // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
  1529  // See Linux fs/seq_file.c:mangle_path.
  1530  func manglePath(p string) string {
  1531  	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
  1532  	return r.Replace(p)
  1533  }
  1534  
  1535  // superBlockOpts returns the super block options string for the mount at
  1536  // the given path.
  1537  func superBlockOpts(mountPath string, mnt *Mount) string {
  1538  	// Compose super block options by combining global mount flags with
  1539  	// FS-specific mount options.
  1540  	opts := "rw"
  1541  	if mnt.ReadOnly() {
  1542  		opts = "ro"
  1543  	}
  1544  
  1545  	if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
  1546  		opts += "," + mopts
  1547  	}
  1548  
  1549  	// NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we also
  1550  	// need to include the cgroup name in the options. For now we just read that
  1551  	// from the path. Note that this is only possible when "cgroup" isn't
  1552  	// registered as a valid filesystem type.
  1553  	//
  1554  	// TODO(gvisor.dev/issue/190): Once we removed fake cgroupfs support, we
  1555  	// should remove this.
  1556  	if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount {
  1557  		// Real cgroupfs available.
  1558  		return opts
  1559  	}
  1560  	if mnt.fs.FilesystemType().Name() == "cgroup" {
  1561  		splitPath := strings.Split(mountPath, "/")
  1562  		cgroupType := splitPath[len(splitPath)-1]
  1563  		opts += "," + cgroupType
  1564  	}
  1565  
  1566  	return opts
  1567  }
  1568  
// generateOptionalTags returns the optional-fields portion of a
// /proc/[pid]/mountinfo entry for mnt: "shared:N" if the mount is shared, and
// "master:N" (plus possibly "propagate_from:N") if it is a follower. Each tag
// is followed by a trailing space.
func (vfs *VirtualFilesystem) generateOptionalTags(ctx context.Context, mnt *Mount, root VirtualDentry) string {
	vfs.lockMounts()
	defer vfs.unlockMounts(ctx)
	// TODO(b/249777195): Support MS_UNBINDABLE propagation type.
	var optionalSb strings.Builder
	if mnt.isShared {
		optionalSb.WriteString(fmt.Sprintf("shared:%d ", mnt.groupID))
	}
	if mnt.isFollower() {
		// Per man mount_namespaces(7), propagate_from should not be
		// included in optional tags if the leader "is the immediate leader of the
		// mount, or if there is no dominant peer group under the same root". A
		// dominant peer group is the nearest reachable mount in the leader/follower
		// chain.
		optionalSb.WriteString(fmt.Sprintf("master:%d ", mnt.leader.groupID))
		// Walk up the leader chain looking for the nearest peer group with a
		// member reachable from root (the dominant peer group).
		var dominant *Mount
		for m := mnt.leader; m != nil; m = m.leader {
			if dominant = vfs.peerUnderRoot(ctx, m, mnt.ns, root); dominant != nil {
				break
			}
		}
		// Only emit propagate_from when the dominant group differs from the
		// immediate leader, per the man-page rule above.
		if dominant != nil && dominant != mnt.leader {
			optionalSb.WriteString(fmt.Sprintf("propagate_from:%d ", dominant.groupID))
		}
	}
	return optionalSb.String()
}