github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/vfs/mount.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"math"
    21  	"sort"
    22  	"strings"
    23  
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/refs"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    30  )
    31  
// MountMax is the maximum number of mounts allowed. In Linux this can be
// configured by the user at /proc/sys/fs/mount-max, but the default is
// 100,000. We set the gVisor limit to 10,000.
//
// The limit is checked against the per-namespace mount count
// (MountNamespace.mounts) by ConnectMountAt and BindAt, which fail with
// ENOSPC when the new mount plus its propagated mounts would exceed it.
const MountMax = 10000
    36  
// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
// (Mount.fs), which applies to path resolution in the context of a particular
// Mount (Mount.key.parent).
//
// Mounts are reference-counted. Unless otherwise specified, all Mount methods
// require that a reference is held.
//
// Mount and Filesystem are distinct types because it's possible for a single
// Filesystem to be mounted at multiple locations and/or in multiple mount
// namespaces.
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
//
// +stateify savable
type Mount struct {
	// vfs, fs, root are immutable. References are held on fs and root.
	// Note that for a disconnected mount, root may be nil.
	//
	// Invariant: if not nil, root belongs to fs.
	vfs  *VirtualFilesystem
	fs   *Filesystem
	root *Dentry

	// ID is the immutable mount ID. It is assigned from
	// VirtualFilesystem.lastMountID when the Mount is created.
	ID uint64

	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
	// for MS_RDONLY which is tracked in "writers". Immutable.
	Flags MountFlags

	// key is protected by VirtualFilesystem.mountMu and
	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
	// key.parent and key.point if they are not nil.
	//
	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
	// key.parent.fs.
	key mountKey `state:".(VirtualDentry)"`

	// ns is the namespace in which this Mount was mounted. ns is protected by
	// VirtualFilesystem.mountMu.
	ns *MountNamespace

	// The lower 63 bits of refs are a reference count. The MSB of refs is set
	// if the Mount has been eagerly umounted, as by umount(2) without the
	// MNT_DETACH flag. refs is accessed using atomic memory operations.
	refs atomicbitops.Int64

	// children is the set of all Mounts for which Mount.key.parent is this
	// Mount. children is protected by VirtualFilesystem.mountMu.
	children map[*Mount]struct{}

	// propType is the propagation type of this mount. It can be shared or
	// private.
	propType PropagationType

	// sharedList is a list of mounts in the shared peer group. It is nil if
	// propType is not Shared. All mounts in a shared peer group hold the same
	// sharedList. The mounts in sharedList do not need an extra reference
	// taken because it would be redundant with the one taken for being
	// attached to a parent mount. A mount is in a shared list if and only if
	// it is attached and has the shared propagation type.
	sharedList *sharedList
	// sharedEntry is this mount's linkage within sharedList; it is what
	// sharedMapper.linkerFor returns and what peer iteration advances through.
	sharedEntry sharedEntry

	// groupID is the ID for this mount's shared peer group. If the mount is not
	// in a peer group, this is 0.
	groupID uint32

	// umounted is true if VFS.umountRecursiveLocked() has been called on this
	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
	umounted bool

	// The lower 63 bits of writers is the number of calls to
	// Mount.CheckBeginWrite() that have not yet been paired with a call to
	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
	// writers is accessed using atomic memory operations.
	writers atomicbitops.Int64
}
   118  
   119  type sharedMapper struct{}
   120  
   121  func (sharedMapper) linkerFor(mnt *Mount) *sharedEntry { return &mnt.sharedEntry }
   122  
   123  func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
   124  	mnt := &Mount{
   125  		ID:       vfs.lastMountID.Add(1),
   126  		Flags:    opts.Flags,
   127  		vfs:      vfs,
   128  		fs:       fs,
   129  		root:     root,
   130  		ns:       mntns,
   131  		propType: Private,
   132  		refs:     atomicbitops.FromInt64(1),
   133  	}
   134  	if opts.ReadOnly {
   135  		mnt.setReadOnlyLocked(true)
   136  	}
   137  	refs.Register(mnt)
   138  	return mnt
   139  }
   140  
   141  // Options returns a copy of the MountOptions currently applicable to mnt.
   142  func (mnt *Mount) Options() MountOptions {
   143  	mnt.vfs.mountMu.Lock()
   144  	defer mnt.vfs.mountMu.Unlock()
   145  	return MountOptions{
   146  		Flags:    mnt.Flags,
   147  		ReadOnly: mnt.ReadOnly(),
   148  	}
   149  }
   150  
   151  func (mnt *Mount) generateOptionalTags() string {
   152  	mnt.vfs.mountMu.Lock()
   153  	defer mnt.vfs.mountMu.Unlock()
   154  	// TODO(b/249777195): Support MS_SLAVE and MS_UNBINDABLE propagation types.
   155  	var optional string
   156  	if mnt.propType == Shared {
   157  		optional = fmt.Sprintf("shared:%d", mnt.groupID)
   158  	}
   159  	return optional
   160  }
   161  
// A MountNamespace is a collection of Mounts.
//
// MountNamespaces are reference-counted. Unless otherwise specified, all
// MountNamespace methods require that a reference is held.
//
// MountNamespace is analogous to Linux's struct mnt_namespace.
//
// +stateify savable
type MountNamespace struct {
	// MountNamespaceRefs provides the namespace's reference count.
	MountNamespaceRefs

	// Owner is the usernamespace that owns this mount namespace.
	Owner *auth.UserNamespace

	// root is the MountNamespace's root mount.
	root *Mount

	// mountpoints maps all Dentries which are mount points in this namespace
	// to the number of Mounts for which they are mount points. mountpoints is
	// protected by VirtualFilesystem.mountMu.
	//
	// mountpoints is used to determine if a Dentry can be moved or removed
	// (which requires that the Dentry is not a mount point in the calling
	// namespace).
	//
	// mountpoints is maintained even if there are no references held on the
	// MountNamespace; this is required to ensure that
	// VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
	// correctly on unreferenced MountNamespaces.
	mountpoints map[*Dentry]uint32

	// mounts is the total number of mounts in this mount namespace. It is
	// incremented by VFS.connectLocked() and decremented by
	// VFS.disconnectLocked(), and is compared against MountMax when new
	// mounts are connected.
	mounts uint32
}
   195  
   196  // NewMountNamespace returns a new mount namespace with a root filesystem
   197  // configured by the given arguments. A reference is taken on the returned
   198  // MountNamespace.
   199  func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) {
   200  	rft := vfs.getFilesystemType(fsTypeName)
   201  	if rft == nil {
   202  		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
   203  		return nil, linuxerr.ENODEV
   204  	}
   205  	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
   206  	if err != nil {
   207  		return nil, err
   208  	}
   209  	return vfs.NewMountNamespaceFrom(ctx, creds, fs, root, opts), nil
   210  }
   211  
   212  // NewMountNamespaceFrom constructs a new mount namespace from an existing
   213  // filesystem and its root dentry. This is similar to NewMountNamespace, but
   214  // uses an existing filesystem instead of constructing a new one.
   215  func (vfs *VirtualFilesystem) NewMountNamespaceFrom(ctx context.Context, creds *auth.Credentials, fs *Filesystem, root *Dentry, opts *MountOptions) *MountNamespace {
   216  	mntns := &MountNamespace{
   217  		Owner:       creds.UserNamespace,
   218  		mountpoints: make(map[*Dentry]uint32),
   219  	}
   220  	mntns.InitRefs()
   221  	mntns.root = newMount(vfs, fs, root, mntns, opts)
   222  	return mntns
   223  }
   224  
   225  // NewFilesystem creates a new filesystem object not yet associated with any
   226  // mounts. It can be installed into the filesystem tree with ConnectMountAt.
   227  // Note that only the filesystem-specific mount options from opts are used by
   228  // this function, mount flags are ignored. To set mount flags, pass them to a
   229  // corresponding ConnectMountAt.
   230  func (vfs *VirtualFilesystem) NewFilesystem(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*Filesystem, *Dentry, error) {
   231  	rft := vfs.getFilesystemType(fsTypeName)
   232  	if rft == nil {
   233  		return nil, nil, linuxerr.ENODEV
   234  	}
   235  	if !opts.InternalMount && !rft.opts.AllowUserMount {
   236  		return nil, nil, linuxerr.ENODEV
   237  	}
   238  	return rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
   239  }
   240  
   241  // NewDisconnectedMount returns a Mount representing fs with the given root
   242  // (which may be nil). The new Mount is not associated with any MountNamespace
   243  // and is not connected to any other Mounts. References are taken on fs and
   244  // root.
   245  func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) *Mount {
   246  	fs.IncRef()
   247  	if root != nil {
   248  		root.IncRef()
   249  	}
   250  	return newMount(vfs, fs, root, nil /* mntns */, opts)
   251  }
   252  
   253  // MountDisconnected creates a Filesystem configured by the given arguments,
   254  // then returns a Mount representing it. The new Mount is not associated with
   255  // any MountNamespace and is not connected to any other Mounts.
   256  func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
   257  	fs, root, err := vfs.NewFilesystem(ctx, creds, source, fsTypeName, opts)
   258  	if err != nil {
   259  		return nil, err
   260  	}
   261  	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
   262  }
   263  
// ConnectMountAt connects mnt at the path represented by target.
//
// The target path is resolved before vfs.mountMu is taken. If connecting mnt
// together with all of its propagated mounts would push the target's mount
// namespace past MountMax, nothing is connected and ENOSPC is returned.
// Errors from path resolution and from connectMountAtLocked are returned
// unchanged; in every failure case the prepared propagation tree is aborted.
//
// Preconditions: mnt must be disconnected.
func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
	// lock ordering.
	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
	if err != nil {
		return err
	}
	vfs.mountMu.Lock()
	tree := vfs.preparePropagationTree(mnt, vd)
	// Check if the new mount + all the propagation mounts puts us over the max.
	if uint32(len(tree)+1)+vd.mount.ns.mounts > MountMax {
		// We need to unlock mountMu first because DecRef takes a lock on the
		// filesystem mutex in some implementations, which can lead to circular
		// locking.
		vfs.abortPropagationTree(ctx, tree)
		vfs.mountMu.Unlock()
		vd.DecRef(ctx)
		return linuxerr.ENOSPC
	}
	// connectMountAtLocked consumes our reference on vd. Whatever it hands
	// back must be released outside vfs.mountMu, hence the deferred loop: it
	// runs at function return, after the explicit Unlock calls below.
	vdsToDecRef, err := vfs.connectMountAtLocked(ctx, mnt, vd)
	defer func() {
		for _, vd := range vdsToDecRef {
			vd.DecRef(ctx)
		}
	}()
	if err != nil {
		vfs.abortPropagationTree(ctx, tree)
		vfs.mountMu.Unlock()
		return err
	}
	vfs.commitPropagationTree(ctx, tree)
	vfs.mountMu.Unlock()
	return nil
}
   301  
// connectMountAtLocked attaches mnt at vd. This method consumes a reference on
// vd and returns a list of VirtualDentry with an extra reference that must be
// DecRef'd outside of vfs.mountMu.
//
// If vd has been mounted over, mnt is attached on top of the topmost mount
// stacked at vd rather than at vd itself. ENOENT is returned if the mount
// being attached to has been umounted or the mount point dentry is dead; in
// that case the returned list still has to be released by the caller.
//
// Preconditions:
//   - mnt must be disconnected.
//   - vfs.mountMu must be locked.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) connectMountAtLocked(ctx context.Context, mnt *Mount, vd VirtualDentry) ([]VirtualDentry, error) {
	var vdsToDecRef []VirtualDentry
	// vd.dentry.mu is held across the checks below and across connectLocked;
	// it is handed over dentry-by-dentry as we walk up the mount stack.
	vd.dentry.mu.Lock()
	for {
		if vd.mount.umounted || vd.dentry.dead {
			vd.dentry.mu.Unlock()
			vdsToDecRef = append(vdsToDecRef, vd)
			return vdsToDecRef, linuxerr.ENOENT
		}
		// vd might have been mounted over between vfs.GetDentryAt() and
		// vfs.mountMu.Lock().
		if !vd.dentry.isMounted() {
			break
		}
		nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
		if nextmnt == nil {
			break
		}
		// It's possible that nextmnt has been umounted but not disconnected,
		// in which case vfs no longer holds a reference on it, and the last
		// reference may be concurrently dropped even though we're holding
		// vfs.mountMu.
		if !nextmnt.tryIncMountedRef() {
			break
		}
		// This can't fail since we're holding vfs.mountMu.
		nextmnt.root.IncRef()
		vd.dentry.mu.Unlock()
		// The reference on the mount point we are stepping past is released
		// later by the caller, outside vfs.mountMu.
		vdsToDecRef = append(vdsToDecRef, vd)
		vd = VirtualDentry{
			mount:  nextmnt,
			dentry: nextmnt.root,
		}
		vd.dentry.mu.Lock()
	}
	// TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
	// point and the mount root are directories, or neither are, and returns
	// ENOTDIR if this is not the case.
	mntns := vd.mount.ns
	// connectLocked requires a vfs.mounts.seq writer critical section.
	vfs.mounts.seq.BeginWrite()
	vfs.connectLocked(mnt, vd, mntns)
	vfs.mounts.seq.EndWrite()
	vd.dentry.mu.Unlock()
	return vdsToDecRef, nil
}
   356  
   357  // CloneMountAt returns a new mount with the same fs, specified root and
   358  // mount options. If mnt's propagation type is shared the new mount is
   359  // automatically made a peer of mnt. If mount options are nil, mnt's
   360  // options are copied.
   361  func (vfs *VirtualFilesystem) CloneMountAt(mnt *Mount, root *Dentry, mopts *MountOptions) *Mount {
   362  	vfs.mountMu.Lock()
   363  	defer vfs.mountMu.Unlock()
   364  	clone := vfs.cloneMount(mnt, root, mopts)
   365  	vfs.addPeer(mnt, clone)
   366  	return clone
   367  }
   368  
   369  // cloneMount returns a new mount with mnt.fs as the filesystem and root as the
   370  // root. The returned mount has an extra reference.
   371  //
   372  // +checklocks:vfs.mountMu
   373  // +checklocksalias:mnt.vfs.mountMu=vfs.mountMu
   374  func (vfs *VirtualFilesystem) cloneMount(mnt *Mount, root *Dentry, mopts *MountOptions) *Mount {
   375  	opts := mopts
   376  	if opts == nil {
   377  		opts = &MountOptions{
   378  			Flags:    mnt.Flags,
   379  			ReadOnly: mnt.ReadOnly(),
   380  		}
   381  	}
   382  	return vfs.NewDisconnectedMount(mnt.fs, root, opts)
   383  }
   384  
// BindAt creates a clone of the source path's parent mount and mounts it at
// the target path. The new mount's root dentry is one pointed to by the source
// path.
//
// If the source mount is Shared, the clone joins (or is merged into) the
// source's peer group. If connecting the clone together with its propagated
// mounts would push the target's mount namespace past MountMax, ENOSPC is
// returned. On any failure after cloning, the clone is reset to Private and
// the prepared propagation tree is aborted.
//
// The creation reference on the clone is dropped before returning, so on
// success the returned Mount is kept alive by the reference taken when it was
// connected.
//
// TODO(b/249121230): Support recursive bind mounting.
func (vfs *VirtualFilesystem) BindAt(ctx context.Context, creds *auth.Credentials, source, target *PathOperation) (*Mount, error) {
	// Both paths are resolved before vfs.mountMu is taken.
	sourceVd, err := vfs.GetDentryAt(ctx, creds, source, &GetDentryOptions{})
	if err != nil {
		return nil, err
	}
	defer sourceVd.DecRef(ctx)
	targetVd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
	if err != nil {
		return nil, err
	}

	vfs.mountMu.Lock()
	clone := vfs.cloneMount(sourceVd.mount, sourceVd.dentry, nil)
	// Deferred, so the clone's creation reference is dropped at function
	// return, after the explicit mountMu.Unlock calls below.
	defer clone.DecRef(ctx)
	tree := vfs.preparePropagationTree(clone, targetVd)
	if sourceVd.mount.propType == Shared {
		if clone.propType == Private {
			vfs.addPeer(sourceVd.mount, clone)
		} else {
			vfs.mergePeerGroup(sourceVd.mount, clone)
		}
	}
	// Check if the clone + all the propagation mounts puts us over the max.
	if uint32(1+len(tree))+targetVd.mount.ns.mounts > MountMax {
		vfs.setPropagation(clone, Private)
		vfs.abortPropagationTree(ctx, tree)
		vfs.mountMu.Unlock()
		// targetVd is released only after mountMu has been unlocked.
		targetVd.DecRef(ctx)
		return nil, linuxerr.ENOSPC
	}

	// connectMountAtLocked consumes the reference on targetVd; whatever it
	// returns is released by the deferred loop once mountMu is unlocked.
	vdsToDecRef, err := vfs.connectMountAtLocked(ctx, clone, targetVd)
	defer func() {
		for _, vd := range vdsToDecRef {
			vd.DecRef(ctx)
		}
	}()
	if err != nil {
		vfs.setPropagation(clone, Private)
		vfs.abortPropagationTree(ctx, tree)
		vfs.mountMu.Unlock()
		return nil, err
	}
	vfs.commitPropagationTree(ctx, tree)
	vfs.mountMu.Unlock()
	return clone, nil
}
   436  
   437  // MountAt creates and mounts a Filesystem configured by the given arguments.
   438  // The VirtualFilesystem will hold a reference to the Mount until it is
   439  // unmounted.
   440  //
   441  // This method returns the mounted Mount without a reference, for convenience
   442  // during VFS setup when there is no chance of racing with unmount.
   443  func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) {
   444  	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
   445  	if err != nil {
   446  		return nil, err
   447  	}
   448  	defer mnt.DecRef(ctx)
   449  	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
   450  		return nil, err
   451  	}
   452  	return mnt, nil
   453  }
   454  
   455  // UmountAt removes the Mount at the given path.
   456  func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
   457  	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
   458  		return linuxerr.EINVAL
   459  	}
   460  
   461  	// MNT_FORCE is currently unimplemented except for the permission check.
   462  	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
   463  	// namespace, and not in the owner user namespace for the target mount. See
   464  	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
   465  	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
   466  		return linuxerr.EPERM
   467  	}
   468  
   469  	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
   470  	if err != nil {
   471  		return err
   472  	}
   473  	// This defer statement is encapsulated in a function because vd.mount can be
   474  	// modified in the block below. The arguments to defer are evaluated during
   475  	// the construction of a defer statement, so if vd.DecRef() was not
   476  	// encapsulated, the vd structure and its underlying pointers _at this point_
   477  	// would be copied and DecRefd at the end of this function.
   478  	defer func() {
   479  		vd.DecRef(ctx)
   480  	}()
   481  	// Linux passes the LOOKUP_MOUNPOINT flag to user_path_at in ksys_umount to
   482  	// resolve to the toppmost mount in the stack located at the specified path.
   483  	// vfs.GetMountAt() imitiates this behavior. See fs/namei.c:user_path_at(...)
   484  	// and fs/namespace.c:ksys_umount(...).
   485  	if vd.dentry.isMounted() {
   486  		if realmnt := vfs.getMountAt(ctx, vd.mount, vd.dentry); realmnt != nil {
   487  			vd.mount.DecRef(ctx)
   488  			vd.mount = realmnt
   489  		}
   490  	} else if vd.dentry != vd.mount.root {
   491  		return linuxerr.EINVAL
   492  	}
   493  
   494  	vfs.mountMu.Lock()
   495  	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
   496  		defer mntns.DecRef(ctx)
   497  		if mntns != vd.mount.ns {
   498  			vfs.mountMu.Unlock()
   499  			return linuxerr.EINVAL
   500  		}
   501  
   502  		if vd.mount == vd.mount.ns.root {
   503  			vfs.mountMu.Unlock()
   504  			return linuxerr.EINVAL
   505  		}
   506  	}
   507  
   508  	umountTree := []*Mount{vd.mount}
   509  	parent, mountpoint := vd.mount.parent(), vd.mount.point()
   510  	if parent != nil && parent.propType == Shared {
   511  		for peer := parent.sharedList.Front(); peer != nil; peer = peer.sharedEntry.Next() {
   512  			if peer == parent {
   513  				continue
   514  			}
   515  			umountMnt := vfs.mounts.Lookup(peer, mountpoint)
   516  			// From https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt:
   517  			// If any peer has some child mounts, then that mount is not unmounted,
   518  			// but all other mounts are unmounted.
   519  			if umountMnt != nil && len(umountMnt.children) == 0 {
   520  				umountTree = append(umountTree, umountMnt)
   521  			}
   522  		}
   523  	}
   524  
   525  	// TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
   526  	// root, which we don't implement yet (we'll just fail it since the caller
   527  	// holds a reference on it).
   528  
   529  	vfs.mounts.seq.BeginWrite()
   530  	if opts.Flags&linux.MNT_DETACH == 0 {
   531  		if len(vd.mount.children) != 0 {
   532  			vfs.mounts.seq.EndWrite()
   533  			vfs.mountMu.Unlock()
   534  			return linuxerr.EBUSY
   535  		}
   536  		// We are holding a reference on vd.mount.
   537  		expectedRefs := int64(1)
   538  		if !vd.mount.umounted {
   539  			expectedRefs = 2
   540  		}
   541  		if vd.mount.refs.Load()&^math.MinInt64 != expectedRefs { // mask out MSB
   542  			vfs.mounts.seq.EndWrite()
   543  			vfs.mountMu.Unlock()
   544  			return linuxerr.EBUSY
   545  		}
   546  	}
   547  	var (
   548  		vdsToDecRef    []VirtualDentry
   549  		mountsToDecRef []*Mount
   550  	)
   551  	for _, mnt := range umountTree {
   552  		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{
   553  			eager:               opts.Flags&linux.MNT_DETACH == 0,
   554  			disconnectHierarchy: true,
   555  		}, vdsToDecRef, mountsToDecRef)
   556  	}
   557  	vfs.mounts.seq.EndWrite()
   558  	vfs.mountMu.Unlock()
   559  	for _, vd := range vdsToDecRef {
   560  		vd.DecRef(ctx)
   561  	}
   562  	for _, m := range mountsToDecRef {
   563  		m.DecRef(ctx)
   564  	}
   565  	return nil
   566  }
   567  
// umountRecursiveOptions controls the behavior of
// VirtualFilesystem.umountRecursiveLocked.
//
// +stateify savable
type umountRecursiveOptions struct {
	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
	// on umounted mounts fail.
	//
	// eager is analogous to Linux's UMOUNT_SYNC.
	eager bool

	// If disconnectHierarchy is true, Mounts that are umounted hierarchically
	// should be disconnected from their parents. (Mounts whose parents are not
	// umounted, which in most cases means the Mount passed to the initial call
	// to umountRecursiveLocked, are unconditionally disconnected for
	// consistency with Linux.)
	//
	// disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
	disconnectHierarchy bool
}
   585  
   586  // umountRecursiveLocked marks mnt and its descendants as umounted. It does not
   587  // release mount or dentry references; instead, it appends VirtualDentries and
   588  // Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef
   589  // respectively, and returns updated slices. (This is necessary because
   590  // filesystem locks possibly taken by DentryImpl.DecRef() may precede
   591  // vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.)
   592  //
   593  // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
   594  //
   595  // Preconditions:
   596  //   - vfs.mountMu must be locked.
   597  //   - vfs.mounts.seq must be in a writer critical section.
   598  //
   599  // +checklocks:vfs.mountMu
   600  func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
   601  	if !mnt.umounted {
   602  		mnt.umounted = true
   603  		mountsToDecRef = append(mountsToDecRef, mnt)
   604  		if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) {
   605  			vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt))
   606  		}
   607  		if mnt.propType == Shared {
   608  			vfs.setPropagation(mnt, Private)
   609  		}
   610  	}
   611  	if opts.eager {
   612  		for {
   613  			refs := mnt.refs.Load()
   614  			if refs < 0 {
   615  				break
   616  			}
   617  			if mnt.refs.CompareAndSwap(refs, refs|math.MinInt64) {
   618  				break
   619  			}
   620  		}
   621  	}
   622  	for child := range mnt.children {
   623  		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef)
   624  	}
   625  	return vdsToDecRef, mountsToDecRef
   626  }
   627  
   628  // connectLocked makes vd the mount parent/point for mnt. It consumes
   629  // references held by vd.
   630  //
   631  // Preconditions:
   632  //   - vfs.mountMu must be locked.
   633  //   - vfs.mounts.seq must be in a writer critical section.
   634  //   - d.mu must be locked.
   635  //   - mnt.parent() == nil, i.e. mnt must not already be connected.
   636  func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
   637  	if checkInvariants {
   638  		if mnt.parent() != nil {
   639  			panic("VFS.connectLocked called on connected mount")
   640  		}
   641  	}
   642  	mnt.IncRef() // dropped by callers of umountRecursiveLocked
   643  	mnt.setKey(vd)
   644  	if vd.mount.children == nil {
   645  		vd.mount.children = make(map[*Mount]struct{})
   646  	}
   647  	vd.mount.children[mnt] = struct{}{}
   648  	vd.dentry.mounts.Add(1)
   649  	mnt.ns = mntns
   650  	mntns.mountpoints[vd.dentry]++
   651  	mntns.mounts++
   652  	vfs.mounts.insertSeqed(mnt)
   653  	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
   654  	if !ok {
   655  		vfsmpmounts = make(map[*Mount]struct{})
   656  		vfs.mountpoints[vd.dentry] = vfsmpmounts
   657  	}
   658  	vfsmpmounts[mnt] = struct{}{}
   659  	vfs.maybeResolveMountPromise(vd)
   660  }
   661  
// disconnectLocked makes mnt have no mount parent/point and returns its old
// mount parent/point with a reference held.
//
// It undoes the bookkeeping done by connectLocked: mnt is removed from its
// parent's children, from its namespace's counters, from the mount table, and
// from the VFS-wide mountpoints index. mnt.ns itself is left unchanged.
//
// Preconditions:
//   - vfs.mountMu must be locked.
//   - vfs.mounts.seq must be in a writer critical section.
//   - mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
	vd := mnt.getKey()
	if checkInvariants {
		if vd.mount == nil {
			panic("VFS.disconnectLocked called on disconnected mount")
		}
		if mnt.ns.mountpoints[vd.dentry] == 0 {
			panic("VFS.disconnectLocked called on dentry with zero mountpoints.")
		}
		if mnt.ns.mounts == 0 {
			panic("VFS.disconnectLocked called on namespace with zero mounts.")
		}
	}
	delete(vd.mount.children, mnt)
	vd.dentry.mounts.Add(math.MaxUint32) // -1
	mnt.ns.mountpoints[vd.dentry]--
	mnt.ns.mounts--
	// Drop the map entry entirely once the dentry is no longer a mount point
	// in this namespace.
	if mnt.ns.mountpoints[vd.dentry] == 0 {
		delete(mnt.ns.mountpoints, vd.dentry)
	}
	vfs.mounts.removeSeqed(mnt)
	mnt.loadKey(VirtualDentry{}) // Clear mnt.key.
	vfsmpmounts := vfs.mountpoints[vd.dentry]
	delete(vfsmpmounts, mnt)
	if len(vfsmpmounts) == 0 {
		delete(vfs.mountpoints, vd.dentry)
	}
	return vd
}
   698  
   699  // tryIncMountedRef increments mnt's reference count and returns true. If mnt's
   700  // reference count is already zero, or has been eagerly umounted,
   701  // tryIncMountedRef does nothing and returns false.
   702  //
   703  // tryIncMountedRef does not require that a reference is held on mnt.
   704  func (mnt *Mount) tryIncMountedRef() bool {
   705  	for {
   706  		r := mnt.refs.Load()
   707  		if r <= 0 { // r < 0 => MSB set => eagerly unmounted
   708  			return false
   709  		}
   710  		if mnt.refs.CompareAndSwap(r, r+1) {
   711  			if mnt.LogRefs() {
   712  				refs.LogTryIncRef(mnt, r+1)
   713  			}
   714  			return true
   715  		}
   716  	}
   717  }
   718  
   719  // IncRef increments mnt's reference count.
   720  func (mnt *Mount) IncRef() {
   721  	// In general, negative values for mnt.refs are valid because the MSB is
   722  	// the eager-unmount bit.
   723  	r := mnt.refs.Add(1)
   724  	if mnt.LogRefs() {
   725  		refs.LogIncRef(mnt, r)
   726  	}
   727  }
   728  
   729  // DecRef decrements mnt's reference count.
   730  func (mnt *Mount) DecRef(ctx context.Context) {
   731  	r := mnt.refs.Add(-1)
   732  	if mnt.LogRefs() {
   733  		refs.LogDecRef(mnt, r)
   734  	}
   735  	if r&^math.MinInt64 == 0 { // mask out MSB
   736  		refs.Unregister(mnt)
   737  		mnt.destroy(ctx)
   738  	}
   739  }
   740  
   741  func (mnt *Mount) destroy(ctx context.Context) {
   742  	var vd VirtualDentry
   743  	if mnt.parent() != nil {
   744  		mnt.vfs.mountMu.Lock()
   745  		mnt.vfs.mounts.seq.BeginWrite()
   746  		vd = mnt.vfs.disconnectLocked(mnt)
   747  		mnt.vfs.mounts.seq.EndWrite()
   748  		mnt.vfs.mountMu.Unlock()
   749  	}
   750  	if mnt.root != nil {
   751  		mnt.root.DecRef(ctx)
   752  	}
   753  	mnt.fs.DecRef(ctx)
   754  	if vd.Ok() {
   755  		vd.DecRef(ctx)
   756  	}
   757  }
   758  
   759  // RefType implements refs.CheckedObject.Type.
   760  func (mnt *Mount) RefType() string {
   761  	return "vfs.Mount"
   762  }
   763  
   764  // LeakMessage implements refs.CheckedObject.LeakMessage.
   765  func (mnt *Mount) LeakMessage() string {
   766  	return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, mnt.refs.Load())
   767  }
   768  
// LogRefs implements refs.CheckedObject.LogRefs.
//
// This should only be set to true for debugging purposes, as it can generate an
// extremely large amount of output and drastically degrade performance.
//
// It currently always returns false, so the logging branches in IncRef,
// DecRef and tryIncMountedRef are never taken.
func (mnt *Mount) LogRefs() bool {
	return false
}
   776  
   777  // DecRef decrements mntns' reference count.
   778  func (mntns *MountNamespace) DecRef(ctx context.Context) {
   779  	vfs := mntns.root.fs.VirtualFilesystem()
   780  	mntns.MountNamespaceRefs.DecRef(func() {
   781  		vfs.mountMu.Lock()
   782  		vfs.mounts.seq.BeginWrite()
   783  		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
   784  			disconnectHierarchy: true,
   785  		}, nil, nil)
   786  		vfs.mounts.seq.EndWrite()
   787  		vfs.mountMu.Unlock()
   788  		for _, vd := range vdsToDecRef {
   789  			vd.DecRef(ctx)
   790  		}
   791  		for _, mnt := range mountsToDecRef {
   792  			mnt.DecRef(ctx)
   793  		}
   794  	})
   795  }
   796  
// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
// a reference on the returned Mount. If (mnt, d) is not a mount point,
// getMountAt returns nil.
//
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
	// The first mount is special-cased:
	//
	//	- The caller is assumed to have checked d.isMounted() already. (This
	//		isn't a precondition because it doesn't matter for correctness.)
	//
	//	- We return nil, instead of mnt, if there is no mount at (mnt, d).
	//
	//	- We don't drop the caller's references on mnt and d.
retryFirst:
	next := vfs.mounts.Lookup(mnt, d)
	if next == nil {
		return nil
	}
	if !next.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	// From here on, mnt is a Mount whose reference was acquired by this
	// function (not the caller's), so it may be dropped when stepping further
	// up the stack.
	mnt = next
	d = next.root
	// We don't need to take Dentry refs anywhere in this function because
	// Mounts hold references on Mount.root, which is immutable.
	for d.isMounted() {
		next := vfs.mounts.Lookup(mnt, d)
		if next == nil {
			break
		}
		if !next.tryIncMountedRef() {
			// Raced with umount.
			continue
		}
		// Trade the reference on the previous Mount for the one just taken on
		// next, then keep following the stack from next's root.
		mnt.DecRef(ctx)
		mnt = next
		d = next.root
	}
	return mnt
}
   841  
// getMountpointAt returns the mount point for the stack of Mounts including
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns the zero
// VirtualDentry.
//
// Preconditions:
//   - References are held on mnt and vfsroot.
//   - vfsroot is not (mnt, mnt.root).
func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
	// The first mount is special-cased:
	//
	//	- The caller must have already checked mnt against vfsroot.
	//
	//	- We return the zero VirtualDentry, instead of mnt, if there is no
	//		mount point for mnt.
	//
	//	- We don't drop the caller's reference on mnt.
retryFirst:
	// Read mnt's parent and mount point inside a mounts.seq read section, and
	// start over if ReadOk reports that the read was invalidated.
	epoch := vfs.mounts.seq.BeginRead()
	parent, point := mnt.parent(), mnt.point()
	if !vfs.mounts.seq.ReadOk(epoch) {
		goto retryFirst
	}
	if parent == nil {
		return VirtualDentry{}
	}
	if !parent.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	if !point.TryIncRef() {
		// Since Mount holds a reference on Mount.key.point, this can only
		// happen due to a racing change to Mount.key.
		parent.DecRef(ctx)
		goto retryFirst
	}
	// Re-validate after taking both references; if the read section was
	// invalidated in the meantime, release them and retry.
	if !vfs.mounts.seq.ReadOk(epoch) {
		point.DecRef(ctx)
		parent.DecRef(ctx)
		goto retryFirst
	}
	// (mnt, d) now carries references acquired by this function.
	mnt = parent
	d := point
	for {
		// Stop at the caller-supplied root.
		if mnt == vfsroot.mount && d == vfsroot.dentry {
			break
		}
		// Stop once d is no longer the root of mnt; only when it is do we
		// continue to mnt's own mount point.
		if d != mnt.root {
			break
		}
	retryNotFirst:
		// Same read/validate/retry protocol as for the first mount.
		epoch := vfs.mounts.seq.BeginRead()
		parent, point := mnt.parent(), mnt.point()
		if !vfs.mounts.seq.ReadOk(epoch) {
			goto retryNotFirst
		}
		if parent == nil {
			break
		}
		if !parent.tryIncMountedRef() {
			// Raced with umount.
			goto retryNotFirst
		}
		if !point.TryIncRef() {
			// Since Mount holds a reference on Mount.key.point, this can
			// only happen due to a racing change to Mount.key.
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		if !vfs.mounts.seq.ReadOk(epoch) {
			point.DecRef(ctx)
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		// Trade the references held on the current (mnt, d) for the ones just
		// taken on (parent, point).
		d.DecRef(ctx)
		mnt.DecRef(ctx)
		mnt = parent
		d = point
	}
	return VirtualDentry{mnt, d}
}
   921  
// PivotRoot makes location pointed to by newRootPop the root of the current
// namespace, and moves the current root to the location pointed to by
// putOldPop.
//
// It returns any error from resolving newRootPop or putOldPop, EBUSY if
// new_root or put_old is on the same mount as the current root, and EINVAL if
// any of the other checks below fails.
func (vfs *VirtualFilesystem) PivotRoot(ctx context.Context, creds *auth.Credentials, newRootPop *PathOperation, putOldPop *PathOperation) error {
	newRootVd, err := vfs.GetDentryAt(ctx, creds, newRootPop, &GetDentryOptions{CheckSearchable: true})
	if err != nil {
		return err
	}
	defer newRootVd.DecRef(ctx)
	putOldVd, err := vfs.GetDentryAt(ctx, creds, putOldPop, &GetDentryOptions{CheckSearchable: true})
	if err != nil {
		return err
	}
	defer putOldVd.DecRef(ctx)
	rootVd := RootFromContext(ctx)
	defer rootVd.DecRef(ctx)

retry:
	// The checks below run inside a mounts.seq read section; BeginWriteOk
	// further down verifies that epoch is still valid before anything is
	// modified, and the whole sequence is retried otherwise.
	epoch := vfs.mounts.seq.BeginRead()
	// Neither new_root nor put_old can be on the same mount as the current
	// root mount.
	if newRootVd.mount == rootVd.mount || putOldVd.mount == rootVd.mount {
		return linuxerr.EBUSY
	}
	// new_root must be a mountpoint.
	if newRootVd.mount.root != newRootVd.dentry {
		return linuxerr.EINVAL
	}
	// put_old must be at or underneath new_root.
	path, err := vfs.PathnameReachable(ctx, newRootVd, putOldVd)
	if err != nil || len(path) == 0 {
		return linuxerr.EINVAL
	}
	// The current root directory must be a mountpoint
	// (in the case it has been chrooted).
	if rootVd.mount.root != rootVd.dentry {
		return linuxerr.EINVAL
	}
	// The current root and the new root cannot be on the rootfs mount.
	if rootVd.mount.parent() == nil || newRootVd.mount.parent() == nil {
		return linuxerr.EINVAL
	}
	// The current root and the new root must be in the context's mount namespace.
	//
	// Note that this statement is re-executed on every retry, so each pass
	// obtains ns again and registers another deferred DecRef for it.
	ns := MountNamespaceFromContext(ctx)
	defer ns.DecRef(ctx)
	vfs.mountMu.Lock()
	if rootVd.mount.ns != ns || newRootVd.mount.ns != ns {
		vfs.mountMu.Unlock()
		return linuxerr.EINVAL
	}

	// Fail with EINVAL if either the mount point at new_root, or the parent
	// mount of that mount point, has propagation type MS_SHARED.
	if newRootParent := newRootVd.mount.parent(); newRootVd.mount.propType == Shared || newRootParent.propType == Shared {
		vfs.mountMu.Unlock()
		return linuxerr.EINVAL
	}
	// Fail with EINVAL if put_old is a mount point and has the propagation
	// type MS_SHARED.
	if putOldVd.mount.root == putOldVd.dentry && putOldVd.mount.propType == Shared {
		vfs.mountMu.Unlock()
		return linuxerr.EINVAL
	}

	if !vfs.mounts.seq.BeginWriteOk(epoch) {
		// Checks above raced with a mount change.
		vfs.mountMu.Unlock()
		goto retry
	}
	// From here on mountMu stays held until return, and the write section
	// opened by BeginWriteOk is closed by EndWrite below.
	defer vfs.mountMu.Unlock()
	// Detach new_root's mount from its current mount point; the returned
	// mount point is not reused, so it is released immediately.
	mp := vfs.disconnectLocked(newRootVd.mount)
	mp.DecRef(ctx)
	// Detach the current root mount, keeping its old mount point (rootMp) so
	// that new_root's mount can be attached there.
	rootMp := vfs.disconnectLocked(rootVd.mount)

	// Attach the old root mount at put_old. The extra reference taken on
	// putOldVd is presumably the one owned by the new connection (the
	// deferred DecRef above drops this function's own) - verify against
	// connectLocked.
	putOldVd.IncRef()
	putOldVd.dentry.mu.Lock()
	vfs.connectLocked(rootVd.mount, putOldVd, ns)
	putOldVd.dentry.mu.Unlock()

	// Attach new_root's mount where the old root mount used to be.
	rootMp.dentry.mu.Lock()
	vfs.connectLocked(newRootVd.mount, rootMp, ns)
	rootMp.dentry.mu.Unlock()
	vfs.mounts.seq.EndWrite()

	// NOTE(review): these drops are in addition to the deferred
	// newRootVd.DecRef/rootVd.DecRef above; presumably they balance
	// references acquired on the mounts by connectLocked - confirm.
	newRootVd.mount.DecRef(ctx)
	rootVd.mount.DecRef(ctx)
	return nil
}
  1009  
// SetMountReadOnly sets the mount as ReadOnly (ro == true) or writable
// (ro == false). It takes vfs.mountMu for the duration of the change and
// returns whatever mnt.setReadOnlyLocked returns.
func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
	vfs.mountMu.Lock()
	defer vfs.mountMu.Unlock()
	return mnt.setReadOnlyLocked(ro)
}
  1016  
  1017  // CheckBeginWrite increments the counter of in-progress write operations on
  1018  // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
  1019  // EROFS.
  1020  //
  1021  // If CheckBeginWrite succeeds, EndWrite must be called when the write
  1022  // operation is finished.
  1023  func (mnt *Mount) CheckBeginWrite() error {
  1024  	if mnt.writers.Add(1) < 0 {
  1025  		mnt.writers.Add(-1)
  1026  		return linuxerr.EROFS
  1027  	}
  1028  	return nil
  1029  }
  1030  
// EndWrite indicates that a write operation signaled by a previous successful
// call to CheckBeginWrite has finished. It decrements the writer counter that
// CheckBeginWrite incremented.
func (mnt *Mount) EndWrite() {
	mnt.writers.Add(-1)
}
  1036  
  1037  // Preconditions: VirtualFilesystem.mountMu must be locked.
  1038  func (mnt *Mount) setReadOnlyLocked(ro bool) error {
  1039  	if oldRO := mnt.writers.Load() < 0; oldRO == ro {
  1040  		return nil
  1041  	}
  1042  	if ro {
  1043  		if !mnt.writers.CompareAndSwap(0, math.MinInt64) {
  1044  			return linuxerr.EBUSY
  1045  		}
  1046  		return nil
  1047  	}
  1048  	// Unset MSB without dropping any temporary increments from failed calls to
  1049  	// mnt.CheckBeginWrite().
  1050  	mnt.writers.Add(math.MinInt64)
  1051  	return nil
  1052  }
  1053  
// ReadOnly returns true if mount is readonly, i.e. if mnt.writers is currently
// negative (its MSB is set).
func (mnt *Mount) ReadOnly() bool {
	return mnt.writers.Load() < 0
}
  1058  
// Filesystem returns the mounted Filesystem (mnt.fs). It does not take a
// reference on the returned Filesystem.
func (mnt *Mount) Filesystem() *Filesystem {
	return mnt.fs
}
  1064  
  1065  // submountsLocked returns this Mount and all Mounts that are descendents of
  1066  // it.
  1067  //
  1068  // Precondition: mnt.vfs.mountMu must be held.
  1069  func (mnt *Mount) submountsLocked() []*Mount {
  1070  	mounts := []*Mount{mnt}
  1071  	for m := range mnt.children {
  1072  		mounts = append(mounts, m.submountsLocked()...)
  1073  	}
  1074  	return mounts
  1075  }
  1076  
// Root returns the mount's root (mnt.root). It does not take a reference on
// the returned Dentry.
func (mnt *Mount) Root() *Dentry {
	return mnt.root
}
  1082  
  1083  // Root returns mntns' root. It does not take a reference on the returned
  1084  // Dentry.
  1085  func (mntns *MountNamespace) Root() VirtualDentry {
  1086  	vd := VirtualDentry{
  1087  		mount:  mntns.root,
  1088  		dentry: mntns.root.root,
  1089  	}
  1090  	return vd
  1091  }
  1092  
  1093  // GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
  1094  //
  1095  // Preconditions: taskRootDir.Ok().
  1096  func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
  1097  	rootMnt := taskRootDir.mount
  1098  
  1099  	vfs.mountMu.Lock()
  1100  	mounts := rootMnt.submountsLocked()
  1101  	// Take a reference on mounts since we need to drop vfs.mountMu before
  1102  	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()).
  1103  	for _, mnt := range mounts {
  1104  		mnt.IncRef()
  1105  	}
  1106  	vfs.mountMu.Unlock()
  1107  	defer func() {
  1108  		for _, mnt := range mounts {
  1109  			mnt.DecRef(ctx)
  1110  		}
  1111  	}()
  1112  	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
  1113  
  1114  	for _, mnt := range mounts {
  1115  		// Get the path to this mount relative to task root.
  1116  		mntRootVD := VirtualDentry{
  1117  			mount:  mnt,
  1118  			dentry: mnt.root,
  1119  		}
  1120  		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
  1121  		if err != nil {
  1122  			// For some reason we didn't get a path. Log a warning
  1123  			// and run with empty path.
  1124  			ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err)
  1125  			path = ""
  1126  		}
  1127  		if path == "" {
  1128  			// Either an error occurred, or path is not reachable
  1129  			// from root.
  1130  			break
  1131  		}
  1132  
  1133  		opts := "rw"
  1134  		if mnt.ReadOnly() {
  1135  			opts = "ro"
  1136  		}
  1137  		if mnt.Flags.NoATime {
  1138  			opts = ",noatime"
  1139  		}
  1140  		if mnt.Flags.NoExec {
  1141  			opts += ",noexec"
  1142  		}
  1143  		if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
  1144  			opts += "," + mopts
  1145  		}
  1146  
  1147  		// Format:
  1148  		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
  1149  		//
  1150  		// The "needs dump" and "fsck order" flags are always 0, which
  1151  		// is allowed.
  1152  		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
  1153  	}
  1154  }
  1155  
  1156  // GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
  1157  // buf.
  1158  //
  1159  // Preconditions: taskRootDir.Ok().
  1160  func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
  1161  	rootMnt := taskRootDir.mount
  1162  
  1163  	vfs.mountMu.Lock()
  1164  	mounts := rootMnt.submountsLocked()
  1165  	// Take a reference on mounts since we need to drop vfs.mountMu before
  1166  	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or
  1167  	// vfs.StatAt() (=> FilesystemImpl.StatAt()).
  1168  	for _, mnt := range mounts {
  1169  		mnt.IncRef()
  1170  	}
  1171  	vfs.mountMu.Unlock()
  1172  	defer func() {
  1173  		for _, mnt := range mounts {
  1174  			mnt.DecRef(ctx)
  1175  		}
  1176  	}()
  1177  	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
  1178  
  1179  	creds := auth.CredentialsFromContext(ctx)
  1180  	for _, mnt := range mounts {
  1181  		// Get the path to this mount relative to task root.
  1182  		mntRootVD := VirtualDentry{
  1183  			mount:  mnt,
  1184  			dentry: mnt.root,
  1185  		}
  1186  		pathFromRoot, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
  1187  		if err != nil {
  1188  			// For some reason we didn't get a path. Log a warning
  1189  			// and run with empty path.
  1190  			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
  1191  			continue
  1192  		}
  1193  		if pathFromRoot == "" {
  1194  			// The path is not reachable from root.
  1195  			continue
  1196  		}
  1197  		var pathFromFS string
  1198  		pathFromFS, err = vfs.PathnameInFilesystem(ctx, mntRootVD)
  1199  		if err != nil {
  1200  			// For some reason we didn't get a path. Log a warning
  1201  			// and run with empty path.
  1202  			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
  1203  			continue
  1204  		}
  1205  		if pathFromFS == "" {
  1206  			// The path is not reachable from root.
  1207  			continue
  1208  		}
  1209  		// Stat the mount root to get the major/minor device numbers.
  1210  		pop := &PathOperation{
  1211  			Root:  mntRootVD,
  1212  			Start: mntRootVD,
  1213  		}
  1214  		statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{})
  1215  		if err != nil {
  1216  			// Well that's not good. Ignore this mount.
  1217  			ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err)
  1218  			continue
  1219  		}
  1220  
  1221  		// Format:
  1222  		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
  1223  		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
  1224  
  1225  		// (1) Mount ID.
  1226  		fmt.Fprintf(buf, "%d ", mnt.ID)
  1227  
  1228  		// (2)  Parent ID (or this ID if there is no parent).
  1229  		// Note that even if the call to mnt.parent() races with Mount
  1230  		// destruction (which is possible since we're not holding vfs.mountMu),
  1231  		// its Mount.ID will still be valid.
  1232  		pID := mnt.ID
  1233  		if p := mnt.parent(); p != nil {
  1234  			pID = p.ID
  1235  		}
  1236  		fmt.Fprintf(buf, "%d ", pID)
  1237  
  1238  		// (3) Major:Minor device ID. We don't have a superblock, so we
  1239  		// just use the root inode device number.
  1240  		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
  1241  
  1242  		// (4) Root: the pathname of the directory in the filesystem
  1243  		// which forms the root of this mount.
  1244  		fmt.Fprintf(buf, "%s ", manglePath(pathFromFS))
  1245  
  1246  		// (5) Mount point (relative to process root).
  1247  		fmt.Fprintf(buf, "%s ", manglePath(pathFromRoot))
  1248  
  1249  		// (6) Mount options.
  1250  		opts := "rw"
  1251  		if mnt.ReadOnly() {
  1252  			opts = "ro"
  1253  		}
  1254  		if mnt.Flags.NoATime {
  1255  			opts = ",noatime"
  1256  		}
  1257  		if mnt.Flags.NoExec {
  1258  			opts += ",noexec"
  1259  		}
  1260  		fmt.Fprintf(buf, "%s ", opts)
  1261  
  1262  		// (7) Optional fields: zero or more fields of the form "tag[:value]".
  1263  		fmt.Fprintf(buf, "%s ", mnt.generateOptionalTags())
  1264  		// (8) Separator: the end of the optional fields is marked by a single hyphen.
  1265  		fmt.Fprintf(buf, "- ")
  1266  
  1267  		// (9) Filesystem type.
  1268  		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
  1269  
  1270  		// (10) Mount source: filesystem-specific information or "none".
  1271  		fmt.Fprintf(buf, "none ")
  1272  
  1273  		// (11) Superblock options, and final newline.
  1274  		fmt.Fprintf(buf, "%s\n", superBlockOpts(pathFromRoot, mnt))
  1275  	}
  1276  }
  1277  
  1278  // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
  1279  // See Linux fs/seq_file.c:mangle_path.
  1280  func manglePath(p string) string {
  1281  	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
  1282  	return r.Replace(p)
  1283  }
  1284  
  1285  // superBlockOpts returns the super block options string for the the mount at
  1286  // the given path.
  1287  func superBlockOpts(mountPath string, mnt *Mount) string {
  1288  	// Compose super block options by combining global mount flags with
  1289  	// FS-specific mount options.
  1290  	opts := "rw"
  1291  	if mnt.ReadOnly() {
  1292  		opts = "ro"
  1293  	}
  1294  
  1295  	if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
  1296  		opts += "," + mopts
  1297  	}
  1298  
  1299  	// NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we also
  1300  	// need to include the cgroup name in the options. For now we just read that
  1301  	// from the path. Note that this is only possible when "cgroup" isn't
  1302  	// registered as a valid filesystem type.
  1303  	//
  1304  	// TODO(gvisor.dev/issue/190): Once we removed fake cgroupfs support, we
  1305  	// should remove this.
  1306  	if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount {
  1307  		// Real cgroupfs available.
  1308  		return opts
  1309  	}
  1310  	if mnt.fs.FilesystemType().Name() == "cgroup" {
  1311  		splitPath := strings.Split(mountPath, "/")
  1312  		cgroupType := splitPath[len(splitPath)-1]
  1313  		opts += "," + cgroupType
  1314  	}
  1315  
  1316  	return opts
  1317  }
  1318  
  1319  // allocateGroupID returns a new mount group id if one is available, and
  1320  // error otherwise. If the group ID bitmap is full, double the size of the
  1321  // bitmap before allocating the new group id.
  1322  //
  1323  // +checklocks:vfs.mountMu
  1324  func (vfs *VirtualFilesystem) allocateGroupID() (uint32, error) {
  1325  	groupID, err := vfs.groupIDBitmap.FirstZero(1)
  1326  	if err != nil {
  1327  		if err := vfs.groupIDBitmap.Grow(uint32(vfs.groupIDBitmap.Size())); err != nil {
  1328  			return 0, err
  1329  		}
  1330  	}
  1331  	vfs.groupIDBitmap.Add(groupID)
  1332  	return groupID, nil
  1333  }
  1334  
// freeGroupID marks a groupID as available for reuse by removing it from the
// group ID bitmap.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) freeGroupID(id uint32) {
	vfs.groupIDBitmap.Remove(id)
}