github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/vfs/mount.go

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"math"
    21  	"sort"
    22  	"strings"
    23  
    24  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    25  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    26  	"github.com/MerlinKodo/gvisor/pkg/context"
    27  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    28  	"github.com/MerlinKodo/gvisor/pkg/refs"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    30  )
    31  
    32  // MountMax is the maximum number of mounts allowed. In Linux this can be
    33  // configured by the user at /proc/sys/fs/mount-max, but the default is
    34  // 100,000. We set the gVisor limit to 10,000.
    35  const MountMax = 10000
    36  
    37  // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
    38  // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
    39  // (Mount.fs), which applies to path resolution in the context of a particular
    40  // Mount (Mount.key.parent).
    41  //
    42  // Mounts are reference-counted. Unless otherwise specified, all Mount methods
    43  // require that a reference is held.
    44  //
    45  // Mount and Filesystem are distinct types because it's possible for a single
    46  // Filesystem to be mounted at multiple locations and/or in multiple mount
    47  // namespaces.
    48  //
    49  // Mount is analogous to Linux's struct mount. (gVisor does not distinguish
    50  // between struct mount and struct vfsmount.)
    51  //
    52  // +stateify savable
    53  type Mount struct {
    54  	// vfs, fs, root are immutable. References are held on fs and root.
    55  	// Note that for a disconnected mount, root may be nil.
    56  	//
    57  	// Invariant: if not nil, root belongs to fs.
    58  	vfs  *VirtualFilesystem
    59  	fs   *Filesystem
    60  	root *Dentry
    61  
    62  	// ID is the immutable mount ID.
    63  	ID uint64
    64  
    65  	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
    66  	// for MS_RDONLY which is tracked in "writers". Immutable.
    67  	Flags MountFlags
    68  
    69  	// key is protected by VirtualFilesystem.mountMu and
    70  	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
    71  	// key.parent and key.point if they are not nil.
    72  	//
    73  	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
    74  	// key.parent.fs.
    75  	key mountKey `state:".(VirtualDentry)"`
    76  
    77  	// ns is the namespace in which this Mount was mounted. ns is protected by
    78  	// VirtualFilesystem.mountMu.
    79  	ns *MountNamespace
    80  
    81  	// The lower 63 bits of refs are a reference count. The MSB of refs is set
    82  	// if the Mount has been eagerly umounted, as by umount(2) without the
    83  	// MNT_DETACH flag. refs is accessed using atomic memory operations.
    84  	refs atomicbitops.Int64
    85  
    86  	// children is the set of all Mounts for which Mount.key.parent is this
    87  	// Mount. children is protected by VirtualFilesystem.mountMu.
    88  	children map[*Mount]struct{}
    89  
    90  	// isShared indicates this mount has the MS_SHARED propagation type.
    91  	isShared bool
    92  
    93  	// sharedEntry represents an entry in a circular list (ring) of mounts in a
    94  	// shared peer group.
    95  	sharedEntry mountEntry
    96  
    97  	// groupID is the ID for this mount's shared peer group. If the mount is not
    98  	// in a peer group, this is 0.
    99  	groupID uint32
   100  
   101  	// umounted is true if VFS.umountRecursiveLocked() has been called on this
   102  	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
   103  	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
   104  	umounted bool
   105  
   106  	// The lower 63 bits of writers is the number of calls to
   107  	// Mount.CheckBeginWrite() that have not yet been paired with a call to
   108  	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
   109  	// writers is accessed using atomic memory operations.
   110  	writers atomicbitops.Int64
   111  
   112  	// pendingChildren is a list of new child mounts that have not yet been
   113  	// connected to this mount as the parent.
   114  	pendingChildren []*Mount
   115  }
   116  
   117  func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
   118  	mnt := &Mount{
   119  		ID:       vfs.lastMountID.Add(1),
   120  		Flags:    opts.Flags,
   121  		vfs:      vfs,
   122  		fs:       fs,
   123  		root:     root,
   124  		ns:       mntns,
   125  		isShared: false,
   126  		refs:     atomicbitops.FromInt64(1),
   127  	}
   128  	if opts.ReadOnly {
   129  		mnt.setReadOnlyLocked(true)
   130  	}
   131  	mnt.sharedEntry.Init(mnt)
   132  	refs.Register(mnt)
   133  	return mnt
   134  }
   135  
   136  // Options returns a copy of the MountOptions currently applicable to mnt.
   137  func (mnt *Mount) Options() MountOptions {
   138  	mnt.vfs.lockMounts()
   139  	defer mnt.vfs.unlockMounts(context.Background())
   140  	return MountOptions{
   141  		Flags:    mnt.Flags,
   142  		ReadOnly: mnt.ReadOnly(),
   143  	}
   144  }
   145  
   146  func (mnt *Mount) generateOptionalTags() string {
   147  	mnt.vfs.lockMounts()
   148  	defer mnt.vfs.unlockMounts(context.Background())
   149  	// TODO(b/249777195): Support MS_SLAVE and MS_UNBINDABLE propagation types.
   150  	var optional string
   151  	if mnt.isShared {
   152  		optional = fmt.Sprintf("shared:%d", mnt.groupID)
   153  	}
   154  	return optional
   155  }
   156  
   157  // coveringMount returns a mount that completely covers mnt if it exists and nil
   158  // otherwise. A mount that covers another is one that is the only child of its
   159  // parent and whose mountpoint is its parent's root.
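        //
        // For example, if mount B's only child C is mounted at B's root dentry, then
        // C completely covers B and B.coveringMount() returns C. If B has no children,
        // more than one child, or a child mounted elsewhere, it returns nil.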
   160  func (mnt *Mount) coveringMount() *Mount {
   161  	if len(mnt.children) != 1 {
   162  		return nil
   163  	}
   164  	// Get the child from the children map.
   165  	var child *Mount
   166  	for child = range mnt.children {
   167  		break
   168  	}
   169  	if child.point() != mnt.root {
   170  		return nil
   171  	}
   172  	return child
   173  }
   174  
   175  // NewFilesystem creates a new filesystem object not yet associated with any
   176  // mounts. It can be installed into the filesystem tree with ConnectMountAt.
   177  // Note that only the filesystem-specific mount options from opts are used by
   178  // this function; mount flags are ignored. To set mount flags, pass them to a
   179  // corresponding ConnectMountAt.
   180  func (vfs *VirtualFilesystem) NewFilesystem(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*Filesystem, *Dentry, error) {
   181  	rft := vfs.getFilesystemType(fsTypeName)
   182  	if rft == nil {
   183  		return nil, nil, linuxerr.ENODEV
   184  	}
   185  	if !opts.InternalMount && !rft.opts.AllowUserMount {
   186  		return nil, nil, linuxerr.ENODEV
   187  	}
   188  	return rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
   189  }
   190  
   191  // NewDisconnectedMount returns a Mount representing fs with the given root
   192  // (which may be nil). The new Mount is not associated with any MountNamespace
   193  // and is not connected to any other Mounts. References are taken on fs and
   194  // root.
   195  func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) *Mount {
   196  	fs.IncRef()
   197  	if root != nil {
   198  		root.IncRef()
   199  	}
   200  	return newMount(vfs, fs, root, nil /* mntns */, opts)
   201  }
   202  
   203  // MountDisconnected creates a Filesystem configured by the given arguments,
   204  // then returns a Mount representing it. The new Mount is not associated with
   205  // any MountNamespace and is not connected to any other Mounts.
   206  func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
   207  	fs, root, err := vfs.NewFilesystem(ctx, creds, source, fsTypeName, opts)
   208  	if err != nil {
   209  		return nil, err
   210  	}
   211  	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
   212  }
   213  
   214  // ConnectMountAt connects mnt at the path represented by target.
   215  //
   216  // Preconditions: mnt must be disconnected.
   217  func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
   218  	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
   219  	// lock ordering.
   220  	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
   221  	if err != nil {
   222  		return err
   223  	}
   224  	vfs.lockMounts()
   225  	defer vfs.unlockMounts(ctx)
   226  	// This is equivalent to checking for SB_NOUSER in Linux, which is set on all
   227  	// anon mounts and sentry-internal filesystems like pipefs.
   228  	if vd.mount.ns == nil {
   229  		vfs.delayDecRef(vd)
   230  		return linuxerr.EINVAL
   231  	}
   232  	tree := vfs.preparePropagationTree(mnt, vd)
   233  	// Check if the new mount + all the propagation mounts puts us over the max.
   234  	if uint32(len(tree)+1)+vd.mount.ns.mounts > MountMax {
   235  		// We need to unlock mountMu first because DecRef takes a lock on the
   236  		// filesystem mutex in some implementations, which can lead to circular
   237  		// locking.
   238  		vfs.abortPropagationTree(ctx, tree)
   239  		vfs.delayDecRef(vd)
   240  		return linuxerr.ENOSPC
   241  	}
   242  	if err := vfs.connectMountAtLocked(ctx, mnt, vd); err != nil {
   243  		vfs.abortPropagationTree(ctx, tree)
   244  		return err
   245  	}
   246  	vfs.commitPropagationTree(ctx, tree)
   247  	return nil
   248  }
   249  
   250  // connectMountAtLocked attaches mnt at vd. It consumes a reference on vd; any
   251  // references that need to be dropped are registered with vfs.delayDecRef and
   252  // are released when vfs.mountMu is unlocked.
   253  //
   254  // Preconditions:
   255  //   - mnt must be disconnected.
   256  //   - vfs.mountMu must be locked.
   257  //
   258  // +checklocks:vfs.mountMu
   259  func (vfs *VirtualFilesystem) connectMountAtLocked(ctx context.Context, mnt *Mount, vd VirtualDentry) error {
   260  	vd.dentry.mu.Lock()
   261  	for {
   262  		if vd.mount.umounted || vd.dentry.dead {
   263  			vd.dentry.mu.Unlock()
   264  			vfs.delayDecRef(vd)
   265  			return linuxerr.ENOENT
   266  		}
   267  		// vd might have been mounted over between vfs.GetDentryAt() and
   268  		// vfs.mountMu.Lock().
   269  		if !vd.dentry.isMounted() {
   270  			break
   271  		}
   272  		nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
   273  		if nextmnt == nil {
   274  			break
   275  		}
   276  		// It's possible that nextmnt has been umounted but not disconnected,
   277  		// in which case vfs no longer holds a reference on it, and the last
   278  		// reference may be concurrently dropped even though we're holding
   279  		// vfs.mountMu.
   280  		if !nextmnt.tryIncMountedRef() {
   281  			break
   282  		}
   283  		// This can't fail since we're holding vfs.mountMu.
   284  		nextmnt.root.IncRef()
   285  		vd.dentry.mu.Unlock()
   286  		vfs.delayDecRef(vd)
   287  		vd = VirtualDentry{
   288  			mount:  nextmnt,
   289  			dentry: nextmnt.root,
   290  		}
   291  		vd.dentry.mu.Lock()
   292  	}
   293  	// TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
   294  	// point and the mount root are directories, or neither are, and returns
   295  	// ENOTDIR if this is not the case.
   296  	mntns := vd.mount.ns
   297  	vfs.mounts.seq.BeginWrite()
   298  	vfs.connectLocked(mnt, vd, mntns)
   299  	vfs.mounts.seq.EndWrite()
   300  	vd.dentry.mu.Unlock()
   301  	return nil
   302  }
   303  
   304  // CloneMountAt returns a new mount with the same fs, specified root and
   305  // mount options. If mnt's propagation type is shared the new mount is
   306  // automatically made a peer of mnt. If mount options are nil, mnt's
   307  // options are copied.
   308  func (vfs *VirtualFilesystem) CloneMountAt(mnt *Mount, root *Dentry, mopts *MountOptions) *Mount {
   309  	vfs.lockMounts()
   310  	defer vfs.unlockMounts(context.Background())
   311  	clone := vfs.cloneMount(mnt, root, mopts)
   312  	return clone
   313  }
   314  
   315  // cloneMount returns a new mount with mnt.fs as the filesystem and root as the
   316  // root. The returned mount has an extra reference.
   317  //
   318  // +checklocks:vfs.mountMu
   319  // +checklocksalias:mnt.vfs.mountMu=vfs.mountMu
   320  func (vfs *VirtualFilesystem) cloneMount(mnt *Mount, root *Dentry, mopts *MountOptions) *Mount {
   321  	opts := mopts
   322  	if opts == nil {
   323  		opts = &MountOptions{
   324  			Flags:    mnt.Flags,
   325  			ReadOnly: mnt.ReadOnly(),
   326  		}
   327  	}
   328  	clone := vfs.NewDisconnectedMount(mnt.fs, root, opts)
   329  	if mnt.isShared {
   330  		vfs.addPeer(mnt, clone)
   331  	}
   332  	return clone
   333  }
   334  
   335  type cloneTreeNode struct {
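        	// prevMount is the mount in the original tree whose children have not yet
        	// been cloned; parentMount is its already-cloned counterpart, which becomes
        	// the parent of those children's clones.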
   336  	prevMount   *Mount
   337  	parentMount *Mount
   338  }
   339  
   340  // cloneMountTree creates a copy of mnt's tree with the specified root
   341  // dentry at root. The new descendants are added to mnt's pending mount list.
   342  //
   343  // +checklocks:vfs.mountMu
   344  func (vfs *VirtualFilesystem) cloneMountTree(ctx context.Context, mnt *Mount, root *Dentry) (*Mount, error) {
   345  	clone := vfs.cloneMount(mnt, root, nil)
   346  	queue := []cloneTreeNode{{mnt, clone}}
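        	// queue is used as a LIFO stack, so the original tree is walked depth-first;
        	// the traversal order does not matter since every descendant is cloned.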
   347  	for len(queue) != 0 {
   348  		p := queue[len(queue)-1]
   349  		queue = queue[:len(queue)-1]
   350  		for c := range p.prevMount.children {
   351  			m := vfs.cloneMount(c, c.root, nil)
   352  			mp := VirtualDentry{
   353  				mount:  p.parentMount,
   354  				dentry: c.point(),
   355  			}
   356  			mp.IncRef()
   357  			m.setKey(mp)
   358  			p.parentMount.pendingChildren = append(p.parentMount.pendingChildren, m)
   359  			if len(c.children) != 0 {
   360  				queue = append(queue, cloneTreeNode{c, m})
   361  			}
   362  		}
   363  	}
   364  	return clone, nil
   365  }
   366  
   367  // BindAt creates a clone of the source path's parent mount and mounts it at
   368  // the target path. The new mount's root dentry is the one pointed to by the
   369  // source path.
   370  //
   371  // TODO(b/249121230): Support recursive bind mounting.
   372  func (vfs *VirtualFilesystem) BindAt(ctx context.Context, creds *auth.Credentials, source, target *PathOperation) (*Mount, error) {
   373  	sourceVd, err := vfs.GetDentryAt(ctx, creds, source, &GetDentryOptions{})
   374  	if err != nil {
   375  		return nil, err
   376  	}
   377  	defer sourceVd.DecRef(ctx)
   378  	targetVd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
   379  	if err != nil {
   380  		return nil, err
   381  	}
   382  	vfs.lockMounts()
   383  	defer vfs.unlockMounts(ctx)
   384  	// This is equivalent to checking for SB_NOUSER in Linux, which is set on all
   385  	// anon mounts.
   386  	if targetVd.mount.ns == nil {
   387  		vfs.delayDecRef(targetVd)
   388  		return nil, linuxerr.EINVAL
   389  	}
   390  
   391  	clone := vfs.cloneMount(sourceVd.mount, sourceVd.dentry, nil)
   392  	vfs.delayDecRef(clone)
   393  	tree := vfs.preparePropagationTree(clone, targetVd)
   394  	if uint32(1+len(tree))+targetVd.mount.ns.mounts > MountMax {
   395  		vfs.setPropagation(clone, linux.MS_PRIVATE)
   396  		vfs.abortPropagationTree(ctx, tree)
   397  		vfs.delayDecRef(targetVd)
   398  		return nil, linuxerr.ENOSPC
   399  	}
   400  
   401  	if err := vfs.connectMountAtLocked(ctx, clone, targetVd); err != nil {
   402  		vfs.setPropagation(clone, linux.MS_PRIVATE)
   403  		vfs.abortPropagationTree(ctx, tree)
   404  		return nil, err
   405  	}
   406  	vfs.commitPropagationTree(ctx, tree)
   407  	return clone, nil
   408  }
   409  
   410  // MountAt creates and mounts a Filesystem configured by the given arguments.
   411  // The VirtualFilesystem will hold a reference to the Mount until it is
   412  // unmounted.
   413  //
   414  // This method returns the mounted Mount without a reference, for convenience
   415  // during VFS setup when there is no chance of racing with unmount.
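        //
        // A rough sketch of a typical call (ctx, creds, and the root VirtualDentry are
        // assumed to come from the caller; "tmpfs" and "/tmp" are only illustrative):
        //
        //	pop := &PathOperation{Root: root, Start: root, Path: fspath.Parse("/tmp")}
        //	mnt, err := vfs.MountAt(ctx, creds, "none", pop, "tmpfs", &MountOptions{})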
   416  func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) {
   417  	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
   418  	if err != nil {
   419  		return nil, err
   420  	}
   421  	defer mnt.DecRef(ctx)
   422  	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
   423  		return nil, err
   424  	}
   425  	return mnt, nil
   426  }
   427  
   428  // UmountAt removes the Mount at the given path.
   429  func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
   430  	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
   431  		return linuxerr.EINVAL
   432  	}
   433  
   434  	// MNT_FORCE is currently unimplemented except for the permission check.
   435  	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
   436  	// namespace, and not in the owner user namespace for the target mount. See
   437  	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
   438  	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
   439  		return linuxerr.EPERM
   440  	}
   441  
   442  	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
   443  	if err != nil {
   444  		return err
   445  	}
   446  	// This defer statement is encapsulated in a function because vd.mount can be
   447  	// modified in the block below. The arguments to defer are evaluated during
   448  // the construction of the defer statement, so if vd.DecRef() were not
   449  // encapsulated, the vd structure and its underlying pointers _at this point_
   450  // would be copied and DecRef'd at the end of this function.
   451  	defer func() {
   452  		vd.DecRef(ctx)
   453  	}()
   454  // Linux passes the LOOKUP_MOUNTPOINT flag to user_path_at in ksys_umount to
   455  // resolve to the topmost mount in the stack located at the specified path.
   456  	// vfs.GetMountAt() imitates this behavior. See fs/namei.c:user_path_at(...)
   457  	// and fs/namespace.c:ksys_umount(...).
   458  	if vd.dentry.isMounted() {
   459  		if realmnt := vfs.getMountAt(ctx, vd.mount, vd.dentry); realmnt != nil {
   460  			vd.mount.DecRef(ctx)
   461  			vd.mount = realmnt
   462  		}
   463  	} else if vd.dentry != vd.mount.root {
   464  		return linuxerr.EINVAL
   465  	}
   466  
   467  	vfs.lockMounts()
   468  	defer vfs.unlockMounts(ctx)
   469  	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
   470  		vfs.delayDecRef(mntns)
   471  		if mntns != vd.mount.ns {
   472  			return linuxerr.EINVAL
   473  		}
   474  
   475  		if vd.mount == vd.mount.ns.root {
   476  			return linuxerr.EINVAL
   477  		}
   478  	}
   479  
   480  	umountTree := []*Mount{vd.mount}
   481  	parent, mountpoint := vd.mount.parent(), vd.mount.point()
   482  	if parent != nil && parent.isShared {
   483  		for peer := parent.sharedEntry.Next(); peer != parent; peer = peer.sharedEntry.Next() {
   484  			umountMnt := vfs.mounts.Lookup(peer, mountpoint)
   485  			// From https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt:
   486  			// If any peer has some child mounts, then that mount is not unmounted,
   487  			// but all other mounts are unmounted.
   488  			if umountMnt == nil {
   489  				continue
   490  			}
   491  			if len(umountMnt.children) == 0 || umountMnt.coveringMount() != nil {
   492  				umountTree = append(umountTree, umountMnt)
   493  			}
   494  		}
   495  	}
   496  
   497  	// TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
   498  	// root, which we don't implement yet (we'll just fail it since the caller
   499  	// holds a reference on it).
   500  
   501  	vfs.mounts.seq.BeginWrite()
   502  	if opts.Flags&linux.MNT_DETACH == 0 {
   503  		if len(vd.mount.children) != 0 {
   504  			vfs.mounts.seq.EndWrite()
   505  			return linuxerr.EBUSY
   506  		}
   507  		// We are holding a reference on vd.mount.
   508  		expectedRefs := int64(1)
   509  		if !vd.mount.umounted {
   510  			expectedRefs = 2
   511  		}
   512  		if vd.mount.refs.Load()&^math.MinInt64 != expectedRefs { // mask out MSB
   513  			vfs.mounts.seq.EndWrite()
   514  			return linuxerr.EBUSY
   515  		}
   516  	}
   517  	for _, mnt := range umountTree {
   518  		vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{
   519  			eager:               opts.Flags&linux.MNT_DETACH == 0,
   520  			disconnectHierarchy: true,
   521  		})
   522  	}
   523  	vfs.mounts.seq.EndWrite()
   524  	return nil
   525  }
   526  
   527  // +stateify savable
   528  type umountRecursiveOptions struct {
   529  	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
   530  	// on umounted mounts fail.
   531  	//
   532  	// eager is analogous to Linux's UMOUNT_SYNC.
   533  	eager bool
   534  
   535  	// If disconnectHierarchy is true, Mounts that are umounted hierarchically
   536  	// should be disconnected from their parents. (Mounts whose parents are not
   537  	// umounted, which in most cases means the Mount passed to the initial call
   538  	// to umountRecursiveLocked, are unconditionally disconnected for
   539  	// consistency with Linux.)
   540  	//
   541  	// disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
   542  	disconnectHierarchy bool
   543  }
   544  
   545  // umountRecursiveLocked marks mnt and its descendants as umounted.
   546  //
   547  // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
   548  //
   549  // Preconditions:
   550  //   - vfs.mountMu must be locked.
   551  //   - vfs.mounts.seq must be in a writer critical section.
   552  //
   553  // +checklocks:vfs.mountMu
   554  func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions) {
   555  	// covered mounts are a special case where the grandchild mount is
   556  	// reconnected to the parent after the child is disconnected.
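        	// For example, if the umounted mount B sits on a dentry in parent A and B's
        	// only child C is mounted at B's root, then C is disconnected here and, at
        	// the end of this function, reconnected at B's old mount point in A.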
   557  	var cover *Mount
   558  	if parent := mnt.parent(); parent != nil && !parent.umounted {
   559  		if cover = mnt.coveringMount(); cover != nil {
   560  			vfs.delayDecRef(vfs.disconnectLocked(cover))
   561  			cover.setKey(mnt.getKey())
   562  		}
   563  	}
   564  	if !mnt.umounted {
   565  		mnt.umounted = true
   566  		vfs.delayDecRef(mnt)
   567  		if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) {
   568  			vfs.delayDecRef(vfs.disconnectLocked(mnt))
   569  		}
   570  		if mnt.isShared {
   571  			vfs.setPropagation(mnt, linux.MS_PRIVATE)
   572  		}
   573  	}
   574  	if opts.eager {
   575  		for {
   576  			refs := mnt.refs.Load()
   577  			if refs < 0 {
   578  				break
   579  			}
   580  			if mnt.refs.CompareAndSwap(refs, refs|math.MinInt64) {
   581  				break
   582  			}
   583  		}
   584  	}
   585  	for child := range mnt.children {
   586  		vfs.umountRecursiveLocked(child, opts)
   587  	}
   588  	if cover != nil {
   589  		mp := cover.getKey()
   590  		mp.IncRef()
   591  		mp.dentry.mu.Lock()
   592  		vfs.connectLocked(cover, mp, mp.mount.ns)
   593  		mp.dentry.mu.Unlock()
   594  		vfs.delayDecRef(cover)
   595  	}
   596  }
   597  
   598  // connectLocked makes vd the mount parent/point for mnt. It consumes
   599  // references held by vd.
   600  //
   601  // Preconditions:
   602  //   - vfs.mountMu must be locked.
   603  //   - vfs.mounts.seq must be in a writer critical section.
   604  //   - vd.dentry.mu must be locked.
   605  //   - mnt.parent() == nil, i.e. mnt must not already be connected.
   606  func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
   607  	if checkInvariants {
   608  		if mnt.parent() != nil {
   609  			panic("VFS.connectLocked called on connected mount")
   610  		}
   611  	}
   612  	mnt.IncRef() // dropped by callers of umountRecursiveLocked
   613  	mnt.setKey(vd)
   614  	if vd.mount.children == nil {
   615  		vd.mount.children = make(map[*Mount]struct{})
   616  	}
   617  	vd.mount.children[mnt] = struct{}{}
   618  	vd.dentry.mounts.Add(1)
   619  	mnt.ns = mntns
   620  	mntns.mountpoints[vd.dentry]++
   621  	mntns.mounts++
   622  	vfs.mounts.insertSeqed(mnt)
   623  	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
   624  	if !ok {
   625  		vfsmpmounts = make(map[*Mount]struct{})
   626  		vfs.mountpoints[vd.dentry] = vfsmpmounts
   627  	}
   628  	vfsmpmounts[mnt] = struct{}{}
   629  	vfs.maybeResolveMountPromise(vd)
   630  }
   631  
   632  // disconnectLocked makes mnt have no mount parent/point and returns its old
   633  // mount parent/point with a reference held.
   634  //
   635  // Preconditions:
   636  //   - vfs.mountMu must be locked.
   637  //   - vfs.mounts.seq must be in a writer critical section.
   638  //   - mnt.parent() != nil.
   639  func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
   640  	vd := mnt.getKey()
   641  	if checkInvariants {
   642  		if vd.mount == nil {
   643  			panic("VFS.disconnectLocked called on disconnected mount")
   644  		}
   645  		if mnt.ns.mountpoints[vd.dentry] == 0 {
   646  			panic("VFS.disconnectLocked called on dentry with zero mountpoints.")
   647  		}
   648  		if mnt.ns.mounts == 0 {
   649  			panic("VFS.disconnectLocked called on namespace with zero mounts.")
   650  		}
   651  	}
   652  	delete(vd.mount.children, mnt)
   653  	vd.dentry.mounts.Add(math.MaxUint32) // -1
   654  	mnt.ns.mountpoints[vd.dentry]--
   655  	mnt.ns.mounts--
   656  	if mnt.ns.mountpoints[vd.dentry] == 0 {
   657  		delete(mnt.ns.mountpoints, vd.dentry)
   658  	}
   659  	vfs.mounts.removeSeqed(mnt)
   660  	mnt.loadKey(VirtualDentry{}) // Clear mnt.key.
   661  	vfsmpmounts := vfs.mountpoints[vd.dentry]
   662  	delete(vfsmpmounts, mnt)
   663  	if len(vfsmpmounts) == 0 {
   664  		delete(vfs.mountpoints, vd.dentry)
   665  	}
   666  	return vd
   667  }
   668  
   669  // tryIncMountedRef increments mnt's reference count and returns true. If mnt's
    670  // reference count is already zero, or mnt has been eagerly umounted,
   671  // tryIncMountedRef does nothing and returns false.
   672  //
   673  // tryIncMountedRef does not require that a reference is held on mnt.
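        //
        // The MSB of refs doubles as the eager-unmount flag: a live mount with two
        // references stores 2, while the same mount after an eager unmount stores
        // math.MinInt64 + 2, which is negative, so tryIncMountedRef fails.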
   674  func (mnt *Mount) tryIncMountedRef() bool {
   675  	for {
   676  		r := mnt.refs.Load()
   677  		if r <= 0 { // r < 0 => MSB set => eagerly unmounted
   678  			return false
   679  		}
   680  		if mnt.refs.CompareAndSwap(r, r+1) {
   681  			if mnt.LogRefs() {
   682  				refs.LogTryIncRef(mnt, r+1)
   683  			}
   684  			return true
   685  		}
   686  	}
   687  }
   688  
   689  // IncRef increments mnt's reference count.
   690  func (mnt *Mount) IncRef() {
   691  	// In general, negative values for mnt.refs are valid because the MSB is
   692  	// the eager-unmount bit.
   693  	r := mnt.refs.Add(1)
   694  	if mnt.LogRefs() {
   695  		refs.LogIncRef(mnt, r)
   696  	}
   697  }
   698  
   699  // DecRef decrements mnt's reference count.
   700  func (mnt *Mount) DecRef(ctx context.Context) {
   701  	r := mnt.refs.Add(-1)
   702  	if mnt.LogRefs() {
   703  		refs.LogDecRef(mnt, r)
   704  	}
   705  	if r&^math.MinInt64 == 0 { // mask out MSB
   706  		refs.Unregister(mnt)
   707  		mnt.destroy(ctx)
   708  	}
   709  }
   710  
   711  func (mnt *Mount) destroy(ctx context.Context) {
   712  	if mnt.parent() != nil {
   713  		mnt.vfs.lockMounts()
   714  		mnt.vfs.mounts.seq.BeginWrite()
   715  		vd := mnt.vfs.disconnectLocked(mnt)
   716  		if vd.Ok() {
   717  			mnt.vfs.delayDecRef(vd)
   718  		}
   719  		mnt.vfs.mounts.seq.EndWrite()
   720  		mnt.vfs.unlockMounts(ctx)
   721  	}
   722  	if mnt.root != nil {
   723  		mnt.root.DecRef(ctx)
   724  	}
   725  	mnt.fs.DecRef(ctx)
   726  }
   727  
   728  // RefType implements refs.CheckedObject.Type.
   729  func (mnt *Mount) RefType() string {
   730  	return "vfs.Mount"
   731  }
   732  
   733  // LeakMessage implements refs.CheckedObject.LeakMessage.
   734  func (mnt *Mount) LeakMessage() string {
   735  	return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, mnt.refs.Load())
   736  }
   737  
   738  // LogRefs implements refs.CheckedObject.LogRefs.
   739  //
   740  // This should only be set to true for debugging purposes, as it can generate an
   741  // extremely large amount of output and drastically degrade performance.
   742  func (mnt *Mount) LogRefs() bool {
   743  	return false
   744  }
   745  
   746  // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
   747  // a reference on the returned Mount. If (mnt, d) is not a mount point,
   748  // getMountAt returns nil.
   749  //
   750  // getMountAt is analogous to Linux's fs/namei.c:follow_mount().
   751  //
   752  // Preconditions: References are held on mnt and d.
   753  func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
   754  	// The first mount is special-cased:
   755  	//
   756  	//	- The caller is assumed to have checked d.isMounted() already. (This
   757  	//		isn't a precondition because it doesn't matter for correctness.)
   758  	//
   759  	//	- We return nil, instead of mnt, if there is no mount at (mnt, d).
   760  	//
   761  	//	- We don't drop the caller's references on mnt and d.
   762  retryFirst:
   763  	next := vfs.mounts.Lookup(mnt, d)
   764  	if next == nil {
   765  		return nil
   766  	}
   767  	if !next.tryIncMountedRef() {
   768  		// Raced with umount.
   769  		goto retryFirst
   770  	}
   771  	mnt = next
   772  	d = next.root
   773  	// We don't need to take Dentry refs anywhere in this function because
   774  	// Mounts hold references on Mount.root, which is immutable.
   775  	for d.isMounted() {
   776  		next := vfs.mounts.Lookup(mnt, d)
   777  		if next == nil {
   778  			break
   779  		}
   780  		if !next.tryIncMountedRef() {
   781  			// Raced with umount.
   782  			continue
   783  		}
   784  		mnt.DecRef(ctx)
   785  		mnt = next
   786  		d = next.root
   787  	}
   788  	return mnt
   789  }
   790  
   791  // getMountpointAt returns the mount point for the stack of Mounts including
   792  // mnt. It takes a reference on the returned VirtualDentry. If no such mount
   793  // point exists (i.e. mnt is a root mount), getMountpointAt returns a zero VirtualDentry.
   794  //
   795  // Preconditions:
   796  //   - References are held on mnt and root.
   797  //   - vfsroot is not (mnt, mnt.root).
   798  func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
   799  	// The first mount is special-cased:
   800  	//
   801  	//	- The caller must have already checked mnt against vfsroot.
   802  	//
   803  	//	- We return nil, instead of mnt, if there is no mount point for mnt.
   804  	//
   805  	//	- We don't drop the caller's reference on mnt.
   806  retryFirst:
   807  	epoch := vfs.mounts.seq.BeginRead()
   808  	parent, point := mnt.parent(), mnt.point()
   809  	if !vfs.mounts.seq.ReadOk(epoch) {
   810  		goto retryFirst
   811  	}
   812  	if parent == nil {
   813  		return VirtualDentry{}
   814  	}
   815  	if !parent.tryIncMountedRef() {
   816  		// Raced with umount.
   817  		goto retryFirst
   818  	}
   819  	if !point.TryIncRef() {
   820  		// Since Mount holds a reference on Mount.key.point, this can only
   821  		// happen due to a racing change to Mount.key.
   822  		parent.DecRef(ctx)
   823  		goto retryFirst
   824  	}
   825  	if !vfs.mounts.seq.ReadOk(epoch) {
   826  		point.DecRef(ctx)
   827  		parent.DecRef(ctx)
   828  		goto retryFirst
   829  	}
   830  	mnt = parent
   831  	d := point
   832  	for {
   833  		if mnt == vfsroot.mount && d == vfsroot.dentry {
   834  			break
   835  		}
   836  		if d != mnt.root {
   837  			break
   838  		}
   839  	retryNotFirst:
   840  		epoch := vfs.mounts.seq.BeginRead()
   841  		parent, point := mnt.parent(), mnt.point()
   842  		if !vfs.mounts.seq.ReadOk(epoch) {
   843  			goto retryNotFirst
   844  		}
   845  		if parent == nil {
   846  			break
   847  		}
   848  		if !parent.tryIncMountedRef() {
   849  			// Raced with umount.
   850  			goto retryNotFirst
   851  		}
   852  		if !point.TryIncRef() {
   853  			// Since Mount holds a reference on Mount.key.point, this can
   854  			// only happen due to a racing change to Mount.key.
   855  			parent.DecRef(ctx)
   856  			goto retryNotFirst
   857  		}
   858  		if !vfs.mounts.seq.ReadOk(epoch) {
   859  			point.DecRef(ctx)
   860  			parent.DecRef(ctx)
   861  			goto retryNotFirst
   862  		}
   863  		d.DecRef(ctx)
   864  		mnt.DecRef(ctx)
   865  		mnt = parent
   866  		d = point
   867  	}
   868  	return VirtualDentry{mnt, d}
   869  }
   870  
   871  // PivotRoot makes the location pointed to by newRootPop the root of the current
   872  // namespace, and moves the current root to the location pointed to by
   873  // putOldPop.
   874  func (vfs *VirtualFilesystem) PivotRoot(ctx context.Context, creds *auth.Credentials, newRootPop *PathOperation, putOldPop *PathOperation) error {
   875  	newRootVd, err := vfs.GetDentryAt(ctx, creds, newRootPop, &GetDentryOptions{CheckSearchable: true})
   876  	if err != nil {
   877  		return err
   878  	}
   879  	defer newRootVd.DecRef(ctx)
   880  	putOldVd, err := vfs.GetDentryAt(ctx, creds, putOldPop, &GetDentryOptions{CheckSearchable: true})
   881  	if err != nil {
   882  		return err
   883  	}
   884  	defer putOldVd.DecRef(ctx)
   885  	rootVd := RootFromContext(ctx)
   886  	defer rootVd.DecRef(ctx)
   887  
   888  retry:
   889  	epoch := vfs.mounts.seq.BeginRead()
   890  	// Neither new_root nor put_old can be on the same mount as the current
   891  	// root mount.
   892  	if newRootVd.mount == rootVd.mount || putOldVd.mount == rootVd.mount {
   893  		return linuxerr.EBUSY
   894  	}
   895  	// new_root must be a mountpoint.
   896  	if newRootVd.mount.root != newRootVd.dentry {
   897  		return linuxerr.EINVAL
   898  	}
   899  	// put_old must be at or underneath new_root.
   900  	path, err := vfs.PathnameReachable(ctx, newRootVd, putOldVd)
   901  	if err != nil || len(path) == 0 {
   902  		return linuxerr.EINVAL
   903  	}
   904  	// The current root directory must be a mountpoint
   905  	// (in the case it has been chrooted).
   906  	if rootVd.mount.root != rootVd.dentry {
   907  		return linuxerr.EINVAL
   908  	}
   909  	// The current root and the new root cannot be on the rootfs mount.
   910  	if rootVd.mount.parent() == nil || newRootVd.mount.parent() == nil {
   911  		return linuxerr.EINVAL
   912  	}
   913  	// The current root and the new root must be in the context's mount namespace.
   914  	ns := MountNamespaceFromContext(ctx)
   915  	defer ns.DecRef(ctx)
   916  	vfs.lockMounts()
   917  	if rootVd.mount.ns != ns || newRootVd.mount.ns != ns {
   918  		vfs.unlockMounts(ctx)
   919  		return linuxerr.EINVAL
   920  	}
   921  
   922  	// Either the mount point at new_root, or the parent mount of that mount
   923  	// point, has propagation type MS_SHARED.
   924  	if newRootParent := newRootVd.mount.parent(); newRootVd.mount.isShared || newRootParent.isShared {
   925  		vfs.unlockMounts(ctx)
   926  		return linuxerr.EINVAL
   927  	}
   928  	// put_old is a mount point and has the propagation type MS_SHARED.
   929  	if putOldVd.mount.root == putOldVd.dentry && putOldVd.mount.isShared {
   930  		vfs.unlockMounts(ctx)
   931  		return linuxerr.EINVAL
   932  	}
   933  
   934  	if !vfs.mounts.seq.BeginWriteOk(epoch) {
   935  		// Checks above raced with a mount change.
   936  		vfs.unlockMounts(ctx)
   937  		goto retry
   938  	}
   939  	defer vfs.unlockMounts(ctx)
   940  	mp := vfs.disconnectLocked(newRootVd.mount)
   941  	vfs.delayDecRef(mp)
   942  	rootMp := vfs.disconnectLocked(rootVd.mount)
   943  
   944  	putOldVd.IncRef()
   945  	putOldVd.dentry.mu.Lock()
   946  	vfs.connectLocked(rootVd.mount, putOldVd, ns)
   947  	putOldVd.dentry.mu.Unlock()
   948  
   949  	rootMp.dentry.mu.Lock()
   950  	vfs.connectLocked(newRootVd.mount, rootMp, ns)
   951  	rootMp.dentry.mu.Unlock()
   952  	vfs.mounts.seq.EndWrite()
   953  
   954  	vfs.delayDecRef(newRootVd.mount)
   955  	vfs.delayDecRef(rootVd.mount)
   956  	return nil
   957  }
   958  
   959  // SetMountReadOnly sets the mount as ReadOnly.
   960  func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
   961  	vfs.lockMounts()
   962  	defer vfs.unlockMounts(context.Background())
   963  	return mnt.setReadOnlyLocked(ro)
   964  }
   965  
   966  // CheckBeginWrite increments the counter of in-progress write operations on
   967  // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
   968  // EROFS.
   969  //
   970  // If CheckBeginWrite succeeds, EndWrite must be called when the write
   971  // operation is finished.
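        //
        // A typical caller pairs the two calls, e.g. (sketch):
        //
        //	if err := mnt.CheckBeginWrite(); err != nil {
        //		return err
        //	}
        //	defer mnt.EndWrite()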
   972  func (mnt *Mount) CheckBeginWrite() error {
   973  	if mnt.writers.Add(1) < 0 {
   974  		mnt.writers.Add(-1)
   975  		return linuxerr.EROFS
   976  	}
   977  	return nil
   978  }
   979  
   980  // EndWrite indicates that a write operation signaled by a previous successful
   981  // call to CheckBeginWrite has finished.
   982  func (mnt *Mount) EndWrite() {
   983  	mnt.writers.Add(-1)
   984  }
   985  
   986  // Preconditions: VirtualFilesystem.mountMu must be locked.
   987  func (mnt *Mount) setReadOnlyLocked(ro bool) error {
   988  	if oldRO := mnt.writers.Load() < 0; oldRO == ro {
   989  		return nil
   990  	}
   991  	if ro {
   992  		if !mnt.writers.CompareAndSwap(0, math.MinInt64) {
   993  			return linuxerr.EBUSY
   994  		}
   995  		return nil
   996  	}
   997  	// Unset MSB without dropping any temporary increments from failed calls to
   998  	// mnt.CheckBeginWrite().
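        	// Adding math.MinInt64 clears the MSB by two's-complement wraparound, e.g.
        	// math.MinInt64 + 2 (read-only with two failed writers mid-call) becomes 2.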
   999  	mnt.writers.Add(math.MinInt64)
  1000  	return nil
  1001  }
  1002  
  1003  // ReadOnly returns true if the mount is read-only.
  1004  func (mnt *Mount) ReadOnly() bool {
  1005  	return mnt.writers.Load() < 0
  1006  }
  1007  
  1008  // Filesystem returns the mounted Filesystem. It does not take a reference on
  1009  // the returned Filesystem.
  1010  func (mnt *Mount) Filesystem() *Filesystem {
  1011  	return mnt.fs
  1012  }
  1013  
  1014  // submountsLocked returns this Mount and all Mounts that are descendants of
  1015  // it.
  1016  //
  1017  // Precondition: mnt.vfs.mountMu must be held.
  1018  func (mnt *Mount) submountsLocked() []*Mount {
  1019  	mounts := []*Mount{mnt}
  1020  	for m := range mnt.children {
  1021  		mounts = append(mounts, m.submountsLocked()...)
  1022  	}
  1023  	return mounts
  1024  }
  1025  
  1026  // Root returns the mount's root. It does not take a reference on the returned
  1027  // Dentry.
  1028  func (mnt *Mount) Root() *Dentry {
  1029  	return mnt.root
  1030  }
  1031  
  1032  // GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
  1033  //
  1034  // Preconditions: taskRootDir.Ok().
  1035  func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
  1036  	rootMnt := taskRootDir.mount
  1037  
  1038  	vfs.lockMounts()
  1039  	mounts := rootMnt.submountsLocked()
  1040  	// Take a reference on mounts since we need to drop vfs.mountMu before
  1041  	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()).
  1042  	for _, mnt := range mounts {
  1043  		mnt.IncRef()
  1044  	}
  1045  	vfs.unlockMounts(ctx)
  1046  	defer func() {
  1047  		for _, mnt := range mounts {
  1048  			mnt.DecRef(ctx)
  1049  		}
  1050  	}()
  1051  	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
  1052  
  1053  	for _, mnt := range mounts {
  1054  		// Get the path to this mount relative to task root.
  1055  		mntRootVD := VirtualDentry{
  1056  			mount:  mnt,
  1057  			dentry: mnt.root,
  1058  		}
  1059  		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
  1060  		if err != nil {
  1061  			// For some reason we didn't get a path. Log a warning
  1062  			// and run with empty path.
  1063  			ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err)
  1064  			path = ""
  1065  		}
  1066  		if path == "" {
  1067  			// Either an error occurred, or path is not reachable
  1068  			// from root.
  1069  			break
  1070  		}
  1071  
  1072  		opts := "rw"
  1073  		if mnt.ReadOnly() {
  1074  			opts = "ro"
  1075  		}
  1076  		if mnt.Flags.NoATime {
  1077  			opts += ",noatime"
  1078  		}
  1079  		if mnt.Flags.NoExec {
  1080  			opts += ",noexec"
  1081  		}
  1082  		if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
  1083  			opts += "," + mopts
  1084  		}
  1085  
  1086  		// Format:
  1087  		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
  1088  		//
  1089  		// The "needs dump" and "fsck order" flags are always 0, which
  1090  		// is allowed.
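        		//
        		// For example, a read-only tmpfs mounted at /tmp would produce a line
        		// like "none /tmp tmpfs ro 0 0" (exact options depend on the mount).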
  1091  		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
  1092  	}
  1093  }
  1094  
  1095  // GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
  1096  // buf.
  1097  //
  1098  // Preconditions: taskRootDir.Ok().
  1099  func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
  1100  	rootMnt := taskRootDir.mount
  1101  
  1102  	vfs.lockMounts()
  1103  	mounts := rootMnt.submountsLocked()
  1104  	// Take a reference on mounts since we need to drop vfs.mountMu before
  1105  	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or
  1106  	// vfs.StatAt() (=> FilesystemImpl.StatAt()).
  1107  	for _, mnt := range mounts {
  1108  		mnt.IncRef()
  1109  	}
  1110  	vfs.unlockMounts(ctx)
  1111  	defer func() {
  1112  		for _, mnt := range mounts {
  1113  			mnt.DecRef(ctx)
  1114  		}
  1115  	}()
  1116  	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
  1117  
  1118  	creds := auth.CredentialsFromContext(ctx)
  1119  	for _, mnt := range mounts {
  1120  		// Get the path to this mount relative to task root.
  1121  		mntRootVD := VirtualDentry{
  1122  			mount:  mnt,
  1123  			dentry: mnt.root,
  1124  		}
  1125  		pathFromRoot, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
  1126  		if err != nil {
  1127  			// For some reason we didn't get a path. Log a warning
  1128  			// and skip this mount.
  1129  			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
  1130  			continue
  1131  		}
  1132  		if pathFromRoot == "" {
  1133  			// The path is not reachable from root.
  1134  			continue
  1135  		}
  1136  		var pathFromFS string
  1137  		pathFromFS, err = vfs.PathnameInFilesystem(ctx, mntRootVD)
  1138  		if err != nil {
  1139  			// For some reason we didn't get a path. Log a warning
  1140  			// and skip this mount.
  1141  			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
  1142  			continue
  1143  		}
  1144  		if pathFromFS == "" {
  1145  			// The path is not reachable from the filesystem root.
  1146  			continue
  1147  		}
  1148  		// Stat the mount root to get the major/minor device numbers.
  1149  		pop := &PathOperation{
  1150  			Root:  mntRootVD,
  1151  			Start: mntRootVD,
  1152  		}
  1153  		statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{})
  1154  		if err != nil {
  1155  			// Well that's not good. Ignore this mount.
  1156  			ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err)
  1157  			continue
  1158  		}
  1159  
  1160  		// Format:
  1161  		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
  1162  		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
  1163  
  1164  		// (1) Mount ID.
  1165  		fmt.Fprintf(buf, "%d ", mnt.ID)
  1166  
  1167  		// (2)  Parent ID (or this ID if there is no parent).
  1168  		// Note that even if the call to mnt.parent() races with Mount
  1169  		// destruction (which is possible since we're not holding vfs.mountMu),
  1170  		// its Mount.ID will still be valid.
  1171  		pID := mnt.ID
  1172  		if p := mnt.parent(); p != nil {
  1173  			pID = p.ID
  1174  		}
  1175  		fmt.Fprintf(buf, "%d ", pID)
  1176  
  1177  		// (3) Major:Minor device ID. We don't have a superblock, so we
  1178  		// just use the root inode device number.
  1179  		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
  1180  
  1181  		// (4) Root: the pathname of the directory in the filesystem
  1182  		// which forms the root of this mount.
  1183  		fmt.Fprintf(buf, "%s ", manglePath(pathFromFS))
  1184  
  1185  		// (5) Mount point (relative to process root).
  1186  		fmt.Fprintf(buf, "%s ", manglePath(pathFromRoot))
  1187  
  1188  		// (6) Mount options.
  1189  		opts := "rw"
  1190  		if mnt.ReadOnly() {
  1191  			opts = "ro"
  1192  		}
  1193  		if mnt.Flags.NoATime {
  1194  			opts += ",noatime"
  1195  		}
  1196  		if mnt.Flags.NoExec {
  1197  			opts += ",noexec"
  1198  		}
  1199  		fmt.Fprintf(buf, "%s ", opts)
  1200  
  1201  		// (7) Optional fields: zero or more fields of the form "tag[:value]".
  1202  		fmt.Fprintf(buf, "%s ", mnt.generateOptionalTags())
  1203  		// (8) Separator: the end of the optional fields is marked by a single hyphen.
  1204  		fmt.Fprintf(buf, "- ")
  1205  
  1206  		// (9) Filesystem type.
  1207  		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
  1208  
  1209  		// (10) Mount source: filesystem-specific information or "none".
  1210  		fmt.Fprintf(buf, "none ")
  1211  
  1212  		// (11) Superblock options, and final newline.
  1213  		fmt.Fprintf(buf, "%s\n", superBlockOpts(pathFromRoot, mnt))
  1214  	}
  1215  }
  1216  
  1217  // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
  1218  // See Linux fs/seq_file.c:mangle_path.
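        //
        // For example, manglePath("/mnt/a b") returns `/mnt/a\040b`.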
  1219  func manglePath(p string) string {
  1220  	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
  1221  	return r.Replace(p)
  1222  }
  1223  
  1224  // superBlockOpts returns the super block options string for the mount at
  1225  // the given path.
  1226  func superBlockOpts(mountPath string, mnt *Mount) string {
  1227  	// Compose super block options by combining global mount flags with
  1228  	// FS-specific mount options.
  1229  	opts := "rw"
  1230  	if mnt.ReadOnly() {
  1231  		opts = "ro"
  1232  	}
  1233  
  1234  	if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
  1235  		opts += "," + mopts
  1236  	}
  1237  
  1238  	// NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we also
  1239  	// need to include the cgroup name in the options. For now we just read that
  1240  	// from the path. Note that this is only possible when "cgroup" isn't
  1241  	// registered as a valid filesystem type.
  1242  	//
  1243  	// TODO(gvisor.dev/issue/190): Once fake cgroupfs support is removed, we
  1244  	// should remove this.
  1245  	if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount {
  1246  		// Real cgroupfs available.
  1247  		return opts
  1248  	}
  1249  	if mnt.fs.FilesystemType().Name() == "cgroup" {
  1250  		splitPath := strings.Split(mountPath, "/")
  1251  		cgroupType := splitPath[len(splitPath)-1]
  1252  		opts += "," + cgroupType
  1253  	}
  1254  
  1255  	return opts
  1256  }
  1257  
  1258  // allocateGroupID returns a new mount group ID if one is available, and an
  1259  // error otherwise. If the group ID bitmap is full, it doubles the size of the
  1260  // bitmap before allocating the new group ID.
  1261  //
  1262  // +checklocks:vfs.mountMu
  1263  func (vfs *VirtualFilesystem) allocateGroupID() (uint32, error) {
  1264  	groupID, err := vfs.groupIDBitmap.FirstZero(1)
  1265  	if err != nil {
  1266  		if err := vfs.groupIDBitmap.Grow(uint32(vfs.groupIDBitmap.Size())); err != nil {
  1267  			return 0, err
  1268  		}
  1269  	}
  1270  	vfs.groupIDBitmap.Add(groupID)
  1271  	return groupID, nil
  1272  }
  1273  
  1274  // freeGroupID marks a groupID as available for reuse.
  1275  //
  1276  // +checklocks:vfs.mountMu
  1277  func (vfs *VirtualFilesystem) freeGroupID(id uint32) {
  1278  	vfs.groupIDBitmap.Remove(id)
  1279  }