github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/vfs/mount.go

github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/vfs/mount.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"math"
    21  	"sort"
    22  	"strings"
    23  	"sync/atomic"
    24  
    25  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    26  	"github.com/SagerNet/gvisor/pkg/context"
    27  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    28  	"github.com/SagerNet/gvisor/pkg/refsvfs2"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    30  	"github.com/SagerNet/gvisor/pkg/syserror"
    31  )
    32  
// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
// (Mount.fs), which applies to path resolution in the context of a particular
// Mount (Mount.key.parent).
//
// Mounts are reference-counted. Unless otherwise specified, all Mount methods
// require that a reference is held.
//
// Mount and Filesystem are distinct types because it's possible for a single
// Filesystem to be mounted at multiple locations and/or in multiple mount
// namespaces.
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
//
// +stateify savable
type Mount struct {
	// vfs, fs, root are immutable. References are held on fs and root.
	// Note that for a disconnected mount, root may be nil.
	//
	// Invariant: if not nil, root belongs to fs.
	vfs  *VirtualFilesystem
	fs   *Filesystem
	root *Dentry

	// ID is the immutable mount ID. It is allocated in newMount by
	// atomically incrementing VirtualFilesystem.lastMountID.
	ID uint64

	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
	// for MS_RDONLY which is tracked in "writers". Immutable.
	Flags MountFlags

	// key is protected by VirtualFilesystem.mountMu and
	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
	// key.parent and key.point if they are not nil.
	//
	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
	// key.parent.fs.
	key mountKey `state:".(VirtualDentry)"`

	// ns is the namespace in which this Mount was mounted. ns is protected by
	// VirtualFilesystem.mountMu. ns is nil for Mounts created by
	// NewDisconnectedMount until connectLocked assigns a namespace.
	ns *MountNamespace

	// The lower 63 bits of refs are a reference count. The MSB of refs is set
	// if the Mount has been eagerly umounted, as by umount(2) without the
	// MNT_DETACH flag. refs is accessed using atomic memory operations.
	// newMount initializes refs to 1.
	refs int64

	// children is the set of all Mounts for which Mount.key.parent is this
	// Mount. children is protected by VirtualFilesystem.mountMu. It is
	// allocated lazily by connectLocked.
	children map[*Mount]struct{}

	// umounted is true if VFS.umountRecursiveLocked() has been called on this
	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
	umounted bool

	// The lower 63 bits of writers is the number of calls to
	// Mount.CheckBeginWrite() that have not yet been paired with a call to
	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
	// writers is accessed using atomic memory operations.
	writers int64
}
    97  
    98  func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
    99  	mnt := &Mount{
   100  		ID:    atomic.AddUint64(&vfs.lastMountID, 1),
   101  		Flags: opts.Flags,
   102  		vfs:   vfs,
   103  		fs:    fs,
   104  		root:  root,
   105  		ns:    mntns,
   106  		refs:  1,
   107  	}
   108  	if opts.ReadOnly {
   109  		mnt.setReadOnlyLocked(true)
   110  	}
   111  	refsvfs2.Register(mnt)
   112  	return mnt
   113  }
   114  
   115  // Options returns a copy of the MountOptions currently applicable to mnt.
   116  func (mnt *Mount) Options() MountOptions {
   117  	mnt.vfs.mountMu.Lock()
   118  	defer mnt.vfs.mountMu.Unlock()
   119  	return MountOptions{
   120  		Flags:    mnt.Flags,
   121  		ReadOnly: mnt.ReadOnly(),
   122  	}
   123  }
   124  
// A MountNamespace is a collection of Mounts.
//
// MountNamespaces are reference-counted. Unless otherwise specified, all
// MountNamespace methods require that a reference is held.
//
// MountNamespace is analogous to Linux's struct mnt_namespace.
//
// +stateify savable
type MountNamespace struct {
	// MountNamespaceRefs provides the namespace's reference count.
	MountNamespaceRefs

	// Owner is the usernamespace that owns this mount namespace.
	Owner *auth.UserNamespace

	// root is the MountNamespace's root mount. root is immutable.
	root *Mount

	// mountpoints maps all Dentries which are mount points in this namespace
	// to the number of Mounts for which they are mount points. mountpoints is
	// protected by VirtualFilesystem.mountMu.
	//
	// mountpoints is used to determine if a Dentry can be moved or removed
	// (which requires that the Dentry is not a mount point in the calling
	// namespace).
	//
	// mountpoints is maintained even if there are no references held on the
	// MountNamespace; this is required to ensure that
	// VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
	// correctly on unreferenced MountNamespaces.
	mountpoints map[*Dentry]uint32
}
   155  
   156  // NewMountNamespace returns a new mount namespace with a root filesystem
   157  // configured by the given arguments. A reference is taken on the returned
   158  // MountNamespace.
   159  func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) {
   160  	rft := vfs.getFilesystemType(fsTypeName)
   161  	if rft == nil {
   162  		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
   163  		return nil, linuxerr.ENODEV
   164  	}
   165  	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
   166  	if err != nil {
   167  		return nil, err
   168  	}
   169  	mntns := &MountNamespace{
   170  		Owner:       creds.UserNamespace,
   171  		mountpoints: make(map[*Dentry]uint32),
   172  	}
   173  	mntns.InitRefs()
   174  	mntns.root = newMount(vfs, fs, root, mntns, opts)
   175  	return mntns, nil
   176  }
   177  
   178  // NewDisconnectedMount returns a Mount representing fs with the given root
   179  // (which may be nil). The new Mount is not associated with any MountNamespace
   180  // and is not connected to any other Mounts. References are taken on fs and
   181  // root.
   182  func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
   183  	fs.IncRef()
   184  	if root != nil {
   185  		root.IncRef()
   186  	}
   187  	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
   188  }
   189  
   190  // MountDisconnected creates a Filesystem configured by the given arguments,
   191  // then returns a Mount representing it. The new Mount is not associated with
   192  // any MountNamespace and is not connected to any other Mounts.
   193  func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
   194  	rft := vfs.getFilesystemType(fsTypeName)
   195  	if rft == nil {
   196  		return nil, linuxerr.ENODEV
   197  	}
   198  	if !opts.InternalMount && !rft.opts.AllowUserMount {
   199  		return nil, linuxerr.ENODEV
   200  	}
   201  	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
   202  	if err != nil {
   203  		return nil, err
   204  	}
   205  	defer root.DecRef(ctx)
   206  	defer fs.DecRef(ctx)
   207  	return vfs.NewDisconnectedMount(fs, root, opts)
   208  }
   209  
   210  // ConnectMountAt connects mnt at the path represented by target.
   211  //
   212  // Preconditions: mnt must be disconnected.
   213  func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
   214  	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
   215  	// lock ordering.
   216  	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
   217  	if err != nil {
   218  		return err
   219  	}
   220  	vfs.mountMu.Lock()
   221  	vdDentry := vd.dentry
   222  	vdDentry.mu.Lock()
   223  	for {
   224  		if vd.mount.umounted || vdDentry.dead {
   225  			vdDentry.mu.Unlock()
   226  			vfs.mountMu.Unlock()
   227  			vd.DecRef(ctx)
   228  			return syserror.ENOENT
   229  		}
   230  		// vd might have been mounted over between vfs.GetDentryAt() and
   231  		// vfs.mountMu.Lock().
   232  		if !vdDentry.isMounted() {
   233  			break
   234  		}
   235  		nextmnt := vfs.mounts.Lookup(vd.mount, vdDentry)
   236  		if nextmnt == nil {
   237  			break
   238  		}
   239  		// It's possible that nextmnt has been umounted but not disconnected,
   240  		// in which case vfs no longer holds a reference on it, and the last
   241  		// reference may be concurrently dropped even though we're holding
   242  		// vfs.mountMu.
   243  		if !nextmnt.tryIncMountedRef() {
   244  			break
   245  		}
   246  		// This can't fail since we're holding vfs.mountMu.
   247  		nextmnt.root.IncRef()
   248  		vdDentry.mu.Unlock()
   249  		vd.DecRef(ctx)
   250  		vd = VirtualDentry{
   251  			mount:  nextmnt,
   252  			dentry: nextmnt.root,
   253  		}
   254  		vdDentry.mu.Lock()
   255  	}
   256  	// TODO(github.com/SagerNet/issue/1035): Linux requires that either both the mount
   257  	// point and the mount root are directories, or neither are, and returns
   258  	// ENOTDIR if this is not the case.
   259  	mntns := vd.mount.ns
   260  	vfs.mounts.seq.BeginWrite()
   261  	vfs.connectLocked(mnt, vd, mntns)
   262  	vfs.mounts.seq.EndWrite()
   263  	vdDentry.mu.Unlock()
   264  	vfs.mountMu.Unlock()
   265  	return nil
   266  }
   267  
   268  // MountAt creates and mounts a Filesystem configured by the given arguments.
   269  // The VirtualFilesystem will hold a reference to the Mount until it is unmounted.
   270  //
   271  // This method returns the mounted Mount without a reference, for convenience
   272  // during VFS setup when there is no chance of racing with unmount.
   273  func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) {
   274  	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
   275  	if err != nil {
   276  		return nil, err
   277  	}
   278  	defer mnt.DecRef(ctx)
   279  	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
   280  		return nil, err
   281  	}
   282  	return mnt, nil
   283  }
   284  
   285  // UmountAt removes the Mount at the given path.
   286  func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
   287  	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
   288  		return linuxerr.EINVAL
   289  	}
   290  
   291  	// MNT_FORCE is currently unimplemented except for the permission check.
   292  	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
   293  	// namespace, and not in the owner user namespace for the target mount. See
   294  	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
   295  	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
   296  		return linuxerr.EPERM
   297  	}
   298  
   299  	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
   300  	if err != nil {
   301  		return err
   302  	}
   303  	defer vd.DecRef(ctx)
   304  	if vd.dentry != vd.mount.root {
   305  		return linuxerr.EINVAL
   306  	}
   307  	vfs.mountMu.Lock()
   308  	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
   309  		defer mntns.DecRef(ctx)
   310  		if mntns != vd.mount.ns {
   311  			vfs.mountMu.Unlock()
   312  			return linuxerr.EINVAL
   313  		}
   314  
   315  		if vd.mount == vd.mount.ns.root {
   316  			vfs.mountMu.Unlock()
   317  			return linuxerr.EINVAL
   318  		}
   319  	}
   320  
   321  	// TODO(github.com/SagerNet/issue/1035): Linux special-cases umount of the caller's
   322  	// root, which we don't implement yet (we'll just fail it since the caller
   323  	// holds a reference on it).
   324  
   325  	vfs.mounts.seq.BeginWrite()
   326  	if opts.Flags&linux.MNT_DETACH == 0 {
   327  		if len(vd.mount.children) != 0 {
   328  			vfs.mounts.seq.EndWrite()
   329  			vfs.mountMu.Unlock()
   330  			return linuxerr.EBUSY
   331  		}
   332  		// We are holding a reference on vd.mount.
   333  		expectedRefs := int64(1)
   334  		if !vd.mount.umounted {
   335  			expectedRefs = 2
   336  		}
   337  		if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB
   338  			vfs.mounts.seq.EndWrite()
   339  			vfs.mountMu.Unlock()
   340  			return linuxerr.EBUSY
   341  		}
   342  	}
   343  	vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{
   344  		eager:               opts.Flags&linux.MNT_DETACH == 0,
   345  		disconnectHierarchy: true,
   346  	}, nil, nil)
   347  	vfs.mounts.seq.EndWrite()
   348  	vfs.mountMu.Unlock()
   349  	for _, vd := range vdsToDecRef {
   350  		vd.DecRef(ctx)
   351  	}
   352  	for _, mnt := range mountsToDecRef {
   353  		mnt.DecRef(ctx)
   354  	}
   355  	return nil
   356  }
   357  
// umountRecursiveOptions configures VirtualFilesystem.umountRecursiveLocked.
//
// +stateify savable
type umountRecursiveOptions struct {
	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
	// on umounted mounts fail.
	//
	// eager is analogous to Linux's UMOUNT_SYNC.
	eager bool

	// If disconnectHierarchy is true, Mounts that are umounted hierarchically
	// should be disconnected from their parents. (Mounts whose parents are not
	// umounted, which in most cases means the Mount passed to the initial call
	// to umountRecursiveLocked, are unconditionally disconnected for
	// consistency with Linux.)
	//
	// disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
	disconnectHierarchy bool
}
   375  
   376  // umountRecursiveLocked marks mnt and its descendants as umounted. It does not
   377  // release mount or dentry references; instead, it appends VirtualDentries and
   378  // Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef
   379  // respectively, and returns updated slices. (This is necessary because
   380  // filesystem locks possibly taken by DentryImpl.DecRef() may precede
   381  // vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.)
   382  //
   383  // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
   384  //
   385  // Preconditions:
   386  // * vfs.mountMu must be locked.
   387  // * vfs.mounts.seq must be in a writer critical section.
   388  func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
   389  	if !mnt.umounted {
   390  		mnt.umounted = true
   391  		mountsToDecRef = append(mountsToDecRef, mnt)
   392  		if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) {
   393  			vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt))
   394  		}
   395  	}
   396  	if opts.eager {
   397  		for {
   398  			refs := atomic.LoadInt64(&mnt.refs)
   399  			if refs < 0 {
   400  				break
   401  			}
   402  			if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) {
   403  				break
   404  			}
   405  		}
   406  	}
   407  	for child := range mnt.children {
   408  		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef)
   409  	}
   410  	return vdsToDecRef, mountsToDecRef
   411  }
   412  
   413  // connectLocked makes vd the mount parent/point for mnt. It consumes
   414  // references held by vd.
   415  //
   416  // Preconditions:
   417  // * vfs.mountMu must be locked.
   418  // * vfs.mounts.seq must be in a writer critical section.
   419  // * d.mu must be locked.
   420  // * mnt.parent() == nil, i.e. mnt must not already be connected.
   421  func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
   422  	if checkInvariants {
   423  		if mnt.parent() != nil {
   424  			panic("VFS.connectLocked called on connected mount")
   425  		}
   426  	}
   427  	mnt.IncRef() // dropped by callers of umountRecursiveLocked
   428  	mnt.setKey(vd)
   429  	if vd.mount.children == nil {
   430  		vd.mount.children = make(map[*Mount]struct{})
   431  	}
   432  	vd.mount.children[mnt] = struct{}{}
   433  	atomic.AddUint32(&vd.dentry.mounts, 1)
   434  	mnt.ns = mntns
   435  	mntns.mountpoints[vd.dentry]++
   436  	vfs.mounts.insertSeqed(mnt)
   437  	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
   438  	if !ok {
   439  		vfsmpmounts = make(map[*Mount]struct{})
   440  		vfs.mountpoints[vd.dentry] = vfsmpmounts
   441  	}
   442  	vfsmpmounts[mnt] = struct{}{}
   443  }
   444  
   445  // disconnectLocked makes vd have no mount parent/point and returns its old
   446  // mount parent/point with a reference held.
   447  //
   448  // Preconditions:
   449  // * vfs.mountMu must be locked.
   450  // * vfs.mounts.seq must be in a writer critical section.
   451  // * mnt.parent() != nil.
   452  func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
   453  	vd := mnt.getKey()
   454  	if checkInvariants {
   455  		if vd.mount != nil {
   456  			panic("VFS.disconnectLocked called on disconnected mount")
   457  		}
   458  	}
   459  	mnt.loadKey(VirtualDentry{})
   460  	delete(vd.mount.children, mnt)
   461  	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
   462  	mnt.ns.mountpoints[vd.dentry]--
   463  	if mnt.ns.mountpoints[vd.dentry] == 0 {
   464  		delete(mnt.ns.mountpoints, vd.dentry)
   465  	}
   466  	vfs.mounts.removeSeqed(mnt)
   467  	vfsmpmounts := vfs.mountpoints[vd.dentry]
   468  	delete(vfsmpmounts, mnt)
   469  	if len(vfsmpmounts) == 0 {
   470  		delete(vfs.mountpoints, vd.dentry)
   471  	}
   472  	return vd
   473  }
   474  
   475  // tryIncMountedRef increments mnt's reference count and returns true. If mnt's
   476  // reference count is already zero, or has been eagerly umounted,
   477  // tryIncMountedRef does nothing and returns false.
   478  //
   479  // tryIncMountedRef does not require that a reference is held on mnt.
   480  func (mnt *Mount) tryIncMountedRef() bool {
   481  	for {
   482  		r := atomic.LoadInt64(&mnt.refs)
   483  		if r <= 0 { // r < 0 => MSB set => eagerly unmounted
   484  			return false
   485  		}
   486  		if atomic.CompareAndSwapInt64(&mnt.refs, r, r+1) {
   487  			if mnt.LogRefs() {
   488  				refsvfs2.LogTryIncRef(mnt, r+1)
   489  			}
   490  			return true
   491  		}
   492  	}
   493  }
   494  
   495  // IncRef increments mnt's reference count.
   496  func (mnt *Mount) IncRef() {
   497  	// In general, negative values for mnt.refs are valid because the MSB is
   498  	// the eager-unmount bit.
   499  	r := atomic.AddInt64(&mnt.refs, 1)
   500  	if mnt.LogRefs() {
   501  		refsvfs2.LogIncRef(mnt, r)
   502  	}
   503  }
   504  
   505  // DecRef decrements mnt's reference count.
   506  func (mnt *Mount) DecRef(ctx context.Context) {
   507  	r := atomic.AddInt64(&mnt.refs, -1)
   508  	if mnt.LogRefs() {
   509  		refsvfs2.LogDecRef(mnt, r)
   510  	}
   511  	if r&^math.MinInt64 == 0 { // mask out MSB
   512  		refsvfs2.Unregister(mnt)
   513  		mnt.destroy(ctx)
   514  	}
   515  }
   516  
   517  func (mnt *Mount) destroy(ctx context.Context) {
   518  	var vd VirtualDentry
   519  	if mnt.parent() != nil {
   520  		mnt.vfs.mountMu.Lock()
   521  		mnt.vfs.mounts.seq.BeginWrite()
   522  		vd = mnt.vfs.disconnectLocked(mnt)
   523  		mnt.vfs.mounts.seq.EndWrite()
   524  		mnt.vfs.mountMu.Unlock()
   525  	}
   526  	if mnt.root != nil {
   527  		mnt.root.DecRef(ctx)
   528  	}
   529  	mnt.fs.DecRef(ctx)
   530  	if vd.Ok() {
   531  		vd.DecRef(ctx)
   532  	}
   533  }
   534  
   535  // RefType implements refsvfs2.CheckedObject.Type.
   536  func (mnt *Mount) RefType() string {
   537  	return "vfs.Mount"
   538  }
   539  
   540  // LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
   541  func (mnt *Mount) LeakMessage() string {
   542  	return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, atomic.LoadInt64(&mnt.refs))
   543  }
   544  
   545  // LogRefs implements refsvfs2.CheckedObject.LogRefs.
   546  //
   547  // This should only be set to true for debugging purposes, as it can generate an
   548  // extremely large amount of output and drastically degrade performance.
   549  func (mnt *Mount) LogRefs() bool {
   550  	return false
   551  }
   552  
   553  // DecRef decrements mntns' reference count.
   554  func (mntns *MountNamespace) DecRef(ctx context.Context) {
   555  	vfs := mntns.root.fs.VirtualFilesystem()
   556  	mntns.MountNamespaceRefs.DecRef(func() {
   557  		vfs.mountMu.Lock()
   558  		vfs.mounts.seq.BeginWrite()
   559  		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
   560  			disconnectHierarchy: true,
   561  		}, nil, nil)
   562  		vfs.mounts.seq.EndWrite()
   563  		vfs.mountMu.Unlock()
   564  		for _, vd := range vdsToDecRef {
   565  			vd.DecRef(ctx)
   566  		}
   567  		for _, mnt := range mountsToDecRef {
   568  			mnt.DecRef(ctx)
   569  		}
   570  	})
   571  }
   572  
   573  // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
   574  // a reference on the returned Mount. If (mnt, d) is not a mount point,
   575  // getMountAt returns nil.
   576  //
   577  // getMountAt is analogous to Linux's fs/namei.c:follow_mount().
   578  //
   579  // Preconditions: References are held on mnt and d.
   580  func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
   581  	// The first mount is special-cased:
   582  	//
   583  	// - The caller is assumed to have checked d.isMounted() already. (This
   584  	// isn't a precondition because it doesn't matter for correctness.)
   585  	//
   586  	// - We return nil, instead of mnt, if there is no mount at (mnt, d).
   587  	//
   588  	// - We don't drop the caller's references on mnt and d.
   589  retryFirst:
   590  	next := vfs.mounts.Lookup(mnt, d)
   591  	if next == nil {
   592  		return nil
   593  	}
   594  	if !next.tryIncMountedRef() {
   595  		// Raced with umount.
   596  		goto retryFirst
   597  	}
   598  	mnt = next
   599  	d = next.root
   600  	// We don't need to take Dentry refs anywhere in this function because
   601  	// Mounts hold references on Mount.root, which is immutable.
   602  	for d.isMounted() {
   603  		next := vfs.mounts.Lookup(mnt, d)
   604  		if next == nil {
   605  			break
   606  		}
   607  		if !next.tryIncMountedRef() {
   608  			// Raced with umount.
   609  			continue
   610  		}
   611  		mnt.DecRef(ctx)
   612  		mnt = next
   613  		d = next.root
   614  	}
   615  	return mnt
   616  }
   617  
// getMountpointAt returns the mount point for the stack of Mounts including
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns a zero
// VirtualDentry.
//
// The walk toward the bottom of the stack stops when it reaches vfsroot, a
// dentry that is not the root of its mount, or a mount with no parent.
//
// Preconditions:
// * References are held on mnt and vfsroot.
// * vfsroot is not (mnt, mnt.root).
func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
	// The first mount is special-cased:
	//
	// - The caller must have already checked mnt against vfsroot.
	//
	// - We return nil, instead of mnt, if there is no mount point for mnt.
	//
	// - We don't drop the caller's reference on mnt.
retryFirst:
	// Read mnt's key inside a seqcount reader critical section, and start
	// over whenever a concurrent writer is detected.
	epoch := vfs.mounts.seq.BeginRead()
	parent, point := mnt.parent(), mnt.point()
	if !vfs.mounts.seq.ReadOk(epoch) {
		goto retryFirst
	}
	if parent == nil {
		return VirtualDentry{}
	}
	if !parent.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	if !point.TryIncRef() {
		// Since Mount holds a reference on Mount.key.point, this can only
		// happen due to a racing change to Mount.key.
		parent.DecRef(ctx)
		goto retryFirst
	}
	// Re-validate after taking references: if the key changed in the
	// meantime, the references just taken are dropped again before retrying.
	if !vfs.mounts.seq.ReadOk(epoch) {
		point.DecRef(ctx)
		parent.DecRef(ctx)
		goto retryFirst
	}
	mnt = parent
	d := point
	// From here on, (mnt, d) carries references taken by this function.
	for {
		if mnt == vfsroot.mount && d == vfsroot.dentry {
			break
		}
		if d != mnt.root {
			break
		}
	retryNotFirst:
		epoch := vfs.mounts.seq.BeginRead()
		parent, point := mnt.parent(), mnt.point()
		if !vfs.mounts.seq.ReadOk(epoch) {
			goto retryNotFirst
		}
		if parent == nil {
			break
		}
		if !parent.tryIncMountedRef() {
			// Raced with umount.
			goto retryNotFirst
		}
		if !point.TryIncRef() {
			// Since Mount holds a reference on Mount.key.point, this can
			// only happen due to a racing change to Mount.key.
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		if !vfs.mounts.seq.ReadOk(epoch) {
			point.DecRef(ctx)
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		// Drop the references on the level being left before stepping to
		// its parent.
		d.DecRef(ctx)
		mnt.DecRef(ctx)
		mnt = parent
		d = point
	}
	return VirtualDentry{mnt, d}
}
   697  
   698  // SetMountReadOnly sets the mount as ReadOnly.
   699  func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
   700  	vfs.mountMu.Lock()
   701  	defer vfs.mountMu.Unlock()
   702  	return mnt.setReadOnlyLocked(ro)
   703  }
   704  
   705  // CheckBeginWrite increments the counter of in-progress write operations on
   706  // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
   707  // EROFS.
   708  //
   709  // If CheckBeginWrite succeeds, EndWrite must be called when the write
   710  // operation is finished.
   711  func (mnt *Mount) CheckBeginWrite() error {
   712  	if atomic.AddInt64(&mnt.writers, 1) < 0 {
   713  		atomic.AddInt64(&mnt.writers, -1)
   714  		return linuxerr.EROFS
   715  	}
   716  	return nil
   717  }
   718  
// EndWrite indicates that a write operation signaled by a previous successful
// call to CheckBeginWrite has finished.
func (mnt *Mount) EndWrite() {
	// Undo the increment made by the matching CheckBeginWrite.
	atomic.AddInt64(&mnt.writers, -1)
}
   724  
   725  // Preconditions: VirtualFilesystem.mountMu must be locked.
   726  func (mnt *Mount) setReadOnlyLocked(ro bool) error {
   727  	if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro {
   728  		return nil
   729  	}
   730  	if ro {
   731  		if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) {
   732  			return linuxerr.EBUSY
   733  		}
   734  		return nil
   735  	}
   736  	// Unset MSB without dropping any temporary increments from failed calls to
   737  	// mnt.CheckBeginWrite().
   738  	atomic.AddInt64(&mnt.writers, math.MinInt64)
   739  	return nil
   740  }
   741  
// ReadOnly reports whether mnt is currently read-only, i.e. whether the MSB
// of mnt.writers is set.
func (mnt *Mount) ReadOnly() bool {
	return atomic.LoadInt64(&mnt.writers) < 0
}
   746  
// Filesystem returns the mounted Filesystem. It does not take a reference on
// the returned Filesystem.
func (mnt *Mount) Filesystem() *Filesystem {
	return mnt.fs
}
   752  
   753  // submountsLocked returns this Mount and all Mounts that are descendents of
   754  // it.
   755  //
   756  // Precondition: mnt.vfs.mountMu must be held.
   757  func (mnt *Mount) submountsLocked() []*Mount {
   758  	mounts := []*Mount{mnt}
   759  	for m := range mnt.children {
   760  		mounts = append(mounts, m.submountsLocked()...)
   761  	}
   762  	return mounts
   763  }
   764  
// Root returns the mount's root. It does not take a reference on the returned
// Dentry. Per the Mount field documentation, the root may be nil for a
// disconnected mount.
func (mnt *Mount) Root() *Dentry {
	return mnt.root
}
   770  
   771  // Root returns mntns' root. It does not take a reference on the returned Dentry.
   772  func (mntns *MountNamespace) Root() VirtualDentry {
   773  	vd := VirtualDentry{
   774  		mount:  mntns.root,
   775  		dentry: mntns.root.root,
   776  	}
   777  	return vd
   778  }
   779  
   780  // GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
   781  //
   782  // Preconditions: taskRootDir.Ok().
   783  func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
   784  	rootMnt := taskRootDir.mount
   785  
   786  	vfs.mountMu.Lock()
   787  	mounts := rootMnt.submountsLocked()
   788  	// Take a reference on mounts since we need to drop vfs.mountMu before
   789  	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()).
   790  	for _, mnt := range mounts {
   791  		mnt.IncRef()
   792  	}
   793  	vfs.mountMu.Unlock()
   794  	defer func() {
   795  		for _, mnt := range mounts {
   796  			mnt.DecRef(ctx)
   797  		}
   798  	}()
   799  	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
   800  
   801  	for _, mnt := range mounts {
   802  		// Get the path to this mount relative to task root.
   803  		mntRootVD := VirtualDentry{
   804  			mount:  mnt,
   805  			dentry: mnt.root,
   806  		}
   807  		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
   808  		if err != nil {
   809  			// For some reason we didn't get a path. Log a warning
   810  			// and run with empty path.
   811  			ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err)
   812  			path = ""
   813  		}
   814  		if path == "" {
   815  			// Either an error occurred, or path is not reachable
   816  			// from root.
   817  			break
   818  		}
   819  
   820  		opts := "rw"
   821  		if mnt.ReadOnly() {
   822  			opts = "ro"
   823  		}
   824  		if mnt.Flags.NoATime {
   825  			opts = ",noatime"
   826  		}
   827  		if mnt.Flags.NoExec {
   828  			opts += ",noexec"
   829  		}
   830  		if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
   831  			opts += "," + mopts
   832  		}
   833  
   834  		// Format:
   835  		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
   836  		//
   837  		// The "needs dump" and "fsck order" flags are always 0, which
   838  		// is allowed.
   839  		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
   840  	}
   841  }
   842  
   843  // GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
   844  // buf.
   845  //
   846  // Preconditions: taskRootDir.Ok().
   847  func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
   848  	rootMnt := taskRootDir.mount
   849  
   850  	vfs.mountMu.Lock()
   851  	mounts := rootMnt.submountsLocked()
   852  	// Take a reference on mounts since we need to drop vfs.mountMu before
   853  	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or
   854  	// vfs.StatAt() (=> FilesystemImpl.StatAt()).
   855  	for _, mnt := range mounts {
   856  		mnt.IncRef()
   857  	}
   858  	vfs.mountMu.Unlock()
   859  	defer func() {
   860  		for _, mnt := range mounts {
   861  			mnt.DecRef(ctx)
   862  		}
   863  	}()
   864  	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
   865  
   866  	creds := auth.CredentialsFromContext(ctx)
   867  	for _, mnt := range mounts {
   868  		// Get the path to this mount relative to task root.
   869  		mntRootVD := VirtualDentry{
   870  			mount:  mnt,
   871  			dentry: mnt.root,
   872  		}
   873  		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
   874  		if err != nil {
   875  			// For some reason we didn't get a path. Log a warning
   876  			// and run with empty path.
   877  			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
   878  			path = ""
   879  		}
   880  		if path == "" {
   881  			// Either an error occurred, or path is not reachable
   882  			// from root.
   883  			break
   884  		}
   885  		// Stat the mount root to get the major/minor device numbers.
   886  		pop := &PathOperation{
   887  			Root:  mntRootVD,
   888  			Start: mntRootVD,
   889  		}
   890  		statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{})
   891  		if err != nil {
   892  			// Well that's not good. Ignore this mount.
   893  			ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err)
   894  			break
   895  		}
   896  
   897  		// Format:
   898  		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
   899  		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
   900  
   901  		// (1) Mount ID.
   902  		fmt.Fprintf(buf, "%d ", mnt.ID)
   903  
   904  		// (2)  Parent ID (or this ID if there is no parent).
   905  		// Note that even if the call to mnt.parent() races with Mount
   906  		// destruction (which is possible since we're not holding vfs.mountMu),
   907  		// its Mount.ID will still be valid.
   908  		pID := mnt.ID
   909  		if p := mnt.parent(); p != nil {
   910  			pID = p.ID
   911  		}
   912  		fmt.Fprintf(buf, "%d ", pID)
   913  
   914  		// (3) Major:Minor device ID. We don't have a superblock, so we
   915  		// just use the root inode device number.
   916  		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
   917  
   918  		// (4) Root: the pathname of the directory in the filesystem
   919  		// which forms the root of this mount.
   920  		//
   921  		// NOTE(b/78135857): This will always be "/" until we implement
   922  		// bind mounts.
   923  		fmt.Fprintf(buf, "/ ")
   924  
   925  		// (5) Mount point (relative to process root).
   926  		fmt.Fprintf(buf, "%s ", manglePath(path))
   927  
   928  		// (6) Mount options.
   929  		opts := "rw"
   930  		if mnt.ReadOnly() {
   931  			opts = "ro"
   932  		}
   933  		if mnt.Flags.NoATime {
   934  			opts = ",noatime"
   935  		}
   936  		if mnt.Flags.NoExec {
   937  			opts += ",noexec"
   938  		}
   939  		fmt.Fprintf(buf, "%s ", opts)
   940  
   941  		// (7) Optional fields: zero or more fields of the form "tag[:value]".
   942  		// (8) Separator: the end of the optional fields is marked by a single hyphen.
   943  		fmt.Fprintf(buf, "- ")
   944  
   945  		// (9) Filesystem type.
   946  		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
   947  
   948  		// (10) Mount source: filesystem-specific information or "none".
   949  		fmt.Fprintf(buf, "none ")
   950  
   951  		// (11) Superblock options, and final newline.
   952  		fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
   953  	}
   954  }
   955  
   956  // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
   957  // See Linux fs/seq_file.c:mangle_path.
   958  func manglePath(p string) string {
   959  	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
   960  	return r.Replace(p)
   961  }
   962  
   963  // superBlockOpts returns the super block options string for the the mount at
   964  // the given path.
   965  func superBlockOpts(mountPath string, mnt *Mount) string {
   966  	// Compose super block options by combining global mount flags with
   967  	// FS-specific mount options.
   968  	opts := "rw"
   969  	if mnt.ReadOnly() {
   970  		opts = "ro"
   971  	}
   972  
   973  	if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
   974  		opts += "," + mopts
   975  	}
   976  
   977  	// NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we also
   978  	// need to include the cgroup name in the options. For now we just read that
   979  	// from the path. Note that this is only possible when "cgroup" isn't
   980  	// registered as a valid filesystem type.
   981  	//
   982  	// TODO(github.com/SagerNet/issue/190): Once we removed fake cgroupfs support, we
   983  	// should remove this.
   984  	if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount {
   985  		// Real cgroupfs available.
   986  		return opts
   987  	}
   988  	if mnt.fs.FilesystemType().Name() == "cgroup" {
   989  		splitPath := strings.Split(mountPath, "/")
   990  		cgroupType := splitPath[len(splitPath)-1]
   991  		opts += "," + cgroupType
   992  	}
   993  
   994  	return opts
   995  }