github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/mounts.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package fs
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  
    21  	"golang.org/x/sys/unix"
    22  	"github.com/SagerNet/gvisor/pkg/context"
    23  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    24  	"github.com/SagerNet/gvisor/pkg/refs"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    26  	"github.com/SagerNet/gvisor/pkg/sync"
    27  	"github.com/SagerNet/gvisor/pkg/syserror"
    28  )
    29  
    30  // DefaultTraversalLimit provides a sensible default traversal limit that may
    31  // be passed to FindInode and FindLink. You may want to provide other options in
    32  // individual syscall implementations, but for internal functions this will be
    33  // sane.
    34  const DefaultTraversalLimit = 10
    35  
    36  const invalidMountID = math.MaxUint64
    37  
// Mount represents a mount in the file system. It holds the root dirent for the
// mount. It also points back to the dirent or mount that it was mounted over,
// so that the latter can be restored on unmount. The chained entry can be
// either:
//   - Mount: when it's mounted on top of another mount point.
//   - Dirent: when it's mounted on top of a dirent. In this case the chained
//     entry is called an "undo" mount and only 'root' is set. All other fields
//     are either invalid or nil.
//
// +stateify savable
type Mount struct {
	// ID is a unique id for this mount. It is invalidMountID if this is an
	// "undo" mount, i.e. one used only to cache a dirent that was mounted over.
	ID uint64

	// ParentID is the parent's mount unique id. It is invalidMountID if this
	// is the root mount or if this is an "undo" mount used to cache a dirent
	// that was mounted over.
	ParentID uint64

	// root is the root Dirent of this mount. A reference on this Dirent must be
	// held through the lifetime of the Mount which contains it.
	root *Dirent

	// previous is the existing dirent or mount that this object was mounted over.
	// It's nil for the root mount and for the last entry in the chain (always an
	// "undo" mount).
	previous *Mount
}
    66  
    67  // newMount creates a new mount, taking a reference on 'root'. Caller must
    68  // release the reference when it's done with the mount.
    69  func newMount(id, pid uint64, root *Dirent) *Mount {
    70  	root.IncRef()
    71  	return &Mount{
    72  		ID:       id,
    73  		ParentID: pid,
    74  		root:     root,
    75  	}
    76  }
    77  
    78  // newRootMount creates a new root mount (no parent), taking a reference on
    79  // 'root'. Caller must release the reference when it's done with the mount.
    80  func newRootMount(id uint64, root *Dirent) *Mount {
    81  	root.IncRef()
    82  	return &Mount{
    83  		ID:       id,
    84  		ParentID: invalidMountID,
    85  		root:     root,
    86  	}
    87  }
    88  
    89  // newUndoMount creates a new undo mount, taking a reference on 'd'. Caller must
    90  // release the reference when it's done with the mount.
    91  func newUndoMount(d *Dirent) *Mount {
    92  	d.IncRef()
    93  	return &Mount{
    94  		ID:       invalidMountID,
    95  		ParentID: invalidMountID,
    96  		root:     d,
    97  	}
    98  }
    99  
   100  // Root returns the root dirent of this mount.
   101  //
   102  // This may return nil if the mount has already been free. Callers must handle this
   103  // case appropriately. If non-nil, callers must call DecRef on the returned *Dirent.
   104  func (m *Mount) Root() *Dirent {
   105  	if !m.root.TryIncRef() {
   106  		return nil
   107  	}
   108  	return m.root
   109  }
   110  
   111  // IsRoot returns true if the mount has no parent.
   112  func (m *Mount) IsRoot() bool {
   113  	return !m.IsUndo() && m.ParentID == invalidMountID
   114  }
   115  
   116  // IsUndo returns true if 'm' is an undo mount that should be used to restore
   117  // the original dirent during unmount only and it's not a valid mount.
   118  func (m *Mount) IsUndo() bool {
   119  	if m.ID == invalidMountID {
   120  		if m.ParentID != invalidMountID {
   121  			panic(fmt.Sprintf("Undo mount with valid parentID: %+v", m))
   122  		}
   123  		return true
   124  	}
   125  	return false
   126  }
   127  
// MountNamespace defines a VFS root. It contains a collection of Mounts that
// are mounted inside the Dirent tree rooted at the Root Dirent. It provides
// methods for traversing the Dirent tree, and for mounting/unmounting in the
// tree.
//
// Note that this does not correspond to a "mount namespace" in Linux. It is
// more like a unique VFS instance.
//
// It's possible for different processes to have different MountNamespaces. In
// this case, the file systems exposed to the processes are completely
// distinct.
//
// +stateify savable
type MountNamespace struct {
	refs.AtomicRefCount

	// userns is the user namespace associated with this mount namespace.
	//
	// All privileged operations on this mount namespace must have
	// appropriate capabilities in this userns.
	//
	// userns is immutable.
	userns *auth.UserNamespace

	// root is the root directory. It is set to nil once the MountNamespace
	// has been destroyed.
	root *Dirent

	// mu protects mounts and mountID counter.
	mu sync.Mutex `state:"nosave"`

	// mounts is a map of mounted Dirent -> Mount object. There are three
	// possible cases:
	//   - Dirent is mounted over a mount point: the stored Mount object will be
	//     the Mount for that mount point.
	//   - Dirent is mounted over a regular (non-mount point) Dirent: the stored
	//     Mount object will be an "undo" mount containing the mounted-over
	//     Dirent.
	//   - Dirent is the root mount: the stored Mount object will be a root mount
	//     containing the Dirent itself.
	mounts map[*Dirent]*Mount

	// mountID is the next mount id to assign.
	mountID uint64
}
   171  
   172  // NewMountNamespace returns a new MountNamespace, with the provided node at the
   173  // root, and the given cache size. A root must always be provided.
   174  func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) {
   175  	// Set the root dirent and id on the root mount. The reference returned from
   176  	// NewDirent will be donated to the MountNamespace constructed below.
   177  	d := NewDirent(ctx, root, "/")
   178  
   179  	mnts := map[*Dirent]*Mount{
   180  		d: newRootMount(1, d),
   181  	}
   182  
   183  	creds := auth.CredentialsFromContext(ctx)
   184  	mns := MountNamespace{
   185  		userns:  creds.UserNamespace,
   186  		root:    d,
   187  		mounts:  mnts,
   188  		mountID: 2,
   189  	}
   190  	mns.EnableLeakCheck("fs.MountNamespace")
   191  	return &mns, nil
   192  }
   193  
// UserNamespace returns the user namespace associated with this mount
// namespace.
func (mns *MountNamespace) UserNamespace() *auth.UserNamespace {
	return mns.userns
}
   198  
   199  // Root returns the MountNamespace's root Dirent and increments its reference
   200  // count.  The caller must call DecRef when finished.
   201  func (mns *MountNamespace) Root() *Dirent {
   202  	mns.root.IncRef()
   203  	return mns.root
   204  }
   205  
// FlushMountSourceRefs flushes extra references held by MountSources for all
// active mount points; see fs/mount.go:MountSource.FlushDirentRefs.
//
// It acquires mns.mu for the duration of the flush.
func (mns *MountNamespace) FlushMountSourceRefs() {
	mns.mu.Lock()
	defer mns.mu.Unlock()
	mns.flushMountSourceRefsLocked()
}
   213  
   214  func (mns *MountNamespace) flushMountSourceRefsLocked() {
   215  	// Flush mounts' MountSource references.
   216  	for _, mp := range mns.mounts {
   217  		for ; mp != nil; mp = mp.previous {
   218  			mp.root.Inode.MountSource.FlushDirentRefs()
   219  		}
   220  	}
   221  
   222  	if mns.root == nil {
   223  		// No root? This MountSource must have already been destroyed.
   224  		// This can happen when a Save is triggered while a process is
   225  		// exiting. There is nothing to flush.
   226  		return
   227  	}
   228  
   229  	// Flush root's MountSource references.
   230  	mns.root.Inode.MountSource.FlushDirentRefs()
   231  }
   232  
   233  // destroy drops root and mounts dirent references and closes any original nodes.
   234  //
   235  // After destroy is called, the MountNamespace may continue to be referenced (for
   236  // example via /proc/mounts), but should free all resources and shouldn't have
   237  // Find* methods called.
   238  func (mns *MountNamespace) destroy(ctx context.Context) {
   239  	mns.mu.Lock()
   240  	defer mns.mu.Unlock()
   241  
   242  	// Flush all mounts' MountSource references to Dirents. This allows for mount
   243  	// points to be torn down since there should be no remaining references after
   244  	// this and DecRef below.
   245  	mns.flushMountSourceRefsLocked()
   246  
   247  	// Teardown mounts.
   248  	for _, mp := range mns.mounts {
   249  		// Drop the mount reference on all mounted dirents.
   250  		for ; mp != nil; mp = mp.previous {
   251  			mp.root.DecRef(ctx)
   252  		}
   253  	}
   254  	mns.mounts = nil
   255  
   256  	// Drop reference on the root.
   257  	mns.root.DecRef(ctx)
   258  
   259  	// Ensure that root cannot be accessed via this MountNamespace any
   260  	// more.
   261  	mns.root = nil
   262  
   263  	// Wait for asynchronous work (queued by dropping Dirent references
   264  	// above) to complete before destroying this MountNamespace.
   265  	AsyncBarrier()
   266  }
   267  
// DecRef implements RefCounter.DecRef with destructor mns.destroy.
//
// It delegates to DecRefWithDestructor, passing mns.destroy as the destructor.
func (mns *MountNamespace) DecRef(ctx context.Context) {
	mns.DecRefWithDestructor(ctx, mns.destroy)
}
   272  
   273  // withMountLocked prevents further walks to `node`, because `node` is about to
   274  // be a mount point.
   275  func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error {
   276  	mns.mu.Lock()
   277  	defer mns.mu.Unlock()
   278  
   279  	renameMu.Lock()
   280  	defer renameMu.Unlock()
   281  
   282  	// Linux allows mounting over the root (?). It comes with a strange set
   283  	// of semantics. We'll just not do this for now.
   284  	if node.parent == nil {
   285  		return linuxerr.EBUSY
   286  	}
   287  
   288  	// For both mount and unmount, we take this lock so we can swap out the
   289  	// appropriate child in parent.children.
   290  	//
   291  	// For unmount, this also ensures that if `node` is a mount point, the
   292  	// underlying mount's MountSource.direntRefs cannot increase by preventing
   293  	// walks to node.
   294  	node.parent.dirMu.Lock()
   295  	defer node.parent.dirMu.Unlock()
   296  
   297  	node.parent.mu.Lock()
   298  	defer node.parent.mu.Unlock()
   299  
   300  	// We need not take node.dirMu since we have parent.dirMu.
   301  
   302  	// We need to take node.mu, so that we can check for deletion.
   303  	node.mu.Lock()
   304  	defer node.mu.Unlock()
   305  
   306  	return fn()
   307  }
   308  
   309  // Mount mounts a `inode` over the subtree at `node`.
   310  func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode *Inode) error {
   311  	return mns.withMountLocked(mountPoint, func() error {
   312  		replacement, err := mountPoint.mount(ctx, inode)
   313  		if err != nil {
   314  			return err
   315  		}
   316  		defer replacement.DecRef(ctx)
   317  
   318  		// Set the mount's root dirent and id.
   319  		parentMnt := mns.findMountLocked(mountPoint)
   320  		childMnt := newMount(mns.mountID, parentMnt.ID, replacement)
   321  		mns.mountID++
   322  
   323  		// Drop mountPoint from its dirent cache.
   324  		mountPoint.dropExtendedReference()
   325  
   326  		// If mountPoint is already a mount, push mountPoint on the stack so it can
   327  		// be recovered on unmount.
   328  		if prev := mns.mounts[mountPoint]; prev != nil {
   329  			childMnt.previous = prev
   330  			mns.mounts[replacement] = childMnt
   331  			delete(mns.mounts, mountPoint)
   332  			return nil
   333  		}
   334  
   335  		// Was not already mounted, just add another mount point.
   336  		childMnt.previous = newUndoMount(mountPoint)
   337  		mns.mounts[replacement] = childMnt
   338  		return nil
   339  	})
   340  }
   341  
   342  // Unmount ensures no references to the MountSource remain and removes `node` from
   343  // this subtree. The subtree formerly mounted in `node`'s place will be
   344  // restored. node's MountSource will be destroyed as soon as the last reference to
   345  // `node` is dropped, as no references to Dirents within will remain.
   346  //
   347  // If detachOnly is set, Unmount merely removes `node` from the subtree, but
   348  // allows existing references to the MountSource remain. E.g. if an open file still
   349  // refers to Dirents in MountSource, the Unmount will succeed anyway and MountSource will
   350  // be destroyed at a later time when all references to Dirents within are
   351  // dropped.
   352  //
   353  // The caller must hold a reference to node from walking to it.
   354  func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly bool) error {
   355  	// This takes locks to prevent further walks to Dirents in this mount
   356  	// under the assumption that `node` is the root of the mount.
   357  	return mns.withMountLocked(node, func() error {
   358  		orig, ok := mns.mounts[node]
   359  		if !ok {
   360  			// node is not a mount point.
   361  			return linuxerr.EINVAL
   362  		}
   363  
   364  		if orig.previous == nil {
   365  			panic("cannot unmount initial dirent")
   366  		}
   367  
   368  		m := node.Inode.MountSource
   369  		if !detachOnly {
   370  			// Flush all references on the mounted node.
   371  			m.FlushDirentRefs()
   372  
   373  			// At this point, exactly two references must be held
   374  			// to mount: one mount reference on node, and one due
   375  			// to walking to node.
   376  			//
   377  			// We must also be guaranteed that no more references
   378  			// can be taken on mount. This is why withMountLocked
   379  			// must be held at this point to prevent any walks to
   380  			// and from node.
   381  			if refs := m.DirentRefs(); refs < 2 {
   382  				panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs))
   383  			} else if refs != 2 {
   384  				return linuxerr.EBUSY
   385  			}
   386  		}
   387  
   388  		prev := orig.previous
   389  		if err := node.unmount(ctx, prev.root); err != nil {
   390  			return err
   391  		}
   392  
   393  		if prev.previous == nil {
   394  			if !prev.IsUndo() {
   395  				panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev))
   396  			}
   397  			// Drop mount reference taken at the end of MountNamespace.Mount.
   398  			prev.root.DecRef(ctx)
   399  		} else {
   400  			mns.mounts[prev.root] = prev
   401  		}
   402  		delete(mns.mounts, node)
   403  
   404  		return nil
   405  	})
   406  }
   407  
// FindMount returns the mount that 'd' belongs to. It walks the dirent back
// until a mount is found. It may return nil if no mount was found.
//
// It holds mns.mu and renameMu for the duration of the lookup.
func (mns *MountNamespace) FindMount(d *Dirent) *Mount {
	mns.mu.Lock()
	defer mns.mu.Unlock()
	renameMu.Lock()
	defer renameMu.Unlock()

	return mns.findMountLocked(d)
}
   418  
   419  func (mns *MountNamespace) findMountLocked(d *Dirent) *Mount {
   420  	for {
   421  		if mnt := mns.mounts[d]; mnt != nil {
   422  			return mnt
   423  		}
   424  		if d.parent == nil {
   425  			return nil
   426  		}
   427  		d = d.parent
   428  	}
   429  }
   430  
   431  // AllMountsUnder returns a slice of all mounts under the parent, including
   432  // itself.
   433  func (mns *MountNamespace) AllMountsUnder(parent *Mount) []*Mount {
   434  	mns.mu.Lock()
   435  	defer mns.mu.Unlock()
   436  
   437  	var rv []*Mount
   438  	for _, mp := range mns.mounts {
   439  		if !mp.IsUndo() && mp.root.descendantOf(parent.root) {
   440  			rv = append(rv, mp)
   441  		}
   442  	}
   443  	return rv
   444  }
   445  
   446  // FindLink returns an Dirent from a given node, which may be a symlink.
   447  //
   448  // The root argument is treated as the root directory, and FindLink will not
   449  // return anything above that. The wd dirent provides the starting directory,
   450  // and may be nil which indicates the root should be used. You must call DecRef
   451  // on the resulting Dirent when you are no longer using the object.
   452  //
   453  // If wd is nil, then the root will be used as the working directory. If the
   454  // path is absolute, this has no functional impact.
   455  //
   456  // Precondition: root must be non-nil.
   457  // Precondition: the path must be non-empty.
   458  func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, remainingTraversals *uint) (*Dirent, error) {
   459  	if root == nil {
   460  		panic("MountNamespace.FindLink: root must not be nil")
   461  	}
   462  	if len(path) == 0 {
   463  		panic("MountNamespace.FindLink: path is empty")
   464  	}
   465  
   466  	// Split the path.
   467  	first, remainder := SplitFirst(path)
   468  
   469  	// Where does this walk originate?
   470  	current := wd
   471  	if current == nil {
   472  		current = root
   473  	}
   474  	for first == "/" {
   475  		// Special case: it's possible that we have nothing to walk at
   476  		// all. This is necessary since we're resplitting the path.
   477  		if remainder == "" {
   478  			root.IncRef()
   479  			return root, nil
   480  		}
   481  
   482  		// Start at the root and advance the path component so that the
   483  		// walk below can proceed. Note at this point, it handles the
   484  		// no-op walk case perfectly fine.
   485  		current = root
   486  		first, remainder = SplitFirst(remainder)
   487  	}
   488  
   489  	current.IncRef() // Transferred during walk.
   490  
   491  	for {
   492  		// Check that the file is a directory and that we have
   493  		// permissions to walk.
   494  		//
   495  		// Note that we elide this check for the root directory as an
   496  		// optimization; a non-executable root may still be walked.  A
   497  		// non-directory root is hopeless.
   498  		if current != root {
   499  			if !IsDir(current.Inode.StableAttr) {
   500  				current.DecRef(ctx) // Drop reference from above.
   501  				return nil, syserror.ENOTDIR
   502  			}
   503  			if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil {
   504  				current.DecRef(ctx) // Drop reference from above.
   505  				return nil, err
   506  			}
   507  		}
   508  
   509  		// Move to the next level.
   510  		next, err := current.Walk(ctx, root, first)
   511  		if err != nil {
   512  			// Allow failed walks to cache the dirent, because no
   513  			// children will acquire a reference at the end.
   514  			current.maybeExtendReference()
   515  			current.DecRef(ctx)
   516  			return nil, err
   517  		}
   518  
   519  		// Drop old reference.
   520  		current.DecRef(ctx)
   521  
   522  		if remainder != "" {
   523  			// Ensure it's resolved, unless it's the last level.
   524  			//
   525  			// See resolve for reference semantics; on err next
   526  			// will have one dropped.
   527  			current, err = mns.resolve(ctx, root, next, remainingTraversals)
   528  			if err != nil {
   529  				return nil, err
   530  			}
   531  		} else {
   532  			// Allow the file system to take an extra reference on the
   533  			// found child. This will hold a reference on the containing
   534  			// directory, so the whole tree will be implicitly cached.
   535  			next.maybeExtendReference()
   536  			return next, nil
   537  		}
   538  
   539  		// Move to the next element.
   540  		first, remainder = SplitFirst(remainder)
   541  	}
   542  }
   543  
// FindInode is identical to FindLink except the return value is resolved:
// the Dirent found by FindLink is passed through mns.resolve before being
// returned.
//
// The caller must call DecRef on the resulting Dirent when done with it.
//
//go:nosplit
func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, remainingTraversals *uint) (*Dirent, error) {
	d, err := mns.FindLink(ctx, root, wd, path, remainingTraversals)
	if err != nil {
		return nil, err
	}

	// See resolve for reference semantics; on err d will have the
	// reference dropped.
	return mns.resolve(ctx, root, d, remainingTraversals)
}
   557  
   558  // resolve resolves the given link.
   559  //
   560  // If successful, a reference is dropped on node and one is acquired on the
   561  // caller's behalf for the returned dirent.
   562  //
   563  // If not successful, a reference is _also_ dropped on the node and an error
   564  // returned. This is for convenience in using resolve directly as a return
   565  // value.
   566  func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, remainingTraversals *uint) (*Dirent, error) {
   567  	// Resolve the path.
   568  	target, err := node.Inode.Getlink(ctx)
   569  
   570  	switch {
   571  	case err == nil:
   572  		// Make sure we didn't exhaust the traversal budget.
   573  		if *remainingTraversals == 0 {
   574  			target.DecRef(ctx)
   575  			return nil, unix.ELOOP
   576  		}
   577  
   578  		node.DecRef(ctx) // Drop the original reference.
   579  		return target, nil
   580  
   581  	case linuxerr.Equals(linuxerr.ENOLINK, err):
   582  		// Not a symlink.
   583  		return node, nil
   584  
   585  	case err == ErrResolveViaReadlink:
   586  		defer node.DecRef(ctx) // See above.
   587  
   588  		// First, check if we should traverse.
   589  		if *remainingTraversals == 0 {
   590  			return nil, unix.ELOOP
   591  		}
   592  
   593  		// Read the target path.
   594  		targetPath, err := node.Inode.Readlink(ctx)
   595  		if err != nil {
   596  			return nil, err
   597  		}
   598  
   599  		// Find the node; we resolve relative to the current symlink's parent.
   600  		renameMu.RLock()
   601  		parent := node.parent
   602  		renameMu.RUnlock()
   603  		*remainingTraversals--
   604  		d, err := mns.FindInode(ctx, root, parent, targetPath, remainingTraversals)
   605  		if err != nil {
   606  			return nil, err
   607  		}
   608  
   609  		return d, err
   610  
   611  	default:
   612  		node.DecRef(ctx) // Drop for err; see above.
   613  
   614  		// Propagate the error.
   615  		return nil, err
   616  	}
   617  }
   618  
   619  // SyncAll calls Dirent.SyncAll on the root.
   620  func (mns *MountNamespace) SyncAll(ctx context.Context) {
   621  	mns.mu.Lock()
   622  	defer mns.mu.Unlock()
   623  	mns.root.SyncAll(ctx)
   624  }