gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/vfs/propagation.go

// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bits"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

const (
	// The following constants are possible bits for the cloneType argument to
	// VirtualFilesystem.cloneMount() and related functions.

	// Analogous to CL_MAKE_SHARED in Linux.
	makeSharedClone = 1 << iota
	// Analogous to CL_SLAVE in Linux.
	makeFollowerClone
	// Analogous to CL_PRIVATE in Linux.
	makePrivateClone
	// Analogous to CL_SHARED_TO_SLAVE in Linux.
	sharedToFollowerClone

	propagationFlags = linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | linux.MS_UNBINDABLE
)

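// Illustrative sketch (not part of the original source): the cloneType bits
// compose. propagateMount below, for instance, builds its clone type roughly
// as:
//
//	cloneType := makeFollowerClone
//	if dstMnt.isShared {
//		cloneType |= makeSharedClone
//	}
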
// commitChildren commits every not-yet-connected child of mnt. See
// commitMount below.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) commitChildren(ctx context.Context, mnt *Mount) {
	for c := range mnt.children {
		if c.neverConnected() {
			vfs.commitMount(ctx, c)
		}
	}
}

// commitMount attaches mnt to the parent and mountpoint specified by its
// mountKey and recursively does the same for all of mnt's descendants.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) commitMount(ctx context.Context, mnt *Mount) {
	mp := mnt.getKey()

	// If there is already a mount at this (parent, mountpoint), disconnect it
	// from its parent and reconnect it to mnt once mnt has been connected.
	child := vfs.mounts.Lookup(mp.mount, mp.dentry)
	vfs.mounts.seq.BeginWrite()
	if child != nil {
		vfs.delayDecRef(vfs.disconnectLocked(child))
	}
	mp.dentry.mu.Lock()
	vfs.connectLocked(mnt, mp, mp.mount.ns)
	mp.dentry.mu.Unlock()
	vfs.delayDecRef(mnt)

	if child != nil {
		newmp := VirtualDentry{mnt, mnt.root}
		newmp.IncRef()
		newmp.dentry.mu.Lock()
		vfs.connectLocked(child, newmp, newmp.mount.ns)
		newmp.dentry.mu.Unlock()
		vfs.delayDecRef(child)
	}
	vfs.mounts.seq.EndWrite()
	vfs.commitChildren(ctx, mnt)
}

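// Worked example (hypothetical mounts, for illustration only): suppose a
// propagated mount M is being committed at (P, d) and a mount C already sits
// at (P, d). commitMount disconnects C, connects M at (P, d), and then
// reconnects C at M's root, so C ends up covering M, roughly as if C had
// been mounted on top of M in the first place.
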
// abortUncomittedChildren aborts every not-yet-committed child of mnt. See
// abortUncommitedMount below.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) abortUncomittedChildren(ctx context.Context, mnt *Mount) {
	for c := range mnt.children {
		if c.neverConnected() {
			vfs.abortUncommitedMount(ctx, c)
			delete(mnt.children, c)
		}
	}
}

// abortUncommitedMount releases references on mnt and all its descendants.
//
// Prerequisite: mnt is not connected, i.e. mnt.ns == nil.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) abortUncommitedMount(ctx context.Context, mnt *Mount) {
	vfs.delayDecRef(mnt)
	vfs.delayDecRef(mnt.getKey())
	mnt.setKey(VirtualDentry{})
	vfs.setPropagation(mnt, linux.MS_PRIVATE)
	vfs.abortUncomittedChildren(ctx, mnt)
}

// SetMountPropagationAt changes the propagation type of the mount pointed to
// by pop.
func (vfs *VirtualFilesystem) SetMountPropagationAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, propFlag uint32) error {
	recursive := propFlag&linux.MS_REC != 0
	propFlag &= propagationFlags
	// Check that propFlag is a power of 2. If it is not, then either zero or
	// more than one propagation flag was specified.
	if !bits.IsPowerOfTwo32(propFlag) {
		return linuxerr.EINVAL
	}
	vd, err := vfs.getMountpoint(ctx, creds, pop)
	if err != nil {
		return err
	}
	defer vd.DecRef(ctx)
	vfs.SetMountPropagation(vd.mount, propFlag, recursive)
	return nil
}

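// Illustrative usage sketch (hypothetical caller, not part of this file): a
// mount(2) call such as mount(NULL, "/mnt", NULL, MS_SHARED|MS_REC, NULL)
// from the syscall layer would land here roughly as:
//
//	err := vfs.SetMountPropagationAt(ctx, creds, &PathOperation{
//		Root:  root,
//		Start: root,
//		Path:  fspath.Parse("/mnt"),
//	}, linux.MS_SHARED|linux.MS_REC)
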
// SetMountPropagation changes the propagation type of the mount.
func (vfs *VirtualFilesystem) SetMountPropagation(mnt *Mount, propFlag uint32, recursive bool) error {
	vfs.lockMounts()
	defer vfs.unlockMounts(context.Background())
	if propFlag == linux.MS_SHARED {
		if err := vfs.allocMountGroupIDs(mnt, recursive); err != nil {
			return err
		}
	}

	if !recursive {
		vfs.setPropagation(mnt, propFlag)
		return nil
	}
	for _, m := range mnt.submountsLocked() {
		vfs.setPropagation(m, propFlag)
	}
	return nil
}

// setPropagation sets the propagation on mnt for a propagation type. This
// method is analogous to fs/pnode.c:change_mnt_propagation() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) setPropagation(mnt *Mount, propFlags uint32) {
	if propFlags == linux.MS_SHARED {
		mnt.isShared = true
		return
	}
	// propFlags is MS_PRIVATE, MS_SLAVE, or MS_UNBINDABLE. The algorithm is
	// the same for all three, except that in the private/unbindable case the
	// leader and followerEntry are cleared after the procedure is finished.
	var leader *Mount
	if mnt.sharedEntry.Empty() {
		// If mnt is shared and in a peer group with only itself, just make it
		// private.
		if mnt.isShared {
			vfs.freeGroupID(mnt)
			mnt.isShared = false
		}
		// If mnt is not a follower of any other mount, make all of its
		// followers private as well.
		leader = mnt.leader
		if leader == nil {
			for !mnt.followerList.Empty() {
				f := mnt.followerList.Front()
				mnt.followerList.Remove(f)
				f.leader = nil
			}
		}
	} else {
		// Pick a suitable new leader. Linux chooses the first peer that shares a
		// root dentry, or any peer if none matches that criterion.
		leader = mnt.sharedEntry.Next()
		for m := mnt.sharedEntry.Next(); m != mnt; m = m.sharedEntry.Next() {
			if m.root == mnt.root {
				leader = m
				break
			}
		}
		// Clear out mnt's shared attributes.
		mnt.sharedEntry.Remove()
		mnt.groupID = 0
		mnt.isShared = false
	}
	// Transfer all of mnt's followers to the new leader.
	for f := mnt.followerList.Front(); f != nil; f = f.followerEntry.Next() {
		f.leader = leader
	}
	// Remove mnt from its current follower list and add it to the new leader.
	if mnt.leader != nil {
		mnt.leader.followerList.Remove(mnt)
	}
	if leader != nil && propFlags == linux.MS_SLAVE {
		leader.followerList.PushFront(mnt)
		mnt.leader = leader
	} else {
		mnt.leader = nil
	}

	// Add mnt's followers to leader's follower list. This also links all their
	// followerEntry fields together.
	if !mnt.followerList.Empty() && leader != nil {
		leader.followerList.PushBackList(&mnt.followerList)
	}
}

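// Worked example (hypothetical mounts, for illustration only): suppose M is
// shared with one peer P and leads a follower F. setPropagation(M, MS_SLAVE)
// picks P as the new leader, removes M from the peer group, transfers F to
// P's follower list, and pushes M itself onto that list, so P ends up
// leading both F and M.
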
// propState tracks the state threaded through a mount propagation operation;
// see doPropagation and propagateMount below.
type propState struct {
	origSrc        *Mount
	prevSrc        *Mount
	prevDst        *Mount
	dstLeader      *Mount
	propList       map[*Mount]struct{}
	visitedLeaders map[*Mount]struct{}
}

// doPropagation returns the set of propagated mounts with their mount points
// set. The mounts are clones of src and have an extra reference taken. If
// propagation fails at any point, the method returns all the mounts propagated
// up until that point so they can be properly released. This method is
// analogous to fs/pnode.c:propagate_mnt() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) doPropagation(ctx context.Context, src *Mount, dst VirtualDentry) (map[*Mount]struct{}, error) {
	if !dst.mount.isShared {
		return nil, nil
	}
	s := propState{
		origSrc:        src,
		prevSrc:        src,
		prevDst:        dst.mount,
		dstLeader:      dst.mount.leader,
		propList:       map[*Mount]struct{}{},
		visitedLeaders: map[*Mount]struct{}{},
	}
	for peer := dst.mount.sharedEntry.Next(); peer != dst.mount; peer = peer.sharedEntry.Next() {
		if err := vfs.propagateMount(ctx, peer, dst.dentry, &s); err != nil {
			return s.propList, err
		}
	}
	for follower := nextFollowerPeerGroup(dst.mount, dst.mount); follower != nil; follower = nextFollowerPeerGroup(follower, dst.mount) {
		peer := follower
		for {
			if err := vfs.propagateMount(ctx, peer, dst.dentry, &s); err != nil {
				return s.propList, err
			}
			peer = peer.sharedEntry.Next()
			if peer == follower {
				break
			}
		}
	}
	return s.propList, nil
}

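// Illustrative traversal order (assumed topology): for a destination D that
// is shared with peer P and leads follower F, doPropagation first clones src
// onto D's peers (here P), then walks the follower tree via
// nextFollowerPeerGroup and clones src onto F and each of F's peers in turn.
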
// peers returns whether m1 and m2 are in the same peer group, i.e. whether
// they share a nonzero group ID.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) peers(m1, m2 *Mount) bool {
	return m1.groupID == m2.groupID && m1.groupID != 0
}

// propagateMount propagates the source mount tracked in state (state.prevSrc)
// to dstMnt at dstPoint. This method is analogous to
// fs/pnode.c:propagate_one() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) propagateMount(ctx context.Context, dstMnt *Mount, dstPoint *Dentry, state *propState) error {
	// Skip newly added mounts.
	if dstMnt.neverConnected() || dstMnt.umounted {
		return nil
	}
	mp := VirtualDentry{mount: dstMnt, dentry: dstPoint}
	if !mp.mount.fs.Impl().IsDescendant(VirtualDentry{dstMnt, dstMnt.root}, mp) {
		return nil
	}
	cloneType := 0
	if vfs.peers(dstMnt, state.prevDst) {
		cloneType = makeSharedClone
	} else {
		done := false
		// Get the most recent leader that we've propagated from in the tree.
		var leader, underLeader *Mount
		for underLeader = dstMnt; ; underLeader = leader {
			leader = underLeader.leader
			if _, ok := state.visitedLeaders[leader]; ok {
				break
			}
			if leader == state.dstLeader {
				break
			}
		}
		for {
			parent := state.prevSrc.parent()
			// Check that prevSrc is a follower, not a peer of the original.
			if vfs.peers(state.prevSrc, state.origSrc) {
				break
			}
			// Check whether the mount prevSrc is attached to (i.e. parent) has
			// the same leader as the most recently visited leader in the mount
			// tree.
			done = parent.leader == leader
			// If the leader under the most recently visited leader is not a peer
			// of the mount prevSrc is attached to, then it's not part of this
			// propagation tree and we need to traverse up the tree to get to the
			// real src.
			if done && vfs.peers(underLeader, parent) {
				break
			}
			// Traverse back up the propagation tree to get the proper src. We
			// only want to propagate from this mount's leader or peers of that
			// leader.
			state.prevSrc = state.prevSrc.leader
			if done {
				break
			}
		}
		cloneType = makeFollowerClone
		if dstMnt.isShared {
			cloneType |= makeSharedClone
		}
	}
	clone, err := vfs.cloneMountTree(ctx, state.prevSrc, state.prevSrc.root, cloneType, nil)
	if err != nil {
		return err
	}
	mp.IncRef()
	clone.setKey(mp)
	state.propList[clone] = struct{}{}
	if dstMnt.leader != state.dstLeader {
		state.visitedLeaders[dstMnt.leader] = struct{}{}
	}
	state.prevDst = dstMnt
	state.prevSrc = clone
	return dstMnt.ns.checkMountCount(ctx, clone)
}

// nextFollowerPeerGroup iterates through the propagation tree and returns the
// first mount in each follower peer group under mnt. Once all the groups have
// been iterated through, the method returns nil. This method is analogous to
// fs/pnode.c:next_group() in Linux.
func nextFollowerPeerGroup(mnt *Mount, start *Mount) *Mount {
	for {
		// If mnt has any followers, this loop returns the first of them.
		// Otherwise mnt is updated until it is the last peer in its peer group.
		// This has the effect of moving down the propagation tree to the
		// bottommost follower, and after that across peers (if possible) to the
		// last peer in the group.
		for {
			if !mnt.neverConnected() && !mnt.followerList.Empty() {
				return mnt.followerList.Front()
			}
			next := mnt.sharedEntry.Next()
			if mnt.groupID == start.groupID {
				if next == start {
					return nil
				}
				// If mnt is shared+slave, its next follower will be the same as
				// its next peer.
			} else if mnt.isFollower() && mnt.followerEntry.Next() != next {
				break
			}
			mnt = next
		}
		// At this point mnt is the last peer in its shared+slave peer group.
		// This loop returns the next follower in mnt's leader's follower list.
		// Once the list of followers is exhausted it sets mnt to be the leader
		// and breaks out of the loop. This has the effect of moving across the
		// tree branches until all branches are exhausted, and then moving up
		// the tree to the parent.
		for {
			leader := mnt.leader
			if mnt.followerEntry.Next() != nil {
				return mnt.followerEntry.Next()
			}
			mnt = leader.sharedEntry.Next()
			if leader.groupID == start.groupID {
				break
			}
			if leader.followerEntry.Next() == mnt {
				break
			}
			mnt = leader
		}
		if mnt == start {
			return nil
		}
	}
}

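// Illustrative traversal (assumed topology): given a peer group (A B) where
// B has no followers of its own, A leads follower F1, and F1 in turn leads
// follower F2, successive calls starting from nextFollowerPeerGroup(A, A)
// yield F1, then F2, and then nil once every follower peer group has been
// visited.
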
// nextPropMount iterates through the propagation tree rooted at start. It
// returns nil when there are no more mounts in the tree. Otherwise, it returns
// the next mount in the tree. It is analogous to fs/pnode.c:propagation_next()
// in Linux.
func nextPropMount(mnt, start *Mount) *Mount {
	m := mnt
	if !m.neverConnected() && !m.followerList.Empty() {
		return m.followerList.Front()
	}
	for {
		leader := m.leader
		if leader == start.leader {
			next := m.sharedEntry.Next()
			if next == start {
				return nil
			}
			return next
		} else if m.followerEntry.Next() != nil {
			return m.followerEntry.Next()
		}
		m = leader
	}
}

// arePropMountsBusy returns true if mnt, or any mount that mnt's parent
// propagates to, is busy before a call to umount, i.e. it has children or
// more references than expected. It is analogous to
// fs/pnode.c:propagate_mount_busy() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) arePropMountsBusy(mnt *Mount) bool {
	parent := mnt.parent()
	if parent == nil {
		return !vfs.mountHasExpectedRefs(mnt)
	}
	if len(mnt.children) != 0 || !vfs.mountHasExpectedRefs(mnt) {
		return true
	}
	for m := nextPropMount(parent, parent); m != nil; m = nextPropMount(m, parent) {
		child := vfs.mounts.Lookup(m, mnt.point())
		if child == nil {
			continue
		}
		if len(child.children) != 0 && child.coveringMount() == nil {
			continue
		}
		if !vfs.mountHasExpectedRefs(child) {
			return true
		}
	}
	return false
}

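// Illustrative example (hypothetical mounts): if mnt's parent P has a peer
// P', and the corresponding child mount at the same mountpoint under P'
// holds an extra reference (say, from an open file on that mount), then
// arePropMountsBusy reports the whole propagated umount as busy.
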
// allocateGroupID populates mnt.groupID with a new group ID if one is
// available, and returns an error otherwise. If the group ID bitmap is full,
// it doubles the size of the bitmap before allocating the new group ID. It is
// analogous to fs/namespace.c:mnt_alloc_group_id() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) allocateGroupID(mnt *Mount) error {
	groupID, err := vfs.groupIDBitmap.FirstZero(1)
	if err != nil {
		if err := vfs.groupIDBitmap.Grow(uint32(vfs.groupIDBitmap.Size())); err != nil {
			return linuxerr.ENOSPC
		}
		groupID, err = vfs.groupIDBitmap.FirstZero(1)
		if err != nil {
			return err
		}
	}
	vfs.groupIDBitmap.Add(groupID)
	mnt.groupID = groupID
	return nil
}

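// Note (inferred from the surrounding code, stated here for illustration):
// the search starts at bit 1 because group ID 0 is reserved to mean "not in
// any peer group" (freeGroupID resets groupID to 0, and peers() treats a
// zero groupID as ungrouped). For example, if every bit of a 64-bit bitmap
// is set, Grow(64) doubles it to 128 bits and the next FirstZero(1) returns
// 64.
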
// freeGroupID marks a groupID as available for reuse. It is analogous to
// fs/namespace.c:mnt_release_group_id() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) freeGroupID(mnt *Mount) {
	vfs.groupIDBitmap.Remove(mnt.groupID)
	mnt.groupID = 0
}

// cleanupGroupIDs zeroes out the group IDs of all the non-shared mounts in
// mnts and returns them to the pool of available IDs. It is analogous to
// fs/namespace.c:cleanup_group_ids() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) cleanupGroupIDs(mnts []*Mount) {
	for _, m := range mnts {
		if m.groupID != 0 && !m.isShared {
			vfs.freeGroupID(m)
		}
	}
}

// allocMountGroupIDs allocates a new group ID for mnt. If recursive is true,
// it also allocates a new group ID for each of mnt's descendants. It is
// analogous to fs/namespace.c:invent_group_ids() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) allocMountGroupIDs(mnt *Mount, recursive bool) error {
	var mnts []*Mount
	if recursive {
		mnts = mnt.submountsLocked()
	} else {
		mnts = []*Mount{mnt}
	}
	for _, m := range mnts {
		if m.groupID == 0 && !m.isShared {
			if err := vfs.allocateGroupID(m); err != nil {
				vfs.cleanupGroupIDs(mnts)
				return err
			}
		}
	}
	return nil
}

// propagateUmount returns a list of mounts that the umount of mnts propagates
// to.
//
// Prerequisites: all the mounts in mnts have had vfs.umount() called on them.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) propagateUmount(mnts []*Mount) []*Mount {
	const (
		umountVisited = iota
		umountRestore
	)
	var toUmount []*Mount
	noChildren := make(map[*Mount]struct{})
	// processed contains all the mounts that the algorithm has seen so far. If
	// a mount maps to umountRestore, it should be restored after processing
	// all the mounts. This happens when a mount that has children, or that
	// covers its parent's root, was speculatively unmounted.
	processed := make(map[*Mount]int)

	// Iterate through the mounts from the leaves back to the root.
	for i := len(mnts) - 1; i >= 0; i-- {
		mnt := mnts[i]

		// If a mount has already been visited we know all its peers and
		// followers have been visited, so there's no need to visit them again.
		if _, ok := processed[mnt]; ok {
			continue
		}
		processed[mnt] = umountVisited

		parent := mnt.parent()
		if parent == nil {
			continue
		}
		for m := nextPropMount(parent, parent); m != nil; m = nextPropMount(m, parent) {
			child := vfs.mounts.Lookup(m, mnt.point())
			if child == nil {
				continue
			}
			if _, ok := processed[child]; ok {
				// If the child has been visited we know its peer group and
				// followers have all been visited, so there's no need to visit
				// them again. We can skip this propagation subtree by setting the
				// iterator to be the last mount in the follower group.
				if !child.followerList.Empty() {
					m = child.followerList.Back()
				}
				continue
			} else if child.umounted {
				// If this child has already been marked for unmounting, just mark
				// it as visited and move on. This means it was either part of the
				// original mount list passed to this method or was umounted from
				// another mount's propagation. In either case we can consider all
				// its peers and followers as visited.
				processed[child] = umountVisited
				continue
			}

			// This loop starts at the child we are propagating the umount to
			// and then walks up through the child's parents. It stops once it
			// reaches a parent that has not yet been visited.
	loop:
			for {
				if _, ok := noChildren[child]; ok || child.umounted {
					break
				}
				// If any of child's children are mounted somewhere other than
				// child's root, and are not already processed and known to be
				// childless, then child cannot be unmounted.
				for gchild := range child.children {
					if gchild.point() == child.root {
						continue
					}
					_, isProcessed := processed[gchild]
					_, hasNoChildren := noChildren[gchild]
					if isProcessed && hasNoChildren {
						continue
					}
					processed[child] = umountRestore
					break loop
				}
				if child.locked {
					processed[child] = umountRestore
					noChildren[child] = struct{}{}
				} else {
					vfs.umount(child)
					toUmount = append(toUmount, child)
				}
				// If this parent was a mount that had to be restored because it
				// had children, it might be safe to umount it now that its child
				// is gone. If the parent has not been processed at all, stop
				// walking up; if it was already visited and umounted, the check
				// at the top of the loop ends the walk.
				child = child.parent()
				if _, ok := processed[child]; !ok {
					break
				}
			}
		}
	}

	// Add all the children of mounts marked for umount to the umount list. This
	// excludes "cover" mounts (mounts whose mount point is equal to their
	// parent's root), which will be reparented in the next step.
	for i := 0; i < len(toUmount); i++ {
		umount := toUmount[i]
		for child := range umount.children {
			if child.point() == umount.root {
				processed[child] = umountRestore
			} else {
				vfs.umount(child)
				toUmount = append(toUmount, child)
			}
		}
	}

	vfs.mounts.seq.BeginWrite()
	for m, status := range processed {
		if status == umountVisited {
			continue
		}
		mp := m.getKey()
		for mp.mount.umounted {
			mp = mp.mount.getKey()
		}
		if mp != m.getKey() {
			vfs.changeMountpoint(m, mp)
		}
	}
	vfs.mounts.seq.EndWrite()

	return toUmount
}

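// Worked example (hypothetical mounts, for illustration only): if M at
// mountpoint d under parent P is umounted and P has a peer P' with a child
// M' at the same mountpoint d, propagateUmount umounts M' as well, unless M'
// has children of its own (or is locked), in which case M' is marked
// umountRestore and survives; the final changeMountpoint pass then
// re-attaches each surviving mount at the mountpoint of its nearest ancestor
// that was not umounted.
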
// unlockPropagationMounts sets locked to false for every mount that a umount
// of mnt propagates to. It is analogous to fs/pnode.c:propagate_mount_unlock()
// in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) unlockPropagationMounts(mnt *Mount) {
	parent := mnt.parent()
	if parent == nil {
		return
	}
	for m := nextPropMount(parent, parent); m != nil; m = nextPropMount(m, parent) {
		child := vfs.mounts.Lookup(m, mnt.point())
		if child == nil {
			continue
		}
		child.locked = false
	}
}

// peerUnderRoot iterates through mnt's peers until it finds a mount that is in
// ns and is reachable from root. This method is analogous to
// fs/pnode.c:get_peer_under_root() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) peerUnderRoot(ctx context.Context, mnt *Mount, ns *MountNamespace, root VirtualDentry) *Mount {
	m := mnt
	for {
		if m.ns == ns {
			// Check the current peer m for reachability, as in Linux's
			// get_peer_under_root().
			if vfs.isPathReachable(ctx, root, VirtualDentry{m, m.root}) {
				return m
			}
		}
		m = m.sharedEntry.Next()
		if m == mnt {
			break
		}
	}
	return nil
}

// isPathReachable returns true if vd is reachable from vfsroot. It is
// analogous to fs/namespace.c:is_path_reachable() in Linux.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) isPathReachable(ctx context.Context, vfsroot VirtualDentry, vd VirtualDentry) bool {
	for vd.mount != vfsroot.mount && vd.mount.parent() != nil {
		vd = vd.mount.getKey()
	}
	return vd.mount == vfsroot.mount && vd.mount.fs.Impl().IsDescendant(vfsroot, vd)
}
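
// Illustrative walk (assumed nesting): for a dentry on mount C, where C is
// mounted inside B and B inside A == vfsroot.mount, isPathReachable climbs
// C -> B -> A via each mount's key and then asks the filesystem whether the
// resulting dentry is a descendant of vfsroot's dentry.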