github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/dirent.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package fs
    16  
    17  import (
    18  	"fmt"
    19  	"path"
    20  	"sync/atomic"
    21  
    22  	"golang.org/x/sys/unix"
    23  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    24  	"github.com/SagerNet/gvisor/pkg/context"
    25  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    26  	"github.com/SagerNet/gvisor/pkg/refs"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/socket/unix/transport"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/uniqueid"
    30  	"github.com/SagerNet/gvisor/pkg/sync"
    31  	"github.com/SagerNet/gvisor/pkg/syserror"
    32  )
    33  
// globalDirentMap is a mutex-guarded set of Dirents. Entries are inserted with
// add and deleted with remove.
type globalDirentMap struct {
	mu      sync.Mutex           // mu is held by add and remove while they touch dirents.
	dirents map[*Dirent]struct{} // dirents is the set itself, keyed by Dirent pointer.
}
    38  
    39  func (g *globalDirentMap) add(d *Dirent) {
    40  	g.mu.Lock()
    41  	g.dirents[d] = struct{}{}
    42  	g.mu.Unlock()
    43  }
    44  
    45  func (g *globalDirentMap) remove(d *Dirent) {
    46  	g.mu.Lock()
    47  	delete(g.dirents, d)
    48  	g.mu.Unlock()
    49  }
    50  
    51  // allDirents keeps track of all Dirents that need to be considered in
    52  // Save/Restore for inode mappings.
    53  //
    54  // Because inodes do not hold paths, but inodes for external file systems map
    55  // to an external path, every user-visible Dirent is stored in this map and
    56  // iterated through upon save to keep inode ID -> restore path mappings.
    57  var allDirents = globalDirentMap{
    58  	dirents: map[*Dirent]struct{}{},
    59  }
    60  
// renameMu protects the parent of *all* Dirents. (See explanation in
// lockForRename.)
//
// Readers of Dirent.parent (e.g. FullName, MountRoot, Walk) take it for
// reading.
//
// See fs.go for lock ordering.
var renameMu sync.RWMutex
    66  
// Dirent holds an Inode in memory.
//
// A Dirent may be negative or positive:
//
// A negative Dirent contains a nil Inode and indicates that a path does not exist. This
// is a convention taken from the Linux dcache, see fs/dcache.c. A negative Dirent remains
// cached until a create operation replaces it with a positive Dirent. A negative Dirent
// always has one reference owned by its parent and takes _no_ reference on its parent. This
// ensures that its parent can be unhashed regardless of negative children.
//
// A positive Dirent contains a non-nil Inode. It remains cached for as long as there remain
// references to it. A positive Dirent always takes a reference on its parent.
//
// A Dirent may be a root Dirent (parent is nil) or be parented (non-nil parent).
//
// Dirents currently do not attempt to free entries that lack application references under
// memory pressure.
//
// +stateify savable
type Dirent struct {
	// AtomicRefCount is our reference count.
	refs.AtomicRefCount

	// userVisible indicates whether the Dirent is visible to the user or
	// not.  Only user-visible Dirents should save inode mappings in
	// save/restore, as only they hold the real path to the underlying
	// inode.
	//
	// See newDirent and Dirent.afterLoad.
	userVisible bool

	// Inode is the underlying file object.
	//
	// Inode is exported currently to assist in implementing overlay Inodes (where an
	// Inode.InodeOperations.Lookup may need to merge the Inode contained in a positive Dirent with
	// another Inode). This is normally done before the Dirent is parented (there are
	// no external references to it).
	//
	// Other objects in the VFS may take a reference to this Inode but only while holding
	// a reference to this Dirent.
	Inode *Inode

	// name is the name (i.e. basename) of this entry.
	//
	// N.B. name is protected by parent.mu, not this node's mu!
	name string

	// parent is the parent directory.
	//
	// We hold a hard reference to the parent.
	//
	// parent is protected by renameMu.
	parent *Dirent

	// deleted may be set atomically when removed. It is read with
	// atomic.LoadInt32; a non-zero value means deleted.
	deleted int32

	// mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED.
	mounted bool

	// direntEntry identifies this Dirent as an element in a DirentCache. DirentCaches
	// and their contents are not saved.
	direntEntry `state:"nosave"`

	// dirMu is a read-write mutex that protects caching decisions made by directory operations.
	// Lock ordering: dirMu must be taken before mu (see below). Details:
	//
	// dirMu does not participate in Rename; instead mu and renameMu are used, see lockForRename.
	//
	// Creation and Removal operations must be synchronized with Walk to prevent stale negative
	// caching. Note that this requirement is not specific to a _Dirent_ doing negative caching.
	// The following race exists at any level of the VFS:
	//
	// For an object D that represents a directory, containing a cache of non-existent paths,
	// protected by D.cacheMu:
	//
	// T1:                       T2:
	//                           D.lookup(name)
	//                           --> ENOENT
	// D.create(name)
	// --> success
	// D.cacheMu.Lock
	//   delete(D.cache, name)
	// D.cacheMu.Unlock
	//                           D.cacheMu.Lock
	//                             D.cache[name] = true
	//                           D.cacheMu.Unlock
	//
	// D.lookup(name)
	// D.cacheMu.Lock
	//   if D.cache[name] {
	//   --> ENOENT (wrong)
	//   }
	// D.cacheMu.Unlock
	//
	// Correct:
	//
	// T1:                       T2:
	//                           D.cacheMu.Lock
	//                             D.lookup(name)
	//                             --> ENOENT
	//                             D.cache[name] = true
	//                           D.cacheMu.Unlock
	// D.cacheMu.Lock
	//   D.create(name)
	//   --> success
	//   delete(D.cache, name)
	// D.cacheMu.Unlock
	//
	// D.cacheMu.Lock
	//   D.lookup(name)
	//   --> EXISTS (right)
	// D.cacheMu.Unlock
	//
	// Note that the above "correct" solution causes too much lock contention: all lookups are
	// synchronized with each other. This is a problem because lookups are involved in any VFS
	// path operation.
	//
	// A Dirent diverges from the single D.cacheMu and instead uses two locks: dirMu to protect
	// concurrent creation/removal/lookup caching, and mu to protect the Dirent's children map
	// in general.
	//
	// This allows for concurrent Walks to be executed in order to pipeline lookups. For instance
	// for a hot directory /a/b, threads T1, T2, T3 will only block on each other to update the
	// children map of /a/b when their individual lookups complete.
	//
	// T1:           T2:           T3:
	// stat(/a/b/c)  stat(/a/b/d)  stat(/a/b/e)
	dirMu sync.RWMutex `state:"nosave"`

	// mu protects the below fields. Lock ordering: mu must be taken after dirMu.
	mu sync.Mutex `state:"nosave"`

	// children are cached via weak references, keyed by child name.
	children map[string]*refs.WeakRef `state:".(map[string]*Dirent)"`
}
   203  
   204  // NewDirent returns a new root Dirent, taking the caller's reference on inode. The caller
   205  // holds the only reference to the Dirent. Parents may call hashChild to parent this Dirent.
   206  func NewDirent(ctx context.Context, inode *Inode, name string) *Dirent {
   207  	d := newDirent(inode, name)
   208  	allDirents.add(d)
   209  	d.userVisible = true
   210  	return d
   211  }
   212  
   213  // NewTransientDirent creates a transient Dirent that shouldn't actually be
   214  // visible to users.
   215  //
   216  // An Inode is required.
   217  func NewTransientDirent(inode *Inode) *Dirent {
   218  	if inode == nil {
   219  		panic("an inode is required")
   220  	}
   221  	return newDirent(inode, "transient")
   222  }
   223  
   224  func newDirent(inode *Inode, name string) *Dirent {
   225  	// The Dirent needs to maintain one reference to MountSource.
   226  	if inode != nil {
   227  		inode.MountSource.IncDirentRefs()
   228  	}
   229  	d := Dirent{
   230  		Inode:    inode,
   231  		name:     name,
   232  		children: make(map[string]*refs.WeakRef),
   233  	}
   234  	d.EnableLeakCheck("fs.Dirent")
   235  	return &d
   236  }
   237  
   238  // NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent.
   239  func NewNegativeDirent(name string) *Dirent {
   240  	return newDirent(nil, name)
   241  }
   242  
   243  // IsRoot returns true if d is a root Dirent.
   244  func (d *Dirent) IsRoot() bool {
   245  	return d.parent == nil
   246  }
   247  
   248  // IsNegative returns true if d represents a path that does not exist.
   249  func (d *Dirent) IsNegative() bool {
   250  	return d.Inode == nil
   251  }
   252  
   253  // hashChild will hash child into the children list of its new parent d.
   254  //
   255  // Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must
   256  // validate the returned unhashed weak reference. Common cases:
   257  //
   258  // * Remove: hashing a negative Dirent unhashes a positive Dirent (unimplemented).
   259  // * Create: hashing a positive Dirent unhashes a negative Dirent.
   260  // * Lookup: hashing any Dirent should not unhash any other Dirent.
   261  //
   262  // Preconditions:
   263  // * d.mu must be held.
   264  // * child must be a root Dirent.
   265  func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) {
   266  	if !child.IsRoot() {
   267  		panic("hashChild must be a root Dirent")
   268  	}
   269  
   270  	// Assign parentage.
   271  	child.parent = d
   272  
   273  	// Avoid letting negative Dirents take a reference on their parent; these Dirents
   274  	// don't have a role outside of the Dirent cache and should not keep their parent
   275  	// indefinitely pinned.
   276  	if !child.IsNegative() {
   277  		// Positive dirents must take a reference on their parent.
   278  		d.IncRef()
   279  	}
   280  
   281  	return d.hashChildParentSet(child)
   282  }
   283  
   284  // hashChildParentSet will rehash child into the children list of its parent d.
   285  //
   286  // Assumes that child.parent = d already.
   287  func (d *Dirent) hashChildParentSet(child *Dirent) (*refs.WeakRef, bool) {
   288  	if child.parent != d {
   289  		panic("hashChildParentSet assumes the child already belongs to the parent")
   290  	}
   291  
   292  	// Save any replaced child so our caller can validate it.
   293  	old, ok := d.children[child.name]
   294  
   295  	// Hash the child.
   296  	d.children[child.name] = refs.NewWeakRef(child, nil)
   297  
   298  	// Return any replaced child.
   299  	return old, ok
   300  }
   301  
   302  // SyncAll iterates through mount points under d and writes back their buffered
   303  // modifications to filesystems.
   304  func (d *Dirent) SyncAll(ctx context.Context) {
   305  	d.mu.Lock()
   306  	defer d.mu.Unlock()
   307  
   308  	// For negative Dirents there is nothing to sync. By definition these are
   309  	// leaves (there is nothing left to traverse).
   310  	if d.IsNegative() {
   311  		return
   312  	}
   313  
   314  	// There is nothing to sync for a read-only filesystem.
   315  	if !d.Inode.MountSource.Flags.ReadOnly {
   316  		// NOTE(b/34856369): This should be a mount traversal, not a Dirent
   317  		// traversal, because some Inodes that need to be synced may no longer
   318  		// be reachable by name (after sys_unlink).
   319  		//
   320  		// Write out metadata, dirty page cached pages, and sync disk/remote
   321  		// caches.
   322  		d.Inode.WriteOut(ctx)
   323  	}
   324  
   325  	// Continue iterating through other mounted filesystems.
   326  	for _, w := range d.children {
   327  		if child := w.Get(); child != nil {
   328  			child.(*Dirent).SyncAll(ctx)
   329  			child.DecRef(ctx)
   330  		}
   331  	}
   332  }
   333  
   334  // BaseName returns the base name of the dirent.
   335  func (d *Dirent) BaseName() string {
   336  	p := d.parent
   337  	if p == nil {
   338  		return d.name
   339  	}
   340  	p.mu.Lock()
   341  	defer p.mu.Unlock()
   342  	return d.name
   343  }
   344  
   345  // FullName returns the fully-qualified name and a boolean value representing
   346  // whether this Dirent was a descendant of root.
   347  // If the root argument is nil it is assumed to be the root of the Dirent tree.
   348  func (d *Dirent) FullName(root *Dirent) (string, bool) {
   349  	renameMu.RLock()
   350  	defer renameMu.RUnlock()
   351  	return d.fullName(root)
   352  }
   353  
   354  // fullName returns the fully-qualified name and a boolean value representing
   355  // if the root node was reachable from this Dirent.
   356  func (d *Dirent) fullName(root *Dirent) (string, bool) {
   357  	if d == root {
   358  		return "/", true
   359  	}
   360  
   361  	if d.IsRoot() {
   362  		if root != nil {
   363  			// We reached the top of the Dirent tree but did not encounter
   364  			// the given root. Return false for reachable so the caller
   365  			// can handle this situation accordingly.
   366  			return d.name, false
   367  		}
   368  		return d.name, true
   369  	}
   370  
   371  	// Traverse up to parent.
   372  	d.parent.mu.Lock()
   373  	name := d.name
   374  	d.parent.mu.Unlock()
   375  	parentName, reachable := d.parent.fullName(root)
   376  	s := path.Join(parentName, name)
   377  	if atomic.LoadInt32(&d.deleted) != 0 {
   378  		return s + " (deleted)", reachable
   379  	}
   380  	return s, reachable
   381  }
   382  
   383  // MountRoot finds and returns the mount-root for a given dirent.
   384  func (d *Dirent) MountRoot() *Dirent {
   385  	renameMu.RLock()
   386  	defer renameMu.RUnlock()
   387  
   388  	mountRoot := d
   389  	for !mountRoot.mounted && mountRoot.parent != nil {
   390  		mountRoot = mountRoot.parent
   391  	}
   392  	mountRoot.IncRef()
   393  	return mountRoot
   394  }
   395  
   396  // descendantOf returns true if the receiver dirent is equal to, or a
   397  // descendant of, the argument dirent.
   398  //
   399  // d.mu must be held.
   400  func (d *Dirent) descendantOf(p *Dirent) bool {
   401  	if d == p {
   402  		return true
   403  	}
   404  	if d.IsRoot() {
   405  		return false
   406  	}
   407  	return d.parent.descendantOf(p)
   408  }
   409  
// walk walks to path name starting at the dirent, and will not traverse above
// root Dirent.
//
// If walkMayUnlock is true then walk can unlock d.mu to execute a slow
// Inode.Lookup, otherwise walk will keep d.mu locked.
//
// On success the returned Dirent carries a reference owned by the caller.
// A negative result (cached or freshly looked up) is reported as
// (nil, ENOENT); walking from a non-directory yields ENOTDIR.
//
// Preconditions:
// * renameMu must be held for reading.
// * d.mu must be held.
// * name must not contain "/"s.
func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
	if !IsDir(d.Inode.StableAttr) {
		return nil, unix.ENOTDIR
	}

	if name == "" || name == "." {
		d.IncRef()
		return d, nil
	} else if name == ".." {
		// Respect the chroot. Note that in Linux there is no check to enforce
		// that d is a descendant of root.
		if d == root {
			d.IncRef()
			return d, nil
		}
		// Are we already at the root? Then ".." is ".".
		if d.IsRoot() {
			d.IncRef()
			return d, nil
		}
		d.parent.IncRef()
		return d.parent, nil
	}

	if w, ok := d.children[name]; ok {
		// Try to resolve the weak reference to a hard reference.
		if child := w.Get(); child != nil {
			cd := child.(*Dirent)

			// Is this a negative Dirent?
			if cd.IsNegative() {
				// Don't leak a reference; this doesn't matter as much for negative Dirents,
				// which don't hold a hard reference on their parent (their parent holds a
				// hard reference on them, and they contain virtually no state). But this is
				// good house-keeping.
				child.DecRef(ctx)
				return nil, unix.ENOENT
			}

			// Do we need to revalidate this child?
			//
			// We never allow the file system to revalidate mounts, that could cause them
			// to unexpectedly drop out before umount.
			if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, name, d.Inode, cd.Inode) {
				// Good to go. This is the fast-path. The reference from Get() is
				// handed to the caller.
				return cd, nil
			}

			// If we're revalidating a child, we must ensure all inotify watches release
			// their pins on the child. Inotify doesn't properly support filesystems that
			// revalidate dirents (since watches are lost on revalidation), but if we fail
			// to unpin the watches child will never be GCed.
			cd.Inode.Watches.Unpin(ctx, cd)

			// This child needs to be revalidated, fallthrough to unhash it. Make sure
			// to not leak a reference from Get().
			//
			// Note that previous lookups may still have a reference to this stale child;
			// this can't be helped, but we can ensure that *new* lookups are up-to-date.
			child.DecRef(ctx)
		}

		// Either our weak reference expired or we need to revalidate it. Unhash child first, we're
		// about to replace it.
		delete(d.children, name)
		w.Drop(ctx)
	}

	// Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be
	// expensive, if possible release the lock and re-acquire it.
	if walkMayUnlock {
		d.mu.Unlock() // +checklocksforce: results in an inconsistent block.
	}
	c, err := d.Inode.Lookup(ctx, name)
	if walkMayUnlock {
		d.mu.Lock() // +checklocksforce: see above.
	}
	// No dice.
	if err != nil {
		return nil, err
	}

	// Sanity check c, its name must be consistent.
	if c.name != name {
		panic(fmt.Sprintf("lookup from %q to %q returned unexpected name %q", d.name, name, c.name))
	}

	// Now that we have the lock again, check if we raced.
	if w, ok := d.children[name]; ok {
		// Someone else looked up or created a child at name before us.
		if child := w.Get(); child != nil {
			cd := child.(*Dirent)

			// There are active references to the existing child, prefer it to the one we
			// retrieved from Lookup. Likely the Lookup happened very close to the insertion
			// of child, so considering one stale over the other is fairly arbitrary.
			c.DecRef(ctx)

			// The child that was installed could be negative.
			if cd.IsNegative() {
				// If so, don't leak a reference and short circuit.
				child.DecRef(ctx)
				return nil, unix.ENOENT
			}

			// We make the judgement call that if c raced with cd they are close enough to have
			// the same staleness, so we don't attempt to revalidate cd. In Linux revalidations
			// can continue indefinitely (see fs/namei.c, retry_estale); we try to avoid this.
			return cd, nil
		}

		// Weak reference expired. We went through a full cycle of create/destroy in the time
		// we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child
		// we looked up.
		delete(d.children, name)
		w.Drop(ctx)
	}

	// Give the looked up child a parent. We cannot kick out entries, since we just checked above
	// that there is nothing at name in d's children list.
	if _, kicked := d.hashChild(c); kicked {
		// Yell loudly.
		panic(fmt.Sprintf("hashed child %q over existing child", c.name))
	}

	// Is this a negative Dirent?
	if c.IsNegative() {
		// Don't drop a reference on the negative Dirent, it was just installed and this is the
		// only reference we'll ever get. d owns the reference.
		return nil, unix.ENOENT
	}

	// Return the positive Dirent.
	return c, nil
}
   555  
   556  // Walk walks to a new dirent, and will not walk higher than the given root
   557  // Dirent, which must not be nil.
   558  func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, error) {
   559  	if root == nil {
   560  		panic("Dirent.Walk: root must not be nil")
   561  	}
   562  
   563  	// We could use lockDirectory here, but this is a hot path and we want
   564  	// to avoid defer.
   565  	renameMu.RLock()
   566  	d.dirMu.RLock()
   567  	d.mu.Lock()
   568  
   569  	child, err := d.walk(ctx, root, name, true /* may unlock */)
   570  
   571  	d.mu.Unlock()
   572  	d.dirMu.RUnlock()
   573  	renameMu.RUnlock()
   574  
   575  	return child, err
   576  }
   577  
   578  // exists returns true if name exists in relation to d.
   579  //
   580  // Preconditions:
   581  // * renameMu must be held for reading.
   582  // * d.mu must be held.
   583  // * name must must not contain "/"s.
   584  func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
   585  	child, err := d.walk(ctx, root, name, false /* may unlock */)
   586  	if err != nil {
   587  		// Child may not exist.
   588  		return false
   589  	}
   590  	// Child exists.
   591  	child.DecRef(ctx)
   592  	return true
   593  }
   594  
// lockDirectory should be called for any operation that changes this `d`s
// children (creating or removing them).
//
// It acquires, in order: renameMu for reading, d.dirMu for writing, and d.mu.
// Pair every call with unlockDirectory.
// +checklocksacquire:d.dirMu
// +checklocksacquire:d.mu
func (d *Dirent) lockDirectory() {
	renameMu.RLock()
	d.dirMu.Lock()
	d.mu.Lock()
}
   604  
// unlockDirectory is the reverse of lockDirectory.
//
// It releases d.mu, then d.dirMu, then the read lock on renameMu, i.e. the
// opposite of the acquisition order.
// +checklocksrelease:d.dirMu
// +checklocksrelease:d.mu
func (d *Dirent) unlockDirectory() {
	d.mu.Unlock()
	d.dirMu.Unlock()
	renameMu.RUnlock() // +checklocksforce: see lockDirectory.
}
   613  
   614  // Create creates a new regular file in this directory.
   615  func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) {
   616  	d.lockDirectory()
   617  	defer d.unlockDirectory()
   618  
   619  	// Does something already exist?
   620  	if d.exists(ctx, root, name) {
   621  		return nil, unix.EEXIST
   622  	}
   623  
   624  	// Try the create. We need to trust the file system to return EEXIST (or something
   625  	// that will translate to EEXIST) if name already exists.
   626  	file, err := d.Inode.Create(ctx, d, name, flags, perms)
   627  	if err != nil {
   628  		return nil, err
   629  	}
   630  	child := file.Dirent
   631  
   632  	d.finishCreate(ctx, child, name)
   633  
   634  	// Return the reference and the new file. When the last reference to
   635  	// the file is dropped, file.Dirent may no longer be cached.
   636  	return file, nil
   637  }
   638  
   639  // finishCreate validates the created file, adds it as a child of this dirent,
   640  // and notifies any watchers.
   641  func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) {
   642  	// Sanity check c, its name must be consistent.
   643  	if child.name != name {
   644  		panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name))
   645  	}
   646  
   647  	// File systems cannot return a negative Dirent on Create, that makes no sense.
   648  	if child.IsNegative() {
   649  		panic(fmt.Sprintf("create from %q to %q returned negative Dirent", d.name, name))
   650  	}
   651  
   652  	// Hash the child into its parent. We can only kick out a Dirent if it is negative
   653  	// (we are replacing something that does not exist with something that now does).
   654  	if w, kicked := d.hashChild(child); kicked {
   655  		if old := w.Get(); old != nil {
   656  			if !old.(*Dirent).IsNegative() {
   657  				panic(fmt.Sprintf("hashed child %q over a positive child", child.name))
   658  			}
   659  			// Don't leak a reference.
   660  			old.DecRef(ctx)
   661  
   662  			// Drop d's reference.
   663  			old.DecRef(ctx)
   664  		}
   665  
   666  		// Finally drop the useless weak reference on the floor.
   667  		w.Drop(ctx)
   668  	}
   669  
   670  	d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
   671  
   672  	// Allow the file system to take extra references on c.
   673  	child.maybeExtendReference()
   674  }
   675  
   676  // genericCreate executes create if name does not exist. Removes a negative Dirent at name if
   677  // create succeeds.
   678  func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error {
   679  	d.lockDirectory()
   680  	defer d.unlockDirectory()
   681  
   682  	// Does something already exist?
   683  	if d.exists(ctx, root, name) {
   684  		return unix.EEXIST
   685  	}
   686  
   687  	// Remove any negative Dirent. We've already asserted above with d.exists
   688  	// that the only thing remaining here can be a negative Dirent.
   689  	if w, ok := d.children[name]; ok {
   690  		// Same as Create.
   691  		if old := w.Get(); old != nil {
   692  			if !old.(*Dirent).IsNegative() {
   693  				panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name))
   694  			}
   695  			// Don't leak a reference.
   696  			old.DecRef(ctx)
   697  
   698  			// Drop d's reference.
   699  			old.DecRef(ctx)
   700  		}
   701  
   702  		// Unhash the negative Dirent, name needs to exist now.
   703  		delete(d.children, name)
   704  
   705  		// Finally drop the useless weak reference on the floor.
   706  		w.Drop(ctx)
   707  	}
   708  
   709  	// Execute the create operation.
   710  	return create()
   711  }
   712  
   713  // CreateLink creates a new link in this directory.
   714  func (d *Dirent) CreateLink(ctx context.Context, root *Dirent, oldname, newname string) error {
   715  	return d.genericCreate(ctx, root, newname, func() error {
   716  		if err := d.Inode.CreateLink(ctx, d, oldname, newname); err != nil {
   717  			return err
   718  		}
   719  		d.Inode.Watches.Notify(newname, linux.IN_CREATE, 0)
   720  		return nil
   721  	})
   722  }
   723  
   724  // CreateHardLink creates a new hard link in this directory.
   725  func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Dirent, name string) error {
   726  	// Make sure that target does not span filesystems.
   727  	if d.Inode.MountSource != target.Inode.MountSource {
   728  		return unix.EXDEV
   729  	}
   730  
   731  	// Directories are never linkable. See fs/namei.c:vfs_link.
   732  	if IsDir(target.Inode.StableAttr) {
   733  		return unix.EPERM
   734  	}
   735  
   736  	return d.genericCreate(ctx, root, name, func() error {
   737  		if err := d.Inode.CreateHardLink(ctx, d, target, name); err != nil {
   738  			return err
   739  		}
   740  		target.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) // Link count change.
   741  		d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
   742  		return nil
   743  	})
   744  }
   745  
   746  // CreateDirectory creates a new directory under this dirent.
   747  func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, perms FilePermissions) error {
   748  	return d.genericCreate(ctx, root, name, func() error {
   749  		if err := d.Inode.CreateDirectory(ctx, d, name, perms); err != nil {
   750  			return err
   751  		}
   752  		d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_CREATE, 0)
   753  		return nil
   754  	})
   755  }
   756  
   757  // Bind satisfies the InodeOperations interface; otherwise same as GetFile.
   758  func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data transport.BoundEndpoint, perms FilePermissions) (*Dirent, error) {
   759  	var childDir *Dirent
   760  	err := d.genericCreate(ctx, root, name, func() error {
   761  		var e error
   762  		childDir, e = d.Inode.Bind(ctx, d, name, data, perms)
   763  		if e != nil {
   764  			return e
   765  		}
   766  		d.finishCreate(ctx, childDir, name)
   767  		return nil
   768  	})
   769  	if err == unix.EEXIST {
   770  		return nil, unix.EADDRINUSE
   771  	}
   772  	if err != nil {
   773  		return nil, err
   774  	}
   775  	return childDir, err
   776  }
   777  
   778  // CreateFifo creates a new named pipe under this dirent.
   779  func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perms FilePermissions) error {
   780  	return d.genericCreate(ctx, root, name, func() error {
   781  		if err := d.Inode.CreateFifo(ctx, d, name, perms); err != nil {
   782  			return err
   783  		}
   784  		d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
   785  		return nil
   786  	})
   787  }
   788  
   789  // GetDotAttrs returns the DentAttrs corresponding to "." and ".." directories.
   790  func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) {
   791  	// Get '.'.
   792  	sattr := d.Inode.StableAttr
   793  	dot := DentAttr{
   794  		Type:    sattr.Type,
   795  		InodeID: sattr.InodeID,
   796  	}
   797  
   798  	// Hold d.mu while we call d.descendantOf.
   799  	d.mu.Lock()
   800  	defer d.mu.Unlock()
   801  
   802  	// Get '..'.
   803  	if !d.IsRoot() && d.descendantOf(root) {
   804  		// Dirent is a descendant of the root.  Get its parent's attrs.
   805  		psattr := d.parent.Inode.StableAttr
   806  		dotdot := DentAttr{
   807  			Type:    psattr.Type,
   808  			InodeID: psattr.InodeID,
   809  		}
   810  		return dot, dotdot
   811  	}
   812  	// Dirent is either root or not a descendant of the root.  ".." is the
   813  	// same as ".".
   814  	return dot, dot
   815  }
   816  
// DirIterator is an open directory containing directory entries that can be read.
type DirIterator interface {
	// IterateDir emits directory entries by calling dirCtx.EmitDir, beginning
	// with the entry at offset and returning the next directory offset.
	//
	// d is the Dirent of the directory being iterated.
	//
	// Entries for "." and ".." must *not* be included.
	//
	// If the offset returned is the same as the argument offset, then
	// nothing has been serialized.  This is equivalent to reaching EOF.
	// In this case dirCtx.Serializer.Written() should return 0.
	//
	// The order of entries to emit must be consistent between Readdir
	// calls, and must start with the given offset.
	//
	// The caller must ensure that this operation is permitted.
	IterateDir(ctx context.Context, d *Dirent, dirCtx *DirCtx, offset int) (int, error)
}
   834  
   835  // DirentReaddir serializes the directory entries of d including "." and "..".
   836  //
   837  // Arguments:
   838  //
   839  // * d:		the Dirent of the directory being read; required to provide "." and "..".
   840  // * it:	the directory iterator; which represents an open directory handle.
   841  // * root: 	fs root; if d is equal to the root, then '..' will refer to d.
   842  // * ctx: 	context provided to file systems in order to select and serialize entries.
   843  // * offset:	the current directory offset.
   844  //
   845  // Returns the offset of the *next* element which was not serialized.
   846  func DirentReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) {
   847  	offset, err := direntReaddir(ctx, d, it, root, dirCtx, offset)
   848  	// Serializing any directory entries at all means success.
   849  	if dirCtx.Serializer.Written() > 0 {
   850  		return offset, nil
   851  	}
   852  	return offset, err
   853  }
   854  
   855  func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) {
   856  	if root == nil {
   857  		panic("Dirent.Readdir: root must not be nil")
   858  	}
   859  	if dirCtx.Serializer == nil {
   860  		panic("Dirent.Readdir: serializer must not be nil")
   861  	}
   862  
   863  	// Check that this is actually a directory before emitting anything.
   864  	// Once we have written entries for "." and "..", future errors from
   865  	// IterateDir will be hidden.
   866  	if !IsDir(d.Inode.StableAttr) {
   867  		return 0, syserror.ENOTDIR
   868  	}
   869  
   870  	// This is a special case for lseek(fd, 0, SEEK_END).
   871  	// See SeekWithDirCursor for more details.
   872  	if offset == FileMaxOffset {
   873  		return offset, nil
   874  	}
   875  
   876  	// Collect attrs for "." and "..".
   877  	dot, dotdot := d.GetDotAttrs(root)
   878  
   879  	// Emit "." and ".." if the offset is low enough.
   880  	if offset == 0 {
   881  		// Serialize ".".
   882  		if err := dirCtx.DirEmit(".", dot); err != nil {
   883  			return offset, err
   884  		}
   885  		offset++
   886  	}
   887  	if offset == 1 {
   888  		// Serialize "..".
   889  		if err := dirCtx.DirEmit("..", dotdot); err != nil {
   890  			return offset, err
   891  		}
   892  		offset++
   893  	}
   894  
   895  	// it.IterateDir should be passed an offset that does not include the
   896  	// initial dot elements.  We will add them back later.
   897  	offset -= 2
   898  	newOffset, err := it.IterateDir(ctx, d, dirCtx, int(offset))
   899  	if int64(newOffset) < offset {
   900  		panic(fmt.Sprintf("node.Readdir returned offset %v less than input offset %v", newOffset, offset))
   901  	}
   902  	// Add the initial nodes back to the offset count.
   903  	newOffset += 2
   904  	return int64(newOffset), err
   905  }
   906  
   907  // flush flushes all weak references recursively, and removes any cached
   908  // references to children.
   909  //
   910  // Preconditions: d.mu must be held.
   911  func (d *Dirent) flush(ctx context.Context) {
   912  	expired := make(map[string]*refs.WeakRef)
   913  	for n, w := range d.children {
   914  		// Call flush recursively on each child before removing our
   915  		// reference on it, and removing the cache's reference.
   916  		if child := w.Get(); child != nil {
   917  			cd := child.(*Dirent)
   918  
   919  			if !cd.IsNegative() {
   920  				// Flush the child.
   921  				cd.mu.Lock()
   922  				cd.flush(ctx)
   923  				cd.mu.Unlock()
   924  
   925  				// Allow the file system to drop extra references on child.
   926  				cd.dropExtendedReference()
   927  			}
   928  
   929  			// Don't leak a reference.
   930  			child.DecRef(ctx)
   931  		}
   932  		// Check if the child dirent is closed, and mark it as expired if it is.
   933  		// We must call w.Get() again here, since the child could have been closed
   934  		// by the calls to flush() and cache.Remove() in the above if-block.
   935  		if child := w.Get(); child != nil {
   936  			child.DecRef(ctx)
   937  		} else {
   938  			expired[n] = w
   939  		}
   940  	}
   941  
   942  	// Remove expired entries.
   943  	for n, w := range expired {
   944  		delete(d.children, n)
   945  		w.Drop(ctx)
   946  	}
   947  }
   948  
   949  // isMountPoint returns true if the dirent is a mount point or the root.
   950  func (d *Dirent) isMountPoint() bool {
   951  	d.mu.Lock()
   952  	defer d.mu.Unlock()
   953  	return d.isMountPointLocked()
   954  }
   955  
   956  func (d *Dirent) isMountPointLocked() bool {
   957  	return d.mounted || d.parent == nil
   958  }
   959  
   960  // mount mounts a new dirent with the given inode over d.
   961  //
   962  // Precondition: must be called with mm.withMountLocked held on `d`.
   963  func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err error) {
   964  	// Did we race with deletion?
   965  	if atomic.LoadInt32(&d.deleted) != 0 {
   966  		return nil, syserror.ENOENT
   967  	}
   968  
   969  	// Refuse to mount a symlink.
   970  	//
   971  	// See Linux equivalent in fs/namespace.c:do_add_mount.
   972  	if IsSymlink(inode.StableAttr) {
   973  		return nil, linuxerr.EINVAL
   974  	}
   975  
   976  	// Dirent that'll replace d.
   977  	//
   978  	// Note that NewDirent returns with one reference taken; the reference
   979  	// is donated to the caller as the mount reference.
   980  	replacement := NewDirent(ctx, inode, d.name)
   981  	replacement.mounted = true
   982  
   983  	weakRef, ok := d.parent.hashChild(replacement)
   984  	if !ok {
   985  		panic("mount must mount over an existing dirent")
   986  	}
   987  	weakRef.Drop(ctx)
   988  
   989  	// Note that even though `d` is now hidden, it still holds a reference
   990  	// to its parent.
   991  	return replacement, nil
   992  }
   993  
   994  // unmount unmounts `d` and replaces it with the last Dirent that was in its
   995  // place, supplied by the MountNamespace as `replacement`.
   996  //
   997  // Precondition: must be called with mm.withMountLocked held on `d`.
   998  func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
   999  	// Did we race with deletion?
  1000  	if atomic.LoadInt32(&d.deleted) != 0 {
  1001  		return syserror.ENOENT
  1002  	}
  1003  
  1004  	// Remount our former child in its place.
  1005  	//
  1006  	// As replacement used to be our child, it must already have the right
  1007  	// parent.
  1008  	weakRef, ok := d.parent.hashChildParentSet(replacement)
  1009  	if !ok {
  1010  		panic("mount must mount over an existing dirent")
  1011  	}
  1012  	weakRef.Drop(ctx)
  1013  
  1014  	// d is not reachable anymore, and hence not mounted anymore.
  1015  	d.mounted = false
  1016  
  1017  	// Drop mount reference.
  1018  	d.DecRef(ctx)
  1019  	return nil
  1020  }
  1021  
  1022  // Remove removes the given file or symlink.  The root dirent is used to
  1023  // resolve name, and must not be nil.
  1024  func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath bool) error {
  1025  	// Check the root.
  1026  	if root == nil {
  1027  		panic("Dirent.Remove: root must not be nil")
  1028  	}
  1029  
  1030  	d.lockDirectory()
  1031  	defer d.unlockDirectory()
  1032  
  1033  	// Try to walk to the node.
  1034  	child, err := d.walk(ctx, root, name, false /* may unlock */)
  1035  	if err != nil {
  1036  		// Child does not exist.
  1037  		return err
  1038  	}
  1039  	defer child.DecRef(ctx)
  1040  
  1041  	// Remove cannot remove directories.
  1042  	if IsDir(child.Inode.StableAttr) {
  1043  		return unix.EISDIR
  1044  	} else if dirPath {
  1045  		return unix.ENOTDIR
  1046  	}
  1047  
  1048  	// Remove cannot remove a mount point.
  1049  	if child.isMountPoint() {
  1050  		return unix.EBUSY
  1051  	}
  1052  
  1053  	// Try to remove name on the file system.
  1054  	if err := d.Inode.Remove(ctx, d, child); err != nil {
  1055  		return err
  1056  	}
  1057  
  1058  	// Link count changed, this only applies to non-directory nodes.
  1059  	child.Inode.Watches.Notify("", linux.IN_ATTRIB, 0)
  1060  
  1061  	// Mark name as deleted and remove from children.
  1062  	atomic.StoreInt32(&child.deleted, 1)
  1063  	if w, ok := d.children[name]; ok {
  1064  		delete(d.children, name)
  1065  		w.Drop(ctx)
  1066  	}
  1067  
  1068  	// Allow the file system to drop extra references on child.
  1069  	child.dropExtendedReference()
  1070  
  1071  	// Finally, let inotify know the child is being unlinked. Drop any extra
  1072  	// refs from inotify to this child dirent. This doesn't necessarily mean the
  1073  	// watches on the underlying inode will be destroyed, since the underlying
  1074  	// inode may have other links. If this was the last link, the events for the
  1075  	// watch removal will be queued by the inode destructor.
  1076  	child.Inode.Watches.MarkUnlinked()
  1077  	child.Inode.Watches.Unpin(ctx, child)
  1078  	d.Inode.Watches.Notify(name, linux.IN_DELETE, 0)
  1079  
  1080  	return nil
  1081  }
  1082  
  1083  // RemoveDirectory removes the given directory.  The root dirent is used to
  1084  // resolve name, and must not be nil.
  1085  func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) error {
  1086  	// Check the root.
  1087  	if root == nil {
  1088  		panic("Dirent.Remove: root must not be nil")
  1089  	}
  1090  
  1091  	d.lockDirectory()
  1092  	defer d.unlockDirectory()
  1093  
  1094  	// Check for dots.
  1095  	if name == "." {
  1096  		// Rejected as the last component by rmdir(2).
  1097  		return unix.EINVAL
  1098  	}
  1099  	if name == ".." {
  1100  		// If d was found, then its parent is not empty.
  1101  		return unix.ENOTEMPTY
  1102  	}
  1103  
  1104  	// Try to walk to the node.
  1105  	child, err := d.walk(ctx, root, name, false /* may unlock */)
  1106  	if err != nil {
  1107  		// Child does not exist.
  1108  		return err
  1109  	}
  1110  	defer child.DecRef(ctx)
  1111  
  1112  	// RemoveDirectory can only remove directories.
  1113  	if !IsDir(child.Inode.StableAttr) {
  1114  		return unix.ENOTDIR
  1115  	}
  1116  
  1117  	// Remove cannot remove a mount point.
  1118  	if child.isMountPoint() {
  1119  		return unix.EBUSY
  1120  	}
  1121  
  1122  	// Try to remove name on the file system.
  1123  	if err := d.Inode.Remove(ctx, d, child); err != nil {
  1124  		return err
  1125  	}
  1126  
  1127  	// Mark name as deleted and remove from children.
  1128  	atomic.StoreInt32(&child.deleted, 1)
  1129  	if w, ok := d.children[name]; ok {
  1130  		delete(d.children, name)
  1131  		w.Drop(ctx)
  1132  	}
  1133  
  1134  	// Allow the file system to drop extra references on child.
  1135  	child.dropExtendedReference()
  1136  
  1137  	// Finally, let inotify know the child is being unlinked. Drop any extra
  1138  	// refs from inotify to this child dirent.
  1139  	child.Inode.Watches.MarkUnlinked()
  1140  	child.Inode.Watches.Unpin(ctx, child)
  1141  	d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0)
  1142  
  1143  	return nil
  1144  }
  1145  
  1146  // destroy closes this node and all children.
  1147  func (d *Dirent) destroy(ctx context.Context) {
  1148  	if d.IsNegative() {
  1149  		// Nothing to tear-down and no parent references to drop, since a negative
  1150  		// Dirent does not take a references on its parent, has no Inode and no children.
  1151  		return
  1152  	}
  1153  
  1154  	d.mu.Lock()
  1155  	defer d.mu.Unlock()
  1156  
  1157  	// Drop all weak references.
  1158  	for _, w := range d.children {
  1159  		if c := w.Get(); c != nil {
  1160  			if c.(*Dirent).IsNegative() {
  1161  				// The parent holds both weak and strong refs in the case of
  1162  				// negative dirents.
  1163  				c.DecRef(ctx)
  1164  			}
  1165  			// Drop the reference we just acquired in WeakRef.Get.
  1166  			c.DecRef(ctx)
  1167  		}
  1168  		w.Drop(ctx)
  1169  	}
  1170  	d.children = nil
  1171  
  1172  	allDirents.remove(d)
  1173  
  1174  	// Drop our reference to the Inode.
  1175  	d.Inode.DecRef(ctx)
  1176  
  1177  	// Allow the Dirent to be GC'ed after this point, since the Inode may still
  1178  	// be referenced after the Dirent is destroyed (for instance by filesystem
  1179  	// internal caches or hard links).
  1180  	d.Inode = nil
  1181  
  1182  	// Drop the reference we have on our parent if we took one. renameMu doesn't need to be
  1183  	// held because d can't be reparented without any references to it left.
  1184  	if d.parent != nil {
  1185  		d.parent.DecRef(ctx)
  1186  	}
  1187  }
  1188  
  1189  // IncRef increases the Dirent's refcount as well as its mount's refcount.
  1190  //
  1191  // IncRef implements RefCounter.IncRef.
  1192  func (d *Dirent) IncRef() {
  1193  	if d.Inode != nil {
  1194  		d.Inode.MountSource.IncDirentRefs()
  1195  	}
  1196  	d.AtomicRefCount.IncRef()
  1197  }
  1198  
  1199  // TryIncRef implements RefCounter.TryIncRef.
  1200  func (d *Dirent) TryIncRef() bool {
  1201  	ok := d.AtomicRefCount.TryIncRef()
  1202  	if ok && d.Inode != nil {
  1203  		d.Inode.MountSource.IncDirentRefs()
  1204  	}
  1205  	return ok
  1206  }
  1207  
  1208  // DecRef decreases the Dirent's refcount and drops its reference on its mount.
  1209  //
  1210  // DecRef implements RefCounter.DecRef with destructor d.destroy.
  1211  func (d *Dirent) DecRef(ctx context.Context) {
  1212  	if d.Inode != nil {
  1213  		// Keep mount around, since DecRef may destroy d.Inode.
  1214  		msrc := d.Inode.MountSource
  1215  		d.DecRefWithDestructor(ctx, d.destroy)
  1216  		msrc.DecDirentRefs()
  1217  	} else {
  1218  		d.DecRefWithDestructor(ctx, d.destroy)
  1219  	}
  1220  }
  1221  
  1222  // InotifyEvent notifies all watches on the inode for this dirent and its parent
  1223  // of potential events. The events may not actually propagate up to the user,
  1224  // depending on the event masks. InotifyEvent automatically provides the name of
  1225  // the current dirent as the subject of the event as required, and adds the
  1226  // IN_ISDIR flag for dirents that refer to directories.
  1227  func (d *Dirent) InotifyEvent(events, cookie uint32) {
  1228  	// N.B. We don't defer the unlocks because InotifyEvent is in the hot
  1229  	// path of all IO operations, and the defers cost too much for small IO
  1230  	// operations.
  1231  	renameMu.RLock()
  1232  
  1233  	if IsDir(d.Inode.StableAttr) {
  1234  		events |= linux.IN_ISDIR
  1235  	}
  1236  
  1237  	// The ordering below is important, Linux always notifies the parent first.
  1238  	if d.parent != nil {
  1239  		// name is immediately stale w.r.t. renames (renameMu doesn't
  1240  		// protect against renames in the same directory). Holding
  1241  		// d.parent.mu around Notify() wouldn't matter since Notify
  1242  		// doesn't provide a synchronous mechanism for reading the name
  1243  		// anyway.
  1244  		d.parent.mu.Lock()
  1245  		name := d.name
  1246  		d.parent.mu.Unlock()
  1247  		d.parent.Inode.Watches.Notify(name, events, cookie)
  1248  	}
  1249  	d.Inode.Watches.Notify("", events, cookie)
  1250  
  1251  	renameMu.RUnlock()
  1252  }
  1253  
  1254  // maybeExtendReference caches a reference on this Dirent if
  1255  // MountSourceOperations.Keep returns true.
  1256  func (d *Dirent) maybeExtendReference() {
  1257  	if msrc := d.Inode.MountSource; msrc.Keep(d) {
  1258  		msrc.fscache.Add(d)
  1259  	}
  1260  }
  1261  
  1262  // dropExtendedReference drops any cached reference held by the
  1263  // MountSource on the dirent.
  1264  func (d *Dirent) dropExtendedReference() {
  1265  	d.Inode.MountSource.fscache.Remove(d)
  1266  }
  1267  
// lockForRename takes locks on oldParent and newParent as required by Rename.
// On return, unlockForRename must always be called, even with an error.
//
// renameMu is always taken exclusively first. If both parents are the same
// Dirent only that one mu is locked; otherwise both mu locks are taken in
// ancestor-to-descendant order.
//
// It returns EINVAL when newParent lies underneath oldParent/oldName, i.e.
// when the rename would move something into a subdirectory of itself; all
// locks are held in that case too. newName is accepted but not examined.
// +checklocksacquire:oldParent.mu
// +checklocksacquire:newParent.mu
func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) error {
	renameMu.Lock()
	if oldParent == newParent {
		// Same directory: a single mu covers both roles.
		oldParent.mu.Lock()
		return nil // +checklocksforce: only one lock exists.
	}

	// Renaming between directories is a bit subtle:
	//
	// - A concurrent cross-directory Rename may try to lock in the opposite
	// order; take renameMu to prevent this from happening.
	//
	// - If either directory is an ancestor of the other, then a concurrent
	// Remove may lock the descendant (in DecRef -> closeAll) while holding a
	// lock on the ancestor; to avoid this, ensure we take locks in the same
	// ancestor-to-descendant order. (Holding renameMu prevents this
	// relationship from changing.)

	// First check if newParent is a descendant of oldParent. While walking
	// up from newParent, child trails one step behind p, so that when p
	// reaches oldParent, child is the direct child of oldParent on the path
	// down to newParent.
	child := newParent
	for p := newParent.parent; p != nil; p = p.parent {
		if p == oldParent {
			// oldParent is the ancestor, so it is locked first.
			oldParent.mu.Lock()
			newParent.mu.Lock()
			var err error
			if child.name == oldName {
				// newParent is not just a descendant of oldParent, but
				// more specifically of oldParent/oldName. That is, we're
				// trying to rename something into a subdirectory of
				// itself.
				err = unix.EINVAL
			}
			return err
		}
		child = p
	}

	// Otherwise, either oldParent is a descendant of newParent or the two
	// have no relationship; in either case we can do this:
	newParent.mu.Lock()
	oldParent.mu.Lock()
	return nil
}
  1315  
// unlockForRename is the opposite of lockForRename: it releases the mu locks
// on oldParent and newParent (a single lock when they are the same Dirent)
// and then releases renameMu.
// +checklocksrelease:oldParent.mu
// +checklocksrelease:newParent.mu
func unlockForRename(oldParent, newParent *Dirent) {
	if oldParent == newParent {
		// lockForRename only took one mu in this case.
		oldParent.mu.Unlock()
		renameMu.Unlock() // +checklocksforce: only one lock exists.
		return
	}
	newParent.mu.Unlock()
	oldParent.mu.Unlock()
	renameMu.Unlock() // +checklocksforce: not tracked.
}
  1329  
  1330  func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error {
  1331  	uattr, err := d.Inode.UnstableAttr(ctx)
  1332  	if err != nil {
  1333  		return linuxerr.EPERM
  1334  	}
  1335  	if !uattr.Perms.Sticky {
  1336  		return nil
  1337  	}
  1338  
  1339  	creds := auth.CredentialsFromContext(ctx)
  1340  	if uattr.Owner.UID == creds.EffectiveKUID {
  1341  		return nil
  1342  	}
  1343  
  1344  	vuattr, err := victim.Inode.UnstableAttr(ctx)
  1345  	if err != nil {
  1346  		return linuxerr.EPERM
  1347  	}
  1348  	if vuattr.Owner.UID == creds.EffectiveKUID {
  1349  		return nil
  1350  	}
  1351  	if victim.Inode.CheckCapability(ctx, linux.CAP_FOWNER) {
  1352  		return nil
  1353  	}
  1354  	return linuxerr.EPERM
  1355  }
  1356  
  1357  // MayDelete determines whether `name`, a child of `d`, can be deleted or
  1358  // renamed by `ctx`.
  1359  //
  1360  // Compare Linux kernel fs/namei.c:may_delete.
  1361  func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error {
  1362  	if err := d.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
  1363  		return err
  1364  	}
  1365  
  1366  	d.lockDirectory()
  1367  	defer d.unlockDirectory()
  1368  
  1369  	victim, err := d.walk(ctx, root, name, true /* may unlock */)
  1370  	if err != nil {
  1371  		return err
  1372  	}
  1373  	defer victim.DecRef(ctx)
  1374  
  1375  	return d.mayDelete(ctx, victim)
  1376  }
  1377  
  1378  // mayDelete determines whether `victim`, a child of `dir`, can be deleted or
  1379  // renamed by `ctx`.
  1380  //
  1381  // Preconditions: `dir` is writable and executable by `ctx`.
  1382  func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error {
  1383  	if err := d.checkSticky(ctx, victim); err != nil {
  1384  		return err
  1385  	}
  1386  
  1387  	if victim.IsRoot() {
  1388  		return linuxerr.EBUSY
  1389  	}
  1390  
  1391  	return nil
  1392  }
  1393  
// Rename atomically converts the child of oldParent named oldName to a
// child of newParent named newName.
//
// root is used to resolve both names and must not be nil (Rename panics
// otherwise). Renaming an entry onto itself (same parent, same name) is a
// no-op that returns nil without taking any locks.
//
// Errors surfaced directly by this function, besides those propagated from
// lockForRename, the permission checks, the walks, mayDelete and the file
// system's Rename:
//
// * EBUSY:	the source or the existing target is a mount point.
// * EINVAL:	the source is an ancestor of newParent.
// * ENOTEMPTY:	the existing target is an ancestor of oldParent.
// * ENOTDIR:	the source is a directory but the existing target is not.
// * EISDIR:	the existing target is a directory but the source is not.
func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string, newParent *Dirent, newName string) error {
	if root == nil {
		panic("Rename: root must not be nil")
	}
	if oldParent == newParent && oldName == newName {
		return nil
	}

	// Acquire global renameMu lock, and mu locks on oldParent/newParent.
	// The unlock is deferred before the error check because lockForRename
	// holds the locks even when it returns an error.
	err := lockForRename(oldParent, oldName, newParent, newName)
	defer unlockForRename(oldParent, newParent)
	if err != nil {
		return err
	}

	// Do we have general permission to remove from oldParent and
	// create/replace in newParent?
	if err := oldParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
		return err
	}
	if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
		return err
	}

	// renamed is the dirent that will be renamed to something else.
	renamed, err := oldParent.walk(ctx, root, oldName, false /* may unlock */)
	if err != nil {
		return err
	}
	defer renamed.DecRef(ctx)

	// Check that the renamed dirent is deletable.
	if err := oldParent.mayDelete(ctx, renamed); err != nil {
		return err
	}

	// Check that the renamed dirent is not a mount point.
	if renamed.isMountPointLocked() {
		return unix.EBUSY
	}

	// Source should not be an ancestor of the target.
	if newParent.descendantOf(renamed) {
		return unix.EINVAL
	}

	// Per rename(2): "... EACCES: ... or oldpath is a directory and does not
	// allow write permission (needed to update the .. entry)."
	if IsDir(renamed.Inode.StableAttr) {
		if err := renamed.Inode.CheckPermission(ctx, PermMask{Write: true}); err != nil {
			return err
		}
	}

	// replaced is the dirent that is being overwritten by rename.
	replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */)
	if err != nil {
		// Only "does not exist" is acceptable here; any other walk failure
		// aborts the rename.
		if !linuxerr.Equals(linuxerr.ENOENT, err) {
			return err
		}

		// newName doesn't exist; simply create it below.
		replaced = nil
	} else {
		// Check constraints on the dirent being replaced.

		// NOTE(b/111808347): We don't want to keep replaced alive
		// across the Rename, so must call DecRef manually (no defer).
		// Every early return in this branch therefore drops the reference
		// itself.

		// Check that we can delete replaced.
		if err := newParent.mayDelete(ctx, replaced); err != nil {
			replaced.DecRef(ctx)
			return err
		}

		// Target should not be an ancestor of source.
		if oldParent.descendantOf(replaced) {
			replaced.DecRef(ctx)

			// Note that Linux returns EINVAL if the source is an
			// ancestor of target, but ENOTEMPTY if the target is
			// an ancestor of source (unless RENAME_EXCHANGE flag
			// is present).  See fs/namei.c:renameat2.
			return unix.ENOTEMPTY
		}

		// Check that replaced is not a mount point.
		if replaced.isMountPointLocked() {
			replaced.DecRef(ctx)
			return unix.EBUSY
		}

		// Require that a directory is replaced by a directory.
		oldIsDir := IsDir(renamed.Inode.StableAttr)
		newIsDir := IsDir(replaced.Inode.StableAttr)
		if !newIsDir && oldIsDir {
			replaced.DecRef(ctx)
			return unix.ENOTDIR
		}
		if !oldIsDir && newIsDir {
			replaced.DecRef(ctx)
			return unix.EISDIR
		}

		// Allow the file system to drop extra references on replaced.
		replaced.dropExtendedReference()

		// NOTE(b/31798319,b/31867149,b/31867671): Keeping a dirent
		// open across renames is currently broken for multiple
		// reasons, so we flush all references on the replaced node and
		// its children.
		replaced.Inode.Watches.Unpin(ctx, replaced)
		replaced.mu.Lock()
		replaced.flush(ctx)
		replaced.mu.Unlock()

		// Done with replaced. The pointer itself stays non-nil, which is
		// what the `replaced != nil` argument below relies on.
		replaced.DecRef(ctx)
	}

	// Hand the rename to the file system; the final argument tells it
	// whether an existing entry named newName is being replaced.
	if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil {
		return err
	}

	// The file system accepted the rename; update the in-memory tree.
	renamed.name = newName
	renamed.parent = newParent
	if oldParent != newParent {
		// Reparent the reference held by renamed.parent. oldParent.DecRef
		// can't destroy oldParent (and try to retake its lock) because
		// Rename's caller must be holding a reference.
		newParent.IncRef()
		oldParent.DecRef(ctx)
	}
	// Drop the stale cache entries for both the target and the source name.
	if w, ok := newParent.children[newName]; ok {
		w.Drop(ctx)
		delete(newParent.children, newName)
	}
	if w, ok := oldParent.children[oldName]; ok {
		w.Drop(ctx)
		delete(oldParent.children, oldName)
	}

	// Add a weak reference from the new parent.  This ensures that the child
	// can still be found from the new parent if a prior hard reference is
	// held on renamed.
	//
	// This is required for file lock correctness because file locks are per-Dirent
	// and without maintaining a cached child (via a weak reference) for renamed,
	// multiple Dirents can correspond to the same resource (by virtue of the renamed
	// Dirent being unreachable by its parent and it being looked up).
	newParent.children[newName] = refs.NewWeakRef(renamed, nil)

	// Queue inotify events for the rename.
	var ev uint32
	if IsDir(renamed.Inode.StableAttr) {
		ev |= linux.IN_ISDIR
	}

	// The same cookie on IN_MOVED_FROM and IN_MOVED_TO ties the two events
	// together.
	cookie := uniqueid.InotifyCookie(ctx)
	oldParent.Inode.Watches.Notify(oldName, ev|linux.IN_MOVED_FROM, cookie)
	newParent.Inode.Watches.Notify(newName, ev|linux.IN_MOVED_TO, cookie)
	// Somewhat surprisingly, self move events do not have a cookie.
	renamed.Inode.Watches.Notify("", linux.IN_MOVE_SELF, 0)

	// Allow the file system to drop extra references on renamed.
	renamed.dropExtendedReference()

	// Same as replaced.flush above.
	renamed.mu.Lock()
	renamed.flush(ctx)
	renamed.mu.Unlock()

	return nil
}