github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/kernfs/filesystem.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernfs
    16  
    17  // This file implements vfs.FilesystemImpl for kernfs.
    18  
    19  import (
    20  	"fmt"
    21  
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix/transport"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    29  )
    30  
    31  // stepExistingLocked resolves rp.Component() in parent directory vfsd.
    32  //
    33  // stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
    34  //
    35  // Preconditions:
    36  //   - Filesystem.mu must be locked for at least reading.
    37  //   - !rp.Done().
    38  //
    39  // Postcondition: Caller must call fs.processDeferredDecRefs*.
    40  func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) (*Dentry, bool, error) {
    41  	if !d.isDir() {
    42  		return nil, false, linuxerr.ENOTDIR
    43  	}
    44  	// Directory searchable?
    45  	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
    46  		return nil, false, err
    47  	}
    48  	name := rp.Component()
    49  	// Revalidation must be skipped if name is "." or ".."; d or its parent
    50  	// respectively can't be expected to transition from invalidated back to
    51  	// valid, so detecting invalidation and retrying would loop forever. This
    52  	// is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
    53  	// calls d_revalidate(), but walk_component() => handle_dots() does not.
    54  	if name == "." {
    55  		rp.Advance()
    56  		return d, false, nil
    57  	}
    58  	if name == ".." {
    59  		if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil {
    60  			return nil, false, err
    61  		} else if isRoot || d.parent == nil {
    62  			rp.Advance()
    63  			return d, false, nil
    64  		}
    65  		if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil {
    66  			return nil, false, err
    67  		}
    68  		rp.Advance()
    69  		return d.parent, false, nil
    70  	}
    71  	if len(name) > linux.NAME_MAX {
    72  		return nil, false, linuxerr.ENAMETOOLONG
    73  	}
    74  	d.dirMu.Lock()
    75  	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name])
    76  	d.dirMu.Unlock()
    77  	if err != nil {
    78  		return nil, false, err
    79  	}
    80  	if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil {
    81  		return nil, false, err
    82  	}
    83  	// Resolve any symlink at current path component.
    84  	if rp.ShouldFollowSymlink() && next.isSymlink() {
    85  		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
    86  		if err != nil {
    87  			return nil, false, err
    88  		}
    89  		if targetVD.Ok() {
    90  			followedTarget, err := rp.HandleJump(targetVD)
    91  			fs.deferDecRefVD(ctx, targetVD)
    92  			return d, followedTarget, err
    93  		}
    94  		followedSymlink, err := rp.HandleSymlink(targetPathname)
    95  		return d, followedSymlink, err
    96  	}
    97  	rp.Advance()
    98  	return next, false, nil
    99  }
   100  
   101  // revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
   102  // or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
   103  // nil) to verify that the returned child (or lack thereof) is correct.
   104  //
   105  // Preconditions:
   106  //   - Filesystem.mu must be locked for at least reading.
   107  //   - parent.dirMu must be locked.
   108  //   - parent.isDir().
   109  //   - name is not "." or "..".
   110  //
   111  // Postconditions: Caller must call fs.processDeferredDecRefs*.
   112  func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
   113  	if child != nil {
   114  		// Cached dentry exists, revalidate.
   115  		if !child.inode.Valid(ctx) {
   116  			delete(parent.children, name)
   117  			if child.inode.Keep() {
   118  				// Drop the ref owned by kernfs.
   119  				fs.deferDecRef(child)
   120  			}
   121  			rcs := vfsObj.InvalidateDentry(ctx, child.VFSDentry())
   122  			for _, rc := range rcs {
   123  				fs.deferDecRef(rc)
   124  			}
   125  			child = nil
   126  		}
   127  	}
   128  	if child == nil {
   129  		// Dentry isn't cached; it either doesn't exist or failed revalidation.
   130  		// Attempt to resolve it via Lookup.
   131  		childInode, err := parent.inode.Lookup(ctx, name)
   132  		if err != nil {
   133  			return nil, err
   134  		}
   135  		var newChild Dentry
   136  		newChild.Init(fs, childInode) // childInode's ref is transferred to newChild.
   137  		parent.insertChildLocked(name, &newChild)
   138  		child = &newChild
   139  
   140  		// Drop the ref on newChild. This will cause the dentry to get pruned
   141  		// from the dentry tree by the end of current filesystem operation
   142  		// (before returning to the VFS layer) if another ref is not picked on
   143  		// this dentry.
   144  		if !childInode.Keep() {
   145  			fs.deferDecRef(&newChild)
   146  		}
   147  	}
   148  	return child, nil
   149  }
   150  
   151  // walkExistingLocked resolves rp to an existing file.
   152  //
   153  // walkExistingLocked is loosely analogous to Linux's
   154  // fs/namei.c:path_lookupat().
   155  //
   156  // Preconditions: Filesystem.mu must be locked for at least reading.
   157  //
   158  // Postconditions: Caller must call fs.processDeferredDecRefs*.
   159  func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
   160  	d := rp.Start().Impl().(*Dentry)
   161  	for !rp.Done() {
   162  		var err error
   163  		d, _, err = fs.stepExistingLocked(ctx, rp, d)
   164  		if err != nil {
   165  			return nil, err
   166  		}
   167  	}
   168  	if rp.MustBeDir() && !d.isDir() {
   169  		return nil, linuxerr.ENOTDIR
   170  	}
   171  	return d, nil
   172  }
   173  
   174  // walkParentDirLocked resolves all but the last path component of rp to an
   175  // existing directory. It does not check that the returned directory is
   176  // searchable by the provider of rp.
   177  //
   178  // walkParentDirLocked is loosely analogous to Linux's
   179  // fs/namei.c:path_parentat().
   180  //
   181  // Preconditions:
   182  //   - Filesystem.mu must be locked for at least reading.
   183  //   - !rp.Done().
   184  //
   185  // Postconditions: Caller must call fs.processDeferredDecRefs*.
   186  func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) (*Dentry, error) {
   187  	for !rp.Final() {
   188  		var err error
   189  		d, _, err = fs.stepExistingLocked(ctx, rp, d)
   190  		if err != nil {
   191  			return nil, err
   192  		}
   193  	}
   194  	if !d.isDir() {
   195  		return nil, linuxerr.ENOTDIR
   196  	}
   197  	return d, nil
   198  }
   199  
   200  // checkCreateLocked checks that a file named rp.Component() may be created in
   201  // directory parent, then returns rp.Component().
   202  //
   203  // Preconditions:
   204  //   - Filesystem.mu must be locked for at least reading.
   205  //   - isDir(parentInode) == true.
   206  func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error {
   207  	// Order of checks is important. First check if parent directory can be
   208  	// executed, then check for existence, and lastly check if mount is writable.
   209  	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayExec); err != nil {
   210  		return err
   211  	}
   212  	if name == "." || name == ".." {
   213  		return linuxerr.EEXIST
   214  	}
   215  	if len(name) > linux.NAME_MAX {
   216  		return linuxerr.ENAMETOOLONG
   217  	}
   218  	if _, ok := parent.children[name]; ok {
   219  		return linuxerr.EEXIST
   220  	}
   221  	if parent.VFSDentry().IsDead() {
   222  		return linuxerr.ENOENT
   223  	}
   224  	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil {
   225  		return err
   226  	}
   227  	return nil
   228  }
   229  
   230  // checkDeleteLocked checks that the file represented by vfsd may be deleted.
   231  //
   232  // Preconditions: Filesystem.mu must be locked for at least reading.
   233  func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error {
   234  	parent := d.parent
   235  	if parent == nil {
   236  		return linuxerr.EBUSY
   237  	}
   238  	if parent.vfsd.IsDead() {
   239  		return linuxerr.ENOENT
   240  	}
   241  	if d.vfsd.IsDead() {
   242  		// This implies a duplicate unlink on an orphaned dentry, where the path
   243  		// resolution was successful. This is possible when the orphan is
   244  		// replaced by a new node of the same name (so the path resolution
   245  		// succeeds), and the orphan is unlinked again through a dirfd using
   246  		// unlinkat(2) (so the unlink refers to the orphan and not the new
   247  		// node). See Linux, fs/namei.c:do_rmdir().
   248  		return linuxerr.EINVAL
   249  	}
   250  	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
   251  		return err
   252  	}
   253  	return nil
   254  }
   255  
   256  // Release implements vfs.FilesystemImpl.Release.
   257  func (fs *Filesystem) Release(ctx context.Context) {
   258  	root := fs.root
   259  	if root == nil {
   260  		return
   261  	}
   262  	fs.mu.Lock()
   263  	root.releaseKeptDentriesLocked(ctx)
   264  	for fs.cachedDentriesLen != 0 {
   265  		fs.evictCachedDentryLocked(ctx)
   266  	}
   267  	fs.mu.Unlock()
   268  	// Drop ref acquired in Dentry.InitRoot().
   269  	root.DecRef(ctx)
   270  }
   271  
   272  // releaseKeptDentriesLocked recursively drops all dentry references created by
   273  // Lookup when Dentry.inode.Keep() is true.
   274  //
   275  // Precondition: Filesystem.mu is held.
   276  func (d *Dentry) releaseKeptDentriesLocked(ctx context.Context) {
   277  	if d.inode.Keep() && d != d.fs.root {
   278  		d.decRefLocked(ctx)
   279  	}
   280  
   281  	if d.isDir() {
   282  		var children []*Dentry
   283  		d.dirMu.Lock()
   284  		for _, child := range d.children {
   285  			children = append(children, child)
   286  		}
   287  		d.dirMu.Unlock()
   288  		for _, child := range children {
   289  			child.releaseKeptDentriesLocked(ctx)
   290  		}
   291  	}
   292  }
   293  
   294  // Sync implements vfs.FilesystemImpl.Sync.
   295  func (fs *Filesystem) Sync(ctx context.Context) error {
   296  	// All filesystem state is in-memory.
   297  	return nil
   298  }
   299  
   300  // AccessAt implements vfs.Filesystem.Impl.AccessAt.
   301  func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
   302  	fs.mu.RLock()
   303  	defer fs.processDeferredDecRefs(ctx)
   304  	defer fs.mu.RUnlock()
   305  
   306  	d, err := fs.walkExistingLocked(ctx, rp)
   307  	if err != nil {
   308  		return err
   309  	}
   310  	if err := d.inode.CheckPermissions(ctx, creds, ats); err != nil {
   311  		return err
   312  	}
   313  	if ats.MayWrite() && rp.Mount().ReadOnly() {
   314  		return linuxerr.EROFS
   315  	}
   316  	return nil
   317  }
   318  
   319  // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
   320  func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
   321  	fs.mu.RLock()
   322  	defer fs.processDeferredDecRefs(ctx)
   323  	defer fs.mu.RUnlock()
   324  	d, err := fs.walkExistingLocked(ctx, rp)
   325  	if err != nil {
   326  		return nil, err
   327  	}
   328  
   329  	if opts.CheckSearchable {
   330  		if !d.isDir() {
   331  			return nil, linuxerr.ENOTDIR
   332  		}
   333  		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
   334  			return nil, err
   335  		}
   336  	}
   337  	vfsd := d.VFSDentry()
   338  	vfsd.IncRef() // Ownership transferred to caller.
   339  	return vfsd, nil
   340  }
   341  
   342  // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
   343  func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
   344  	fs.mu.RLock()
   345  	defer fs.processDeferredDecRefs(ctx)
   346  	defer fs.mu.RUnlock()
   347  	d, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   348  	if err != nil {
   349  		return nil, err
   350  	}
   351  	d.IncRef() // Ownership transferred to caller.
   352  	return d.VFSDentry(), nil
   353  }
   354  
   355  // LinkAt implements vfs.FilesystemImpl.LinkAt.
   356  func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
   357  	if rp.Done() {
   358  		return linuxerr.EEXIST
   359  	}
   360  	fs.mu.Lock()
   361  	defer fs.processDeferredDecRefs(ctx)
   362  	defer fs.mu.Unlock()
   363  	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   364  	if err != nil {
   365  		return err
   366  	}
   367  
   368  	if rp.Mount() != vd.Mount() {
   369  		return linuxerr.EXDEV
   370  	}
   371  	inode := vd.Dentry().Impl().(*Dentry).Inode()
   372  	if inode.Mode().IsDir() {
   373  		return linuxerr.EPERM
   374  	}
   375  	if err := vfs.MayLink(rp.Credentials(), inode.Mode(), inode.UID(), inode.GID()); err != nil {
   376  		return err
   377  	}
   378  	parent.dirMu.Lock()
   379  	defer parent.dirMu.Unlock()
   380  	pc := rp.Component()
   381  	if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
   382  		return err
   383  	}
   384  	if rp.MustBeDir() {
   385  		return linuxerr.ENOENT
   386  	}
   387  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   388  		return err
   389  	}
   390  	defer rp.Mount().EndWrite()
   391  
   392  	childI, err := parent.inode.NewLink(ctx, pc, inode)
   393  	if err != nil {
   394  		return err
   395  	}
   396  	parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */)
   397  	inode.Watches().Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
   398  	var child Dentry
   399  	child.Init(fs, childI)
   400  	parent.insertChildLocked(pc, &child)
   401  	return nil
   402  }
   403  
   404  // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
   405  func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
   406  	if rp.Done() {
   407  		return linuxerr.EEXIST
   408  	}
   409  	fs.mu.Lock()
   410  	defer fs.processDeferredDecRefs(ctx)
   411  	defer fs.mu.Unlock()
   412  	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   413  	if err != nil {
   414  		return err
   415  	}
   416  
   417  	parent.dirMu.Lock()
   418  	defer parent.dirMu.Unlock()
   419  	pc := rp.Component()
   420  	if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
   421  		return err
   422  	}
   423  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   424  		return err
   425  	}
   426  	defer rp.Mount().EndWrite()
   427  	childI, err := parent.inode.NewDir(ctx, pc, opts)
   428  	if err != nil {
   429  		if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) {
   430  			return err
   431  		}
   432  		childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode)
   433  	}
   434  	var child Dentry
   435  	child.Init(fs, childI)
   436  	parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE|linux.IN_ISDIR, 0, vfs.InodeEvent, false /* unlinked */)
   437  	parent.insertChildLocked(pc, &child)
   438  	return nil
   439  }
   440  
   441  // MknodAt implements vfs.FilesystemImpl.MknodAt.
   442  func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
   443  	if rp.Done() {
   444  		return linuxerr.EEXIST
   445  	}
   446  	fs.mu.Lock()
   447  	defer fs.processDeferredDecRefs(ctx)
   448  	defer fs.mu.Unlock()
   449  	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   450  	if err != nil {
   451  		return err
   452  	}
   453  
   454  	parent.dirMu.Lock()
   455  	defer parent.dirMu.Unlock()
   456  	pc := rp.Component()
   457  	if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
   458  		return err
   459  	}
   460  	if rp.MustBeDir() {
   461  		return linuxerr.ENOENT
   462  	}
   463  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   464  		return err
   465  	}
   466  	defer rp.Mount().EndWrite()
   467  	newI, err := parent.inode.NewNode(ctx, pc, opts)
   468  	if err != nil {
   469  		return err
   470  	}
   471  	parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */)
   472  	var newD Dentry
   473  	newD.Init(fs, newI)
   474  	parent.insertChildLocked(pc, &newD)
   475  	return nil
   476  }
   477  
// OpenAt implements vfs.FilesystemImpl.OpenAt.
//
// Without O_CREAT, rp is resolved to an existing file with fs.mu held for
// reading, the access types derived from the open flags are checked against
// its inode, and the inode is opened after fs.mu has been released.
//
// With O_CREAT, fs.mu is held for writing while the parent directory of the
// final component is resolved. If the final component does not exist, a new
// file is created via the parent inode's NewFile and opened. If it exists, it
// is opened unless O_EXCL was given, in which case EEXIST is returned. When
// the final component is a symlink that was followed, resolution restarts
// from the parent directory.
//
// In every path the inode's Open is called with fs.mu released and a
// temporary reference held on the dentry being opened.
func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	ats := vfs.AccessTypesForOpenFlags(&opts)

	// Do not create new file.
	if opts.Flags&linux.O_CREAT == 0 {
		fs.mu.RLock()
		defer fs.processDeferredDecRefs(ctx)
		d, err := fs.walkExistingLocked(ctx, rp)
		if err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		// Open may block so we need to unlock fs.mu. IncRef d to prevent
		// its destruction while fs.mu is unlocked.
		d.IncRef()
		fs.mu.RUnlock()
		fd, err := d.inode.Open(ctx, rp, d, opts)
		d.DecRef(ctx)
		return fd, err
	}

	// May create new file.
	mustCreate := opts.Flags&linux.O_EXCL != 0
	start := rp.Start().Impl().(*Dentry)
	fs.mu.Lock()
	// unlock releases fs.mu at most once, so it can be called explicitly
	// before a blocking Open and again (as a no-op) from the defer below.
	unlocked := false
	unlock := func() {
		if !unlocked {
			fs.mu.Unlock()
			unlocked = true
		}
	}
	// Process all to-be-decref'd dentries at the end at once.
	// Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked
	// when this is executed.
	defer fs.processDeferredDecRefs(ctx)
	defer unlock()
	// No path components remain: the open refers to the start dentry itself.
	if rp.Done() {
		if rp.MustBeDir() {
			return nil, linuxerr.EISDIR
		}
		if mustCreate {
			return nil, linuxerr.EEXIST
		}
		if err := start.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
			return nil, err
		}
		// Open may block so we need to unlock fs.mu. IncRef d to prevent
		// its destruction while fs.mu is unlocked.
		start.IncRef()
		unlock()
		fd, err := start.inode.Open(ctx, rp, start, opts)
		start.DecRef(ctx)
		return fd, err
	}
	// Resolution restarts here, from the updated start, after a trailing
	// symlink has been followed.
afterTrailingSymlink:
	parent, err := fs.walkParentDirLocked(ctx, rp, start)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Reject attempts to open directories with O_CREAT.
	if rp.MustBeDir() {
		return nil, linuxerr.EISDIR
	}
	pc := rp.Component()
	if pc == "." || pc == ".." {
		return nil, linuxerr.EISDIR
	}
	if len(pc) > linux.NAME_MAX {
		return nil, linuxerr.ENAMETOOLONG
	}
	if parent.VFSDentry().IsDead() {
		return nil, linuxerr.ENOENT
	}
	// Determine whether or not we need to create a file.
	child, followedSymlink, err := fs.stepExistingLocked(ctx, rp, parent)
	if followedSymlink {
		if mustCreate {
			// EEXIST must be returned if an existing symlink is opened with O_EXCL.
			return nil, linuxerr.EEXIST
		}
		if err != nil {
			// If followedSymlink && err != nil, then this symlink resolution error
			// must be handled by the VFS layer.
			return nil, err
		}
		start = parent
		goto afterTrailingSymlink
	}
	// The final component does not exist: create it.
	if linuxerr.Equals(linuxerr.ENOENT, err) {
		// Already checked for searchability above; now check for writability.
		if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
			return nil, err
		}
		if err := rp.Mount().CheckBeginWrite(); err != nil {
			return nil, err
		}
		defer rp.Mount().EndWrite()
		// Create and open the child.
		childI, err := parent.inode.NewFile(ctx, pc, opts)
		if err != nil {
			return nil, err
		}
		var child Dentry
		child.Init(fs, childI)
		parent.insertChild(pc, &child)
		// Open may block so we need to unlock fs.mu. IncRef child to prevent
		// its destruction while fs.mu is unlocked.
		child.IncRef()
		unlock()
		parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
		fd, err := child.inode.Open(ctx, rp, &child, opts)
		child.DecRef(ctx)
		return fd, err
	}
	// Any other resolution error is returned to the caller as-is.
	if err != nil {
		return nil, err
	}
	// Open existing file or follow symlink.
	if mustCreate {
		return nil, linuxerr.EEXIST
	}
	if rp.MustBeDir() && !child.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
		return nil, err
	}
	if child.isDir() {
		// Can't open directories with O_CREAT.
		if opts.Flags&linux.O_CREAT != 0 {
			return nil, linuxerr.EISDIR
		}
		// Can't open directories writably.
		if ats&vfs.MayWrite != 0 {
			return nil, linuxerr.EISDIR
		}
		if opts.Flags&linux.O_DIRECT != 0 {
			return nil, linuxerr.EINVAL
		}
	}
	// Open may block so we need to unlock fs.mu. IncRef child to prevent
	// its destruction while fs.mu is unlocked.
	child.IncRef()
	unlock()
	fd, err := child.inode.Open(ctx, rp, child, opts)
	child.DecRef(ctx)
	return fd, err
}
   636  
   637  // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
   638  func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
   639  	defer fs.processDeferredDecRefs(ctx)
   640  
   641  	fs.mu.RLock()
   642  	d, err := fs.walkExistingLocked(ctx, rp)
   643  	if err != nil {
   644  		fs.mu.RUnlock()
   645  		return "", err
   646  	}
   647  	if !d.isSymlink() {
   648  		fs.mu.RUnlock()
   649  		return "", linuxerr.EINVAL
   650  	}
   651  
   652  	// Inode.Readlink() cannot be called holding fs locks.
   653  	d.IncRef()
   654  	defer d.DecRef(ctx)
   655  	fs.mu.RUnlock()
   656  
   657  	return d.inode.Readlink(ctx, rp.Mount())
   658  }
   659  
// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// The source is the entry oldName in the directory oldParentVD; the
// destination is named by rp (its parent directory plus final component).
//
// Only the RENAME_NOREPLACE flag is supported; any other flag yields EINVAL.
// Source and destination must be on the same mount (EXDEV otherwise). If the
// destination name already exists it is replaced, unless RENAME_NOREPLACE was
// requested, in which case EEXIST is returned. Renaming an entry onto itself
// succeeds without doing anything.
func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	fs.mu.Lock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.Unlock()

	// Resolve the destination directory first to verify that it's on this
	// Mount.
	dstDir, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
	if err != nil {
		return err
	}

	// Only RENAME_NOREPLACE is supported.
	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
		return linuxerr.EINVAL
	}
	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0

	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return linuxerr.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	// Both the source and the destination directory must be writable and
	// searchable by the caller.
	oldParentDir := oldParentVD.Dentry().Impl().(*Dentry).Inode()
	if err := oldParentDir.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := dstDir.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}

	// Look up (and revalidate) the source dentry in its directory.
	srcDirVFSD := oldParentVD.Dentry()
	srcDir := srcDirVFSD.Impl().(*Dentry)
	srcDir.dirMu.Lock()
	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName])
	srcDir.dirMu.Unlock()
	if err != nil {
		return err
	}

	// Can we remove the src dentry?
	if err := checkDeleteLocked(ctx, rp, src); err != nil {
		return err
	}

	// Can we create the dst dentry?
	var dst *Dentry
	newName := rp.Component()
	if newName == "." || newName == ".." {
		if noReplace {
			return linuxerr.EEXIST
		}
		return linuxerr.EBUSY
	}
	if len(newName) > linux.NAME_MAX {
		return linuxerr.ENAMETOOLONG
	}

	err = checkCreateLocked(ctx, rp.Credentials(), newName, dstDir)
	switch {
	case err == nil:
		// Ok, continue with rename as replacement.
	case linuxerr.Equals(linuxerr.EEXIST, err):
		if noReplace {
			// Won't overwrite existing node since RENAME_NOREPLACE was requested.
			return linuxerr.EEXIST
		}
		dst = dstDir.children[newName]
		if dst == nil {
			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", newName, dstDir))
		}
	default:
		return err
	}

	// Renaming an entry to its own name in the same directory is a no-op.
	if srcDir == dstDir && oldName == newName {
		return nil
	}

	// dstVFSD stays nil when there is no existing destination to replace.
	var dstVFSD *vfs.Dentry
	if dst != nil {
		dstVFSD = dst.VFSDentry()
	}

	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	virtfs := rp.VirtualFilesystem()

	// We can't deadlock here due to lock ordering because we're protected from
	// concurrent renames by fs.mu held for writing.
	srcDir.dirMu.Lock()
	defer srcDir.dirMu.Unlock()
	if srcDir != dstDir {
		dstDir.dirMu.Lock()
		defer dstDir.dirMu.Unlock()
	}

	// Prepare the rename in the VFS, perform it on the inodes, and abort the
	// VFS side if the inode-level rename fails.
	srcVFSD := src.VFSDentry()
	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
		return err
	}
	err = srcDir.inode.Rename(ctx, src.name, newName, src.inode, dstDir.inode)
	if err != nil {
		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
		return err
	}
	// Update the dentry tree: move src from srcDir to dstDir under newName.
	delete(srcDir.children, src.name)
	if srcDir != dstDir {
		fs.deferDecRef(srcDir) // child (src) drops ref on old parent.
		dstDir.IncRef()        // child (src) takes a ref on the new parent.
	}
	src.parent = dstDir
	src.name = newName
	if dstDir.children == nil {
		dstDir.children = make(map[string]*Dentry)
	}
	replaced := dstDir.children[newName]
	dstDir.children[newName] = src
	var replaceVFSD *vfs.Dentry
	if replaced != nil {
		// deferDecRef so that fs.mu and dstDir.mu are unlocked by then.
		fs.deferDecRef(replaced)
		replaceVFSD = replaced.VFSDentry()
		replaced.setDeleted()
	}
	vfs.InotifyRename(ctx, src.inode.Watches(), srcDir.inode.Watches(), dstDir.inode.Watches(), oldName, newName, src.isDir())
	virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) // +checklocksforce: to may be nil, that's okay.
	return nil
}
   793  
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
//
// It removes the directory named by the final component of rp from its parent
// directory. The caller needs write and search permission on the parent.
// Notable failures: EINVAL if the final component is ".", ENOTEMPTY if it is
// ".." or the directory's inode reports children, ENOENT if the name is not
// present in the parent, and ENOTDIR if the named file is not a directory.
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.Unlock()
	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
	if err != nil {
		return err
	}
	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := rp.Mount().CheckBeginWrite(); err != nil {
		return err
	}
	defer rp.Mount().EndWrite()
	name := rp.Component()
	if name == "." {
		return linuxerr.EINVAL
	}
	if name == ".." {
		return linuxerr.ENOTEMPTY
	}
	// The child is taken from the parent's cached children map.
	child, ok := parent.children[name]
	if !ok {
		return linuxerr.ENOENT
	}
	if err := checkDeleteLocked(ctx, rp, child); err != nil {
		return err
	}
	// Apply the sticky-bit deletion check using the parent's mode and owner
	// and the child's owner and group.
	if err := vfs.CheckDeleteSticky(
		rp.Credentials(),
		linux.FileMode(parent.inode.Mode()),
		auth.KUID(parent.inode.UID()),
		auth.KUID(child.inode.UID()),
		auth.KGID(child.inode.GID()),
	); err != nil {
		return err
	}
	if !child.isDir() {
		return linuxerr.ENOTDIR
	}
	if child.inode.HasChildren() {
		return linuxerr.ENOTEMPTY
	}
	virtfs := rp.VirtualFilesystem()
	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	// Prepare the deletion in the VFS, remove the directory through the parent
	// inode, and abort the VFS side if that removal fails.
	vfsd := child.VFSDentry()
	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
		return err // +checklocksforce: vfsd is not locked.
	}

	if err := parent.inode.RmDir(ctx, child.name, child.inode); err != nil {
		virtfs.AbortDeleteDentry(vfsd)
		return err
	}
	delete(parent.children, child.name)
	parent.inode.Watches().Notify(ctx, child.name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
	fs.deferDecRef(child)
	virtfs.CommitDeleteDentry(ctx, vfsd)
	child.setDeleted()
	return nil
}
   862  
   863  // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
   864  func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
   865  	fs.mu.RLock()
   866  	defer fs.processDeferredDecRefs(ctx)
   867  	d, err := fs.walkExistingLocked(ctx, rp)
   868  	if err != nil {
   869  		fs.mu.RUnlock()
   870  		return err
   871  	}
   872  	if opts.Stat.Mask == 0 {
   873  		fs.mu.RUnlock()
   874  		return nil
   875  	}
   876  	err = d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
   877  	fs.mu.RUnlock()
   878  	if err != nil {
   879  		return err
   880  	}
   881  	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
   882  		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
   883  	}
   884  	return nil
   885  }
   886  
   887  // StatAt implements vfs.FilesystemImpl.StatAt.
   888  func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
   889  	fs.mu.RLock()
   890  	defer fs.processDeferredDecRefs(ctx)
   891  	defer fs.mu.RUnlock()
   892  	d, err := fs.walkExistingLocked(ctx, rp)
   893  	if err != nil {
   894  		return linux.Statx{}, err
   895  	}
   896  	return d.inode.Stat(ctx, fs.VFSFilesystem(), opts)
   897  }
   898  
   899  // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
   900  func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
   901  	fs.mu.RLock()
   902  	defer fs.processDeferredDecRefs(ctx)
   903  	defer fs.mu.RUnlock()
   904  	d, err := fs.walkExistingLocked(ctx, rp)
   905  	if err != nil {
   906  		return linux.Statfs{}, err
   907  	}
   908  	return d.inode.StatFS(ctx, fs.VFSFilesystem())
   909  }
   910  
   911  // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
   912  func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
   913  	if rp.Done() {
   914  		return linuxerr.EEXIST
   915  	}
   916  	fs.mu.Lock()
   917  	defer fs.processDeferredDecRefs(ctx)
   918  	defer fs.mu.Unlock()
   919  	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   920  	if err != nil {
   921  		return err
   922  	}
   923  	parent.dirMu.Lock()
   924  	defer parent.dirMu.Unlock()
   925  
   926  	pc := rp.Component()
   927  	if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
   928  		return err
   929  	}
   930  	if rp.MustBeDir() {
   931  		return linuxerr.ENOENT
   932  	}
   933  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   934  		return err
   935  	}
   936  	defer rp.Mount().EndWrite()
   937  	childI, err := parent.inode.NewSymlink(ctx, pc, target)
   938  	if err != nil {
   939  		return err
   940  	}
   941  	parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */)
   942  	var child Dentry
   943  	child.Init(fs, childI)
   944  	parent.insertChildLocked(pc, &child)
   945  	return nil
   946  }
   947  
   948  // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
   949  func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
   950  	fs.mu.Lock()
   951  	defer fs.processDeferredDecRefs(ctx)
   952  	defer fs.mu.Unlock()
   953  
   954  	d, err := fs.walkExistingLocked(ctx, rp)
   955  	if err != nil {
   956  		return err
   957  	}
   958  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   959  		return err
   960  	}
   961  	defer rp.Mount().EndWrite()
   962  	if err := checkDeleteLocked(ctx, rp, d); err != nil {
   963  		return err
   964  	}
   965  	if d.isDir() {
   966  		return linuxerr.EISDIR
   967  	}
   968  	virtfs := rp.VirtualFilesystem()
   969  	parentDentry := d.parent
   970  	parentDentry.dirMu.Lock()
   971  	defer parentDentry.dirMu.Unlock()
   972  	mntns := vfs.MountNamespaceFromContext(ctx)
   973  	defer mntns.DecRef(ctx)
   974  	vfsd := d.VFSDentry()
   975  	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
   976  		return err
   977  	}
   978  	if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil {
   979  		virtfs.AbortDeleteDentry(vfsd)
   980  		return err
   981  	}
   982  	delete(parentDentry.children, d.name)
   983  	vfs.InotifyRemoveChild(ctx, d.inode.Watches(), parentDentry.inode.Watches(), d.name)
   984  	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
   985  	fs.deferDecRef(d)
   986  	virtfs.CommitDeleteDentry(ctx, vfsd)
   987  	d.setDeleted()
   988  	return nil
   989  }
   990  
   991  // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
   992  func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
   993  	fs.mu.RLock()
   994  	defer fs.processDeferredDecRefs(ctx)
   995  	defer fs.mu.RUnlock()
   996  	d, err := fs.walkExistingLocked(ctx, rp)
   997  	if err != nil {
   998  		return nil, err
   999  	}
  1000  	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
  1001  		return nil, err
  1002  	}
  1003  	return nil, linuxerr.ECONNREFUSED
  1004  }
  1005  
  1006  // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
  1007  func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
  1008  	fs.mu.RLock()
  1009  	defer fs.processDeferredDecRefs(ctx)
  1010  	defer fs.mu.RUnlock()
  1011  	_, err := fs.walkExistingLocked(ctx, rp)
  1012  	if err != nil {
  1013  		return nil, err
  1014  	}
  1015  	// kernfs currently does not support extended attributes.
  1016  	return nil, linuxerr.ENOTSUP
  1017  }
  1018  
  1019  // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
  1020  func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
  1021  	fs.mu.RLock()
  1022  	defer fs.processDeferredDecRefs(ctx)
  1023  	defer fs.mu.RUnlock()
  1024  	_, err := fs.walkExistingLocked(ctx, rp)
  1025  	if err != nil {
  1026  		return "", err
  1027  	}
  1028  	// kernfs currently does not support extended attributes.
  1029  	return "", linuxerr.ENOTSUP
  1030  }
  1031  
  1032  // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
  1033  func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
  1034  	fs.mu.RLock()
  1035  	defer fs.processDeferredDecRefs(ctx)
  1036  	defer fs.mu.RUnlock()
  1037  	_, err := fs.walkExistingLocked(ctx, rp)
  1038  	if err != nil {
  1039  		return err
  1040  	}
  1041  	// kernfs currently does not support extended attributes.
  1042  	return linuxerr.ENOTSUP
  1043  }
  1044  
  1045  // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
  1046  func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
  1047  	fs.mu.RLock()
  1048  	defer fs.processDeferredDecRefs(ctx)
  1049  	defer fs.mu.RUnlock()
  1050  	_, err := fs.walkExistingLocked(ctx, rp)
  1051  	if err != nil {
  1052  		return err
  1053  	}
  1054  	// kernfs currently does not support extended attributes.
  1055  	return linuxerr.ENOTSUP
  1056  }
  1057  
  1058  // PrependPath implements vfs.FilesystemImpl.PrependPath.
  1059  func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
  1060  	fs.mu.RLock()
  1061  	defer fs.mu.RUnlock()
  1062  	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
  1063  }
  1064  
  1065  func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) {
  1066  	if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
  1067  		// The following is equivalent to vd.DecRef(ctx). This is needed
  1068  		// because if d belongs to this filesystem, we can not DecRef it right
  1069  		// away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we
  1070  		// defer the DecRef to when locks are dropped.
  1071  		vd.Mount().DecRef(ctx)
  1072  		fs.deferDecRef(d)
  1073  	} else {
  1074  		vd.DecRef(ctx)
  1075  	}
  1076  }