gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/kernfs/filesystem.go

gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/kernfs/filesystem.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernfs
    16  
    17  // This file implements vfs.FilesystemImpl for kernfs.
    18  
    19  import (
    20  	"fmt"
    21  
    22  	"gvisor.dev/gvisor/pkg/abi/linux"
    23  	"gvisor.dev/gvisor/pkg/context"
    24  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    25  	"gvisor.dev/gvisor/pkg/fspath"
    26  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    27  	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
    28  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    29  )
    30  
    31  // stepExistingLocked resolves rp.Component() in parent directory vfsd.
    32  //
    33  // stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
    34  //
    35  // Preconditions:
    36  //   - Filesystem.mu must be locked for at least reading.
    37  //   - !rp.Done().
    38  //
    39  // Postcondition: Caller must call fs.processDeferredDecRefs*.
    40  func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) (*Dentry, bool, error) {
    41  	if !d.isDir() {
    42  		return nil, false, linuxerr.ENOTDIR
    43  	}
    44  	// Directory searchable?
    45  	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
    46  		return nil, false, err
    47  	}
    48  	name := rp.Component()
    49  	// Revalidation must be skipped if name is "." or ".."; d or its parent
    50  	// respectively can't be expected to transition from invalidated back to
    51  	// valid, so detecting invalidation and retrying would loop forever. This
    52  	// is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
    53  	// calls d_revalidate(), but walk_component() => handle_dots() does not.
    54  	if name == "." {
    55  		rp.Advance()
    56  		return d, false, nil
    57  	}
    58  	if name == ".." {
    59  		if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil {
    60  			return nil, false, err
    61  		} else if isRoot || d.parent.Load() == nil {
    62  			rp.Advance()
    63  			return d, false, nil
    64  		}
    65  		if err := rp.CheckMount(ctx, d.Parent().VFSDentry()); err != nil {
    66  			return nil, false, err
    67  		}
    68  		rp.Advance()
    69  		return d.parent.Load(), false, nil
    70  	}
    71  	if len(name) > linux.NAME_MAX {
    72  		return nil, false, linuxerr.ENAMETOOLONG
    73  	}
    74  	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name)
    75  	if err != nil {
    76  		return nil, false, err
    77  	}
    78  	if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil {
    79  		return nil, false, err
    80  	}
    81  	// Resolve any symlink at current path component.
    82  	if rp.ShouldFollowSymlink() && next.isSymlink() {
    83  		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
    84  		if err != nil {
    85  			return nil, false, err
    86  		}
    87  		if targetVD.Ok() {
    88  			followedTarget, err := rp.HandleJump(targetVD)
    89  			fs.deferDecRefVD(ctx, targetVD)
    90  			return d, followedTarget, err
    91  		}
    92  		followedSymlink, err := rp.HandleSymlink(targetPathname)
    93  		return d, followedSymlink, err
    94  	}
    95  	rp.Advance()
    96  	return next, false, nil
    97  }
    98  
    99  // revalidateChildLocked is called to look up the child of parent named name,
   100  // while verifying that any cached lookups are still correct.
   101  //
   102  // Preconditions:
   103  //   - Filesystem.mu must be locked for at least reading.
   104  //   - parent.isDir().
   105  //   - name is not "." or "..".
   106  //
   107  // Postconditions: Caller must call fs.processDeferredDecRefs*.
   108  func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string) (*Dentry, error) {
   109  	parent.dirMu.Lock()
   110  	defer parent.dirMu.Unlock() // may be temporarily unlocked and re-locked below
   111  	child := parent.children[name]
   112  	for child != nil {
   113  		// Cached dentry exists, revalidate.
   114  		if child.inode.Valid(ctx, parent, name) {
   115  			break
   116  		}
   117  		delete(parent.children, child.name)
   118  		parent.dirMu.Unlock()
   119  		fs.invalidateRemovedChildLocked(ctx, vfsObj, child)
   120  		parent.dirMu.Lock()
   121  		// Check for concurrent insertion of a new cached dentry.
   122  		child = parent.children[name]
   123  	}
   124  	if child == nil {
   125  		// Dentry isn't cached; it either doesn't exist or failed revalidation.
   126  		// Attempt to resolve it via Lookup.
   127  		childInode, err := parent.inode.Lookup(ctx, name)
   128  		if err != nil {
   129  			return nil, err
   130  		}
   131  		var newChild Dentry
   132  		newChild.Init(fs, childInode) // childInode's ref is transferred to newChild.
   133  		parent.insertChildLocked(name, &newChild)
   134  		child = &newChild
   135  
   136  		// Drop the ref on newChild. This will cause the dentry to get pruned
   137  		// from the dentry tree by the end of current filesystem operation
   138  		// (before returning to the VFS layer) if another ref is not picked on
   139  		// this dentry.
   140  		if !childInode.Keep() {
   141  			fs.deferDecRef(&newChild)
   142  		}
   143  	}
   144  	return child, nil
   145  }
   146  
   147  // Preconditions:
   148  //   - Filesystem.mu must be locked for at least reading.
   149  //   - d has been removed from its parent.children.
   150  //
   151  // Postconditions: Caller must call fs.processDeferredDecRefs*.
   152  func (fs *Filesystem) invalidateRemovedChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, d *Dentry) {
   153  	toInvalidate := []*Dentry{d}
   154  	for len(toInvalidate) != 0 {
   155  		d := toInvalidate[len(toInvalidate)-1]
   156  		toInvalidate = toInvalidate[:len(toInvalidate)-1]
   157  
   158  		if d.inode.Keep() {
   159  			fs.deferDecRef(d)
   160  		}
   161  		rcs := vfsObj.InvalidateDentry(ctx, d.VFSDentry())
   162  		for _, rc := range rcs {
   163  			fs.deferDecRef(rc)
   164  		}
   165  
   166  		if d.isDir() {
   167  			d.dirMu.Lock()
   168  			for name, child := range d.children {
   169  				toInvalidate = append(toInvalidate, child)
   170  				delete(d.children, name)
   171  			}
   172  			d.dirMu.Unlock()
   173  		}
   174  	}
   175  }
   176  
   177  // walkExistingLocked resolves rp to an existing file.
   178  //
   179  // walkExistingLocked is loosely analogous to Linux's
   180  // fs/namei.c:path_lookupat().
   181  //
   182  // Preconditions: Filesystem.mu must be locked for at least reading.
   183  //
   184  // Postconditions: Caller must call fs.processDeferredDecRefs*.
   185  func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
   186  	d := rp.Start().Impl().(*Dentry)
   187  	for !rp.Done() {
   188  		var err error
   189  		d, _, err = fs.stepExistingLocked(ctx, rp, d)
   190  		if err != nil {
   191  			return nil, err
   192  		}
   193  	}
   194  	if rp.MustBeDir() && !d.isDir() {
   195  		return nil, linuxerr.ENOTDIR
   196  	}
   197  	return d, nil
   198  }
   199  
   200  // walkParentDirLocked resolves all but the last path component of rp to an
   201  // existing directory. It does not check that the returned directory is
   202  // searchable by the provider of rp.
   203  //
   204  // walkParentDirLocked is loosely analogous to Linux's
   205  // fs/namei.c:path_parentat().
   206  //
   207  // Preconditions:
   208  //   - Filesystem.mu must be locked for at least reading.
   209  //   - !rp.Done().
   210  //
   211  // Postconditions: Caller must call fs.processDeferredDecRefs*.
   212  func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) (*Dentry, error) {
   213  	for !rp.Final() {
   214  		var err error
   215  		d, _, err = fs.stepExistingLocked(ctx, rp, d)
   216  		if err != nil {
   217  			return nil, err
   218  		}
   219  	}
   220  	if !d.isDir() {
   221  		return nil, linuxerr.ENOTDIR
   222  	}
   223  	return d, nil
   224  }
   225  
   226  // checkCreateLocked checks that a file named rp.Component() may be created in
   227  // directory parent, then returns rp.Component().
   228  //
   229  // Preconditions:
   230  //   - Filesystem.mu must be locked for at least reading.
   231  //   - isDir(parentInode) == true.
   232  func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error {
   233  	// Order of checks is important. First check if parent directory can be
   234  	// executed, then check for existence, and lastly check if mount is writable.
   235  	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayExec); err != nil {
   236  		return err
   237  	}
   238  	if name == "." || name == ".." {
   239  		return linuxerr.EEXIST
   240  	}
   241  	if len(name) > linux.NAME_MAX {
   242  		return linuxerr.ENAMETOOLONG
   243  	}
   244  	if _, ok := parent.children[name]; ok {
   245  		return linuxerr.EEXIST
   246  	}
   247  	if parent.VFSDentry().IsDead() {
   248  		return linuxerr.ENOENT
   249  	}
   250  	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil {
   251  		return err
   252  	}
   253  	return nil
   254  }
   255  
   256  // checkDeleteLocked checks that the file represented by vfsd may be deleted.
   257  //
   258  // Preconditions: Filesystem.mu must be locked for at least reading.
   259  func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error {
   260  	parent := d.parent.Load()
   261  	if parent == nil {
   262  		return linuxerr.EBUSY
   263  	}
   264  	if parent.vfsd.IsDead() {
   265  		return linuxerr.ENOENT
   266  	}
   267  	if d.vfsd.IsDead() {
   268  		// This implies a duplicate unlink on an orphaned dentry, where the path
   269  		// resolution was successful. This is possible when the orphan is
   270  		// replaced by a new node of the same name (so the path resolution
   271  		// succeeds), and the orphan is unlinked again through a dirfd using
   272  		// unlinkat(2) (so the unlink refers to the orphan and not the new
   273  		// node). See Linux, fs/namei.c:do_rmdir().
   274  		return linuxerr.EINVAL
   275  	}
   276  	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
   277  		return err
   278  	}
   279  	return nil
   280  }
   281  
   282  // Release implements vfs.FilesystemImpl.Release.
   283  func (fs *Filesystem) Release(ctx context.Context) {
   284  	root := fs.root
   285  	if root == nil {
   286  		return
   287  	}
   288  	fs.mu.Lock()
   289  	root.releaseKeptDentriesLocked(ctx)
   290  	for fs.cachedDentriesLen != 0 {
   291  		fs.evictCachedDentryLocked(ctx)
   292  	}
   293  	fs.mu.Unlock()
   294  	// Drop ref acquired in Dentry.InitRoot().
   295  	root.DecRef(ctx)
   296  }
   297  
   298  // releaseKeptDentriesLocked recursively drops all dentry references created by
   299  // Lookup when Dentry.inode.Keep() is true.
   300  //
   301  // Precondition: Filesystem.mu is held.
   302  func (d *Dentry) releaseKeptDentriesLocked(ctx context.Context) {
   303  	if d.inode.Keep() && d != d.fs.root {
   304  		d.decRefLocked(ctx)
   305  	}
   306  
   307  	if d.isDir() {
   308  		var children []*Dentry
   309  		d.dirMu.Lock()
   310  		for _, child := range d.children {
   311  			children = append(children, child)
   312  		}
   313  		d.dirMu.Unlock()
   314  		for _, child := range children {
   315  			child.releaseKeptDentriesLocked(ctx)
   316  		}
   317  	}
   318  }
   319  
   320  // Sync implements vfs.FilesystemImpl.Sync.
   321  func (fs *Filesystem) Sync(ctx context.Context) error {
   322  	// All filesystem state is in-memory.
   323  	return nil
   324  }
   325  
   326  // AccessAt implements vfs.Filesystem.Impl.AccessAt.
   327  func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
   328  	fs.mu.RLock()
   329  	defer fs.processDeferredDecRefs(ctx)
   330  	defer fs.mu.RUnlock()
   331  
   332  	d, err := fs.walkExistingLocked(ctx, rp)
   333  	if err != nil {
   334  		return err
   335  	}
   336  	if err := d.inode.CheckPermissions(ctx, creds, ats); err != nil {
   337  		return err
   338  	}
   339  	if ats.MayWrite() && rp.Mount().ReadOnly() {
   340  		return linuxerr.EROFS
   341  	}
   342  	return nil
   343  }
   344  
   345  // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
   346  func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
   347  	fs.mu.RLock()
   348  	defer fs.processDeferredDecRefs(ctx)
   349  	defer fs.mu.RUnlock()
   350  	d, err := fs.walkExistingLocked(ctx, rp)
   351  	if err != nil {
   352  		return nil, err
   353  	}
   354  
   355  	if opts.CheckSearchable {
   356  		if !d.isDir() {
   357  			return nil, linuxerr.ENOTDIR
   358  		}
   359  		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
   360  			return nil, err
   361  		}
   362  	}
   363  	vfsd := d.VFSDentry()
   364  	vfsd.IncRef() // Ownership transferred to caller.
   365  	return vfsd, nil
   366  }
   367  
   368  // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
   369  func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
   370  	fs.mu.RLock()
   371  	defer fs.processDeferredDecRefs(ctx)
   372  	defer fs.mu.RUnlock()
   373  	d, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   374  	if err != nil {
   375  		return nil, err
   376  	}
   377  	d.IncRef() // Ownership transferred to caller.
   378  	return d.VFSDentry(), nil
   379  }
   380  
   381  // LinkAt implements vfs.FilesystemImpl.LinkAt.
   382  func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
   383  	if rp.Done() {
   384  		return linuxerr.EEXIST
   385  	}
   386  	fs.mu.Lock()
   387  	defer fs.processDeferredDecRefs(ctx)
   388  	defer fs.mu.Unlock()
   389  	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   390  	if err != nil {
   391  		return err
   392  	}
   393  
   394  	if rp.Mount() != vd.Mount() {
   395  		return linuxerr.EXDEV
   396  	}
   397  	inode := vd.Dentry().Impl().(*Dentry).Inode()
   398  	if inode.Mode().IsDir() {
   399  		return linuxerr.EPERM
   400  	}
   401  	if err := vfs.MayLink(rp.Credentials(), inode.Mode(), inode.UID(), inode.GID()); err != nil {
   402  		return err
   403  	}
   404  	parent.dirMu.Lock()
   405  	defer parent.dirMu.Unlock()
   406  	pc := rp.Component()
   407  	if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
   408  		return err
   409  	}
   410  	if rp.MustBeDir() {
   411  		return linuxerr.ENOENT
   412  	}
   413  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   414  		return err
   415  	}
   416  	defer rp.Mount().EndWrite()
   417  
   418  	childI, err := parent.inode.NewLink(ctx, pc, inode)
   419  	if err != nil {
   420  		return err
   421  	}
   422  	parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */)
   423  	inode.Watches().Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
   424  	var child Dentry
   425  	child.Init(fs, childI)
   426  	parent.insertChildLocked(pc, &child)
   427  	return nil
   428  }
   429  
   430  // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
   431  func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
   432  	if rp.Done() {
   433  		return linuxerr.EEXIST
   434  	}
   435  	fs.mu.Lock()
   436  	defer fs.processDeferredDecRefs(ctx)
   437  	defer fs.mu.Unlock()
   438  	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   439  	if err != nil {
   440  		return err
   441  	}
   442  
   443  	parent.dirMu.Lock()
   444  	defer parent.dirMu.Unlock()
   445  	pc := rp.Component()
   446  	if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
   447  		return err
   448  	}
   449  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   450  		return err
   451  	}
   452  	defer rp.Mount().EndWrite()
   453  	childI, err := parent.inode.NewDir(ctx, pc, opts)
   454  	if err != nil {
   455  		if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) {
   456  			return err
   457  		}
   458  		childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode)
   459  	}
   460  	var child Dentry
   461  	child.Init(fs, childI)
   462  	parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE|linux.IN_ISDIR, 0, vfs.InodeEvent, false /* unlinked */)
   463  	parent.insertChildLocked(pc, &child)
   464  	return nil
   465  }
   466  
   467  // MknodAt implements vfs.FilesystemImpl.MknodAt.
   468  func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
   469  	if rp.Done() {
   470  		return linuxerr.EEXIST
   471  	}
   472  	fs.mu.Lock()
   473  	defer fs.processDeferredDecRefs(ctx)
   474  	defer fs.mu.Unlock()
   475  	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   476  	if err != nil {
   477  		return err
   478  	}
   479  
   480  	parent.dirMu.Lock()
   481  	defer parent.dirMu.Unlock()
   482  	pc := rp.Component()
   483  	if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
   484  		return err
   485  	}
   486  	if rp.MustBeDir() {
   487  		return linuxerr.ENOENT
   488  	}
   489  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   490  		return err
   491  	}
   492  	defer rp.Mount().EndWrite()
   493  	newI, err := parent.inode.NewNode(ctx, pc, opts)
   494  	if err != nil {
   495  		return err
   496  	}
   497  	parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */)
   498  	var newD Dentry
   499  	newD.Init(fs, newI)
   500  	parent.insertChildLocked(pc, &newD)
   501  	return nil
   502  }
   503  
// OpenAt implements vfs.FilesystemImpl.OpenAt.
//
// Without O_CREAT, rp is resolved to an existing file with fs.mu held for
// reading, permissions for the requested access types are checked, and the
// inode's Open is called after fs.mu has been released.
//
// With O_CREAT, fs.mu is held for writing while the parent directory and the
// final component are resolved. If the final component does not exist
// (stepExistingLocked reports ENOENT), a new file is created through the
// parent inode's NewFile and opened; otherwise the existing file is opened,
// unless O_EXCL was given, in which case EEXIST is returned. In every case
// the inode's Open is called only after fs.mu has been released, with an
// extra reference held on the dentry for the duration of the call.
func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	ats := vfs.AccessTypesForOpenFlags(&opts)

	// Do not create new file.
	if opts.Flags&linux.O_CREAT == 0 {
		fs.mu.RLock()
		// fs.mu is released explicitly on every path below, so it is no
		// longer held when this deferred call runs.
		defer fs.processDeferredDecRefs(ctx)
		d, err := fs.walkExistingLocked(ctx, rp)
		if err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		// Open may block so we need to unlock fs.mu. IncRef d to prevent
		// its destruction while fs.mu is unlocked.
		d.IncRef()
		fs.mu.RUnlock()
		fd, err := d.inode.Open(ctx, rp, d, opts)
		d.DecRef(ctx)
		return fd, err
	}

	// May create new file.
	mustCreate := opts.Flags&linux.O_EXCL != 0
	start := rp.Start().Impl().(*Dentry)
	fs.mu.Lock()
	// unlock releases fs.mu at most once, so it can be called explicitly
	// before a blocking Open and again, harmlessly, from the defer below.
	unlocked := false
	unlock := func() {
		if !unlocked {
			fs.mu.Unlock()
			unlocked = true
		}
	}
	// Process all to-be-decref'd dentries at the end at once.
	// Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked
	// when this is executed.
	defer fs.processDeferredDecRefs(ctx)
	defer unlock()
	// No path component left: rp refers to the start dentry itself.
	if rp.Done() {
		if rp.MustBeDir() {
			return nil, linuxerr.EISDIR
		}
		if mustCreate {
			return nil, linuxerr.EEXIST
		}
		if err := start.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
			return nil, err
		}
		// Open may block so we need to unlock fs.mu. IncRef d to prevent
		// its destruction while fs.mu is unlocked.
		start.IncRef()
		unlock()
		fd, err := start.inode.Open(ctx, rp, start, opts)
		start.DecRef(ctx)
		return fd, err
	}
	// Resolution restarts here, from the directory containing the link,
	// whenever the final component turned out to be a symlink that was
	// followed.
afterTrailingSymlink:
	parent, err := fs.walkParentDirLocked(ctx, rp, start)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Reject attempts to open directories with O_CREAT.
	if rp.MustBeDir() {
		return nil, linuxerr.EISDIR
	}
	pc := rp.Component()
	if pc == "." || pc == ".." {
		return nil, linuxerr.EISDIR
	}
	if len(pc) > linux.NAME_MAX {
		return nil, linuxerr.ENAMETOOLONG
	}
	if parent.VFSDentry().IsDead() {
		return nil, linuxerr.ENOENT
	}
	// Determine whether or not we need to create a file.
	child, followedSymlink, err := fs.stepExistingLocked(ctx, rp, parent)
	if followedSymlink {
		if mustCreate {
			// EEXIST must be returned if an existing symlink is opened with O_EXCL.
			return nil, linuxerr.EEXIST
		}
		if err != nil {
			// If followedSymlink && err != nil, then this symlink resolution error
			// must be handled by the VFS layer.
			return nil, err
		}
		start = parent
		goto afterTrailingSymlink
	}
	// The final component does not exist: create it.
	if linuxerr.Equals(linuxerr.ENOENT, err) {
		// Already checked for searchability above; now check for writability.
		if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
			return nil, err
		}
		if err := rp.Mount().CheckBeginWrite(); err != nil {
			return nil, err
		}
		defer rp.Mount().EndWrite()
		// Create and open the child.
		childI, err := parent.inode.NewFile(ctx, pc, opts)
		if err != nil {
			return nil, err
		}
		var child Dentry
		child.Init(fs, childI)
		parent.insertChild(pc, &child)
		// Open may block so we need to unlock fs.mu. IncRef child to prevent
		// its destruction while fs.mu is unlocked.
		child.IncRef()
		unlock()
		// The creation event is delivered after fs.mu has been released.
		parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
		fd, err := child.inode.Open(ctx, rp, &child, opts)
		child.DecRef(ctx)
		return fd, err
	}
	if err != nil {
		return nil, err
	}
	// Open existing file or follow symlink.
	if mustCreate {
		return nil, linuxerr.EEXIST
	}
	if rp.MustBeDir() && !child.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
		return nil, err
	}
	if child.isDir() {
		// Can't open directories with O_CREAT.
		// NOTE(review): O_CREAT is always set on this path (the branch for
		// its absence returns above), so the two checks that follow this one
		// look unreachable — confirm before relying on them.
		if opts.Flags&linux.O_CREAT != 0 {
			return nil, linuxerr.EISDIR
		}
		// Can't open directories writably.
		if ats&vfs.MayWrite != 0 {
			return nil, linuxerr.EISDIR
		}
		if opts.Flags&linux.O_DIRECT != 0 {
			return nil, linuxerr.EINVAL
		}
	}
	// Open may block so we need to unlock fs.mu. IncRef child to prevent
	// its destruction while fs.mu is unlocked.
	child.IncRef()
	unlock()
	fd, err := child.inode.Open(ctx, rp, child, opts)
	child.DecRef(ctx)
	return fd, err
}
   662  
   663  // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
   664  func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
   665  	defer fs.processDeferredDecRefs(ctx)
   666  
   667  	fs.mu.RLock()
   668  	d, err := fs.walkExistingLocked(ctx, rp)
   669  	if err != nil {
   670  		fs.mu.RUnlock()
   671  		return "", err
   672  	}
   673  	if !d.isSymlink() {
   674  		fs.mu.RUnlock()
   675  		return "", linuxerr.EINVAL
   676  	}
   677  
   678  	// Inode.Readlink() cannot be called holding fs locks.
   679  	d.IncRef()
   680  	defer d.DecRef(ctx)
   681  	fs.mu.RUnlock()
   682  
   683  	return d.inode.Readlink(ctx, rp.Mount())
   684  }
   685  
// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// It moves the child called oldName of the directory oldParentVD to the name
// given by the last component of rp, inside the directory the rest of rp
// resolves to. An existing dentry at the destination is replaced unless
// opts.Flags contains RENAME_NOREPLACE, which is the only flag accepted; any
// other flag yields EINVAL. The whole operation runs with fs.mu held for
// writing.
//
// Errors produced directly here include EXDEV if the source and destination
// are on different mounts, EEXIST/EBUSY if the new name is "." or ".."
// (with/without RENAME_NOREPLACE), ENAMETOOLONG if the new name exceeds
// linux.NAME_MAX, and EEXIST if the destination exists and RENAME_NOREPLACE
// was requested. Renaming a file onto itself succeeds without doing anything.
func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	fs.mu.Lock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.Unlock()

	// Resolve the destination directory first to verify that it's on this
	// Mount.
	dstDir, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
	if err != nil {
		return err
	}

	// Only RENAME_NOREPLACE is supported.
	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
		return linuxerr.EINVAL
	}
	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0

	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return linuxerr.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	// Both the source and the destination directory must be writable and
	// searchable by the caller.
	oldParentDir := oldParentVD.Dentry().Impl().(*Dentry).Inode()
	if err := oldParentDir.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := dstDir.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}

	// Look up (and revalidate) the dentry being renamed.
	srcDirVFSD := oldParentVD.Dentry()
	srcDir := srcDirVFSD.Impl().(*Dentry)
	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName)
	if err != nil {
		return err
	}

	// Can we remove the src dentry?
	if err := checkDeleteLocked(ctx, rp, src); err != nil {
		return err
	}

	// Can we create the dst dentry?
	var dst *Dentry
	newName := rp.Component()
	if newName == "." || newName == ".." {
		if noReplace {
			return linuxerr.EEXIST
		}
		return linuxerr.EBUSY
	}
	if len(newName) > linux.NAME_MAX {
		return linuxerr.ENAMETOOLONG
	}

	// checkCreateLocked doubles as an existence probe here: EEXIST means
	// there is a dentry at the destination that the rename would replace.
	err = checkCreateLocked(ctx, rp.Credentials(), newName, dstDir)
	switch {
	case err == nil:
		// Ok, continue with rename as replacement.
	case linuxerr.Equals(linuxerr.EEXIST, err):
		if noReplace {
			// Won't overwrite existing node since RENAME_NOREPLACE was requested.
			return linuxerr.EEXIST
		}
		dst = dstDir.children[newName]
		if dst == nil {
			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", newName, dstDir))
		}
	default:
		return err
	}

	// Source and destination are the same entry: nothing to do.
	if srcDir == dstDir && oldName == newName {
		return nil
	}

	// dstVFSD stays nil when there is nothing to replace.
	var dstVFSD *vfs.Dentry
	if dst != nil {
		dstVFSD = dst.VFSDentry()
	}

	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	virtfs := rp.VirtualFilesystem()

	// We can't deadlock here due to lock ordering because we're protected from
	// concurrent renames by fs.mu held for writing.
	srcDir.dirMu.Lock()
	defer srcDir.dirMu.Unlock()
	if srcDir != dstDir {
		dstDir.dirMu.Lock()
		defer dstDir.dirMu.Unlock()
	}

	// Prepare the rename in the VFS, let the inode implementation perform
	// it, and abort the VFS side if the implementation fails.
	srcVFSD := src.VFSDentry()
	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
		return err
	}
	err = srcDir.inode.Rename(ctx, src.name, newName, src.inode, dstDir.inode)
	if err != nil {
		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
		return err
	}
	// The rename succeeded; update the dentry tree to match.
	delete(srcDir.children, src.name)
	if srcDir != dstDir {
		fs.deferDecRef(srcDir) // child (src) drops ref on old parent.
		dstDir.IncRef()        // child (src) takes a ref on the new parent.
	}
	src.parent.Store(dstDir)
	src.name = newName
	if dstDir.children == nil {
		dstDir.children = make(map[string]*Dentry)
	}
	replaced := dstDir.children[newName]
	dstDir.children[newName] = src
	var replaceVFSD *vfs.Dentry
	if replaced != nil {
		// deferDecRef so that fs.mu and dstDir.mu are unlocked by then.
		fs.deferDecRef(replaced)
		replaceVFSD = replaced.VFSDentry()
		replaced.setDeleted()
	}
	vfs.InotifyRename(ctx, src.inode.Watches(), srcDir.inode.Watches(), dstDir.inode.Watches(), oldName, newName, src.isDir())
	for _, rc := range virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) { // +checklocksforce: to may be nil, that's okay.
		fs.deferDecRef(rc)
	}
	return nil
}
   819  
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
//
// It removes the directory named by the last component of rp from the
// directory the rest of rp resolves to, with fs.mu held for writing. Errors
// produced directly here include EINVAL if the name is ".", ENOTEMPTY if it
// is ".." or the directory's inode reports children, ENOENT if the parent
// has no cached child of that name, and ENOTDIR if the child is not a
// directory.
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.Unlock()
	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
	if err != nil {
		return err
	}
	// Removing an entry requires write and search permission on the parent.
	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := rp.Mount().CheckBeginWrite(); err != nil {
		return err
	}
	defer rp.Mount().EndWrite()
	name := rp.Component()
	if name == "." {
		return linuxerr.EINVAL
	}
	if name == ".." {
		return linuxerr.ENOTEMPTY
	}
	// NOTE(review): parent.children is read here before parent.dirMu is
	// taken further down; presumably this relies on fs.mu being held for
	// writing — confirm.
	child, ok := parent.children[name]
	if !ok {
		return linuxerr.ENOENT
	}
	if err := checkDeleteLocked(ctx, rp, child); err != nil {
		return err
	}
	if err := vfs.CheckDeleteSticky(
		rp.Credentials(),
		linux.FileMode(parent.inode.Mode()),
		auth.KUID(parent.inode.UID()),
		auth.KUID(child.inode.UID()),
		auth.KGID(child.inode.GID()),
	); err != nil {
		return err
	}
	if !child.isDir() {
		return linuxerr.ENOTDIR
	}
	if child.inode.HasChildren() {
		return linuxerr.ENOTEMPTY
	}
	virtfs := rp.VirtualFilesystem()
	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	// Prepare the deletion in the VFS, let the inode implementation perform
	// it, and abort the VFS side if the implementation fails.
	vfsd := child.VFSDentry()
	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
		return err // +checklocksforce: vfsd is not locked.
	}

	if err := parent.inode.RmDir(ctx, child.name, child.inode); err != nil {
		virtfs.AbortDeleteDentry(vfsd)
		return err
	}
	// The removal succeeded; drop the child from the dentry tree and notify
	// watchers of the parent.
	delete(parent.children, child.name)
	parent.inode.Watches().Notify(ctx, child.name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
	fs.deferDecRef(child)
	rcs := virtfs.CommitDeleteDentry(ctx, vfsd)
	for _, rc := range rcs {
		fs.deferDecRef(rc)
	}
	child.setDeleted()
	return nil
}
   891  
   892  // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
   893  func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
   894  	fs.mu.RLock()
   895  	defer fs.processDeferredDecRefs(ctx)
   896  	d, err := fs.walkExistingLocked(ctx, rp)
   897  	if err != nil {
   898  		fs.mu.RUnlock()
   899  		return err
   900  	}
   901  	if opts.Stat.Mask == 0 {
   902  		fs.mu.RUnlock()
   903  		return nil
   904  	}
   905  	err = d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
   906  	fs.mu.RUnlock()
   907  	if err != nil {
   908  		return err
   909  	}
   910  	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
   911  		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
   912  	}
   913  	return nil
   914  }
   915  
   916  // StatAt implements vfs.FilesystemImpl.StatAt.
   917  func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
   918  	fs.mu.RLock()
   919  	defer fs.processDeferredDecRefs(ctx)
   920  	defer fs.mu.RUnlock()
   921  	d, err := fs.walkExistingLocked(ctx, rp)
   922  	if err != nil {
   923  		return linux.Statx{}, err
   924  	}
   925  	return d.inode.Stat(ctx, fs.VFSFilesystem(), opts)
   926  }
   927  
   928  // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
   929  func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
   930  	fs.mu.RLock()
   931  	defer fs.processDeferredDecRefs(ctx)
   932  	defer fs.mu.RUnlock()
   933  	d, err := fs.walkExistingLocked(ctx, rp)
   934  	if err != nil {
   935  		return linux.Statfs{}, err
   936  	}
   937  	return d.inode.StatFS(ctx, fs.VFSFilesystem())
   938  }
   939  
   940  // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
   941  func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
   942  	if rp.Done() {
   943  		return linuxerr.EEXIST
   944  	}
   945  	fs.mu.Lock()
   946  	defer fs.processDeferredDecRefs(ctx)
   947  	defer fs.mu.Unlock()
   948  	parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry))
   949  	if err != nil {
   950  		return err
   951  	}
   952  	parent.dirMu.Lock()
   953  	defer parent.dirMu.Unlock()
   954  
   955  	pc := rp.Component()
   956  	if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
   957  		return err
   958  	}
   959  	if rp.MustBeDir() {
   960  		return linuxerr.ENOENT
   961  	}
   962  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   963  		return err
   964  	}
   965  	defer rp.Mount().EndWrite()
   966  	childI, err := parent.inode.NewSymlink(ctx, pc, target)
   967  	if err != nil {
   968  		return err
   969  	}
   970  	parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */)
   971  	var child Dentry
   972  	child.Init(fs, childI)
   973  	parent.insertChildLocked(pc, &child)
   974  	return nil
   975  }
   976  
   977  // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
   978  func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
   979  	fs.mu.Lock()
   980  	defer fs.processDeferredDecRefs(ctx)
   981  	defer fs.mu.Unlock()
   982  
   983  	d, err := fs.walkExistingLocked(ctx, rp)
   984  	if err != nil {
   985  		return err
   986  	}
   987  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   988  		return err
   989  	}
   990  	defer rp.Mount().EndWrite()
   991  	if err := checkDeleteLocked(ctx, rp, d); err != nil {
   992  		return err
   993  	}
   994  	if d.isDir() {
   995  		return linuxerr.EISDIR
   996  	}
   997  	virtfs := rp.VirtualFilesystem()
   998  	parentDentry := d.parent.Load()
   999  	parentDentry.dirMu.Lock()
  1000  	defer parentDentry.dirMu.Unlock()
  1001  	mntns := vfs.MountNamespaceFromContext(ctx)
  1002  	defer mntns.DecRef(ctx)
  1003  	vfsd := d.VFSDentry()
  1004  	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
  1005  		return err
  1006  	}
  1007  	if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil {
  1008  		virtfs.AbortDeleteDentry(vfsd)
  1009  		return err
  1010  	}
  1011  	delete(parentDentry.children, d.name)
  1012  	vfs.InotifyRemoveChild(ctx, d.inode.Watches(), parentDentry.inode.Watches(), d.name)
  1013  	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
  1014  	fs.deferDecRef(d)
  1015  	rcs := virtfs.CommitDeleteDentry(ctx, vfsd)
  1016  	for _, rc := range rcs {
  1017  		fs.deferDecRef(rc)
  1018  	}
  1019  	d.setDeleted()
  1020  	return nil
  1021  }
  1022  
  1023  // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
  1024  func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
  1025  	fs.mu.RLock()
  1026  	defer fs.processDeferredDecRefs(ctx)
  1027  	defer fs.mu.RUnlock()
  1028  	d, err := fs.walkExistingLocked(ctx, rp)
  1029  	if err != nil {
  1030  		return nil, err
  1031  	}
  1032  	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
  1033  		return nil, err
  1034  	}
  1035  	return nil, linuxerr.ECONNREFUSED
  1036  }
  1037  
  1038  // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
  1039  func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
  1040  	fs.mu.RLock()
  1041  	defer fs.processDeferredDecRefs(ctx)
  1042  	defer fs.mu.RUnlock()
  1043  	_, err := fs.walkExistingLocked(ctx, rp)
  1044  	if err != nil {
  1045  		return nil, err
  1046  	}
  1047  	// kernfs currently does not support extended attributes.
  1048  	return nil, linuxerr.ENOTSUP
  1049  }
  1050  
  1051  // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
  1052  func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
  1053  	fs.mu.RLock()
  1054  	defer fs.processDeferredDecRefs(ctx)
  1055  	defer fs.mu.RUnlock()
  1056  	_, err := fs.walkExistingLocked(ctx, rp)
  1057  	if err != nil {
  1058  		return "", err
  1059  	}
  1060  	// kernfs currently does not support extended attributes.
  1061  	return "", linuxerr.ENOTSUP
  1062  }
  1063  
  1064  // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
  1065  func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
  1066  	fs.mu.RLock()
  1067  	defer fs.processDeferredDecRefs(ctx)
  1068  	defer fs.mu.RUnlock()
  1069  	_, err := fs.walkExistingLocked(ctx, rp)
  1070  	if err != nil {
  1071  		return err
  1072  	}
  1073  	// kernfs currently does not support extended attributes.
  1074  	return linuxerr.ENOTSUP
  1075  }
  1076  
  1077  // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
  1078  func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
  1079  	fs.mu.RLock()
  1080  	defer fs.processDeferredDecRefs(ctx)
  1081  	defer fs.mu.RUnlock()
  1082  	_, err := fs.walkExistingLocked(ctx, rp)
  1083  	if err != nil {
  1084  		return err
  1085  	}
  1086  	// kernfs currently does not support extended attributes.
  1087  	return linuxerr.ENOTSUP
  1088  }
  1089  
  1090  // PrependPath implements vfs.FilesystemImpl.PrependPath.
  1091  func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
  1092  	fs.mu.RLock()
  1093  	defer fs.mu.RUnlock()
  1094  	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
  1095  }
  1096  
  1097  func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) {
  1098  	if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
  1099  		// The following is equivalent to vd.DecRef(ctx). This is needed
  1100  		// because if d belongs to this filesystem, we can not DecRef it right
  1101  		// away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we
  1102  		// defer the DecRef to when locks are dropped.
  1103  		vd.Mount().DecRef(ctx)
  1104  		fs.deferDecRef(d)
  1105  	} else {
  1106  		vd.DecRef(ctx)
  1107  	}
  1108  }
  1109  
  1110  // IsDescendant implements vfs.FilesystemImpl.IsDescendant.
  1111  func (fs *Filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool {
  1112  	return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*Dentry))
  1113  }