github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/tmpfs/filesystem.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tmpfs
    16  
    17  import (
    18  	"fmt"
    19  	"sync/atomic"
    20  
    21  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    22  	"github.com/SagerNet/gvisor/pkg/context"
    23  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    24  	"github.com/SagerNet/gvisor/pkg/fspath"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/fsmetric"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/socket/unix/transport"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    29  	"github.com/SagerNet/gvisor/pkg/syserror"
    30  )
    31  
    32  // Sync implements vfs.FilesystemImpl.Sync.
    33  func (fs *filesystem) Sync(ctx context.Context) error {
    34  	// All filesystem state is in-memory.
    35  	return nil
    36  }
    37  
    38  // stepLocked resolves rp.Component() to an existing file, starting from the
    39  // given directory.
    40  //
    41  // stepLocked is loosely analogous to fs/namei.c:walk_component().
    42  //
    43  // Preconditions:
    44  // * filesystem.mu must be locked.
    45  // * !rp.Done().
    46  func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
    47  	dir, ok := d.inode.impl.(*directory)
    48  	if !ok {
    49  		return nil, syserror.ENOTDIR
    50  	}
    51  	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
    52  		return nil, err
    53  	}
    54  afterSymlink:
    55  	name := rp.Component()
    56  	if name == "." {
    57  		rp.Advance()
    58  		return d, nil
    59  	}
    60  	if name == ".." {
    61  		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
    62  			return nil, err
    63  		} else if isRoot || d.parent == nil {
    64  			rp.Advance()
    65  			return d, nil
    66  		}
    67  		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
    68  			return nil, err
    69  		}
    70  		rp.Advance()
    71  		return d.parent, nil
    72  	}
    73  	if len(name) > linux.NAME_MAX {
    74  		return nil, linuxerr.ENAMETOOLONG
    75  	}
    76  	child, ok := dir.childMap[name]
    77  	if !ok {
    78  		return nil, syserror.ENOENT
    79  	}
    80  	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
    81  		return nil, err
    82  	}
    83  	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
    84  		// Symlink traversal updates access time.
    85  		child.inode.touchAtime(rp.Mount())
    86  		if err := rp.HandleSymlink(symlink.target); err != nil {
    87  			return nil, err
    88  		}
    89  		goto afterSymlink // don't check the current directory again
    90  	}
    91  	rp.Advance()
    92  	return child, nil
    93  }
    94  
    95  // walkParentDirLocked resolves all but the last path component of rp to an
    96  // existing directory, starting from the given directory (which is usually
    97  // rp.Start().Impl().(*dentry)). It does not check that the returned directory
    98  // is searchable by the provider of rp.
    99  //
   100  // walkParentDirLocked is loosely analogous to Linux's
   101  // fs/namei.c:path_parentat().
   102  //
   103  // Preconditions:
   104  // * filesystem.mu must be locked.
   105  // * !rp.Done().
   106  func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
   107  	for !rp.Final() {
   108  		next, err := stepLocked(ctx, rp, d)
   109  		if err != nil {
   110  			return nil, err
   111  		}
   112  		d = next
   113  	}
   114  	dir, ok := d.inode.impl.(*directory)
   115  	if !ok {
   116  		return nil, syserror.ENOTDIR
   117  	}
   118  	return dir, nil
   119  }
   120  
   121  // resolveLocked resolves rp to an existing file.
   122  //
   123  // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
   124  //
   125  // Preconditions: filesystem.mu must be locked.
   126  func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) {
   127  	d := rp.Start().Impl().(*dentry)
   128  	for !rp.Done() {
   129  		next, err := stepLocked(ctx, rp, d)
   130  		if err != nil {
   131  			return nil, err
   132  		}
   133  		d = next
   134  	}
   135  	if rp.MustBeDir() && !d.inode.isDir() {
   136  		return nil, syserror.ENOTDIR
   137  	}
   138  	return d, nil
   139  }
   140  
   141  // doCreateAt checks that creating a file at rp is permitted, then invokes
   142  // create to do so.
   143  //
   144  // doCreateAt is loosely analogous to a conjunction of Linux's
   145  // fs/namei.c:filename_create() and done_path_create().
   146  //
   147  // Preconditions:
   148  // * !rp.Done().
   149  // * For the final path component in rp, !rp.ShouldFollowSymlink().
   150  func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
   151  	fs.mu.Lock()
   152  	defer fs.mu.Unlock()
   153  	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   154  	if err != nil {
   155  		return err
   156  	}
   157  
   158  	// Order of checks is important. First check if parent directory can be
   159  	// executed, then check for existence, and lastly check if mount is writable.
   160  	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   161  		return err
   162  	}
   163  	name := rp.Component()
   164  	if name == "." || name == ".." {
   165  		return syserror.EEXIST
   166  	}
   167  	if len(name) > linux.NAME_MAX {
   168  		return linuxerr.ENAMETOOLONG
   169  	}
   170  	if _, ok := parentDir.childMap[name]; ok {
   171  		return syserror.EEXIST
   172  	}
   173  	if !dir && rp.MustBeDir() {
   174  		return syserror.ENOENT
   175  	}
   176  	// tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only
   177  	// be dead if it was deleted.
   178  	if parentDir.dentry.vfsd.IsDead() {
   179  		return syserror.ENOENT
   180  	}
   181  	mnt := rp.Mount()
   182  	if err := mnt.CheckBeginWrite(); err != nil {
   183  		return err
   184  	}
   185  	defer mnt.EndWrite()
   186  
   187  	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   188  		return err
   189  	}
   190  	if err := create(parentDir, name); err != nil {
   191  		return err
   192  	}
   193  
   194  	ev := linux.IN_CREATE
   195  	if dir {
   196  		ev |= linux.IN_ISDIR
   197  	}
   198  	parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
   199  	parentDir.inode.touchCMtime()
   200  	return nil
   201  }
   202  
   203  // AccessAt implements vfs.Filesystem.Impl.AccessAt.
   204  func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
   205  	fs.mu.RLock()
   206  	defer fs.mu.RUnlock()
   207  	d, err := resolveLocked(ctx, rp)
   208  	if err != nil {
   209  		return err
   210  	}
   211  	return d.inode.checkPermissions(creds, ats)
   212  }
   213  
   214  // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
   215  func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
   216  	fs.mu.RLock()
   217  	defer fs.mu.RUnlock()
   218  	d, err := resolveLocked(ctx, rp)
   219  	if err != nil {
   220  		return nil, err
   221  	}
   222  	if opts.CheckSearchable {
   223  		if !d.inode.isDir() {
   224  			return nil, syserror.ENOTDIR
   225  		}
   226  		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   227  			return nil, err
   228  		}
   229  	}
   230  	d.IncRef()
   231  	return &d.vfsd, nil
   232  }
   233  
   234  // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
   235  func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
   236  	fs.mu.RLock()
   237  	defer fs.mu.RUnlock()
   238  	dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   239  	if err != nil {
   240  		return nil, err
   241  	}
   242  	dir.dentry.IncRef()
   243  	return &dir.dentry.vfsd, nil
   244  }
   245  
   246  // LinkAt implements vfs.FilesystemImpl.LinkAt.
   247  func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
   248  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   249  		if rp.Mount() != vd.Mount() {
   250  			return linuxerr.EXDEV
   251  		}
   252  		d := vd.Dentry().Impl().(*dentry)
   253  		i := d.inode
   254  		if i.isDir() {
   255  			return linuxerr.EPERM
   256  		}
   257  		if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
   258  			return err
   259  		}
   260  		if i.nlink == 0 {
   261  			return syserror.ENOENT
   262  		}
   263  		if i.nlink == maxLinks {
   264  			return linuxerr.EMLINK
   265  		}
   266  		i.incLinksLocked()
   267  		i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
   268  		parentDir.insertChildLocked(fs.newDentry(i), name)
   269  		return nil
   270  	})
   271  }
   272  
   273  // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
   274  func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
   275  	return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error {
   276  		creds := rp.Credentials()
   277  		if parentDir.inode.nlink == maxLinks {
   278  			return linuxerr.EMLINK
   279  		}
   280  		parentDir.inode.incLinksLocked() // from child's ".."
   281  		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   282  		parentDir.insertChildLocked(&childDir.dentry, name)
   283  		return nil
   284  	})
   285  }
   286  
   287  // MknodAt implements vfs.FilesystemImpl.MknodAt.
   288  func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
   289  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   290  		creds := rp.Credentials()
   291  		var childInode *inode
   292  		switch opts.Mode.FileType() {
   293  		case linux.S_IFREG:
   294  			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   295  		case linux.S_IFIFO:
   296  			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   297  		case linux.S_IFBLK:
   298  			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor, parentDir)
   299  		case linux.S_IFCHR:
   300  			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor, parentDir)
   301  		case linux.S_IFSOCK:
   302  			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir)
   303  		default:
   304  			return linuxerr.EINVAL
   305  		}
   306  		child := fs.newDentry(childInode)
   307  		parentDir.insertChildLocked(child, name)
   308  		return nil
   309  	})
   310  }
   311  
// OpenAt implements vfs.FilesystemImpl.OpenAt.
//
// O_TMPFILE is rejected with EOPNOTSUPP. Without O_CREAT the path is resolved
// under a read lock on fs.mu and the resulting dentry is opened. With O_CREAT
// fs.mu is taken for writing, the parent directory is resolved, and the final
// component is either created as a regular file or, if it already exists,
// opened (after following a trailing symlink when rp asks for that).
//
// In every path fs.mu is released before dentry.open is called, because open
// requires that no locks are held; a reference on the dentry is taken first
// and dropped by a deferred DecRef.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	if opts.Flags&linux.O_TMPFILE != 0 {
		// Not yet supported.
		return nil, syserror.EOPNOTSUPP
	}

	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
	// don't need fs.mu for writing.
	if opts.Flags&linux.O_CREAT == 0 {
		fs.mu.RLock()
		d, err := resolveLocked(ctx, rp)
		if err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		// Take a reference before unlocking; it is dropped once open returns.
		d.IncRef()
		defer d.DecRef(ctx)
		fs.mu.RUnlock()
		return d.open(ctx, rp, &opts, false /* afterCreate */)
	}

	// O_EXCL together with O_CREAT means the file must not already exist.
	mustCreate := opts.Flags&linux.O_EXCL != 0
	start := rp.Start().Impl().(*dentry)
	fs.mu.Lock()
	// unlock releases fs.mu at most once, so it is safe both to call it
	// explicitly before open() and to leave it deferred for the error paths.
	unlocked := false
	unlock := func() {
		if !unlocked {
			fs.mu.Unlock()
			unlocked = true
		}
	}
	defer unlock()
	if rp.Done() {
		// Reject attempts to open mount root directory with O_CREAT.
		if rp.MustBeDir() {
			return nil, syserror.EISDIR
		}
		if mustCreate {
			return nil, syserror.EEXIST
		}
		start.IncRef()
		defer start.DecRef(ctx)
		unlock()
		return start.open(ctx, rp, &opts, false /* afterCreate */)
	}
	// Re-entered via goto after a trailing symlink has been substituted into
	// rp, with start set to the directory that contained the symlink.
afterTrailingSymlink:
	parentDir, err := walkParentDirLocked(ctx, rp, start)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Reject attempts to open directories with O_CREAT.
	if rp.MustBeDir() {
		return nil, syserror.EISDIR
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return nil, syserror.EISDIR
	}
	if len(name) > linux.NAME_MAX {
		return nil, linuxerr.ENAMETOOLONG
	}
	// Determine whether or not we need to create a file.
	child, ok := parentDir.childMap[name]
	if !ok {
		// Already checked for searchability above; now check for writability.
		if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
			return nil, err
		}
		if err := rp.Mount().CheckBeginWrite(); err != nil {
			return nil, err
		}
		defer rp.Mount().EndWrite()
		// Create and open the child.
		creds := rp.Credentials()
		// This child deliberately shadows the outer (nil) child from the map
		// lookup above.
		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir))
		parentDir.insertChildLocked(child, name)
		child.IncRef()
		defer child.DecRef(ctx)
		unlock()
		// afterCreate is true here, which makes open skip the permission
		// check and the O_TRUNC handling for the newly created file.
		fd, err := child.open(ctx, rp, &opts, true)
		if err != nil {
			return nil, err
		}
		// The creation event and timestamp update happen only after a
		// successful open, and after fs.mu has been released.
		parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
		parentDir.inode.touchCMtime()
		return fd, nil
	}
	if mustCreate {
		return nil, syserror.EEXIST
	}
	// Is the file mounted over?
	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
		return nil, err
	}
	// Do we need to resolve a trailing symlink?
	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
		// Symlink traversal updates access time.
		child.inode.touchAtime(rp.Mount())
		if err := rp.HandleSymlink(symlink.target); err != nil {
			return nil, err
		}
		// Continue resolution from the directory that held the symlink.
		start = &parentDir.dentry
		goto afterTrailingSymlink
	}
	// NOTE(review): rp.MustBeDir() was already rejected with EISDIR earlier in
	// this pass, so this check looks unreachable — confirm before removing.
	if rp.MustBeDir() && !child.inode.isDir() {
		return nil, syserror.ENOTDIR
	}
	child.IncRef()
	defer child.DecRef(ctx)
	unlock()
	return child.open(ctx, rp, &opts, false)
}
   429  
   430  // Preconditions: The caller must hold no locks (since opening pipes may block
   431  // indefinitely).
   432  func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
   433  	ats := vfs.AccessTypesForOpenFlags(opts)
   434  	if !afterCreate {
   435  		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
   436  			return nil, err
   437  		}
   438  	}
   439  	switch impl := d.inode.impl.(type) {
   440  	case *regularFile:
   441  		var fd regularFileFD
   442  		fd.LockFD.Init(&d.inode.locks)
   443  		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
   444  			return nil, err
   445  		}
   446  		if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
   447  			if _, err := impl.truncate(0); err != nil {
   448  				return nil, err
   449  			}
   450  		}
   451  		if fd.vfsfd.IsWritable() {
   452  			fsmetric.TmpfsOpensW.Increment()
   453  		} else if fd.vfsfd.IsReadable() {
   454  			fsmetric.TmpfsOpensRO.Increment()
   455  		}
   456  		return &fd.vfsfd, nil
   457  	case *directory:
   458  		// Can't open directories writably.
   459  		if ats&vfs.MayWrite != 0 {
   460  			return nil, syserror.EISDIR
   461  		}
   462  		var fd directoryFD
   463  		fd.LockFD.Init(&d.inode.locks)
   464  		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
   465  			return nil, err
   466  		}
   467  		return &fd.vfsfd, nil
   468  	case *symlink:
   469  		// Can't open symlinks without O_PATH, which is handled at the VFS layer.
   470  		return nil, linuxerr.ELOOP
   471  	case *namedPipe:
   472  		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
   473  	case *deviceFile:
   474  		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
   475  	case *socketFile:
   476  		return nil, linuxerr.ENXIO
   477  	default:
   478  		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
   479  	}
   480  }
   481  
   482  // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
   483  func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
   484  	fs.mu.RLock()
   485  	defer fs.mu.RUnlock()
   486  	d, err := resolveLocked(ctx, rp)
   487  	if err != nil {
   488  		return "", err
   489  	}
   490  	symlink, ok := d.inode.impl.(*symlink)
   491  	if !ok {
   492  		return "", linuxerr.EINVAL
   493  	}
   494  	symlink.inode.touchAtime(rp.Mount())
   495  	return symlink.target, nil
   496  }
   497  
   498  // RenameAt implements vfs.FilesystemImpl.RenameAt.
   499  func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
   500  	// Resolve newParentDir first to verify that it's on this Mount.
   501  	fs.mu.Lock()
   502  	defer fs.mu.Unlock()
   503  	newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   504  	if err != nil {
   505  		return err
   506  	}
   507  
   508  	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
   509  		// TODO(b/145974740): Support other renameat2 flags.
   510  		return linuxerr.EINVAL
   511  	}
   512  
   513  	newName := rp.Component()
   514  	if newName == "." || newName == ".." {
   515  		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
   516  			return syserror.EEXIST
   517  		}
   518  		return linuxerr.EBUSY
   519  	}
   520  	mnt := rp.Mount()
   521  	if mnt != oldParentVD.Mount() {
   522  		return linuxerr.EXDEV
   523  	}
   524  	if err := mnt.CheckBeginWrite(); err != nil {
   525  		return err
   526  	}
   527  	defer mnt.EndWrite()
   528  
   529  	oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
   530  	if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
   531  		return err
   532  	}
   533  	renamed, ok := oldParentDir.childMap[oldName]
   534  	if !ok {
   535  		return syserror.ENOENT
   536  	}
   537  	if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
   538  		return err
   539  	}
   540  	// Note that we don't need to call rp.CheckMount(), since if renamed is a
   541  	// mount point then we want to rename the mount point, not anything in the
   542  	// mounted filesystem.
   543  	if renamed.inode.isDir() {
   544  		if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
   545  			return linuxerr.EINVAL
   546  		}
   547  		if oldParentDir != newParentDir {
   548  			// Writability is needed to change renamed's "..".
   549  			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   550  				return err
   551  			}
   552  		}
   553  	} else {
   554  		if opts.MustBeDir || rp.MustBeDir() {
   555  			return syserror.ENOTDIR
   556  		}
   557  	}
   558  
   559  	if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
   560  		return err
   561  	}
   562  	replaced, ok := newParentDir.childMap[newName]
   563  	if ok {
   564  		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
   565  			return syserror.EEXIST
   566  		}
   567  		replacedDir, ok := replaced.inode.impl.(*directory)
   568  		if ok {
   569  			if !renamed.inode.isDir() {
   570  				return syserror.EISDIR
   571  			}
   572  			if len(replacedDir.childMap) != 0 {
   573  				return linuxerr.ENOTEMPTY
   574  			}
   575  		} else {
   576  			if rp.MustBeDir() {
   577  				return syserror.ENOTDIR
   578  			}
   579  			if renamed.inode.isDir() {
   580  				return syserror.ENOTDIR
   581  			}
   582  		}
   583  	} else {
   584  		if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks {
   585  			return linuxerr.EMLINK
   586  		}
   587  	}
   588  	// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
   589  	// only be dead if it was deleted.
   590  	if newParentDir.dentry.vfsd.IsDead() {
   591  		return syserror.ENOENT
   592  	}
   593  
   594  	// Linux places this check before some of those above; we do it here for
   595  	// simplicity, under the assumption that applications are not intentionally
   596  	// doing noop renames expecting them to succeed where non-noop renames
   597  	// would fail.
   598  	if renamed == replaced {
   599  		return nil
   600  	}
   601  	vfsObj := rp.VirtualFilesystem()
   602  	mntns := vfs.MountNamespaceFromContext(ctx)
   603  	defer mntns.DecRef(ctx)
   604  	var replacedVFSD *vfs.Dentry
   605  	if replaced != nil {
   606  		replacedVFSD = &replaced.vfsd
   607  	}
   608  	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
   609  		return err
   610  	}
   611  	if replaced != nil {
   612  		newParentDir.removeChildLocked(replaced)
   613  		if replaced.inode.isDir() {
   614  			// Remove links for replaced/. and replaced/..
   615  			replaced.inode.decLinksLocked(ctx)
   616  			newParentDir.inode.decLinksLocked(ctx)
   617  		}
   618  		replaced.inode.decLinksLocked(ctx)
   619  	}
   620  	oldParentDir.removeChildLocked(renamed)
   621  	newParentDir.insertChildLocked(renamed, newName)
   622  	vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
   623  	oldParentDir.inode.touchCMtime()
   624  	if oldParentDir != newParentDir {
   625  		if renamed.inode.isDir() {
   626  			oldParentDir.inode.decLinksLocked(ctx)
   627  			newParentDir.inode.incLinksLocked()
   628  		}
   629  		newParentDir.inode.touchCMtime()
   630  	}
   631  	renamed.inode.touchCtime()
   632  
   633  	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
   634  	return nil
   635  }
   636  
   637  // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
   638  func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
   639  	fs.mu.Lock()
   640  	defer fs.mu.Unlock()
   641  	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   642  	if err != nil {
   643  		return err
   644  	}
   645  	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
   646  		return err
   647  	}
   648  	name := rp.Component()
   649  	if name == "." {
   650  		return linuxerr.EINVAL
   651  	}
   652  	if name == ".." {
   653  		return linuxerr.ENOTEMPTY
   654  	}
   655  	child, ok := parentDir.childMap[name]
   656  	if !ok {
   657  		return syserror.ENOENT
   658  	}
   659  	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
   660  		return err
   661  	}
   662  	childDir, ok := child.inode.impl.(*directory)
   663  	if !ok {
   664  		return syserror.ENOTDIR
   665  	}
   666  	if len(childDir.childMap) != 0 {
   667  		return linuxerr.ENOTEMPTY
   668  	}
   669  	mnt := rp.Mount()
   670  	if err := mnt.CheckBeginWrite(); err != nil {
   671  		return err
   672  	}
   673  	defer mnt.EndWrite()
   674  	vfsObj := rp.VirtualFilesystem()
   675  	mntns := vfs.MountNamespaceFromContext(ctx)
   676  	defer mntns.DecRef(ctx)
   677  	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
   678  		return err
   679  	}
   680  	parentDir.removeChildLocked(child)
   681  	parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
   682  	// Remove links for child, child/., and child/..
   683  	child.inode.decLinksLocked(ctx)
   684  	child.inode.decLinksLocked(ctx)
   685  	parentDir.inode.decLinksLocked(ctx)
   686  	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
   687  	parentDir.inode.touchCMtime()
   688  	return nil
   689  }
   690  
   691  // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
   692  func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
   693  	fs.mu.RLock()
   694  	d, err := resolveLocked(ctx, rp)
   695  	if err != nil {
   696  		fs.mu.RUnlock()
   697  		return err
   698  	}
   699  	err = d.inode.setStat(ctx, rp.Credentials(), &opts)
   700  	fs.mu.RUnlock()
   701  	if err != nil {
   702  		return err
   703  	}
   704  
   705  	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
   706  		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
   707  	}
   708  	return nil
   709  }
   710  
   711  // StatAt implements vfs.FilesystemImpl.StatAt.
   712  func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
   713  	fs.mu.RLock()
   714  	defer fs.mu.RUnlock()
   715  	d, err := resolveLocked(ctx, rp)
   716  	if err != nil {
   717  		return linux.Statx{}, err
   718  	}
   719  	var stat linux.Statx
   720  	d.inode.statTo(&stat)
   721  	return stat, nil
   722  }
   723  
   724  // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
   725  func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
   726  	fs.mu.RLock()
   727  	defer fs.mu.RUnlock()
   728  	if _, err := resolveLocked(ctx, rp); err != nil {
   729  		return linux.Statfs{}, err
   730  	}
   731  	return globalStatfs, nil
   732  }
   733  
   734  // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
   735  func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
   736  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   737  		creds := rp.Credentials()
   738  		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target, parentDir))
   739  		parentDir.insertChildLocked(child, name)
   740  		return nil
   741  	})
   742  }
   743  
   744  // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
   745  func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
   746  	fs.mu.Lock()
   747  	defer fs.mu.Unlock()
   748  	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   749  	if err != nil {
   750  		return err
   751  	}
   752  	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
   753  		return err
   754  	}
   755  	name := rp.Component()
   756  	if name == "." || name == ".." {
   757  		return syserror.EISDIR
   758  	}
   759  	child, ok := parentDir.childMap[name]
   760  	if !ok {
   761  		return syserror.ENOENT
   762  	}
   763  	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
   764  		return err
   765  	}
   766  	if child.inode.isDir() {
   767  		return syserror.EISDIR
   768  	}
   769  	if rp.MustBeDir() {
   770  		return syserror.ENOTDIR
   771  	}
   772  	mnt := rp.Mount()
   773  	if err := mnt.CheckBeginWrite(); err != nil {
   774  		return err
   775  	}
   776  	defer mnt.EndWrite()
   777  	vfsObj := rp.VirtualFilesystem()
   778  	mntns := vfs.MountNamespaceFromContext(ctx)
   779  	defer mntns.DecRef(ctx)
   780  	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
   781  		return err
   782  	}
   783  
   784  	// Generate inotify events. Note that this must take place before the link
   785  	// count of the child is decremented, or else the watches may be dropped
   786  	// before these events are added.
   787  	vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name)
   788  
   789  	parentDir.removeChildLocked(child)
   790  	child.inode.decLinksLocked(ctx)
   791  	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
   792  	parentDir.inode.touchCMtime()
   793  	return nil
   794  }
   795  
   796  // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
   797  func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
   798  	fs.mu.RLock()
   799  	defer fs.mu.RUnlock()
   800  	d, err := resolveLocked(ctx, rp)
   801  	if err != nil {
   802  		return nil, err
   803  	}
   804  	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   805  		return nil, err
   806  	}
   807  	switch impl := d.inode.impl.(type) {
   808  	case *socketFile:
   809  		if impl.ep == nil {
   810  			return nil, linuxerr.ECONNREFUSED
   811  		}
   812  		return impl.ep, nil
   813  	default:
   814  		return nil, linuxerr.ECONNREFUSED
   815  	}
   816  }
   817  
   818  // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
   819  func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
   820  	fs.mu.RLock()
   821  	defer fs.mu.RUnlock()
   822  	d, err := resolveLocked(ctx, rp)
   823  	if err != nil {
   824  		return nil, err
   825  	}
   826  	return d.inode.listXattr(rp.Credentials(), size)
   827  }
   828  
   829  // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
   830  func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
   831  	fs.mu.RLock()
   832  	defer fs.mu.RUnlock()
   833  	d, err := resolveLocked(ctx, rp)
   834  	if err != nil {
   835  		return "", err
   836  	}
   837  	return d.inode.getXattr(rp.Credentials(), &opts)
   838  }
   839  
   840  // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
   841  func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
   842  	fs.mu.RLock()
   843  	d, err := resolveLocked(ctx, rp)
   844  	if err != nil {
   845  		fs.mu.RUnlock()
   846  		return err
   847  	}
   848  	err = d.inode.setXattr(rp.Credentials(), &opts)
   849  	fs.mu.RUnlock()
   850  	if err != nil {
   851  		return err
   852  	}
   853  
   854  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   855  	return nil
   856  }
   857  
   858  // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
   859  func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
   860  	fs.mu.RLock()
   861  	d, err := resolveLocked(ctx, rp)
   862  	if err != nil {
   863  		fs.mu.RUnlock()
   864  		return err
   865  	}
   866  	err = d.inode.removeXattr(rp.Credentials(), name)
   867  	fs.mu.RUnlock()
   868  	if err != nil {
   869  		return err
   870  	}
   871  
   872  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   873  	return nil
   874  }
   875  
   876  // PrependPath implements vfs.FilesystemImpl.PrependPath.
   877  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
   878  	fs.mu.RLock()
   879  	defer fs.mu.RUnlock()
   880  	mnt := vd.Mount()
   881  	d := vd.Dentry().Impl().(*dentry)
   882  	for {
   883  		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
   884  			return vfs.PrependPathAtVFSRootError{}
   885  		}
   886  		if &d.vfsd == mnt.Root() {
   887  			return nil
   888  		}
   889  		if d.parent == nil {
   890  			if d.name != "" {
   891  				// This file must have been created by
   892  				// newUnlinkedRegularFileDescription(). In Linux,
   893  				// mm/shmem.c:__shmem_file_setup() =>
   894  				// fs/file_table.c:alloc_file_pseudo() sets the created
   895  				// dentry's dentry_operations to anon_ops, for which d_dname ==
   896  				// simple_dname. fs/d_path.c:simple_dname() defines the
   897  				// dentry's pathname to be its name, prefixed with "/" and
   898  				// suffixed with " (deleted)".
   899  				b.PrependComponent("/" + d.name)
   900  				b.AppendString(" (deleted)")
   901  				return vfs.PrependPathSyntheticError{}
   902  			}
   903  			return vfs.PrependPathAtNonMountRootError{}
   904  		}
   905  		b.PrependComponent(d.name)
   906  		d = d.parent
   907  	}
   908  }
   909  
   910  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   911  func (fs *filesystem) MountOptions() string {
   912  	return fs.mopts
   913  }