github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/tmpfs/filesystem.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tmpfs
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsmetric"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix/transport"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    28  )
    29  
const (
	// direntSize is the size of each directory entry
	// that Linux uses for computing directory size.
	// "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
	direntSize = 20
	// shortSymlinkLen mirrors Linux's SHORT_SYMLINK_LEN (128). Storage is
	// only accounted for symlinks whose target length is >= shortSymlinkLen.
	shortSymlinkLen = 128
)
    39  
    40  // Sync implements vfs.FilesystemImpl.Sync.
    41  func (fs *filesystem) Sync(ctx context.Context) error {
    42  	// All filesystem state is in-memory.
    43  	return nil
    44  }
    45  
    46  // stepLocked resolves rp.Component() to an existing file, starting from the
    47  // given directory.
    48  //
    49  // stepLocked is loosely analogous to fs/namei.c:walk_component().
    50  //
    51  // Preconditions:
    52  //   - filesystem.mu must be locked.
    53  //   - !rp.Done().
    54  func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, error) {
    55  	dir, ok := d.inode.impl.(*directory)
    56  	if !ok {
    57  		return nil, false, linuxerr.ENOTDIR
    58  	}
    59  	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
    60  		return nil, false, err
    61  	}
    62  	name := rp.Component()
    63  	if name == "." {
    64  		rp.Advance()
    65  		return d, false, nil
    66  	}
    67  	if name == ".." {
    68  		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
    69  			return nil, false, err
    70  		} else if isRoot || d.parent == nil {
    71  			rp.Advance()
    72  			return d, false, nil
    73  		}
    74  		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
    75  			return nil, false, err
    76  		}
    77  		rp.Advance()
    78  		return d.parent, false, nil
    79  	}
    80  	if len(name) > d.inode.fs.maxFilenameLen {
    81  		return nil, false, linuxerr.ENAMETOOLONG
    82  	}
    83  	child, ok := dir.childMap[name]
    84  	if !ok {
    85  		return nil, false, linuxerr.ENOENT
    86  	}
    87  	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
    88  		return nil, false, err
    89  	}
    90  	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
    91  		// Symlink traversal updates access time.
    92  		child.inode.touchAtime(rp.Mount())
    93  		followedSymlink, err := rp.HandleSymlink(symlink.target)
    94  		return d, followedSymlink, err
    95  	}
    96  	rp.Advance()
    97  	return child, false, nil
    98  }
    99  
   100  // walkParentDirLocked resolves all but the last path component of rp to an
   101  // existing directory, starting from the given directory (which is usually
   102  // rp.Start().Impl().(*dentry)). It does not check that the returned directory
   103  // is searchable by the provider of rp.
   104  //
   105  // walkParentDirLocked is loosely analogous to Linux's
   106  // fs/namei.c:path_parentat().
   107  //
   108  // Preconditions:
   109  //   - filesystem.mu must be locked.
   110  //   - !rp.Done().
   111  func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
   112  	for !rp.Final() {
   113  		next, _, err := stepLocked(ctx, rp, d)
   114  		if err != nil {
   115  			return nil, err
   116  		}
   117  		d = next
   118  	}
   119  	dir, ok := d.inode.impl.(*directory)
   120  	if !ok {
   121  		return nil, linuxerr.ENOTDIR
   122  	}
   123  	return dir, nil
   124  }
   125  
   126  // resolveLocked resolves rp to an existing file.
   127  //
   128  // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
   129  //
   130  // Preconditions: filesystem.mu must be locked.
   131  func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) {
   132  	d := rp.Start().Impl().(*dentry)
   133  
   134  	if symlink, ok := d.inode.impl.(*symlink); rp.Done() && ok && rp.ShouldFollowSymlink() {
   135  		// Path with a single component. We don't need to step to the next
   136  		// component, but still need to resolve any symlinks.
   137  		//
   138  		// Symlink traversal updates access time.
   139  		d.inode.touchAtime(rp.Mount())
   140  		if _, err := rp.HandleSymlink(symlink.target); err != nil {
   141  			return nil, err
   142  		}
   143  	} else {
   144  		// Path with multiple components, walk and resolve as required.
   145  		for !rp.Done() {
   146  			next, _, err := stepLocked(ctx, rp, d)
   147  			if err != nil {
   148  				return nil, err
   149  			}
   150  			d = next
   151  		}
   152  	}
   153  
   154  	if rp.MustBeDir() && !d.inode.isDir() {
   155  		return nil, linuxerr.ENOTDIR
   156  	}
   157  	return d, nil
   158  }
   159  
// doCreateAt checks that creating a file at rp is permitted, then invokes
// create to do so.
//
// doCreateAt is loosely analogous to a conjunction of Linux's
// fs/namei.c:filename_create() and done_path_create().
//
// dir states whether the caller is creating a directory: when it is false a
// path that requires a directory (rp.MustBeDir()) fails with ENOENT, and when
// it is true IN_ISDIR is added to the inotify event. create is called with the
// resolved parent directory and the final path component while fs.mu is held
// for writing and after rp.Mount().CheckBeginWrite() has succeeded. When
// create returns nil, an IN_CREATE event is sent to the parent's watches and
// parentDir.inode.touchCMtime() is called.
//
// Preconditions:
//   - !rp.Done().
//   - For the final path component in rp, !rp.ShouldFollowSymlink().
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}

	// Order of checks is important. First check if parent directory can be
	// executed, then check for existence, and lastly check if mount is writable.
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	// "." and ".." always exist, so creating them reports EEXIST.
	if name == "." || name == ".." {
		return linuxerr.EEXIST
	}
	if len(name) > fs.maxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	if _, ok := parentDir.childMap[name]; ok {
		return linuxerr.EEXIST
	}
	// A path that must name a directory can only be used to create a
	// directory.
	if !dir && rp.MustBeDir() {
		return linuxerr.ENOENT
	}
	// tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only
	// be dead if it was deleted.
	if parentDir.dentry.vfsd.IsDead() {
		return linuxerr.ENOENT
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	// Search permission was checked above; adding an entry also needs write
	// permission on the parent.
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
		return err
	}
	if err := create(parentDir, name); err != nil {
		return err
	}

	ev := linux.IN_CREATE
	if dir {
		ev |= linux.IN_ISDIR
	}
	parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
	parentDir.inode.touchCMtime()
	return nil
}
   221  
   222  // AccessAt implements vfs.Filesystem.Impl.AccessAt.
   223  func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
   224  	fs.mu.RLock()
   225  	defer fs.mu.RUnlock()
   226  	d, err := resolveLocked(ctx, rp)
   227  	if err != nil {
   228  		return err
   229  	}
   230  	if err := d.inode.checkPermissions(creds, ats); err != nil {
   231  		return err
   232  	}
   233  	if ats.MayWrite() && rp.Mount().ReadOnly() {
   234  		return linuxerr.EROFS
   235  	}
   236  	return nil
   237  }
   238  
   239  // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
   240  func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
   241  	fs.mu.RLock()
   242  	defer fs.mu.RUnlock()
   243  	d, err := resolveLocked(ctx, rp)
   244  	if err != nil {
   245  		return nil, err
   246  	}
   247  	if opts.CheckSearchable {
   248  		if !d.inode.isDir() {
   249  			return nil, linuxerr.ENOTDIR
   250  		}
   251  		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   252  			return nil, err
   253  		}
   254  	}
   255  	d.IncRef()
   256  	return &d.vfsd, nil
   257  }
   258  
   259  // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
   260  func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
   261  	fs.mu.RLock()
   262  	defer fs.mu.RUnlock()
   263  	dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   264  	if err != nil {
   265  		return nil, err
   266  	}
   267  	dir.dentry.IncRef()
   268  	return &dir.dentry.vfsd, nil
   269  }
   270  
   271  // LinkAt implements vfs.FilesystemImpl.LinkAt.
   272  func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
   273  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   274  		if rp.Mount() != vd.Mount() {
   275  			return linuxerr.EXDEV
   276  		}
   277  		d := vd.Dentry().Impl().(*dentry)
   278  		i := d.inode
   279  		if i.isDir() {
   280  			return linuxerr.EPERM
   281  		}
   282  		if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(i.mode.Load()), auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil {
   283  			return err
   284  		}
   285  		if i.nlink.Load() == 0 {
   286  			return linuxerr.ENOENT
   287  		}
   288  		if i.nlink.Load() == maxLinks {
   289  			return linuxerr.EMLINK
   290  		}
   291  		i.incLinksLocked()
   292  		i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
   293  		parentDir.insertChildLocked(fs.newDentry(i), name)
   294  		return nil
   295  	})
   296  }
   297  
   298  // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
   299  func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
   300  	return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error {
   301  		creds := rp.Credentials()
   302  		if parentDir.inode.nlink.Load() == maxLinks {
   303  			return linuxerr.EMLINK
   304  		}
   305  		parentDir.inode.incLinksLocked() // from child's ".."
   306  		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   307  		parentDir.insertChildLocked(&childDir.dentry, name)
   308  		return nil
   309  	})
   310  }
   311  
   312  // MknodAt implements vfs.FilesystemImpl.MknodAt.
   313  func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
   314  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   315  		creds := rp.Credentials()
   316  		var childInode *inode
   317  		switch opts.Mode.FileType() {
   318  		case linux.S_IFREG:
   319  			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   320  		case linux.S_IFIFO:
   321  			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   322  		case linux.S_IFBLK:
   323  			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor, parentDir)
   324  		case linux.S_IFCHR:
   325  			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor, parentDir)
   326  		case linux.S_IFSOCK:
   327  			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir)
   328  		default:
   329  			return linuxerr.EINVAL
   330  		}
   331  		child := fs.newDentry(childInode)
   332  		parentDir.insertChildLocked(child, name)
   333  		return nil
   334  	})
   335  }
   336  
// OpenAt implements vfs.FilesystemImpl.OpenAt.
//
// O_TMPFILE is rejected with EOPNOTSUPP. Without O_CREAT the path is resolved
// under a read lock on fs.mu and the result is opened. With O_CREAT the walk
// runs under the write lock so that a missing final component can be created
// as a regular file; O_EXCL additionally turns an existing final component
// (including a symlink) into EEXIST. On every path fs.mu is released before
// dentry.open is called, as that method requires.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	if opts.Flags&linux.O_TMPFILE != 0 {
		// Not yet supported.
		return nil, linuxerr.EOPNOTSUPP
	}

	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
	// don't need fs.mu for writing.
	if opts.Flags&linux.O_CREAT == 0 {
		fs.mu.RLock()
		d, err := resolveLocked(ctx, rp)
		if err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		// Hold a reference across the unlocked open; it is dropped when
		// OpenAt returns.
		d.IncRef()
		defer d.DecRef(ctx)
		fs.mu.RUnlock()
		return d.open(ctx, rp, &opts, false /* afterCreate */)
	}

	mustCreate := opts.Flags&linux.O_EXCL != 0
	start := rp.Start().Impl().(*dentry)
	fs.mu.Lock()
	// unlock releases fs.mu at most once, so it can be called explicitly
	// before an open and again by the deferred call without double-unlocking.
	unlocked := false
	unlock := func() {
		if !unlocked {
			fs.mu.Unlock()
			unlocked = true
		}
	}
	defer unlock()
	if rp.Done() {
		// Reject attempts to open mount root directory with O_CREAT.
		if rp.MustBeDir() {
			return nil, linuxerr.EISDIR
		}
		if mustCreate {
			return nil, linuxerr.EEXIST
		}
		start.IncRef()
		defer start.DecRef(ctx)
		unlock()
		return start.open(ctx, rp, &opts, false /* afterCreate */)
	}
afterTrailingSymlink:
	parentDir, err := walkParentDirLocked(ctx, rp, start)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Reject attempts to open directories with O_CREAT.
	if rp.MustBeDir() {
		return nil, linuxerr.EISDIR
	}
	// Capture the final component before stepLocked advances rp; it is the
	// name used if the file has to be created.
	name := rp.Component()
	child, followedSymlink, err := stepLocked(ctx, rp, &parentDir.dentry)
	if followedSymlink {
		if mustCreate {
			// EEXIST must be returned if an existing symlink is opened with O_EXCL.
			return nil, linuxerr.EEXIST
		}
		if err != nil {
			// If followedSymlink && err != nil, then this symlink resolution error
			// must be handled by the VFS layer.
			return nil, err
		}
		// rp now describes the symlink target; restart the walk from the
		// directory that contained the symlink.
		start = &parentDir.dentry
		goto afterTrailingSymlink
	}
	if linuxerr.Equals(linuxerr.ENOENT, err) {
		// Already checked for searchability above; now check for writability.
		if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
			return nil, err
		}
		if err := rp.Mount().CheckBeginWrite(); err != nil {
			return nil, err
		}
		defer rp.Mount().EndWrite()
		// Create and open the child.
		creds := rp.Credentials()
		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir))
		parentDir.insertChildLocked(child, name)
		child.IncRef()
		defer child.DecRef(ctx)
		unlock()
		// afterCreate is true, so open skips the permission check for the
		// file that was just created.
		fd, err := child.open(ctx, rp, &opts, true)
		if err != nil {
			return nil, err
		}
		parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
		parentDir.inode.touchCMtime()
		return fd, nil
	}
	if err != nil {
		return nil, err
	}
	// The final component already exists.
	if mustCreate {
		return nil, linuxerr.EEXIST
	}
	if rp.MustBeDir() && !child.inode.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	child.IncRef()
	defer child.DecRef(ctx)
	unlock()
	return child.open(ctx, rp, &opts, false)
}
   449  
   450  // Preconditions: The caller must hold no locks (since opening pipes may block
   451  // indefinitely).
   452  func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
   453  	ats := vfs.AccessTypesForOpenFlags(opts)
   454  	if !afterCreate {
   455  		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
   456  			return nil, err
   457  		}
   458  	}
   459  	switch impl := d.inode.impl.(type) {
   460  	case *regularFile:
   461  		var fd regularFileFD
   462  		fd.LockFD.Init(&d.inode.locks)
   463  		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
   464  			return nil, err
   465  		}
   466  		if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
   467  			if _, err := impl.truncate(0); err != nil {
   468  				return nil, err
   469  			}
   470  		}
   471  		if fd.vfsfd.IsWritable() {
   472  			fsmetric.TmpfsOpensW.Increment()
   473  		} else if fd.vfsfd.IsReadable() {
   474  			fsmetric.TmpfsOpensRO.Increment()
   475  		}
   476  		return &fd.vfsfd, nil
   477  	case *directory:
   478  		// Can't open directories with O_CREAT.
   479  		if opts.Flags&linux.O_CREAT != 0 {
   480  			return nil, linuxerr.EISDIR
   481  		}
   482  		// Can't open directories writably.
   483  		if ats&vfs.MayWrite != 0 {
   484  			return nil, linuxerr.EISDIR
   485  		}
   486  		if opts.Flags&linux.O_DIRECT != 0 {
   487  			return nil, linuxerr.EINVAL
   488  		}
   489  		var fd directoryFD
   490  		fd.LockFD.Init(&d.inode.locks)
   491  		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
   492  			return nil, err
   493  		}
   494  		return &fd.vfsfd, nil
   495  	case *symlink:
   496  		// Can't open symlinks without O_PATH, which is handled at the VFS layer.
   497  		return nil, linuxerr.ELOOP
   498  	case *namedPipe:
   499  		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
   500  	case *deviceFile:
   501  		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
   502  	case *socketFile:
   503  		return nil, linuxerr.ENXIO
   504  	default:
   505  		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
   506  	}
   507  }
   508  
   509  // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
   510  func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
   511  	fs.mu.RLock()
   512  	defer fs.mu.RUnlock()
   513  	d, err := resolveLocked(ctx, rp)
   514  	if err != nil {
   515  		return "", err
   516  	}
   517  	symlink, ok := d.inode.impl.(*symlink)
   518  	if !ok {
   519  		return "", linuxerr.EINVAL
   520  	}
   521  	symlink.inode.touchAtime(rp.Mount())
   522  	return symlink.target, nil
   523  }
   524  
// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// It moves the child named oldName of oldParentVD to the name given by the
// final component of rp, inside the directory reached by walking the rest of
// rp, optionally replacing an existing entry there. RENAME_NOREPLACE is the
// only flag accepted; any other flag yields EINVAL. The whole operation runs
// with fs.mu held for writing.
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	// Resolve newParentDir first to verify that it's on this Mount.
	fs.mu.Lock()
	defer fs.mu.Unlock()
	newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}

	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
		// TODO(b/145974740): Support other renameat2 flags.
		return linuxerr.EINVAL
	}

	newName := rp.Component()
	// "." and ".." can never be the destination of a rename.
	if newName == "." || newName == ".." {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		return linuxerr.EBUSY
	}
	if len(newName) > fs.maxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	mnt := rp.Mount()
	// Source and destination must be on the same mount.
	if mnt != oldParentVD.Mount() {
		return linuxerr.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
	if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	renamed, ok := oldParentDir.childMap[oldName]
	if !ok {
		return linuxerr.ENOENT
	}
	if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
		return err
	}
	// Note that we don't need to call rp.CheckMount(), since if renamed is a
	// mount point then we want to rename the mount point, not anything in the
	// mounted filesystem.
	if renamed.inode.isDir() {
		// A directory cannot be moved into itself or into one of its own
		// descendants.
		if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
			return linuxerr.EINVAL
		}
		if oldParentDir != newParentDir {
			// Writability is needed to change renamed's "..".
			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
				return err
			}
		}
	} else {
		if opts.MustBeDir || rp.MustBeDir() {
			return linuxerr.ENOTDIR
		}
	}

	if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	replaced, ok := newParentDir.childMap[newName]
	if ok {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		replacedDir, ok := replaced.inode.impl.(*directory)
		if ok {
			// A directory may only be replaced by another directory, and
			// only when it is empty.
			if !renamed.inode.isDir() {
				return linuxerr.EISDIR
			}
			if len(replacedDir.childMap) != 0 {
				return linuxerr.ENOTEMPTY
			}
		} else {
			// A non-directory may only be replaced by a non-directory.
			if rp.MustBeDir() {
				return linuxerr.ENOTDIR
			}
			if renamed.inode.isDir() {
				return linuxerr.ENOTDIR
			}
		}
	} else {
		// Nothing is replaced, so a moved directory would push the new
		// parent past maxLinks if it is already at the limit.
		if renamed.inode.isDir() && newParentDir.inode.nlink.Load() == maxLinks {
			return linuxerr.EMLINK
		}
	}
	// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
	// only be dead if it was deleted.
	if newParentDir.dentry.vfsd.IsDead() {
		return linuxerr.ENOENT
	}

	// Linux places this check before some of those above; we do it here for
	// simplicity, under the assumption that applications are not intentionally
	// doing noop renames expecting them to succeed where non-noop renames
	// would fail.
	if renamed == replaced {
		return nil
	}
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	// replacedVFSD stays nil when there is no existing destination entry.
	var replacedVFSD *vfs.Dentry
	if replaced != nil {
		replacedVFSD = &replaced.vfsd
	}
	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
		return err
	}
	if replaced != nil {
		newParentDir.removeChildLocked(replaced)
		if replaced.inode.isDir() {
			// Remove links for replaced/. and replaced/..
			replaced.inode.decLinksLocked(ctx)
			newParentDir.inode.decLinksLocked(ctx)
		}
		// Remove the link for the replaced directory entry itself.
		replaced.inode.decLinksLocked(ctx)
	}
	oldParentDir.removeChildLocked(renamed)
	newParentDir.insertChildLocked(renamed, newName)
	vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
	oldParentDir.inode.touchCMtime()
	if oldParentDir != newParentDir {
		if renamed.inode.isDir() {
			// The moved directory's ".." now links to the new parent
			// instead of the old one.
			oldParentDir.inode.decLinksLocked(ctx)
			newParentDir.inode.incLinksLocked()
		}
		newParentDir.inode.touchCMtime()
	}
	renamed.inode.touchCtime()

	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
	return nil
}
   666  
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
//
// It removes the empty directory named by the final component of rp from its
// parent. The caller needs write and execute permission on the parent and must
// pass the parent's mayDelete check. The operation runs with fs.mu held for
// writing.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	// "." and ".." are rejected with distinct errors.
	if name == "." {
		return linuxerr.EINVAL
	}
	if name == ".." {
		return linuxerr.ENOTEMPTY
	}
	child, ok := parentDir.childMap[name]
	if !ok {
		return linuxerr.ENOENT
	}
	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
		return err
	}
	childDir, ok := child.inode.impl.(*directory)
	if !ok {
		return linuxerr.ENOTDIR
	}
	if len(childDir.childMap) != 0 {
		return linuxerr.ENOTEMPTY
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
		return err
	}
	parentDir.removeChildLocked(child)
	parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
	// Remove links for child, child/., and child/..
	child.inode.decLinksLocked(ctx)
	child.inode.decLinksLocked(ctx)
	parentDir.inode.decLinksLocked(ctx)
	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
	parentDir.inode.touchCMtime()
	return nil
}
   720  
   721  // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
   722  func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
   723  	fs.mu.RLock()
   724  	d, err := resolveLocked(ctx, rp)
   725  	if err != nil {
   726  		fs.mu.RUnlock()
   727  		return err
   728  	}
   729  	err = d.inode.setStat(ctx, rp.Credentials(), &opts)
   730  	fs.mu.RUnlock()
   731  	if err != nil {
   732  		return err
   733  	}
   734  
   735  	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
   736  		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
   737  	}
   738  	return nil
   739  }
   740  
   741  // StatAt implements vfs.FilesystemImpl.StatAt.
   742  func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
   743  	fs.mu.RLock()
   744  	defer fs.mu.RUnlock()
   745  	d, err := resolveLocked(ctx, rp)
   746  	if err != nil {
   747  		return linux.Statx{}, err
   748  	}
   749  	var stat linux.Statx
   750  	d.inode.statTo(&stat)
   751  	return stat, nil
   752  }
   753  
   754  // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
   755  func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
   756  	fs.mu.RLock()
   757  	defer fs.mu.RUnlock()
   758  	if _, err := resolveLocked(ctx, rp); err != nil {
   759  		return linux.Statfs{}, err
   760  	}
   761  	return fs.statFS(), nil
   762  }
   763  
   764  // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
   765  func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
   766  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   767  		// Linux allocates a page to store symlink targets that have length larger
   768  		// than shortSymlinkLen. Targets are just stored as string here, but simulate
   769  		// the page accounting for it. See mm/shmem.c:shmem_symlink().
   770  		if len(target) >= shortSymlinkLen {
   771  			if !fs.accountPages(1) {
   772  				return linuxerr.ENOSPC
   773  			}
   774  		}
   775  		creds := rp.Credentials()
   776  		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target, parentDir))
   777  		parentDir.insertChildLocked(child, name)
   778  		return nil
   779  	})
   780  }
   781  
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
//
// It removes the non-directory named by the final component of rp from its
// parent. The caller needs write and execute permission on the parent and must
// pass the parent's mayDelete check. Directories (including "." and "..") are
// refused with EISDIR. The operation runs with fs.mu held for writing.
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return linuxerr.EISDIR
	}
	child, ok := parentDir.childMap[name]
	if !ok {
		return linuxerr.ENOENT
	}
	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
		return err
	}
	if child.inode.isDir() {
		return linuxerr.EISDIR
	}
	// The child is not a directory, so a path that requires one cannot
	// match it.
	if rp.MustBeDir() {
		return linuxerr.ENOTDIR
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
		return err
	}
	// Generate inotify events. Note that this must take place before the link
	// count of the child is decremented, or else the watches may be dropped
	// before these events are added.
	vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name)
	parentDir.removeChildLocked(child)
	child.inode.decLinksLocked(ctx)
	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
	parentDir.inode.touchCMtime()
	return nil
}
   831  
   832  // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
   833  func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
   834  	fs.mu.RLock()
   835  	defer fs.mu.RUnlock()
   836  	d, err := resolveLocked(ctx, rp)
   837  	if err != nil {
   838  		return nil, err
   839  	}
   840  	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   841  		return nil, err
   842  	}
   843  	switch impl := d.inode.impl.(type) {
   844  	case *socketFile:
   845  		if impl.ep == nil {
   846  			return nil, linuxerr.ECONNREFUSED
   847  		}
   848  		return impl.ep, nil
   849  	default:
   850  		return nil, linuxerr.ECONNREFUSED
   851  	}
   852  }
   853  
   854  // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
   855  func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
   856  	fs.mu.RLock()
   857  	defer fs.mu.RUnlock()
   858  	d, err := resolveLocked(ctx, rp)
   859  	if err != nil {
   860  		return nil, err
   861  	}
   862  	return d.inode.listXattr(rp.Credentials(), size)
   863  }
   864  
   865  // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
   866  func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
   867  	fs.mu.RLock()
   868  	defer fs.mu.RUnlock()
   869  	d, err := resolveLocked(ctx, rp)
   870  	if err != nil {
   871  		return "", err
   872  	}
   873  	return d.inode.getXattr(rp.Credentials(), &opts)
   874  }
   875  
   876  // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
   877  func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
   878  	fs.mu.RLock()
   879  	d, err := resolveLocked(ctx, rp)
   880  	if err != nil {
   881  		fs.mu.RUnlock()
   882  		return err
   883  	}
   884  	err = d.inode.setXattr(rp.Credentials(), &opts)
   885  	fs.mu.RUnlock()
   886  	if err != nil {
   887  		return err
   888  	}
   889  
   890  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   891  	return nil
   892  }
   893  
   894  // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
   895  func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
   896  	fs.mu.RLock()
   897  	d, err := resolveLocked(ctx, rp)
   898  	if err != nil {
   899  		fs.mu.RUnlock()
   900  		return err
   901  	}
   902  	err = d.inode.removeXattr(rp.Credentials(), name)
   903  	fs.mu.RUnlock()
   904  	if err != nil {
   905  		return err
   906  	}
   907  
   908  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   909  	return nil
   910  }
   911  
   912  // PrependPath implements vfs.FilesystemImpl.PrependPath.
   913  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
   914  	fs.mu.RLock()
   915  	defer fs.mu.RUnlock()
   916  	mnt := vd.Mount()
   917  	d := vd.Dentry().Impl().(*dentry)
   918  	for {
   919  		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
   920  			return vfs.PrependPathAtVFSRootError{}
   921  		}
   922  		if mnt != nil && &d.vfsd == mnt.Root() {
   923  			return nil
   924  		}
   925  		if d.parent == nil {
   926  			if d.name != "" {
   927  				// This file must have been created by
   928  				// newUnlinkedRegularFileDescription(). In Linux,
   929  				// mm/shmem.c:__shmem_file_setup() =>
   930  				// fs/file_table.c:alloc_file_pseudo() sets the created
   931  				// dentry's dentry_operations to anon_ops, for which d_dname ==
   932  				// simple_dname. fs/d_path.c:simple_dname() defines the
   933  				// dentry's pathname to be its name, prefixed with "/" and
   934  				// suffixed with " (deleted)".
   935  				b.PrependComponent("/" + d.name)
   936  				b.AppendString(" (deleted)")
   937  				return vfs.PrependPathSyntheticError{}
   938  			}
   939  			return vfs.PrependPathAtNonMountRootError{}
   940  		}
   941  		b.PrependComponent(d.name)
   942  		d = d.parent
   943  	}
   944  }
   945  
   946  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   947  func (fs *filesystem) MountOptions() string {
   948  	return fs.mopts
   949  }
   950  
   951  // adjustPageAcct adjusts the accounting done against filesystem size limit in
   952  // case there is any discrepency between the number of pages reserved vs the
   953  // number of pages actually allocated.
   954  func (fs *filesystem) adjustPageAcct(reserved, alloced uint64) {
   955  	if reserved < alloced {
   956  		panic(fmt.Sprintf("More pages were allocated than the pages reserved: reserved=%d, alloced=%d", reserved, alloced))
   957  	}
   958  	if pagesDiff := reserved - alloced; pagesDiff > 0 {
   959  		fs.unaccountPages(pagesDiff)
   960  	}
   961  }
   962  
   963  // accountPagesPartial increases the pagesUsed if tmpfs is mounted with size
   964  // option by as much as possible without going over the size mount option. It
   965  // returns the number of pages that we were able to account for. It returns false
   966  // when the maxSizeInPages has been exhausted and no more allocation can be done.
   967  // The returned value is guaranteed to be <= pagesInc. If the size mount option is
   968  // not set, then pagesInc will be returned.
   969  func (fs *filesystem) accountPagesPartial(pagesInc uint64) uint64 {
   970  	if pagesInc == 0 {
   971  		return pagesInc
   972  	}
   973  
   974  	for {
   975  		pagesUsed := fs.pagesUsed.Load()
   976  		if fs.maxSizeInPages <= pagesUsed {
   977  			return 0
   978  		}
   979  
   980  		pagesFree := fs.maxSizeInPages - pagesUsed
   981  		toInc := pagesInc
   982  		if pagesFree < pagesInc {
   983  			toInc = pagesFree
   984  		}
   985  
   986  		if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+toInc) {
   987  			return toInc
   988  		}
   989  	}
   990  }
   991  
   992  // accountPages increases the pagesUsed in filesystem struct if tmpfs
   993  // is mounted with size option. We return a false when the maxSizeInPages
   994  // has been exhausted and no more allocation can be done.
   995  func (fs *filesystem) accountPages(pagesInc uint64) bool {
   996  	if pagesInc == 0 {
   997  		return true // No accounting needed.
   998  	}
   999  
  1000  	for {
  1001  		pagesUsed := fs.pagesUsed.Load()
  1002  		if fs.maxSizeInPages <= pagesUsed {
  1003  			return false
  1004  		}
  1005  
  1006  		pagesFree := fs.maxSizeInPages - pagesUsed
  1007  		if pagesFree < pagesInc {
  1008  			return false
  1009  		}
  1010  
  1011  		if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+pagesInc) {
  1012  			return true
  1013  		}
  1014  	}
  1015  }
  1016  
  1017  // unaccountPages decreases the pagesUsed in filesystem struct if tmpfs
  1018  // is mounted with size option.
  1019  func (fs *filesystem) unaccountPages(pagesDec uint64) {
  1020  	if pagesDec == 0 {
  1021  		return
  1022  	}
  1023  
  1024  	for {
  1025  		pagesUsed := fs.pagesUsed.Load()
  1026  		if pagesUsed < pagesDec {
  1027  			panic(fmt.Sprintf("Deallocating more pages than allocated: fs.pagesUsed = %d, pagesDec = %d", pagesUsed, pagesDec))
  1028  		}
  1029  		if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed-pagesDec) {
  1030  			break
  1031  		}
  1032  	}
  1033  }