gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/tmpfs/filesystem.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tmpfs
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"gvisor.dev/gvisor/pkg/abi/linux"
    21  	"gvisor.dev/gvisor/pkg/context"
    22  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    23  	"gvisor.dev/gvisor/pkg/fspath"
    24  	"gvisor.dev/gvisor/pkg/refs"
    25  	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
    26  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    27  	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
    28  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    29  )
    30  
const (
	// direntSize is the per-entry size that Linux uses when computing the
	// size of a directory. "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
	direntSize = 20
	// shortSymlinkLen mirrors Linux's SHORT_SYMLINK_LEN, which is 128. Linux
	// only accounts size for symlinks whose size is >= 128.
	shortSymlinkLen = 128
)
    40  
// Sync implements vfs.FilesystemImpl.Sync.
//
// It performs no work and always returns nil.
func (fs *filesystem) Sync(ctx context.Context) error {
	// All filesystem state is in-memory, so there is nothing to write back.
	return nil
}
    46  
    47  // stepLocked resolves rp.Component() to an existing file, starting from the
    48  // given directory.
    49  //
    50  // stepLocked is loosely analogous to fs/namei.c:walk_component().
    51  //
    52  // Preconditions:
    53  //   - filesystem.mu must be locked.
    54  //   - !rp.Done().
    55  func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, error) {
    56  	dir, ok := d.inode.impl.(*directory)
    57  	if !ok {
    58  		return nil, false, linuxerr.ENOTDIR
    59  	}
    60  	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
    61  		return nil, false, err
    62  	}
    63  	name := rp.Component()
    64  	if name == "." {
    65  		rp.Advance()
    66  		return d, false, nil
    67  	}
    68  	if name == ".." {
    69  		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
    70  			return nil, false, err
    71  		} else if isRoot || d.parent.Load() == nil {
    72  			rp.Advance()
    73  			return d, false, nil
    74  		}
    75  		if err := rp.CheckMount(ctx, &d.parent.Load().vfsd); err != nil {
    76  			return nil, false, err
    77  		}
    78  		rp.Advance()
    79  		return d.parent.Load(), false, nil
    80  	}
    81  	if len(name) > d.inode.fs.maxFilenameLen {
    82  		return nil, false, linuxerr.ENAMETOOLONG
    83  	}
    84  	child, ok := dir.childMap[name]
    85  	if !ok {
    86  		return nil, false, linuxerr.ENOENT
    87  	}
    88  	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
    89  		return nil, false, err
    90  	}
    91  	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
    92  		// Symlink traversal updates access time.
    93  		child.inode.touchAtime(rp.Mount())
    94  		followedSymlink, err := rp.HandleSymlink(symlink.target)
    95  		return d, followedSymlink, err
    96  	}
    97  	rp.Advance()
    98  	return child, false, nil
    99  }
   100  
   101  // walkParentDirLocked resolves all but the last path component of rp to an
   102  // existing directory, starting from the given directory (which is usually
   103  // rp.Start().Impl().(*dentry)). It does not check that the returned directory
   104  // is searchable by the provider of rp.
   105  //
   106  // walkParentDirLocked is loosely analogous to Linux's
   107  // fs/namei.c:path_parentat().
   108  //
   109  // Preconditions:
   110  //   - filesystem.mu must be locked.
   111  //   - !rp.Done().
   112  func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
   113  	for !rp.Final() {
   114  		next, _, err := stepLocked(ctx, rp, d)
   115  		if err != nil {
   116  			return nil, err
   117  		}
   118  		d = next
   119  	}
   120  	dir, ok := d.inode.impl.(*directory)
   121  	if !ok {
   122  		return nil, linuxerr.ENOTDIR
   123  	}
   124  	return dir, nil
   125  }
   126  
   127  // resolveLocked resolves rp to an existing file.
   128  //
   129  // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
   130  //
   131  // Preconditions: filesystem.mu must be locked.
   132  func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) {
   133  	d := rp.Start().Impl().(*dentry)
   134  
   135  	if symlink, ok := d.inode.impl.(*symlink); rp.Done() && ok && rp.ShouldFollowSymlink() {
   136  		// Path with a single component. We don't need to step to the next
   137  		// component, but still need to resolve any symlinks.
   138  		//
   139  		// Symlink traversal updates access time.
   140  		d.inode.touchAtime(rp.Mount())
   141  		if _, err := rp.HandleSymlink(symlink.target); err != nil {
   142  			return nil, err
   143  		}
   144  	} else {
   145  		// Path with multiple components, walk and resolve as required.
   146  		for !rp.Done() {
   147  			next, _, err := stepLocked(ctx, rp, d)
   148  			if err != nil {
   149  				return nil, err
   150  			}
   151  			d = next
   152  		}
   153  	}
   154  
   155  	if rp.MustBeDir() && !d.inode.isDir() {
   156  		return nil, linuxerr.ENOTDIR
   157  	}
   158  	return d, nil
   159  }
   160  
   161  // doCreateAt checks that creating a file at rp is permitted, then invokes
   162  // create to do so.
   163  //
   164  // doCreateAt is loosely analogous to a conjunction of Linux's
   165  // fs/namei.c:filename_create() and done_path_create().
   166  //
   167  // Preconditions:
   168  //   - !rp.Done().
   169  //   - For the final path component in rp, !rp.ShouldFollowSymlink().
   170  func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
   171  	fs.mu.Lock()
   172  	defer fs.mu.Unlock()
   173  	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   174  	if err != nil {
   175  		return err
   176  	}
   177  
   178  	// Order of checks is important. First check if parent directory can be
   179  	// executed, then check for existence, and lastly check if mount is writable.
   180  	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   181  		return err
   182  	}
   183  	name := rp.Component()
   184  	if name == "." || name == ".." {
   185  		return linuxerr.EEXIST
   186  	}
   187  	if len(name) > fs.maxFilenameLen {
   188  		return linuxerr.ENAMETOOLONG
   189  	}
   190  	if _, ok := parentDir.childMap[name]; ok {
   191  		return linuxerr.EEXIST
   192  	}
   193  	if !dir && rp.MustBeDir() {
   194  		return linuxerr.ENOENT
   195  	}
   196  	// tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only
   197  	// be dead if it was deleted.
   198  	if parentDir.dentry.vfsd.IsDead() {
   199  		return linuxerr.ENOENT
   200  	}
   201  	mnt := rp.Mount()
   202  	if err := mnt.CheckBeginWrite(); err != nil {
   203  		return err
   204  	}
   205  	defer mnt.EndWrite()
   206  
   207  	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   208  		return err
   209  	}
   210  	if err := create(parentDir, name); err != nil {
   211  		return err
   212  	}
   213  
   214  	ev := linux.IN_CREATE
   215  	if dir {
   216  		ev |= linux.IN_ISDIR
   217  	}
   218  	parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
   219  	parentDir.inode.touchCMtime()
   220  	return nil
   221  }
   222  
   223  // AccessAt implements vfs.Filesystem.Impl.AccessAt.
   224  func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
   225  	fs.mu.RLock()
   226  	defer fs.mu.RUnlock()
   227  	d, err := resolveLocked(ctx, rp)
   228  	if err != nil {
   229  		return err
   230  	}
   231  	if err := d.inode.checkPermissions(creds, ats); err != nil {
   232  		return err
   233  	}
   234  	if ats.MayWrite() && rp.Mount().ReadOnly() {
   235  		return linuxerr.EROFS
   236  	}
   237  	return nil
   238  }
   239  
   240  // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
   241  func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
   242  	fs.mu.RLock()
   243  	defer fs.mu.RUnlock()
   244  	d, err := resolveLocked(ctx, rp)
   245  	if err != nil {
   246  		return nil, err
   247  	}
   248  	if opts.CheckSearchable {
   249  		if !d.inode.isDir() {
   250  			return nil, linuxerr.ENOTDIR
   251  		}
   252  		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   253  			return nil, err
   254  		}
   255  	}
   256  	d.IncRef()
   257  	return &d.vfsd, nil
   258  }
   259  
   260  // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
   261  func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
   262  	fs.mu.RLock()
   263  	defer fs.mu.RUnlock()
   264  	dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   265  	if err != nil {
   266  		return nil, err
   267  	}
   268  	dir.dentry.IncRef()
   269  	return &dir.dentry.vfsd, nil
   270  }
   271  
   272  // LinkAt implements vfs.FilesystemImpl.LinkAt.
   273  func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
   274  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   275  		if rp.Mount() != vd.Mount() {
   276  			return linuxerr.EXDEV
   277  		}
   278  		d := vd.Dentry().Impl().(*dentry)
   279  		i := d.inode
   280  		if i.isDir() {
   281  			return linuxerr.EPERM
   282  		}
   283  		if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(i.mode.Load()), auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil {
   284  			return err
   285  		}
   286  		if i.nlink.Load() == 0 {
   287  			return linuxerr.ENOENT
   288  		}
   289  		if i.nlink.Load() == maxLinks {
   290  			return linuxerr.EMLINK
   291  		}
   292  		i.incLinksLocked()
   293  		i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
   294  		parentDir.insertChildLocked(fs.newDentry(i), name)
   295  		return nil
   296  	})
   297  }
   298  
   299  // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
   300  func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
   301  	return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error {
   302  		creds := rp.Credentials()
   303  		if parentDir.inode.nlink.Load() == maxLinks {
   304  			return linuxerr.EMLINK
   305  		}
   306  		parentDir.inode.incLinksLocked() // from child's ".."
   307  		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   308  		parentDir.insertChildLocked(&childDir.dentry, name)
   309  		return nil
   310  	})
   311  }
   312  
   313  // MknodAt implements vfs.FilesystemImpl.MknodAt.
   314  func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
   315  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   316  		creds := rp.Credentials()
   317  		var childInode *inode
   318  		switch opts.Mode.FileType() {
   319  		case linux.S_IFREG:
   320  			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   321  		case linux.S_IFIFO:
   322  			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
   323  		case linux.S_IFBLK:
   324  			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor, parentDir)
   325  		case linux.S_IFCHR:
   326  			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor, parentDir)
   327  		case linux.S_IFSOCK:
   328  			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir)
   329  		default:
   330  			return linuxerr.EINVAL
   331  		}
   332  		child := fs.newDentry(childInode)
   333  		parentDir.insertChildLocked(child, name)
   334  		return nil
   335  	})
   336  }
   337  
// OpenAt implements vfs.FilesystemImpl.OpenAt.
//
// O_TMPFILE is rejected with EOPNOTSUPP. Without O_CREAT, rp is resolved
// under fs.mu held for reading and the resulting dentry is opened. With
// O_CREAT, fs.mu is held for writing while the parent directory is resolved;
// the final component is then opened if it exists (EEXIST if O_EXCL is also
// set) or created as a regular file if it does not. A trailing symlink causes
// the lookup to be repeated from the symlink's parent directory.
//
// On every path fs.mu is released before dentry.open is called, as open's
// preconditions require. A reference is taken on the dentry before unlocking
// and is dropped when OpenAt returns.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	if opts.Flags&linux.O_TMPFILE != 0 {
		// Not yet supported.
		return nil, linuxerr.EOPNOTSUPP
	}

	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
	// don't need fs.mu for writing.
	if opts.Flags&linux.O_CREAT == 0 {
		fs.mu.RLock()
		d, err := resolveLocked(ctx, rp)
		if err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		// Hold a reference across the unlocked call to open.
		d.IncRef()
		defer d.DecRef(ctx)
		fs.mu.RUnlock()
		return d.open(ctx, rp, &opts, false /* afterCreate */)
	}

	mustCreate := opts.Flags&linux.O_EXCL != 0
	start := rp.Start().Impl().(*dentry)
	fs.mu.Lock()
	// unlock releases fs.mu at most once. It is deferred to cover the error
	// returns below, and is also called explicitly before each call to open.
	unlocked := false
	unlock := func() {
		if !unlocked {
			fs.mu.Unlock()
			unlocked = true
		}
	}
	defer unlock()
	if rp.Done() {
		// Reject attempts to open mount root directory with O_CREAT.
		if rp.MustBeDir() {
			return nil, linuxerr.EISDIR
		}
		if mustCreate {
			return nil, linuxerr.EEXIST
		}
		start.IncRef()
		defer start.DecRef(ctx)
		unlock()
		return start.open(ctx, rp, &opts, false /* afterCreate */)
	}
afterTrailingSymlink:
	parentDir, err := walkParentDirLocked(ctx, rp, start)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Reject attempts to open directories with O_CREAT.
	if rp.MustBeDir() {
		return nil, linuxerr.EISDIR
	}
	// Capture the final component's name before stepLocked, which may advance
	// rp; the name is needed below if the file has to be created.
	name := rp.Component()
	child, followedSymlink, err := stepLocked(ctx, rp, &parentDir.dentry)
	if followedSymlink {
		if mustCreate {
			// EEXIST must be returned if an existing symlink is opened with O_EXCL.
			return nil, linuxerr.EEXIST
		}
		if err != nil {
			// If followedSymlink && err != nil, then this symlink resolution error
			// must be handled by the VFS layer.
			return nil, err
		}
		// Repeat the lookup, starting from the directory that contained the
		// symlink.
		start = &parentDir.dentry
		goto afterTrailingSymlink
	}
	if linuxerr.Equals(linuxerr.ENOENT, err) {
		// Already checked for searchability above; now check for writability.
		if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
			return nil, err
		}
		if err := rp.Mount().CheckBeginWrite(); err != nil {
			return nil, err
		}
		defer rp.Mount().EndWrite()
		// Create and open the child.
		creds := rp.Credentials()
		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir))
		parentDir.insertChildLocked(child, name)
		child.IncRef()
		defer child.DecRef(ctx)
		unlock()
		fd, err := child.open(ctx, rp, &opts, true /* afterCreate */)
		if err != nil {
			return nil, err
		}
		// The creation event and timestamp update are only issued once the
		// new file has been opened successfully.
		parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
		parentDir.inode.touchCMtime()
		return fd, nil
	}
	if err != nil {
		return nil, err
	}
	// The file already exists, which O_EXCL forbids.
	if mustCreate {
		return nil, linuxerr.EEXIST
	}
	if rp.MustBeDir() && !child.inode.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	child.IncRef()
	defer child.DecRef(ctx)
	unlock()
	return child.open(ctx, rp, &opts, false /* afterCreate */)
}
   450  
   451  // Preconditions: The caller must hold no locks (since opening pipes may block
   452  // indefinitely).
   453  func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
   454  	ats := vfs.AccessTypesForOpenFlags(opts)
   455  	if !afterCreate {
   456  		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
   457  			return nil, err
   458  		}
   459  	}
   460  	switch impl := d.inode.impl.(type) {
   461  	case *regularFile:
   462  		var fd regularFileFD
   463  		fd.LockFD.Init(&d.inode.locks)
   464  		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
   465  			return nil, err
   466  		}
   467  		if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
   468  			if _, err := impl.truncate(0); err != nil {
   469  				return nil, err
   470  			}
   471  		}
   472  		if fd.vfsfd.IsWritable() {
   473  			fsmetric.TmpfsOpensW.Increment()
   474  		} else if fd.vfsfd.IsReadable() {
   475  			fsmetric.TmpfsOpensRO.Increment()
   476  		}
   477  		return &fd.vfsfd, nil
   478  	case *directory:
   479  		// Can't open directories with O_CREAT.
   480  		if opts.Flags&linux.O_CREAT != 0 {
   481  			return nil, linuxerr.EISDIR
   482  		}
   483  		// Can't open directories writably.
   484  		if ats&vfs.MayWrite != 0 {
   485  			return nil, linuxerr.EISDIR
   486  		}
   487  		if opts.Flags&linux.O_DIRECT != 0 {
   488  			return nil, linuxerr.EINVAL
   489  		}
   490  		var fd directoryFD
   491  		fd.LockFD.Init(&d.inode.locks)
   492  		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
   493  			return nil, err
   494  		}
   495  		return &fd.vfsfd, nil
   496  	case *symlink:
   497  		// Can't open symlinks without O_PATH, which is handled at the VFS layer.
   498  		return nil, linuxerr.ELOOP
   499  	case *namedPipe:
   500  		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
   501  	case *deviceFile:
   502  		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
   503  	case *socketFile:
   504  		return nil, linuxerr.ENXIO
   505  	default:
   506  		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
   507  	}
   508  }
   509  
   510  // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
   511  func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
   512  	fs.mu.RLock()
   513  	defer fs.mu.RUnlock()
   514  	d, err := resolveLocked(ctx, rp)
   515  	if err != nil {
   516  		return "", err
   517  	}
   518  	symlink, ok := d.inode.impl.(*symlink)
   519  	if !ok {
   520  		return "", linuxerr.EINVAL
   521  	}
   522  	symlink.inode.touchAtime(rp.Mount())
   523  	return symlink.target, nil
   524  }
   525  
// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// It moves the child named oldName of the directory oldParentVD to the path
// named by rp. An existing file at the destination is replaced unless
// RENAME_NOREPLACE is set, in which case EEXIST is returned.
// RENAME_NOREPLACE is the only flag accepted; any other flag yields EINVAL.
// The source and destination must be on the same mount (EXDEV otherwise).
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	// Resolve newParentDir first to verify that it's on this Mount.
	fs.mu.Lock()
	// We need to DecRef outside of fs.mu because forgetting a dead mountpoint
	// could result in this filesystem being released which acquires fs.mu.
	//
	// Deferred calls run in reverse order of registration, so the Unlock
	// deferred below runs before this loop.
	var toDecRef []refs.RefCounter
	defer func() {
		for _, ref := range toDecRef {
			ref.DecRef(ctx)
		}
	}()
	defer fs.mu.Unlock()
	newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}

	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
		// TODO(b/145974740): Support other renameat2 flags.
		return linuxerr.EINVAL
	}

	newName := rp.Component()
	if newName == "." || newName == ".." {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		return linuxerr.EBUSY
	}
	if len(newName) > fs.maxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return linuxerr.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	// Validate the source: the old parent must be writable and searchable,
	// the entry must exist, and the caller must be allowed to delete it.
	oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
	if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	renamed, ok := oldParentDir.childMap[oldName]
	if !ok {
		return linuxerr.ENOENT
	}
	if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
		return err
	}
	// Note that we don't need to call rp.CheckMount(), since if renamed is a
	// mount point then we want to rename the mount point, not anything in the
	// mounted filesystem.
	if renamed.inode.isDir() {
		// A directory cannot be moved into itself or one of its descendants.
		if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
			return linuxerr.EINVAL
		}
		if oldParentDir != newParentDir {
			// Writability is needed to change renamed's "..".
			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
				return err
			}
		}
	} else {
		if opts.MustBeDir || rp.MustBeDir() {
			return linuxerr.ENOTDIR
		}
	}

	// Validate the destination, including type compatibility between the
	// renamed file and any file it would replace.
	if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	replaced, ok := newParentDir.childMap[newName]
	if ok {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		replacedDir, ok := replaced.inode.impl.(*directory)
		if ok {
			// Only a directory may replace a directory, and only an empty
			// one may be replaced.
			if !renamed.inode.isDir() {
				return linuxerr.EISDIR
			}
			if len(replacedDir.childMap) != 0 {
				return linuxerr.ENOTEMPTY
			}
		} else {
			if rp.MustBeDir() {
				return linuxerr.ENOTDIR
			}
			if renamed.inode.isDir() {
				return linuxerr.ENOTDIR
			}
		}
	} else {
		// Moving a directory in without replacing anything adds a link to
		// the new parent (see the link count update below).
		if renamed.inode.isDir() && newParentDir.inode.nlink.Load() == maxLinks {
			return linuxerr.EMLINK
		}
	}
	// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
	// only be dead if it was deleted.
	if newParentDir.dentry.vfsd.IsDead() {
		return linuxerr.ENOENT
	}

	// Linux places this check before some of those above; we do it here for
	// simplicity, under the assumption that applications are not intentionally
	// doing noop renames expecting them to succeed where non-noop renames
	// would fail.
	if renamed == replaced {
		return nil
	}
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	// replacedVFSD stays nil when nothing is being replaced.
	var replacedVFSD *vfs.Dentry
	if replaced != nil {
		replacedVFSD = &replaced.vfsd
	}
	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
		return err
	}
	if replaced != nil {
		newParentDir.removeChildLocked(replaced)
		if replaced.inode.isDir() {
			// Remove links for replaced/. and replaced/..
			replaced.inode.decLinksLocked(ctx)
			newParentDir.inode.decLinksLocked(ctx)
		}
		replaced.inode.decLinksLocked(ctx)
	}
	oldParentDir.removeChildLocked(renamed)
	newParentDir.insertChildLocked(renamed, newName)
	// The references returned here are dropped by the deferred loop above.
	toDecRef = vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
	oldParentDir.inode.touchCMtime()
	if oldParentDir != newParentDir {
		if renamed.inode.isDir() {
			// Transfer the link held by renamed's ".." from the old parent
			// to the new one.
			oldParentDir.inode.decLinksLocked(ctx)
			newParentDir.inode.incLinksLocked()
		}
		newParentDir.inode.touchCMtime()
	}
	renamed.inode.touchCtime()

	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
	return nil
}
   675  
   676  // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
   677  func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
   678  	fs.mu.Lock()
   679  	// We need to DecRef outside of fs.mu because forgetting a dead mountpoint
   680  	// could result in this filesystem being released which acquires fs.mu.
   681  	var toDecRef []refs.RefCounter
   682  	defer func() {
   683  		for _, ref := range toDecRef {
   684  			ref.DecRef(ctx)
   685  		}
   686  	}()
   687  	defer fs.mu.Unlock()
   688  	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
   689  	if err != nil {
   690  		return err
   691  	}
   692  	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
   693  		return err
   694  	}
   695  	name := rp.Component()
   696  	if name == "." {
   697  		return linuxerr.EINVAL
   698  	}
   699  	if name == ".." {
   700  		return linuxerr.ENOTEMPTY
   701  	}
   702  	child, ok := parentDir.childMap[name]
   703  	if !ok {
   704  		return linuxerr.ENOENT
   705  	}
   706  	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
   707  		return err
   708  	}
   709  	childDir, ok := child.inode.impl.(*directory)
   710  	if !ok {
   711  		return linuxerr.ENOTDIR
   712  	}
   713  	if len(childDir.childMap) != 0 {
   714  		return linuxerr.ENOTEMPTY
   715  	}
   716  	mnt := rp.Mount()
   717  	if err := mnt.CheckBeginWrite(); err != nil {
   718  		return err
   719  	}
   720  	defer mnt.EndWrite()
   721  	vfsObj := rp.VirtualFilesystem()
   722  	mntns := vfs.MountNamespaceFromContext(ctx)
   723  	defer mntns.DecRef(ctx)
   724  	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
   725  		return err
   726  	}
   727  	parentDir.removeChildLocked(child)
   728  	parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
   729  	// Remove links for child, child/., and child/..
   730  	child.inode.decLinksLocked(ctx)
   731  	child.inode.decLinksLocked(ctx)
   732  	parentDir.inode.decLinksLocked(ctx)
   733  	toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
   734  	parentDir.inode.touchCMtime()
   735  	return nil
   736  }
   737  
   738  // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
   739  func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
   740  	fs.mu.RLock()
   741  	d, err := resolveLocked(ctx, rp)
   742  	if err != nil {
   743  		fs.mu.RUnlock()
   744  		return err
   745  	}
   746  	err = d.inode.setStat(ctx, rp.Credentials(), &opts)
   747  	fs.mu.RUnlock()
   748  	if err != nil {
   749  		return err
   750  	}
   751  
   752  	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
   753  		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
   754  	}
   755  	return nil
   756  }
   757  
   758  // StatAt implements vfs.FilesystemImpl.StatAt.
   759  func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
   760  	fs.mu.RLock()
   761  	defer fs.mu.RUnlock()
   762  	d, err := resolveLocked(ctx, rp)
   763  	if err != nil {
   764  		return linux.Statx{}, err
   765  	}
   766  	var stat linux.Statx
   767  	d.inode.statTo(&stat)
   768  	return stat, nil
   769  }
   770  
   771  // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
   772  func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
   773  	fs.mu.RLock()
   774  	defer fs.mu.RUnlock()
   775  	if _, err := resolveLocked(ctx, rp); err != nil {
   776  		return linux.Statfs{}, err
   777  	}
   778  	return fs.statFS(), nil
   779  }
   780  
   781  // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
   782  func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
   783  	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
   784  		// Linux allocates a page to store symlink targets that have length larger
   785  		// than shortSymlinkLen. Targets are just stored as string here, but simulate
   786  		// the page accounting for it. See mm/shmem.c:shmem_symlink().
   787  		if len(target) >= shortSymlinkLen {
   788  			if !fs.accountPages(1) {
   789  				return linuxerr.ENOSPC
   790  			}
   791  		}
   792  		creds := rp.Credentials()
   793  		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target, parentDir))
   794  		parentDir.insertChildLocked(child, name)
   795  		return nil
   796  	})
   797  }
   798  
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
//
// It removes the non-directory child named by the final component of rp from
// its parent directory. It returns EISDIR if the final component is "." or
// ".." or names a directory, ENOENT if the parent has no child with that
// name, and ENOTDIR if rp requires a directory. Errors from walking to the
// parent, from the permission checks, from mnt.CheckBeginWrite and from
// PrepareDeleteDentry are returned unchanged.
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	// We need to DecRef outside of fs.mu because forgetting a dead mountpoint
	// could result in this filesystem being released which acquires fs.mu.
	var toDecRef []refs.RefCounter
	defer func() {
		for _, ref := range toDecRef {
			ref.DecRef(ctx)
		}
	}()
	// Deferred calls run in LIFO order, so this unlock happens before the
	// DecRef loop registered above.
	defer fs.mu.Unlock()
	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}
	// The caller needs both write and execute permission on the parent
	// directory.
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return linuxerr.EISDIR
	}
	child, ok := parentDir.childMap[name]
	if !ok {
		return linuxerr.ENOENT
	}
	// Further deletion checks that take the child itself into account; see
	// mayDelete for the exact rules.
	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
		return err
	}
	if child.inode.isDir() {
		return linuxerr.EISDIR
	}
	// The child is known not to be a directory at this point, so a path that
	// must resolve to a directory cannot be satisfied.
	if rp.MustBeDir() {
		return linuxerr.ENOTDIR
	}
	mnt := rp.Mount()
	// The mutation below is bracketed by CheckBeginWrite/EndWrite on the
	// mount. NOTE(review): CheckBeginWrite presumably fails for mounts that
	// do not accept writes (e.g. read-only) - confirm against vfs.Mount.
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	// The mount namespace obtained above is DecRef'd when this function
	// returns. As the most recently deferred call it runs first, i.e. while
	// fs.mu is still held.
	defer mntns.DecRef(ctx)
	// PrepareDeleteDentry may reject the deletion. Once it has succeeded the
	// deletion is completed by CommitDeleteDentry below.
	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
		return err
	}
	// Generate inotify events. Note that this must take place before the link
	// count of the child is decremented, or else the watches may be dropped
	// before these events are added.
	vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name)
	parentDir.removeChildLocked(child)
	child.inode.decLinksLocked(ctx)
	// The references returned here are released by the first deferred
	// function, after fs.mu has been unlocked.
	toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
	parentDir.inode.touchCMtime()
	return nil
}
   856  
   857  // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
   858  func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
   859  	fs.mu.RLock()
   860  	defer fs.mu.RUnlock()
   861  	d, err := resolveLocked(ctx, rp)
   862  	if err != nil {
   863  		return nil, err
   864  	}
   865  	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   866  		return nil, err
   867  	}
   868  	switch impl := d.inode.impl.(type) {
   869  	case *socketFile:
   870  		if impl.ep == nil {
   871  			return nil, linuxerr.ECONNREFUSED
   872  		}
   873  		return impl.ep, nil
   874  	default:
   875  		return nil, linuxerr.ECONNREFUSED
   876  	}
   877  }
   878  
   879  // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
   880  func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
   881  	fs.mu.RLock()
   882  	defer fs.mu.RUnlock()
   883  	d, err := resolveLocked(ctx, rp)
   884  	if err != nil {
   885  		return nil, err
   886  	}
   887  	return d.inode.listXattr(rp.Credentials(), size)
   888  }
   889  
   890  // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
   891  func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
   892  	fs.mu.RLock()
   893  	defer fs.mu.RUnlock()
   894  	d, err := resolveLocked(ctx, rp)
   895  	if err != nil {
   896  		return "", err
   897  	}
   898  	return d.inode.getXattr(rp.Credentials(), &opts)
   899  }
   900  
   901  // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
   902  func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
   903  	fs.mu.RLock()
   904  	d, err := resolveLocked(ctx, rp)
   905  	if err != nil {
   906  		fs.mu.RUnlock()
   907  		return err
   908  	}
   909  	err = d.inode.setXattr(rp.Credentials(), &opts)
   910  	fs.mu.RUnlock()
   911  	if err != nil {
   912  		return err
   913  	}
   914  
   915  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   916  	return nil
   917  }
   918  
   919  // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
   920  func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
   921  	fs.mu.RLock()
   922  	d, err := resolveLocked(ctx, rp)
   923  	if err != nil {
   924  		fs.mu.RUnlock()
   925  		return err
   926  	}
   927  	err = d.inode.removeXattr(rp.Credentials(), name)
   928  	fs.mu.RUnlock()
   929  	if err != nil {
   930  		return err
   931  	}
   932  
   933  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   934  	return nil
   935  }
   936  
   937  // PrependPath implements vfs.FilesystemImpl.PrependPath.
   938  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
   939  	fs.mu.RLock()
   940  	defer fs.mu.RUnlock()
   941  	mnt := vd.Mount()
   942  	d := vd.Dentry().Impl().(*dentry)
   943  	for {
   944  		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
   945  			return vfs.PrependPathAtVFSRootError{}
   946  		}
   947  		if mnt != nil && &d.vfsd == mnt.Root() {
   948  			return nil
   949  		}
   950  		parent := d.parent.Load()
   951  		if parent == nil {
   952  			if d.name != "" {
   953  				// This file must have been created by
   954  				// newUnlinkedRegularFileDescription(). In Linux,
   955  				// mm/shmem.c:__shmem_file_setup() =>
   956  				// fs/file_table.c:alloc_file_pseudo() sets the created
   957  				// dentry's dentry_operations to anon_ops, for which d_dname ==
   958  				// simple_dname. fs/d_path.c:simple_dname() defines the
   959  				// dentry's pathname to be its name, prefixed with "/" and
   960  				// suffixed with " (deleted)".
   961  				b.PrependComponent("/" + d.name)
   962  				b.AppendString(" (deleted)")
   963  				return vfs.PrependPathSyntheticError{}
   964  			}
   965  			return vfs.PrependPathAtNonMountRootError{}
   966  		}
   967  		b.PrependComponent(d.name)
   968  		d = parent
   969  	}
   970  }
   971  
// MountOptions implements vfs.FilesystemImpl.MountOptions.
//
// It returns fs.mopts unchanged and takes no lock.
// NOTE(review): mopts is presumably the option string recorded when the
// filesystem was mounted - confirm where it is assigned.
func (fs *filesystem) MountOptions() string {
	return fs.mopts
}
   976  
   977  // IsDescendant implements vfs.FilesystemImpl.IsDescendant.
   978  func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool {
   979  	return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry))
   980  }
   981  
   982  // adjustPageAcct adjusts the accounting done against filesystem size limit in
   983  // case there is any discrepancy between the number of pages reserved vs the
   984  // number of pages actually allocated.
   985  func (fs *filesystem) adjustPageAcct(reserved, alloced uint64) {
   986  	if reserved < alloced {
   987  		panic(fmt.Sprintf("More pages were allocated than the pages reserved: reserved=%d, alloced=%d", reserved, alloced))
   988  	}
   989  	if pagesDiff := reserved - alloced; pagesDiff > 0 {
   990  		fs.unaccountPages(pagesDiff)
   991  	}
   992  }
   993  
   994  // accountPagesPartial increases the pagesUsed if tmpfs is mounted with size
   995  // option by as much as possible without going over the size mount option. It
   996  // returns the number of pages that we were able to account for. It returns false
   997  // when the maxSizeInPages has been exhausted and no more allocation can be done.
   998  // The returned value is guaranteed to be <= pagesInc. If the size mount option is
   999  // not set, then pagesInc will be returned.
  1000  func (fs *filesystem) accountPagesPartial(pagesInc uint64) uint64 {
  1001  	if pagesInc == 0 {
  1002  		return pagesInc
  1003  	}
  1004  
  1005  	for {
  1006  		pagesUsed := fs.pagesUsed.Load()
  1007  		if fs.maxSizeInPages <= pagesUsed {
  1008  			return 0
  1009  		}
  1010  
  1011  		pagesFree := fs.maxSizeInPages - pagesUsed
  1012  		toInc := pagesInc
  1013  		if pagesFree < pagesInc {
  1014  			toInc = pagesFree
  1015  		}
  1016  
  1017  		if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+toInc) {
  1018  			return toInc
  1019  		}
  1020  	}
  1021  }
  1022  
  1023  // accountPages increases the pagesUsed in filesystem struct if tmpfs
  1024  // is mounted with size option. We return a false when the maxSizeInPages
  1025  // has been exhausted and no more allocation can be done.
  1026  func (fs *filesystem) accountPages(pagesInc uint64) bool {
  1027  	if pagesInc == 0 {
  1028  		return true // No accounting needed.
  1029  	}
  1030  
  1031  	for {
  1032  		pagesUsed := fs.pagesUsed.Load()
  1033  		if fs.maxSizeInPages <= pagesUsed {
  1034  			return false
  1035  		}
  1036  
  1037  		pagesFree := fs.maxSizeInPages - pagesUsed
  1038  		if pagesFree < pagesInc {
  1039  			return false
  1040  		}
  1041  
  1042  		if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+pagesInc) {
  1043  			return true
  1044  		}
  1045  	}
  1046  }
  1047  
  1048  // unaccountPages decreases the pagesUsed in filesystem struct if tmpfs
  1049  // is mounted with size option.
  1050  func (fs *filesystem) unaccountPages(pagesDec uint64) {
  1051  	if pagesDec == 0 {
  1052  		return
  1053  	}
  1054  
  1055  	for {
  1056  		pagesUsed := fs.pagesUsed.Load()
  1057  		if pagesUsed < pagesDec {
  1058  			panic(fmt.Sprintf("Deallocating more pages than allocated: fs.pagesUsed = %d, pagesDec = %d", pagesUsed, pagesDec))
  1059  		}
  1060  		if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed-pagesDec) {
  1061  			break
  1062  		}
  1063  	}
  1064  }